xref: /linux-6.15/net/core/dev.c (revision 1ce84604)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <linux/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143 
144 #include "net-sysfs.h"
145 
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148 
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151 
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;	/* Taps */
156 static struct list_head offload_base __read_mostly;
157 
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160 					 struct net_device *dev,
161 					 struct netdev_notifier_info *info);
162 
163 /*
164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165  * semaphore.
166  *
167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168  *
169  * Writers must hold the rtnl semaphore while they loop through the
170  * dev_base_head list, and hold dev_base_lock for writing when they do the
171  * actual updates.  This allows pure readers to access the list even
172  * while a writer is preparing to update it.
173  *
174  * To put it another way, dev_base_lock is held for writing only to
175  * protect against pure readers; the rtnl semaphore provides the
176  * protection against other writers.
177  *
178  * See, for example usages, register_netdevice() and
179  * unregister_netdevice(), which must be called with the rtnl
180  * semaphore held.
181  */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184 
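/*
 * Minimal reader-side sketch (the device name and surrounding code are
 * illustrative only): either holding dev_base_lock for reading or entering
 * an RCU read-side section is sufficient for a pure reader:
 *
 *	read_lock(&dev_base_lock);
 *	dev = __dev_get_by_name(net, "eth0");
 *	...
 *	read_unlock(&dev_base_lock);
 *
 * or, preferably:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	...
 *	rcu_read_unlock();
 */
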
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187 
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190 
191 static seqcount_t devnet_rename_seq;
192 
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195 	while (++net->dev_base_seq == 0);
196 }
197 
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201 
202 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209 
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 	spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216 
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 	spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223 
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227 	struct net *net = dev_net(dev);
228 
229 	ASSERT_RTNL();
230 
231 	write_lock_bh(&dev_base_lock);
232 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 	hlist_add_head_rcu(&dev->index_hlist,
235 			   dev_index_hash(net, dev->ifindex));
236 	write_unlock_bh(&dev_base_lock);
237 
238 	dev_base_seq_inc(net);
239 }
240 
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 	ASSERT_RTNL();
247 
248 	/* Unlink dev from the device chain */
249 	write_lock_bh(&dev_base_lock);
250 	list_del_rcu(&dev->dev_list);
251 	hlist_del_rcu(&dev->name_hlist);
252 	hlist_del_rcu(&dev->index_hlist);
253 	write_unlock_bh(&dev_base_lock);
254 
255 	dev_base_seq_inc(dev_net(dev));
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310 
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 	int i;
317 
318 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 		if (netdev_lock_type[i] == dev_type)
320 			return i;
321 	/* the last key is used by default */
322 	return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324 
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 						 unsigned short dev_type)
327 {
328 	int i;
329 
330 	i = netdev_lock_pos(dev_type);
331 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 				   netdev_lock_name[i]);
333 }
334 
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 	int i;
338 
339 	i = netdev_lock_pos(dev->type);
340 	lockdep_set_class_and_name(&dev->addr_list_lock,
341 				   &netdev_addr_lock_key[i],
342 				   netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 						 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353 
354 /*******************************************************************************
355 
356 		Protocol management and registration routines
357 
358 *******************************************************************************/
359 
360 /*
361  *	Add a protocol ID to the list. Now that the input handler is
362  *	smarter we can dispense with all the messy stuff that used to be
363  *	here.
364  *
365  *	BEWARE!!! Protocol handlers, mangling input packets,
366  *	MUST BE last in hash buckets and checking protocol handlers
367  *	MUST start from promiscuous ptype_all chain in net_bh.
368  *	It is true now, do not change it.
369  *	Explanation follows: if protocol handler, mangling packet, will
370  *	be the first on list, it is not able to sense, that packet
371  *	is cloned and should be copied-on-write, so that it will
372  *	change it and subsequent readers will get broken packet.
373  *							--ANK (980803)
374  */
375 
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378 	if (pt->type == htons(ETH_P_ALL))
379 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 	else
381 		return pt->dev ? &pt->dev->ptype_specific :
382 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep, therefore it cannot guarantee that
394  *	all CPUs that are in the middle of receiving packets will see
395  *	the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
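/*
 * Illustrative sketch of registering a protocol handler; the example_*
 * names and the choice of ETH_P_IP are hypothetical:
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),
 *		.func	= example_rcv,
 *	};
 *
 *	dev_add_pack(&example_ptype);
 *	...
 *	dev_remove_pack(&example_ptype);
 */
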
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *	The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPUs have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
460 
461 
462 /**
463  *	dev_add_offload - register offload handlers
464  *	@po: protocol offload declaration
465  *
466  *	Add protocol offload handlers to the networking stack. The passed
467  *	&proto_offload is linked into kernel lists and may not be freed until
468  *	it has been removed from the kernel lists.
469  *
470  *	This call does not sleep, therefore it cannot guarantee that
471  *	all CPUs that are in the middle of receiving packets will see
472  *	the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476 	struct packet_offload *elem;
477 
478 	spin_lock(&offload_lock);
479 	list_for_each_entry(elem, &offload_base, list) {
480 		if (po->priority < elem->priority)
481 			break;
482 	}
483 	list_add_rcu(&po->list, elem->list.prev);
484 	spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487 
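/*
 * Illustrative sketch of registering offload callbacks; the example_*
 * callbacks are hypothetical and would normally implement GSO/GRO for a
 * single protocol type:
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.priority = 0,
 *		.callbacks = {
 *			.gso_segment = example_gso_segment,
 *			.gro_receive = example_gro_receive,
 *			.gro_complete = example_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&example_offload);
 *	...
 *	dev_remove_offload(&example_offload);
 */
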
488 /**
489  *	__dev_remove_offload	 - remove offload handler
490  *	@po: packet offload declaration
491  *
492  *	Remove a protocol offload handler that was previously added to the
493  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *	is removed from the kernel lists and can be freed or reused once this
495  *	function returns.
496  *
497  *	The packet type might still be in use by receivers
498  *	and must not be freed until after all the CPUs have gone
499  *	through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503 	struct list_head *head = &offload_base;
504 	struct packet_offload *po1;
505 
506 	spin_lock(&offload_lock);
507 
508 	list_for_each_entry(po1, head, list) {
509 		if (po == po1) {
510 			list_del_rcu(&po->list);
511 			goto out;
512 		}
513 	}
514 
515 	pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517 	spin_unlock(&offload_lock);
518 }
519 
520 /**
521  *	dev_remove_offload	 - remove packet offload handler
522  *	@po: packet offload declaration
523  *
524  *	Remove a packet offload handler that was previously added to the kernel
525  *	offload handlers by dev_add_offload(). The passed &offload_type is
526  *	removed from the kernel lists and can be freed or reused once this
527  *	function returns.
528  *
529  *	This call sleeps to guarantee that no CPU is looking at the packet
530  *	type after return.
531  */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534 	__dev_remove_offload(po);
535 
536 	synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
539 
540 /******************************************************************************
541 
542 		      Device Boot-time Settings Routines
543 
544 *******************************************************************************/
545 
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548 
549 /**
550  *	netdev_boot_setup_add	- add new setup entry
551  *	@name: name of the device
552  *	@map: configured settings for the device
553  *
554  *	Adds a new setup entry to the dev_boot_setup list.  The function
555  *	returns 0 on error and 1 on success.  This is a generic routine
556  *	for all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560 	struct netdev_boot_setup *s;
561 	int i;
562 
563 	s = dev_boot_setup;
564 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 			memset(s[i].name, 0, sizeof(s[i].name));
567 			strlcpy(s[i].name, name, IFNAMSIZ);
568 			memcpy(&s[i].map, map, sizeof(s[i].map));
569 			break;
570 		}
571 	}
572 
573 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575 
576 /**
577  *	netdev_boot_setup_check	- check boot time settings
578  *	@dev: the netdevice
579  *
580  * 	Check boot time settings for the device.
581  *	Any settings found are applied to the device so they can be used
582  *	later during device probing.
583  *	Returns 1 if settings were found, 0 otherwise.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587 	struct netdev_boot_setup *s = dev_boot_setup;
588 	int i;
589 
590 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 		    !strcmp(dev->name, s[i].name)) {
593 			dev->irq 	= s[i].map.irq;
594 			dev->base_addr 	= s[i].map.base_addr;
595 			dev->mem_start 	= s[i].map.mem_start;
596 			dev->mem_end 	= s[i].map.mem_end;
597 			return 1;
598 		}
599 	}
600 	return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603 
604 
605 /**
606  *	netdev_boot_base	- get address from boot time settings
607  *	@prefix: prefix for network device
608  *	@unit: id for network device
609  *
610  * 	Check boot time settings for the base address of the device.
611  *	Returns the configured base address, 1 if a device with this name
612  *	is already registered (so it should not be probed), or 0 if no
613  *	settings were found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617 	const struct netdev_boot_setup *s = dev_boot_setup;
618 	char name[IFNAMSIZ];
619 	int i;
620 
621 	sprintf(name, "%s%d", prefix, unit);
622 
623 	/*
624 	 * If device already registered then return base of 1
625 	 * to indicate not to probe for this interface
626 	 */
627 	if (__dev_get_by_name(&init_net, name))
628 		return 1;
629 
630 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 		if (!strcmp(name, s[i].name))
632 			return s[i].map.base_addr;
633 	return 0;
634 }
635 
636 /*
637  * Saves the settings configured at boot time for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641 	int ints[5];
642 	struct ifmap map;
643 
644 	str = get_options(str, ARRAY_SIZE(ints), ints);
645 	if (!str || !*str)
646 		return 0;
647 
648 	/* Save settings */
649 	memset(&map, 0, sizeof(map));
650 	if (ints[0] > 0)
651 		map.irq = ints[1];
652 	if (ints[0] > 1)
653 		map.base_addr = ints[2];
654 	if (ints[0] > 2)
655 		map.mem_start = ints[3];
656 	if (ints[0] > 3)
657 		map.mem_end = ints[4];
658 
659 	/* Add new entry to the list */
660 	return netdev_boot_setup_add(str, &map);
661 }
662 
663 __setup("netdev=", netdev_boot_setup);
664 
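/*
 * Example command line usage, as parsed above (the values are illustrative):
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *	netdev=5,0x340,0,0,eth0
 */
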
665 /*******************************************************************************
666 
667 			    Device Interface Subroutines
668 
669 *******************************************************************************/
670 
671 /**
672  *	dev_get_iflink	- get 'iflink' value of an interface
673  *	@dev: targeted interface
674  *
675  *	Indicates the ifindex the interface is linked to.
676  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678 
679 int dev_get_iflink(const struct net_device *dev)
680 {
681 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 		return dev->netdev_ops->ndo_get_iflink(dev);
683 
684 	return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687 
688 /**
689  *	dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *	@dev: targeted interface
691  *	@skb: The packet.
692  *
693  *	For better visibility of tunnel traffic, OVS needs to retrieve
694  *	egress tunnel information for a packet. The following API allows
695  *	the caller to get this info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699 	struct ip_tunnel_info *info;
700 
701 	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702 		return -EINVAL;
703 
704 	info = skb_tunnel_info_unclone(skb);
705 	if (!info)
706 		return -ENOMEM;
707 	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 		return -EINVAL;
709 
710 	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713 
714 /**
715  *	__dev_get_by_name	- find a device by its name
716  *	@net: the applicable net namespace
717  *	@name: name to find
718  *
719  *	Find an interface by name. Must be called under RTNL semaphore
720  *	or @dev_base_lock. If the name is found a pointer to the device
721  *	is returned. If the name is not found then %NULL is returned. The
722  *	reference counters are not incremented so the caller must be
723  *	careful with locks.
724  */
725 
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728 	struct net_device *dev;
729 	struct hlist_head *head = dev_name_hash(net, name);
730 
731 	hlist_for_each_entry(dev, head, name_hlist)
732 		if (!strncmp(dev->name, name, IFNAMSIZ))
733 			return dev;
734 
735 	return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738 
739 /**
740  *	dev_get_by_name_rcu	- find a device by its name
741  *	@net: the applicable net namespace
742  *	@name: name to find
743  *
744  *	Find an interface by name.
745  *	If the name is found a pointer to the device is returned.
746  * 	If the name is not found then %NULL is returned.
747  *	The reference counters are not incremented so the caller must be
748  *	careful with locks. The caller must hold RCU lock.
749  */
750 
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753 	struct net_device *dev;
754 	struct hlist_head *head = dev_name_hash(net, name);
755 
756 	hlist_for_each_entry_rcu(dev, head, name_hlist)
757 		if (!strncmp(dev->name, name, IFNAMSIZ))
758 			return dev;
759 
760 	return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763 
764 /**
765  *	dev_get_by_name		- find a device by its name
766  *	@net: the applicable net namespace
767  *	@name: name to find
768  *
769  *	Find an interface by name. This can be called from any
770  *	context and does its own locking. The returned handle has
771  *	the usage count incremented and the caller must use dev_put() to
772  *	release it when it is no longer needed. %NULL is returned if no
773  *	matching device is found.
774  */
775 
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778 	struct net_device *dev;
779 
780 	rcu_read_lock();
781 	dev = dev_get_by_name_rcu(net, name);
782 	if (dev)
783 		dev_hold(dev);
784 	rcu_read_unlock();
785 	return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
788 
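/*
 * Typical usage sketch (the device name is illustrative): the reference
 * taken by dev_get_by_name() must be dropped with dev_put() when done:
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 */
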
789 /**
790  *	__dev_get_by_index - find a device by its ifindex
791  *	@net: the applicable net namespace
792  *	@ifindex: index of device
793  *
794  *	Search for an interface by index. Returns a pointer to the device,
795  *	or %NULL if it is not found. The device has not
796  *	had its reference counter increased so the caller must be careful
797  *	about locking. The caller must hold either the RTNL semaphore
798  *	or @dev_base_lock.
799  */
800 
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803 	struct net_device *dev;
804 	struct hlist_head *head = dev_index_hash(net, ifindex);
805 
806 	hlist_for_each_entry(dev, head, index_hlist)
807 		if (dev->ifindex == ifindex)
808 			return dev;
809 
810 	return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813 
814 /**
815  *	dev_get_by_index_rcu - find a device by its ifindex
816  *	@net: the applicable net namespace
817  *	@ifindex: index of device
818  *
819  *	Search for an interface by index. Returns a pointer to the device,
820  *	or %NULL if it is not found. The device has not
821  *	had its reference counter increased so the caller must be careful
822  *	about locking. The caller must hold RCU lock.
823  */
824 
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827 	struct net_device *dev;
828 	struct hlist_head *head = dev_index_hash(net, ifindex);
829 
830 	hlist_for_each_entry_rcu(dev, head, index_hlist)
831 		if (dev->ifindex == ifindex)
832 			return dev;
833 
834 	return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837 
838 
839 /**
840  *	dev_get_by_index - find a device by its ifindex
841  *	@net: the applicable net namespace
842  *	@ifindex: index of device
843  *
844  *	Search for an interface by index. Returns a pointer to the device,
845  *	or NULL if it is not found. The device returned has
846  *	had a reference added and the pointer is safe until the user calls
847  *	dev_put to indicate they have finished with it.
848  */
849 
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852 	struct net_device *dev;
853 
854 	rcu_read_lock();
855 	dev = dev_get_by_index_rcu(net, ifindex);
856 	if (dev)
857 		dev_hold(dev);
858 	rcu_read_unlock();
859 	return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862 
863 /**
864  *	netdev_get_name - get a netdevice name, knowing its ifindex.
865  *	@net: network namespace
866  *	@name: a pointer to the buffer where the name will be stored.
867  *	@ifindex: the ifindex of the interface to get the name from.
868  *
869  *	The use of raw_seqcount_begin() and cond_resched() before
870  *	retrying is required as we want to give the writers a chance
871  *	to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875 	struct net_device *dev;
876 	unsigned int seq;
877 
878 retry:
879 	seq = raw_seqcount_begin(&devnet_rename_seq);
880 	rcu_read_lock();
881 	dev = dev_get_by_index_rcu(net, ifindex);
882 	if (!dev) {
883 		rcu_read_unlock();
884 		return -ENODEV;
885 	}
886 
887 	strcpy(name, dev->name);
888 	rcu_read_unlock();
889 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 		cond_resched();
891 		goto retry;
892 	}
893 
894 	return 0;
895 }
896 
897 /**
898  *	dev_getbyhwaddr_rcu - find a device by its hardware address
899  *	@net: the applicable net namespace
900  *	@type: media type of device
901  *	@ha: hardware address
902  *
903  *	Search for an interface by MAC address. Returns a pointer to the
904  *	device, or NULL if it is not found.
905  *	The caller must hold RCU or RTNL.
906  *	The returned device has not had its ref count increased
907  *	and the caller must therefore be careful about locking.
908  *
909  */
910 
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 				       const char *ha)
913 {
914 	struct net_device *dev;
915 
916 	for_each_netdev_rcu(net, dev)
917 		if (dev->type == type &&
918 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
919 			return dev;
920 
921 	return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924 
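/*
 * Usage sketch (the addr buffer is illustrative): the lookup is done under
 * rcu_read_lock(), and the device must be dev_hold()'d if it is to be used
 * after the read-side section ends:
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, addr);
 *	if (dev)
 *		dev_hold(dev);
 *	rcu_read_unlock();
 */
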
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927 	struct net_device *dev;
928 
929 	ASSERT_RTNL();
930 	for_each_netdev(net, dev)
931 		if (dev->type == type)
932 			return dev;
933 
934 	return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937 
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940 	struct net_device *dev, *ret = NULL;
941 
942 	rcu_read_lock();
943 	for_each_netdev_rcu(net, dev)
944 		if (dev->type == type) {
945 			dev_hold(dev);
946 			ret = dev;
947 			break;
948 		}
949 	rcu_read_unlock();
950 	return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953 
954 /**
955  *	__dev_get_by_flags - find any device with given flags
956  *	@net: the applicable net namespace
957  *	@if_flags: IFF_* values
958  *	@mask: bitmask of bits in if_flags to check
959  *
960  *	Search for any interface with the given flags. Returns a pointer
961  *	to the first matching device, or NULL if none is found. Must be called inside
962  *	rtnl_lock(), and result refcount is unchanged.
963  */
964 
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 				      unsigned short mask)
967 {
968 	struct net_device *dev, *ret;
969 
970 	ASSERT_RTNL();
971 
972 	ret = NULL;
973 	for_each_netdev(net, dev) {
974 		if (((dev->flags ^ if_flags) & mask) == 0) {
975 			ret = dev;
976 			break;
977 		}
978 	}
979 	return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982 
983 /**
984  *	dev_valid_name - check if name is okay for network device
985  *	@name: name string
986  *
987  *	Network device names need to be valid file names to
988  *	allow sysfs to work.  We also disallow any kind of
989  *	whitespace.
990  */
991 bool dev_valid_name(const char *name)
992 {
993 	if (*name == '\0')
994 		return false;
995 	if (strlen(name) >= IFNAMSIZ)
996 		return false;
997 	if (!strcmp(name, ".") || !strcmp(name, ".."))
998 		return false;
999 
1000 	while (*name) {
1001 		if (*name == '/' || *name == ':' || isspace(*name))
1002 			return false;
1003 		name++;
1004 	}
1005 	return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008 
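/*
 * Illustrative results:
 *
 *	dev_valid_name("eth0")   -> true
 *	dev_valid_name("eth 0")  -> false	(whitespace)
 *	dev_valid_name("a/b")    -> false	('/')
 *	dev_valid_name("..")     -> false
 */
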
1009 /**
1010  *	__dev_alloc_name - allocate a name for a device
1011  *	@net: network namespace to allocate the device name in
1012  *	@name: name format string
1013  *	@buf:  scratch buffer and result name string
1014  *
1015  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1016  *	id. It scans the list of devices to build up a free map, then chooses
1017  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *	while allocating the name and adding the device in order to avoid
1019  *	duplicates.
1020  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *	Returns the number of the unit assigned or a negative errno code.
1022  */
1023 
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026 	int i = 0;
1027 	const char *p;
1028 	const int max_netdevices = 8*PAGE_SIZE;
1029 	unsigned long *inuse;
1030 	struct net_device *d;
1031 
1032 	p = strnchr(name, IFNAMSIZ-1, '%');
1033 	if (p) {
1034 		/*
1035 		 * Verify the string as this thing may have come from
1036 		 * the user.  There must be exactly one "%d" and no other "%"
1037 		 * characters.
1038 		 */
1039 		if (p[1] != 'd' || strchr(p + 2, '%'))
1040 			return -EINVAL;
1041 
1042 		/* Use one page as a bit array of possible slots */
1043 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 		if (!inuse)
1045 			return -ENOMEM;
1046 
1047 		for_each_netdev(net, d) {
1048 			if (!sscanf(d->name, name, &i))
1049 				continue;
1050 			if (i < 0 || i >= max_netdevices)
1051 				continue;
1052 
1053 			/*  avoid cases where sscanf is not exact inverse of printf */
1054 			snprintf(buf, IFNAMSIZ, name, i);
1055 			if (!strncmp(buf, d->name, IFNAMSIZ))
1056 				set_bit(i, inuse);
1057 		}
1058 
1059 		i = find_first_zero_bit(inuse, max_netdevices);
1060 		free_page((unsigned long) inuse);
1061 	}
1062 
1063 	if (buf != name)
1064 		snprintf(buf, IFNAMSIZ, name, i);
1065 	if (!__dev_get_by_name(net, buf))
1066 		return i;
1067 
1068 	/* It is possible to run out of possible slots
1069 	 * when the name is long and there isn't enough space left
1070 	 * for the digits, or if all bits are used.
1071 	 */
1072 	return -ENFILE;
1073 }
1074 
1075 /**
1076  *	dev_alloc_name - allocate a name for a device
1077  *	@dev: device
1078  *	@name: name format string
1079  *
1080  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1081  *	id. It scans the list of devices to build up a free map, then chooses
1082  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *	while allocating the name and adding the device in order to avoid
1084  *	duplicates.
1085  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *	Returns the number of the unit assigned or a negative errno code.
1087  */
1088 
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091 	char buf[IFNAMSIZ];
1092 	struct net *net;
1093 	int ret;
1094 
1095 	BUG_ON(!dev_net(dev));
1096 	net = dev_net(dev);
1097 	ret = __dev_alloc_name(net, name, buf);
1098 	if (ret >= 0)
1099 		strlcpy(dev->name, buf, IFNAMSIZ);
1100 	return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
1103 
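/*
 * Usage sketch: given the format string "eth%d", the lowest free unit is
 * picked, so if eth0 and eth1 already exist the call below sets dev->name
 * to "eth2" and returns 2 (the unit number):
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		return err;
 */
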
1104 static int dev_alloc_name_ns(struct net *net,
1105 			     struct net_device *dev,
1106 			     const char *name)
1107 {
1108 	char buf[IFNAMSIZ];
1109 	int ret;
1110 
1111 	ret = __dev_alloc_name(net, name, buf);
1112 	if (ret >= 0)
1113 		strlcpy(dev->name, buf, IFNAMSIZ);
1114 	return ret;
1115 }
1116 
1117 static int dev_get_valid_name(struct net *net,
1118 			      struct net_device *dev,
1119 			      const char *name)
1120 {
1121 	BUG_ON(!net);
1122 
1123 	if (!dev_valid_name(name))
1124 		return -EINVAL;
1125 
1126 	if (strchr(name, '%'))
1127 		return dev_alloc_name_ns(net, dev, name);
1128 	else if (__dev_get_by_name(net, name))
1129 		return -EEXIST;
1130 	else if (dev->name != name)
1131 		strlcpy(dev->name, name, IFNAMSIZ);
1132 
1133 	return 0;
1134 }
1135 
1136 /**
1137  *	dev_change_name - change name of a device
1138  *	@dev: device
1139  *	@newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *	Change the name of a device. A format string such as "eth%d"
1142  *	can be passed for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146 	unsigned char old_assign_type;
1147 	char oldname[IFNAMSIZ];
1148 	int err = 0;
1149 	int ret;
1150 	struct net *net;
1151 
1152 	ASSERT_RTNL();
1153 	BUG_ON(!dev_net(dev));
1154 
1155 	net = dev_net(dev);
1156 	if (dev->flags & IFF_UP)
1157 		return -EBUSY;
1158 
1159 	write_seqcount_begin(&devnet_rename_seq);
1160 
1161 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 		write_seqcount_end(&devnet_rename_seq);
1163 		return 0;
1164 	}
1165 
1166 	memcpy(oldname, dev->name, IFNAMSIZ);
1167 
1168 	err = dev_get_valid_name(net, dev, newname);
1169 	if (err < 0) {
1170 		write_seqcount_end(&devnet_rename_seq);
1171 		return err;
1172 	}
1173 
1174 	if (oldname[0] && !strchr(oldname, '%'))
1175 		netdev_info(dev, "renamed from %s\n", oldname);
1176 
1177 	old_assign_type = dev->name_assign_type;
1178 	dev->name_assign_type = NET_NAME_RENAMED;
1179 
1180 rollback:
1181 	ret = device_rename(&dev->dev, dev->name);
1182 	if (ret) {
1183 		memcpy(dev->name, oldname, IFNAMSIZ);
1184 		dev->name_assign_type = old_assign_type;
1185 		write_seqcount_end(&devnet_rename_seq);
1186 		return ret;
1187 	}
1188 
1189 	write_seqcount_end(&devnet_rename_seq);
1190 
1191 	netdev_adjacent_rename_links(dev, oldname);
1192 
1193 	write_lock_bh(&dev_base_lock);
1194 	hlist_del_rcu(&dev->name_hlist);
1195 	write_unlock_bh(&dev_base_lock);
1196 
1197 	synchronize_rcu();
1198 
1199 	write_lock_bh(&dev_base_lock);
1200 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 	write_unlock_bh(&dev_base_lock);
1202 
1203 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 	ret = notifier_to_errno(ret);
1205 
1206 	if (ret) {
1207 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208 		if (err >= 0) {
1209 			err = ret;
1210 			write_seqcount_begin(&devnet_rename_seq);
1211 			memcpy(dev->name, oldname, IFNAMSIZ);
1212 			memcpy(oldname, newname, IFNAMSIZ);
1213 			dev->name_assign_type = old_assign_type;
1214 			old_assign_type = NET_NAME_RENAMED;
1215 			goto rollback;
1216 		} else {
1217 			pr_err("%s: name change rollback failed: %d\n",
1218 			       dev->name, ret);
1219 		}
1220 	}
1221 
1222 	return err;
1223 }
1224 
1225 /**
1226  *	dev_set_alias - change ifalias of a device
1227  *	@dev: device
1228  *	@alias: name up to IFALIASZ
1229  *	@len: limit of bytes to copy from @alias
1230  *
1231  *	Set the ifalias for a device.
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235 	char *new_ifalias;
1236 
1237 	ASSERT_RTNL();
1238 
1239 	if (len >= IFALIASZ)
1240 		return -EINVAL;
1241 
1242 	if (!len) {
1243 		kfree(dev->ifalias);
1244 		dev->ifalias = NULL;
1245 		return 0;
1246 	}
1247 
1248 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 	if (!new_ifalias)
1250 		return -ENOMEM;
1251 	dev->ifalias = new_ifalias;
1252 
1253 	strlcpy(dev->ifalias, alias, len+1);
1254 	return len;
1255 }
1256 
1257 
1258 /**
1259  *	netdev_features_change - device changes features
1260  *	@dev: device to cause notification
1261  *
1262  *	Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269 
1270 /**
1271  *	netdev_state_change - device changes state
1272  *	@dev: device to cause notification
1273  *
1274  *	Called to indicate a device has changed state. This function calls
1275  *	the notifier chains for netdev_chain and sends a NEWLINK message
1276  *	to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280 	if (dev->flags & IFF_UP) {
1281 		struct netdev_notifier_change_info change_info;
1282 
1283 		change_info.flags_changed = 0;
1284 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 					      &change_info.info);
1286 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 	}
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290 
1291 /**
1292  * 	netdev_notify_peers - notify network peers about existence of @dev
1293  * 	@dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303 	rtnl_lock();
1304 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 	rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308 
1309 static int __dev_open(struct net_device *dev)
1310 {
1311 	const struct net_device_ops *ops = dev->netdev_ops;
1312 	int ret;
1313 
1314 	ASSERT_RTNL();
1315 
1316 	if (!netif_device_present(dev))
1317 		return -ENODEV;
1318 
1319 	/* Block netpoll from trying to do any rx path servicing.
1320 	 * If we don't do this there is a chance ndo_poll_controller
1321 	 * or ndo_poll may be running while we open the device
1322 	 */
1323 	netpoll_poll_disable(dev);
1324 
1325 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 	ret = notifier_to_errno(ret);
1327 	if (ret)
1328 		return ret;
1329 
1330 	set_bit(__LINK_STATE_START, &dev->state);
1331 
1332 	if (ops->ndo_validate_addr)
1333 		ret = ops->ndo_validate_addr(dev);
1334 
1335 	if (!ret && ops->ndo_open)
1336 		ret = ops->ndo_open(dev);
1337 
1338 	netpoll_poll_enable(dev);
1339 
1340 	if (ret)
1341 		clear_bit(__LINK_STATE_START, &dev->state);
1342 	else {
1343 		dev->flags |= IFF_UP;
1344 		dev_set_rx_mode(dev);
1345 		dev_activate(dev);
1346 		add_device_randomness(dev->dev_addr, dev->addr_len);
1347 	}
1348 
1349 	return ret;
1350 }
1351 
1352 /**
1353  *	dev_open	- prepare an interface for use.
1354  *	@dev:	device to open
1355  *
1356  *	Takes a device from down to up state. The device's private open
1357  *	function is invoked and then the multicast lists are loaded. Finally
1358  *	the device is moved into the up state and a %NETDEV_UP message is
1359  *	sent to the netdev notifier chain.
1360  *
1361  *	Calling this function on an active interface is a nop. On a failure
1362  *	a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366 	int ret;
1367 
1368 	if (dev->flags & IFF_UP)
1369 		return 0;
1370 
1371 	ret = __dev_open(dev);
1372 	if (ret < 0)
1373 		return ret;
1374 
1375 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 	call_netdevice_notifiers(NETDEV_UP, dev);
1377 
1378 	return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381 
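/*
 * Usage sketch: callers must hold the RTNL semaphore (see the ASSERT_RTNL()
 * in __dev_open() above):
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */
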
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384 	struct net_device *dev;
1385 
1386 	ASSERT_RTNL();
1387 	might_sleep();
1388 
1389 	list_for_each_entry(dev, head, close_list) {
1390 		/* Temporarily disable netpoll until the interface is down */
1391 		netpoll_poll_disable(dev);
1392 
1393 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394 
1395 		clear_bit(__LINK_STATE_START, &dev->state);
1396 
1397 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398 		 * can be even on different cpu. So just clear netif_running().
1399 		 *
1400 		 * dev->stop() will invoke napi_disable() on all of its
1401 		 * napi_struct instances on this device.
1402 		 */
1403 		smp_mb__after_atomic(); /* Commit netif_running(). */
1404 	}
1405 
1406 	dev_deactivate_many(head);
1407 
1408 	list_for_each_entry(dev, head, close_list) {
1409 		const struct net_device_ops *ops = dev->netdev_ops;
1410 
1411 		/*
1412 		 *	Call the device specific close. This cannot fail.
1413 		 *	Only if device is UP
1414 		 *
1415 		 *	We allow it to be called even after a DETACH hot-plug
1416 		 *	event.
1417 		 */
1418 		if (ops->ndo_stop)
1419 			ops->ndo_stop(dev);
1420 
1421 		dev->flags &= ~IFF_UP;
1422 		netpoll_poll_enable(dev);
1423 	}
1424 
1425 	return 0;
1426 }
1427 
1428 static int __dev_close(struct net_device *dev)
1429 {
1430 	int retval;
1431 	LIST_HEAD(single);
1432 
1433 	list_add(&dev->close_list, &single);
1434 	retval = __dev_close_many(&single);
1435 	list_del(&single);
1436 
1437 	return retval;
1438 }
1439 
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442 	struct net_device *dev, *tmp;
1443 
1444 	/* Remove the devices that don't need to be closed */
1445 	list_for_each_entry_safe(dev, tmp, head, close_list)
1446 		if (!(dev->flags & IFF_UP))
1447 			list_del_init(&dev->close_list);
1448 
1449 	__dev_close_many(head);
1450 
1451 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 		if (unlink)
1455 			list_del_init(&dev->close_list);
1456 	}
1457 
1458 	return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461 
1462 /**
1463  *	dev_close - shutdown an interface.
1464  *	@dev: device to shutdown
1465  *
1466  *	This function moves an active device into down state. A
1467  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *	chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473 	if (dev->flags & IFF_UP) {
1474 		LIST_HEAD(single);
1475 
1476 		list_add(&dev->close_list, &single);
1477 		dev_close_many(&single, true);
1478 		list_del(&single);
1479 	}
1480 	return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483 
1484 
1485 /**
1486  *	dev_disable_lro - disable Large Receive Offload on a device
1487  *	@dev: device
1488  *
1489  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *	called under RTNL.  This is needed if received packets may be
1491  *	forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495 	struct net_device *lower_dev;
1496 	struct list_head *iter;
1497 
1498 	dev->wanted_features &= ~NETIF_F_LRO;
1499 	netdev_update_features(dev);
1500 
1501 	if (unlikely(dev->features & NETIF_F_LRO))
1502 		netdev_WARN(dev, "failed to disable LRO!\n");
1503 
1504 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 		dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508 
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 				   struct net_device *dev)
1511 {
1512 	struct netdev_notifier_info info;
1513 
1514 	netdev_notifier_info_init(&info, dev);
1515 	return nb->notifier_call(nb, val, &info);
1516 }
1517 
1518 static int dev_boot_phase = 1;
1519 
1520 /**
1521  *	register_netdevice_notifier - register a network notifier block
1522  *	@nb: notifier
1523  *
1524  *	Register a notifier to be called when network device events occur.
1525  *	The notifier passed is linked into the kernel structures and must
1526  *	not be reused until it has been unregistered. A negative errno code
1527  *	is returned on a failure.
1528  *
1529  * 	When registered, all registration and up events are replayed
1530  *	to the new notifier to give it a race-free
1531  *	view of the network device list.
1532  */
1533 
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536 	struct net_device *dev;
1537 	struct net_device *last;
1538 	struct net *net;
1539 	int err;
1540 
1541 	rtnl_lock();
1542 	err = raw_notifier_chain_register(&netdev_chain, nb);
1543 	if (err)
1544 		goto unlock;
1545 	if (dev_boot_phase)
1546 		goto unlock;
1547 	for_each_net(net) {
1548 		for_each_netdev(net, dev) {
1549 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 			err = notifier_to_errno(err);
1551 			if (err)
1552 				goto rollback;
1553 
1554 			if (!(dev->flags & IFF_UP))
1555 				continue;
1556 
1557 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 		}
1559 	}
1560 
1561 unlock:
1562 	rtnl_unlock();
1563 	return err;
1564 
1565 rollback:
1566 	last = dev;
1567 	for_each_net(net) {
1568 		for_each_netdev(net, dev) {
1569 			if (dev == last)
1570 				goto outroll;
1571 
1572 			if (dev->flags & IFF_UP) {
1573 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 							dev);
1575 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 			}
1577 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 		}
1579 	}
1580 
1581 outroll:
1582 	raw_notifier_chain_unregister(&netdev_chain, nb);
1583 	goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586 
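/*
 * Illustrative sketch of a notifier; the example_* names are hypothetical:
 *
 *	static int example_event(struct notifier_block *nb,
 *				 unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			...
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 */
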
1587 /**
1588  *	unregister_netdevice_notifier - unregister a network notifier block
1589  *	@nb: notifier
1590  *
1591  *	Unregister a notifier previously registered by
1592  *	register_netdevice_notifier(). The notifier is unlinked from the
1593  *	kernel structures and may then be reused. A negative errno code
1594  *	is returned on a failure.
1595  *
1596  * 	After unregistering unregister and down device events are synthesized
1597  *	for all devices on the device list to the removed notifier to remove
1598  *	the need for special case cleanup code.
1599  */
1600 
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603 	struct net_device *dev;
1604 	struct net *net;
1605 	int err;
1606 
1607 	rtnl_lock();
1608 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 	if (err)
1610 		goto unlock;
1611 
1612 	for_each_net(net) {
1613 		for_each_netdev(net, dev) {
1614 			if (dev->flags & IFF_UP) {
1615 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 							dev);
1617 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 			}
1619 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 		}
1621 	}
1622 unlock:
1623 	rtnl_unlock();
1624 	return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627 
1628 /**
1629  *	call_netdevice_notifiers_info - call all network notifier blocks
1630  *	@val: value passed unmodified to notifier function
1631  *	@dev: net_device pointer passed unmodified to notifier function
1632  *	@info: notifier information data
1633  *
1634  *	Call all network notifier blocks.  Parameters and return value
1635  *	are as for raw_notifier_call_chain().
1636  */
1637 
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639 					 struct net_device *dev,
1640 					 struct netdev_notifier_info *info)
1641 {
1642 	ASSERT_RTNL();
1643 	netdev_notifier_info_init(info, dev);
1644 	return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646 
1647 /**
1648  *	call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *	Call all network notifier blocks.  Parameters and return value
1653  *	are as for raw_notifier_call_chain().
1654  */
1655 
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658 	struct netdev_notifier_info info;
1659 
1660 	return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663 
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666 
1667 void net_inc_ingress_queue(void)
1668 {
1669 	static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672 
1673 void net_dec_ingress_queue(void)
1674 {
1675 	static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679 
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682 
1683 void net_inc_egress_queue(void)
1684 {
1685 	static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688 
1689 void net_dec_egress_queue(void)
1690 {
1691 	static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695 
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704 
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709 
1710 	if (deferred) {
1711 		while (--deferred)
1712 			static_key_slow_dec(&netstamp_needed);
1713 		return;
1714 	}
1715 #endif
1716 	static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719 
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723 	if (in_interrupt()) {
1724 		atomic_inc(&netstamp_needed_deferred);
1725 		return;
1726 	}
1727 #endif
1728 	static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731 
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734 	skb->tstamp = 0;
1735 	if (static_key_false(&netstamp_needed))
1736 		__net_timestamp(skb);
1737 }
1738 
1739 #define net_timestamp_check(COND, SKB)			\
1740 	if (static_key_false(&netstamp_needed)) {		\
1741 		if ((COND) && !(SKB)->tstamp)	\
1742 			__net_timestamp(SKB);		\
1743 	}						\
1744 
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747 	unsigned int len;
1748 
1749 	if (!(dev->flags & IFF_UP))
1750 		return false;
1751 
1752 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753 	if (skb->len <= len)
1754 		return true;
1755 
1756 	/* if TSO is enabled, we don't care about the length as the packet
1757 	 * could be forwarded without being segmented before
1758 	 */
1759 	if (skb_is_gso(skb))
1760 		return true;
1761 
1762 	return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765 
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768 	int ret = ____dev_forward_skb(dev, skb);
1769 
1770 	if (likely(!ret)) {
1771 		skb->protocol = eth_type_trans(skb, dev);
1772 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1773 	}
1774 
1775 	return ret;
1776 }
1777 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1778 
1779 /**
1780  * dev_forward_skb - loopback an skb to another netif
1781  *
1782  * @dev: destination network device
1783  * @skb: buffer to forward
1784  *
1785  * return values:
1786  *	NET_RX_SUCCESS	(no congestion)
1787  *	NET_RX_DROP     (packet was dropped, but freed)
1788  *
1789  * dev_forward_skb can be used for injecting an skb from the
1790  * start_xmit function of one device into the receive queue
1791  * of another device.
1792  *
1793  * The receiving device may be in another namespace, so
1794  * we have to clear all information in the skb that could
1795  * impact namespace isolation.
1796  */
1797 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1798 {
1799 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1800 }
1801 EXPORT_SYMBOL_GPL(dev_forward_skb);
1802 
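/*
 * Usage sketch from a hypothetical virtual driver's ndo_start_xmit,
 * handing the frame to a peer device ("peer" is illustrative):
 *
 *	if (likely(dev_forward_skb(peer, skb) == NET_RX_SUCCESS))
 *		... account the packet as transmitted ...
 */
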
1803 static inline int deliver_skb(struct sk_buff *skb,
1804 			      struct packet_type *pt_prev,
1805 			      struct net_device *orig_dev)
1806 {
1807 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1808 		return -ENOMEM;
1809 	atomic_inc(&skb->users);
1810 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1811 }
1812 
1813 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1814 					  struct packet_type **pt,
1815 					  struct net_device *orig_dev,
1816 					  __be16 type,
1817 					  struct list_head *ptype_list)
1818 {
1819 	struct packet_type *ptype, *pt_prev = *pt;
1820 
1821 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1822 		if (ptype->type != type)
1823 			continue;
1824 		if (pt_prev)
1825 			deliver_skb(skb, pt_prev, orig_dev);
1826 		pt_prev = ptype;
1827 	}
1828 	*pt = pt_prev;
1829 }
1830 
1831 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1832 {
1833 	if (!ptype->af_packet_priv || !skb->sk)
1834 		return false;
1835 
1836 	if (ptype->id_match)
1837 		return ptype->id_match(ptype, skb->sk);
1838 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1839 		return true;
1840 
1841 	return false;
1842 }
1843 
1844 /*
1845  *	Support routine. Sends outgoing frames to any network
1846  *	taps currently in use.
1847  */
1848 
1849 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1850 {
1851 	struct packet_type *ptype;
1852 	struct sk_buff *skb2 = NULL;
1853 	struct packet_type *pt_prev = NULL;
1854 	struct list_head *ptype_list = &ptype_all;
1855 
1856 	rcu_read_lock();
1857 again:
1858 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1859 		/* Never send packets back to the socket
1860 		 * they originated from - MvS ([email protected])
1861 		 */
1862 		if (skb_loop_sk(ptype, skb))
1863 			continue;
1864 
1865 		if (pt_prev) {
1866 			deliver_skb(skb2, pt_prev, skb->dev);
1867 			pt_prev = ptype;
1868 			continue;
1869 		}
1870 
1871 		/* need to clone skb, done only once */
1872 		skb2 = skb_clone(skb, GFP_ATOMIC);
1873 		if (!skb2)
1874 			goto out_unlock;
1875 
1876 		net_timestamp_set(skb2);
1877 
1878 		/* skb->nh should be correctly
1879 		 * set by sender, so that the second statement is
1880 		 * just protection against buggy protocols.
1881 		 */
1882 		skb_reset_mac_header(skb2);
1883 
1884 		if (skb_network_header(skb2) < skb2->data ||
1885 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1886 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1887 					     ntohs(skb2->protocol),
1888 					     dev->name);
1889 			skb_reset_network_header(skb2);
1890 		}
1891 
1892 		skb2->transport_header = skb2->network_header;
1893 		skb2->pkt_type = PACKET_OUTGOING;
1894 		pt_prev = ptype;
1895 	}
1896 
1897 	if (ptype_list == &ptype_all) {
1898 		ptype_list = &dev->ptype_all;
1899 		goto again;
1900 	}
1901 out_unlock:
1902 	if (pt_prev)
1903 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1904 	rcu_read_unlock();
1905 }
1906 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1907 
1908 /**
1909  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1910  * @dev: Network device
1911  * @txq: number of queues available
1912  *
1913  * If real_num_tx_queues is changed, the tc mappings may no longer be
1914  * valid. To resolve this, verify that each tc mapping remains valid and,
1915  * if not, null the mapping. With no priorities mapping to this
1916  * offset/count pair it will no longer be used. In the worst case, if
1917  * TC0 is invalid, nothing can be done, so priority mappings are disabled.
1918  * It is expected that drivers will fix this mapping if they can before
1919  * calling netif_set_real_num_tx_queues.
1920  */
1921 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1922 {
1923 	int i;
1924 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1925 
1926 	/* If TC0 is invalidated disable TC mapping */
1927 	if (tc->offset + tc->count > txq) {
1928 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1929 		dev->num_tc = 0;
1930 		return;
1931 	}
1932 
1933 	/* Invalidated prio to tc mappings set to TC0 */
1934 	for (i = 1; i < TC_BITMASK + 1; i++) {
1935 		int q = netdev_get_prio_tc_map(dev, i);
1936 
1937 		tc = &dev->tc_to_txq[q];
1938 		if (tc->offset + tc->count > txq) {
1939 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1940 				i, q);
1941 			netdev_set_prio_tc_map(dev, i, 0);
1942 		}
1943 	}
1944 }
1945 
1946 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1947 {
1948 	if (dev->num_tc) {
1949 		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1950 		int i;
1951 
1952 		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1953 			if ((txq - tc->offset) < tc->count)
1954 				return i;
1955 		}
1956 
1957 		return -1;
1958 	}
1959 
1960 	return 0;
1961 }
1962 
1963 #ifdef CONFIG_XPS
1964 static DEFINE_MUTEX(xps_map_mutex);
1965 #define xmap_dereference(P)		\
1966 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1967 
1968 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1969 			     int tci, u16 index)
1970 {
1971 	struct xps_map *map = NULL;
1972 	int pos;
1973 
1974 	if (dev_maps)
1975 		map = xmap_dereference(dev_maps->cpu_map[tci]);
1976 	if (!map)
1977 		return false;
1978 
1979 	for (pos = map->len; pos--;) {
1980 		if (map->queues[pos] != index)
1981 			continue;
1982 
1983 		if (map->len > 1) {
1984 			map->queues[pos] = map->queues[--map->len];
1985 			break;
1986 		}
1987 
1988 		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1989 		kfree_rcu(map, rcu);
1990 		return false;
1991 	}
1992 
1993 	return true;
1994 }
1995 
1996 static bool remove_xps_queue_cpu(struct net_device *dev,
1997 				 struct xps_dev_maps *dev_maps,
1998 				 int cpu, u16 offset, u16 count)
1999 {
2000 	int num_tc = dev->num_tc ? : 1;
2001 	bool active = false;
2002 	int tci;
2003 
2004 	for (tci = cpu * num_tc; num_tc--; tci++) {
2005 		int i, j;
2006 
2007 		for (i = count, j = offset; i--; j++) {
2008 			if (!remove_xps_queue(dev_maps, cpu, j))
2009 				break;
2010 		}
2011 
2012 		active |= i < 0;
2013 	}
2014 
2015 	return active;
2016 }
2017 
2018 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2019 				   u16 count)
2020 {
2021 	struct xps_dev_maps *dev_maps;
2022 	int cpu, i;
2023 	bool active = false;
2024 
2025 	mutex_lock(&xps_map_mutex);
2026 	dev_maps = xmap_dereference(dev->xps_maps);
2027 
2028 	if (!dev_maps)
2029 		goto out_no_maps;
2030 
2031 	for_each_possible_cpu(cpu)
2032 		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2033 					       offset, count);
2034 
2035 	if (!active) {
2036 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2037 		kfree_rcu(dev_maps, rcu);
2038 	}
2039 
2040 	for (i = offset + (count - 1); count--; i--)
2041 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2042 					     NUMA_NO_NODE);
2043 
2044 out_no_maps:
2045 	mutex_unlock(&xps_map_mutex);
2046 }
2047 
2048 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2049 {
2050 	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2051 }
2052 
2053 static struct xps_map *expand_xps_map(struct xps_map *map,
2054 				      int cpu, u16 index)
2055 {
2056 	struct xps_map *new_map;
2057 	int alloc_len = XPS_MIN_MAP_ALLOC;
2058 	int i, pos;
2059 
2060 	for (pos = 0; map && pos < map->len; pos++) {
2061 		if (map->queues[pos] != index)
2062 			continue;
2063 		return map;
2064 	}
2065 
2066 	/* Need to add queue to this CPU's existing map */
2067 	if (map) {
2068 		if (pos < map->alloc_len)
2069 			return map;
2070 
2071 		alloc_len = map->alloc_len * 2;
2072 	}
2073 
2074 	/* Need to allocate new map to store queue on this CPU's map */
2075 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2076 			       cpu_to_node(cpu));
2077 	if (!new_map)
2078 		return NULL;
2079 
2080 	for (i = 0; i < pos; i++)
2081 		new_map->queues[i] = map->queues[i];
2082 	new_map->alloc_len = alloc_len;
2083 	new_map->len = pos;
2084 
2085 	return new_map;
2086 }
2087 
2088 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2089 			u16 index)
2090 {
2091 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2092 	int i, cpu, tci, numa_node_id = -2;
2093 	int maps_sz, num_tc = 1, tc = 0;
2094 	struct xps_map *map, *new_map;
2095 	bool active = false;
2096 
2097 	if (dev->num_tc) {
2098 		num_tc = dev->num_tc;
2099 		tc = netdev_txq_to_tc(dev, index);
2100 		if (tc < 0)
2101 			return -EINVAL;
2102 	}
2103 
2104 	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2105 	if (maps_sz < L1_CACHE_BYTES)
2106 		maps_sz = L1_CACHE_BYTES;
2107 
2108 	mutex_lock(&xps_map_mutex);
2109 
2110 	dev_maps = xmap_dereference(dev->xps_maps);
2111 
2112 	/* allocate memory for queue storage */
2113 	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2114 		if (!new_dev_maps)
2115 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2116 		if (!new_dev_maps) {
2117 			mutex_unlock(&xps_map_mutex);
2118 			return -ENOMEM;
2119 		}
2120 
2121 		tci = cpu * num_tc + tc;
2122 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2123 				 NULL;
2124 
2125 		map = expand_xps_map(map, cpu, index);
2126 		if (!map)
2127 			goto error;
2128 
2129 		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2130 	}
2131 
2132 	if (!new_dev_maps)
2133 		goto out_no_new_maps;
2134 
2135 	for_each_possible_cpu(cpu) {
2136 		/* copy maps belonging to foreign traffic classes */
2137 		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2138 			/* fill in the new device map from the old device map */
2139 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2140 			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2141 		}
2142 
2143 		/* We need to explicitly update tci as the previous loop
2144 		 * could break out early if dev_maps is NULL.
2145 		 */
2146 		tci = cpu * num_tc + tc;
2147 
2148 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2149 			/* add queue to CPU maps */
2150 			int pos = 0;
2151 
2152 			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2153 			while ((pos < map->len) && (map->queues[pos] != index))
2154 				pos++;
2155 
2156 			if (pos == map->len)
2157 				map->queues[map->len++] = index;
2158 #ifdef CONFIG_NUMA
2159 			if (numa_node_id == -2)
2160 				numa_node_id = cpu_to_node(cpu);
2161 			else if (numa_node_id != cpu_to_node(cpu))
2162 				numa_node_id = -1;
2163 #endif
2164 		} else if (dev_maps) {
2165 			/* fill in the new device map from the old device map */
2166 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2167 			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2168 		}
2169 
2170 		/* copy maps belonging to foreign traffic classes */
2171 		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2172 			/* fill in the new device map from the old device map */
2173 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2174 			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2175 		}
2176 	}
2177 
2178 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2179 
2180 	/* Cleanup old maps */
2181 	if (!dev_maps)
2182 		goto out_no_old_maps;
2183 
2184 	for_each_possible_cpu(cpu) {
2185 		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2186 			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2187 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2188 			if (map && map != new_map)
2189 				kfree_rcu(map, rcu);
2190 		}
2191 	}
2192 
2193 	kfree_rcu(dev_maps, rcu);
2194 
2195 out_no_old_maps:
2196 	dev_maps = new_dev_maps;
2197 	active = true;
2198 
2199 out_no_new_maps:
2200 	/* update Tx queue numa node */
2201 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2202 				     (numa_node_id >= 0) ? numa_node_id :
2203 				     NUMA_NO_NODE);
2204 
2205 	if (!dev_maps)
2206 		goto out_no_maps;
2207 
2208 	/* removes queue from unused CPUs */
2209 	for_each_possible_cpu(cpu) {
2210 		for (i = tc, tci = cpu * num_tc; i--; tci++)
2211 			active |= remove_xps_queue(dev_maps, tci, index);
2212 		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2213 			active |= remove_xps_queue(dev_maps, tci, index);
2214 		for (i = num_tc - tc, tci++; --i; tci++)
2215 			active |= remove_xps_queue(dev_maps, tci, index);
2216 	}
2217 
2218 	/* free map if not active */
2219 	if (!active) {
2220 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2221 		kfree_rcu(dev_maps, rcu);
2222 	}
2223 
2224 out_no_maps:
2225 	mutex_unlock(&xps_map_mutex);
2226 
2227 	return 0;
2228 error:
2229 	/* remove any maps that we added */
2230 	for_each_possible_cpu(cpu) {
2231 		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2232 			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2233 			map = dev_maps ?
2234 			      xmap_dereference(dev_maps->cpu_map[tci]) :
2235 			      NULL;
2236 			if (new_map && new_map != map)
2237 				kfree(new_map);
2238 		}
2239 	}
2240 
2241 	mutex_unlock(&xps_map_mutex);
2242 
2243 	kfree(new_dev_maps);
2244 	return -ENOMEM;
2245 }
2246 EXPORT_SYMBOL(netif_set_xps_queue);
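
/*
 * Illustrative sketch, not kernel code: a multiqueue driver with one Tx queue
 * per CPU might establish a 1:1 XPS mapping at probe time roughly like this
 * (error handling elided; "my_dev" is a hypothetical net_device pointer).
 *
 *	int cpu;
 *
 *	for_each_online_cpu(cpu)
 *		netif_set_xps_queue(my_dev, cpumask_of(cpu), cpu);
 *
 * Each call associates Tx queue index "cpu" with the single-CPU mask for that
 * CPU; user space can later override the mapping via
 * /sys/class/net/<iface>/queues/tx-<n>/xps_cpus.
 */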
2247 
2248 #endif
2249 void netdev_reset_tc(struct net_device *dev)
2250 {
2251 #ifdef CONFIG_XPS
2252 	netif_reset_xps_queues_gt(dev, 0);
2253 #endif
2254 	dev->num_tc = 0;
2255 	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2256 	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2257 }
2258 EXPORT_SYMBOL(netdev_reset_tc);
2259 
2260 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2261 {
2262 	if (tc >= dev->num_tc)
2263 		return -EINVAL;
2264 
2265 #ifdef CONFIG_XPS
2266 	netif_reset_xps_queues(dev, offset, count);
2267 #endif
2268 	dev->tc_to_txq[tc].count = count;
2269 	dev->tc_to_txq[tc].offset = offset;
2270 	return 0;
2271 }
2272 EXPORT_SYMBOL(netdev_set_tc_queue);
2273 
2274 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2275 {
2276 	if (num_tc > TC_MAX_QUEUE)
2277 		return -EINVAL;
2278 
2279 #ifdef CONFIG_XPS
2280 	netif_reset_xps_queues_gt(dev, 0);
2281 #endif
2282 	dev->num_tc = num_tc;
2283 	return 0;
2284 }
2285 EXPORT_SYMBOL(netdev_set_num_tc);
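
/*
 * Illustrative sketch, not kernel code: a driver exposing, say, 4 traffic
 * classes over 8 Tx queues (2 queues per class) could set up the tc_to_txq
 * table with the two helpers above ("my_dev" is a hypothetical pointer and
 * the 4/8 split is only an example).
 *
 *	u8 tc;
 *
 *	netdev_set_num_tc(my_dev, 4);
 *	for (tc = 0; tc < 4; tc++)
 *		netdev_set_tc_queue(my_dev, tc, 2, tc * 2);
 *
 * i.e. TC0 uses queues 0-1, TC1 uses 2-3, and so on; note that
 * netdev_set_tc_queue() takes (dev, tc, count, offset).
 */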
2286 
2287 /*
2288  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2289  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2290  */
2291 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2292 {
2293 	int rc;
2294 
2295 	if (txq < 1 || txq > dev->num_tx_queues)
2296 		return -EINVAL;
2297 
2298 	if (dev->reg_state == NETREG_REGISTERED ||
2299 	    dev->reg_state == NETREG_UNREGISTERING) {
2300 		ASSERT_RTNL();
2301 
2302 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2303 						  txq);
2304 		if (rc)
2305 			return rc;
2306 
2307 		if (dev->num_tc)
2308 			netif_setup_tc(dev, txq);
2309 
2310 		if (txq < dev->real_num_tx_queues) {
2311 			qdisc_reset_all_tx_gt(dev, txq);
2312 #ifdef CONFIG_XPS
2313 			netif_reset_xps_queues_gt(dev, txq);
2314 #endif
2315 		}
2316 	}
2317 
2318 	dev->real_num_tx_queues = txq;
2319 	return 0;
2320 }
2321 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
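
/*
 * Illustrative sketch, not kernel code: a driver that shrinks its active
 * queue set at runtime (e.g. after a channel reconfiguration) must hold the
 * RTNL lock when the device is already registered, as the function above
 * asserts.  "my_dev", "new_txq" and "new_rxq" are hypothetical names.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(my_dev, new_txq);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(my_dev, new_rxq);
 *	rtnl_unlock();
 *
 * Reducing txq below the current value also resets any stale qdisc state and
 * XPS maps for the removed queues, as the code above shows.
 */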
2322 
2323 #ifdef CONFIG_SYSFS
2324 /**
2325  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2326  *	@dev: Network device
2327  *	@rxq: Actual number of RX queues
2328  *
2329  *	This must be called either with the rtnl_lock held or before
2330  *	registration of the net device.  Returns 0 on success, or a
2331  *	negative error code.  If called before registration, it always
2332  *	succeeds.
2333  */
2334 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2335 {
2336 	int rc;
2337 
2338 	if (rxq < 1 || rxq > dev->num_rx_queues)
2339 		return -EINVAL;
2340 
2341 	if (dev->reg_state == NETREG_REGISTERED) {
2342 		ASSERT_RTNL();
2343 
2344 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2345 						  rxq);
2346 		if (rc)
2347 			return rc;
2348 	}
2349 
2350 	dev->real_num_rx_queues = rxq;
2351 	return 0;
2352 }
2353 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2354 #endif
2355 
2356 /**
2357  * netif_get_num_default_rss_queues - default number of RSS queues
2358  *
2359  * This routine should set an upper limit on the number of RSS queues
2360  * used by default by multiqueue devices.
2361  */
2362 int netif_get_num_default_rss_queues(void)
2363 {
2364 	return is_kdump_kernel() ?
2365 		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2366 }
2367 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2368 
2369 static void __netif_reschedule(struct Qdisc *q)
2370 {
2371 	struct softnet_data *sd;
2372 	unsigned long flags;
2373 
2374 	local_irq_save(flags);
2375 	sd = this_cpu_ptr(&softnet_data);
2376 	q->next_sched = NULL;
2377 	*sd->output_queue_tailp = q;
2378 	sd->output_queue_tailp = &q->next_sched;
2379 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2380 	local_irq_restore(flags);
2381 }
2382 
2383 void __netif_schedule(struct Qdisc *q)
2384 {
2385 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2386 		__netif_reschedule(q);
2387 }
2388 EXPORT_SYMBOL(__netif_schedule);
2389 
2390 struct dev_kfree_skb_cb {
2391 	enum skb_free_reason reason;
2392 };
2393 
2394 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2395 {
2396 	return (struct dev_kfree_skb_cb *)skb->cb;
2397 }
2398 
2399 void netif_schedule_queue(struct netdev_queue *txq)
2400 {
2401 	rcu_read_lock();
2402 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2403 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2404 
2405 		__netif_schedule(q);
2406 	}
2407 	rcu_read_unlock();
2408 }
2409 EXPORT_SYMBOL(netif_schedule_queue);
2410 
2411 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2412 {
2413 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2414 		struct Qdisc *q;
2415 
2416 		rcu_read_lock();
2417 		q = rcu_dereference(dev_queue->qdisc);
2418 		__netif_schedule(q);
2419 		rcu_read_unlock();
2420 	}
2421 }
2422 EXPORT_SYMBOL(netif_tx_wake_queue);
2423 
2424 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2425 {
2426 	unsigned long flags;
2427 
2428 	if (likely(atomic_read(&skb->users) == 1)) {
2429 		smp_rmb();
2430 		atomic_set(&skb->users, 0);
2431 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2432 		return;
2433 	}
2434 	get_kfree_skb_cb(skb)->reason = reason;
2435 	local_irq_save(flags);
2436 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2437 	__this_cpu_write(softnet_data.completion_queue, skb);
2438 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2439 	local_irq_restore(flags);
2440 }
2441 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2442 
2443 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2444 {
2445 	if (in_irq() || irqs_disabled())
2446 		__dev_kfree_skb_irq(skb, reason);
2447 	else
2448 		dev_kfree_skb(skb);
2449 }
2450 EXPORT_SYMBOL(__dev_kfree_skb_any);
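
/*
 * Illustrative sketch, not kernel code: __dev_kfree_skb_any() is normally
 * reached through the dev_kfree_skb_any()/dev_consume_skb_any() wrappers, so
 * a Tx-completion handler that may run in either hard-IRQ or process context
 * can free buffers without worrying about the calling context
 * ("my_tx_ring_has_completed" and "my_tx_ring_pop" are hypothetical driver
 * helpers).
 *
 *	while (my_tx_ring_has_completed(my_tx_ring)) {
 *		struct sk_buff *skb = my_tx_ring_pop(my_tx_ring);
 *
 *		dev_consume_skb_any(skb);
 *	}
 *
 * The _irq variant defers the actual free to the NET_TX softirq via the
 * per-CPU completion_queue, which the softirq handler later in this file
 * drains.
 */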
2451 
2452 
2453 /**
2454  * netif_device_detach - mark device as removed
2455  * @dev: network device
2456  *
2457  * Mark device as removed from system and therefore no longer available.
2458  */
2459 void netif_device_detach(struct net_device *dev)
2460 {
2461 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2462 	    netif_running(dev)) {
2463 		netif_tx_stop_all_queues(dev);
2464 	}
2465 }
2466 EXPORT_SYMBOL(netif_device_detach);
2467 
2468 /**
2469  * netif_device_attach - mark device as attached
2470  * @dev: network device
2471  *
2472  * Mark device as attached from system and restart if needed.
2473  */
2474 void netif_device_attach(struct net_device *dev)
2475 {
2476 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2477 	    netif_running(dev)) {
2478 		netif_tx_wake_all_queues(dev);
2479 		__netdev_watchdog_up(dev);
2480 	}
2481 }
2482 EXPORT_SYMBOL(netif_device_attach);
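
/*
 * Illustrative sketch, not kernel code: a typical PM pairing of the two
 * helpers above in a driver's suspend/resume callbacks ("my_suspend_hw" and
 * "my_resume_hw" stand in for hypothetical hardware-specific routines).
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		my_suspend_hw(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		my_resume_hw(dev);
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */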
2483 
2484 /*
2485  * Returns a Tx hash based on the given packet descriptor and the number of
2486  * Tx queues to be used as a distribution range.
2487  */
2488 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2489 		  unsigned int num_tx_queues)
2490 {
2491 	u32 hash;
2492 	u16 qoffset = 0;
2493 	u16 qcount = num_tx_queues;
2494 
2495 	if (skb_rx_queue_recorded(skb)) {
2496 		hash = skb_get_rx_queue(skb);
2497 		while (unlikely(hash >= num_tx_queues))
2498 			hash -= num_tx_queues;
2499 		return hash;
2500 	}
2501 
2502 	if (dev->num_tc) {
2503 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2504 		qoffset = dev->tc_to_txq[tc].offset;
2505 		qcount = dev->tc_to_txq[tc].count;
2506 	}
2507 
2508 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2509 }
2510 EXPORT_SYMBOL(__skb_tx_hash);
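
/*
 * For reference, reciprocal_scale(h, qcount) maps the 32-bit flow hash h
 * into [0, qcount) as (u32)(((u64)h * qcount) >> 32).  For example, with
 * qcount = 4 and qoffset = 8 (a hypothetical traffic class owning queues
 * 8-11), a hash of 0x80000000 scales to 2 and the function returns queue 10.
 */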
2511 
2512 static void skb_warn_bad_offload(const struct sk_buff *skb)
2513 {
2514 	static const netdev_features_t null_features;
2515 	struct net_device *dev = skb->dev;
2516 	const char *name = "";
2517 
2518 	if (!net_ratelimit())
2519 		return;
2520 
2521 	if (dev) {
2522 		if (dev->dev.parent)
2523 			name = dev_driver_string(dev->dev.parent);
2524 		else
2525 			name = netdev_name(dev);
2526 	}
2527 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2528 	     "gso_type=%d ip_summed=%d\n",
2529 	     name, dev ? &dev->features : &null_features,
2530 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2531 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2532 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2533 }
2534 
2535 /*
2536  * Invalidate hardware checksum when packet is to be mangled, and
2537  * complete checksum manually on outgoing path.
2538  */
2539 int skb_checksum_help(struct sk_buff *skb)
2540 {
2541 	__wsum csum;
2542 	int ret = 0, offset;
2543 
2544 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2545 		goto out_set_summed;
2546 
2547 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2548 		skb_warn_bad_offload(skb);
2549 		return -EINVAL;
2550 	}
2551 
2552 	/* Before computing a checksum, we should make sure no frag could
2553 	 * be modified by an external entity: the checksum could be wrong.
2554 	 */
2555 	if (skb_has_shared_frag(skb)) {
2556 		ret = __skb_linearize(skb);
2557 		if (ret)
2558 			goto out;
2559 	}
2560 
2561 	offset = skb_checksum_start_offset(skb);
2562 	BUG_ON(offset >= skb_headlen(skb));
2563 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2564 
2565 	offset += skb->csum_offset;
2566 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2567 
2568 	if (skb_cloned(skb) &&
2569 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2570 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2571 		if (ret)
2572 			goto out;
2573 	}
2574 
2575 	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2576 out_set_summed:
2577 	skb->ip_summed = CHECKSUM_NONE;
2578 out:
2579 	return ret;
2580 }
2581 EXPORT_SYMBOL(skb_checksum_help);
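
/*
 * Illustrative sketch, not kernel code: a driver whose hardware cannot
 * checksum a particular packet can fall back to skb_checksum_help() on the
 * transmit path before handing the descriptor to hardware ("my_hw_can_csum"
 * is a hypothetical capability check and "drop" a hypothetical error label).
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 *
 * On success the checksum is written into the packet and skb->ip_summed is
 * set to CHECKSUM_NONE, so the hardware can transmit it unmodified.
 */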
2582 
2583 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2584 {
2585 	__be16 type = skb->protocol;
2586 
2587 	/* Tunnel gso handlers can set protocol to ethernet. */
2588 	if (type == htons(ETH_P_TEB)) {
2589 		struct ethhdr *eth;
2590 
2591 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2592 			return 0;
2593 
2594 		eth = (struct ethhdr *)skb_mac_header(skb);
2595 		type = eth->h_proto;
2596 	}
2597 
2598 	return __vlan_get_protocol(skb, type, depth);
2599 }
2600 
2601 /**
2602  *	skb_mac_gso_segment - mac layer segmentation handler.
2603  *	@skb: buffer to segment
2604  *	@features: features for the output path (see dev->features)
2605  */
2606 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2607 				    netdev_features_t features)
2608 {
2609 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2610 	struct packet_offload *ptype;
2611 	int vlan_depth = skb->mac_len;
2612 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2613 
2614 	if (unlikely(!type))
2615 		return ERR_PTR(-EINVAL);
2616 
2617 	__skb_pull(skb, vlan_depth);
2618 
2619 	rcu_read_lock();
2620 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2621 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2622 			segs = ptype->callbacks.gso_segment(skb, features);
2623 			break;
2624 		}
2625 	}
2626 	rcu_read_unlock();
2627 
2628 	__skb_push(skb, skb->data - skb_mac_header(skb));
2629 
2630 	return segs;
2631 }
2632 EXPORT_SYMBOL(skb_mac_gso_segment);
2633 
2634 
2635 /* openvswitch calls this on rx path, so we need a different check.
2636  */
2637 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2638 {
2639 	if (tx_path)
2640 		return skb->ip_summed != CHECKSUM_PARTIAL;
2641 	else
2642 		return skb->ip_summed == CHECKSUM_NONE;
2643 }
2644 
2645 /**
2646  *	__skb_gso_segment - Perform segmentation on skb.
2647  *	@skb: buffer to segment
2648  *	@features: features for the output path (see dev->features)
2649  *	@tx_path: whether it is called in TX path
2650  *
2651  *	This function segments the given skb and returns a list of segments.
2652  *
2653  *	It may return NULL if the skb requires no segmentation.  This is
2654  *	only possible when GSO is used for verifying header integrity.
2655  *
2656  *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2657  */
2658 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2659 				  netdev_features_t features, bool tx_path)
2660 {
2661 	struct sk_buff *segs;
2662 
2663 	if (unlikely(skb_needs_check(skb, tx_path))) {
2664 		int err;
2665 
2666 		/* We're going to init ->check field in TCP or UDP header */
2667 		err = skb_cow_head(skb, 0);
2668 		if (err < 0)
2669 			return ERR_PTR(err);
2670 	}
2671 
2672 	/* Only report GSO partial support if it will enable us to
2673 	 * support segmentation on this frame without needing additional
2674 	 * work.
2675 	 */
2676 	if (features & NETIF_F_GSO_PARTIAL) {
2677 		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2678 		struct net_device *dev = skb->dev;
2679 
2680 		partial_features |= dev->features & dev->gso_partial_features;
2681 		if (!skb_gso_ok(skb, features | partial_features))
2682 			features &= ~NETIF_F_GSO_PARTIAL;
2683 	}
2684 
2685 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2686 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2687 
2688 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2689 	SKB_GSO_CB(skb)->encap_level = 0;
2690 
2691 	skb_reset_mac_header(skb);
2692 	skb_reset_mac_len(skb);
2693 
2694 	segs = skb_mac_gso_segment(skb, features);
2695 
2696 	if (unlikely(skb_needs_check(skb, tx_path)))
2697 		skb_warn_bad_offload(skb);
2698 
2699 	return segs;
2700 }
2701 EXPORT_SYMBOL(__skb_gso_segment);
2702 
2703 /* Take action when hardware reception checksum errors are detected. */
2704 #ifdef CONFIG_BUG
2705 void netdev_rx_csum_fault(struct net_device *dev)
2706 {
2707 	if (net_ratelimit()) {
2708 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2709 		dump_stack();
2710 	}
2711 }
2712 EXPORT_SYMBOL(netdev_rx_csum_fault);
2713 #endif
2714 
2715 /* Actually, we should eliminate this check as soon as we know that:
2716  * 1. An IOMMU is present and can map all of the memory.
2717  * 2. No high memory really exists on this machine.
2718  */
2719 
2720 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2721 {
2722 #ifdef CONFIG_HIGHMEM
2723 	int i;
2724 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2725 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2726 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2727 			if (PageHighMem(skb_frag_page(frag)))
2728 				return 1;
2729 		}
2730 	}
2731 
2732 	if (PCI_DMA_BUS_IS_PHYS) {
2733 		struct device *pdev = dev->dev.parent;
2734 
2735 		if (!pdev)
2736 			return 0;
2737 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2738 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2739 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2740 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2741 				return 1;
2742 		}
2743 	}
2744 #endif
2745 	return 0;
2746 }
2747 
2748 /* If MPLS offload request, verify we are testing hardware MPLS features
2749  * instead of standard features for the netdev.
2750  */
2751 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2752 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2753 					   netdev_features_t features,
2754 					   __be16 type)
2755 {
2756 	if (eth_p_mpls(type))
2757 		features &= skb->dev->mpls_features;
2758 
2759 	return features;
2760 }
2761 #else
2762 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2763 					   netdev_features_t features,
2764 					   __be16 type)
2765 {
2766 	return features;
2767 }
2768 #endif
2769 
2770 static netdev_features_t harmonize_features(struct sk_buff *skb,
2771 	netdev_features_t features)
2772 {
2773 	int tmp;
2774 	__be16 type;
2775 
2776 	type = skb_network_protocol(skb, &tmp);
2777 	features = net_mpls_features(skb, features, type);
2778 
2779 	if (skb->ip_summed != CHECKSUM_NONE &&
2780 	    !can_checksum_protocol(features, type)) {
2781 		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2782 	}
2783 	if (illegal_highdma(skb->dev, skb))
2784 		features &= ~NETIF_F_SG;
2785 
2786 	return features;
2787 }
2788 
2789 netdev_features_t passthru_features_check(struct sk_buff *skb,
2790 					  struct net_device *dev,
2791 					  netdev_features_t features)
2792 {
2793 	return features;
2794 }
2795 EXPORT_SYMBOL(passthru_features_check);
2796 
2797 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2798 					     struct net_device *dev,
2799 					     netdev_features_t features)
2800 {
2801 	return vlan_features_check(skb, features);
2802 }
2803 
2804 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2805 					    struct net_device *dev,
2806 					    netdev_features_t features)
2807 {
2808 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2809 
2810 	if (gso_segs > dev->gso_max_segs)
2811 		return features & ~NETIF_F_GSO_MASK;
2812 
2813 	/* Support for GSO partial features requires software
2814 	 * intervention before we can actually process the packets,
2815 	 * so we need to strip support for any partial features now;
2816 	 * we can pull them back in after we have partially
2817 	 * segmented the frame.
2818 	 */
2819 	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2820 		features &= ~dev->gso_partial_features;
2821 
2822 	/* Make sure to clear the IPv4 ID mangling feature if the
2823 	 * IPv4 header has the potential to be fragmented.
2824 	 */
2825 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2826 		struct iphdr *iph = skb->encapsulation ?
2827 				    inner_ip_hdr(skb) : ip_hdr(skb);
2828 
2829 		if (!(iph->frag_off & htons(IP_DF)))
2830 			features &= ~NETIF_F_TSO_MANGLEID;
2831 	}
2832 
2833 	return features;
2834 }
2835 
2836 netdev_features_t netif_skb_features(struct sk_buff *skb)
2837 {
2838 	struct net_device *dev = skb->dev;
2839 	netdev_features_t features = dev->features;
2840 
2841 	if (skb_is_gso(skb))
2842 		features = gso_features_check(skb, dev, features);
2843 
2844 	/* If encapsulation offload request, verify we are testing
2845 	 * hardware encapsulation features instead of standard
2846 	 * features for the netdev
2847 	 */
2848 	if (skb->encapsulation)
2849 		features &= dev->hw_enc_features;
2850 
2851 	if (skb_vlan_tagged(skb))
2852 		features = netdev_intersect_features(features,
2853 						     dev->vlan_features |
2854 						     NETIF_F_HW_VLAN_CTAG_TX |
2855 						     NETIF_F_HW_VLAN_STAG_TX);
2856 
2857 	if (dev->netdev_ops->ndo_features_check)
2858 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2859 								features);
2860 	else
2861 		features &= dflt_features_check(skb, dev, features);
2862 
2863 	return harmonize_features(skb, features);
2864 }
2865 EXPORT_SYMBOL(netif_skb_features);
2866 
2867 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2868 		    struct netdev_queue *txq, bool more)
2869 {
2870 	unsigned int len;
2871 	int rc;
2872 
2873 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2874 		dev_queue_xmit_nit(skb, dev);
2875 
2876 	len = skb->len;
2877 	trace_net_dev_start_xmit(skb, dev);
2878 	rc = netdev_start_xmit(skb, dev, txq, more);
2879 	trace_net_dev_xmit(skb, rc, dev, len);
2880 
2881 	return rc;
2882 }
2883 
2884 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2885 				    struct netdev_queue *txq, int *ret)
2886 {
2887 	struct sk_buff *skb = first;
2888 	int rc = NETDEV_TX_OK;
2889 
2890 	while (skb) {
2891 		struct sk_buff *next = skb->next;
2892 
2893 		skb->next = NULL;
2894 		rc = xmit_one(skb, dev, txq, next != NULL);
2895 		if (unlikely(!dev_xmit_complete(rc))) {
2896 			skb->next = next;
2897 			goto out;
2898 		}
2899 
2900 		skb = next;
2901 		if (netif_xmit_stopped(txq) && skb) {
2902 			rc = NETDEV_TX_BUSY;
2903 			break;
2904 		}
2905 	}
2906 
2907 out:
2908 	*ret = rc;
2909 	return skb;
2910 }
2911 
2912 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2913 					  netdev_features_t features)
2914 {
2915 	if (skb_vlan_tag_present(skb) &&
2916 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2917 		skb = __vlan_hwaccel_push_inside(skb);
2918 	return skb;
2919 }
2920 
2921 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2922 {
2923 	netdev_features_t features;
2924 
2925 	features = netif_skb_features(skb);
2926 	skb = validate_xmit_vlan(skb, features);
2927 	if (unlikely(!skb))
2928 		goto out_null;
2929 
2930 	if (netif_needs_gso(skb, features)) {
2931 		struct sk_buff *segs;
2932 
2933 		segs = skb_gso_segment(skb, features);
2934 		if (IS_ERR(segs)) {
2935 			goto out_kfree_skb;
2936 		} else if (segs) {
2937 			consume_skb(skb);
2938 			skb = segs;
2939 		}
2940 	} else {
2941 		if (skb_needs_linearize(skb, features) &&
2942 		    __skb_linearize(skb))
2943 			goto out_kfree_skb;
2944 
2945 		/* If packet is not checksummed and device does not
2946 		 * support checksumming for this protocol, complete
2947 		 * checksumming here.
2948 		 */
2949 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2950 			if (skb->encapsulation)
2951 				skb_set_inner_transport_header(skb,
2952 							       skb_checksum_start_offset(skb));
2953 			else
2954 				skb_set_transport_header(skb,
2955 							 skb_checksum_start_offset(skb));
2956 			if (!(features & NETIF_F_CSUM_MASK) &&
2957 			    skb_checksum_help(skb))
2958 				goto out_kfree_skb;
2959 		}
2960 	}
2961 
2962 	return skb;
2963 
2964 out_kfree_skb:
2965 	kfree_skb(skb);
2966 out_null:
2967 	atomic_long_inc(&dev->tx_dropped);
2968 	return NULL;
2969 }
2970 
2971 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2972 {
2973 	struct sk_buff *next, *head = NULL, *tail;
2974 
2975 	for (; skb != NULL; skb = next) {
2976 		next = skb->next;
2977 		skb->next = NULL;
2978 
2979 		/* in case skb won't be segmented, point to itself */
2980 		skb->prev = skb;
2981 
2982 		skb = validate_xmit_skb(skb, dev);
2983 		if (!skb)
2984 			continue;
2985 
2986 		if (!head)
2987 			head = skb;
2988 		else
2989 			tail->next = skb;
2990 		/* If skb was segmented, skb->prev points to
2991 		 * the last segment. If not, it still contains skb.
2992 		 */
2993 		tail = skb->prev;
2994 	}
2995 	return head;
2996 }
2997 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2998 
2999 static void qdisc_pkt_len_init(struct sk_buff *skb)
3000 {
3001 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3002 
3003 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3004 
3005 	/* To get a more precise estimation of bytes sent on the wire,
3006 	 * we add the header size of all segments to pkt_len
3007 	 */
3008 	if (shinfo->gso_size)  {
3009 		unsigned int hdr_len;
3010 		u16 gso_segs = shinfo->gso_segs;
3011 
3012 		/* mac layer + network layer */
3013 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3014 
3015 		/* + transport layer */
3016 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3017 			hdr_len += tcp_hdrlen(skb);
3018 		else
3019 			hdr_len += sizeof(struct udphdr);
3020 
3021 		if (shinfo->gso_type & SKB_GSO_DODGY)
3022 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3023 						shinfo->gso_size);
3024 
3025 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3026 	}
3027 }
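
/*
 * Worked example for the estimate above (hypothetical numbers): a TSO skb of
 * 65226 bytes with gso_size 1448, a 66-byte header (14 MAC + 20 IP + 32 TCP)
 * and gso_segs 45 gets pkt_len = 65226 + (45 - 1) * 66 = 68130, i.e. the
 * byte count the wire will actually carry once the NIC has segmented it
 * (45 segments of 1514 bytes each).
 */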
3028 
3029 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3030 				 struct net_device *dev,
3031 				 struct netdev_queue *txq)
3032 {
3033 	spinlock_t *root_lock = qdisc_lock(q);
3034 	struct sk_buff *to_free = NULL;
3035 	bool contended;
3036 	int rc;
3037 
3038 	qdisc_calculate_pkt_len(skb, q);
3039 	/*
3040 	 * Heuristic to force contended enqueues to serialize on a
3041 	 * separate lock before trying to get qdisc main lock.
3042 	 * This permits qdisc->running owner to get the lock more
3043 	 * often and dequeue packets faster.
3044 	 */
3045 	contended = qdisc_is_running(q);
3046 	if (unlikely(contended))
3047 		spin_lock(&q->busylock);
3048 
3049 	spin_lock(root_lock);
3050 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3051 		__qdisc_drop(skb, &to_free);
3052 		rc = NET_XMIT_DROP;
3053 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3054 		   qdisc_run_begin(q)) {
3055 		/*
3056 		 * This is a work-conserving queue; there are no old skbs
3057 		 * waiting to be sent out; and the qdisc is not running -
3058 		 * xmit the skb directly.
3059 		 */
3060 
3061 		qdisc_bstats_update(q, skb);
3062 
3063 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3064 			if (unlikely(contended)) {
3065 				spin_unlock(&q->busylock);
3066 				contended = false;
3067 			}
3068 			__qdisc_run(q);
3069 		} else
3070 			qdisc_run_end(q);
3071 
3072 		rc = NET_XMIT_SUCCESS;
3073 	} else {
3074 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3075 		if (qdisc_run_begin(q)) {
3076 			if (unlikely(contended)) {
3077 				spin_unlock(&q->busylock);
3078 				contended = false;
3079 			}
3080 			__qdisc_run(q);
3081 		}
3082 	}
3083 	spin_unlock(root_lock);
3084 	if (unlikely(to_free))
3085 		kfree_skb_list(to_free);
3086 	if (unlikely(contended))
3087 		spin_unlock(&q->busylock);
3088 	return rc;
3089 }
3090 
3091 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3092 static void skb_update_prio(struct sk_buff *skb)
3093 {
3094 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3095 
3096 	if (!skb->priority && skb->sk && map) {
3097 		unsigned int prioidx =
3098 			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3099 
3100 		if (prioidx < map->priomap_len)
3101 			skb->priority = map->priomap[prioidx];
3102 	}
3103 }
3104 #else
3105 #define skb_update_prio(skb)
3106 #endif
3107 
3108 DEFINE_PER_CPU(int, xmit_recursion);
3109 EXPORT_SYMBOL(xmit_recursion);
3110 
3111 /**
3112  *	dev_loopback_xmit - loop back @skb
3113  *	@net: network namespace this loopback is happening in
3114  *	@sk:  sk needed to be a netfilter okfn
3115  *	@skb: buffer to transmit
3116  */
3117 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3118 {
3119 	skb_reset_mac_header(skb);
3120 	__skb_pull(skb, skb_network_offset(skb));
3121 	skb->pkt_type = PACKET_LOOPBACK;
3122 	skb->ip_summed = CHECKSUM_UNNECESSARY;
3123 	WARN_ON(!skb_dst(skb));
3124 	skb_dst_force(skb);
3125 	netif_rx_ni(skb);
3126 	return 0;
3127 }
3128 EXPORT_SYMBOL(dev_loopback_xmit);
3129 
3130 #ifdef CONFIG_NET_EGRESS
3131 static struct sk_buff *
3132 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3133 {
3134 	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3135 	struct tcf_result cl_res;
3136 
3137 	if (!cl)
3138 		return skb;
3139 
3140 	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3141 	qdisc_bstats_cpu_update(cl->q, skb);
3142 
3143 	switch (tc_classify(skb, cl, &cl_res, false)) {
3144 	case TC_ACT_OK:
3145 	case TC_ACT_RECLASSIFY:
3146 		skb->tc_index = TC_H_MIN(cl_res.classid);
3147 		break;
3148 	case TC_ACT_SHOT:
3149 		qdisc_qstats_cpu_drop(cl->q);
3150 		*ret = NET_XMIT_DROP;
3151 		kfree_skb(skb);
3152 		return NULL;
3153 	case TC_ACT_STOLEN:
3154 	case TC_ACT_QUEUED:
3155 		*ret = NET_XMIT_SUCCESS;
3156 		consume_skb(skb);
3157 		return NULL;
3158 	case TC_ACT_REDIRECT:
3159 		/* No need to push/pop skb's mac_header here on egress! */
3160 		skb_do_redirect(skb);
3161 		*ret = NET_XMIT_SUCCESS;
3162 		return NULL;
3163 	default:
3164 		break;
3165 	}
3166 
3167 	return skb;
3168 }
3169 #endif /* CONFIG_NET_EGRESS */
3170 
3171 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3172 {
3173 #ifdef CONFIG_XPS
3174 	struct xps_dev_maps *dev_maps;
3175 	struct xps_map *map;
3176 	int queue_index = -1;
3177 
3178 	rcu_read_lock();
3179 	dev_maps = rcu_dereference(dev->xps_maps);
3180 	if (dev_maps) {
3181 		unsigned int tci = skb->sender_cpu - 1;
3182 
3183 		if (dev->num_tc) {
3184 			tci *= dev->num_tc;
3185 			tci += netdev_get_prio_tc_map(dev, skb->priority);
3186 		}
3187 
3188 		map = rcu_dereference(dev_maps->cpu_map[tci]);
3189 		if (map) {
3190 			if (map->len == 1)
3191 				queue_index = map->queues[0];
3192 			else
3193 				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3194 									   map->len)];
3195 			if (unlikely(queue_index >= dev->real_num_tx_queues))
3196 				queue_index = -1;
3197 		}
3198 	}
3199 	rcu_read_unlock();
3200 
3201 	return queue_index;
3202 #else
3203 	return -1;
3204 #endif
3205 }
3206 
3207 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3208 {
3209 	struct sock *sk = skb->sk;
3210 	int queue_index = sk_tx_queue_get(sk);
3211 
3212 	if (queue_index < 0 || skb->ooo_okay ||
3213 	    queue_index >= dev->real_num_tx_queues) {
3214 		int new_index = get_xps_queue(dev, skb);
3215 		if (new_index < 0)
3216 			new_index = skb_tx_hash(dev, skb);
3217 
3218 		if (queue_index != new_index && sk &&
3219 		    sk_fullsock(sk) &&
3220 		    rcu_access_pointer(sk->sk_dst_cache))
3221 			sk_tx_queue_set(sk, new_index);
3222 
3223 		queue_index = new_index;
3224 	}
3225 
3226 	return queue_index;
3227 }
3228 
3229 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3230 				    struct sk_buff *skb,
3231 				    void *accel_priv)
3232 {
3233 	int queue_index = 0;
3234 
3235 #ifdef CONFIG_XPS
3236 	u32 sender_cpu = skb->sender_cpu - 1;
3237 
3238 	if (sender_cpu >= (u32)NR_CPUS)
3239 		skb->sender_cpu = raw_smp_processor_id() + 1;
3240 #endif
3241 
3242 	if (dev->real_num_tx_queues != 1) {
3243 		const struct net_device_ops *ops = dev->netdev_ops;
3244 		if (ops->ndo_select_queue)
3245 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3246 							    __netdev_pick_tx);
3247 		else
3248 			queue_index = __netdev_pick_tx(dev, skb);
3249 
3250 		if (!accel_priv)
3251 			queue_index = netdev_cap_txqueue(dev, queue_index);
3252 	}
3253 
3254 	skb_set_queue_mapping(skb, queue_index);
3255 	return netdev_get_tx_queue(dev, queue_index);
3256 }
3257 
3258 /**
3259  *	__dev_queue_xmit - transmit a buffer
3260  *	@skb: buffer to transmit
3261  *	@accel_priv: private data used for L2 forwarding offload
3262  *
3263  *	Queue a buffer for transmission to a network device. The caller must
3264  *	have set the device and priority and built the buffer before calling
3265  *	this function. The function can be called from an interrupt.
3266  *
3267  *	A negative errno code is returned on a failure. A success does not
3268  *	guarantee the frame will be transmitted as it may be dropped due
3269  *	to congestion or traffic shaping.
3270  *
3271  * -----------------------------------------------------------------------------------
3272  *      I notice this method can also return errors from the queue disciplines,
3273  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3274  *      be positive.
3275  *
3276  *      Regardless of the return value, the skb is consumed, so it is currently
3277  *      difficult to retry a send to this method.  (You can bump the ref count
3278  *      before sending to hold a reference for retry if you are careful.)
3279  *
3280  *      When calling this method, interrupts MUST be enabled.  This is because
3281  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3282  *          --BLG
3283  */
3284 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3285 {
3286 	struct net_device *dev = skb->dev;
3287 	struct netdev_queue *txq;
3288 	struct Qdisc *q;
3289 	int rc = -ENOMEM;
3290 
3291 	skb_reset_mac_header(skb);
3292 
3293 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3294 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3295 
3296 	/* Disable soft irqs for various locks below. Also
3297 	 * stops preemption for RCU.
3298 	 */
3299 	rcu_read_lock_bh();
3300 
3301 	skb_update_prio(skb);
3302 
3303 	qdisc_pkt_len_init(skb);
3304 #ifdef CONFIG_NET_CLS_ACT
3305 	skb->tc_at_ingress = 0;
3306 # ifdef CONFIG_NET_EGRESS
3307 	if (static_key_false(&egress_needed)) {
3308 		skb = sch_handle_egress(skb, &rc, dev);
3309 		if (!skb)
3310 			goto out;
3311 	}
3312 # endif
3313 #endif
3314 	/* If device/qdisc don't need skb->dst, release it right now while
3315 	 * it's hot in this cpu's cache.
3316 	 */
3317 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3318 		skb_dst_drop(skb);
3319 	else
3320 		skb_dst_force(skb);
3321 
3322 	txq = netdev_pick_tx(dev, skb, accel_priv);
3323 	q = rcu_dereference_bh(txq->qdisc);
3324 
3325 	trace_net_dev_queue(skb);
3326 	if (q->enqueue) {
3327 		rc = __dev_xmit_skb(skb, q, dev, txq);
3328 		goto out;
3329 	}
3330 
3331 	/* The device has no queue. Common case for software devices:
3332 	 * loopback, all sorts of tunnels...
3333 	 *
3334 	 * Really, it is unlikely that netif_tx_lock protection is necessary
3335 	 * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3336 	 * counters.)
3337 	 * However, it is possible that they rely on the protection
3338 	 * made by us here.
3339 	 *
3340 	 * Check this and take the lock. It is not prone to deadlocks.
3341 	 * Either shoot the noqueue qdisc, it is even simpler 8)
3342 	 */
3343 	if (dev->flags & IFF_UP) {
3344 		int cpu = smp_processor_id(); /* ok because BHs are off */
3345 
3346 		if (txq->xmit_lock_owner != cpu) {
3347 			if (unlikely(__this_cpu_read(xmit_recursion) >
3348 				     XMIT_RECURSION_LIMIT))
3349 				goto recursion_alert;
3350 
3351 			skb = validate_xmit_skb(skb, dev);
3352 			if (!skb)
3353 				goto out;
3354 
3355 			HARD_TX_LOCK(dev, txq, cpu);
3356 
3357 			if (!netif_xmit_stopped(txq)) {
3358 				__this_cpu_inc(xmit_recursion);
3359 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3360 				__this_cpu_dec(xmit_recursion);
3361 				if (dev_xmit_complete(rc)) {
3362 					HARD_TX_UNLOCK(dev, txq);
3363 					goto out;
3364 				}
3365 			}
3366 			HARD_TX_UNLOCK(dev, txq);
3367 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3368 					     dev->name);
3369 		} else {
3370 			/* Recursion is detected! It is possible,
3371 			 * unfortunately
3372 			 */
3373 recursion_alert:
3374 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3375 					     dev->name);
3376 		}
3377 	}
3378 
3379 	rc = -ENETDOWN;
3380 	rcu_read_unlock_bh();
3381 
3382 	atomic_long_inc(&dev->tx_dropped);
3383 	kfree_skb_list(skb);
3384 	return rc;
3385 out:
3386 	rcu_read_unlock_bh();
3387 	return rc;
3388 }
3389 
3390 int dev_queue_xmit(struct sk_buff *skb)
3391 {
3392 	return __dev_queue_xmit(skb, NULL);
3393 }
3394 EXPORT_SYMBOL(dev_queue_xmit);
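
/*
 * Illustrative sketch, not kernel code: a tunnel or other software device
 * that builds its own frames submits them through dev_queue_xmit() once the
 * headers are in place and skb->dev points at the output device ("out_dev"
 * is a hypothetical net_device chosen by the caller's routing logic).
 *
 *	int err;
 *
 *	skb->dev = out_dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	err = dev_queue_xmit(skb);
 *
 * The skb is consumed regardless of err; as documented above, err may be a
 * negative errno or a positive NET_XMIT_* code from the qdisc layer.
 */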
3395 
3396 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3397 {
3398 	return __dev_queue_xmit(skb, accel_priv);
3399 }
3400 EXPORT_SYMBOL(dev_queue_xmit_accel);
3401 
3402 
3403 /*=======================================================================
3404 			Receiver routines
3405   =======================================================================*/
3406 
3407 int netdev_max_backlog __read_mostly = 1000;
3408 EXPORT_SYMBOL(netdev_max_backlog);
3409 
3410 int netdev_tstamp_prequeue __read_mostly = 1;
3411 int netdev_budget __read_mostly = 300;
3412 int weight_p __read_mostly = 64;           /* old backlog weight */
3413 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3414 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3415 int dev_rx_weight __read_mostly = 64;
3416 int dev_tx_weight __read_mostly = 64;
3417 
3418 /* Called with irq disabled */
3419 static inline void ____napi_schedule(struct softnet_data *sd,
3420 				     struct napi_struct *napi)
3421 {
3422 	list_add_tail(&napi->poll_list, &sd->poll_list);
3423 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3424 }
3425 
3426 #ifdef CONFIG_RPS
3427 
3428 /* One global table that all flow-based protocols share. */
3429 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3430 EXPORT_SYMBOL(rps_sock_flow_table);
3431 u32 rps_cpu_mask __read_mostly;
3432 EXPORT_SYMBOL(rps_cpu_mask);
3433 
3434 struct static_key rps_needed __read_mostly;
3435 EXPORT_SYMBOL(rps_needed);
3436 struct static_key rfs_needed __read_mostly;
3437 EXPORT_SYMBOL(rfs_needed);
3438 
3439 static struct rps_dev_flow *
3440 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3441 	    struct rps_dev_flow *rflow, u16 next_cpu)
3442 {
3443 	if (next_cpu < nr_cpu_ids) {
3444 #ifdef CONFIG_RFS_ACCEL
3445 		struct netdev_rx_queue *rxqueue;
3446 		struct rps_dev_flow_table *flow_table;
3447 		struct rps_dev_flow *old_rflow;
3448 		u32 flow_id;
3449 		u16 rxq_index;
3450 		int rc;
3451 
3452 		/* Should we steer this flow to a different hardware queue? */
3453 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3454 		    !(dev->features & NETIF_F_NTUPLE))
3455 			goto out;
3456 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3457 		if (rxq_index == skb_get_rx_queue(skb))
3458 			goto out;
3459 
3460 		rxqueue = dev->_rx + rxq_index;
3461 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3462 		if (!flow_table)
3463 			goto out;
3464 		flow_id = skb_get_hash(skb) & flow_table->mask;
3465 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3466 							rxq_index, flow_id);
3467 		if (rc < 0)
3468 			goto out;
3469 		old_rflow = rflow;
3470 		rflow = &flow_table->flows[flow_id];
3471 		rflow->filter = rc;
3472 		if (old_rflow->filter == rflow->filter)
3473 			old_rflow->filter = RPS_NO_FILTER;
3474 	out:
3475 #endif
3476 		rflow->last_qtail =
3477 			per_cpu(softnet_data, next_cpu).input_queue_head;
3478 	}
3479 
3480 	rflow->cpu = next_cpu;
3481 	return rflow;
3482 }
3483 
3484 /*
3485  * get_rps_cpu is called from netif_receive_skb and returns the target
3486  * CPU from the RPS map of the receiving queue for a given skb.
3487  * rcu_read_lock must be held on entry.
3488  */
3489 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3490 		       struct rps_dev_flow **rflowp)
3491 {
3492 	const struct rps_sock_flow_table *sock_flow_table;
3493 	struct netdev_rx_queue *rxqueue = dev->_rx;
3494 	struct rps_dev_flow_table *flow_table;
3495 	struct rps_map *map;
3496 	int cpu = -1;
3497 	u32 tcpu;
3498 	u32 hash;
3499 
3500 	if (skb_rx_queue_recorded(skb)) {
3501 		u16 index = skb_get_rx_queue(skb);
3502 
3503 		if (unlikely(index >= dev->real_num_rx_queues)) {
3504 			WARN_ONCE(dev->real_num_rx_queues > 1,
3505 				  "%s received packet on queue %u, but number "
3506 				  "of RX queues is %u\n",
3507 				  dev->name, index, dev->real_num_rx_queues);
3508 			goto done;
3509 		}
3510 		rxqueue += index;
3511 	}
3512 
3513 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3514 
3515 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3516 	map = rcu_dereference(rxqueue->rps_map);
3517 	if (!flow_table && !map)
3518 		goto done;
3519 
3520 	skb_reset_network_header(skb);
3521 	hash = skb_get_hash(skb);
3522 	if (!hash)
3523 		goto done;
3524 
3525 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3526 	if (flow_table && sock_flow_table) {
3527 		struct rps_dev_flow *rflow;
3528 		u32 next_cpu;
3529 		u32 ident;
3530 
3531 		/* First check the global flow table for a match */
3532 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3533 		if ((ident ^ hash) & ~rps_cpu_mask)
3534 			goto try_rps;
3535 
3536 		next_cpu = ident & rps_cpu_mask;
3537 
3538 		/* OK, now we know there is a match,
3539 		 * we can look at the local (per receive queue) flow table
3540 		 */
3541 		rflow = &flow_table->flows[hash & flow_table->mask];
3542 		tcpu = rflow->cpu;
3543 
3544 		/*
3545 		 * If the desired CPU (where last recvmsg was done) is
3546 		 * different from current CPU (one in the rx-queue flow
3547 		 * table entry), switch if one of the following holds:
3548 		 *   - Current CPU is unset (>= nr_cpu_ids).
3549 		 *   - Current CPU is offline.
3550 		 *   - The current CPU's queue tail has advanced beyond the
3551 		 *     last packet that was enqueued using this table entry.
3552 		 *     This guarantees that all previous packets for the flow
3553 		 *     have been dequeued, thus preserving in-order delivery.
3554 		 */
3555 		if (unlikely(tcpu != next_cpu) &&
3556 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3557 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3558 		      rflow->last_qtail)) >= 0)) {
3559 			tcpu = next_cpu;
3560 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3561 		}
3562 
3563 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3564 			*rflowp = rflow;
3565 			cpu = tcpu;
3566 			goto done;
3567 		}
3568 	}
3569 
3570 try_rps:
3571 
3572 	if (map) {
3573 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3574 		if (cpu_online(tcpu)) {
3575 			cpu = tcpu;
3576 			goto done;
3577 		}
3578 	}
3579 
3580 done:
3581 	return cpu;
3582 }
3583 
3584 #ifdef CONFIG_RFS_ACCEL
3585 
3586 /**
3587  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3588  * @dev: Device on which the filter was set
3589  * @rxq_index: RX queue index
3590  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3591  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3592  *
3593  * Drivers that implement ndo_rx_flow_steer() should periodically call
3594  * this function for each installed filter and remove the filters for
3595  * which it returns %true.
3596  */
3597 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3598 			 u32 flow_id, u16 filter_id)
3599 {
3600 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3601 	struct rps_dev_flow_table *flow_table;
3602 	struct rps_dev_flow *rflow;
3603 	bool expire = true;
3604 	unsigned int cpu;
3605 
3606 	rcu_read_lock();
3607 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3608 	if (flow_table && flow_id <= flow_table->mask) {
3609 		rflow = &flow_table->flows[flow_id];
3610 		cpu = ACCESS_ONCE(rflow->cpu);
3611 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3612 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3613 			   rflow->last_qtail) <
3614 		     (int)(10 * flow_table->mask)))
3615 			expire = false;
3616 	}
3617 	rcu_read_unlock();
3618 	return expire;
3619 }
3620 EXPORT_SYMBOL(rps_may_expire_flow);
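
/*
 * Illustrative sketch, not kernel code: a driver implementing
 * ndo_rx_flow_steer() might scan its filter table from a periodic work item
 * and drop the entries this helper reports as expirable ("struct my_filter",
 * "priv" and "my_remove_filter" are hypothetical driver constructs).
 *
 *	for (i = 0; i < MY_MAX_FILTERS; i++) {
 *		struct my_filter *f = &priv->filters[i];
 *
 *		if (!f->in_use)
 *			continue;
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id))
 *			my_remove_filter(priv, f);
 *	}
 */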
3621 
3622 #endif /* CONFIG_RFS_ACCEL */
3623 
3624 /* Called from hardirq (IPI) context */
3625 static void rps_trigger_softirq(void *data)
3626 {
3627 	struct softnet_data *sd = data;
3628 
3629 	____napi_schedule(sd, &sd->backlog);
3630 	sd->received_rps++;
3631 }
3632 
3633 #endif /* CONFIG_RPS */
3634 
3635 /*
3636  * Check if this softnet_data structure is another CPU's one.
3637  * If yes, queue it to our IPI list and return 1.
3638  * If no, return 0.
3639  */
3640 static int rps_ipi_queued(struct softnet_data *sd)
3641 {
3642 #ifdef CONFIG_RPS
3643 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3644 
3645 	if (sd != mysd) {
3646 		sd->rps_ipi_next = mysd->rps_ipi_list;
3647 		mysd->rps_ipi_list = sd;
3648 
3649 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3650 		return 1;
3651 	}
3652 #endif /* CONFIG_RPS */
3653 	return 0;
3654 }
3655 
3656 #ifdef CONFIG_NET_FLOW_LIMIT
3657 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3658 #endif
3659 
3660 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3661 {
3662 #ifdef CONFIG_NET_FLOW_LIMIT
3663 	struct sd_flow_limit *fl;
3664 	struct softnet_data *sd;
3665 	unsigned int old_flow, new_flow;
3666 
3667 	if (qlen < (netdev_max_backlog >> 1))
3668 		return false;
3669 
3670 	sd = this_cpu_ptr(&softnet_data);
3671 
3672 	rcu_read_lock();
3673 	fl = rcu_dereference(sd->flow_limit);
3674 	if (fl) {
3675 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3676 		old_flow = fl->history[fl->history_head];
3677 		fl->history[fl->history_head] = new_flow;
3678 
3679 		fl->history_head++;
3680 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3681 
3682 		if (likely(fl->buckets[old_flow]))
3683 			fl->buckets[old_flow]--;
3684 
3685 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3686 			fl->count++;
3687 			rcu_read_unlock();
3688 			return true;
3689 		}
3690 	}
3691 	rcu_read_unlock();
3692 #endif
3693 	return false;
3694 }
3695 
3696 /*
3697  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3698  * queue (may be a remote CPU queue).
3699  */
3700 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3701 			      unsigned int *qtail)
3702 {
3703 	struct softnet_data *sd;
3704 	unsigned long flags;
3705 	unsigned int qlen;
3706 
3707 	sd = &per_cpu(softnet_data, cpu);
3708 
3709 	local_irq_save(flags);
3710 
3711 	rps_lock(sd);
3712 	if (!netif_running(skb->dev))
3713 		goto drop;
3714 	qlen = skb_queue_len(&sd->input_pkt_queue);
3715 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3716 		if (qlen) {
3717 enqueue:
3718 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3719 			input_queue_tail_incr_save(sd, qtail);
3720 			rps_unlock(sd);
3721 			local_irq_restore(flags);
3722 			return NET_RX_SUCCESS;
3723 		}
3724 
3725 		/* Schedule NAPI for the backlog device.
3726 		 * We can use a non-atomic operation since we own the queue lock.
3727 		 */
3728 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3729 			if (!rps_ipi_queued(sd))
3730 				____napi_schedule(sd, &sd->backlog);
3731 		}
3732 		goto enqueue;
3733 	}
3734 
3735 drop:
3736 	sd->dropped++;
3737 	rps_unlock(sd);
3738 
3739 	local_irq_restore(flags);
3740 
3741 	atomic_long_inc(&skb->dev->rx_dropped);
3742 	kfree_skb(skb);
3743 	return NET_RX_DROP;
3744 }
3745 
3746 static int netif_rx_internal(struct sk_buff *skb)
3747 {
3748 	int ret;
3749 
3750 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3751 
3752 	trace_netif_rx(skb);
3753 #ifdef CONFIG_RPS
3754 	if (static_key_false(&rps_needed)) {
3755 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3756 		int cpu;
3757 
3758 		preempt_disable();
3759 		rcu_read_lock();
3760 
3761 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3762 		if (cpu < 0)
3763 			cpu = smp_processor_id();
3764 
3765 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3766 
3767 		rcu_read_unlock();
3768 		preempt_enable();
3769 	} else
3770 #endif
3771 	{
3772 		unsigned int qtail;
3773 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3774 		put_cpu();
3775 	}
3776 	return ret;
3777 }
3778 
3779 /**
3780  *	netif_rx	-	post buffer to the network code
3781  *	@skb: buffer to post
3782  *
3783  *	This function receives a packet from a device driver and queues it for
3784  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3785  *	may be dropped during processing for congestion control or by the
3786  *	protocol layers.
3787  *
3788  *	return values:
3789  *	NET_RX_SUCCESS	(no congestion)
3790  *	NET_RX_DROP     (packet was dropped)
3791  *
3792  */
3793 
3794 int netif_rx(struct sk_buff *skb)
3795 {
3796 	trace_netif_rx_entry(skb);
3797 
3798 	return netif_rx_internal(skb);
3799 }
3800 EXPORT_SYMBOL(netif_rx);
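/* Illustrative sketch, not part of this file: how a simple non-NAPI driver
 * might hand a received frame to the stack with netif_rx().  my_legacy_rx()
 * and its arguments are hypothetical; the usual <linux/netdevice.h> and
 * <linux/skbuff.h> declarations are assumed.  From process context a driver
 * would use netif_rx_ni() instead.
 */
static void my_legacy_rx(struct net_device *dev, const void *data,
			 unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);

	/* Queues to a per-CPU backlog; NET_RX_DROP only signals congestion
	 * and is typically used for statistics.
	 */
	if (netif_rx(skb) == NET_RX_DROP)
		dev->stats.rx_dropped++;
}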
3801 
3802 int netif_rx_ni(struct sk_buff *skb)
3803 {
3804 	int err;
3805 
3806 	trace_netif_rx_ni_entry(skb);
3807 
3808 	preempt_disable();
3809 	err = netif_rx_internal(skb);
3810 	if (local_softirq_pending())
3811 		do_softirq();
3812 	preempt_enable();
3813 
3814 	return err;
3815 }
3816 EXPORT_SYMBOL(netif_rx_ni);
3817 
3818 static __latent_entropy void net_tx_action(struct softirq_action *h)
3819 {
3820 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3821 
3822 	if (sd->completion_queue) {
3823 		struct sk_buff *clist;
3824 
3825 		local_irq_disable();
3826 		clist = sd->completion_queue;
3827 		sd->completion_queue = NULL;
3828 		local_irq_enable();
3829 
3830 		while (clist) {
3831 			struct sk_buff *skb = clist;
3832 			clist = clist->next;
3833 
3834 			WARN_ON(atomic_read(&skb->users));
3835 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3836 				trace_consume_skb(skb);
3837 			else
3838 				trace_kfree_skb(skb, net_tx_action);
3839 
3840 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3841 				__kfree_skb(skb);
3842 			else
3843 				__kfree_skb_defer(skb);
3844 		}
3845 
3846 		__kfree_skb_flush();
3847 	}
3848 
3849 	if (sd->output_queue) {
3850 		struct Qdisc *head;
3851 
3852 		local_irq_disable();
3853 		head = sd->output_queue;
3854 		sd->output_queue = NULL;
3855 		sd->output_queue_tailp = &sd->output_queue;
3856 		local_irq_enable();
3857 
3858 		while (head) {
3859 			struct Qdisc *q = head;
3860 			spinlock_t *root_lock;
3861 
3862 			head = head->next_sched;
3863 
3864 			root_lock = qdisc_lock(q);
3865 			spin_lock(root_lock);
3866 			/* We need to make sure head->next_sched is read
3867 			 * before clearing __QDISC_STATE_SCHED
3868 			 */
3869 			smp_mb__before_atomic();
3870 			clear_bit(__QDISC_STATE_SCHED, &q->state);
3871 			qdisc_run(q);
3872 			spin_unlock(root_lock);
3873 		}
3874 	}
3875 }
3876 
3877 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3878 /* This hook is defined here for ATM LANE */
3879 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3880 			     unsigned char *addr) __read_mostly;
3881 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3882 #endif
3883 
3884 static inline struct sk_buff *
3885 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3886 		   struct net_device *orig_dev)
3887 {
3888 #ifdef CONFIG_NET_CLS_ACT
3889 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3890 	struct tcf_result cl_res;
3891 
3892 	/* If there's at least one ingress present somewhere (so
3893 	 * we get here via an enabled static key), remaining devices
3894 	 * that are not configured with an ingress qdisc will bail
3895 	 * out here.
3896 	 */
3897 	if (!cl)
3898 		return skb;
3899 	if (*pt_prev) {
3900 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3901 		*pt_prev = NULL;
3902 	}
3903 
3904 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3905 	skb->tc_at_ingress = 1;
3906 	qdisc_bstats_cpu_update(cl->q, skb);
3907 
3908 	switch (tc_classify(skb, cl, &cl_res, false)) {
3909 	case TC_ACT_OK:
3910 	case TC_ACT_RECLASSIFY:
3911 		skb->tc_index = TC_H_MIN(cl_res.classid);
3912 		break;
3913 	case TC_ACT_SHOT:
3914 		qdisc_qstats_cpu_drop(cl->q);
3915 		kfree_skb(skb);
3916 		return NULL;
3917 	case TC_ACT_STOLEN:
3918 	case TC_ACT_QUEUED:
3919 		consume_skb(skb);
3920 		return NULL;
3921 	case TC_ACT_REDIRECT:
3922 		/* skb_mac_header check was done by cls/act_bpf, so
3923 		 * we can safely push the L2 header back before
3924 		 * redirecting to another netdev
3925 		 */
3926 		__skb_push(skb, skb->mac_len);
3927 		skb_do_redirect(skb);
3928 		return NULL;
3929 	default:
3930 		break;
3931 	}
3932 #endif /* CONFIG_NET_CLS_ACT */
3933 	return skb;
3934 }
3935 
3936 /**
3937  *	netdev_is_rx_handler_busy - check if receive handler is registered
3938  *	@dev: device to check
3939  *
3940  *	Check if a receive handler is already registered for a given device.
3941  *	Return true if there is one.
3942  *
3943  *	The caller must hold the rtnl_mutex.
3944  */
3945 bool netdev_is_rx_handler_busy(struct net_device *dev)
3946 {
3947 	ASSERT_RTNL();
3948 	return dev && rtnl_dereference(dev->rx_handler);
3949 }
3950 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3951 
3952 /**
3953  *	netdev_rx_handler_register - register receive handler
3954  *	@dev: device to register a handler for
3955  *	@rx_handler: receive handler to register
3956  *	@rx_handler_data: data pointer that is used by rx handler
3957  *
3958  *	Register a receive handler for a device. This handler will then be
3959  *	called from __netif_receive_skb. A negative errno code is returned
3960  *	on a failure.
3961  *
3962  *	The caller must hold the rtnl_mutex.
3963  *
3964  *	For a general description of rx_handler, see enum rx_handler_result.
3965  */
3966 int netdev_rx_handler_register(struct net_device *dev,
3967 			       rx_handler_func_t *rx_handler,
3968 			       void *rx_handler_data)
3969 {
3970 	if (netdev_is_rx_handler_busy(dev))
3971 		return -EBUSY;
3972 
3973 	/* Note: rx_handler_data must be set before rx_handler */
3974 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3975 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3976 
3977 	return 0;
3978 }
3979 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3980 
3981 /**
3982  *	netdev_rx_handler_unregister - unregister receive handler
3983  *	@dev: device to unregister a handler from
3984  *
3985  *	Unregister a receive handler from a device.
3986  *
3987  *	The caller must hold the rtnl_mutex.
3988  */
3989 void netdev_rx_handler_unregister(struct net_device *dev)
3990 {
3991 
3992 	ASSERT_RTNL();
3993 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3994 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3995 	 * section is guaranteed to see a non-NULL rx_handler_data
3996 	 * as well.
3997 	 */
3998 	synchronize_net();
3999 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4000 }
4001 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
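/* Illustrative sketch, not part of this file: registering an rx_handler the
 * way bridge/bonding-like upper drivers do.  struct my_port,
 * my_handle_frame() and my_port_attach() are hypothetical; the real callers
 * run under RTNL (for example from an ndo_add_slave implementation), which
 * is assumed here.
 */
struct my_port {
	struct net_device *dev;
};

static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port;

	port = rcu_dereference(skb->dev->rx_handler_data);
	if (!port)
		return RX_HANDLER_PASS;

	/* A real handler might consume the skb, retarget it to another
	 * device (returning RX_HANDLER_ANOTHER), or let it continue up
	 * the stack as done here.
	 */
	return RX_HANDLER_PASS;
}

static int my_port_attach(struct net_device *port_dev, struct my_port *port)
{
	ASSERT_RTNL();
	/* rx_handler_data (port) becomes visible to my_handle_frame() */
	return netdev_rx_handler_register(port_dev, my_handle_frame, port);
	/* teardown later calls netdev_rx_handler_unregister(port_dev) */
}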
4002 
4003 /*
4004  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4005  * the special handling of PFMEMALLOC skbs.
4006  */
4007 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4008 {
4009 	switch (skb->protocol) {
4010 	case htons(ETH_P_ARP):
4011 	case htons(ETH_P_IP):
4012 	case htons(ETH_P_IPV6):
4013 	case htons(ETH_P_8021Q):
4014 	case htons(ETH_P_8021AD):
4015 		return true;
4016 	default:
4017 		return false;
4018 	}
4019 }
4020 
4021 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4022 			     int *ret, struct net_device *orig_dev)
4023 {
4024 #ifdef CONFIG_NETFILTER_INGRESS
4025 	if (nf_hook_ingress_active(skb)) {
4026 		int ingress_retval;
4027 
4028 		if (*pt_prev) {
4029 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
4030 			*pt_prev = NULL;
4031 		}
4032 
4033 		rcu_read_lock();
4034 		ingress_retval = nf_hook_ingress(skb);
4035 		rcu_read_unlock();
4036 		return ingress_retval;
4037 	}
4038 #endif /* CONFIG_NETFILTER_INGRESS */
4039 	return 0;
4040 }
4041 
4042 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4043 {
4044 	struct packet_type *ptype, *pt_prev;
4045 	rx_handler_func_t *rx_handler;
4046 	struct net_device *orig_dev;
4047 	bool deliver_exact = false;
4048 	int ret = NET_RX_DROP;
4049 	__be16 type;
4050 
4051 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4052 
4053 	trace_netif_receive_skb(skb);
4054 
4055 	orig_dev = skb->dev;
4056 
4057 	skb_reset_network_header(skb);
4058 	if (!skb_transport_header_was_set(skb))
4059 		skb_reset_transport_header(skb);
4060 	skb_reset_mac_len(skb);
4061 
4062 	pt_prev = NULL;
4063 
4064 another_round:
4065 	skb->skb_iif = skb->dev->ifindex;
4066 
4067 	__this_cpu_inc(softnet_data.processed);
4068 
4069 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4070 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4071 		skb = skb_vlan_untag(skb);
4072 		if (unlikely(!skb))
4073 			goto out;
4074 	}
4075 
4076 	if (skb_skip_tc_classify(skb))
4077 		goto skip_classify;
4078 
4079 	if (pfmemalloc)
4080 		goto skip_taps;
4081 
4082 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4083 		if (pt_prev)
4084 			ret = deliver_skb(skb, pt_prev, orig_dev);
4085 		pt_prev = ptype;
4086 	}
4087 
4088 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4089 		if (pt_prev)
4090 			ret = deliver_skb(skb, pt_prev, orig_dev);
4091 		pt_prev = ptype;
4092 	}
4093 
4094 skip_taps:
4095 #ifdef CONFIG_NET_INGRESS
4096 	if (static_key_false(&ingress_needed)) {
4097 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4098 		if (!skb)
4099 			goto out;
4100 
4101 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4102 			goto out;
4103 	}
4104 #endif
4105 	skb_reset_tc(skb);
4106 skip_classify:
4107 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4108 		goto drop;
4109 
4110 	if (skb_vlan_tag_present(skb)) {
4111 		if (pt_prev) {
4112 			ret = deliver_skb(skb, pt_prev, orig_dev);
4113 			pt_prev = NULL;
4114 		}
4115 		if (vlan_do_receive(&skb))
4116 			goto another_round;
4117 		else if (unlikely(!skb))
4118 			goto out;
4119 	}
4120 
4121 	rx_handler = rcu_dereference(skb->dev->rx_handler);
4122 	if (rx_handler) {
4123 		if (pt_prev) {
4124 			ret = deliver_skb(skb, pt_prev, orig_dev);
4125 			pt_prev = NULL;
4126 		}
4127 		switch (rx_handler(&skb)) {
4128 		case RX_HANDLER_CONSUMED:
4129 			ret = NET_RX_SUCCESS;
4130 			goto out;
4131 		case RX_HANDLER_ANOTHER:
4132 			goto another_round;
4133 		case RX_HANDLER_EXACT:
4134 			deliver_exact = true;
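			/* fall through */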
4135 		case RX_HANDLER_PASS:
4136 			break;
4137 		default:
4138 			BUG();
4139 		}
4140 	}
4141 
4142 	if (unlikely(skb_vlan_tag_present(skb))) {
4143 		if (skb_vlan_tag_get_id(skb))
4144 			skb->pkt_type = PACKET_OTHERHOST;
4145 		/* Note: we might in the future use prio bits
4146 		 * and set skb->priority like in vlan_do_receive().
4147 		 * For the time being, just ignore the Priority Code Point.
4148 		 */
4149 		skb->vlan_tci = 0;
4150 	}
4151 
4152 	type = skb->protocol;
4153 
4154 	/* deliver only exact match when indicated */
4155 	if (likely(!deliver_exact)) {
4156 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4157 				       &ptype_base[ntohs(type) &
4158 						   PTYPE_HASH_MASK]);
4159 	}
4160 
4161 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4162 			       &orig_dev->ptype_specific);
4163 
4164 	if (unlikely(skb->dev != orig_dev)) {
4165 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4166 				       &skb->dev->ptype_specific);
4167 	}
4168 
4169 	if (pt_prev) {
4170 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4171 			goto drop;
4172 		else
4173 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4174 	} else {
4175 drop:
4176 		if (!deliver_exact)
4177 			atomic_long_inc(&skb->dev->rx_dropped);
4178 		else
4179 			atomic_long_inc(&skb->dev->rx_nohandler);
4180 		kfree_skb(skb);
4181 		/* Jamal, now you will not be able to escape explaining
4182 		 * to me how you were going to use this. :-)
4183 		 */
4184 		ret = NET_RX_DROP;
4185 	}
4186 
4187 out:
4188 	return ret;
4189 }
4190 
4191 static int __netif_receive_skb(struct sk_buff *skb)
4192 {
4193 	int ret;
4194 
4195 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4196 		unsigned long pflags = current->flags;
4197 
4198 		/*
4199 		 * PFMEMALLOC skbs are special; they should
4200 		 * - be delivered to SOCK_MEMALLOC sockets only
4201 		 * - stay away from userspace
4202 		 * - have bounded memory usage
4203 		 *
4204 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4205 		 * context down to all allocation sites.
4206 		 */
4207 		current->flags |= PF_MEMALLOC;
4208 		ret = __netif_receive_skb_core(skb, true);
4209 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4210 	} else
4211 		ret = __netif_receive_skb_core(skb, false);
4212 
4213 	return ret;
4214 }
4215 
4216 static int netif_receive_skb_internal(struct sk_buff *skb)
4217 {
4218 	int ret;
4219 
4220 	net_timestamp_check(netdev_tstamp_prequeue, skb);
4221 
4222 	if (skb_defer_rx_timestamp(skb))
4223 		return NET_RX_SUCCESS;
4224 
4225 	rcu_read_lock();
4226 
4227 #ifdef CONFIG_RPS
4228 	if (static_key_false(&rps_needed)) {
4229 		struct rps_dev_flow voidflow, *rflow = &voidflow;
4230 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4231 
4232 		if (cpu >= 0) {
4233 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4234 			rcu_read_unlock();
4235 			return ret;
4236 		}
4237 	}
4238 #endif
4239 	ret = __netif_receive_skb(skb);
4240 	rcu_read_unlock();
4241 	return ret;
4242 }
4243 
4244 /**
4245  *	netif_receive_skb - process receive buffer from network
4246  *	@skb: buffer to process
4247  *
4248  *	netif_receive_skb() is the main receive data processing function.
4249  *	It always succeeds. The buffer may be dropped during processing
4250  *	for congestion control or by the protocol layers.
4251  *
4252  *	This function may only be called from softirq context and interrupts
4253  *	should be enabled.
4254  *
4255  *	Return values (usually ignored):
4256  *	NET_RX_SUCCESS: no congestion
4257  *	NET_RX_DROP: packet was dropped
4258  */
4259 int netif_receive_skb(struct sk_buff *skb)
4260 {
4261 	trace_netif_receive_skb_entry(skb);
4262 
4263 	return netif_receive_skb_internal(skb);
4264 }
4265 EXPORT_SYMBOL(netif_receive_skb);
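/* Illustrative sketch, not part of this file: a NAPI driver that does not
 * want GRO can hand fully built skbs to netif_receive_skb() from its poll
 * routine instead of calling napi_gro_receive().  my_deliver() is
 * hypothetical.
 */
static void my_deliver(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	/* Runs the full receive path inline, in softirq context. */
	netif_receive_skb(skb);
}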
4266 
4267 DEFINE_PER_CPU(struct work_struct, flush_works);
4268 
4269 /* Network device is going away, flush any packets still pending */
4270 static void flush_backlog(struct work_struct *work)
4271 {
4272 	struct sk_buff *skb, *tmp;
4273 	struct softnet_data *sd;
4274 
4275 	local_bh_disable();
4276 	sd = this_cpu_ptr(&softnet_data);
4277 
4278 	local_irq_disable();
4279 	rps_lock(sd);
4280 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4281 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4282 			__skb_unlink(skb, &sd->input_pkt_queue);
4283 			kfree_skb(skb);
4284 			input_queue_head_incr(sd);
4285 		}
4286 	}
4287 	rps_unlock(sd);
4288 	local_irq_enable();
4289 
4290 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4291 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4292 			__skb_unlink(skb, &sd->process_queue);
4293 			kfree_skb(skb);
4294 			input_queue_head_incr(sd);
4295 		}
4296 	}
4297 	local_bh_enable();
4298 }
4299 
4300 static void flush_all_backlogs(void)
4301 {
4302 	unsigned int cpu;
4303 
4304 	get_online_cpus();
4305 
4306 	for_each_online_cpu(cpu)
4307 		queue_work_on(cpu, system_highpri_wq,
4308 			      per_cpu_ptr(&flush_works, cpu));
4309 
4310 	for_each_online_cpu(cpu)
4311 		flush_work(per_cpu_ptr(&flush_works, cpu));
4312 
4313 	put_online_cpus();
4314 }
4315 
4316 static int napi_gro_complete(struct sk_buff *skb)
4317 {
4318 	struct packet_offload *ptype;
4319 	__be16 type = skb->protocol;
4320 	struct list_head *head = &offload_base;
4321 	int err = -ENOENT;
4322 
4323 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4324 
4325 	if (NAPI_GRO_CB(skb)->count == 1) {
4326 		skb_shinfo(skb)->gso_size = 0;
4327 		goto out;
4328 	}
4329 
4330 	rcu_read_lock();
4331 	list_for_each_entry_rcu(ptype, head, list) {
4332 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4333 			continue;
4334 
4335 		err = ptype->callbacks.gro_complete(skb, 0);
4336 		break;
4337 	}
4338 	rcu_read_unlock();
4339 
4340 	if (err) {
4341 		WARN_ON(&ptype->list == head);
4342 		kfree_skb(skb);
4343 		return NET_RX_SUCCESS;
4344 	}
4345 
4346 out:
4347 	return netif_receive_skb_internal(skb);
4348 }
4349 
4350 /* napi->gro_list contains packets ordered by age;
4351  * the youngest packets are at its head.
4352  * Complete skbs in reverse order to reduce latencies.
4353  */
4354 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4355 {
4356 	struct sk_buff *skb, *prev = NULL;
4357 
4358 	/* scan list and build reverse chain */
4359 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4360 		skb->prev = prev;
4361 		prev = skb;
4362 	}
4363 
4364 	for (skb = prev; skb; skb = prev) {
4365 		skb->next = NULL;
4366 
4367 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4368 			return;
4369 
4370 		prev = skb->prev;
4371 		napi_gro_complete(skb);
4372 		napi->gro_count--;
4373 	}
4374 
4375 	napi->gro_list = NULL;
4376 }
4377 EXPORT_SYMBOL(napi_gro_flush);
4378 
4379 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4380 {
4381 	struct sk_buff *p;
4382 	unsigned int maclen = skb->dev->hard_header_len;
4383 	u32 hash = skb_get_hash_raw(skb);
4384 
4385 	for (p = napi->gro_list; p; p = p->next) {
4386 		unsigned long diffs;
4387 
4388 		NAPI_GRO_CB(p)->flush = 0;
4389 
4390 		if (hash != skb_get_hash_raw(p)) {
4391 			NAPI_GRO_CB(p)->same_flow = 0;
4392 			continue;
4393 		}
4394 
4395 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4396 		diffs |= p->vlan_tci ^ skb->vlan_tci;
4397 		diffs |= skb_metadata_dst_cmp(p, skb);
4398 		if (maclen == ETH_HLEN)
4399 			diffs |= compare_ether_header(skb_mac_header(p),
4400 						      skb_mac_header(skb));
4401 		else if (!diffs)
4402 			diffs = memcmp(skb_mac_header(p),
4403 				       skb_mac_header(skb),
4404 				       maclen);
4405 		NAPI_GRO_CB(p)->same_flow = !diffs;
4406 	}
4407 }
4408 
4409 static void skb_gro_reset_offset(struct sk_buff *skb)
4410 {
4411 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4412 	const skb_frag_t *frag0 = &pinfo->frags[0];
4413 
4414 	NAPI_GRO_CB(skb)->data_offset = 0;
4415 	NAPI_GRO_CB(skb)->frag0 = NULL;
4416 	NAPI_GRO_CB(skb)->frag0_len = 0;
4417 
4418 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4419 	    pinfo->nr_frags &&
4420 	    !PageHighMem(skb_frag_page(frag0))) {
4421 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4422 		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4423 						    skb_frag_size(frag0),
4424 						    skb->end - skb->tail);
4425 	}
4426 }
4427 
4428 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4429 {
4430 	struct skb_shared_info *pinfo = skb_shinfo(skb);
4431 
4432 	BUG_ON(skb->end - skb->tail < grow);
4433 
4434 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4435 
4436 	skb->data_len -= grow;
4437 	skb->tail += grow;
4438 
4439 	pinfo->frags[0].page_offset += grow;
4440 	skb_frag_size_sub(&pinfo->frags[0], grow);
4441 
4442 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4443 		skb_frag_unref(skb, 0);
4444 		memmove(pinfo->frags, pinfo->frags + 1,
4445 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4446 	}
4447 }
4448 
4449 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4450 {
4451 	struct sk_buff **pp = NULL;
4452 	struct packet_offload *ptype;
4453 	__be16 type = skb->protocol;
4454 	struct list_head *head = &offload_base;
4455 	int same_flow;
4456 	enum gro_result ret;
4457 	int grow;
4458 
4459 	if (!(skb->dev->features & NETIF_F_GRO))
4460 		goto normal;
4461 
4462 	if (skb->csum_bad)
4463 		goto normal;
4464 
4465 	gro_list_prepare(napi, skb);
4466 
4467 	rcu_read_lock();
4468 	list_for_each_entry_rcu(ptype, head, list) {
4469 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4470 			continue;
4471 
4472 		skb_set_network_header(skb, skb_gro_offset(skb));
4473 		skb_reset_mac_len(skb);
4474 		NAPI_GRO_CB(skb)->same_flow = 0;
4475 		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4476 		NAPI_GRO_CB(skb)->free = 0;
4477 		NAPI_GRO_CB(skb)->encap_mark = 0;
4478 		NAPI_GRO_CB(skb)->recursion_counter = 0;
4479 		NAPI_GRO_CB(skb)->is_fou = 0;
4480 		NAPI_GRO_CB(skb)->is_atomic = 1;
4481 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4482 
4483 		/* Setup for GRO checksum validation */
4484 		switch (skb->ip_summed) {
4485 		case CHECKSUM_COMPLETE:
4486 			NAPI_GRO_CB(skb)->csum = skb->csum;
4487 			NAPI_GRO_CB(skb)->csum_valid = 1;
4488 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4489 			break;
4490 		case CHECKSUM_UNNECESSARY:
4491 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4492 			NAPI_GRO_CB(skb)->csum_valid = 0;
4493 			break;
4494 		default:
4495 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4496 			NAPI_GRO_CB(skb)->csum_valid = 0;
4497 		}
4498 
4499 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4500 		break;
4501 	}
4502 	rcu_read_unlock();
4503 
4504 	if (&ptype->list == head)
4505 		goto normal;
4506 
4507 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4508 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4509 
4510 	if (pp) {
4511 		struct sk_buff *nskb = *pp;
4512 
4513 		*pp = nskb->next;
4514 		nskb->next = NULL;
4515 		napi_gro_complete(nskb);
4516 		napi->gro_count--;
4517 	}
4518 
4519 	if (same_flow)
4520 		goto ok;
4521 
4522 	if (NAPI_GRO_CB(skb)->flush)
4523 		goto normal;
4524 
4525 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4526 		struct sk_buff *nskb = napi->gro_list;
4527 
4528 		/* locate the end of the list to select the 'oldest' flow */
4529 		while (nskb->next) {
4530 			pp = &nskb->next;
4531 			nskb = *pp;
4532 		}
4533 		*pp = NULL;
4534 		nskb->next = NULL;
4535 		napi_gro_complete(nskb);
4536 	} else {
4537 		napi->gro_count++;
4538 	}
4539 	NAPI_GRO_CB(skb)->count = 1;
4540 	NAPI_GRO_CB(skb)->age = jiffies;
4541 	NAPI_GRO_CB(skb)->last = skb;
4542 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4543 	skb->next = napi->gro_list;
4544 	napi->gro_list = skb;
4545 	ret = GRO_HELD;
4546 
4547 pull:
4548 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4549 	if (grow > 0)
4550 		gro_pull_from_frag0(skb, grow);
4551 ok:
4552 	return ret;
4553 
4554 normal:
4555 	ret = GRO_NORMAL;
4556 	goto pull;
4557 }
4558 
4559 struct packet_offload *gro_find_receive_by_type(__be16 type)
4560 {
4561 	struct list_head *offload_head = &offload_base;
4562 	struct packet_offload *ptype;
4563 
4564 	list_for_each_entry_rcu(ptype, offload_head, list) {
4565 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4566 			continue;
4567 		return ptype;
4568 	}
4569 	return NULL;
4570 }
4571 EXPORT_SYMBOL(gro_find_receive_by_type);
4572 
4573 struct packet_offload *gro_find_complete_by_type(__be16 type)
4574 {
4575 	struct list_head *offload_head = &offload_base;
4576 	struct packet_offload *ptype;
4577 
4578 	list_for_each_entry_rcu(ptype, offload_head, list) {
4579 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4580 			continue;
4581 		return ptype;
4582 	}
4583 	return NULL;
4584 }
4585 EXPORT_SYMBOL(gro_find_complete_by_type);
4586 
4587 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4588 {
4589 	switch (ret) {
4590 	case GRO_NORMAL:
4591 		if (netif_receive_skb_internal(skb))
4592 			ret = GRO_DROP;
4593 		break;
4594 
4595 	case GRO_DROP:
4596 		kfree_skb(skb);
4597 		break;
4598 
4599 	case GRO_MERGED_FREE:
4600 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4601 			skb_dst_drop(skb);
4602 			secpath_reset(skb);
4603 			kmem_cache_free(skbuff_head_cache, skb);
4604 		} else {
4605 			__kfree_skb(skb);
4606 		}
4607 		break;
4608 
4609 	case GRO_HELD:
4610 	case GRO_MERGED:
4611 		break;
4612 	}
4613 
4614 	return ret;
4615 }
4616 
4617 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4618 {
4619 	skb_mark_napi_id(skb, napi);
4620 	trace_napi_gro_receive_entry(skb);
4621 
4622 	skb_gro_reset_offset(skb);
4623 
4624 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4625 }
4626 EXPORT_SYMBOL(napi_gro_receive);
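/* Illustrative sketch, not part of this file: the per-packet hand-off most
 * NAPI drivers use from their poll routine.  struct my_gro_priv,
 * my_gro_build_skb() and my_gro_clean() are hypothetical.
 */
struct my_gro_priv {
	struct net_device *netdev;
	struct napi_struct napi;
};

/* Stand-in for pulling one completed frame off the RX ring and wrapping it
 * in an skb; a real driver reads descriptors here.  Returns NULL when the
 * ring is empty.
 */
static struct sk_buff *my_gro_build_skb(struct my_gro_priv *priv)
{
	return NULL;
}

static int my_gro_clean(struct my_gro_priv *priv, int budget)
{
	int work_done = 0;
	struct sk_buff *skb;

	while (work_done < budget &&
	       (skb = my_gro_build_skb(priv)) != NULL) {
		skb->protocol = eth_type_trans(skb, priv->netdev);
		/* GRO either merges the skb into an existing flow or passes
		 * it on to the normal receive path.
		 */
		napi_gro_receive(&priv->napi, skb);
		work_done++;
	}
	return work_done;
}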
4627 
4628 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4629 {
4630 	if (unlikely(skb->pfmemalloc)) {
4631 		consume_skb(skb);
4632 		return;
4633 	}
4634 	__skb_pull(skb, skb_headlen(skb));
4635 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4636 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4637 	skb->vlan_tci = 0;
4638 	skb->dev = napi->dev;
4639 	skb->skb_iif = 0;
4640 	skb->encapsulation = 0;
4641 	skb_shinfo(skb)->gso_type = 0;
4642 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4643 	secpath_reset(skb);
4644 
4645 	napi->skb = skb;
4646 }
4647 
4648 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4649 {
4650 	struct sk_buff *skb = napi->skb;
4651 
4652 	if (!skb) {
4653 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4654 		if (skb) {
4655 			napi->skb = skb;
4656 			skb_mark_napi_id(skb, napi);
4657 		}
4658 	}
4659 	return skb;
4660 }
4661 EXPORT_SYMBOL(napi_get_frags);
4662 
4663 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4664 				      struct sk_buff *skb,
4665 				      gro_result_t ret)
4666 {
4667 	switch (ret) {
4668 	case GRO_NORMAL:
4669 	case GRO_HELD:
4670 		__skb_push(skb, ETH_HLEN);
4671 		skb->protocol = eth_type_trans(skb, skb->dev);
4672 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4673 			ret = GRO_DROP;
4674 		break;
4675 
4676 	case GRO_DROP:
4677 	case GRO_MERGED_FREE:
4678 		napi_reuse_skb(napi, skb);
4679 		break;
4680 
4681 	case GRO_MERGED:
4682 		break;
4683 	}
4684 
4685 	return ret;
4686 }
4687 
4688 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4689  * Drivers could call both napi_gro_frags() and napi_gro_receive(),
4690  * so we copy the ethernet header into skb->data to have a common layout.
4691  */
4692 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4693 {
4694 	struct sk_buff *skb = napi->skb;
4695 	const struct ethhdr *eth;
4696 	unsigned int hlen = sizeof(*eth);
4697 
4698 	napi->skb = NULL;
4699 
4700 	skb_reset_mac_header(skb);
4701 	skb_gro_reset_offset(skb);
4702 
4703 	eth = skb_gro_header_fast(skb, 0);
4704 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4705 		eth = skb_gro_header_slow(skb, hlen, 0);
4706 		if (unlikely(!eth)) {
4707 			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4708 					     __func__, napi->dev->name);
4709 			napi_reuse_skb(napi, skb);
4710 			return NULL;
4711 		}
4712 	} else {
4713 		gro_pull_from_frag0(skb, hlen);
4714 		NAPI_GRO_CB(skb)->frag0 += hlen;
4715 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4716 	}
4717 	__skb_pull(skb, hlen);
4718 
4719 	/*
4720 	 * This works because the only protocols we care about don't require
4721 	 * special handling.
4722 	 * We'll fix it up properly in napi_frags_finish()
4723 	 */
4724 	skb->protocol = eth->h_proto;
4725 
4726 	return skb;
4727 }
4728 
4729 gro_result_t napi_gro_frags(struct napi_struct *napi)
4730 {
4731 	struct sk_buff *skb = napi_frags_skb(napi);
4732 
4733 	if (!skb)
4734 		return GRO_DROP;
4735 
4736 	trace_napi_gro_frags_entry(skb);
4737 
4738 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4739 }
4740 EXPORT_SYMBOL(napi_gro_frags);
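/* Illustrative sketch, not part of this file: the napi_get_frags() /
 * napi_gro_frags() pattern used by drivers that receive straight into
 * pages.  my_frags_rx() is hypothetical; the page, offset and length would
 * come from an RX descriptor, and truesize accounting is simplified to one
 * page per frame.
 */
static void my_frags_rx(struct napi_struct *napi, struct page *page,
			unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);
		return;
	}

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	/* napi_frags_skb() will pull the ethernet header out of frag0 and
	 * set skb->protocol before running GRO.
	 */
	napi_gro_frags(napi);
}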
4741 
4742 /* Compute the checksum from gro_offset and return the folded value
4743  * after adding in any pseudo checksum.
4744  */
4745 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4746 {
4747 	__wsum wsum;
4748 	__sum16 sum;
4749 
4750 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4751 
4752 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4753 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4754 	if (likely(!sum)) {
4755 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4756 		    !skb->csum_complete_sw)
4757 			netdev_rx_csum_fault(skb->dev);
4758 	}
4759 
4760 	NAPI_GRO_CB(skb)->csum = wsum;
4761 	NAPI_GRO_CB(skb)->csum_valid = 1;
4762 
4763 	return sum;
4764 }
4765 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4766 
4767 /*
4768  * net_rps_action_and_irq_enable sends any pending IPIs for rps.
4769  * Note: called with local irq disabled, but exits with local irq enabled.
4770  */
4771 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4772 {
4773 #ifdef CONFIG_RPS
4774 	struct softnet_data *remsd = sd->rps_ipi_list;
4775 
4776 	if (remsd) {
4777 		sd->rps_ipi_list = NULL;
4778 
4779 		local_irq_enable();
4780 
4781 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4782 		/* Send pending IPIs to kick RPS processing on remote cpus. */
4783 			struct softnet_data *next = remsd->rps_ipi_next;
4784 
4785 			if (cpu_online(remsd->cpu))
4786 				smp_call_function_single_async(remsd->cpu,
4787 							   &remsd->csd);
4788 			remsd = next;
4789 		}
4790 	} else
4791 #endif
4792 		local_irq_enable();
4793 }
4794 
4795 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4796 {
4797 #ifdef CONFIG_RPS
4798 	return sd->rps_ipi_list != NULL;
4799 #else
4800 	return false;
4801 #endif
4802 }
4803 
4804 static int process_backlog(struct napi_struct *napi, int quota)
4805 {
4806 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4807 	bool again = true;
4808 	int work = 0;
4809 
4810 	/* Check if we have pending IPIs; it's better to send them now
4811 	 * rather than waiting for net_rx_action() to end.
4812 	 */
4813 	if (sd_has_rps_ipi_waiting(sd)) {
4814 		local_irq_disable();
4815 		net_rps_action_and_irq_enable(sd);
4816 	}
4817 
4818 	napi->weight = dev_rx_weight;
4819 	while (again) {
4820 		struct sk_buff *skb;
4821 
4822 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4823 			rcu_read_lock();
4824 			__netif_receive_skb(skb);
4825 			rcu_read_unlock();
4826 			input_queue_head_incr(sd);
4827 			if (++work >= quota)
4828 				return work;
4829 
4830 		}
4831 
4832 		local_irq_disable();
4833 		rps_lock(sd);
4834 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4835 			/*
4836 			 * Inline a custom version of __napi_complete().
4837 			 * Only the current cpu owns and manipulates this napi,
4838 			 * and NAPI_STATE_SCHED is the only possible flag set
4839 			 * on backlog.
4840 			 * We can use a plain write instead of clear_bit(),
4841 			 * and we don't need an smp_mb() memory barrier.
4842 			 */
4843 			napi->state = 0;
4844 			again = false;
4845 		} else {
4846 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4847 						   &sd->process_queue);
4848 		}
4849 		rps_unlock(sd);
4850 		local_irq_enable();
4851 	}
4852 
4853 	return work;
4854 }
4855 
4856 /**
4857  * __napi_schedule - schedule for receive
4858  * @n: entry to schedule
4859  *
4860  * The entry's receive function will be scheduled to run.
4861  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4862  */
4863 void __napi_schedule(struct napi_struct *n)
4864 {
4865 	unsigned long flags;
4866 
4867 	local_irq_save(flags);
4868 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4869 	local_irq_restore(flags);
4870 }
4871 EXPORT_SYMBOL(__napi_schedule);
4872 
4873 /**
4874  * __napi_schedule_irqoff - schedule for receive
4875  * @n: entry to schedule
4876  *
4877  * Variant of __napi_schedule() assuming hard irqs are masked
4878  */
4879 void __napi_schedule_irqoff(struct napi_struct *n)
4880 {
4881 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4882 }
4883 EXPORT_SYMBOL(__napi_schedule_irqoff);
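/* Illustrative sketch, not part of this file: a hardirq handler kicking
 * NAPI.  Since hard interrupts are masked here, the _irqoff variant can be
 * used.  struct my_irq_priv and my_rx_isr() are hypothetical, and the usual
 * <linux/interrupt.h> declarations are assumed.
 */
struct my_irq_priv {
	struct napi_struct napi;
};

static irqreturn_t my_rx_isr(int irq, void *dev_id)
{
	struct my_irq_priv *priv = dev_id;

	/* A real driver would mask further RX interrupts in hardware here
	 * before scheduling the poll routine.
	 */
	if (napi_schedule_prep(&priv->napi))
		__napi_schedule_irqoff(&priv->napi);

	return IRQ_HANDLED;
}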
4884 
4885 bool __napi_complete(struct napi_struct *n)
4886 {
4887 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4888 
4889 	/* Some drivers call us directly, instead of calling
4890 	 * napi_complete_done().
4891 	 */
4892 	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4893 		return false;
4894 
4895 	list_del_init(&n->poll_list);
4896 	smp_mb__before_atomic();
4897 	clear_bit(NAPI_STATE_SCHED, &n->state);
4898 	return true;
4899 }
4900 EXPORT_SYMBOL(__napi_complete);
4901 
4902 bool napi_complete_done(struct napi_struct *n, int work_done)
4903 {
4904 	unsigned long flags;
4905 
4906 	/*
4907 	 * 1) Don't let napi dequeue from the cpu poll list
4908 	 *    just in case it's running on a different cpu.
4909 	 * 2) If we are busy polling, do nothing here, we have
4910 	 *    the guarantee we will be called later.
4911 	 */
4912 	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4913 				 NAPIF_STATE_IN_BUSY_POLL)))
4914 		return false;
4915 
4916 	if (n->gro_list) {
4917 		unsigned long timeout = 0;
4918 
4919 		if (work_done)
4920 			timeout = n->dev->gro_flush_timeout;
4921 
4922 		if (timeout)
4923 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4924 				      HRTIMER_MODE_REL_PINNED);
4925 		else
4926 			napi_gro_flush(n, false);
4927 	}
4928 	if (likely(list_empty(&n->poll_list))) {
4929 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4930 	} else {
4931 		/* If n->poll_list is not empty, we need to mask irqs */
4932 		local_irq_save(flags);
4933 		__napi_complete(n);
4934 		local_irq_restore(flags);
4935 	}
4936 	return true;
4937 }
4938 EXPORT_SYMBOL(napi_complete_done);
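/* Illustrative sketch, not part of this file: the usual poll-routine
 * completion pattern around napi_complete_done().  struct my_poll_priv,
 * my_clean_rx() and my_napi_poll() are hypothetical.
 */
struct my_poll_priv {
	struct napi_struct napi;
};

static int my_clean_rx(struct my_poll_priv *priv, int budget)
{
	/* A real driver processes up to @budget frames here. */
	return 0;
}

static int my_napi_poll(struct napi_struct *napi, int budget)
{
	struct my_poll_priv *priv = container_of(napi, struct my_poll_priv,
						 napi);
	int work_done = my_clean_rx(priv, budget);

	if (work_done < budget && napi_complete_done(napi, work_done)) {
		/* NAPI really finished (it is not being busy-polled):
		 * re-enable RX interrupts in hardware here.
		 */
	}
	return work_done;
}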
4939 
4940 /* must be called under rcu_read_lock(), as we dont take a reference */
4941 static struct napi_struct *napi_by_id(unsigned int napi_id)
4942 {
4943 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4944 	struct napi_struct *napi;
4945 
4946 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4947 		if (napi->napi_id == napi_id)
4948 			return napi;
4949 
4950 	return NULL;
4951 }
4952 
4953 #if defined(CONFIG_NET_RX_BUSY_POLL)
4954 
4955 #define BUSY_POLL_BUDGET 8
4956 
4957 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4958 {
4959 	int rc;
4960 
4961 	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4962 
4963 	local_bh_disable();
4964 
4965 	/* All we really want here is to re-enable device interrupts.
4966 	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4967 	 */
4968 	rc = napi->poll(napi, BUSY_POLL_BUDGET);
4969 	netpoll_poll_unlock(have_poll_lock);
4970 	if (rc == BUSY_POLL_BUDGET)
4971 		__napi_schedule(napi);
4972 	local_bh_enable();
4973 	if (local_softirq_pending())
4974 		do_softirq();
4975 }
4976 
4977 bool sk_busy_loop(struct sock *sk, int nonblock)
4978 {
4979 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4980 	int (*napi_poll)(struct napi_struct *napi, int budget);
4981 	int (*busy_poll)(struct napi_struct *dev);
4982 	void *have_poll_lock = NULL;
4983 	struct napi_struct *napi;
4984 	int rc;
4985 
4986 restart:
4987 	rc = false;
4988 	napi_poll = NULL;
4989 
4990 	rcu_read_lock();
4991 
4992 	napi = napi_by_id(sk->sk_napi_id);
4993 	if (!napi)
4994 		goto out;
4995 
4996 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
4997 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4998 
4999 	preempt_disable();
5000 	for (;;) {
5001 		rc = 0;
5002 		local_bh_disable();
5003 		if (busy_poll) {
5004 			rc = busy_poll(napi);
5005 			goto count;
5006 		}
5007 		if (!napi_poll) {
5008 			unsigned long val = READ_ONCE(napi->state);
5009 
5010 			/* If multiple threads are competing for this napi,
5011 			 * we avoid dirtying napi->state as much as we can.
5012 			 */
5013 			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5014 				   NAPIF_STATE_IN_BUSY_POLL))
5015 				goto count;
5016 			if (cmpxchg(&napi->state, val,
5017 				    val | NAPIF_STATE_IN_BUSY_POLL |
5018 					  NAPIF_STATE_SCHED) != val)
5019 				goto count;
5020 			have_poll_lock = netpoll_poll_lock(napi);
5021 			napi_poll = napi->poll;
5022 		}
5023 		rc = napi_poll(napi, BUSY_POLL_BUDGET);
5024 		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5025 count:
5026 		if (rc > 0)
5027 			__NET_ADD_STATS(sock_net(sk),
5028 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5029 		local_bh_enable();
5030 
5031 		if (rc == LL_FLUSH_FAILED)
5032 			break; /* permanent failure */
5033 
5034 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5035 		    busy_loop_timeout(end_time))
5036 			break;
5037 
5038 		if (unlikely(need_resched())) {
5039 			if (napi_poll)
5040 				busy_poll_stop(napi, have_poll_lock);
5041 			preempt_enable();
5042 			rcu_read_unlock();
5043 			cond_resched();
5044 			rc = !skb_queue_empty(&sk->sk_receive_queue);
5045 			if (rc || busy_loop_timeout(end_time))
5046 				return rc;
5047 			goto restart;
5048 		}
5049 		cpu_relax();
5050 	}
5051 	if (napi_poll)
5052 		busy_poll_stop(napi, have_poll_lock);
5053 	preempt_enable();
5054 	rc = !skb_queue_empty(&sk->sk_receive_queue);
5055 out:
5056 	rcu_read_unlock();
5057 	return rc;
5058 }
5059 EXPORT_SYMBOL(sk_busy_loop);
5060 
5061 #endif /* CONFIG_NET_RX_BUSY_POLL */
5062 
5063 static void napi_hash_add(struct napi_struct *napi)
5064 {
5065 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5066 	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5067 		return;
5068 
5069 	spin_lock(&napi_hash_lock);
5070 
5071 	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5072 	do {
5073 		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5074 			napi_gen_id = NR_CPUS + 1;
5075 	} while (napi_by_id(napi_gen_id));
5076 	napi->napi_id = napi_gen_id;
5077 
5078 	hlist_add_head_rcu(&napi->napi_hash_node,
5079 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5080 
5081 	spin_unlock(&napi_hash_lock);
5082 }
5083 
5084 /* Warning: the caller is responsible for making sure an rcu grace period
5085  * is respected before freeing the memory containing @napi
5086  */
5087 bool napi_hash_del(struct napi_struct *napi)
5088 {
5089 	bool rcu_sync_needed = false;
5090 
5091 	spin_lock(&napi_hash_lock);
5092 
5093 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5094 		rcu_sync_needed = true;
5095 		hlist_del_rcu(&napi->napi_hash_node);
5096 	}
5097 	spin_unlock(&napi_hash_lock);
5098 	return rcu_sync_needed;
5099 }
5100 EXPORT_SYMBOL_GPL(napi_hash_del);
5101 
5102 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5103 {
5104 	struct napi_struct *napi;
5105 
5106 	napi = container_of(timer, struct napi_struct, timer);
5107 	if (napi->gro_list)
5108 		napi_schedule(napi);
5109 
5110 	return HRTIMER_NORESTART;
5111 }
5112 
5113 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5114 		    int (*poll)(struct napi_struct *, int), int weight)
5115 {
5116 	INIT_LIST_HEAD(&napi->poll_list);
5117 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5118 	napi->timer.function = napi_watchdog;
5119 	napi->gro_count = 0;
5120 	napi->gro_list = NULL;
5121 	napi->skb = NULL;
5122 	napi->poll = poll;
5123 	if (weight > NAPI_POLL_WEIGHT)
5124 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5125 			    weight, dev->name);
5126 	napi->weight = weight;
5127 	list_add(&napi->dev_list, &dev->napi_list);
5128 	napi->dev = dev;
5129 #ifdef CONFIG_NETPOLL
5130 	napi->poll_owner = -1;
5131 #endif
5132 	set_bit(NAPI_STATE_SCHED, &napi->state);
5133 	napi_hash_add(napi);
5134 }
5135 EXPORT_SYMBOL(netif_napi_add);
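/* Illustrative sketch, not part of this file: registering a NAPI instance
 * at probe time and enabling it when the interface is brought up.
 * struct my_setup_priv, my_sketch_poll(), my_probe_napi() and
 * my_open_napi() are hypothetical; NAPI_POLL_WEIGHT is the usual weight.
 */
struct my_setup_priv {
	struct net_device *netdev;
	struct napi_struct napi;
};

static int my_sketch_poll(struct napi_struct *napi, int budget)
{
	/* see the napi_complete_done() sketch above for the usual body */
	return 0;
}

static void my_probe_napi(struct my_setup_priv *priv)
{
	netif_napi_add(priv->netdev, &priv->napi, my_sketch_poll,
		       NAPI_POLL_WEIGHT);
}

static void my_open_napi(struct my_setup_priv *priv)
{
	napi_enable(&priv->napi);
}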
5136 
5137 void napi_disable(struct napi_struct *n)
5138 {
5139 	might_sleep();
5140 	set_bit(NAPI_STATE_DISABLE, &n->state);
5141 
5142 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5143 		msleep(1);
5144 	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5145 		msleep(1);
5146 
5147 	hrtimer_cancel(&n->timer);
5148 
5149 	clear_bit(NAPI_STATE_DISABLE, &n->state);
5150 }
5151 EXPORT_SYMBOL(napi_disable);
5152 
5153 /* Must be called in process context */
5154 void netif_napi_del(struct napi_struct *napi)
5155 {
5156 	might_sleep();
5157 	if (napi_hash_del(napi))
5158 		synchronize_net();
5159 	list_del_init(&napi->dev_list);
5160 	napi_free_frags(napi);
5161 
5162 	kfree_skb_list(napi->gro_list);
5163 	napi->gro_list = NULL;
5164 	napi->gro_count = 0;
5165 }
5166 EXPORT_SYMBOL(netif_napi_del);
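/* Illustrative sketch, not part of this file: the matching teardown order
 * for the setup sketch above: quiesce the instance on ifdown, remove it on
 * driver unload.  struct my_teardown_priv and the helpers are hypothetical.
 */
struct my_teardown_priv {
	struct napi_struct napi;
};

static void my_stop_napi(struct my_teardown_priv *priv)
{
	/* Waits for any in-flight poll to finish and blocks rescheduling. */
	napi_disable(&priv->napi);
}

static void my_remove_napi(struct my_teardown_priv *priv)
{
	/* Process context only; may sleep in synchronize_net(). */
	netif_napi_del(&priv->napi);
}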
5167 
5168 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5169 {
5170 	void *have;
5171 	int work, weight;
5172 
5173 	list_del_init(&n->poll_list);
5174 
5175 	have = netpoll_poll_lock(n);
5176 
5177 	weight = n->weight;
5178 
5179 	/* This NAPI_STATE_SCHED test is for avoiding a race
5180 	 * with netpoll's poll_napi().  Only the entity which
5181 	 * obtains the lock and sees NAPI_STATE_SCHED set will
5182 	 * actually make the ->poll() call.  Therefore we avoid
5183 	 * accidentally calling ->poll() when NAPI is not scheduled.
5184 	 */
5185 	work = 0;
5186 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5187 		work = n->poll(n, weight);
5188 		trace_napi_poll(n, work, weight);
5189 	}
5190 
5191 	WARN_ON_ONCE(work > weight);
5192 
5193 	if (likely(work < weight))
5194 		goto out_unlock;
5195 
5196 	/* Drivers must not modify the NAPI state if they
5197 	 * consume the entire weight.  In such cases this code
5198 	 * still "owns" the NAPI instance and therefore can
5199 	 * move the instance around on the list at-will.
5200 	 */
5201 	if (unlikely(napi_disable_pending(n))) {
5202 		napi_complete(n);
5203 		goto out_unlock;
5204 	}
5205 
5206 	if (n->gro_list) {
5207 		/* flush too old packets
5208 		 * If HZ < 1000, flush all packets.
5209 		 */
5210 		napi_gro_flush(n, HZ >= 1000);
5211 	}
5212 
5213 	/* Some drivers may have called napi_schedule
5214 	 * prior to exhausting their budget.
5215 	 */
5216 	if (unlikely(!list_empty(&n->poll_list))) {
5217 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5218 			     n->dev ? n->dev->name : "backlog");
5219 		goto out_unlock;
5220 	}
5221 
5222 	list_add_tail(&n->poll_list, repoll);
5223 
5224 out_unlock:
5225 	netpoll_poll_unlock(have);
5226 
5227 	return work;
5228 }
5229 
5230 static __latent_entropy void net_rx_action(struct softirq_action *h)
5231 {
5232 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5233 	unsigned long time_limit = jiffies + 2;
5234 	int budget = netdev_budget;
5235 	LIST_HEAD(list);
5236 	LIST_HEAD(repoll);
5237 
5238 	local_irq_disable();
5239 	list_splice_init(&sd->poll_list, &list);
5240 	local_irq_enable();
5241 
5242 	for (;;) {
5243 		struct napi_struct *n;
5244 
5245 		if (list_empty(&list)) {
5246 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5247 				goto out;
5248 			break;
5249 		}
5250 
5251 		n = list_first_entry(&list, struct napi_struct, poll_list);
5252 		budget -= napi_poll(n, &repoll);
5253 
5254 		/* If softirq window is exhausted then punt.
5255 		 * Allow this to run for 2 jiffies, which will allow
5256 		 * an average latency of 1.5/HZ.
5257 		 */
5258 		if (unlikely(budget <= 0 ||
5259 			     time_after_eq(jiffies, time_limit))) {
5260 			sd->time_squeeze++;
5261 			break;
5262 		}
5263 	}
5264 
5265 	local_irq_disable();
5266 
5267 	list_splice_tail_init(&sd->poll_list, &list);
5268 	list_splice_tail(&repoll, &list);
5269 	list_splice(&list, &sd->poll_list);
5270 	if (!list_empty(&sd->poll_list))
5271 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5272 
5273 	net_rps_action_and_irq_enable(sd);
5274 out:
5275 	__kfree_skb_flush();
5276 }
5277 
5278 struct netdev_adjacent {
5279 	struct net_device *dev;
5280 
5281 	/* upper master flag, there can only be one master device per list */
5282 	bool master;
5283 
5284 	/* counter for the number of times this device was added to us */
5285 	u16 ref_nr;
5286 
5287 	/* private field for the users */
5288 	void *private;
5289 
5290 	struct list_head list;
5291 	struct rcu_head rcu;
5292 };
5293 
5294 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5295 						 struct list_head *adj_list)
5296 {
5297 	struct netdev_adjacent *adj;
5298 
5299 	list_for_each_entry(adj, adj_list, list) {
5300 		if (adj->dev == adj_dev)
5301 			return adj;
5302 	}
5303 	return NULL;
5304 }
5305 
5306 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5307 {
5308 	struct net_device *dev = data;
5309 
5310 	return upper_dev == dev;
5311 }
5312 
5313 /**
5314  * netdev_has_upper_dev - Check if device is linked to an upper device
5315  * @dev: device
5316  * @upper_dev: upper device to check
5317  *
5318  * Find out if a device is linked to the specified upper device and return true
5319  * in case it is. Note that this checks only immediate upper device,
5320  * not through a complete stack of devices. The caller must hold the RTNL lock.
5321  */
5322 bool netdev_has_upper_dev(struct net_device *dev,
5323 			  struct net_device *upper_dev)
5324 {
5325 	ASSERT_RTNL();
5326 
5327 	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5328 					     upper_dev);
5329 }
5330 EXPORT_SYMBOL(netdev_has_upper_dev);
5331 
5332 /**
5333  * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5334  * @dev: device
5335  * @upper_dev: upper device to check
5336  *
5337  * Find out if a device is linked to the specified upper device and return
5338  * true in case it is. Note that this checks the entire upper device chain.
5339  * The caller must hold the RCU read lock.
5340  */
5341 
5342 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5343 				  struct net_device *upper_dev)
5344 {
5345 	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5346 					       upper_dev);
5347 }
5348 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5349 
5350 /**
5351  * netdev_has_any_upper_dev - Check if device is linked to some device
5352  * @dev: device
5353  *
5354  * Find out if a device is linked to an upper device and return true in case
5355  * it is. The caller must hold the RTNL lock.
5356  */
5357 static bool netdev_has_any_upper_dev(struct net_device *dev)
5358 {
5359 	ASSERT_RTNL();
5360 
5361 	return !list_empty(&dev->adj_list.upper);
5362 }
5363 
5364 /**
5365  * netdev_master_upper_dev_get - Get master upper device
5366  * @dev: device
5367  *
5368  * Find a master upper device and return pointer to it or NULL in case
5369  * it's not there. The caller must hold the RTNL lock.
5370  */
5371 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5372 {
5373 	struct netdev_adjacent *upper;
5374 
5375 	ASSERT_RTNL();
5376 
5377 	if (list_empty(&dev->adj_list.upper))
5378 		return NULL;
5379 
5380 	upper = list_first_entry(&dev->adj_list.upper,
5381 				 struct netdev_adjacent, list);
5382 	if (likely(upper->master))
5383 		return upper->dev;
5384 	return NULL;
5385 }
5386 EXPORT_SYMBOL(netdev_master_upper_dev_get);
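/* Illustrative sketch, not part of this file: querying the master device
 * (e.g. a bond or bridge) above a port while holding RTNL.
 * my_report_master() is hypothetical.
 */
static void my_report_master(struct net_device *port_dev)
{
	struct net_device *master;

	ASSERT_RTNL();
	master = netdev_master_upper_dev_get(port_dev);
	if (master)
		netdev_info(port_dev, "enslaved to %s\n", master->name);
}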
5387 
5388 /**
5389  * netdev_has_any_lower_dev - Check if device is linked to some device
5390  * @dev: device
5391  *
5392  * Find out if a device is linked to a lower device and return true in case
5393  * it is. The caller must hold the RTNL lock.
5394  */
5395 static bool netdev_has_any_lower_dev(struct net_device *dev)
5396 {
5397 	ASSERT_RTNL();
5398 
5399 	return !list_empty(&dev->adj_list.lower);
5400 }
5401 
5402 void *netdev_adjacent_get_private(struct list_head *adj_list)
5403 {
5404 	struct netdev_adjacent *adj;
5405 
5406 	adj = list_entry(adj_list, struct netdev_adjacent, list);
5407 
5408 	return adj->private;
5409 }
5410 EXPORT_SYMBOL(netdev_adjacent_get_private);
5411 
5412 /**
5413  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5414  * @dev: device
5415  * @iter: list_head ** of the current position
5416  *
5417  * Gets the next device from the dev's upper list, starting from iter
5418  * position. The caller must hold RCU read lock.
5419  */
5420 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5421 						 struct list_head **iter)
5422 {
5423 	struct netdev_adjacent *upper;
5424 
5425 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5426 
5427 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5428 
5429 	if (&upper->list == &dev->adj_list.upper)
5430 		return NULL;
5431 
5432 	*iter = &upper->list;
5433 
5434 	return upper->dev;
5435 }
5436 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5437 
5438 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5439 						    struct list_head **iter)
5440 {
5441 	struct netdev_adjacent *upper;
5442 
5443 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5444 
5445 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5446 
5447 	if (&upper->list == &dev->adj_list.upper)
5448 		return NULL;
5449 
5450 	*iter = &upper->list;
5451 
5452 	return upper->dev;
5453 }
5454 
5455 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5456 				  int (*fn)(struct net_device *dev,
5457 					    void *data),
5458 				  void *data)
5459 {
5460 	struct net_device *udev;
5461 	struct list_head *iter;
5462 	int ret;
5463 
5464 	for (iter = &dev->adj_list.upper,
5465 	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5466 	     udev;
5467 	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5468 		/* first is the upper device itself */
5469 		ret = fn(udev, data);
5470 		if (ret)
5471 			return ret;
5472 
5473 		/* then look at all of its upper devices */
5474 		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5475 		if (ret)
5476 			return ret;
5477 	}
5478 
5479 	return 0;
5480 }
5481 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
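/* Illustrative sketch, not part of this file: a walker callback counting
 * every device stacked above @dev.  A non-zero return from the callback
 * stops the walk and is propagated back to the caller.
 * my_count_upper() and my_upper_count() are hypothetical.
 */
static int my_count_upper(struct net_device *upper, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;	/* keep walking */
}

static unsigned int my_upper_count(struct net_device *dev)
{
	unsigned int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, my_count_upper, &count);
	rcu_read_unlock();

	return count;
}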
5482 
5483 /**
5484  * netdev_lower_get_next_private - Get the next ->private from the
5485  *				   lower neighbour list
5486  * @dev: device
5487  * @iter: list_head ** of the current position
5488  *
5489  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5490  * list, starting from iter position. The caller must hold either hold the
5491  * RTNL lock or its own locking that guarantees that the neighbour lower
5492  * list will remain unchanged.
5493  */
5494 void *netdev_lower_get_next_private(struct net_device *dev,
5495 				    struct list_head **iter)
5496 {
5497 	struct netdev_adjacent *lower;
5498 
5499 	lower = list_entry(*iter, struct netdev_adjacent, list);
5500 
5501 	if (&lower->list == &dev->adj_list.lower)
5502 		return NULL;
5503 
5504 	*iter = lower->list.next;
5505 
5506 	return lower->private;
5507 }
5508 EXPORT_SYMBOL(netdev_lower_get_next_private);
5509 
5510 /**
5511  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5512  *				       lower neighbour list, RCU
5513  *				       variant
5514  * @dev: device
5515  * @iter: list_head ** of the current position
5516  *
5517  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5518  * list, starting from iter position. The caller must hold RCU read lock.
5519  */
5520 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5521 					struct list_head **iter)
5522 {
5523 	struct netdev_adjacent *lower;
5524 
5525 	WARN_ON_ONCE(!rcu_read_lock_held());
5526 
5527 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5528 
5529 	if (&lower->list == &dev->adj_list.lower)
5530 		return NULL;
5531 
5532 	*iter = &lower->list;
5533 
5534 	return lower->private;
5535 }
5536 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5537 
5538 /**
5539  * netdev_lower_get_next - Get the next device from the lower neighbour
5540  *                         list
5541  * @dev: device
5542  * @iter: list_head ** of the current position
5543  *
5544  * Gets the next device from the dev's lower neighbour
5545  * list, starting from iter position. The caller must hold RTNL lock or
5546  * its own locking that guarantees that the neighbour lower
5547  * list will remain unchanged.
5548  */
5549 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5550 {
5551 	struct netdev_adjacent *lower;
5552 
5553 	lower = list_entry(*iter, struct netdev_adjacent, list);
5554 
5555 	if (&lower->list == &dev->adj_list.lower)
5556 		return NULL;
5557 
5558 	*iter = lower->list.next;
5559 
5560 	return lower->dev;
5561 }
5562 EXPORT_SYMBOL(netdev_lower_get_next);
5563 
5564 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5565 						struct list_head **iter)
5566 {
5567 	struct netdev_adjacent *lower;
5568 
5569 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5570 
5571 	if (&lower->list == &dev->adj_list.lower)
5572 		return NULL;
5573 
5574 	*iter = &lower->list;
5575 
5576 	return lower->dev;
5577 }
5578 
5579 int netdev_walk_all_lower_dev(struct net_device *dev,
5580 			      int (*fn)(struct net_device *dev,
5581 					void *data),
5582 			      void *data)
5583 {
5584 	struct net_device *ldev;
5585 	struct list_head *iter;
5586 	int ret;
5587 
5588 	for (iter = &dev->adj_list.lower,
5589 	     ldev = netdev_next_lower_dev(dev, &iter);
5590 	     ldev;
5591 	     ldev = netdev_next_lower_dev(dev, &iter)) {
5592 		/* first is the lower device itself */
5593 		ret = fn(ldev, data);
5594 		if (ret)
5595 			return ret;
5596 
5597 		/* then look at all of its lower devices */
5598 		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5599 		if (ret)
5600 			return ret;
5601 	}
5602 
5603 	return 0;
5604 }
5605 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5606 
5607 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5608 						    struct list_head **iter)
5609 {
5610 	struct netdev_adjacent *lower;
5611 
5612 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5613 	if (&lower->list == &dev->adj_list.lower)
5614 		return NULL;
5615 
5616 	*iter = &lower->list;
5617 
5618 	return lower->dev;
5619 }
5620 
5621 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5622 				  int (*fn)(struct net_device *dev,
5623 					    void *data),
5624 				  void *data)
5625 {
5626 	struct net_device *ldev;
5627 	struct list_head *iter;
5628 	int ret;
5629 
5630 	for (iter = &dev->adj_list.lower,
5631 	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5632 	     ldev;
5633 	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5634 		/* first is the lower device itself */
5635 		ret = fn(ldev, data);
5636 		if (ret)
5637 			return ret;
5638 
5639 		/* then look at all of its lower devices */
5640 		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5641 		if (ret)
5642 			return ret;
5643 	}
5644 
5645 	return 0;
5646 }
5647 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5648 
5649 /**
5650  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5651  *				       lower neighbour list, RCU
5652  *				       variant
5653  * @dev: device
5654  *
5655  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5656  * list. The caller must hold RCU read lock.
5657  */
5658 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5659 {
5660 	struct netdev_adjacent *lower;
5661 
5662 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5663 			struct netdev_adjacent, list);
5664 	if (lower)
5665 		return lower->private;
5666 	return NULL;
5667 }
5668 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5669 
5670 /**
5671  * netdev_master_upper_dev_get_rcu - Get master upper device
5672  * @dev: device
5673  *
5674  * Find a master upper device and return pointer to it or NULL in case
5675  * it's not there. The caller must hold the RCU read lock.
5676  */
5677 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5678 {
5679 	struct netdev_adjacent *upper;
5680 
5681 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5682 				       struct netdev_adjacent, list);
5683 	if (upper && likely(upper->master))
5684 		return upper->dev;
5685 	return NULL;
5686 }
5687 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5688 
5689 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5690 			      struct net_device *adj_dev,
5691 			      struct list_head *dev_list)
5692 {
5693 	char linkname[IFNAMSIZ+7];
5694 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5695 		"upper_%s" : "lower_%s", adj_dev->name);
5696 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5697 				 linkname);
5698 }
5699 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5700 			       char *name,
5701 			       struct list_head *dev_list)
5702 {
5703 	char linkname[IFNAMSIZ+7];
5704 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5705 		"upper_%s" : "lower_%s", name);
5706 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5707 }
5708 
5709 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5710 						 struct net_device *adj_dev,
5711 						 struct list_head *dev_list)
5712 {
5713 	return (dev_list == &dev->adj_list.upper ||
5714 		dev_list == &dev->adj_list.lower) &&
5715 		net_eq(dev_net(dev), dev_net(adj_dev));
5716 }
5717 
5718 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5719 					struct net_device *adj_dev,
5720 					struct list_head *dev_list,
5721 					void *private, bool master)
5722 {
5723 	struct netdev_adjacent *adj;
5724 	int ret;
5725 
5726 	adj = __netdev_find_adj(adj_dev, dev_list);
5727 
5728 	if (adj) {
5729 		adj->ref_nr += 1;
5730 		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5731 			 dev->name, adj_dev->name, adj->ref_nr);
5732 
5733 		return 0;
5734 	}
5735 
5736 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5737 	if (!adj)
5738 		return -ENOMEM;
5739 
5740 	adj->dev = adj_dev;
5741 	adj->master = master;
5742 	adj->ref_nr = 1;
5743 	adj->private = private;
5744 	dev_hold(adj_dev);
5745 
5746 	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5747 		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5748 
5749 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5750 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5751 		if (ret)
5752 			goto free_adj;
5753 	}
5754 
5755 	/* Ensure that the master link is always the first item in the list. */
5756 	if (master) {
5757 		ret = sysfs_create_link(&(dev->dev.kobj),
5758 					&(adj_dev->dev.kobj), "master");
5759 		if (ret)
5760 			goto remove_symlinks;
5761 
5762 		list_add_rcu(&adj->list, dev_list);
5763 	} else {
5764 		list_add_tail_rcu(&adj->list, dev_list);
5765 	}
5766 
5767 	return 0;
5768 
5769 remove_symlinks:
5770 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5771 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5772 free_adj:
5773 	kfree(adj);
5774 	dev_put(adj_dev);
5775 
5776 	return ret;
5777 }
5778 
5779 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5780 					 struct net_device *adj_dev,
5781 					 u16 ref_nr,
5782 					 struct list_head *dev_list)
5783 {
5784 	struct netdev_adjacent *adj;
5785 
5786 	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5787 		 dev->name, adj_dev->name, ref_nr);
5788 
5789 	adj = __netdev_find_adj(adj_dev, dev_list);
5790 
5791 	if (!adj) {
5792 		pr_err("Adjacency does not exist for device %s from %s\n",
5793 		       dev->name, adj_dev->name);
5794 		WARN_ON(1);
5795 		return;
5796 	}
5797 
5798 	if (adj->ref_nr > ref_nr) {
5799 		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5800 			 dev->name, adj_dev->name, ref_nr,
5801 			 adj->ref_nr - ref_nr);
5802 		adj->ref_nr -= ref_nr;
5803 		return;
5804 	}
5805 
5806 	if (adj->master)
5807 		sysfs_remove_link(&(dev->dev.kobj), "master");
5808 
5809 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5810 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5811 
5812 	list_del_rcu(&adj->list);
5813 	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5814 		 adj_dev->name, dev->name, adj_dev->name);
5815 	dev_put(adj_dev);
5816 	kfree_rcu(adj, rcu);
5817 }
5818 
5819 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5820 					    struct net_device *upper_dev,
5821 					    struct list_head *up_list,
5822 					    struct list_head *down_list,
5823 					    void *private, bool master)
5824 {
5825 	int ret;
5826 
5827 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5828 					   private, master);
5829 	if (ret)
5830 		return ret;
5831 
5832 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5833 					   private, false);
5834 	if (ret) {
5835 		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5836 		return ret;
5837 	}
5838 
5839 	return 0;
5840 }
5841 
5842 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5843 					       struct net_device *upper_dev,
5844 					       u16 ref_nr,
5845 					       struct list_head *up_list,
5846 					       struct list_head *down_list)
5847 {
5848 	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5849 	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5850 }
5851 
5852 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5853 						struct net_device *upper_dev,
5854 						void *private, bool master)
5855 {
5856 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5857 						&dev->adj_list.upper,
5858 						&upper_dev->adj_list.lower,
5859 						private, master);
5860 }
5861 
5862 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5863 						   struct net_device *upper_dev)
5864 {
5865 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5866 					   &dev->adj_list.upper,
5867 					   &upper_dev->adj_list.lower);
5868 }
5869 
5870 static int __netdev_upper_dev_link(struct net_device *dev,
5871 				   struct net_device *upper_dev, bool master,
5872 				   void *upper_priv, void *upper_info)
5873 {
5874 	struct netdev_notifier_changeupper_info changeupper_info;
5875 	int ret = 0;
5876 
5877 	ASSERT_RTNL();
5878 
5879 	if (dev == upper_dev)
5880 		return -EBUSY;
5881 
5882 	/* To prevent loops, check that dev is not an upper device of upper_dev. */
5883 	if (netdev_has_upper_dev(upper_dev, dev))
5884 		return -EBUSY;
5885 
5886 	if (netdev_has_upper_dev(dev, upper_dev))
5887 		return -EEXIST;
5888 
5889 	if (master && netdev_master_upper_dev_get(dev))
5890 		return -EBUSY;
5891 
5892 	changeupper_info.upper_dev = upper_dev;
5893 	changeupper_info.master = master;
5894 	changeupper_info.linking = true;
5895 	changeupper_info.upper_info = upper_info;
5896 
5897 	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5898 					    &changeupper_info.info);
5899 	ret = notifier_to_errno(ret);
5900 	if (ret)
5901 		return ret;
5902 
5903 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5904 						   master);
5905 	if (ret)
5906 		return ret;
5907 
5908 	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5909 					    &changeupper_info.info);
5910 	ret = notifier_to_errno(ret);
5911 	if (ret)
5912 		goto rollback;
5913 
5914 	return 0;
5915 
5916 rollback:
5917 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5918 
5919 	return ret;
5920 }
5921 
5922 /**
5923  * netdev_upper_dev_link - Add a link to the upper device
5924  * @dev: device
5925  * @upper_dev: new upper device
5926  *
5927  * Adds a link to a device which is upper to this one. The caller must hold
5928  * the RTNL lock. On a failure a negative errno code is returned.
5929  * On success the reference counts are adjusted and the function
5930  * returns zero.
5931  */
5932 int netdev_upper_dev_link(struct net_device *dev,
5933 			  struct net_device *upper_dev)
5934 {
5935 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5936 }
5937 EXPORT_SYMBOL(netdev_upper_dev_link);
5938 
5939 /**
5940  * netdev_master_upper_dev_link - Add a master link to the upper device
5941  * @dev: device
5942  * @upper_dev: new upper device
5943  * @upper_priv: upper device private
5944  * @upper_info: upper info to be passed down via notifier
5945  *
5946  * Adds a link to a device which is upper to this one. In this case, only
5947  * one master upper device can be linked, although other non-master devices
5948  * might be linked as well. The caller must hold the RTNL lock.
5949  * On a failure a negative errno code is returned. On success the reference
5950  * counts are adjusted and the function returns zero.
5951  */
5952 int netdev_master_upper_dev_link(struct net_device *dev,
5953 				 struct net_device *upper_dev,
5954 				 void *upper_priv, void *upper_info)
5955 {
5956 	return __netdev_upper_dev_link(dev, upper_dev, true,
5957 				       upper_priv, upper_info);
5958 }
5959 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5960 
5961 /**
5962  * netdev_upper_dev_unlink - Removes a link to upper device
5963  * @dev: device
5964  * @upper_dev: upper device to remove the link to
5965  *
5966  * Removes the link to a device which is upper to this one. The caller must hold
5967  * the RTNL lock.
5968  */
5969 void netdev_upper_dev_unlink(struct net_device *dev,
5970 			     struct net_device *upper_dev)
5971 {
5972 	struct netdev_notifier_changeupper_info changeupper_info;
5973 	ASSERT_RTNL();
5974 
5975 	changeupper_info.upper_dev = upper_dev;
5976 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5977 	changeupper_info.linking = false;
5978 
5979 	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5980 				      &changeupper_info.info);
5981 
5982 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5983 
5984 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5985 				      &changeupper_info.info);
5986 }
5987 EXPORT_SYMBOL(netdev_upper_dev_unlink);
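
/* Illustrative sketch (example only): a stacking driver pairs the link and
 * unlink calls under RTNL; "vdev" (the upper device) and "phys_dev" (the
 * lower device) are hypothetical names.
 *
 *	ASSERT_RTNL();
 *	err = netdev_upper_dev_link(phys_dev, vdev);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(phys_dev, vdev);
 */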
5988 
5989 /**
5990  * netdev_bonding_info_change - Dispatch event about slave change
5991  * @dev: device
5992  * @bonding_info: info to dispatch
5993  *
5994  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5995  * The caller must hold the RTNL lock.
5996  */
5997 void netdev_bonding_info_change(struct net_device *dev,
5998 				struct netdev_bonding_info *bonding_info)
5999 {
6000 	struct netdev_notifier_bonding_info	info;
6001 
6002 	memcpy(&info.bonding_info, bonding_info,
6003 	       sizeof(struct netdev_bonding_info));
6004 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6005 				      &info.info);
6006 }
6007 EXPORT_SYMBOL(netdev_bonding_info_change);
6008 
6009 static void netdev_adjacent_add_links(struct net_device *dev)
6010 {
6011 	struct netdev_adjacent *iter;
6012 
6013 	struct net *net = dev_net(dev);
6014 
6015 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6016 		if (!net_eq(net, dev_net(iter->dev)))
6017 			continue;
6018 		netdev_adjacent_sysfs_add(iter->dev, dev,
6019 					  &iter->dev->adj_list.lower);
6020 		netdev_adjacent_sysfs_add(dev, iter->dev,
6021 					  &dev->adj_list.upper);
6022 	}
6023 
6024 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6025 		if (!net_eq(net, dev_net(iter->dev)))
6026 			continue;
6027 		netdev_adjacent_sysfs_add(iter->dev, dev,
6028 					  &iter->dev->adj_list.upper);
6029 		netdev_adjacent_sysfs_add(dev, iter->dev,
6030 					  &dev->adj_list.lower);
6031 	}
6032 }
6033 
6034 static void netdev_adjacent_del_links(struct net_device *dev)
6035 {
6036 	struct netdev_adjacent *iter;
6037 
6038 	struct net *net = dev_net(dev);
6039 
6040 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6041 		if (!net_eq(net, dev_net(iter->dev)))
6042 			continue;
6043 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6044 					  &iter->dev->adj_list.lower);
6045 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6046 					  &dev->adj_list.upper);
6047 	}
6048 
6049 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6050 		if (!net_eq(net, dev_net(iter->dev)))
6051 			continue;
6052 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6053 					  &iter->dev->adj_list.upper);
6054 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6055 					  &dev->adj_list.lower);
6056 	}
6057 }
6058 
6059 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6060 {
6061 	struct netdev_adjacent *iter;
6062 
6063 	struct net *net = dev_net(dev);
6064 
6065 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6066 		if (!net_eq(net, dev_net(iter->dev)))
6067 			continue;
6068 		netdev_adjacent_sysfs_del(iter->dev, oldname,
6069 					  &iter->dev->adj_list.lower);
6070 		netdev_adjacent_sysfs_add(iter->dev, dev,
6071 					  &iter->dev->adj_list.lower);
6072 	}
6073 
6074 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6075 		if (!net_eq(net, dev_net(iter->dev)))
6076 			continue;
6077 		netdev_adjacent_sysfs_del(iter->dev, oldname,
6078 					  &iter->dev->adj_list.upper);
6079 		netdev_adjacent_sysfs_add(iter->dev, dev,
6080 					  &iter->dev->adj_list.upper);
6081 	}
6082 }
6083 
6084 void *netdev_lower_dev_get_private(struct net_device *dev,
6085 				   struct net_device *lower_dev)
6086 {
6087 	struct netdev_adjacent *lower;
6088 
6089 	if (!lower_dev)
6090 		return NULL;
6091 	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6092 	if (!lower)
6093 		return NULL;
6094 
6095 	return lower->private;
6096 }
6097 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6098 
6099 
6100 int dev_get_nest_level(struct net_device *dev)
6101 {
6102 	struct net_device *lower = NULL;
6103 	struct list_head *iter;
6104 	int max_nest = -1;
6105 	int nest;
6106 
6107 	ASSERT_RTNL();
6108 
6109 	netdev_for_each_lower_dev(dev, lower, iter) {
6110 		nest = dev_get_nest_level(lower);
6111 		if (max_nest < nest)
6112 			max_nest = nest;
6113 	}
6114 
6115 	return max_nest + 1;
6116 }
6117 EXPORT_SYMBOL(dev_get_nest_level);
6118 
6119 /**
6120  * netdev_lower_state_changed - Dispatch event about lower device state change
6121  * @lower_dev: device
6122  * @lower_state_info: state to dispatch
6123  *
6124  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6125  * The caller must hold the RTNL lock.
6126  */
6127 void netdev_lower_state_changed(struct net_device *lower_dev,
6128 				void *lower_state_info)
6129 {
6130 	struct netdev_notifier_changelowerstate_info changelowerstate_info;
6131 
6132 	ASSERT_RTNL();
6133 	changelowerstate_info.lower_state_info = lower_state_info;
6134 	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6135 				      &changelowerstate_info.info);
6136 }
6137 EXPORT_SYMBOL(netdev_lower_state_changed);
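
/* Illustrative sketch (example only): a LAG driver such as bonding would
 * typically pass a struct netdev_lag_lower_state_info here; "slave_dev" is
 * a placeholder for the lower device.
 *
 *	struct netdev_lag_lower_state_info info = {
 *		.link_up    = true,
 *		.tx_enabled = true,
 *	};
 *
 *	netdev_lower_state_changed(slave_dev, &info);
 */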
6138 
6139 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6140 					   struct neighbour *n)
6141 {
6142 	struct net_device *lower_dev, *stop_dev;
6143 	struct list_head *iter;
6144 	int err;
6145 
6146 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6147 		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6148 			continue;
6149 		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6150 		if (err) {
6151 			stop_dev = lower_dev;
6152 			goto rollback;
6153 		}
6154 	}
6155 	return 0;
6156 
6157 rollback:
6158 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6159 		if (lower_dev == stop_dev)
6160 			break;
6161 		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6162 			continue;
6163 		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6164 	}
6165 	return err;
6166 }
6167 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6168 
6169 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6170 					  struct neighbour *n)
6171 {
6172 	struct net_device *lower_dev;
6173 	struct list_head *iter;
6174 
6175 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6176 		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6177 			continue;
6178 		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6179 	}
6180 }
6181 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6182 
6183 static void dev_change_rx_flags(struct net_device *dev, int flags)
6184 {
6185 	const struct net_device_ops *ops = dev->netdev_ops;
6186 
6187 	if (ops->ndo_change_rx_flags)
6188 		ops->ndo_change_rx_flags(dev, flags);
6189 }
6190 
6191 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6192 {
6193 	unsigned int old_flags = dev->flags;
6194 	kuid_t uid;
6195 	kgid_t gid;
6196 
6197 	ASSERT_RTNL();
6198 
6199 	dev->flags |= IFF_PROMISC;
6200 	dev->promiscuity += inc;
6201 	if (dev->promiscuity == 0) {
6202 		/*
6203 		 * Avoid overflow.
6204 		 * If inc causes an overflow, leave promisc untouched and return an error.
6205 		 */
6206 		if (inc < 0)
6207 			dev->flags &= ~IFF_PROMISC;
6208 		else {
6209 			dev->promiscuity -= inc;
6210 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6211 				dev->name);
6212 			return -EOVERFLOW;
6213 		}
6214 	}
6215 	if (dev->flags != old_flags) {
6216 		pr_info("device %s %s promiscuous mode\n",
6217 			dev->name,
6218 			dev->flags & IFF_PROMISC ? "entered" : "left");
6219 		if (audit_enabled) {
6220 			current_uid_gid(&uid, &gid);
6221 			audit_log(current->audit_context, GFP_ATOMIC,
6222 				AUDIT_ANOM_PROMISCUOUS,
6223 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6224 				dev->name, (dev->flags & IFF_PROMISC),
6225 				(old_flags & IFF_PROMISC),
6226 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6227 				from_kuid(&init_user_ns, uid),
6228 				from_kgid(&init_user_ns, gid),
6229 				audit_get_sessionid(current));
6230 		}
6231 
6232 		dev_change_rx_flags(dev, IFF_PROMISC);
6233 	}
6234 	if (notify)
6235 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6236 	return 0;
6237 }
6238 
6239 /**
6240  *	dev_set_promiscuity	- update promiscuity count on a device
6241  *	@dev: device
6242  *	@inc: modifier
6243  *
6244  *	Add or remove promiscuity from a device. While the count in the device
6245  *	remains above zero the interface remains promiscuous. Once it hits zero
6246  *	the device reverts back to normal filtering operation. A negative inc
6247  *	value is used to drop promiscuity on the device.
6248  *	Return 0 if successful or a negative errno code on error.
6249  */
6250 int dev_set_promiscuity(struct net_device *dev, int inc)
6251 {
6252 	unsigned int old_flags = dev->flags;
6253 	int err;
6254 
6255 	err = __dev_set_promiscuity(dev, inc, true);
6256 	if (err < 0)
6257 		return err;
6258 	if (dev->flags != old_flags)
6259 		dev_set_rx_mode(dev);
6260 	return err;
6261 }
6262 EXPORT_SYMBOL(dev_set_promiscuity);
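
/* Illustrative sketch (example only): packet-capture style users increment
 * the counter while they need to see all traffic and drop it symmetrically
 * when done; both calls are made under RTNL.
 *
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 */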
6263 
6264 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6265 {
6266 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6267 
6268 	ASSERT_RTNL();
6269 
6270 	dev->flags |= IFF_ALLMULTI;
6271 	dev->allmulti += inc;
6272 	if (dev->allmulti == 0) {
6273 		/*
6274 		 * Avoid overflow.
6275 		 * If inc causes an overflow, leave allmulti untouched and return an error.
6276 		 */
6277 		if (inc < 0)
6278 			dev->flags &= ~IFF_ALLMULTI;
6279 		else {
6280 			dev->allmulti -= inc;
6281 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6282 				dev->name);
6283 			return -EOVERFLOW;
6284 		}
6285 	}
6286 	if (dev->flags ^ old_flags) {
6287 		dev_change_rx_flags(dev, IFF_ALLMULTI);
6288 		dev_set_rx_mode(dev);
6289 		if (notify)
6290 			__dev_notify_flags(dev, old_flags,
6291 					   dev->gflags ^ old_gflags);
6292 	}
6293 	return 0;
6294 }
6295 
6296 /**
6297  *	dev_set_allmulti	- update allmulti count on a device
6298  *	@dev: device
6299  *	@inc: modifier
6300  *
6301  *	Add or remove reception of all multicast frames to a device. While the
6302  *	count in the device remains above zero the interface remains listening
6303  *	to all interfaces. Once it hits zero the device reverts back to normal
6304  *	filtering operation. A negative @inc value is used to drop the counter
6305  *	when releasing a resource needing all multicasts.
6306  *	Return 0 if successful or a negative errno code on error.
6307  */
6308 
6309 int dev_set_allmulti(struct net_device *dev, int inc)
6310 {
6311 	return __dev_set_allmulti(dev, inc, true);
6312 }
6313 EXPORT_SYMBOL(dev_set_allmulti);
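
/* Illustrative sketch (example only): a multicast routing user takes and
 * releases its all-multicast reference the same counted way, under RTNL.
 *
 *	err = dev_set_allmulti(dev, 1);
 *	...
 *	dev_set_allmulti(dev, -1);
 */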
6314 
6315 /*
6316  *	Upload unicast and multicast address lists to device and
6317  *	configure RX filtering. When the device doesn't support unicast
6318  *	filtering it is put in promiscuous mode while unicast addresses
6319  *	are present.
6320  */
6321 void __dev_set_rx_mode(struct net_device *dev)
6322 {
6323 	const struct net_device_ops *ops = dev->netdev_ops;
6324 
6325 	/* dev_open will call this function so the list will stay sane. */
6326 	if (!(dev->flags&IFF_UP))
6327 		return;
6328 
6329 	if (!netif_device_present(dev))
6330 		return;
6331 
6332 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6333 		/* Unicast address changes may only happen under the rtnl,
6334 		 * therefore calling __dev_set_promiscuity here is safe.
6335 		 */
6336 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6337 			__dev_set_promiscuity(dev, 1, false);
6338 			dev->uc_promisc = true;
6339 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6340 			__dev_set_promiscuity(dev, -1, false);
6341 			dev->uc_promisc = false;
6342 		}
6343 	}
6344 
6345 	if (ops->ndo_set_rx_mode)
6346 		ops->ndo_set_rx_mode(dev);
6347 }
6348 
6349 void dev_set_rx_mode(struct net_device *dev)
6350 {
6351 	netif_addr_lock_bh(dev);
6352 	__dev_set_rx_mode(dev);
6353 	netif_addr_unlock_bh(dev);
6354 }
6355 
6356 /**
6357  *	dev_get_flags - get flags reported to userspace
6358  *	@dev: device
6359  *
6360  *	Get the combination of flag bits exported through APIs to userspace.
6361  */
6362 unsigned int dev_get_flags(const struct net_device *dev)
6363 {
6364 	unsigned int flags;
6365 
6366 	flags = (dev->flags & ~(IFF_PROMISC |
6367 				IFF_ALLMULTI |
6368 				IFF_RUNNING |
6369 				IFF_LOWER_UP |
6370 				IFF_DORMANT)) |
6371 		(dev->gflags & (IFF_PROMISC |
6372 				IFF_ALLMULTI));
6373 
6374 	if (netif_running(dev)) {
6375 		if (netif_oper_up(dev))
6376 			flags |= IFF_RUNNING;
6377 		if (netif_carrier_ok(dev))
6378 			flags |= IFF_LOWER_UP;
6379 		if (netif_dormant(dev))
6380 			flags |= IFF_DORMANT;
6381 	}
6382 
6383 	return flags;
6384 }
6385 EXPORT_SYMBOL(dev_get_flags);
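
/* Illustrative sketch (example only): the returned value mirrors what
 * SIOCGIFFLAGS reports, so a caller can test the user-visible state:
 *
 *	if (dev_get_flags(dev) & IFF_RUNNING)
 *		... the interface is operationally up as seen from userspace ...
 */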
6386 
6387 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6388 {
6389 	unsigned int old_flags = dev->flags;
6390 	int ret;
6391 
6392 	ASSERT_RTNL();
6393 
6394 	/*
6395 	 *	Set the flags on our device.
6396 	 */
6397 
6398 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6399 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6400 			       IFF_AUTOMEDIA)) |
6401 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6402 				    IFF_ALLMULTI));
6403 
6404 	/*
6405 	 *	Load in the correct multicast list now that the flags have changed.
6406 	 */
6407 
6408 	if ((old_flags ^ flags) & IFF_MULTICAST)
6409 		dev_change_rx_flags(dev, IFF_MULTICAST);
6410 
6411 	dev_set_rx_mode(dev);
6412 
6413 	/*
6414 	 *	Have we downed the interface? We handle IFF_UP ourselves
6415 	 *	according to user attempts to set it, rather than blindly
6416 	 *	setting it.
6417 	 */
6418 
6419 	ret = 0;
6420 	if ((old_flags ^ flags) & IFF_UP)
6421 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6422 
6423 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6424 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6425 		unsigned int old_flags = dev->flags;
6426 
6427 		dev->gflags ^= IFF_PROMISC;
6428 
6429 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6430 			if (dev->flags != old_flags)
6431 				dev_set_rx_mode(dev);
6432 	}
6433 
6434 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6435 	   is important. Some (broken) drivers set IFF_PROMISC when
6436 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
6437 	 */
6438 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6439 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6440 
6441 		dev->gflags ^= IFF_ALLMULTI;
6442 		__dev_set_allmulti(dev, inc, false);
6443 	}
6444 
6445 	return ret;
6446 }
6447 
6448 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6449 			unsigned int gchanges)
6450 {
6451 	unsigned int changes = dev->flags ^ old_flags;
6452 
6453 	if (gchanges)
6454 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6455 
6456 	if (changes & IFF_UP) {
6457 		if (dev->flags & IFF_UP)
6458 			call_netdevice_notifiers(NETDEV_UP, dev);
6459 		else
6460 			call_netdevice_notifiers(NETDEV_DOWN, dev);
6461 	}
6462 
6463 	if (dev->flags & IFF_UP &&
6464 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6465 		struct netdev_notifier_change_info change_info;
6466 
6467 		change_info.flags_changed = changes;
6468 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6469 					      &change_info.info);
6470 	}
6471 }
6472 
6473 /**
6474  *	dev_change_flags - change device settings
6475  *	@dev: device
6476  *	@flags: device state flags
6477  *
6478  *	Change settings on device based state flags. The flags are
6479  *	in the userspace exported format.
6480  */
6481 int dev_change_flags(struct net_device *dev, unsigned int flags)
6482 {
6483 	int ret;
6484 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6485 
6486 	ret = __dev_change_flags(dev, flags);
6487 	if (ret < 0)
6488 		return ret;
6489 
6490 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6491 	__dev_notify_flags(dev, old_flags, changes);
6492 	return ret;
6493 }
6494 EXPORT_SYMBOL(dev_change_flags);
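
/* Illustrative sketch (example only): bringing an interface administratively
 * up the same way the SIOCSIFFLAGS path does; RTNL must be held.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	err = dev_change_flags(dev, flags | IFF_UP);
 */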
6495 
6496 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6497 {
6498 	const struct net_device_ops *ops = dev->netdev_ops;
6499 
6500 	if (ops->ndo_change_mtu)
6501 		return ops->ndo_change_mtu(dev, new_mtu);
6502 
6503 	dev->mtu = new_mtu;
6504 	return 0;
6505 }
6506 
6507 /**
6508  *	dev_set_mtu - Change maximum transfer unit
6509  *	@dev: device
6510  *	@new_mtu: new transfer unit
6511  *
6512  *	Change the maximum transfer size of the network device.
6513  */
6514 int dev_set_mtu(struct net_device *dev, int new_mtu)
6515 {
6516 	int err, orig_mtu;
6517 
6518 	if (new_mtu == dev->mtu)
6519 		return 0;
6520 
6521 	/* MTU must be positive, and in range */
6522 	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6523 		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6524 				    dev->name, new_mtu, dev->min_mtu);
6525 		return -EINVAL;
6526 	}
6527 
6528 	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6529 		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6530 				    dev->name, new_mtu, dev->max_mtu);
6531 		return -EINVAL;
6532 	}
6533 
6534 	if (!netif_device_present(dev))
6535 		return -ENODEV;
6536 
6537 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6538 	err = notifier_to_errno(err);
6539 	if (err)
6540 		return err;
6541 
6542 	orig_mtu = dev->mtu;
6543 	err = __dev_set_mtu(dev, new_mtu);
6544 
6545 	if (!err) {
6546 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6547 		err = notifier_to_errno(err);
6548 		if (err) {
6549 			/* setting mtu back and notifying everyone again,
6550 			 * so that they have a chance to revert changes.
6551 			 */
6552 			__dev_set_mtu(dev, orig_mtu);
6553 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6554 		}
6555 	}
6556 	return err;
6557 }
6558 EXPORT_SYMBOL(dev_set_mtu);
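
/* Illustrative sketch (example only): callers outside the ioctl/netlink
 * paths take RTNL themselves before changing the MTU; 9000 is just an
 * example jumbo-frame value.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */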
6559 
6560 /**
6561  *	dev_set_group - Change group this device belongs to
6562  *	@dev: device
6563  *	@new_group: group this device should belong to
6564  */
6565 void dev_set_group(struct net_device *dev, int new_group)
6566 {
6567 	dev->group = new_group;
6568 }
6569 EXPORT_SYMBOL(dev_set_group);
6570 
6571 /**
6572  *	dev_set_mac_address - Change Media Access Control Address
6573  *	@dev: device
6574  *	@sa: new address
6575  *
6576  *	Change the hardware (MAC) address of the device
6577  */
6578 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6579 {
6580 	const struct net_device_ops *ops = dev->netdev_ops;
6581 	int err;
6582 
6583 	if (!ops->ndo_set_mac_address)
6584 		return -EOPNOTSUPP;
6585 	if (sa->sa_family != dev->type)
6586 		return -EINVAL;
6587 	if (!netif_device_present(dev))
6588 		return -ENODEV;
6589 	err = ops->ndo_set_mac_address(dev, sa);
6590 	if (err)
6591 		return err;
6592 	dev->addr_assign_type = NET_ADDR_SET;
6593 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6594 	add_device_randomness(dev->dev_addr, dev->addr_len);
6595 	return 0;
6596 }
6597 EXPORT_SYMBOL(dev_set_mac_address);
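
/* Illustrative sketch (example only): a caller fills a struct sockaddr whose
 * sa_family matches dev->type before calling, under RTNL; "new_mac" is a
 * hypothetical ETH_ALEN-byte buffer.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *	err = dev_set_mac_address(dev, &sa);
 */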
6598 
6599 /**
6600  *	dev_change_carrier - Change device carrier
6601  *	@dev: device
6602  *	@new_carrier: new value
6603  *
6604  *	Change device carrier
6605  */
6606 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6607 {
6608 	const struct net_device_ops *ops = dev->netdev_ops;
6609 
6610 	if (!ops->ndo_change_carrier)
6611 		return -EOPNOTSUPP;
6612 	if (!netif_device_present(dev))
6613 		return -ENODEV;
6614 	return ops->ndo_change_carrier(dev, new_carrier);
6615 }
6616 EXPORT_SYMBOL(dev_change_carrier);
6617 
6618 /**
6619  *	dev_get_phys_port_id - Get device physical port ID
6620  *	@dev: device
6621  *	@ppid: port ID
6622  *
6623  *	Get device physical port ID
6624  */
6625 int dev_get_phys_port_id(struct net_device *dev,
6626 			 struct netdev_phys_item_id *ppid)
6627 {
6628 	const struct net_device_ops *ops = dev->netdev_ops;
6629 
6630 	if (!ops->ndo_get_phys_port_id)
6631 		return -EOPNOTSUPP;
6632 	return ops->ndo_get_phys_port_id(dev, ppid);
6633 }
6634 EXPORT_SYMBOL(dev_get_phys_port_id);
6635 
6636 /**
6637  *	dev_get_phys_port_name - Get device physical port name
6638  *	@dev: device
6639  *	@name: port name
6640  *	@len: limit of bytes to copy to name
6641  *
6642  *	Get device physical port name
6643  */
6644 int dev_get_phys_port_name(struct net_device *dev,
6645 			   char *name, size_t len)
6646 {
6647 	const struct net_device_ops *ops = dev->netdev_ops;
6648 
6649 	if (!ops->ndo_get_phys_port_name)
6650 		return -EOPNOTSUPP;
6651 	return ops->ndo_get_phys_port_name(dev, name, len);
6652 }
6653 EXPORT_SYMBOL(dev_get_phys_port_name);
6654 
6655 /**
6656  *	dev_change_proto_down - update protocol port state information
6657  *	@dev: device
6658  *	@proto_down: new value
6659  *
6660  *	This info can be used by switch drivers to set the phys state of the
6661  *	port.
6662  */
6663 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6664 {
6665 	const struct net_device_ops *ops = dev->netdev_ops;
6666 
6667 	if (!ops->ndo_change_proto_down)
6668 		return -EOPNOTSUPP;
6669 	if (!netif_device_present(dev))
6670 		return -ENODEV;
6671 	return ops->ndo_change_proto_down(dev, proto_down);
6672 }
6673 EXPORT_SYMBOL(dev_change_proto_down);
6674 
6675 /**
6676  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
6677  *	@dev: device
6678  *	@fd: new program fd or negative value to clear
6679  *	@flags: xdp-related flags
6680  *
6681  *	Set or clear a bpf program for a device
6682  */
6683 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6684 {
6685 	const struct net_device_ops *ops = dev->netdev_ops;
6686 	struct bpf_prog *prog = NULL;
6687 	struct netdev_xdp xdp;
6688 	int err;
6689 
6690 	ASSERT_RTNL();
6691 
6692 	if (!ops->ndo_xdp)
6693 		return -EOPNOTSUPP;
6694 	if (fd >= 0) {
6695 		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6696 			memset(&xdp, 0, sizeof(xdp));
6697 			xdp.command = XDP_QUERY_PROG;
6698 
6699 			err = ops->ndo_xdp(dev, &xdp);
6700 			if (err < 0)
6701 				return err;
6702 			if (xdp.prog_attached)
6703 				return -EBUSY;
6704 		}
6705 
6706 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6707 		if (IS_ERR(prog))
6708 			return PTR_ERR(prog);
6709 	}
6710 
6711 	memset(&xdp, 0, sizeof(xdp));
6712 	xdp.command = XDP_SETUP_PROG;
6713 	xdp.prog = prog;
6714 
6715 	err = ops->ndo_xdp(dev, &xdp);
6716 	if (err < 0 && prog)
6717 		bpf_prog_put(prog);
6718 
6719 	return err;
6720 }
6721 EXPORT_SYMBOL(dev_change_xdp_fd);
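
/* Illustrative sketch (example only): the rtnetlink IFLA_XDP path is the
 * usual caller; under RTNL and with "prog_fd" supplied from userspace it
 * conceptually boils down to the following, a negative fd detaching any
 * attached program.
 *
 *	err = dev_change_xdp_fd(dev, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);
 *	...
 *	err = dev_change_xdp_fd(dev, -1, 0);
 */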
6722 
6723 /**
6724  *	dev_new_index	-	allocate an ifindex
6725  *	@net: the applicable net namespace
6726  *
6727  *	Returns a suitable unique value for a new device interface
6728  *	number.  The caller must hold the rtnl semaphore or the
6729  *	dev_base_lock to be sure it remains unique.
6730  */
6731 static int dev_new_index(struct net *net)
6732 {
6733 	int ifindex = net->ifindex;
6734 	for (;;) {
6735 		if (++ifindex <= 0)
6736 			ifindex = 1;
6737 		if (!__dev_get_by_index(net, ifindex))
6738 			return net->ifindex = ifindex;
6739 	}
6740 }
6741 
6742 /* Delayed registration/unregisteration */
6743 static LIST_HEAD(net_todo_list);
6744 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6745 
6746 static void net_set_todo(struct net_device *dev)
6747 {
6748 	list_add_tail(&dev->todo_list, &net_todo_list);
6749 	dev_net(dev)->dev_unreg_count++;
6750 }
6751 
6752 static void rollback_registered_many(struct list_head *head)
6753 {
6754 	struct net_device *dev, *tmp;
6755 	LIST_HEAD(close_head);
6756 
6757 	BUG_ON(dev_boot_phase);
6758 	ASSERT_RTNL();
6759 
6760 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6761 		/* Some devices call this without ever having been
6762 		 * registered, as part of initialization unwind. Remove
6763 		 * those devices and proceed with the remaining ones.
6764 		 */
6765 		if (dev->reg_state == NETREG_UNINITIALIZED) {
6766 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6767 				 dev->name, dev);
6768 
6769 			WARN_ON(1);
6770 			list_del(&dev->unreg_list);
6771 			continue;
6772 		}
6773 		dev->dismantle = true;
6774 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6775 	}
6776 
6777 	/* If device is running, close it first. */
6778 	list_for_each_entry(dev, head, unreg_list)
6779 		list_add_tail(&dev->close_list, &close_head);
6780 	dev_close_many(&close_head, true);
6781 
6782 	list_for_each_entry(dev, head, unreg_list) {
6783 		/* And unlink it from device chain. */
6784 		unlist_netdevice(dev);
6785 
6786 		dev->reg_state = NETREG_UNREGISTERING;
6787 	}
6788 	flush_all_backlogs();
6789 
6790 	synchronize_net();
6791 
6792 	list_for_each_entry(dev, head, unreg_list) {
6793 		struct sk_buff *skb = NULL;
6794 
6795 		/* Shutdown queueing discipline. */
6796 		dev_shutdown(dev);
6797 
6798 
6799 		/* Notify protocols that we are about to destroy
6800 		   this device. They should clean up all of their state.
6801 		*/
6802 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6803 
6804 		if (!dev->rtnl_link_ops ||
6805 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6806 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6807 						     GFP_KERNEL);
6808 
6809 		/*
6810 		 *	Flush the unicast and multicast chains
6811 		 */
6812 		dev_uc_flush(dev);
6813 		dev_mc_flush(dev);
6814 
6815 		if (dev->netdev_ops->ndo_uninit)
6816 			dev->netdev_ops->ndo_uninit(dev);
6817 
6818 		if (skb)
6819 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6820 
6821 		/* Notifier chain MUST detach us all upper devices. */
6822 		WARN_ON(netdev_has_any_upper_dev(dev));
6823 		WARN_ON(netdev_has_any_lower_dev(dev));
6824 
6825 		/* Remove entries from kobject tree */
6826 		netdev_unregister_kobject(dev);
6827 #ifdef CONFIG_XPS
6828 		/* Remove XPS queueing entries */
6829 		netif_reset_xps_queues_gt(dev, 0);
6830 #endif
6831 	}
6832 
6833 	synchronize_net();
6834 
6835 	list_for_each_entry(dev, head, unreg_list)
6836 		dev_put(dev);
6837 }
6838 
6839 static void rollback_registered(struct net_device *dev)
6840 {
6841 	LIST_HEAD(single);
6842 
6843 	list_add(&dev->unreg_list, &single);
6844 	rollback_registered_many(&single);
6845 	list_del(&single);
6846 }
6847 
6848 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6849 	struct net_device *upper, netdev_features_t features)
6850 {
6851 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6852 	netdev_features_t feature;
6853 	int feature_bit;
6854 
6855 	for_each_netdev_feature(&upper_disables, feature_bit) {
6856 		feature = __NETIF_F_BIT(feature_bit);
6857 		if (!(upper->wanted_features & feature)
6858 		    && (features & feature)) {
6859 			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6860 				   &feature, upper->name);
6861 			features &= ~feature;
6862 		}
6863 	}
6864 
6865 	return features;
6866 }
6867 
6868 static void netdev_sync_lower_features(struct net_device *upper,
6869 	struct net_device *lower, netdev_features_t features)
6870 {
6871 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6872 	netdev_features_t feature;
6873 	int feature_bit;
6874 
6875 	for_each_netdev_feature(&upper_disables, feature_bit) {
6876 		feature = __NETIF_F_BIT(feature_bit);
6877 		if (!(features & feature) && (lower->features & feature)) {
6878 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6879 				   &feature, lower->name);
6880 			lower->wanted_features &= ~feature;
6881 			netdev_update_features(lower);
6882 
6883 			if (unlikely(lower->features & feature))
6884 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6885 					    &feature, lower->name);
6886 		}
6887 	}
6888 }
6889 
6890 static netdev_features_t netdev_fix_features(struct net_device *dev,
6891 	netdev_features_t features)
6892 {
6893 	/* Fix illegal checksum combinations */
6894 	if ((features & NETIF_F_HW_CSUM) &&
6895 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6896 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6897 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6898 	}
6899 
6900 	/* TSO requires that SG is present as well. */
6901 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6902 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6903 		features &= ~NETIF_F_ALL_TSO;
6904 	}
6905 
6906 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6907 					!(features & NETIF_F_IP_CSUM)) {
6908 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6909 		features &= ~NETIF_F_TSO;
6910 		features &= ~NETIF_F_TSO_ECN;
6911 	}
6912 
6913 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6914 					 !(features & NETIF_F_IPV6_CSUM)) {
6915 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6916 		features &= ~NETIF_F_TSO6;
6917 	}
6918 
6919 	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6920 	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6921 		features &= ~NETIF_F_TSO_MANGLEID;
6922 
6923 	/* TSO ECN requires that TSO is present as well. */
6924 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6925 		features &= ~NETIF_F_TSO_ECN;
6926 
6927 	/* Software GSO depends on SG. */
6928 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6929 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6930 		features &= ~NETIF_F_GSO;
6931 	}
6932 
6933 	/* UFO needs SG and checksumming */
6934 	if (features & NETIF_F_UFO) {
6935 		/* maybe split UFO into V4 and V6? */
6936 		if (!(features & NETIF_F_HW_CSUM) &&
6937 		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6938 		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6939 			netdev_dbg(dev,
6940 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6941 			features &= ~NETIF_F_UFO;
6942 		}
6943 
6944 		if (!(features & NETIF_F_SG)) {
6945 			netdev_dbg(dev,
6946 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6947 			features &= ~NETIF_F_UFO;
6948 		}
6949 	}
6950 
6951 	/* GSO partial features require GSO partial be set */
6952 	if ((features & dev->gso_partial_features) &&
6953 	    !(features & NETIF_F_GSO_PARTIAL)) {
6954 		netdev_dbg(dev,
6955 			   "Dropping partially supported GSO features since no GSO partial.\n");
6956 		features &= ~dev->gso_partial_features;
6957 	}
6958 
6959 #ifdef CONFIG_NET_RX_BUSY_POLL
6960 	if (dev->netdev_ops->ndo_busy_poll)
6961 		features |= NETIF_F_BUSY_POLL;
6962 	else
6963 #endif
6964 		features &= ~NETIF_F_BUSY_POLL;
6965 
6966 	return features;
6967 }
6968 
6969 int __netdev_update_features(struct net_device *dev)
6970 {
6971 	struct net_device *upper, *lower;
6972 	netdev_features_t features;
6973 	struct list_head *iter;
6974 	int err = -1;
6975 
6976 	ASSERT_RTNL();
6977 
6978 	features = netdev_get_wanted_features(dev);
6979 
6980 	if (dev->netdev_ops->ndo_fix_features)
6981 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6982 
6983 	/* driver might be less strict about feature dependencies */
6984 	features = netdev_fix_features(dev, features);
6985 
6986 	/* some features can't be enabled if they're off on an upper device */
6987 	netdev_for_each_upper_dev_rcu(dev, upper, iter)
6988 		features = netdev_sync_upper_features(dev, upper, features);
6989 
6990 	if (dev->features == features)
6991 		goto sync_lower;
6992 
6993 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6994 		&dev->features, &features);
6995 
6996 	if (dev->netdev_ops->ndo_set_features)
6997 		err = dev->netdev_ops->ndo_set_features(dev, features);
6998 	else
6999 		err = 0;
7000 
7001 	if (unlikely(err < 0)) {
7002 		netdev_err(dev,
7003 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7004 			err, &features, &dev->features);
7005 		/* return non-0 since some features might have changed and
7006 		 * it's better to fire a spurious notification than miss it
7007 		 */
7008 		return -1;
7009 	}
7010 
7011 sync_lower:
7012 	/* some features must be disabled on lower devices when disabled
7013 	 * on an upper device (think: bonding master or bridge)
7014 	 */
7015 	netdev_for_each_lower_dev(dev, lower, iter)
7016 		netdev_sync_lower_features(dev, lower, features);
7017 
7018 	if (!err)
7019 		dev->features = features;
7020 
7021 	return err < 0 ? 0 : 1;
7022 }
7023 
7024 /**
7025  *	netdev_update_features - recalculate device features
7026  *	@dev: the device to check
7027  *
7028  *	Recalculate dev->features set and send notifications if it
7029  *	has changed. Should be called after driver or hardware dependent
7030  *	conditions might have changed that influence the features.
7031  */
7032 void netdev_update_features(struct net_device *dev)
7033 {
7034 	if (__netdev_update_features(dev))
7035 		netdev_features_change(dev);
7036 }
7037 EXPORT_SYMBOL(netdev_update_features);
7038 
7039 /**
7040  *	netdev_change_features - recalculate device features
7041  *	@dev: the device to check
7042  *
7043  *	Recalculate dev->features set and send notifications even
7044  *	if they have not changed. Should be called instead of
7045  *	netdev_update_features() if also dev->vlan_features might
7046  *	have changed to allow the changes to be propagated to stacked
7047  *	VLAN devices.
7048  */
7049 void netdev_change_features(struct net_device *dev)
7050 {
7051 	__netdev_update_features(dev);
7052 	netdev_features_change(dev);
7053 }
7054 EXPORT_SYMBOL(netdev_change_features);
7055 
7056 /**
7057  *	netif_stacked_transfer_operstate -	transfer operstate
7058  *	@rootdev: the root or lower level device to transfer state from
7059  *	@dev: the device to transfer operstate to
7060  *
7061  *	Transfer operational state from root to device. This is normally
7062  *	called when a stacking relationship exists between the root
7063  *	device and the device (a leaf device).
7064  */
7065 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7066 					struct net_device *dev)
7067 {
7068 	if (rootdev->operstate == IF_OPER_DORMANT)
7069 		netif_dormant_on(dev);
7070 	else
7071 		netif_dormant_off(dev);
7072 
7073 	if (netif_carrier_ok(rootdev)) {
7074 		if (!netif_carrier_ok(dev))
7075 			netif_carrier_on(dev);
7076 	} else {
7077 		if (netif_carrier_ok(dev))
7078 			netif_carrier_off(dev);
7079 	}
7080 }
7081 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7082 
7083 #ifdef CONFIG_SYSFS
7084 static int netif_alloc_rx_queues(struct net_device *dev)
7085 {
7086 	unsigned int i, count = dev->num_rx_queues;
7087 	struct netdev_rx_queue *rx;
7088 	size_t sz = count * sizeof(*rx);
7089 
7090 	BUG_ON(count < 1);
7091 
7092 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7093 	if (!rx) {
7094 		rx = vzalloc(sz);
7095 		if (!rx)
7096 			return -ENOMEM;
7097 	}
7098 	dev->_rx = rx;
7099 
7100 	for (i = 0; i < count; i++)
7101 		rx[i].dev = dev;
7102 	return 0;
7103 }
7104 #endif
7105 
7106 static void netdev_init_one_queue(struct net_device *dev,
7107 				  struct netdev_queue *queue, void *_unused)
7108 {
7109 	/* Initialize queue lock */
7110 	spin_lock_init(&queue->_xmit_lock);
7111 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7112 	queue->xmit_lock_owner = -1;
7113 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7114 	queue->dev = dev;
7115 #ifdef CONFIG_BQL
7116 	dql_init(&queue->dql, HZ);
7117 #endif
7118 }
7119 
7120 static void netif_free_tx_queues(struct net_device *dev)
7121 {
7122 	kvfree(dev->_tx);
7123 }
7124 
7125 static int netif_alloc_netdev_queues(struct net_device *dev)
7126 {
7127 	unsigned int count = dev->num_tx_queues;
7128 	struct netdev_queue *tx;
7129 	size_t sz = count * sizeof(*tx);
7130 
7131 	if (count < 1 || count > 0xffff)
7132 		return -EINVAL;
7133 
7134 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7135 	if (!tx) {
7136 		tx = vzalloc(sz);
7137 		if (!tx)
7138 			return -ENOMEM;
7139 	}
7140 	dev->_tx = tx;
7141 
7142 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7143 	spin_lock_init(&dev->tx_global_lock);
7144 
7145 	return 0;
7146 }
7147 
7148 void netif_tx_stop_all_queues(struct net_device *dev)
7149 {
7150 	unsigned int i;
7151 
7152 	for (i = 0; i < dev->num_tx_queues; i++) {
7153 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7154 		netif_tx_stop_queue(txq);
7155 	}
7156 }
7157 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7158 
7159 /**
7160  *	register_netdevice	- register a network device
7161  *	@dev: device to register
7162  *
7163  *	Take a completed network device structure and add it to the kernel
7164  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7165  *	chain. 0 is returned on success. A negative errno code is returned
7166  *	on a failure to set up the device, or if the name is a duplicate.
7167  *
7168  *	Callers must hold the rtnl semaphore. You may want
7169  *	register_netdev() instead of this.
7170  *
7171  *	BUGS:
7172  *	The locking appears insufficient to guarantee two parallel registers
7173  *	will not get the same name.
7174  */
7175 
7176 int register_netdevice(struct net_device *dev)
7177 {
7178 	int ret;
7179 	struct net *net = dev_net(dev);
7180 
7181 	BUG_ON(dev_boot_phase);
7182 	ASSERT_RTNL();
7183 
7184 	might_sleep();
7185 
7186 	/* When net_device structures are persistent, this will be fatal. */
7187 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7188 	BUG_ON(!net);
7189 
7190 	spin_lock_init(&dev->addr_list_lock);
7191 	netdev_set_addr_lockdep_class(dev);
7192 
7193 	ret = dev_get_valid_name(net, dev, dev->name);
7194 	if (ret < 0)
7195 		goto out;
7196 
7197 	/* Init, if this function is available */
7198 	if (dev->netdev_ops->ndo_init) {
7199 		ret = dev->netdev_ops->ndo_init(dev);
7200 		if (ret) {
7201 			if (ret > 0)
7202 				ret = -EIO;
7203 			goto out;
7204 		}
7205 	}
7206 
7207 	if (((dev->hw_features | dev->features) &
7208 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7209 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7210 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7211 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7212 		ret = -EINVAL;
7213 		goto err_uninit;
7214 	}
7215 
7216 	ret = -EBUSY;
7217 	if (!dev->ifindex)
7218 		dev->ifindex = dev_new_index(net);
7219 	else if (__dev_get_by_index(net, dev->ifindex))
7220 		goto err_uninit;
7221 
7222 	/* Transfer changeable features to wanted_features and enable
7223 	 * software offloads (GSO and GRO).
7224 	 */
7225 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7226 	dev->features |= NETIF_F_SOFT_FEATURES;
7227 	dev->wanted_features = dev->features & dev->hw_features;
7228 
7229 	if (!(dev->flags & IFF_LOOPBACK))
7230 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7231 
7232 	/* If IPv4 TCP segmentation offload is supported we should also
7233 	 * allow the device to enable segmenting the frame with the option
7234 	 * of ignoring a static IP ID value.  This doesn't enable the
7235 	 * feature itself but allows the user to enable it later.
7236 	 */
7237 	if (dev->hw_features & NETIF_F_TSO)
7238 		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7239 	if (dev->vlan_features & NETIF_F_TSO)
7240 		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7241 	if (dev->mpls_features & NETIF_F_TSO)
7242 		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7243 	if (dev->hw_enc_features & NETIF_F_TSO)
7244 		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7245 
7246 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7247 	 */
7248 	dev->vlan_features |= NETIF_F_HIGHDMA;
7249 
7250 	/* Make NETIF_F_SG inheritable to tunnel devices.
7251 	 */
7252 	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7253 
7254 	/* Make NETIF_F_SG inheritable to MPLS.
7255 	 */
7256 	dev->mpls_features |= NETIF_F_SG;
7257 
7258 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7259 	ret = notifier_to_errno(ret);
7260 	if (ret)
7261 		goto err_uninit;
7262 
7263 	ret = netdev_register_kobject(dev);
7264 	if (ret)
7265 		goto err_uninit;
7266 	dev->reg_state = NETREG_REGISTERED;
7267 
7268 	__netdev_update_features(dev);
7269 
7270 	/*
7271 	 *	Default initial state at registration is that the
7272 	 *	device is present.
7273 	 */
7274 
7275 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7276 
7277 	linkwatch_init_dev(dev);
7278 
7279 	dev_init_scheduler(dev);
7280 	dev_hold(dev);
7281 	list_netdevice(dev);
7282 	add_device_randomness(dev->dev_addr, dev->addr_len);
7283 
7284 	/* If the device has a permanent device address, the driver should
7285 	 * set dev_addr and also addr_assign_type should be set to
7286 	 * NET_ADDR_PERM (default value).
7287 	 */
7288 	if (dev->addr_assign_type == NET_ADDR_PERM)
7289 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7290 
7291 	/* Notify protocols, that a new device appeared. */
7292 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7293 	ret = notifier_to_errno(ret);
7294 	if (ret) {
7295 		rollback_registered(dev);
7296 		dev->reg_state = NETREG_UNREGISTERED;
7297 	}
7298 	/*
7299 	 *	Prevent userspace races by waiting until the network
7300 	 *	device is fully set up before sending notifications.
7301 	 */
7302 	if (!dev->rtnl_link_ops ||
7303 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7304 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7305 
7306 out:
7307 	return ret;
7308 
7309 err_uninit:
7310 	if (dev->netdev_ops->ndo_uninit)
7311 		dev->netdev_ops->ndo_uninit(dev);
7312 	goto out;
7313 }
7314 EXPORT_SYMBOL(register_netdevice);
7315 
7316 /**
7317  *	init_dummy_netdev	- init a dummy network device for NAPI
7318  *	@dev: device to init
7319  *
7320  *	This takes a network device structure and initializes the minimum
7321  *	number of fields so it can be used to schedule NAPI polls without
7322  *	registering a full-blown interface. This is to be used by drivers
7323  *	that need to tie several hardware interfaces to a single NAPI
7324  *	poll scheduler due to HW limitations.
7325  */
7326 int init_dummy_netdev(struct net_device *dev)
7327 {
7328 	/* Clear everything. Note we don't initialize spinlocks
7329 	 * as they aren't supposed to be taken by any of the
7330 	 * NAPI code and this dummy netdev is supposed to be
7331 	 * only ever used for NAPI polls
7332 	 */
7333 	memset(dev, 0, sizeof(struct net_device));
7334 
7335 	/* make sure we BUG if trying to hit the standard
7336 	 * register/unregister code path
7337 	 */
7338 	dev->reg_state = NETREG_DUMMY;
7339 
7340 	/* NAPI wants this */
7341 	INIT_LIST_HEAD(&dev->napi_list);
7342 
7343 	/* a dummy interface is started by default */
7344 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7345 	set_bit(__LINK_STATE_START, &dev->state);
7346 
7347 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
7348 	 * because users of this 'device' don't need to change
7349 	 * its refcount.
7350 	 */
7351 
7352 	return 0;
7353 }
7354 EXPORT_SYMBOL_GPL(init_dummy_netdev);
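
/* Illustrative sketch (example only): a driver multiplexing several hardware
 * queues onto one NAPI context typically embeds a dummy netdev in its
 * private data; "priv" and "my_poll" are hypothetical.
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 */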
7355 
7356 
7357 /**
7358  *	register_netdev	- register a network device
7359  *	@dev: device to register
7360  *
7361  *	Take a completed network device structure and add it to the kernel
7362  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7363  *	chain. 0 is returned on success. A negative errno code is returned
7364  *	on a failure to set up the device, or if the name is a duplicate.
7365  *
7366  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
7367  *	and expands the device name if you passed a format string to
7368  *	alloc_netdev.
7369  */
7370 int register_netdev(struct net_device *dev)
7371 {
7372 	int err;
7373 
7374 	rtnl_lock();
7375 	err = register_netdevice(dev);
7376 	rtnl_unlock();
7377 	return err;
7378 }
7379 EXPORT_SYMBOL(register_netdev);
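
/* Illustrative sketch (example only) of the usual driver pattern around this
 * wrapper; "struct my_priv" and "my_netdev_ops" are hypothetical.
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);
 */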
7380 
7381 int netdev_refcnt_read(const struct net_device *dev)
7382 {
7383 	int i, refcnt = 0;
7384 
7385 	for_each_possible_cpu(i)
7386 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7387 	return refcnt;
7388 }
7389 EXPORT_SYMBOL(netdev_refcnt_read);
7390 
7391 /**
7392  * netdev_wait_allrefs - wait until all references are gone.
7393  * @dev: target net_device
7394  *
7395  * This is called when unregistering network devices.
7396  *
7397  * Any protocol or device that holds a reference should register
7398  * for netdevice notification, and cleanup and put back the
7399  * reference if they receive an UNREGISTER event.
7400  * We can get stuck here if buggy protocols don't correctly
7401  * call dev_put.
7402  */
7403 static void netdev_wait_allrefs(struct net_device *dev)
7404 {
7405 	unsigned long rebroadcast_time, warning_time;
7406 	int refcnt;
7407 
7408 	linkwatch_forget_dev(dev);
7409 
7410 	rebroadcast_time = warning_time = jiffies;
7411 	refcnt = netdev_refcnt_read(dev);
7412 
7413 	while (refcnt != 0) {
7414 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7415 			rtnl_lock();
7416 
7417 			/* Rebroadcast unregister notification */
7418 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7419 
7420 			__rtnl_unlock();
7421 			rcu_barrier();
7422 			rtnl_lock();
7423 
7424 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7425 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7426 				     &dev->state)) {
7427 				/* We must not have linkwatch events
7428 				 * pending on unregister. If this
7429 				 * happens, we simply run the queue
7430 				 * unscheduled, resulting in a noop
7431 				 * for this device.
7432 				 */
7433 				linkwatch_run_queue();
7434 			}
7435 
7436 			__rtnl_unlock();
7437 
7438 			rebroadcast_time = jiffies;
7439 		}
7440 
7441 		msleep(250);
7442 
7443 		refcnt = netdev_refcnt_read(dev);
7444 
7445 		if (time_after(jiffies, warning_time + 10 * HZ)) {
7446 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7447 				 dev->name, refcnt);
7448 			warning_time = jiffies;
7449 		}
7450 	}
7451 }
7452 
7453 /* The sequence is:
7454  *
7455  *	rtnl_lock();
7456  *	...
7457  *	register_netdevice(x1);
7458  *	register_netdevice(x2);
7459  *	...
7460  *	unregister_netdevice(y1);
7461  *	unregister_netdevice(y2);
7462  *      ...
7463  *	rtnl_unlock();
7464  *	free_netdev(y1);
7465  *	free_netdev(y2);
7466  *
7467  * We are invoked by rtnl_unlock().
7468  * This allows us to deal with problems:
7469  * 1) We can delete sysfs objects which invoke hotplug
7470  *    without deadlocking with linkwatch via keventd.
7471  * 2) Since we run with the RTNL semaphore not held, we can sleep
7472  *    safely in order to wait for the netdev refcnt to drop to zero.
7473  *
7474  * We must not return until all unregister events added during
7475  * the interval the lock was held have been completed.
7476  */
7477 void netdev_run_todo(void)
7478 {
7479 	struct list_head list;
7480 
7481 	/* Snapshot list, allow later requests */
7482 	list_replace_init(&net_todo_list, &list);
7483 
7484 	__rtnl_unlock();
7485 
7486 
7487 	/* Wait for rcu callbacks to finish before next phase */
7488 	if (!list_empty(&list))
7489 		rcu_barrier();
7490 
7491 	while (!list_empty(&list)) {
7492 		struct net_device *dev
7493 			= list_first_entry(&list, struct net_device, todo_list);
7494 		list_del(&dev->todo_list);
7495 
7496 		rtnl_lock();
7497 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7498 		__rtnl_unlock();
7499 
7500 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7501 			pr_err("network todo '%s' but state %d\n",
7502 			       dev->name, dev->reg_state);
7503 			dump_stack();
7504 			continue;
7505 		}
7506 
7507 		dev->reg_state = NETREG_UNREGISTERED;
7508 
7509 		netdev_wait_allrefs(dev);
7510 
7511 		/* paranoia */
7512 		BUG_ON(netdev_refcnt_read(dev));
7513 		BUG_ON(!list_empty(&dev->ptype_all));
7514 		BUG_ON(!list_empty(&dev->ptype_specific));
7515 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7516 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7517 		WARN_ON(dev->dn_ptr);
7518 
7519 		if (dev->destructor)
7520 			dev->destructor(dev);
7521 
7522 		/* Report a network device has been unregistered */
7523 		rtnl_lock();
7524 		dev_net(dev)->dev_unreg_count--;
7525 		__rtnl_unlock();
7526 		wake_up(&netdev_unregistering_wq);
7527 
7528 		/* Free network device */
7529 		kobject_put(&dev->dev.kobj);
7530 	}
7531 }
7532 
7533 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7534  * all the same fields in the same order as net_device_stats, with only
7535  * the type differing, but rtnl_link_stats64 may have additional fields
7536  * at the end for newer counters.
7537  */
7538 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7539 			     const struct net_device_stats *netdev_stats)
7540 {
7541 #if BITS_PER_LONG == 64
7542 	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7543 	memcpy(stats64, netdev_stats, sizeof(*stats64));
7544 	/* zero out counters that only exist in rtnl_link_stats64 */
7545 	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7546 	       sizeof(*stats64) - sizeof(*netdev_stats));
7547 #else
7548 	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7549 	const unsigned long *src = (const unsigned long *)netdev_stats;
7550 	u64 *dst = (u64 *)stats64;
7551 
7552 	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7553 	for (i = 0; i < n; i++)
7554 		dst[i] = src[i];
7555 	/* zero out counters that only exist in rtnl_link_stats64 */
7556 	memset((char *)stats64 + n * sizeof(u64), 0,
7557 	       sizeof(*stats64) - n * sizeof(u64));
7558 #endif
7559 }
7560 EXPORT_SYMBOL(netdev_stats_to_stats64);
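
/*
 * Editor's sketch (not part of the original source): a driver that keeps
 * most counters in the legacy dev->stats block can seed its 64-bit reply
 * with netdev_stats_to_stats64() and then add the counters it maintains
 * elsewhere.  "my_stats_priv" and "tx_drops" are hypothetical; the hook
 * signature matches the ndo_get_stats64 call in dev_get_stats() below.
 */
struct my_stats_priv {
	u64 tx_drops;
};

static void my_get_stats64(struct net_device *dev,
			   struct rtnl_link_stats64 *storage)
{
	struct my_stats_priv *priv = netdev_priv(dev);

	/* copy the unsigned long counters, zero the 64-bit-only fields */
	netdev_stats_to_stats64(storage, &dev->stats);
	storage->tx_dropped += priv->tx_drops;
}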
7561 
7562 /**
7563  *	dev_get_stats	- get network device statistics
7564  *	@dev: device to get statistics from
7565  *	@storage: place to store stats
7566  *
7567  *	Get network statistics from device. Return @storage.
7568  *	The device driver may provide its own method by setting
7569  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7570  *	otherwise the internal statistics structure is used.
7571  */
7572 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7573 					struct rtnl_link_stats64 *storage)
7574 {
7575 	const struct net_device_ops *ops = dev->netdev_ops;
7576 
7577 	if (ops->ndo_get_stats64) {
7578 		memset(storage, 0, sizeof(*storage));
7579 		ops->ndo_get_stats64(dev, storage);
7580 	} else if (ops->ndo_get_stats) {
7581 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7582 	} else {
7583 		netdev_stats_to_stats64(storage, &dev->stats);
7584 	}
7585 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7586 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7587 	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7588 	return storage;
7589 }
7590 EXPORT_SYMBOL(dev_get_stats);
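
/*
 * Editor's sketch (not part of the original source): callers of
 * dev_get_stats() normally pass a stack rtnl_link_stats64 and use the
 * returned pointer; keeping @dev alive for the duration (via RTNL or a
 * reference) is the caller's job.  "my_rx_packets" is hypothetical.
 */
static u64 my_rx_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

	return stats->rx_packets;
}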
7591 
7592 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7593 {
7594 	struct netdev_queue *queue = dev_ingress_queue(dev);
7595 
7596 #ifdef CONFIG_NET_CLS_ACT
7597 	if (queue)
7598 		return queue;
7599 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7600 	if (!queue)
7601 		return NULL;
7602 	netdev_init_one_queue(dev, queue, NULL);
7603 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7604 	queue->qdisc_sleeping = &noop_qdisc;
7605 	rcu_assign_pointer(dev->ingress_queue, queue);
7606 #endif
7607 	return queue;
7608 }
7609 
7610 static const struct ethtool_ops default_ethtool_ops;
7611 
7612 void netdev_set_default_ethtool_ops(struct net_device *dev,
7613 				    const struct ethtool_ops *ops)
7614 {
7615 	if (dev->ethtool_ops == &default_ethtool_ops)
7616 		dev->ethtool_ops = ops;
7617 }
7618 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7619 
7620 void netdev_freemem(struct net_device *dev)
7621 {
7622 	char *addr = (char *)dev - dev->padded;
7623 
7624 	kvfree(addr);
7625 }
7626 
7627 /**
7628  *	alloc_netdev_mqs - allocate network device
7629  *	@sizeof_priv:		size of private data to allocate space for
7630  *	@name:			device name format string
7631  *	@name_assign_type: 	origin of device name
7632  *	@setup:			callback to initialize device
7633  *	@txqs:			the number of TX subqueues to allocate
7634  *	@rxqs:			the number of RX subqueues to allocate
7635  *
7636  *	Allocates a struct net_device with private data area for driver use
7637  *	and performs basic initialization.  Also allocates subqueue structs
7638  *	for each queue on the device.
7639  */
7640 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7641 		unsigned char name_assign_type,
7642 		void (*setup)(struct net_device *),
7643 		unsigned int txqs, unsigned int rxqs)
7644 {
7645 	struct net_device *dev;
7646 	size_t alloc_size;
7647 	struct net_device *p;
7648 
7649 	BUG_ON(strlen(name) >= sizeof(dev->name));
7650 
7651 	if (txqs < 1) {
7652 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7653 		return NULL;
7654 	}
7655 
7656 #ifdef CONFIG_SYSFS
7657 	if (rxqs < 1) {
7658 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7659 		return NULL;
7660 	}
7661 #endif
7662 
7663 	alloc_size = sizeof(struct net_device);
7664 	if (sizeof_priv) {
7665 		/* ensure 32-byte alignment of private area */
7666 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7667 		alloc_size += sizeof_priv;
7668 	}
7669 	/* ensure 32-byte alignment of whole construct */
7670 	alloc_size += NETDEV_ALIGN - 1;
7671 
7672 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7673 	if (!p)
7674 		p = vzalloc(alloc_size);
7675 	if (!p)
7676 		return NULL;
7677 
7678 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7679 	dev->padded = (char *)dev - (char *)p;
7680 
7681 	dev->pcpu_refcnt = alloc_percpu(int);
7682 	if (!dev->pcpu_refcnt)
7683 		goto free_dev;
7684 
7685 	if (dev_addr_init(dev))
7686 		goto free_pcpu;
7687 
7688 	dev_mc_init(dev);
7689 	dev_uc_init(dev);
7690 
7691 	dev_net_set(dev, &init_net);
7692 
7693 	dev->gso_max_size = GSO_MAX_SIZE;
7694 	dev->gso_max_segs = GSO_MAX_SEGS;
7695 
7696 	INIT_LIST_HEAD(&dev->napi_list);
7697 	INIT_LIST_HEAD(&dev->unreg_list);
7698 	INIT_LIST_HEAD(&dev->close_list);
7699 	INIT_LIST_HEAD(&dev->link_watch_list);
7700 	INIT_LIST_HEAD(&dev->adj_list.upper);
7701 	INIT_LIST_HEAD(&dev->adj_list.lower);
7702 	INIT_LIST_HEAD(&dev->ptype_all);
7703 	INIT_LIST_HEAD(&dev->ptype_specific);
7704 #ifdef CONFIG_NET_SCHED
7705 	hash_init(dev->qdisc_hash);
7706 #endif
7707 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7708 	setup(dev);
7709 
7710 	if (!dev->tx_queue_len) {
7711 		dev->priv_flags |= IFF_NO_QUEUE;
7712 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7713 	}
7714 
7715 	dev->num_tx_queues = txqs;
7716 	dev->real_num_tx_queues = txqs;
7717 	if (netif_alloc_netdev_queues(dev))
7718 		goto free_all;
7719 
7720 #ifdef CONFIG_SYSFS
7721 	dev->num_rx_queues = rxqs;
7722 	dev->real_num_rx_queues = rxqs;
7723 	if (netif_alloc_rx_queues(dev))
7724 		goto free_all;
7725 #endif
7726 
7727 	strcpy(dev->name, name);
7728 	dev->name_assign_type = name_assign_type;
7729 	dev->group = INIT_NETDEV_GROUP;
7730 	if (!dev->ethtool_ops)
7731 		dev->ethtool_ops = &default_ethtool_ops;
7732 
7733 	nf_hook_ingress_init(dev);
7734 
7735 	return dev;
7736 
7737 free_all:
7738 	free_netdev(dev);
7739 	return NULL;
7740 
7741 free_pcpu:
7742 	free_percpu(dev->pcpu_refcnt);
7743 free_dev:
7744 	netdev_freemem(dev);
7745 	return NULL;
7746 }
7747 EXPORT_SYMBOL(alloc_netdev_mqs);
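
/*
 * Editor's sketch (not part of the original source): typical driver-side
 * use of alloc_netdev_mqs().  The private area requested via @sizeof_priv
 * is reached with netdev_priv(); on register_netdev() failure the usual
 * cleanup is free_netdev().  "my_alloc_priv"/"my_create_dev" are
 * hypothetical; ether_setup(), NET_NAME_UNKNOWN, register_netdev() and
 * free_netdev() are the real interfaces.
 */
struct my_alloc_priv {
	int id;
};

static struct net_device *my_create_dev(void)
{
	struct net_device *dev;
	struct my_alloc_priv *priv;

	/* 4 TX and 4 RX queues, name chosen at register time from "my%d" */
	dev = alloc_netdev_mqs(sizeof(*priv), "my%d", NET_NAME_UNKNOWN,
			       ether_setup, 4, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	priv->id = 0;

	if (register_netdev(dev)) {
		free_netdev(dev);	/* never registered, plain free is fine */
		return NULL;
	}
	return dev;
}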
7748 
7749 /**
7750  *	free_netdev - free network device
7751  *	@dev: device
7752  *
7753  *	This function does the last stage of destroying an allocated device
7754  * 	interface. The reference to the device object is released.
7755  *	If this is the last reference then it will be freed.
7756  *	Must be called in process context.
7757  */
7758 void free_netdev(struct net_device *dev)
7759 {
7760 	struct napi_struct *p, *n;
7761 
7762 	might_sleep();
7763 	netif_free_tx_queues(dev);
7764 #ifdef CONFIG_SYSFS
7765 	kvfree(dev->_rx);
7766 #endif
7767 
7768 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7769 
7770 	/* Flush device addresses */
7771 	dev_addr_flush(dev);
7772 
7773 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7774 		netif_napi_del(p);
7775 
7776 	free_percpu(dev->pcpu_refcnt);
7777 	dev->pcpu_refcnt = NULL;
7778 
7779 	/*  Compatibility with error handling in drivers */
7780 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7781 		netdev_freemem(dev);
7782 		return;
7783 	}
7784 
7785 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7786 	dev->reg_state = NETREG_RELEASED;
7787 
7788 	/* will free via device release */
7789 	put_device(&dev->dev);
7790 }
7791 EXPORT_SYMBOL(free_netdev);
7792 
7793 /**
7794  *	synchronize_net -  Synchronize with packet receive processing
7795  *
7796  *	Wait for packets currently being received to be done.
7797  *	Does not block later packets from starting.
7798  */
7799 void synchronize_net(void)
7800 {
7801 	might_sleep();
7802 	if (rtnl_is_locked())
7803 		synchronize_rcu_expedited();
7804 	else
7805 		synchronize_rcu();
7806 }
7807 EXPORT_SYMBOL(synchronize_net);
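
/*
 * Editor's sketch (not part of the original source): the classic use of
 * synchronize_net() is to wait out in-flight receive processing after
 * unpublishing something the RX path may still be dereferencing, before
 * freeing it (compare dev_remove_pack()).  "struct my_handler" and
 * "my_unpublish" are hypothetical.
 */
struct my_handler {
	void (*rx)(struct sk_buff *skb);
};

static void my_unpublish(struct my_handler __rcu **slot)
{
	struct my_handler *old = rcu_dereference_protected(*slot, 1);

	RCU_INIT_POINTER(*slot, NULL);	/* new readers no longer see it */
	synchronize_net();		/* wait for readers already in flight */
	kfree(old);
}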
7808 
7809 /**
7810  *	unregister_netdevice_queue - remove device from the kernel
7811  *	@dev: device
7812  *	@head: list
7813  *
7814  *	This function shuts down a device interface and removes it
7815  *	from the kernel tables.
7816  *	If head is not NULL, the device is queued to be unregistered later.
7817  *
7818  *	Callers must hold the rtnl semaphore.  You may want
7819  *	unregister_netdev() instead of this.
7820  */
7821 
7822 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7823 {
7824 	ASSERT_RTNL();
7825 
7826 	if (head) {
7827 		list_move_tail(&dev->unreg_list, head);
7828 	} else {
7829 		rollback_registered(dev);
7830 		/* Finish processing unregister after unlock */
7831 		net_set_todo(dev);
7832 	}
7833 }
7834 EXPORT_SYMBOL(unregister_netdevice_queue);
7835 
7836 /**
7837  *	unregister_netdevice_many - unregister many devices
7838  *	@head: list of devices
7839  *
7840  *  Note: As most callers use a stack-allocated list_head,
7841  *  we force a list_del() to make sure the stack won't be corrupted later.
7842  */
7843 void unregister_netdevice_many(struct list_head *head)
7844 {
7845 	struct net_device *dev;
7846 
7847 	if (!list_empty(head)) {
7848 		rollback_registered_many(head);
7849 		list_for_each_entry(dev, head, unreg_list)
7850 			net_set_todo(dev);
7851 		list_del(head);
7852 	}
7853 }
7854 EXPORT_SYMBOL(unregister_netdevice_many);
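
/*
 * Editor's sketch (not part of the original source): batching the teardown
 * of several devices under one RTNL acquisition.  The on-stack LIST_HEAD is
 * what the list_del() note above refers to.  Devices whose setup installed
 * dev->destructor = free_netdev are freed from the todo processing triggered
 * by rtnl_unlock(); otherwise the caller calls free_netdev() afterwards.
 * "my_destroy_all" is hypothetical.
 */
static void my_destroy_all(struct net_device **devs, int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);	/* see the list_del() note above */
	rtnl_unlock();				/* netdev_run_todo() runs from here */
}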
7855 
7856 /**
7857  *	unregister_netdev - remove device from the kernel
7858  *	@dev: device
7859  *
7860  *	This function shuts down a device interface and removes it
7861  *	from the kernel tables.
7862  *
7863  *	This is just a wrapper for unregister_netdevice that takes
7864  *	the rtnl semaphore.  In general you want to use this and not
7865  *	unregister_netdevice.
7866  */
7867 void unregister_netdev(struct net_device *dev)
7868 {
7869 	rtnl_lock();
7870 	unregister_netdevice(dev);
7871 	rtnl_unlock();
7872 }
7873 EXPORT_SYMBOL(unregister_netdev);
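
/*
 * Editor's sketch (not part of the original source): the usual teardown
 * order for a single device in a driver remove path.  unregister_netdev()
 * takes RTNL itself, so the caller must not hold it; free_netdev() comes
 * last.  "my_remove_one" is hypothetical.
 */
static void my_remove_one(struct net_device *dev)
{
	unregister_netdev(dev);		/* rtnl_lock + unregister + run todo */
	free_netdev(dev);		/* release the device object */
}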
7874 
7875 /**
7876  *	dev_change_net_namespace - move device to a different network namespace
7877  *	@dev: device
7878  *	@net: network namespace
7879  *	@pat: If not NULL name pattern to try if the current device name
7880  *	      is already taken in the destination network namespace.
7881  *
7882  *	This function shuts down a device interface and moves it
7883  *	to a new network namespace. On success 0 is returned, on
7884  *	a failure a negative errno code is returned.
7885  *
7886  *	Callers must hold the rtnl semaphore.
7887  */
7888 
7889 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7890 {
7891 	int err;
7892 
7893 	ASSERT_RTNL();
7894 
7895 	/* Don't allow namespace local devices to be moved. */
7896 	err = -EINVAL;
7897 	if (dev->features & NETIF_F_NETNS_LOCAL)
7898 		goto out;
7899 
7900 	/* Ensure the device has been registered */
7901 	if (dev->reg_state != NETREG_REGISTERED)
7902 		goto out;
7903 
7904 	/* Get out if there is nothing to do */
7905 	err = 0;
7906 	if (net_eq(dev_net(dev), net))
7907 		goto out;
7908 
7909 	/* Pick the destination device name, and ensure
7910 	 * we can use it in the destination network namespace.
7911 	 */
7912 	err = -EEXIST;
7913 	if (__dev_get_by_name(net, dev->name)) {
7914 		/* We get here if we can't use the current device name */
7915 		if (!pat)
7916 			goto out;
7917 		if (dev_get_valid_name(net, dev, pat) < 0)
7918 			goto out;
7919 	}
7920 
7921 	/*
7922 	 * And now a mini version of register_netdevice/unregister_netdevice.
7923 	 */
7924 
7925 	/* If the device is running, close it first. */
7926 	dev_close(dev);
7927 
7928 	/* And unlink it from device chain */
7929 	err = -ENODEV;
7930 	unlist_netdevice(dev);
7931 
7932 	synchronize_net();
7933 
7934 	/* Shutdown queueing discipline. */
7935 	dev_shutdown(dev);
7936 
7937 	/* Notify protocols that we are about to destroy
7938 	   this device. They should clean up all of their state.
7939 
7940 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7941 	   This is intentional: this way 8021q and macvlan know
7942 	   the device is just moving and can keep their slaves up.
7943 	*/
7944 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7945 	rcu_barrier();
7946 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7947 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7948 
7949 	/*
7950 	 *	Flush the unicast and multicast chains
7951 	 */
7952 	dev_uc_flush(dev);
7953 	dev_mc_flush(dev);
7954 
7955 	/* Send a netdev-removed uevent to the old namespace */
7956 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7957 	netdev_adjacent_del_links(dev);
7958 
7959 	/* Actually switch the network namespace */
7960 	dev_net_set(dev, net);
7961 
7962 	/* If there is an ifindex conflict assign a new one */
7963 	if (__dev_get_by_index(net, dev->ifindex))
7964 		dev->ifindex = dev_new_index(net);
7965 
7966 	/* Send a netdev-add uevent to the new namespace */
7967 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7968 	netdev_adjacent_add_links(dev);
7969 
7970 	/* Fixup kobjects */
7971 	err = device_rename(&dev->dev, dev->name);
7972 	WARN_ON(err);
7973 
7974 	/* Add the device back in the hashes */
7975 	list_netdevice(dev);
7976 
7977 	/* Notify protocols that a new device appeared. */
7978 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7979 
7980 	/*
7981 	 *	Prevent userspace races by waiting until the network
7982 	 *	device is fully setup before sending notifications.
7983 	 */
7984 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7985 
7986 	synchronize_net();
7987 	err = 0;
7988 out:
7989 	return err;
7990 }
7991 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
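
/*
 * Editor's sketch (not part of the original source): dev_change_net_namespace()
 * must be called with RTNL held, and the "%d"-style pattern is only used when
 * the current name is already taken in the target namespace.  "my_move_dev"
 * is hypothetical.
 */
static int my_move_dev(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "eth%d");
	rtnl_unlock();
	return err;
}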
7992 
7993 static int dev_cpu_dead(unsigned int oldcpu)
7994 {
7995 	struct sk_buff **list_skb;
7996 	struct sk_buff *skb;
7997 	unsigned int cpu;
7998 	struct softnet_data *sd, *oldsd;
7999 
8000 	local_irq_disable();
8001 	cpu = smp_processor_id();
8002 	sd = &per_cpu(softnet_data, cpu);
8003 	oldsd = &per_cpu(softnet_data, oldcpu);
8004 
8005 	/* Find end of our completion_queue. */
8006 	list_skb = &sd->completion_queue;
8007 	while (*list_skb)
8008 		list_skb = &(*list_skb)->next;
8009 	/* Append completion queue from offline CPU. */
8010 	*list_skb = oldsd->completion_queue;
8011 	oldsd->completion_queue = NULL;
8012 
8013 	/* Append output queue from offline CPU. */
8014 	if (oldsd->output_queue) {
8015 		*sd->output_queue_tailp = oldsd->output_queue;
8016 		sd->output_queue_tailp = oldsd->output_queue_tailp;
8017 		oldsd->output_queue = NULL;
8018 		oldsd->output_queue_tailp = &oldsd->output_queue;
8019 	}
8020 	/* Append NAPI poll list from offline CPU, with one exception:
8021 	 * process_backlog() must be called by the CPU owning the percpu backlog.
8022 	 * We properly handle process_queue & input_pkt_queue later.
8023 	 */
8024 	while (!list_empty(&oldsd->poll_list)) {
8025 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8026 							    struct napi_struct,
8027 							    poll_list);
8028 
8029 		list_del_init(&napi->poll_list);
8030 		if (napi->poll == process_backlog)
8031 			napi->state = 0;
8032 		else
8033 			____napi_schedule(sd, napi);
8034 	}
8035 
8036 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8037 	local_irq_enable();
8038 
8039 	/* Process offline CPU's input_pkt_queue */
8040 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8041 		netif_rx_ni(skb);
8042 		input_queue_head_incr(oldsd);
8043 	}
8044 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8045 		netif_rx_ni(skb);
8046 		input_queue_head_incr(oldsd);
8047 	}
8048 
8049 	return 0;
8050 }
8051 
8052 /**
8053  *	netdev_increment_features - increment feature set by one
8054  *	@all: current feature set
8055  *	@one: new feature set
8056  *	@mask: mask feature set
8057  *
8058  *	Computes a new feature set after adding a device with feature set
8059  *	@one to the master device with current feature set @all.  Will not
8060  *	enable anything that is off in @mask. Returns the new feature set.
8061  */
8062 netdev_features_t netdev_increment_features(netdev_features_t all,
8063 	netdev_features_t one, netdev_features_t mask)
8064 {
8065 	if (mask & NETIF_F_HW_CSUM)
8066 		mask |= NETIF_F_CSUM_MASK;
8067 	mask |= NETIF_F_VLAN_CHALLENGED;
8068 
8069 	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8070 	all &= one | ~NETIF_F_ALL_FOR_ALL;
8071 
8072 	/* If one device supports hw checksumming, set for all. */
8073 	if (all & NETIF_F_HW_CSUM)
8074 		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8075 
8076 	return all;
8077 }
8078 EXPORT_SYMBOL(netdev_increment_features);
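
/*
 * Editor's sketch (not part of the original source): an aggregating
 * (master) driver typically folds netdev_increment_features() over its
 * lower devices; the starting set is driver policy (bonding, for example,
 * starts from its own feature mask).  "struct my_lower" and the list are
 * hypothetical.
 */
struct my_lower {
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t my_compute_features(struct list_head *lowers,
					      netdev_features_t mask)
{
	netdev_features_t features = mask;
	struct my_lower *l;

	list_for_each_entry(l, lowers, list)
		features = netdev_increment_features(features,
						     l->dev->features, mask);
	return features;
}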
8079 
8080 static struct hlist_head * __net_init netdev_create_hash(void)
8081 {
8082 	int i;
8083 	struct hlist_head *hash;
8084 
8085 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8086 	if (hash != NULL)
8087 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8088 			INIT_HLIST_HEAD(&hash[i]);
8089 
8090 	return hash;
8091 }
8092 
8093 /* Initialize per network namespace state */
8094 static int __net_init netdev_init(struct net *net)
8095 {
8096 	if (net != &init_net)
8097 		INIT_LIST_HEAD(&net->dev_base_head);
8098 
8099 	net->dev_name_head = netdev_create_hash();
8100 	if (net->dev_name_head == NULL)
8101 		goto err_name;
8102 
8103 	net->dev_index_head = netdev_create_hash();
8104 	if (net->dev_index_head == NULL)
8105 		goto err_idx;
8106 
8107 	return 0;
8108 
8109 err_idx:
8110 	kfree(net->dev_name_head);
8111 err_name:
8112 	return -ENOMEM;
8113 }
8114 
8115 /**
8116  *	netdev_drivername - network driver for the device
8117  *	@dev: network device
8118  *
8119  *	Determine network driver for device.
8120  */
8121 const char *netdev_drivername(const struct net_device *dev)
8122 {
8123 	const struct device_driver *driver;
8124 	const struct device *parent;
8125 	const char *empty = "";
8126 
8127 	parent = dev->dev.parent;
8128 	if (!parent)
8129 		return empty;
8130 
8131 	driver = parent->driver;
8132 	if (driver && driver->name)
8133 		return driver->name;
8134 	return empty;
8135 }
8136 
8137 static void __netdev_printk(const char *level, const struct net_device *dev,
8138 			    struct va_format *vaf)
8139 {
8140 	if (dev && dev->dev.parent) {
8141 		dev_printk_emit(level[1] - '0',
8142 				dev->dev.parent,
8143 				"%s %s %s%s: %pV",
8144 				dev_driver_string(dev->dev.parent),
8145 				dev_name(dev->dev.parent),
8146 				netdev_name(dev), netdev_reg_state(dev),
8147 				vaf);
8148 	} else if (dev) {
8149 		printk("%s%s%s: %pV",
8150 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8151 	} else {
8152 		printk("%s(NULL net_device): %pV", level, vaf);
8153 	}
8154 }
8155 
8156 void netdev_printk(const char *level, const struct net_device *dev,
8157 		   const char *format, ...)
8158 {
8159 	struct va_format vaf;
8160 	va_list args;
8161 
8162 	va_start(args, format);
8163 
8164 	vaf.fmt = format;
8165 	vaf.va = &args;
8166 
8167 	__netdev_printk(level, dev, &vaf);
8168 
8169 	va_end(args);
8170 }
8171 EXPORT_SYMBOL(netdev_printk);
8172 
8173 #define define_netdev_printk_level(func, level)			\
8174 void func(const struct net_device *dev, const char *fmt, ...)	\
8175 {								\
8176 	struct va_format vaf;					\
8177 	va_list args;						\
8178 								\
8179 	va_start(args, fmt);					\
8180 								\
8181 	vaf.fmt = fmt;						\
8182 	vaf.va = &args;						\
8183 								\
8184 	__netdev_printk(level, dev, &vaf);			\
8185 								\
8186 	va_end(args);						\
8187 }								\
8188 EXPORT_SYMBOL(func);
8189 
8190 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8191 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8192 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8193 define_netdev_printk_level(netdev_err, KERN_ERR);
8194 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8195 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8196 define_netdev_printk_level(netdev_info, KERN_INFO);
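
/*
 * Editor's sketch (not part of the original source): the level-specific
 * helpers generated above are used like dev_err() and friends but prefix
 * the message with driver, bus and interface name (plus the registration
 * state while a device is being renamed or torn down).  "my_report_link"
 * is hypothetical.
 */
static void my_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}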
8197 
8198 static void __net_exit netdev_exit(struct net *net)
8199 {
8200 	kfree(net->dev_name_head);
8201 	kfree(net->dev_index_head);
8202 }
8203 
8204 static struct pernet_operations __net_initdata netdev_net_ops = {
8205 	.init = netdev_init,
8206 	.exit = netdev_exit,
8207 };
8208 
8209 static void __net_exit default_device_exit(struct net *net)
8210 {
8211 	struct net_device *dev, *aux;
8212 	/*
8213 	 * Push all migratable network devices back to the
8214 	 * initial network namespace
8215 	 */
8216 	rtnl_lock();
8217 	for_each_netdev_safe(net, dev, aux) {
8218 		int err;
8219 		char fb_name[IFNAMSIZ];
8220 
8221 		/* Ignore unmovable devices (e.g. loopback) */
8222 		if (dev->features & NETIF_F_NETNS_LOCAL)
8223 			continue;
8224 
8225 		/* Leave virtual devices for the generic cleanup */
8226 		if (dev->rtnl_link_ops)
8227 			continue;
8228 
8229 		/* Push remaining network devices to init_net */
8230 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8231 		err = dev_change_net_namespace(dev, &init_net, fb_name);
8232 		if (err) {
8233 			pr_emerg("%s: failed to move %s to init_net: %d\n",
8234 				 __func__, dev->name, err);
8235 			BUG();
8236 		}
8237 	}
8238 	rtnl_unlock();
8239 }
8240 
8241 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8242 {
8243 	/* Return with the rtnl_lock held when there are no network
8244 	 * devices unregistering in any network namespace in net_list.
8245 	 */
8246 	struct net *net;
8247 	bool unregistering;
8248 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8249 
8250 	add_wait_queue(&netdev_unregistering_wq, &wait);
8251 	for (;;) {
8252 		unregistering = false;
8253 		rtnl_lock();
8254 		list_for_each_entry(net, net_list, exit_list) {
8255 			if (net->dev_unreg_count > 0) {
8256 				unregistering = true;
8257 				break;
8258 			}
8259 		}
8260 		if (!unregistering)
8261 			break;
8262 		__rtnl_unlock();
8263 
8264 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8265 	}
8266 	remove_wait_queue(&netdev_unregistering_wq, &wait);
8267 }
8268 
8269 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8270 {
8271 	/* At exit all network devices must be removed from a network
8272 	 * namespace.  Do this in the reverse order of registration.
8273 	 * Do this across as many network namespaces as possible to
8274 	 * improve batching efficiency.
8275 	 */
8276 	struct net_device *dev;
8277 	struct net *net;
8278 	LIST_HEAD(dev_kill_list);
8279 
8280 	/* To prevent network device cleanup code from dereferencing
8281 	 * loopback devices or network devices that have been freed,
8282 	 * wait here for all pending unregistrations to complete
8283 	 * before unregistering the loopback device and allowing the
8284 	 * network namespace to be freed.
8285 	 *
8286 	 * The netdev todo list containing all network device
8287 	 * unregistrations that happen in default_device_exit_batch
8288 	 * will run in the rtnl_unlock() at the end of
8289 	 * default_device_exit_batch.
8290 	 */
8291 	rtnl_lock_unregistering(net_list);
8292 	list_for_each_entry(net, net_list, exit_list) {
8293 		for_each_netdev_reverse(net, dev) {
8294 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8295 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8296 			else
8297 				unregister_netdevice_queue(dev, &dev_kill_list);
8298 		}
8299 	}
8300 	unregister_netdevice_many(&dev_kill_list);
8301 	rtnl_unlock();
8302 }
8303 
8304 static struct pernet_operations __net_initdata default_device_ops = {
8305 	.exit = default_device_exit,
8306 	.exit_batch = default_device_exit_batch,
8307 };
8308 
8309 /*
8310  *	Initialize the DEV module. At boot time this walks the device list and
8311  *	unhooks any devices that fail to initialise (normally hardware not
8312  *	present) and leaves us with a valid list of present and active devices.
8313  *
8314  */
8315 
8316 /*
8317  *       This is called single threaded during boot, so no need
8318  *       to take the rtnl semaphore.
8319  */
8320 static int __init net_dev_init(void)
8321 {
8322 	int i, rc = -ENOMEM;
8323 
8324 	BUG_ON(!dev_boot_phase);
8325 
8326 	if (dev_proc_init())
8327 		goto out;
8328 
8329 	if (netdev_kobject_init())
8330 		goto out;
8331 
8332 	INIT_LIST_HEAD(&ptype_all);
8333 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8334 		INIT_LIST_HEAD(&ptype_base[i]);
8335 
8336 	INIT_LIST_HEAD(&offload_base);
8337 
8338 	if (register_pernet_subsys(&netdev_net_ops))
8339 		goto out;
8340 
8341 	/*
8342 	 *	Initialise the packet receive queues.
8343 	 */
8344 
8345 	for_each_possible_cpu(i) {
8346 		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8347 		struct softnet_data *sd = &per_cpu(softnet_data, i);
8348 
8349 		INIT_WORK(flush, flush_backlog);
8350 
8351 		skb_queue_head_init(&sd->input_pkt_queue);
8352 		skb_queue_head_init(&sd->process_queue);
8353 		INIT_LIST_HEAD(&sd->poll_list);
8354 		sd->output_queue_tailp = &sd->output_queue;
8355 #ifdef CONFIG_RPS
8356 		sd->csd.func = rps_trigger_softirq;
8357 		sd->csd.info = sd;
8358 		sd->cpu = i;
8359 #endif
8360 
8361 		sd->backlog.poll = process_backlog;
8362 		sd->backlog.weight = weight_p;
8363 	}
8364 
8365 	dev_boot_phase = 0;
8366 
8367 	/* The loopback device is special: if any other network device
8368 	 * is present in a network namespace, the loopback device must
8369 	 * be present too. Since we now dynamically allocate and free the
8370 	 * loopback device, ensure this invariant is maintained by
8371 	 * keeping the loopback device as the first device on the
8372 	 * list of network devices, so that the loopback device
8373 	 * is the first device that appears and the last network device
8374 	 * that disappears.
8375 	 */
8376 	if (register_pernet_device(&loopback_net_ops))
8377 		goto out;
8378 
8379 	if (register_pernet_device(&default_device_ops))
8380 		goto out;
8381 
8382 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8383 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8384 
8385 	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8386 				       NULL, dev_cpu_dead);
8387 	WARN_ON(rc < 0);
8388 	dst_subsys_init();
8389 	rc = 0;
8390 out:
8391 	return rc;
8392 }
8393 
8394 subsys_initcall(net_dev_init);
8395