1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16? Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * See, for example usages, register_netdevice() and
196  * unregister_netdevice(), which must be called with the rtnl
197  * semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
201 
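/*
 * Illustrative sketch (not part of the original file): a pure reader can
 * walk the device list under rcu_read_lock() instead of taking
 * dev_base_lock.  The helper name is an assumption for the example.
 */
static unsigned int example_count_running_devices(struct net *net)
{
	struct net_device *dev;
	unsigned int count = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (netif_running(dev))
			count++;
	rcu_read_unlock();

	return count;
}
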
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207 
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212 
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 	spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219 
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 	spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226 
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 	struct net *net = dev_net(dev);
231 
232 	ASSERT_RTNL();
233 
234 	write_lock_bh(&dev_base_lock);
235 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 	hlist_add_head_rcu(&dev->index_hlist,
238 			   dev_index_hash(net, dev->ifindex));
239 	write_unlock_bh(&dev_base_lock);
240 	return 0;
241 }
242 
243 /* Device list removal
244  * caller must respect a RCU grace period before freeing/reusing dev
245  */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 	ASSERT_RTNL();
249 
250 	/* Unlink dev from the device chain */
251 	write_lock_bh(&dev_base_lock);
252 	list_del_rcu(&dev->dev_list);
253 	hlist_del_rcu(&dev->name_hlist);
254 	hlist_del_rcu(&dev->index_hlist);
255 	write_unlock_bh(&dev_base_lock);
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 	 ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 	 "_xmit_VOID", "_xmit_NONE"};
312 
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315 
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 	int i;
319 
320 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 		if (netdev_lock_type[i] == dev_type)
322 			return i;
323 	/* the last key is used by default */
324 	return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326 
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 	int i;
331 
332 	i = netdev_lock_pos(dev_type);
333 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 				   netdev_lock_name[i]);
335 }
336 
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 	int i;
340 
341 	i = netdev_lock_pos(dev->type);
342 	lockdep_set_class_and_name(&dev->addr_list_lock,
343 				   &netdev_addr_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 						 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355 
356 /*******************************************************************************
357 
358 		Protocol management and registration routines
359 
360 *******************************************************************************/
361 
362 /*
363  *	Add a protocol ID to the list. Now that the input handler is
364  *	smarter we can dispense with all the messy stuff that used to be
365  *	here.
366  *
367  *	BEWARE!!! Protocol handlers, mangling input packets,
368  *	MUST BE last in hash buckets and checking protocol handlers
369  *	MUST start from promiscuous ptype_all chain in net_bh.
370  *	It is true now, do not change it.
371  *	Explanation follows: if protocol handler, mangling packet, will
372  *	be the first on list, it is not able to sense, that packet
373  *	is cloned and should be copied-on-write, so that it will
374  *	change it and subsequent readers will get broken packet.
375  *							--ANK (980803)
376  */
377 
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 	if (pt->type == htons(ETH_P_ALL))
381 		return &ptype_all;
382 	else
383 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep, therefore it cannot
395  *	guarantee that all CPUs in the middle of receiving packets
396  *	will see the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408 
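/*
 * Illustrative sketch (not part of the original file): registering a
 * handler for a made-up ethertype.  my_rcv(), my_ptype and the 0x88b5
 * value are assumptions for the example; a NULL .dev means "all devices".
 */
static int my_rcv(struct sk_buff *skb, struct net_device *dev,
		  struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... process the frame ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_ptype __read_mostly = {
	.type	= cpu_to_be16(0x88b5),	/* local experimental ethertype */
	.func	= my_rcv,
};

static int __init my_proto_init(void)
{
	dev_add_pack(&my_ptype);
	return 0;
}
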
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *      The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPUs have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
461 
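/*
 * Illustrative counterpart to the sketch after dev_add_pack() above: on
 * module unload the sleeping variant is used, so my_ptype may be freed or
 * reused as soon as this returns.
 */
static void __exit my_proto_exit(void)
{
	dev_remove_pack(&my_ptype);	/* includes synchronize_net() */
}
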
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds a new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine
478  *	for all netdevices.
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  * 	Check boot time settings for the device.
503  *	Any settings found are applied to the device so they can be
504  *	used later during device probing.
505  *	Returns 0 if no settings are found, 1 if they are.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  * 	Check boot time settings for the base address of the device.
533  *	The found setting is returned so it can be used
534  *	later in the device probing.
535  *	Returns 0 if no settings are found.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves at boot time configured settings for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
586 
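/*
 * Example of the boot parameter handled above (illustrative, not from the
 * original file): a command line such as
 *
 *	netdev=9,0x300,0,0,eth1
 *
 * records irq=9, base_addr=0x300, mem_start=0 and mem_end=0 for "eth1";
 * netdev_boot_setup_check() later applies these values when a device with
 * that name is registered.
 */
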
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
669 
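/*
 * Illustrative sketch (not part of the original file) of the refcounted
 * lookup pattern; the function name is an assumption.
 */
static int example_use_device(struct net *net, const char *name)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, name);
	if (!dev)
		return -ENODEV;

	/* ... use dev; the held reference keeps it from going away ... */

	dev_put(dev);
	return 0;
}
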
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns %NULL if the device
676  *	is not found or a pointer to the device. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns %NULL if the device
702  *	is not found or a pointer to the device. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns NULL if the device
728  *	is not found or a pointer to the device. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745 
746 /**
747  *	dev_getbyhwaddr_rcu - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns NULL if the device
753  *	is not found or a pointer to the device.
754  *	The caller must hold RCU or RTNL.
755  *	The returned device has not had its ref count increased
756  *	and the caller must therefore be careful about locking
757  *
758  */
759 
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 				       const char *ha)
762 {
763 	struct net_device *dev;
764 
765 	for_each_netdev_rcu(net, dev)
766 		if (dev->type == type &&
767 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
768 			return dev;
769 
770 	return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 	struct net_device *dev;
777 
778 	ASSERT_RTNL();
779 	for_each_netdev(net, dev)
780 		if (dev->type == type)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786 
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 	struct net_device *dev, *ret = NULL;
790 
791 	rcu_read_lock();
792 	for_each_netdev_rcu(net, dev)
793 		if (dev->type == type) {
794 			dev_hold(dev);
795 			ret = dev;
796 			break;
797 		}
798 	rcu_read_unlock();
799 	return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802 
803 /**
804  *	dev_get_by_flags_rcu - find any device with given flags
805  *	@net: the applicable net namespace
806  *	@if_flags: IFF_* values
807  *	@mask: bitmask of bits in if_flags to check
808  *
809  *	Search for any interface with the given flags. Returns NULL if a device
810  *	is not found or a pointer to the device. Must be called inside
811  *	rcu_read_lock(), and result refcount is unchanged.
812  */
813 
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 				    unsigned short mask)
816 {
817 	struct net_device *dev, *ret;
818 
819 	ret = NULL;
820 	for_each_netdev_rcu(net, dev) {
821 		if (((dev->flags ^ if_flags) & mask) == 0) {
822 			ret = dev;
823 			break;
824 		}
825 	}
826 	return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829 
830 /**
831  *	dev_valid_name - check if name is okay for network device
832  *	@name: name string
833  *
834  *	Network device names need to be valid file names
835  *	to allow sysfs to work.  We also disallow any kind of
836  *	whitespace.
837  */
838 int dev_valid_name(const char *name)
839 {
840 	if (*name == '\0')
841 		return 0;
842 	if (strlen(name) >= IFNAMSIZ)
843 		return 0;
844 	if (!strcmp(name, ".") || !strcmp(name, ".."))
845 		return 0;
846 
847 	while (*name) {
848 		if (*name == '/' || isspace(*name))
849 			return 0;
850 		name++;
851 	}
852 	return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855 
856 /**
857  *	__dev_alloc_name - allocate a name for a device
858  *	@net: network namespace to allocate the device name in
859  *	@name: name format string
860  *	@buf:  scratch buffer and result name string
861  *
862  *	Passed a format string - eg "lt%d" - it will try to find a suitable
863  *	id. It scans the list of devices to build up a free map, then chooses
864  *	the first empty slot. The caller must hold the dev_base or rtnl lock
865  *	while allocating the name and adding the device in order to avoid
866  *	duplicates.
867  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868  *	Returns the number of the unit assigned or a negative errno code.
869  */
870 
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 	int i = 0;
874 	const char *p;
875 	const int max_netdevices = 8*PAGE_SIZE;
876 	unsigned long *inuse;
877 	struct net_device *d;
878 
879 	p = strnchr(name, IFNAMSIZ-1, '%');
880 	if (p) {
881 		/*
882 		 * Verify the string as this thing may have come from
883 		 * the user.  There must be either one "%d" and no other "%"
884 		 * characters.
885 		 */
886 		if (p[1] != 'd' || strchr(p + 2, '%'))
887 			return -EINVAL;
888 
889 		/* Use one page as a bit array of possible slots */
890 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 		if (!inuse)
892 			return -ENOMEM;
893 
894 		for_each_netdev(net, d) {
895 			if (!sscanf(d->name, name, &i))
896 				continue;
897 			if (i < 0 || i >= max_netdevices)
898 				continue;
899 
900 			/*  avoid cases where sscanf is not exact inverse of printf */
901 			snprintf(buf, IFNAMSIZ, name, i);
902 			if (!strncmp(buf, d->name, IFNAMSIZ))
903 				set_bit(i, inuse);
904 		}
905 
906 		i = find_first_zero_bit(inuse, max_netdevices);
907 		free_page((unsigned long) inuse);
908 	}
909 
910 	if (buf != name)
911 		snprintf(buf, IFNAMSIZ, name, i);
912 	if (!__dev_get_by_name(net, buf))
913 		return i;
914 
915 	/* It is possible to run out of possible slots
916 	 * when the name is long and there isn't enough space left
917 	 * for the digits, or if all bits are used.
918 	 */
919 	return -ENFILE;
920 }
921 
922 /**
923  *	dev_alloc_name - allocate a name for a device
924  *	@dev: device
925  *	@name: name format string
926  *
927  *	Passed a format string - eg "lt%d" - it will try to find a suitable
928  *	id. It scans the list of devices to build up a free map, then chooses
929  *	the first empty slot. The caller must hold the dev_base or rtnl lock
930  *	while allocating the name and adding the device in order to avoid
931  *	duplicates.
932  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933  *	Returns the number of the unit assigned or a negative errno code.
934  */
935 
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 	char buf[IFNAMSIZ];
939 	struct net *net;
940 	int ret;
941 
942 	BUG_ON(!dev_net(dev));
943 	net = dev_net(dev);
944 	ret = __dev_alloc_name(net, name, buf);
945 	if (ret >= 0)
946 		strlcpy(dev->name, buf, IFNAMSIZ);
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
950 
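/*
 * Illustrative sketch (not part of the original file): picking a unit
 * number before registration.  The "foo%d" template and the function
 * name are assumptions; the caller is expected to hold RTNL.
 */
static int example_pick_name(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "foo%d");	/* fills in dev->name */
	if (unit < 0)
		return unit;

	/* dev->name is now e.g. "foo0" and unit holds the chosen index */
	return 0;
}
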
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
952 {
953 	struct net *net;
954 
955 	BUG_ON(!dev_net(dev));
956 	net = dev_net(dev);
957 
958 	if (!dev_valid_name(name))
959 		return -EINVAL;
960 
961 	if (fmt && strchr(name, '%'))
962 		return dev_alloc_name(dev, name);
963 	else if (__dev_get_by_name(net, name))
964 		return -EEXIST;
965 	else if (dev->name != name)
966 		strlcpy(dev->name, name, IFNAMSIZ);
967 
968 	return 0;
969 }
970 
971 /**
972  *	dev_change_name - change name of a device
973  *	@dev: device
974  *	@newname: name (or format string) must be at least IFNAMSIZ
975  *
976  *	Change the name of a device. A format string such as "eth%d"
977  *	can be passed for wildcarding.
978  */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 	char oldname[IFNAMSIZ];
982 	int err = 0;
983 	int ret;
984 	struct net *net;
985 
986 	ASSERT_RTNL();
987 	BUG_ON(!dev_net(dev));
988 
989 	net = dev_net(dev);
990 	if (dev->flags & IFF_UP)
991 		return -EBUSY;
992 
993 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 		return 0;
995 
996 	memcpy(oldname, dev->name, IFNAMSIZ);
997 
998 	err = dev_get_valid_name(dev, newname, 1);
999 	if (err < 0)
1000 		return err;
1001 
1002 rollback:
1003 	ret = device_rename(&dev->dev, dev->name);
1004 	if (ret) {
1005 		memcpy(dev->name, oldname, IFNAMSIZ);
1006 		return ret;
1007 	}
1008 
1009 	write_lock_bh(&dev_base_lock);
1010 	hlist_del(&dev->name_hlist);
1011 	write_unlock_bh(&dev_base_lock);
1012 
1013 	synchronize_rcu();
1014 
1015 	write_lock_bh(&dev_base_lock);
1016 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 	write_unlock_bh(&dev_base_lock);
1018 
1019 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 	ret = notifier_to_errno(ret);
1021 
1022 	if (ret) {
1023 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1024 		if (err >= 0) {
1025 			err = ret;
1026 			memcpy(dev->name, oldname, IFNAMSIZ);
1027 			goto rollback;
1028 		} else {
1029 			printk(KERN_ERR
1030 			       "%s: name change rollback failed: %d.\n",
1031 			       dev->name, ret);
1032 		}
1033 	}
1034 
1035 	return err;
1036 }
1037 
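/*
 * Illustrative sketch (not part of the original file): renaming an
 * interface from process context.  The "wan%d" template is an assumption;
 * the call fails with -EBUSY if the device is up.
 */
static int example_rename(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_name(dev, "wan%d");
	rtnl_unlock();

	return err;
}
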
1038 /**
1039  *	dev_set_alias - change ifalias of a device
1040  *	@dev: device
1041  *	@alias: name up to IFALIASZ
1042  *	@len: limit of bytes to copy from @alias
1043  *
1044  *	Set ifalias for a device,
1045  *	Set ifalias for a device.
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 	ASSERT_RTNL();
1049 
1050 	if (len >= IFALIASZ)
1051 		return -EINVAL;
1052 
1053 	if (!len) {
1054 		if (dev->ifalias) {
1055 			kfree(dev->ifalias);
1056 			dev->ifalias = NULL;
1057 		}
1058 		return 0;
1059 	}
1060 
1061 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 	if (!dev->ifalias)
1063 		return -ENOMEM;
1064 
1065 	strlcpy(dev->ifalias, alias, len+1);
1066 	return len;
1067 }
1068 
1069 
1070 /**
1071  *	netdev_features_change - device changes features
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081 
1082 /**
1083  *	netdev_state_change - device changes state
1084  *	@dev: device to cause notification
1085  *
1086  *	Called to indicate a device has changed state. This function calls
1087  *	the netdev notifier chain and sends a NEWLINK message
1088  *	to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 	if (dev->flags & IFF_UP) {
1093 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 	}
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098 
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 	return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104 
1105 /**
1106  *	dev_load 	- load a network module
1107  *	@net: the applicable net namespace
1108  *	@name: name of interface
1109  *
1110  *	If a network interface is not present and the process has suitable
1111  *	privileges this function loads the module. If module loading is not
1112  *	available in this kernel then it becomes a nop.
1113  */
1114 
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 	struct net_device *dev;
1118 	int no_module;
1119 
1120 	rcu_read_lock();
1121 	dev = dev_get_by_name_rcu(net, name);
1122 	rcu_read_unlock();
1123 
1124 	no_module = !dev;
1125 	if (no_module && capable(CAP_NET_ADMIN))
1126 		no_module = request_module("netdev-%s", name);
1127 	if (no_module && capable(CAP_SYS_MODULE)) {
1128 		if (!request_module("%s", name))
1129 			pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132 	}
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135 
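/*
 * Module-side counterpart (illustrative, not part of this file): for the
 * CAP_NET_ADMIN path above to find a driver, the module exports an alias
 * matching the "netdev-%s" request, e.g.
 *
 *	MODULE_ALIAS("netdev-sit0");
 *
 * so that request_module("netdev-%s", "sit0") resolves to it.  "sit0" is
 * just an example name.
 */
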
1136 static int __dev_open(struct net_device *dev)
1137 {
1138 	const struct net_device_ops *ops = dev->netdev_ops;
1139 	int ret;
1140 
1141 	ASSERT_RTNL();
1142 
1143 	if (!netif_device_present(dev))
1144 		return -ENODEV;
1145 
1146 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147 	ret = notifier_to_errno(ret);
1148 	if (ret)
1149 		return ret;
1150 
1151 	set_bit(__LINK_STATE_START, &dev->state);
1152 
1153 	if (ops->ndo_validate_addr)
1154 		ret = ops->ndo_validate_addr(dev);
1155 
1156 	if (!ret && ops->ndo_open)
1157 		ret = ops->ndo_open(dev);
1158 
1159 	if (ret)
1160 		clear_bit(__LINK_STATE_START, &dev->state);
1161 	else {
1162 		dev->flags |= IFF_UP;
1163 		net_dmaengine_get();
1164 		dev_set_rx_mode(dev);
1165 		dev_activate(dev);
1166 	}
1167 
1168 	return ret;
1169 }
1170 
1171 /**
1172  *	dev_open	- prepare an interface for use.
1173  *	@dev:	device to open
1174  *
1175  *	Takes a device from down to up state. The device's private open
1176  *	function is invoked and then the multicast lists are loaded. Finally
1177  *	the device is moved into the up state and a %NETDEV_UP message is
1178  *	sent to the netdev notifier chain.
1179  *
1180  *	Calling this function on an active interface is a nop. On a failure
1181  *	a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185 	int ret;
1186 
1187 	if (dev->flags & IFF_UP)
1188 		return 0;
1189 
1190 	ret = __dev_open(dev);
1191 	if (ret < 0)
1192 		return ret;
1193 
1194 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195 	call_netdevice_notifiers(NETDEV_UP, dev);
1196 
1197 	return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200 
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203 	struct net_device *dev;
1204 
1205 	ASSERT_RTNL();
1206 	might_sleep();
1207 
1208 	list_for_each_entry(dev, head, unreg_list) {
1209 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210 
1211 		clear_bit(__LINK_STATE_START, &dev->state);
1212 
1213 		/* Synchronize to scheduled poll. We cannot touch the poll list,
1214 		 * it can even be on a different cpu. So just clear netif_running().
1215 		 *
1216 		 * dev->stop() will invoke napi_disable() on all of its
1217 		 * napi_struct instances on this device.
1218 		 */
1219 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 	}
1221 
1222 	dev_deactivate_many(head);
1223 
1224 	list_for_each_entry(dev, head, unreg_list) {
1225 		const struct net_device_ops *ops = dev->netdev_ops;
1226 
1227 		/*
1228 		 *	Call the device specific close. This cannot fail
1229 		 *	and is only done if the device is UP.
1230 		 *
1231 		 *	We allow it to be called even after a DETACH hot-plug
1232 		 *	event.
1233 		 */
1234 		if (ops->ndo_stop)
1235 			ops->ndo_stop(dev);
1236 
1237 		dev->flags &= ~IFF_UP;
1238 		net_dmaengine_put();
1239 	}
1240 
1241 	return 0;
1242 }
1243 
1244 static int __dev_close(struct net_device *dev)
1245 {
1246 	int retval;
1247 	LIST_HEAD(single);
1248 
1249 	list_add(&dev->unreg_list, &single);
1250 	retval = __dev_close_many(&single);
1251 	list_del(&single);
1252 	return retval;
1253 }
1254 
1255 static int dev_close_many(struct list_head *head)
1256 {
1257 	struct net_device *dev, *tmp;
1258 	LIST_HEAD(tmp_list);
1259 
1260 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 		if (!(dev->flags & IFF_UP))
1262 			list_move(&dev->unreg_list, &tmp_list);
1263 
1264 	__dev_close_many(head);
1265 
1266 	list_for_each_entry(dev, head, unreg_list) {
1267 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 	}
1270 
1271 	/* rollback_registered_many needs the complete original list */
1272 	list_splice(&tmp_list, head);
1273 	return 0;
1274 }
1275 
1276 /**
1277  *	dev_close - shutdown an interface.
1278  *	@dev: device to shutdown
1279  *
1280  *	This function moves an active device into down state. A
1281  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283  *	chain.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287 	LIST_HEAD(single);
1288 
1289 	list_add(&dev->unreg_list, &single);
1290 	dev_close_many(&single);
1291 	list_del(&single);
1292 	return 0;
1293 }
1294 EXPORT_SYMBOL(dev_close);
1295 
1296 
1297 /**
1298  *	dev_disable_lro - disable Large Receive Offload on a device
1299  *	@dev: device
1300  *
1301  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1302  *	called under RTNL.  This is needed if received packets may be
1303  *	forwarded to another interface.
1304  */
1305 void dev_disable_lro(struct net_device *dev)
1306 {
1307 	u32 flags;
1308 
1309 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1310 		flags = dev->ethtool_ops->get_flags(dev);
1311 	else
1312 		flags = ethtool_op_get_flags(dev);
1313 
1314 	if (!(flags & ETH_FLAG_LRO))
1315 		return;
1316 
1317 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1318 	if (unlikely(dev->features & NETIF_F_LRO))
1319 		netdev_WARN(dev, "failed to disable LRO!\n");
1320 }
1321 EXPORT_SYMBOL(dev_disable_lro);
1322 
1323 
1324 static int dev_boot_phase = 1;
1325 
1326 /**
1327  *	register_netdevice_notifier - register a network notifier block
1328  *	@nb: notifier
1329  *
1330  *	Register a notifier to be called when network device events occur.
1331  *	The notifier passed is linked into the kernel structures and must
1332  *	not be reused until it has been unregistered. A negative errno code
1333  *	is returned on a failure.
1334  *
1335  * 	When registered, all registration and up events are replayed
1336  *	to the new notifier to allow it to have a race-free
1337  *	view of the network device list.
1338  */
1339 
1340 int register_netdevice_notifier(struct notifier_block *nb)
1341 {
1342 	struct net_device *dev;
1343 	struct net_device *last;
1344 	struct net *net;
1345 	int err;
1346 
1347 	rtnl_lock();
1348 	err = raw_notifier_chain_register(&netdev_chain, nb);
1349 	if (err)
1350 		goto unlock;
1351 	if (dev_boot_phase)
1352 		goto unlock;
1353 	for_each_net(net) {
1354 		for_each_netdev(net, dev) {
1355 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1356 			err = notifier_to_errno(err);
1357 			if (err)
1358 				goto rollback;
1359 
1360 			if (!(dev->flags & IFF_UP))
1361 				continue;
1362 
1363 			nb->notifier_call(nb, NETDEV_UP, dev);
1364 		}
1365 	}
1366 
1367 unlock:
1368 	rtnl_unlock();
1369 	return err;
1370 
1371 rollback:
1372 	last = dev;
1373 	for_each_net(net) {
1374 		for_each_netdev(net, dev) {
1375 			if (dev == last)
1376 				break;
1377 
1378 			if (dev->flags & IFF_UP) {
1379 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1380 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1381 			}
1382 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1383 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1384 		}
1385 	}
1386 
1387 	raw_notifier_chain_unregister(&netdev_chain, nb);
1388 	goto unlock;
1389 }
1390 EXPORT_SYMBOL(register_netdevice_notifier);
1391 
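/*
 * Illustrative sketch (not part of the original file): a minimal notifier.
 * In this kernel the callback receives the struct net_device pointer
 * directly as @ptr; the names below are assumptions for the example.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP)
		printk(KERN_INFO "%s is up\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* registered from module init with:
 *	register_netdevice_notifier(&example_netdev_notifier);
 */
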
1392 /**
1393  *	unregister_netdevice_notifier - unregister a network notifier block
1394  *	@nb: notifier
1395  *
1396  *	Unregister a notifier previously registered by
1397  *	register_netdevice_notifier(). The notifier is unlinked from the
1398  *	kernel structures and may then be reused. A negative errno code
1399  *	is returned on a failure.
1400  */
1401 
1402 int unregister_netdevice_notifier(struct notifier_block *nb)
1403 {
1404 	int err;
1405 
1406 	rtnl_lock();
1407 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1408 	rtnl_unlock();
1409 	return err;
1410 }
1411 EXPORT_SYMBOL(unregister_netdevice_notifier);
1412 
1413 /**
1414  *	call_netdevice_notifiers - call all network notifier blocks
1415  *      @val: value passed unmodified to notifier function
1416  *      @dev: net_device pointer passed unmodified to notifier function
1417  *
1418  *	Call all network notifier blocks.  Parameters and return value
1419  *	are as for raw_notifier_call_chain().
1420  */
1421 
1422 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1423 {
1424 	ASSERT_RTNL();
1425 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1426 }
1427 EXPORT_SYMBOL(call_netdevice_notifiers);
1428 
1429 /* When > 0 there are consumers of rx skb time stamps */
1430 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1431 
1432 void net_enable_timestamp(void)
1433 {
1434 	atomic_inc(&netstamp_needed);
1435 }
1436 EXPORT_SYMBOL(net_enable_timestamp);
1437 
1438 void net_disable_timestamp(void)
1439 {
1440 	atomic_dec(&netstamp_needed);
1441 }
1442 EXPORT_SYMBOL(net_disable_timestamp);
1443 
1444 static inline void net_timestamp_set(struct sk_buff *skb)
1445 {
1446 	if (atomic_read(&netstamp_needed))
1447 		__net_timestamp(skb);
1448 	else
1449 		skb->tstamp.tv64 = 0;
1450 }
1451 
1452 static inline void net_timestamp_check(struct sk_buff *skb)
1453 {
1454 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1455 		__net_timestamp(skb);
1456 }
1457 
1458 static inline bool is_skb_forwardable(struct net_device *dev,
1459 				      struct sk_buff *skb)
1460 {
1461 	unsigned int len;
1462 
1463 	if (!(dev->flags & IFF_UP))
1464 		return false;
1465 
1466 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1467 	if (skb->len <= len)
1468 		return true;
1469 
1470 	/* if TSO is enabled, we don't care about the length as the packet
1471 	 * could be forwarded without being segmented before
1472 	 */
1473 	if (skb_is_gso(skb))
1474 		return true;
1475 
1476 	return false;
1477 }
1478 
1479 /**
1480  * dev_forward_skb - loopback an skb to another netif
1481  *
1482  * @dev: destination network device
1483  * @skb: buffer to forward
1484  *
1485  * return values:
1486  *	NET_RX_SUCCESS	(no congestion)
1487  *	NET_RX_DROP     (packet was dropped, but freed)
1488  *
1489  * dev_forward_skb can be used for injecting an skb from the
1490  * start_xmit function of one device into the receive queue
1491  * of another device.
1492  *
1493  * The receiving device may be in another namespace, so
1494  * we have to clear all information in the skb that could
1495  * impact namespace isolation.
1496  */
1497 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1498 {
1499 	skb_orphan(skb);
1500 	nf_reset(skb);
1501 
1502 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1503 		atomic_long_inc(&dev->rx_dropped);
1504 		kfree_skb(skb);
1505 		return NET_RX_DROP;
1506 	}
1507 	skb_set_dev(skb, dev);
1508 	skb->tstamp.tv64 = 0;
1509 	skb->pkt_type = PACKET_HOST;
1510 	skb->protocol = eth_type_trans(skb, dev);
1511 	return netif_rx(skb);
1512 }
1513 EXPORT_SYMBOL_GPL(dev_forward_skb);
1514 
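/*
 * Illustrative sketch (not part of the original file): a veth-style pair
 * device handing frames from its own start_xmit to the peer's receive
 * path.  example_get_peer() is a hypothetical helper.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);
	int len = skb->len;	/* skb may be freed by dev_forward_skb() */

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}
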
1515 static inline int deliver_skb(struct sk_buff *skb,
1516 			      struct packet_type *pt_prev,
1517 			      struct net_device *orig_dev)
1518 {
1519 	atomic_inc(&skb->users);
1520 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1521 }
1522 
1523 /*
1524  *	Support routine. Sends outgoing frames to any network
1525  *	taps currently in use.
1526  */
1527 
1528 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1529 {
1530 	struct packet_type *ptype;
1531 	struct sk_buff *skb2 = NULL;
1532 	struct packet_type *pt_prev = NULL;
1533 
1534 	rcu_read_lock();
1535 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1536 		/* Never send packets back to the socket
1537 		 * they originated from - MvS ([email protected])
1538 		 */
1539 		if ((ptype->dev == dev || !ptype->dev) &&
1540 		    (ptype->af_packet_priv == NULL ||
1541 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1542 			if (pt_prev) {
1543 				deliver_skb(skb2, pt_prev, skb->dev);
1544 				pt_prev = ptype;
1545 				continue;
1546 			}
1547 
1548 			skb2 = skb_clone(skb, GFP_ATOMIC);
1549 			if (!skb2)
1550 				break;
1551 
1552 			net_timestamp_set(skb2);
1553 
1554 			/* skb->nh should be correctly
1555 			   set by sender, so that the second statement is
1556 			   just protection against buggy protocols.
1557 			 */
1558 			skb_reset_mac_header(skb2);
1559 
1560 			if (skb_network_header(skb2) < skb2->data ||
1561 			    skb2->network_header > skb2->tail) {
1562 				if (net_ratelimit())
1563 					printk(KERN_CRIT "protocol %04x is "
1564 					       "buggy, dev %s\n",
1565 					       ntohs(skb2->protocol),
1566 					       dev->name);
1567 				skb_reset_network_header(skb2);
1568 			}
1569 
1570 			skb2->transport_header = skb2->network_header;
1571 			skb2->pkt_type = PACKET_OUTGOING;
1572 			pt_prev = ptype;
1573 		}
1574 	}
1575 	if (pt_prev)
1576 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1577 	rcu_read_unlock();
1578 }
1579 
1580 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1581  * @dev: Network device
1582  * @txq: number of queues available
1583  *
1584  * If real_num_tx_queues is changed the tc mappings may no longer be
1585  * valid. To resolve this, verify that each tc mapping remains valid and,
1586  * if not, reset the mapping to TC0. With no priorities mapping to an
1587  * offset/count pair it will no longer be used. In the worst case, if TC0
1588  * itself is invalid nothing can be done, so priority mappings are disabled
1589  * entirely. It is expected that drivers will fix this mapping if they can
1590  * before calling netif_set_real_num_tx_queues.
1591  */
1592 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1593 {
1594 	int i;
1595 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1596 
1597 	/* If TC0 is invalidated disable TC mapping */
1598 	if (tc->offset + tc->count > txq) {
1599 		pr_warning("Number of in use tx queues changed "
1600 			   "invalidating tc mappings. Priority "
1601 			   "traffic classification disabled!\n");
1602 		dev->num_tc = 0;
1603 		return;
1604 	}
1605 
1606 	/* Invalidated prio to tc mappings set to TC0 */
1607 	for (i = 1; i < TC_BITMASK + 1; i++) {
1608 		int q = netdev_get_prio_tc_map(dev, i);
1609 
1610 		tc = &dev->tc_to_txq[q];
1611 		if (tc->offset + tc->count > txq) {
1612 			pr_warning("Number of in use tx queues "
1613 				   "changed. Priority %i to tc "
1614 				   "mapping %i is no longer valid "
1615 				   "setting map to 0\n",
1616 				   i, q);
1617 			netdev_set_prio_tc_map(dev, i, 0);
1618 		}
1619 	}
1620 }
1621 
1622 /*
1623  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1624  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1625  */
1626 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1627 {
1628 	int rc;
1629 
1630 	if (txq < 1 || txq > dev->num_tx_queues)
1631 		return -EINVAL;
1632 
1633 	if (dev->reg_state == NETREG_REGISTERED ||
1634 	    dev->reg_state == NETREG_UNREGISTERING) {
1635 		ASSERT_RTNL();
1636 
1637 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1638 						  txq);
1639 		if (rc)
1640 			return rc;
1641 
1642 		if (dev->num_tc)
1643 			netif_setup_tc(dev, txq);
1644 
1645 		if (txq < dev->real_num_tx_queues)
1646 			qdisc_reset_all_tx_gt(dev, txq);
1647 	}
1648 
1649 	dev->real_num_tx_queues = txq;
1650 	return 0;
1651 }
1652 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1653 
1654 #ifdef CONFIG_RPS
1655 /**
1656  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1657  *	@dev: Network device
1658  *	@rxq: Actual number of RX queues
1659  *
1660  *	This must be called either with the rtnl_lock held or before
1661  *	registration of the net device.  Returns 0 on success, or a
1662  *	negative error code.  If called before registration, it always
1663  *	succeeds.
1664  */
1665 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1666 {
1667 	int rc;
1668 
1669 	if (rxq < 1 || rxq > dev->num_rx_queues)
1670 		return -EINVAL;
1671 
1672 	if (dev->reg_state == NETREG_REGISTERED) {
1673 		ASSERT_RTNL();
1674 
1675 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1676 						  rxq);
1677 		if (rc)
1678 			return rc;
1679 	}
1680 
1681 	dev->real_num_rx_queues = rxq;
1682 	return 0;
1683 }
1684 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1685 #endif
1686 
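/*
 * Illustrative sketch (not part of the original file): a multiqueue driver
 * trimming its queue counts at probe time, before register_netdev().  The
 * function name and the count of 4 are assumptions.
 */
static int example_setup_queues(struct net_device *dev)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, 4);
	if (err)
		return err;
#ifdef CONFIG_RPS
	err = netif_set_real_num_rx_queues(dev, 4);
#endif
	return err;
}
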
1687 static inline void __netif_reschedule(struct Qdisc *q)
1688 {
1689 	struct softnet_data *sd;
1690 	unsigned long flags;
1691 
1692 	local_irq_save(flags);
1693 	sd = &__get_cpu_var(softnet_data);
1694 	q->next_sched = NULL;
1695 	*sd->output_queue_tailp = q;
1696 	sd->output_queue_tailp = &q->next_sched;
1697 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1698 	local_irq_restore(flags);
1699 }
1700 
1701 void __netif_schedule(struct Qdisc *q)
1702 {
1703 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1704 		__netif_reschedule(q);
1705 }
1706 EXPORT_SYMBOL(__netif_schedule);
1707 
1708 void dev_kfree_skb_irq(struct sk_buff *skb)
1709 {
1710 	if (atomic_dec_and_test(&skb->users)) {
1711 		struct softnet_data *sd;
1712 		unsigned long flags;
1713 
1714 		local_irq_save(flags);
1715 		sd = &__get_cpu_var(softnet_data);
1716 		skb->next = sd->completion_queue;
1717 		sd->completion_queue = skb;
1718 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1719 		local_irq_restore(flags);
1720 	}
1721 }
1722 EXPORT_SYMBOL(dev_kfree_skb_irq);
1723 
1724 void dev_kfree_skb_any(struct sk_buff *skb)
1725 {
1726 	if (in_irq() || irqs_disabled())
1727 		dev_kfree_skb_irq(skb);
1728 	else
1729 		dev_kfree_skb(skb);
1730 }
1731 EXPORT_SYMBOL(dev_kfree_skb_any);
1732 
1733 
1734 /**
1735  * netif_device_detach - mark device as removed
1736  * @dev: network device
1737  *
1738  * Mark device as removed from system and therefore no longer available.
1739  */
1740 void netif_device_detach(struct net_device *dev)
1741 {
1742 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1743 	    netif_running(dev)) {
1744 		netif_tx_stop_all_queues(dev);
1745 	}
1746 }
1747 EXPORT_SYMBOL(netif_device_detach);
1748 
1749 /**
1750  * netif_device_attach - mark device as attached
1751  * @dev: network device
1752  *
1753  * Mark device as attached to the system and restart if needed.
1754  */
1755 void netif_device_attach(struct net_device *dev)
1756 {
1757 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1758 	    netif_running(dev)) {
1759 		netif_tx_wake_all_queues(dev);
1760 		__netdev_watchdog_up(dev);
1761 	}
1762 }
1763 EXPORT_SYMBOL(netif_device_attach);
1764 
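/*
 * Illustrative sketch (not part of the original file): the usual pairing
 * of the two helpers above in a driver's power-management hooks.  The
 * function names are assumptions.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stop tx queues, mark absent */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... bring the hardware back up ... */
	netif_device_attach(dev);	/* mark present, wake tx queues */
	return 0;
}
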
1765 /**
1766  * skb_set_dev - assign a new device to a buffer
1767  * @skb: buffer for the new device
1768  * @dev: network device
1769  *
1770  * If an skb is owned by a device already, we have to reset
1771  * all data private to the namespace a device belongs to
1772  * before assigning it a new device.
1773  */
1774 #ifdef CONFIG_NET_NS
1775 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1776 {
1777 	skb_dst_drop(skb);
1778 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1779 		secpath_reset(skb);
1780 		nf_reset(skb);
1781 		skb_init_secmark(skb);
1782 		skb->mark = 0;
1783 		skb->priority = 0;
1784 		skb->nf_trace = 0;
1785 		skb->ipvs_property = 0;
1786 #ifdef CONFIG_NET_SCHED
1787 		skb->tc_index = 0;
1788 #endif
1789 	}
1790 	skb->dev = dev;
1791 }
1792 EXPORT_SYMBOL(skb_set_dev);
1793 #endif /* CONFIG_NET_NS */
1794 
1795 /*
1796  * Invalidate hardware checksum when packet is to be mangled, and
1797  * complete checksum manually on outgoing path.
1798  */
1799 int skb_checksum_help(struct sk_buff *skb)
1800 {
1801 	__wsum csum;
1802 	int ret = 0, offset;
1803 
1804 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1805 		goto out_set_summed;
1806 
1807 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1808 		/* Let GSO fix up the checksum. */
1809 		goto out_set_summed;
1810 	}
1811 
1812 	offset = skb_checksum_start_offset(skb);
1813 	BUG_ON(offset >= skb_headlen(skb));
1814 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1815 
1816 	offset += skb->csum_offset;
1817 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1818 
1819 	if (skb_cloned(skb) &&
1820 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1821 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1822 		if (ret)
1823 			goto out;
1824 	}
1825 
1826 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1827 out_set_summed:
1828 	skb->ip_summed = CHECKSUM_NONE;
1829 out:
1830 	return ret;
1831 }
1832 EXPORT_SYMBOL(skb_checksum_help);
1833 
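/*
 * Illustrative sketch (not part of the original file): a driver resolving
 * a checksum in software on its transmit path when the hardware cannot
 * offload it.  The function name is an assumption.
 */
static netdev_tx_t example_xmit_csum(struct sk_buff *skb,
				     struct net_device *dev)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) {
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* ... hand the skb to the hardware ... */
	return NETDEV_TX_OK;
}
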
1834 /**
1835  *	skb_gso_segment - Perform segmentation on skb.
1836  *	@skb: buffer to segment
1837  *	@features: features for the output path (see dev->features)
1838  *
1839  *	This function segments the given skb and returns a list of segments.
1840  *
1841  *	It may return NULL if the skb requires no segmentation.  This is
1842  *	only possible when GSO is used for verifying header integrity.
1843  */
1844 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1845 {
1846 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1847 	struct packet_type *ptype;
1848 	__be16 type = skb->protocol;
1849 	int vlan_depth = ETH_HLEN;
1850 	int err;
1851 
1852 	while (type == htons(ETH_P_8021Q)) {
1853 		struct vlan_hdr *vh;
1854 
1855 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1856 			return ERR_PTR(-EINVAL);
1857 
1858 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1859 		type = vh->h_vlan_encapsulated_proto;
1860 		vlan_depth += VLAN_HLEN;
1861 	}
1862 
1863 	skb_reset_mac_header(skb);
1864 	skb->mac_len = skb->network_header - skb->mac_header;
1865 	__skb_pull(skb, skb->mac_len);
1866 
1867 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1868 		struct net_device *dev = skb->dev;
1869 		struct ethtool_drvinfo info = {};
1870 
1871 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1872 			dev->ethtool_ops->get_drvinfo(dev, &info);
1873 
1874 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1875 		     info.driver, dev ? dev->features : 0L,
1876 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1877 		     skb->len, skb->data_len, skb->ip_summed);
1878 
1879 		if (skb_header_cloned(skb) &&
1880 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1881 			return ERR_PTR(err);
1882 	}
1883 
1884 	rcu_read_lock();
1885 	list_for_each_entry_rcu(ptype,
1886 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1887 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1888 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1889 				err = ptype->gso_send_check(skb);
1890 				segs = ERR_PTR(err);
1891 				if (err || skb_gso_ok(skb, features))
1892 					break;
1893 				__skb_push(skb, (skb->data -
1894 						 skb_network_header(skb)));
1895 			}
1896 			segs = ptype->gso_segment(skb, features);
1897 			break;
1898 		}
1899 	}
1900 	rcu_read_unlock();
1901 
1902 	__skb_push(skb, skb->data - skb_mac_header(skb));
1903 
1904 	return segs;
1905 }
1906 EXPORT_SYMBOL(skb_gso_segment);
1907 
1908 /* Take action when hardware reception checksum errors are detected. */
1909 #ifdef CONFIG_BUG
1910 void netdev_rx_csum_fault(struct net_device *dev)
1911 {
1912 	if (net_ratelimit()) {
1913 		printk(KERN_ERR "%s: hw csum failure.\n",
1914 			dev ? dev->name : "<unknown>");
1915 		dump_stack();
1916 	}
1917 }
1918 EXPORT_SYMBOL(netdev_rx_csum_fault);
1919 #endif
1920 
1921 /* Actually, we should eliminate this check as soon as we know that:
1922  * 1. An IOMMU is present and can map all of the memory.
1923  * 2. No high memory really exists on this machine.
1924  */
1925 
1926 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1927 {
1928 #ifdef CONFIG_HIGHMEM
1929 	int i;
1930 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1931 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1932 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1933 				return 1;
1934 	}
1935 
1936 	if (PCI_DMA_BUS_IS_PHYS) {
1937 		struct device *pdev = dev->dev.parent;
1938 
1939 		if (!pdev)
1940 			return 0;
1941 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1942 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1943 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1944 				return 1;
1945 		}
1946 	}
1947 #endif
1948 	return 0;
1949 }
1950 
1951 struct dev_gso_cb {
1952 	void (*destructor)(struct sk_buff *skb);
1953 };
1954 
1955 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1956 
1957 static void dev_gso_skb_destructor(struct sk_buff *skb)
1958 {
1959 	struct dev_gso_cb *cb;
1960 
1961 	do {
1962 		struct sk_buff *nskb = skb->next;
1963 
1964 		skb->next = nskb->next;
1965 		nskb->next = NULL;
1966 		kfree_skb(nskb);
1967 	} while (skb->next);
1968 
1969 	cb = DEV_GSO_CB(skb);
1970 	if (cb->destructor)
1971 		cb->destructor(skb);
1972 }
1973 
1974 /**
1975  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1976  *	@skb: buffer to segment
1977  *	@features: device features as applicable to this skb
1978  *
1979  *	This function segments the given skb and stores the list of segments
1980  *	in skb->next.
1981  */
1982 static int dev_gso_segment(struct sk_buff *skb, int features)
1983 {
1984 	struct sk_buff *segs;
1985 
1986 	segs = skb_gso_segment(skb, features);
1987 
1988 	/* Verifying header integrity only. */
1989 	if (!segs)
1990 		return 0;
1991 
1992 	if (IS_ERR(segs))
1993 		return PTR_ERR(segs);
1994 
1995 	skb->next = segs;
1996 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1997 	skb->destructor = dev_gso_skb_destructor;
1998 
1999 	return 0;
2000 }
2001 
2002 /*
2003  * Try to orphan skb early, right before transmission by the device.
2004  * We cannot orphan the skb if a TX timestamp is requested, or if the sk
2005  * reference is needed at driver level for other reasons, e.g. see net/can/raw.c
2006  */
2007 static inline void skb_orphan_try(struct sk_buff *skb)
2008 {
2009 	struct sock *sk = skb->sk;
2010 
2011 	if (sk && !skb_shinfo(skb)->tx_flags) {
2012 		/* skb_tx_hash() won't be able to get at sk once the skb is orphaned,
2013 		 * so we copy sk_hash into skb->rxhash first.
2014 		 */
2015 		if (!skb->rxhash)
2016 			skb->rxhash = sk->sk_hash;
2017 		skb_orphan(skb);
2018 	}
2019 }
2020 
2021 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2022 {
2023 	return ((features & NETIF_F_GEN_CSUM) ||
2024 		((features & NETIF_F_V4_CSUM) &&
2025 		 protocol == htons(ETH_P_IP)) ||
2026 		((features & NETIF_F_V6_CSUM) &&
2027 		 protocol == htons(ETH_P_IPV6)) ||
2028 		((features & NETIF_F_FCOE_CRC) &&
2029 		 protocol == htons(ETH_P_FCOE)));
2030 }
2031 
2032 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2033 {
2034 	if (!can_checksum_protocol(features, protocol)) {
2035 		features &= ~NETIF_F_ALL_CSUM;
2036 		features &= ~NETIF_F_SG;
2037 	} else if (illegal_highdma(skb->dev, skb)) {
2038 		features &= ~NETIF_F_SG;
2039 	}
2040 
2041 	return features;
2042 }
2043 
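/**
 *	netif_skb_features - compute the feature set applicable to one skb
 *	@skb: buffer that is about to be transmitted
 *
 *	Start from the device feature set and restrict it for VLAN traffic:
 *	tagged frames are limited to dev->vlan_features plus TX VLAN
 *	insertion, and frames that would still carry an 802.1Q header on the
 *	wire keep only a minimal safe subset.  The result is then harmonized
 *	against the checksum and highmem-DMA capabilities of the device.
 */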
2044 u32 netif_skb_features(struct sk_buff *skb)
2045 {
2046 	__be16 protocol = skb->protocol;
2047 	u32 features = skb->dev->features;
2048 
2049 	if (protocol == htons(ETH_P_8021Q)) {
2050 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2051 		protocol = veh->h_vlan_encapsulated_proto;
2052 	} else if (!vlan_tx_tag_present(skb)) {
2053 		return harmonize_features(skb, protocol, features);
2054 	}
2055 
2056 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2057 
2058 	if (protocol != htons(ETH_P_8021Q)) {
2059 		return harmonize_features(skb, protocol, features);
2060 	} else {
2061 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2062 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2063 		return harmonize_features(skb, protocol, features);
2064 	}
2065 }
2066 EXPORT_SYMBOL(netif_skb_features);
2067 
2068 /*
2069  * Returns true if either:
2070  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2071  *	2. skb is fragmented and the device does not support SG, or if
2072  *	   at least one of the fragments is in highmem and the device does not
2073  *	   support DMA from it.
2074  */
2075 static inline int skb_needs_linearize(struct sk_buff *skb,
2076 				      int features)
2077 {
2078 	return skb_is_nonlinear(skb) &&
2079 			((skb_has_frag_list(skb) &&
2080 				!(features & NETIF_F_FRAGLIST)) ||
2081 			(skb_shinfo(skb)->nr_frags &&
2082 				!(features & NETIF_F_SG)));
2083 }
2084 
2085 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2086 			struct netdev_queue *txq)
2087 {
2088 	const struct net_device_ops *ops = dev->netdev_ops;
2089 	int rc = NETDEV_TX_OK;
2090 
2091 	if (likely(!skb->next)) {
2092 		u32 features;
2093 
2094 		/*
2095 		 * If the device doesn't need skb->dst, release it right now while
2096 		 * it's still hot in this CPU's cache.
2097 		 */
2098 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2099 			skb_dst_drop(skb);
2100 
2101 		if (!list_empty(&ptype_all))
2102 			dev_queue_xmit_nit(skb, dev);
2103 
2104 		skb_orphan_try(skb);
2105 
2106 		features = netif_skb_features(skb);
2107 
2108 		if (vlan_tx_tag_present(skb) &&
2109 		    !(features & NETIF_F_HW_VLAN_TX)) {
2110 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2111 			if (unlikely(!skb))
2112 				goto out;
2113 
2114 			skb->vlan_tci = 0;
2115 		}
2116 
2117 		if (netif_needs_gso(skb, features)) {
2118 			if (unlikely(dev_gso_segment(skb, features)))
2119 				goto out_kfree_skb;
2120 			if (skb->next)
2121 				goto gso;
2122 		} else {
2123 			if (skb_needs_linearize(skb, features) &&
2124 			    __skb_linearize(skb))
2125 				goto out_kfree_skb;
2126 
2127 			/* If packet is not checksummed and device does not
2128 			 * support checksumming for this protocol, complete
2129 			 * checksumming here.
2130 			 */
2131 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2132 				skb_set_transport_header(skb,
2133 					skb_checksum_start_offset(skb));
2134 				if (!(features & NETIF_F_ALL_CSUM) &&
2135 				     skb_checksum_help(skb))
2136 					goto out_kfree_skb;
2137 			}
2138 		}
2139 
2140 		rc = ops->ndo_start_xmit(skb, dev);
2141 		trace_net_dev_xmit(skb, rc);
2142 		if (rc == NETDEV_TX_OK)
2143 			txq_trans_update(txq);
2144 		return rc;
2145 	}
2146 
2147 gso:
2148 	do {
2149 		struct sk_buff *nskb = skb->next;
2150 
2151 		skb->next = nskb->next;
2152 		nskb->next = NULL;
2153 
2154 		/*
2155 		 * If the device doesn't need nskb->dst, release it right now while
2156 		 * it's still hot in this CPU's cache.
2157 		 */
2158 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2159 			skb_dst_drop(nskb);
2160 
2161 		rc = ops->ndo_start_xmit(nskb, dev);
2162 		trace_net_dev_xmit(nskb, rc);
2163 		if (unlikely(rc != NETDEV_TX_OK)) {
2164 			if (rc & ~NETDEV_TX_MASK)
2165 				goto out_kfree_gso_skb;
2166 			nskb->next = skb->next;
2167 			skb->next = nskb;
2168 			return rc;
2169 		}
2170 		txq_trans_update(txq);
2171 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2172 			return NETDEV_TX_BUSY;
2173 	} while (skb->next);
2174 
2175 out_kfree_gso_skb:
2176 	if (likely(skb->next == NULL))
2177 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2178 out_kfree_skb:
2179 	kfree_skb(skb);
2180 out:
2181 	return rc;
2182 }
2183 
2184 static u32 hashrnd __read_mostly;
2185 
2186 /*
2187  * Returns a Tx hash based on the given packet descriptor and the number of
2188  * Tx queues to be used as a distribution range.
2189  */
2190 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2191 		  unsigned int num_tx_queues)
2192 {
2193 	u32 hash;
2194 	u16 qoffset = 0;
2195 	u16 qcount = num_tx_queues;
2196 
2197 	if (skb_rx_queue_recorded(skb)) {
2198 		hash = skb_get_rx_queue(skb);
2199 		while (unlikely(hash >= num_tx_queues))
2200 			hash -= num_tx_queues;
2201 		return hash;
2202 	}
2203 
2204 	if (dev->num_tc) {
2205 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2206 		qoffset = dev->tc_to_txq[tc].offset;
2207 		qcount = dev->tc_to_txq[tc].count;
2208 	}
2209 
2210 	if (skb->sk && skb->sk->sk_hash)
2211 		hash = skb->sk->sk_hash;
2212 	else
2213 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2214 	hash = jhash_1word(hash, hashrnd);
2215 
2216 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
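	/*
	 * Map the 32-bit hash onto [0, qcount) with a multiply and shift,
	 * (hash * qcount) >> 32, which avoids a modulo on the fast path.
	 */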
2217 }
2218 EXPORT_SYMBOL(__skb_tx_hash);
2219 
2220 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2221 {
2222 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2223 		if (net_ratelimit()) {
2224 			pr_warning("%s selects TX queue %d, but "
2225 				"real number of TX queues is %d\n",
2226 				dev->name, queue_index, dev->real_num_tx_queues);
2227 		}
2228 		return 0;
2229 	}
2230 	return queue_index;
2231 }
2232 
2233 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2234 {
2235 #ifdef CONFIG_XPS
2236 	struct xps_dev_maps *dev_maps;
2237 	struct xps_map *map;
2238 	int queue_index = -1;
2239 
2240 	rcu_read_lock();
2241 	dev_maps = rcu_dereference(dev->xps_maps);
2242 	if (dev_maps) {
2243 		map = rcu_dereference(
2244 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2245 		if (map) {
2246 			if (map->len == 1)
2247 				queue_index = map->queues[0];
2248 			else {
2249 				u32 hash;
2250 				if (skb->sk && skb->sk->sk_hash)
2251 					hash = skb->sk->sk_hash;
2252 				else
2253 					hash = (__force u16) skb->protocol ^
2254 					    skb->rxhash;
2255 				hash = jhash_1word(hash, hashrnd);
2256 				queue_index = map->queues[
2257 				    ((u64)hash * map->len) >> 32];
2258 			}
2259 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2260 				queue_index = -1;
2261 		}
2262 	}
2263 	rcu_read_unlock();
2264 
2265 	return queue_index;
2266 #else
2267 	return -1;
2268 #endif
2269 }
2270 
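/*
 * Pick the TX queue for this skb: queue 0 for single-queue devices,
 * otherwise the driver's ndo_select_queue() if it provides one, otherwise
 * the socket's cached queue, falling back to XPS and finally skb_tx_hash().
 */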
2271 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2272 					struct sk_buff *skb)
2273 {
2274 	int queue_index;
2275 	const struct net_device_ops *ops = dev->netdev_ops;
2276 
2277 	if (dev->real_num_tx_queues == 1)
2278 		queue_index = 0;
2279 	else if (ops->ndo_select_queue) {
2280 		queue_index = ops->ndo_select_queue(dev, skb);
2281 		queue_index = dev_cap_txqueue(dev, queue_index);
2282 	} else {
2283 		struct sock *sk = skb->sk;
2284 		queue_index = sk_tx_queue_get(sk);
2285 
2286 		if (queue_index < 0 || skb->ooo_okay ||
2287 		    queue_index >= dev->real_num_tx_queues) {
2288 			int old_index = queue_index;
2289 
2290 			queue_index = get_xps_queue(dev, skb);
2291 			if (queue_index < 0)
2292 				queue_index = skb_tx_hash(dev, skb);
2293 
2294 			if (queue_index != old_index && sk) {
2295 				struct dst_entry *dst =
2296 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2297 
2298 				if (dst && skb_dst(skb) == dst)
2299 					sk_tx_queue_set(sk, queue_index);
2300 			}
2301 		}
2302 	}
2303 
2304 	skb_set_queue_mapping(skb, queue_index);
2305 	return netdev_get_tx_queue(dev, queue_index);
2306 }
2307 
2308 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2309 				 struct net_device *dev,
2310 				 struct netdev_queue *txq)
2311 {
2312 	spinlock_t *root_lock = qdisc_lock(q);
2313 	bool contended;
2314 	int rc;
2315 
2316 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2317 	qdisc_calculate_pkt_len(skb, q);
2318 	/*
2319 	 * Heuristic to force contended enqueues to serialize on a
2320 	 * separate lock before trying to get the qdisc main lock.
2321 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2322 	 * and dequeue packets faster.
2323 	 */
2324 	contended = qdisc_is_running(q);
2325 	if (unlikely(contended))
2326 		spin_lock(&q->busylock);
2327 
2328 	spin_lock(root_lock);
2329 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2330 		kfree_skb(skb);
2331 		rc = NET_XMIT_DROP;
2332 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2333 		   qdisc_run_begin(q)) {
2334 		/*
2335 		 * This is a work-conserving queue; there are no old skbs
2336 		 * waiting to be sent out; and the qdisc is not running -
2337 		 * xmit the skb directly.
2338 		 */
2339 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2340 			skb_dst_force(skb);
2341 
2342 		qdisc_bstats_update(q, skb);
2343 
2344 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2345 			if (unlikely(contended)) {
2346 				spin_unlock(&q->busylock);
2347 				contended = false;
2348 			}
2349 			__qdisc_run(q);
2350 		} else
2351 			qdisc_run_end(q);
2352 
2353 		rc = NET_XMIT_SUCCESS;
2354 	} else {
2355 		skb_dst_force(skb);
2356 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2357 		if (qdisc_run_begin(q)) {
2358 			if (unlikely(contended)) {
2359 				spin_unlock(&q->busylock);
2360 				contended = false;
2361 			}
2362 			__qdisc_run(q);
2363 		}
2364 	}
2365 	spin_unlock(root_lock);
2366 	if (unlikely(contended))
2367 		spin_unlock(&q->busylock);
2368 	return rc;
2369 }
2370 
2371 static DEFINE_PER_CPU(int, xmit_recursion);
2372 #define RECURSION_LIMIT 10
2373 
2374 /**
2375  *	dev_queue_xmit - transmit a buffer
2376  *	@skb: buffer to transmit
2377  *
2378  *	Queue a buffer for transmission to a network device. The caller must
2379  *	have set the device and priority and built the buffer before calling
2380  *	this function. The function can be called from an interrupt.
2381  *
2382  *	A negative errno code is returned on a failure. A success does not
2383  *	guarantee the frame will be transmitted as it may be dropped due
2384  *	to congestion or traffic shaping.
2385  *
2386  * -----------------------------------------------------------------------------------
2387  *      I notice this method can also return errors from the queue disciplines,
2388  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2389  *      be positive.
2390  *
2391  *      Regardless of the return value, the skb is consumed, so it is currently
2392  *      difficult to retry a send to this method.  (You can bump the ref count
2393  *      before sending to hold a reference for retry if you are careful.)
2394  *
2395  *      When calling this method, interrupts MUST be enabled.  This is because
2396  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2397  *          --BLG
2398  */
2399 int dev_queue_xmit(struct sk_buff *skb)
2400 {
2401 	struct net_device *dev = skb->dev;
2402 	struct netdev_queue *txq;
2403 	struct Qdisc *q;
2404 	int rc = -ENOMEM;
2405 
2406 	/* Disable soft irqs for various locks below. Also
2407 	 * stops preemption for RCU.
2408 	 */
2409 	rcu_read_lock_bh();
2410 
2411 	txq = dev_pick_tx(dev, skb);
2412 	q = rcu_dereference_bh(txq->qdisc);
2413 
2414 #ifdef CONFIG_NET_CLS_ACT
2415 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2416 #endif
2417 	trace_net_dev_queue(skb);
2418 	if (q->enqueue) {
2419 		rc = __dev_xmit_skb(skb, q, dev, txq);
2420 		goto out;
2421 	}
2422 
2423 	/* The device has no queue. Common case for software devices:
2424 	   loopback, all sorts of tunnels...
2425 
2426 	   Really, it is unlikely that netif_tx_lock protection is necessary
2427 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2428 	   counters.)
2429 	   However, it is possible that they rely on the protection
2430 	   we provide here.
2431 
2432 	   Check this and take the lock; it is not prone to deadlocks.
2433 	   Or give them a noqueue qdisc, which is even simpler 8)
2434 	 */
2435 	if (dev->flags & IFF_UP) {
2436 		int cpu = smp_processor_id(); /* ok because BHs are off */
2437 
2438 		if (txq->xmit_lock_owner != cpu) {
2439 
2440 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2441 				goto recursion_alert;
2442 
2443 			HARD_TX_LOCK(dev, txq, cpu);
2444 
2445 			if (!netif_tx_queue_stopped(txq)) {
2446 				__this_cpu_inc(xmit_recursion);
2447 				rc = dev_hard_start_xmit(skb, dev, txq);
2448 				__this_cpu_dec(xmit_recursion);
2449 				if (dev_xmit_complete(rc)) {
2450 					HARD_TX_UNLOCK(dev, txq);
2451 					goto out;
2452 				}
2453 			}
2454 			HARD_TX_UNLOCK(dev, txq);
2455 			if (net_ratelimit())
2456 				printk(KERN_CRIT "Virtual device %s asks to "
2457 				       "queue packet!\n", dev->name);
2458 		} else {
2459 			/* Recursion is detected! It is possible,
2460 			 * unfortunately
2461 			 */
2462 recursion_alert:
2463 			if (net_ratelimit())
2464 				printk(KERN_CRIT "Dead loop on virtual device "
2465 				       "%s, fix it urgently!\n", dev->name);
2466 		}
2467 	}
2468 
2469 	rc = -ENETDOWN;
2470 	rcu_read_unlock_bh();
2471 
2472 	kfree_skb(skb);
2473 	return rc;
2474 out:
2475 	rcu_read_unlock_bh();
2476 	return rc;
2477 }
2478 EXPORT_SYMBOL(dev_queue_xmit);
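
/*
 * Illustrative only: a module transmitting a frame it built itself.  The
 * skb construction is elided; the point is that dev_queue_xmit() consumes
 * the skb whatever the return value is.
 *
 *	skb->dev = dev;
 *	if (dev_queue_xmit(skb) != NET_XMIT_SUCCESS)
 *		... the frame was dropped; the skb is already freed ...
 */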
2479 
2480 
2481 /*=======================================================================
2482 			Receiver routines
2483   =======================================================================*/
2484 
2485 int netdev_max_backlog __read_mostly = 1000;
2486 int netdev_tstamp_prequeue __read_mostly = 1;
2487 int netdev_budget __read_mostly = 300;
2488 int weight_p __read_mostly = 64;            /* old backlog weight */
2489 
2490 /* Called with irq disabled */
2491 static inline void ____napi_schedule(struct softnet_data *sd,
2492 				     struct napi_struct *napi)
2493 {
2494 	list_add_tail(&napi->poll_list, &sd->poll_list);
2495 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2496 }
2497 
2498 /*
2499  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2500  * and src/dst port numbers. Returns a non-zero hash number on success
2501  * and 0 on failure.
2502  */
2503 __u32 __skb_get_rxhash(struct sk_buff *skb)
2504 {
2505 	int nhoff, hash = 0, poff;
2506 	const struct ipv6hdr *ip6;
2507 	const struct iphdr *ip;
2508 	u8 ip_proto;
2509 	u32 addr1, addr2, ihl;
2510 	union {
2511 		u32 v32;
2512 		u16 v16[2];
2513 	} ports;
2514 
2515 	nhoff = skb_network_offset(skb);
2516 
2517 	switch (skb->protocol) {
2518 	case __constant_htons(ETH_P_IP):
2519 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2520 			goto done;
2521 
2522 		ip = (const struct iphdr *) (skb->data + nhoff);
2523 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2524 			ip_proto = 0;
2525 		else
2526 			ip_proto = ip->protocol;
2527 		addr1 = (__force u32) ip->saddr;
2528 		addr2 = (__force u32) ip->daddr;
2529 		ihl = ip->ihl;
2530 		break;
2531 	case __constant_htons(ETH_P_IPV6):
2532 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2533 			goto done;
2534 
2535 		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2536 		ip_proto = ip6->nexthdr;
2537 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2538 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2539 		ihl = (40 >> 2);
2540 		break;
2541 	default:
2542 		goto done;
2543 	}
2544 
2545 	ports.v32 = 0;
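	/*
	 * Pack both 16-bit ports into one 32-bit word; sorting the pair,
	 * like the addresses below, keeps the hash direction-independent.
	 */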
2546 	poff = proto_ports_offset(ip_proto);
2547 	if (poff >= 0) {
2548 		nhoff += ihl * 4 + poff;
2549 		if (pskb_may_pull(skb, nhoff + 4)) {
2550 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2551 			if (ports.v16[1] < ports.v16[0])
2552 				swap(ports.v16[0], ports.v16[1]);
2553 		}
2554 	}
2555 
2556 	/* get a consistent hash (same value on both flow directions) */
2557 	if (addr2 < addr1)
2558 		swap(addr1, addr2);
2559 
2560 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2561 	if (!hash)
2562 		hash = 1;
2563 
2564 done:
2565 	return hash;
2566 }
2567 EXPORT_SYMBOL(__skb_get_rxhash);
2568 
2569 #ifdef CONFIG_RPS
2570 
2571 /* One global table that all flow-based protocols share. */
2572 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2573 EXPORT_SYMBOL(rps_sock_flow_table);
2574 
2575 static struct rps_dev_flow *
2576 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2577 	    struct rps_dev_flow *rflow, u16 next_cpu)
2578 {
2579 	u16 tcpu;
2580 
2581 	tcpu = rflow->cpu = next_cpu;
2582 	if (tcpu != RPS_NO_CPU) {
2583 #ifdef CONFIG_RFS_ACCEL
2584 		struct netdev_rx_queue *rxqueue;
2585 		struct rps_dev_flow_table *flow_table;
2586 		struct rps_dev_flow *old_rflow;
2587 		u32 flow_id;
2588 		u16 rxq_index;
2589 		int rc;
2590 
2591 		/* Should we steer this flow to a different hardware queue? */
2592 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2593 		    !(dev->features & NETIF_F_NTUPLE))
2594 			goto out;
2595 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2596 		if (rxq_index == skb_get_rx_queue(skb))
2597 			goto out;
2598 
2599 		rxqueue = dev->_rx + rxq_index;
2600 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2601 		if (!flow_table)
2602 			goto out;
2603 		flow_id = skb->rxhash & flow_table->mask;
2604 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2605 							rxq_index, flow_id);
2606 		if (rc < 0)
2607 			goto out;
2608 		old_rflow = rflow;
2609 		rflow = &flow_table->flows[flow_id];
2610 		rflow->cpu = next_cpu;
2611 		rflow->filter = rc;
2612 		if (old_rflow->filter == rflow->filter)
2613 			old_rflow->filter = RPS_NO_FILTER;
2614 	out:
2615 #endif
2616 		rflow->last_qtail =
2617 			per_cpu(softnet_data, tcpu).input_queue_head;
2618 	}
2619 
2620 	return rflow;
2621 }
2622 
2623 /*
2624  * get_rps_cpu is called from netif_receive_skb and returns the target
2625  * CPU from the RPS map of the receiving queue for a given skb.
2626  * rcu_read_lock must be held on entry.
2627  */
2628 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2629 		       struct rps_dev_flow **rflowp)
2630 {
2631 	struct netdev_rx_queue *rxqueue;
2632 	struct rps_map *map;
2633 	struct rps_dev_flow_table *flow_table;
2634 	struct rps_sock_flow_table *sock_flow_table;
2635 	int cpu = -1;
2636 	u16 tcpu;
2637 
2638 	if (skb_rx_queue_recorded(skb)) {
2639 		u16 index = skb_get_rx_queue(skb);
2640 		if (unlikely(index >= dev->real_num_rx_queues)) {
2641 			WARN_ONCE(dev->real_num_rx_queues > 1,
2642 				  "%s received packet on queue %u, but number "
2643 				  "of RX queues is %u\n",
2644 				  dev->name, index, dev->real_num_rx_queues);
2645 			goto done;
2646 		}
2647 		rxqueue = dev->_rx + index;
2648 	} else
2649 		rxqueue = dev->_rx;
2650 
2651 	map = rcu_dereference(rxqueue->rps_map);
2652 	if (map) {
2653 		if (map->len == 1 &&
2654 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2655 			tcpu = map->cpus[0];
2656 			if (cpu_online(tcpu))
2657 				cpu = tcpu;
2658 			goto done;
2659 		}
2660 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2661 		goto done;
2662 	}
2663 
2664 	skb_reset_network_header(skb);
2665 	if (!skb_get_rxhash(skb))
2666 		goto done;
2667 
2668 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2670 	if (flow_table && sock_flow_table) {
2671 		u16 next_cpu;
2672 		struct rps_dev_flow *rflow;
2673 
2674 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2675 		tcpu = rflow->cpu;
2676 
2677 		next_cpu = sock_flow_table->ents[skb->rxhash &
2678 		    sock_flow_table->mask];
2679 
2680 		/*
2681 		 * If the desired CPU (where last recvmsg was done) is
2682 		 * different from current CPU (one in the rx-queue flow
2683 		 * table entry), switch if one of the following holds:
2684 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2685 		 *   - Current CPU is offline.
2686 		 *   - The current CPU's queue tail has advanced beyond the
2687 		 *     last packet that was enqueued using this table entry.
2688 		 *     This guarantees that all previous packets for the flow
2689 		 *     have been dequeued, thus preserving in order delivery.
2690 		 */
2691 		if (unlikely(tcpu != next_cpu) &&
2692 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2693 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2694 		      rflow->last_qtail)) >= 0))
2695 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2696 
2697 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2698 			*rflowp = rflow;
2699 			cpu = tcpu;
2700 			goto done;
2701 		}
2702 	}
2703 
2704 	if (map) {
2705 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2706 
2707 		if (cpu_online(tcpu)) {
2708 			cpu = tcpu;
2709 			goto done;
2710 		}
2711 	}
2712 
2713 done:
2714 	return cpu;
2715 }
2716 
2717 #ifdef CONFIG_RFS_ACCEL
2718 
2719 /**
2720  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2721  * @dev: Device on which the filter was set
2722  * @rxq_index: RX queue index
2723  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2724  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2725  *
2726  * Drivers that implement ndo_rx_flow_steer() should periodically call
2727  * this function for each installed filter and remove the filters for
2728  * which it returns %true.
2729  */
2730 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2731 			 u32 flow_id, u16 filter_id)
2732 {
2733 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2734 	struct rps_dev_flow_table *flow_table;
2735 	struct rps_dev_flow *rflow;
2736 	bool expire = true;
2737 	int cpu;
2738 
2739 	rcu_read_lock();
2740 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2741 	if (flow_table && flow_id <= flow_table->mask) {
2742 		rflow = &flow_table->flows[flow_id];
2743 		cpu = ACCESS_ONCE(rflow->cpu);
2744 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2745 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2746 			   rflow->last_qtail) <
2747 		     (int)(10 * flow_table->mask)))
2748 			expire = false;
2749 	}
2750 	rcu_read_unlock();
2751 	return expire;
2752 }
2753 EXPORT_SYMBOL(rps_may_expire_flow);
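
/*
 * Illustrative only: a driver with accelerated RFS support would scan its
 * installed filters periodically and remove the stale ones.  The foo_*
 * names and the filter bookkeeping are hypothetical.
 *
 *	for (i = 0; i < foo->n_filters; i++)
 *		if (rps_may_expire_flow(foo->netdev, foo->filters[i].rxq_index,
 *					foo->filters[i].flow_id, i))
 *			foo_remove_filter(foo, i);
 */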
2754 
2755 #endif /* CONFIG_RFS_ACCEL */
2756 
2757 /* Called from hardirq (IPI) context */
2758 static void rps_trigger_softirq(void *data)
2759 {
2760 	struct softnet_data *sd = data;
2761 
2762 	____napi_schedule(sd, &sd->backlog);
2763 	sd->received_rps++;
2764 }
2765 
2766 #endif /* CONFIG_RPS */
2767 
2768 /*
2769  * Check if this softnet_data structure belongs to another CPU.
2770  * If yes, queue it to our IPI list and return 1.
2771  * If no, return 0.
2772  */
2773 static int rps_ipi_queued(struct softnet_data *sd)
2774 {
2775 #ifdef CONFIG_RPS
2776 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2777 
2778 	if (sd != mysd) {
2779 		sd->rps_ipi_next = mysd->rps_ipi_list;
2780 		mysd->rps_ipi_list = sd;
2781 
2782 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2783 		return 1;
2784 	}
2785 #endif /* CONFIG_RPS */
2786 	return 0;
2787 }
2788 
2789 /*
2790  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2791  * queue (may be a remote CPU queue).
2792  */
2793 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2794 			      unsigned int *qtail)
2795 {
2796 	struct softnet_data *sd;
2797 	unsigned long flags;
2798 
2799 	sd = &per_cpu(softnet_data, cpu);
2800 
2801 	local_irq_save(flags);
2802 
2803 	rps_lock(sd);
2804 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2805 		if (skb_queue_len(&sd->input_pkt_queue)) {
2806 enqueue:
2807 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2808 			input_queue_tail_incr_save(sd, qtail);
2809 			rps_unlock(sd);
2810 			local_irq_restore(flags);
2811 			return NET_RX_SUCCESS;
2812 		}
2813 
2814 		/* Schedule NAPI for the backlog device.
2815 		 * We can use a non-atomic operation since we own the queue lock.
2816 		 */
2817 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2818 			if (!rps_ipi_queued(sd))
2819 				____napi_schedule(sd, &sd->backlog);
2820 		}
2821 		goto enqueue;
2822 	}
2823 
2824 	sd->dropped++;
2825 	rps_unlock(sd);
2826 
2827 	local_irq_restore(flags);
2828 
2829 	atomic_long_inc(&skb->dev->rx_dropped);
2830 	kfree_skb(skb);
2831 	return NET_RX_DROP;
2832 }
2833 
2834 /**
2835  *	netif_rx	-	post buffer to the network code
2836  *	@skb: buffer to post
2837  *
2838  *	This function receives a packet from a device driver and queues it for
2839  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2840  *	may be dropped during processing for congestion control or by the
2841  *	protocol layers.
2842  *
2843  *	return values:
2844  *	NET_RX_SUCCESS	(no congestion)
2845  *	NET_RX_DROP     (packet was dropped)
2846  *
2847  */
2848 
2849 int netif_rx(struct sk_buff *skb)
2850 {
2851 	int ret;
2852 
2853 	/* if netpoll wants it, pretend we never saw it */
2854 	if (netpoll_rx(skb))
2855 		return NET_RX_DROP;
2856 
2857 	if (netdev_tstamp_prequeue)
2858 		net_timestamp_check(skb);
2859 
2860 	trace_netif_rx(skb);
2861 #ifdef CONFIG_RPS
2862 	{
2863 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2864 		int cpu;
2865 
2866 		preempt_disable();
2867 		rcu_read_lock();
2868 
2869 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2870 		if (cpu < 0)
2871 			cpu = smp_processor_id();
2872 
2873 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2874 
2875 		rcu_read_unlock();
2876 		preempt_enable();
2877 	}
2878 #else
2879 	{
2880 		unsigned int qtail;
2881 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2882 		put_cpu();
2883 	}
2884 #endif
2885 	return ret;
2886 }
2887 EXPORT_SYMBOL(netif_rx);
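
/*
 * Illustrative only: the receive path of a hypothetical non-NAPI driver,
 * running in its interrupt handler.
 *
 *	skb_put(skb, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */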
2888 
2889 int netif_rx_ni(struct sk_buff *skb)
2890 {
2891 	int err;
2892 
2893 	preempt_disable();
2894 	err = netif_rx(skb);
2895 	if (local_softirq_pending())
2896 		do_softirq();
2897 	preempt_enable();
2898 
2899 	return err;
2900 }
2901 EXPORT_SYMBOL(netif_rx_ni);
2902 
2903 static void net_tx_action(struct softirq_action *h)
2904 {
2905 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2906 
2907 	if (sd->completion_queue) {
2908 		struct sk_buff *clist;
2909 
2910 		local_irq_disable();
2911 		clist = sd->completion_queue;
2912 		sd->completion_queue = NULL;
2913 		local_irq_enable();
2914 
2915 		while (clist) {
2916 			struct sk_buff *skb = clist;
2917 			clist = clist->next;
2918 
2919 			WARN_ON(atomic_read(&skb->users));
2920 			trace_kfree_skb(skb, net_tx_action);
2921 			__kfree_skb(skb);
2922 		}
2923 	}
2924 
2925 	if (sd->output_queue) {
2926 		struct Qdisc *head;
2927 
2928 		local_irq_disable();
2929 		head = sd->output_queue;
2930 		sd->output_queue = NULL;
2931 		sd->output_queue_tailp = &sd->output_queue;
2932 		local_irq_enable();
2933 
2934 		while (head) {
2935 			struct Qdisc *q = head;
2936 			spinlock_t *root_lock;
2937 
2938 			head = head->next_sched;
2939 
2940 			root_lock = qdisc_lock(q);
2941 			if (spin_trylock(root_lock)) {
2942 				smp_mb__before_clear_bit();
2943 				clear_bit(__QDISC_STATE_SCHED,
2944 					  &q->state);
2945 				qdisc_run(q);
2946 				spin_unlock(root_lock);
2947 			} else {
2948 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2949 					      &q->state)) {
2950 					__netif_reschedule(q);
2951 				} else {
2952 					smp_mb__before_clear_bit();
2953 					clear_bit(__QDISC_STATE_SCHED,
2954 						  &q->state);
2955 				}
2956 			}
2957 		}
2958 	}
2959 }
2960 
2961 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2962     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2963 /* This hook is defined here for ATM LANE */
2964 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2965 			     unsigned char *addr) __read_mostly;
2966 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2967 #endif
2968 
2969 #ifdef CONFIG_NET_CLS_ACT
2970 /* TODO: Maybe we should just force sch_ingress to be compiled in
2971  * when CONFIG_NET_CLS_ACT is?  Otherwise we currently execute a few
2972  * useless instructions (a compare and two extra stores) when it is
2973  * not enabled but CONFIG_NET_CLS_ACT is.
2974  * NOTE: This doesn't stop any functionality; if you don't have
2975  * the ingress scheduler, you just can't add policies on ingress.
2976  *
2977  */
2978 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2979 {
2980 	struct net_device *dev = skb->dev;
2981 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2982 	int result = TC_ACT_OK;
2983 	struct Qdisc *q;
2984 
2985 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2986 		if (net_ratelimit())
2987 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2988 			       skb->skb_iif, dev->ifindex);
2989 		return TC_ACT_SHOT;
2990 	}
2991 
2992 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2993 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2994 
2995 	q = rxq->qdisc;
2996 	if (q != &noop_qdisc) {
2997 		spin_lock(qdisc_lock(q));
2998 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2999 			result = qdisc_enqueue_root(skb, q);
3000 		spin_unlock(qdisc_lock(q));
3001 	}
3002 
3003 	return result;
3004 }
3005 
3006 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3007 					 struct packet_type **pt_prev,
3008 					 int *ret, struct net_device *orig_dev)
3009 {
3010 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3011 
3012 	if (!rxq || rxq->qdisc == &noop_qdisc)
3013 		goto out;
3014 
3015 	if (*pt_prev) {
3016 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3017 		*pt_prev = NULL;
3018 	}
3019 
3020 	switch (ing_filter(skb, rxq)) {
3021 	case TC_ACT_SHOT:
3022 	case TC_ACT_STOLEN:
3023 		kfree_skb(skb);
3024 		return NULL;
3025 	}
3026 
3027 out:
3028 	skb->tc_verd = 0;
3029 	return skb;
3030 }
3031 #endif
3032 
3033 /**
3034  *	netdev_rx_handler_register - register receive handler
3035  *	@dev: device to register a handler for
3036  *	@rx_handler: receive handler to register
3037  *	@rx_handler_data: data pointer that is used by rx handler
3038  *
3039  *	Register a receive handler for a device. This handler will then be
3040  *	called from __netif_receive_skb. A negative errno code is returned
3041  *	on a failure.
3042  *
3043  *	The caller must hold the rtnl_mutex.
3044  *
3045  *	For a general description of rx_handler, see enum rx_handler_result.
3046  */
3047 int netdev_rx_handler_register(struct net_device *dev,
3048 			       rx_handler_func_t *rx_handler,
3049 			       void *rx_handler_data)
3050 {
3051 	ASSERT_RTNL();
3052 
3053 	if (dev->rx_handler)
3054 		return -EBUSY;
3055 
3056 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3057 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3058 
3059 	return 0;
3060 }
3061 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
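
/*
 * Illustrative only: how a master device (in the style of the bridge or
 * macvlan) would claim frames arriving on a slave port.  foo_handle_frame()
 * and port are hypothetical; the rtnl lock is assumed to be held already.
 *
 *	err = netdev_rx_handler_register(slave_dev, foo_handle_frame, port);
 *	if (err)
 *		goto err_unwind;
 */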
3062 
3063 /**
3064  *	netdev_rx_handler_unregister - unregister receive handler
3065  *	@dev: device to unregister a handler from
3066  *
3067  *	Unregister a receive handler from a device.
3068  *
3069  *	The caller must hold the rtnl_mutex.
3070  */
3071 void netdev_rx_handler_unregister(struct net_device *dev)
3072 {
3073 
3074 	ASSERT_RTNL();
3075 	rcu_assign_pointer(dev->rx_handler, NULL);
3076 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3077 }
3078 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3079 
3080 static int __netif_receive_skb(struct sk_buff *skb)
3081 {
3082 	struct packet_type *ptype, *pt_prev;
3083 	rx_handler_func_t *rx_handler;
3084 	struct net_device *orig_dev;
3085 	struct net_device *null_or_dev;
3086 	bool deliver_exact = false;
3087 	int ret = NET_RX_DROP;
3088 	__be16 type;
3089 
3090 	if (!netdev_tstamp_prequeue)
3091 		net_timestamp_check(skb);
3092 
3093 	trace_netif_receive_skb(skb);
3094 
3095 	/* if we've gotten here through NAPI, check netpoll */
3096 	if (netpoll_receive_skb(skb))
3097 		return NET_RX_DROP;
3098 
3099 	if (!skb->skb_iif)
3100 		skb->skb_iif = skb->dev->ifindex;
3101 	orig_dev = skb->dev;
3102 
3103 	skb_reset_network_header(skb);
3104 	skb_reset_transport_header(skb);
3105 	skb->mac_len = skb->network_header - skb->mac_header;
3106 
3107 	pt_prev = NULL;
3108 
3109 	rcu_read_lock();
3110 
3111 another_round:
3112 
3113 	__this_cpu_inc(softnet_data.processed);
3114 
3115 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3116 		skb = vlan_untag(skb);
3117 		if (unlikely(!skb))
3118 			goto out;
3119 	}
3120 
3121 #ifdef CONFIG_NET_CLS_ACT
3122 	if (skb->tc_verd & TC_NCLS) {
3123 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3124 		goto ncls;
3125 	}
3126 #endif
3127 
3128 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3129 		if (!ptype->dev || ptype->dev == skb->dev) {
3130 			if (pt_prev)
3131 				ret = deliver_skb(skb, pt_prev, orig_dev);
3132 			pt_prev = ptype;
3133 		}
3134 	}
3135 
3136 #ifdef CONFIG_NET_CLS_ACT
3137 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3138 	if (!skb)
3139 		goto out;
3140 ncls:
3141 #endif
3142 
3143 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3144 	if (rx_handler) {
3145 		if (pt_prev) {
3146 			ret = deliver_skb(skb, pt_prev, orig_dev);
3147 			pt_prev = NULL;
3148 		}
3149 		switch (rx_handler(&skb)) {
3150 		case RX_HANDLER_CONSUMED:
3151 			goto out;
3152 		case RX_HANDLER_ANOTHER:
3153 			goto another_round;
3154 		case RX_HANDLER_EXACT:
3155 			deliver_exact = true;
3156 		case RX_HANDLER_PASS:
3157 			break;
3158 		default:
3159 			BUG();
3160 		}
3161 	}
3162 
3163 	if (vlan_tx_tag_present(skb)) {
3164 		if (pt_prev) {
3165 			ret = deliver_skb(skb, pt_prev, orig_dev);
3166 			pt_prev = NULL;
3167 		}
3168 		if (vlan_do_receive(&skb)) {
3169 			ret = __netif_receive_skb(skb);
3170 			goto out;
3171 		} else if (unlikely(!skb))
3172 			goto out;
3173 	}
3174 
3175 	/* deliver only exact match when indicated */
3176 	null_or_dev = deliver_exact ? skb->dev : NULL;
3177 
3178 	type = skb->protocol;
3179 	list_for_each_entry_rcu(ptype,
3180 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3181 		if (ptype->type == type &&
3182 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3183 		     ptype->dev == orig_dev)) {
3184 			if (pt_prev)
3185 				ret = deliver_skb(skb, pt_prev, orig_dev);
3186 			pt_prev = ptype;
3187 		}
3188 	}
3189 
3190 	if (pt_prev) {
3191 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3192 	} else {
3193 		atomic_long_inc(&skb->dev->rx_dropped);
3194 		kfree_skb(skb);
3195 		/* Jamal, now you will not be able to escape explaining
3196 		 * to me how you were going to use this. :-)
3197 		 */
3198 		ret = NET_RX_DROP;
3199 	}
3200 
3201 out:
3202 	rcu_read_unlock();
3203 	return ret;
3204 }
3205 
3206 /**
3207  *	netif_receive_skb - process receive buffer from network
3208  *	@skb: buffer to process
3209  *
3210  *	netif_receive_skb() is the main receive data processing function.
3211  *	It always succeeds. The buffer may be dropped during processing
3212  *	for congestion control or by the protocol layers.
3213  *
3214  *	This function may only be called from softirq context and interrupts
3215  *	should be enabled.
3216  *
3217  *	Return values (usually ignored):
3218  *	NET_RX_SUCCESS: no congestion
3219  *	NET_RX_DROP: packet was dropped
3220  */
3221 int netif_receive_skb(struct sk_buff *skb)
3222 {
3223 	if (netdev_tstamp_prequeue)
3224 		net_timestamp_check(skb);
3225 
3226 	if (skb_defer_rx_timestamp(skb))
3227 		return NET_RX_SUCCESS;
3228 
3229 #ifdef CONFIG_RPS
3230 	{
3231 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3232 		int cpu, ret;
3233 
3234 		rcu_read_lock();
3235 
3236 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3237 
3238 		if (cpu >= 0) {
3239 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3240 			rcu_read_unlock();
3241 		} else {
3242 			rcu_read_unlock();
3243 			ret = __netif_receive_skb(skb);
3244 		}
3245 
3246 		return ret;
3247 	}
3248 #else
3249 	return __netif_receive_skb(skb);
3250 #endif
3251 }
3252 EXPORT_SYMBOL(netif_receive_skb);
3253 
3254 /* Network device is going away; flush any packets still pending.
3255  * Called with irqs disabled.
3256  */
3257 static void flush_backlog(void *arg)
3258 {
3259 	struct net_device *dev = arg;
3260 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3261 	struct sk_buff *skb, *tmp;
3262 
3263 	rps_lock(sd);
3264 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3265 		if (skb->dev == dev) {
3266 			__skb_unlink(skb, &sd->input_pkt_queue);
3267 			kfree_skb(skb);
3268 			input_queue_head_incr(sd);
3269 		}
3270 	}
3271 	rps_unlock(sd);
3272 
3273 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3274 		if (skb->dev == dev) {
3275 			__skb_unlink(skb, &sd->process_queue);
3276 			kfree_skb(skb);
3277 			input_queue_head_incr(sd);
3278 		}
3279 	}
3280 }
3281 
3282 static int napi_gro_complete(struct sk_buff *skb)
3283 {
3284 	struct packet_type *ptype;
3285 	__be16 type = skb->protocol;
3286 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3287 	int err = -ENOENT;
3288 
3289 	if (NAPI_GRO_CB(skb)->count == 1) {
3290 		skb_shinfo(skb)->gso_size = 0;
3291 		goto out;
3292 	}
3293 
3294 	rcu_read_lock();
3295 	list_for_each_entry_rcu(ptype, head, list) {
3296 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3297 			continue;
3298 
3299 		err = ptype->gro_complete(skb);
3300 		break;
3301 	}
3302 	rcu_read_unlock();
3303 
3304 	if (err) {
3305 		WARN_ON(&ptype->list == head);
3306 		kfree_skb(skb);
3307 		return NET_RX_SUCCESS;
3308 	}
3309 
3310 out:
3311 	return netif_receive_skb(skb);
3312 }
3313 
3314 inline void napi_gro_flush(struct napi_struct *napi)
3315 {
3316 	struct sk_buff *skb, *next;
3317 
3318 	for (skb = napi->gro_list; skb; skb = next) {
3319 		next = skb->next;
3320 		skb->next = NULL;
3321 		napi_gro_complete(skb);
3322 	}
3323 
3324 	napi->gro_count = 0;
3325 	napi->gro_list = NULL;
3326 }
3327 EXPORT_SYMBOL(napi_gro_flush);
3328 
3329 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3330 {
3331 	struct sk_buff **pp = NULL;
3332 	struct packet_type *ptype;
3333 	__be16 type = skb->protocol;
3334 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3335 	int same_flow;
3336 	int mac_len;
3337 	enum gro_result ret;
3338 
3339 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3340 		goto normal;
3341 
3342 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3343 		goto normal;
3344 
3345 	rcu_read_lock();
3346 	list_for_each_entry_rcu(ptype, head, list) {
3347 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3348 			continue;
3349 
3350 		skb_set_network_header(skb, skb_gro_offset(skb));
3351 		mac_len = skb->network_header - skb->mac_header;
3352 		skb->mac_len = mac_len;
3353 		NAPI_GRO_CB(skb)->same_flow = 0;
3354 		NAPI_GRO_CB(skb)->flush = 0;
3355 		NAPI_GRO_CB(skb)->free = 0;
3356 
3357 		pp = ptype->gro_receive(&napi->gro_list, skb);
3358 		break;
3359 	}
3360 	rcu_read_unlock();
3361 
3362 	if (&ptype->list == head)
3363 		goto normal;
3364 
3365 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3366 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3367 
3368 	if (pp) {
3369 		struct sk_buff *nskb = *pp;
3370 
3371 		*pp = nskb->next;
3372 		nskb->next = NULL;
3373 		napi_gro_complete(nskb);
3374 		napi->gro_count--;
3375 	}
3376 
3377 	if (same_flow)
3378 		goto ok;
3379 
3380 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3381 		goto normal;
3382 
3383 	napi->gro_count++;
3384 	NAPI_GRO_CB(skb)->count = 1;
3385 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3386 	skb->next = napi->gro_list;
3387 	napi->gro_list = skb;
3388 	ret = GRO_HELD;
3389 
3390 pull:
3391 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3392 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3393 
3394 		BUG_ON(skb->end - skb->tail < grow);
3395 
3396 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3397 
3398 		skb->tail += grow;
3399 		skb->data_len -= grow;
3400 
3401 		skb_shinfo(skb)->frags[0].page_offset += grow;
3402 		skb_shinfo(skb)->frags[0].size -= grow;
3403 
3404 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3405 			put_page(skb_shinfo(skb)->frags[0].page);
3406 			memmove(skb_shinfo(skb)->frags,
3407 				skb_shinfo(skb)->frags + 1,
3408 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3409 		}
3410 	}
3411 
3412 ok:
3413 	return ret;
3414 
3415 normal:
3416 	ret = GRO_NORMAL;
3417 	goto pull;
3418 }
3419 EXPORT_SYMBOL(dev_gro_receive);
3420 
3421 static inline gro_result_t
3422 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3423 {
3424 	struct sk_buff *p;
3425 
3426 	for (p = napi->gro_list; p; p = p->next) {
3427 		unsigned long diffs;
3428 
3429 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3430 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3431 		diffs |= compare_ether_header(skb_mac_header(p),
3432 					      skb_gro_mac_header(skb));
3433 		NAPI_GRO_CB(p)->same_flow = !diffs;
3434 		NAPI_GRO_CB(p)->flush = 0;
3435 	}
3436 
3437 	return dev_gro_receive(napi, skb);
3438 }
3439 
3440 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3441 {
3442 	switch (ret) {
3443 	case GRO_NORMAL:
3444 		if (netif_receive_skb(skb))
3445 			ret = GRO_DROP;
3446 		break;
3447 
3448 	case GRO_DROP:
3449 	case GRO_MERGED_FREE:
3450 		kfree_skb(skb);
3451 		break;
3452 
3453 	case GRO_HELD:
3454 	case GRO_MERGED:
3455 		break;
3456 	}
3457 
3458 	return ret;
3459 }
3460 EXPORT_SYMBOL(napi_skb_finish);
3461 
3462 void skb_gro_reset_offset(struct sk_buff *skb)
3463 {
3464 	NAPI_GRO_CB(skb)->data_offset = 0;
3465 	NAPI_GRO_CB(skb)->frag0 = NULL;
3466 	NAPI_GRO_CB(skb)->frag0_len = 0;
3467 
3468 	if (skb->mac_header == skb->tail &&
3469 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3470 		NAPI_GRO_CB(skb)->frag0 =
3471 			page_address(skb_shinfo(skb)->frags[0].page) +
3472 			skb_shinfo(skb)->frags[0].page_offset;
3473 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3474 	}
3475 }
3476 EXPORT_SYMBOL(skb_gro_reset_offset);
3477 
3478 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3479 {
3480 	skb_gro_reset_offset(skb);
3481 
3482 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3483 }
3484 EXPORT_SYMBOL(napi_gro_receive);
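
/*
 * Illustrative only: the receive half of a hypothetical NAPI poll callback
 * feeding completed frames through GRO.  foo_next_rx_skb() stands in for
 * the driver's descriptor-ring processing.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = 0;
 *
 *		while (done < budget) {
 *			struct sk_buff *skb = foo_next_rx_skb(napi);
 *
 *			if (!skb)
 *				break;
 *			napi_gro_receive(napi, skb);
 *			done++;
 *		}
 *		if (done < budget)
 *			napi_complete(napi);
 *		return done;
 *	}
 */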
3485 
3486 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3487 {
3488 	__skb_pull(skb, skb_headlen(skb));
3489 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3490 	skb->vlan_tci = 0;
3491 	skb->dev = napi->dev;
3492 	skb->skb_iif = 0;
3493 
3494 	napi->skb = skb;
3495 }
3496 
3497 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3498 {
3499 	struct sk_buff *skb = napi->skb;
3500 
3501 	if (!skb) {
3502 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3503 		if (skb)
3504 			napi->skb = skb;
3505 	}
3506 	return skb;
3507 }
3508 EXPORT_SYMBOL(napi_get_frags);
3509 
3510 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3511 			       gro_result_t ret)
3512 {
3513 	switch (ret) {
3514 	case GRO_NORMAL:
3515 	case GRO_HELD:
3516 		skb->protocol = eth_type_trans(skb, skb->dev);
3517 
3518 		if (ret == GRO_HELD)
3519 			skb_gro_pull(skb, -ETH_HLEN);
3520 		else if (netif_receive_skb(skb))
3521 			ret = GRO_DROP;
3522 		break;
3523 
3524 	case GRO_DROP:
3525 	case GRO_MERGED_FREE:
3526 		napi_reuse_skb(napi, skb);
3527 		break;
3528 
3529 	case GRO_MERGED:
3530 		break;
3531 	}
3532 
3533 	return ret;
3534 }
3535 EXPORT_SYMBOL(napi_frags_finish);
3536 
3537 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3538 {
3539 	struct sk_buff *skb = napi->skb;
3540 	struct ethhdr *eth;
3541 	unsigned int hlen;
3542 	unsigned int off;
3543 
3544 	napi->skb = NULL;
3545 
3546 	skb_reset_mac_header(skb);
3547 	skb_gro_reset_offset(skb);
3548 
3549 	off = skb_gro_offset(skb);
3550 	hlen = off + sizeof(*eth);
3551 	eth = skb_gro_header_fast(skb, off);
3552 	if (skb_gro_header_hard(skb, hlen)) {
3553 		eth = skb_gro_header_slow(skb, hlen, off);
3554 		if (unlikely(!eth)) {
3555 			napi_reuse_skb(napi, skb);
3556 			skb = NULL;
3557 			goto out;
3558 		}
3559 	}
3560 
3561 	skb_gro_pull(skb, sizeof(*eth));
3562 
3563 	/*
3564 	 * This works because the only protocols we care about don't require
3565 	 * special handling.  We'll fix it up properly at the end.
3566 	 */
3567 	skb->protocol = eth->h_proto;
3568 
3569 out:
3570 	return skb;
3571 }
3572 EXPORT_SYMBOL(napi_frags_skb);
3573 
3574 gro_result_t napi_gro_frags(struct napi_struct *napi)
3575 {
3576 	struct sk_buff *skb = napi_frags_skb(napi);
3577 
3578 	if (!skb)
3579 		return GRO_DROP;
3580 
3581 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3582 }
3583 EXPORT_SYMBOL(napi_gro_frags);
3584 
3585 /*
3586  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3587  * Note: called with local irq disabled, but exits with local irq enabled.
3588  */
3589 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3590 {
3591 #ifdef CONFIG_RPS
3592 	struct softnet_data *remsd = sd->rps_ipi_list;
3593 
3594 	if (remsd) {
3595 		sd->rps_ipi_list = NULL;
3596 
3597 		local_irq_enable();
3598 
3599 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3600 		while (remsd) {
3601 			struct softnet_data *next = remsd->rps_ipi_next;
3602 
3603 			if (cpu_online(remsd->cpu))
3604 				__smp_call_function_single(remsd->cpu,
3605 							   &remsd->csd, 0);
3606 			remsd = next;
3607 		}
3608 	} else
3609 #endif
3610 		local_irq_enable();
3611 }
3612 
3613 static int process_backlog(struct napi_struct *napi, int quota)
3614 {
3615 	int work = 0;
3616 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3617 
3618 #ifdef CONFIG_RPS
3619 	/* Check if we have pending IPIs; it's better to send them now
3620 	 * than to wait for net_rx_action() to end.
3621 	 */
3622 	if (sd->rps_ipi_list) {
3623 		local_irq_disable();
3624 		net_rps_action_and_irq_enable(sd);
3625 	}
3626 #endif
3627 	napi->weight = weight_p;
3628 	local_irq_disable();
3629 	while (work < quota) {
3630 		struct sk_buff *skb;
3631 		unsigned int qlen;
3632 
3633 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3634 			local_irq_enable();
3635 			__netif_receive_skb(skb);
3636 			local_irq_disable();
3637 			input_queue_head_incr(sd);
3638 			if (++work >= quota) {
3639 				local_irq_enable();
3640 				return work;
3641 			}
3642 		}
3643 
3644 		rps_lock(sd);
3645 		qlen = skb_queue_len(&sd->input_pkt_queue);
3646 		if (qlen)
3647 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3648 						   &sd->process_queue);
3649 
3650 		if (qlen < quota - work) {
3651 			/*
3652 			 * Inline a custom version of __napi_complete().
3653 			 * Only the current CPU owns and manipulates this napi,
3654 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3655 			 * so we can use a plain write instead of clear_bit(),
3656 			 * and we don't need an smp_mb() memory barrier.
3657 			 */
3658 			list_del(&napi->poll_list);
3659 			napi->state = 0;
3660 
3661 			quota = work + qlen;
3662 		}
3663 		rps_unlock(sd);
3664 	}
3665 	local_irq_enable();
3666 
3667 	return work;
3668 }
3669 
3670 /**
3671  * __napi_schedule - schedule for receive
3672  * @n: entry to schedule
3673  *
3674  * The entry's receive function will be scheduled to run
3675  */
3676 void __napi_schedule(struct napi_struct *n)
3677 {
3678 	unsigned long flags;
3679 
3680 	local_irq_save(flags);
3681 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3682 	local_irq_restore(flags);
3683 }
3684 EXPORT_SYMBOL(__napi_schedule);
3685 
3686 void __napi_complete(struct napi_struct *n)
3687 {
3688 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3689 	BUG_ON(n->gro_list);
3690 
3691 	list_del(&n->poll_list);
3692 	smp_mb__before_clear_bit();
3693 	clear_bit(NAPI_STATE_SCHED, &n->state);
3694 }
3695 EXPORT_SYMBOL(__napi_complete);
3696 
3697 void napi_complete(struct napi_struct *n)
3698 {
3699 	unsigned long flags;
3700 
3701 	/*
3702 	 * Don't let napi dequeue from the CPU poll list,
3703 	 * just in case it's running on a different CPU.
3704 	 */
3705 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3706 		return;
3707 
3708 	napi_gro_flush(n);
3709 	local_irq_save(flags);
3710 	__napi_complete(n);
3711 	local_irq_restore(flags);
3712 }
3713 EXPORT_SYMBOL(napi_complete);
3714 
3715 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3716 		    int (*poll)(struct napi_struct *, int), int weight)
3717 {
3718 	INIT_LIST_HEAD(&napi->poll_list);
3719 	napi->gro_count = 0;
3720 	napi->gro_list = NULL;
3721 	napi->skb = NULL;
3722 	napi->poll = poll;
3723 	napi->weight = weight;
3724 	list_add(&napi->dev_list, &dev->napi_list);
3725 	napi->dev = dev;
3726 #ifdef CONFIG_NETPOLL
3727 	spin_lock_init(&napi->poll_lock);
3728 	napi->poll_owner = -1;
3729 #endif
3730 	set_bit(NAPI_STATE_SCHED, &napi->state);
3731 }
3732 EXPORT_SYMBOL(netif_napi_add);
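
/*
 * Illustrative only: registration at probe time and scheduling from the
 * device interrupt (napi_enable()/napi_disable() in the open/stop paths
 * are omitted).  The foo_* names are hypothetical; 64 is a commonly used
 * weight.
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, 64);
 *
 *	static irqreturn_t foo_irq(int irq, void *data)
 *	{
 *		struct foo_priv *priv = data;
 *
 *		foo_disable_rx_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */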
3733 
3734 void netif_napi_del(struct napi_struct *napi)
3735 {
3736 	struct sk_buff *skb, *next;
3737 
3738 	list_del_init(&napi->dev_list);
3739 	napi_free_frags(napi);
3740 
3741 	for (skb = napi->gro_list; skb; skb = next) {
3742 		next = skb->next;
3743 		skb->next = NULL;
3744 		kfree_skb(skb);
3745 	}
3746 
3747 	napi->gro_list = NULL;
3748 	napi->gro_count = 0;
3749 }
3750 EXPORT_SYMBOL(netif_napi_del);
3751 
3752 static void net_rx_action(struct softirq_action *h)
3753 {
3754 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3755 	unsigned long time_limit = jiffies + 2;
3756 	int budget = netdev_budget;
3757 	void *have;
3758 
3759 	local_irq_disable();
3760 
3761 	while (!list_empty(&sd->poll_list)) {
3762 		struct napi_struct *n;
3763 		int work, weight;
3764 
3765 		/* If the softirq window is exhausted then punt.
3766 		 * Allow this to run for 2 jiffies, which will allow
3767 		 * an average latency of 1.5/HZ.
3768 		 */
3769 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3770 			goto softnet_break;
3771 
3772 		local_irq_enable();
3773 
3774 		/* Even though interrupts have been re-enabled, this
3775 		 * access is safe because interrupts can only add new
3776 		 * entries to the tail of this list, and only ->poll()
3777 		 * calls can remove this head entry from the list.
3778 		 */
3779 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3780 
3781 		have = netpoll_poll_lock(n);
3782 
3783 		weight = n->weight;
3784 
3785 		/* This NAPI_STATE_SCHED test is for avoiding a race
3786 		 * with netpoll's poll_napi().  Only the entity which
3787 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3788 		 * actually make the ->poll() call.  Therefore we avoid
3789 		 * accidentally calling ->poll() when NAPI is not scheduled.
3790 		 */
3791 		work = 0;
3792 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3793 			work = n->poll(n, weight);
3794 			trace_napi_poll(n);
3795 		}
3796 
3797 		WARN_ON_ONCE(work > weight);
3798 
3799 		budget -= work;
3800 
3801 		local_irq_disable();
3802 
3803 		/* Drivers must not modify the NAPI state if they
3804 		 * consume the entire weight.  In such cases this code
3805 		 * still "owns" the NAPI instance and therefore can
3806 		 * move the instance around on the list at-will.
3807 		 */
3808 		if (unlikely(work == weight)) {
3809 			if (unlikely(napi_disable_pending(n))) {
3810 				local_irq_enable();
3811 				napi_complete(n);
3812 				local_irq_disable();
3813 			} else
3814 				list_move_tail(&n->poll_list, &sd->poll_list);
3815 		}
3816 
3817 		netpoll_poll_unlock(have);
3818 	}
3819 out:
3820 	net_rps_action_and_irq_enable(sd);
3821 
3822 #ifdef CONFIG_NET_DMA
3823 	/*
3824 	 * There may not be any more sk_buffs coming right now, so push
3825 	 * any pending DMA copies to hardware
3826 	 */
3827 	dma_issue_pending_all();
3828 #endif
3829 
3830 	return;
3831 
3832 softnet_break:
3833 	sd->time_squeeze++;
3834 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3835 	goto out;
3836 }
3837 
3838 static gifconf_func_t *gifconf_list[NPROTO];
3839 
3840 /**
3841  *	register_gifconf	-	register a SIOCGIFCONF handler
3842  *	@family: Address family
3843  *	@gifconf: Function handler
3844  *
3845  *	Register protocol dependent address dumping routines. The handler
3846  *	that is passed must not be freed or reused until it has been replaced
3847  *	by another handler.
3848  */
3849 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3850 {
3851 	if (family >= NPROTO)
3852 		return -EINVAL;
3853 	gifconf_list[family] = gifconf;
3854 	return 0;
3855 }
3856 EXPORT_SYMBOL(register_gifconf);
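
/*
 * Example (illustrative sketch): registering a SIOCGIFCONF dumper for a
 * hypothetical protocol family, mirroring what af_inet does for PF_INET.
 * The contract implemented by dev_ifconf() below: when @buf is NULL only
 * report the space needed, otherwise write struct ifreq entries and return
 * the number of bytes consumed.  BAR_FAMILY and the bar_* names are made up.
 */
#if 0
#define BAR_FAMILY 27	/* hypothetical protocol family number */

static int bar_gifconf(struct net_device *dev, char __user *buf, int len)
{
	struct ifreq ifr;
	int done = 0;

	/* Hypothetically one address per device. */
	if (buf) {
		if (len < (int)sizeof(ifr))
			return done;
		memset(&ifr, 0, sizeof(ifr));
		strcpy(ifr.ifr_name, dev->name);
		if (copy_to_user(buf, &ifr, sizeof(ifr)))
			return -EFAULT;
	}
	done += sizeof(ifr);
	return done;
}

static int __init bar_gifconf_init(void)
{
	return register_gifconf(BAR_FAMILY, bar_gifconf);
}
#endif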
3857 
3858 
3859 /*
3860  *	Map an interface index to its name (SIOCGIFNAME)
3861  */
3862 
3863 /*
3864  *	We need this ioctl for efficient implementation of the
3865  *	if_indextoname() function required by the IPv6 API.  Without
3866  *	it, we would have to search all the interfaces to find a
3867  *	match.  --pb
3868  */
3869 
3870 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3871 {
3872 	struct net_device *dev;
3873 	struct ifreq ifr;
3874 
3875 	/*
3876 	 *	Fetch the caller's info block.
3877 	 */
3878 
3879 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3880 		return -EFAULT;
3881 
3882 	rcu_read_lock();
3883 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3884 	if (!dev) {
3885 		rcu_read_unlock();
3886 		return -ENODEV;
3887 	}
3888 
3889 	strcpy(ifr.ifr_name, dev->name);
3890 	rcu_read_unlock();
3891 
3892 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3893 		return -EFAULT;
3894 	return 0;
3895 }
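
/*
 * Example (user-space side, illustrative): resolving an ifindex to a name
 * through SIOCGIFNAME, which is what if_indextoname() boils down to.
 */
#if 0
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

static int print_ifname(int ifindex)
{
	struct ifreq ifr;
	int ret = -1;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_ifindex = ifindex;
	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0) {
		printf("%d -> %s\n", ifindex, ifr.ifr_name);
		ret = 0;
	}
	close(fd);
	return ret;
}
#endif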
3896 
3897 /*
3898  *	Perform a SIOCGIFCONF call. This structure will change
3899  *	size eventually, and there is nothing I can do about it.
3900  *	Thus we will need a 'compatibility mode'.
3901  */
3902 
3903 static int dev_ifconf(struct net *net, char __user *arg)
3904 {
3905 	struct ifconf ifc;
3906 	struct net_device *dev;
3907 	char __user *pos;
3908 	int len;
3909 	int total;
3910 	int i;
3911 
3912 	/*
3913 	 *	Fetch the caller's info block.
3914 	 */
3915 
3916 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3917 		return -EFAULT;
3918 
3919 	pos = ifc.ifc_buf;
3920 	len = ifc.ifc_len;
3921 
3922 	/*
3923 	 *	Loop over the interfaces, and write an info block for each.
3924 	 */
3925 
3926 	total = 0;
3927 	for_each_netdev(net, dev) {
3928 		for (i = 0; i < NPROTO; i++) {
3929 			if (gifconf_list[i]) {
3930 				int done;
3931 				if (!pos)
3932 					done = gifconf_list[i](dev, NULL, 0);
3933 				else
3934 					done = gifconf_list[i](dev, pos + total,
3935 							       len - total);
3936 				if (done < 0)
3937 					return -EFAULT;
3938 				total += done;
3939 			}
3940 		}
3941 	}
3942 
3943 	/*
3944 	 *	All done.  Write the updated control block back to the caller.
3945 	 */
3946 	ifc.ifc_len = total;
3947 
3948 	/*
3949 	 * 	Both BSD and Solaris return 0 here, so we do too.
3950 	 */
3951 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3952 }
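
/*
 * Example (user-space side, illustrative): listing configured interfaces via
 * SIOCGIFCONF with a fixed-size buffer, matching the kernel side above.
 */
#if 0
#include <net/if.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

static int list_interfaces(void)
{
	struct ifreq reqs[32];
	struct ifconf ifc;
	int fd, i, n;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	ifc.ifc_len = sizeof(reqs);
	ifc.ifc_req = reqs;
	if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) {
		close(fd);
		return -1;
	}

	n = ifc.ifc_len / sizeof(struct ifreq);
	for (i = 0; i < n; i++)
		printf("%s\n", reqs[i].ifr_name);

	close(fd);
	return 0;
}
#endif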
3953 
3954 #ifdef CONFIG_PROC_FS
3955 /*
3956  *	This is invoked by the /proc filesystem handler to display a device
3957  *	in detail.
3958  */
3959 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3960 	__acquires(RCU)
3961 {
3962 	struct net *net = seq_file_net(seq);
3963 	loff_t off;
3964 	struct net_device *dev;
3965 
3966 	rcu_read_lock();
3967 	if (!*pos)
3968 		return SEQ_START_TOKEN;
3969 
3970 	off = 1;
3971 	for_each_netdev_rcu(net, dev)
3972 		if (off++ == *pos)
3973 			return dev;
3974 
3975 	return NULL;
3976 }
3977 
3978 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3979 {
3980 	struct net_device *dev = v;
3981 
3982 	if (v == SEQ_START_TOKEN)
3983 		dev = first_net_device_rcu(seq_file_net(seq));
3984 	else
3985 		dev = next_net_device_rcu(dev);
3986 
3987 	++*pos;
3988 	return dev;
3989 }
3990 
3991 void dev_seq_stop(struct seq_file *seq, void *v)
3992 	__releases(RCU)
3993 {
3994 	rcu_read_unlock();
3995 }
3996 
3997 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3998 {
3999 	struct rtnl_link_stats64 temp;
4000 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4001 
4002 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4003 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4004 		   dev->name, stats->rx_bytes, stats->rx_packets,
4005 		   stats->rx_errors,
4006 		   stats->rx_dropped + stats->rx_missed_errors,
4007 		   stats->rx_fifo_errors,
4008 		   stats->rx_length_errors + stats->rx_over_errors +
4009 		    stats->rx_crc_errors + stats->rx_frame_errors,
4010 		   stats->rx_compressed, stats->multicast,
4011 		   stats->tx_bytes, stats->tx_packets,
4012 		   stats->tx_errors, stats->tx_dropped,
4013 		   stats->tx_fifo_errors, stats->collisions,
4014 		   stats->tx_carrier_errors +
4015 		    stats->tx_aborted_errors +
4016 		    stats->tx_window_errors +
4017 		    stats->tx_heartbeat_errors,
4018 		   stats->tx_compressed);
4019 }
4020 
4021 /*
4022  *	Called from the PROCfs module. This now uses the new arbitrary sized
4023  *	/proc/net interface to create /proc/net/dev
4024  */
4025 static int dev_seq_show(struct seq_file *seq, void *v)
4026 {
4027 	if (v == SEQ_START_TOKEN)
4028 		seq_puts(seq, "Inter-|   Receive                            "
4029 			      "                    |  Transmit\n"
4030 			      " face |bytes    packets errs drop fifo frame "
4031 			      "compressed multicast|bytes    packets errs "
4032 			      "drop fifo colls carrier compressed\n");
4033 	else
4034 		dev_seq_printf_stats(seq, v);
4035 	return 0;
4036 }
4037 
4038 static struct softnet_data *softnet_get_online(loff_t *pos)
4039 {
4040 	struct softnet_data *sd = NULL;
4041 
4042 	while (*pos < nr_cpu_ids)
4043 		if (cpu_online(*pos)) {
4044 			sd = &per_cpu(softnet_data, *pos);
4045 			break;
4046 		} else
4047 			++*pos;
4048 	return sd;
4049 }
4050 
4051 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4052 {
4053 	return softnet_get_online(pos);
4054 }
4055 
4056 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4057 {
4058 	++*pos;
4059 	return softnet_get_online(pos);
4060 }
4061 
4062 static void softnet_seq_stop(struct seq_file *seq, void *v)
4063 {
4064 }
4065 
4066 static int softnet_seq_show(struct seq_file *seq, void *v)
4067 {
4068 	struct softnet_data *sd = v;
4069 
4070 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4071 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4072 		   0, 0, 0, 0, /* was fastroute */
4073 		   sd->cpu_collision, sd->received_rps);
4074 	return 0;
4075 }
4076 
4077 static const struct seq_operations dev_seq_ops = {
4078 	.start = dev_seq_start,
4079 	.next  = dev_seq_next,
4080 	.stop  = dev_seq_stop,
4081 	.show  = dev_seq_show,
4082 };
4083 
4084 static int dev_seq_open(struct inode *inode, struct file *file)
4085 {
4086 	return seq_open_net(inode, file, &dev_seq_ops,
4087 			    sizeof(struct seq_net_private));
4088 }
4089 
4090 static const struct file_operations dev_seq_fops = {
4091 	.owner	 = THIS_MODULE,
4092 	.open    = dev_seq_open,
4093 	.read    = seq_read,
4094 	.llseek  = seq_lseek,
4095 	.release = seq_release_net,
4096 };
4097 
4098 static const struct seq_operations softnet_seq_ops = {
4099 	.start = softnet_seq_start,
4100 	.next  = softnet_seq_next,
4101 	.stop  = softnet_seq_stop,
4102 	.show  = softnet_seq_show,
4103 };
4104 
4105 static int softnet_seq_open(struct inode *inode, struct file *file)
4106 {
4107 	return seq_open(file, &softnet_seq_ops);
4108 }
4109 
4110 static const struct file_operations softnet_seq_fops = {
4111 	.owner	 = THIS_MODULE,
4112 	.open    = softnet_seq_open,
4113 	.read    = seq_read,
4114 	.llseek  = seq_lseek,
4115 	.release = seq_release,
4116 };
4117 
4118 static void *ptype_get_idx(loff_t pos)
4119 {
4120 	struct packet_type *pt = NULL;
4121 	loff_t i = 0;
4122 	int t;
4123 
4124 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4125 		if (i == pos)
4126 			return pt;
4127 		++i;
4128 	}
4129 
4130 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4131 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4132 			if (i == pos)
4133 				return pt;
4134 			++i;
4135 		}
4136 	}
4137 	return NULL;
4138 }
4139 
4140 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4141 	__acquires(RCU)
4142 {
4143 	rcu_read_lock();
4144 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4145 }
4146 
4147 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4148 {
4149 	struct packet_type *pt;
4150 	struct list_head *nxt;
4151 	int hash;
4152 
4153 	++*pos;
4154 	if (v == SEQ_START_TOKEN)
4155 		return ptype_get_idx(0);
4156 
4157 	pt = v;
4158 	nxt = pt->list.next;
4159 	if (pt->type == htons(ETH_P_ALL)) {
4160 		if (nxt != &ptype_all)
4161 			goto found;
4162 		hash = 0;
4163 		nxt = ptype_base[0].next;
4164 	} else
4165 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4166 
4167 	while (nxt == &ptype_base[hash]) {
4168 		if (++hash >= PTYPE_HASH_SIZE)
4169 			return NULL;
4170 		nxt = ptype_base[hash].next;
4171 	}
4172 found:
4173 	return list_entry(nxt, struct packet_type, list);
4174 }
4175 
4176 static void ptype_seq_stop(struct seq_file *seq, void *v)
4177 	__releases(RCU)
4178 {
4179 	rcu_read_unlock();
4180 }
4181 
4182 static int ptype_seq_show(struct seq_file *seq, void *v)
4183 {
4184 	struct packet_type *pt = v;
4185 
4186 	if (v == SEQ_START_TOKEN)
4187 		seq_puts(seq, "Type Device      Function\n");
4188 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4189 		if (pt->type == htons(ETH_P_ALL))
4190 			seq_puts(seq, "ALL ");
4191 		else
4192 			seq_printf(seq, "%04x", ntohs(pt->type));
4193 
4194 		seq_printf(seq, " %-8s %pF\n",
4195 			   pt->dev ? pt->dev->name : "", pt->func);
4196 	}
4197 
4198 	return 0;
4199 }
4200 
4201 static const struct seq_operations ptype_seq_ops = {
4202 	.start = ptype_seq_start,
4203 	.next  = ptype_seq_next,
4204 	.stop  = ptype_seq_stop,
4205 	.show  = ptype_seq_show,
4206 };
4207 
4208 static int ptype_seq_open(struct inode *inode, struct file *file)
4209 {
4210 	return seq_open_net(inode, file, &ptype_seq_ops,
4211 			sizeof(struct seq_net_private));
4212 }
4213 
4214 static const struct file_operations ptype_seq_fops = {
4215 	.owner	 = THIS_MODULE,
4216 	.open    = ptype_seq_open,
4217 	.read    = seq_read,
4218 	.llseek  = seq_lseek,
4219 	.release = seq_release_net,
4220 };
4221 
4222 
4223 static int __net_init dev_proc_net_init(struct net *net)
4224 {
4225 	int rc = -ENOMEM;
4226 
4227 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4228 		goto out;
4229 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4230 		goto out_dev;
4231 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4232 		goto out_softnet;
4233 
4234 	if (wext_proc_init(net))
4235 		goto out_ptype;
4236 	rc = 0;
4237 out:
4238 	return rc;
4239 out_ptype:
4240 	proc_net_remove(net, "ptype");
4241 out_softnet:
4242 	proc_net_remove(net, "softnet_stat");
4243 out_dev:
4244 	proc_net_remove(net, "dev");
4245 	goto out;
4246 }
4247 
4248 static void __net_exit dev_proc_net_exit(struct net *net)
4249 {
4250 	wext_proc_exit(net);
4251 
4252 	proc_net_remove(net, "ptype");
4253 	proc_net_remove(net, "softnet_stat");
4254 	proc_net_remove(net, "dev");
4255 }
4256 
4257 static struct pernet_operations __net_initdata dev_proc_ops = {
4258 	.init = dev_proc_net_init,
4259 	.exit = dev_proc_net_exit,
4260 };
4261 
4262 static int __init dev_proc_init(void)
4263 {
4264 	return register_pernet_subsys(&dev_proc_ops);
4265 }
4266 #else
4267 #define dev_proc_init() 0
4268 #endif	/* CONFIG_PROC_FS */
4269 
4270 
4271 /**
4272  *	netdev_set_master	-	set up master pointer
4273  *	@slave: slave device
4274  *	@master: new master device
4275  *
4276  *	Changes the master device of the slave. Pass %NULL to break the
4277  *	bonding. The caller must hold the RTNL semaphore. On a failure
4278  *	a negative errno code is returned. On success the reference counts
4279  *	are adjusted and the function returns zero.
4280  */
4281 int netdev_set_master(struct net_device *slave, struct net_device *master)
4282 {
4283 	struct net_device *old = slave->master;
4284 
4285 	ASSERT_RTNL();
4286 
4287 	if (master) {
4288 		if (old)
4289 			return -EBUSY;
4290 		dev_hold(master);
4291 	}
4292 
4293 	slave->master = master;
4294 
4295 	if (old) {
4296 		synchronize_net();
4297 		dev_put(old);
4298 	}
4299 	return 0;
4300 }
4301 EXPORT_SYMBOL(netdev_set_master);
4302 
4303 /**
4304  *	netdev_set_bond_master	-	set up bonding master/slave pair
4305  *	@slave: slave device
4306  *	@master: new master device
4307  *
4308  *	Changes the master device of the slave. Pass %NULL to break the
4309  *	bonding. The caller must hold the RTNL semaphore. On a failure
4310  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4311  *	to the routing socket and the function returns zero.
4312  */
4313 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4314 {
4315 	int err;
4316 
4317 	ASSERT_RTNL();
4318 
4319 	err = netdev_set_master(slave, master);
4320 	if (err)
4321 		return err;
4322 	if (master)
4323 		slave->flags |= IFF_SLAVE;
4324 	else
4325 		slave->flags &= ~IFF_SLAVE;
4326 
4327 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4328 	return 0;
4329 }
4330 EXPORT_SYMBOL(netdev_set_bond_master);
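
/*
 * Example (illustrative sketch): how a bonding-style driver would use the
 * helper above when enslaving and releasing a device.  The foo_* names are
 * hypothetical; the caller is expected to already hold RTNL.
 */
#if 0
static int foo_enslave(struct net_device *bond_dev, struct net_device *slave)
{
	ASSERT_RTNL();
	return netdev_set_bond_master(slave, bond_dev);
}

static void foo_release(struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_set_bond_master(slave, NULL);	/* breaks the bond */
}
#endif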
4331 
4332 static void dev_change_rx_flags(struct net_device *dev, int flags)
4333 {
4334 	const struct net_device_ops *ops = dev->netdev_ops;
4335 
4336 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4337 		ops->ndo_change_rx_flags(dev, flags);
4338 }
4339 
4340 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4341 {
4342 	unsigned short old_flags = dev->flags;
4343 	uid_t uid;
4344 	gid_t gid;
4345 
4346 	ASSERT_RTNL();
4347 
4348 	dev->flags |= IFF_PROMISC;
4349 	dev->promiscuity += inc;
4350 	if (dev->promiscuity == 0) {
4351 		/*
4352 		 * Avoid overflow.
4353 		 * If inc causes overflow, untouch promisc and return error.
4354 		 */
4355 		if (inc < 0)
4356 			dev->flags &= ~IFF_PROMISC;
4357 		else {
4358 			dev->promiscuity -= inc;
4359 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4360 				"set promiscuity failed, promiscuity feature "
4361 				"of device might be broken.\n", dev->name);
4362 			return -EOVERFLOW;
4363 		}
4364 	}
4365 	if (dev->flags != old_flags) {
4366 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4367 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4368 							       "left");
4369 		if (audit_enabled) {
4370 			current_uid_gid(&uid, &gid);
4371 			audit_log(current->audit_context, GFP_ATOMIC,
4372 				AUDIT_ANOM_PROMISCUOUS,
4373 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4374 				dev->name, (dev->flags & IFF_PROMISC),
4375 				(old_flags & IFF_PROMISC),
4376 				audit_get_loginuid(current),
4377 				uid, gid,
4378 				audit_get_sessionid(current));
4379 		}
4380 
4381 		dev_change_rx_flags(dev, IFF_PROMISC);
4382 	}
4383 	return 0;
4384 }
4385 
4386 /**
4387  *	dev_set_promiscuity	- update promiscuity count on a device
4388  *	@dev: device
4389  *	@inc: modifier
4390  *
4391  *	Add or remove promiscuity from a device. While the count in the device
4392  *	remains above zero the interface remains promiscuous. Once it hits zero
4393  *	the device reverts back to normal filtering operation. A negative inc
4394  *	value is used to drop promiscuity on the device.
4395  *	Return 0 if successful or a negative errno code on error.
4396  */
4397 int dev_set_promiscuity(struct net_device *dev, int inc)
4398 {
4399 	unsigned short old_flags = dev->flags;
4400 	int err;
4401 
4402 	err = __dev_set_promiscuity(dev, inc);
4403 	if (err < 0)
4404 		return err;
4405 	if (dev->flags != old_flags)
4406 		dev_set_rx_mode(dev);
4407 	return err;
4408 }
4409 EXPORT_SYMBOL(dev_set_promiscuity);
4410 
4411 /**
4412  *	dev_set_allmulti	- update allmulti count on a device
4413  *	@dev: device
4414  *	@inc: modifier
4415  *
4416  *	Add or remove reception of all multicast frames to a device. While the
4417  *	count in the device remains above zero the interface remains listening
4418  *	to all multicast frames. Once it hits zero the device reverts back to normal
4419  *	filtering operation. A negative @inc value is used to drop the counter
4420  *	when releasing a resource needing all multicasts.
4421  *	Return 0 if successful or a negative errno code on error.
4422  */
4423 
4424 int dev_set_allmulti(struct net_device *dev, int inc)
4425 {
4426 	unsigned short old_flags = dev->flags;
4427 
4428 	ASSERT_RTNL();
4429 
4430 	dev->flags |= IFF_ALLMULTI;
4431 	dev->allmulti += inc;
4432 	if (dev->allmulti == 0) {
4433 		/*
4434 		 * Avoid overflow.
4435 		 * If inc causes overflow, untouch allmulti and return error.
4436 		 */
4437 		if (inc < 0)
4438 			dev->flags &= ~IFF_ALLMULTI;
4439 		else {
4440 			dev->allmulti -= inc;
4441 			printk(KERN_WARNING "%s: allmulti touches roof, "
4442 				"set allmulti failed, allmulti feature of "
4443 				"device might be broken.\n", dev->name);
4444 			return -EOVERFLOW;
4445 		}
4446 	}
4447 	if (dev->flags ^ old_flags) {
4448 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4449 		dev_set_rx_mode(dev);
4450 	}
4451 	return 0;
4452 }
4453 EXPORT_SYMBOL(dev_set_allmulti);
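
/*
 * Example (illustrative): the counted nature of the two helpers above.  A
 * component that needs every frame takes one promiscuity (or allmulti)
 * reference and must drop exactly that reference when done; the device only
 * leaves the mode when the last user releases it.  snoop_* is hypothetical.
 */
#if 0
static int snoop_attach(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one reference */
	rtnl_unlock();
	return err;
}

static void snoop_detach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}
#endif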
4454 
4455 /*
4456  *	Upload unicast and multicast address lists to device and
4457  *	configure RX filtering. When the device doesn't support unicast
4458  *	filtering it is put in promiscuous mode while unicast addresses
4459  *	are present.
4460  */
4461 void __dev_set_rx_mode(struct net_device *dev)
4462 {
4463 	const struct net_device_ops *ops = dev->netdev_ops;
4464 
4465 	/* dev_open will call this function so the list will stay sane. */
4466 	if (!(dev->flags&IFF_UP))
4467 		return;
4468 
4469 	if (!netif_device_present(dev))
4470 		return;
4471 
4472 	if (ops->ndo_set_rx_mode)
4473 		ops->ndo_set_rx_mode(dev);
4474 	else {
4475 		/* Unicast address changes may only happen under the rtnl,
4476 		 * therefore calling __dev_set_promiscuity here is safe.
4477 		 */
4478 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4479 			__dev_set_promiscuity(dev, 1);
4480 			dev->uc_promisc = 1;
4481 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4482 			__dev_set_promiscuity(dev, -1);
4483 			dev->uc_promisc = 0;
4484 		}
4485 
4486 		if (ops->ndo_set_multicast_list)
4487 			ops->ndo_set_multicast_list(dev);
4488 	}
4489 }
4490 
4491 void dev_set_rx_mode(struct net_device *dev)
4492 {
4493 	netif_addr_lock_bh(dev);
4494 	__dev_set_rx_mode(dev);
4495 	netif_addr_unlock_bh(dev);
4496 }
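
/*
 * Example (illustrative driver sketch): what a typical ndo_set_rx_mode
 * implementation does with the lists uploaded by __dev_set_rx_mode().  The
 * foo_hw_* stubs stand in for hypothetical hardware filter accessors; the
 * callback runs with dev->addr_list_lock held (see dev_set_rx_mode()).
 */
#if 0
static void foo_hw_set_promisc(struct net_device *dev) { }
static void foo_hw_clear_filter(struct net_device *dev) { }
static void foo_hw_add_filter(struct net_device *dev, const unsigned char *addr) { }

static void foo_set_rx_mode(struct net_device *dev)
{
	struct netdev_hw_addr *ha;

	if (dev->flags & IFF_PROMISC) {
		foo_hw_set_promisc(dev);
		return;
	}

	foo_hw_clear_filter(dev);
	netdev_for_each_uc_addr(ha, dev)
		foo_hw_add_filter(dev, ha->addr);
	netdev_for_each_mc_addr(ha, dev)
		foo_hw_add_filter(dev, ha->addr);
}

/* Hooked up as .ndo_set_rx_mode in the driver's net_device_ops. */
#endif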
4497 
4498 /**
4499  *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4500  *	@dev: device
4501  *	@cmd: memory area for ethtool_ops::get_settings() result
4502  *
4503  *	The @cmd arg is initialized by this call (cleared, with its
4504  *	ethtool_cmd::cmd field set to %ETHTOOL_GSET).
4505  *
4506  *	Returns the device's ethtool_ops::get_settings() result, or
4507  *	-EOPNOTSUPP when the device does not expose an
4508  *	ethtool_ops::get_settings() operation.
4509  */
4510 int dev_ethtool_get_settings(struct net_device *dev,
4511 			     struct ethtool_cmd *cmd)
4512 {
4513 	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4514 		return -EOPNOTSUPP;
4515 
4516 	memset(cmd, 0, sizeof(struct ethtool_cmd));
4517 	cmd->cmd = ETHTOOL_GSET;
4518 	return dev->ethtool_ops->get_settings(dev, cmd);
4519 }
4520 EXPORT_SYMBOL(dev_ethtool_get_settings);
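
/*
 * Example (illustrative): querying a slave's link speed through the wrapper
 * above, as a stacked driver such as bonding might.  Assumes the
 * ethtool_cmd_speed() accessor available in kernels of this vintage;
 * foo_link_speed() is hypothetical.
 */
#if 0
static int foo_link_speed(struct net_device *slave, u32 *speed)
{
	struct ethtool_cmd cmd;
	int err = dev_ethtool_get_settings(slave, &cmd);

	if (err < 0)
		return err;

	*speed = ethtool_cmd_speed(&cmd);	/* combines speed and speed_hi */
	return 0;
}
#endif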
4521 
4522 /**
4523  *	dev_get_flags - get flags reported to userspace
4524  *	@dev: device
4525  *
4526  *	Get the combination of flag bits exported through APIs to userspace.
4527  */
4528 unsigned dev_get_flags(const struct net_device *dev)
4529 {
4530 	unsigned flags;
4531 
4532 	flags = (dev->flags & ~(IFF_PROMISC |
4533 				IFF_ALLMULTI |
4534 				IFF_RUNNING |
4535 				IFF_LOWER_UP |
4536 				IFF_DORMANT)) |
4537 		(dev->gflags & (IFF_PROMISC |
4538 				IFF_ALLMULTI));
4539 
4540 	if (netif_running(dev)) {
4541 		if (netif_oper_up(dev))
4542 			flags |= IFF_RUNNING;
4543 		if (netif_carrier_ok(dev))
4544 			flags |= IFF_LOWER_UP;
4545 		if (netif_dormant(dev))
4546 			flags |= IFF_DORMANT;
4547 	}
4548 
4549 	return flags;
4550 }
4551 EXPORT_SYMBOL(dev_get_flags);
4552 
4553 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4554 {
4555 	int old_flags = dev->flags;
4556 	int ret;
4557 
4558 	ASSERT_RTNL();
4559 
4560 	/*
4561 	 *	Set the flags on our device.
4562 	 */
4563 
4564 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4565 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4566 			       IFF_AUTOMEDIA)) |
4567 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4568 				    IFF_ALLMULTI));
4569 
4570 	/*
4571 	 *	Load in the correct multicast list now the flags have changed.
4572 	 */
4573 
4574 	if ((old_flags ^ flags) & IFF_MULTICAST)
4575 		dev_change_rx_flags(dev, IFF_MULTICAST);
4576 
4577 	dev_set_rx_mode(dev);
4578 
4579 	/*
4580 	 *	Have we downed the interface? We handle IFF_UP ourselves
4581 	 *	according to user attempts to set it, rather than blindly
4582 	 *	setting it.
4583 	 */
4584 
4585 	ret = 0;
4586 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4587 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4588 
4589 		if (!ret)
4590 			dev_set_rx_mode(dev);
4591 	}
4592 
4593 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4594 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4595 
4596 		dev->gflags ^= IFF_PROMISC;
4597 		dev_set_promiscuity(dev, inc);
4598 	}
4599 
4600 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4601 	   is important. Some (broken) drivers set IFF_PROMISC when
4602 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4603 	 */
4604 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4605 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4606 
4607 		dev->gflags ^= IFF_ALLMULTI;
4608 		dev_set_allmulti(dev, inc);
4609 	}
4610 
4611 	return ret;
4612 }
4613 
4614 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4615 {
4616 	unsigned int changes = dev->flags ^ old_flags;
4617 
4618 	if (changes & IFF_UP) {
4619 		if (dev->flags & IFF_UP)
4620 			call_netdevice_notifiers(NETDEV_UP, dev);
4621 		else
4622 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4623 	}
4624 
4625 	if (dev->flags & IFF_UP &&
4626 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4627 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4628 }
4629 
4630 /**
4631  *	dev_change_flags - change device settings
4632  *	@dev: device
4633  *	@flags: device state flags
4634  *
4635  *	Change settings on device based state flags. The flags are
4636  *	in the userspace exported format.
4637  */
4638 int dev_change_flags(struct net_device *dev, unsigned flags)
4639 {
4640 	int ret, changes;
4641 	int old_flags = dev->flags;
4642 
4643 	ret = __dev_change_flags(dev, flags);
4644 	if (ret < 0)
4645 		return ret;
4646 
4647 	changes = old_flags ^ dev->flags;
4648 	if (changes)
4649 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4650 
4651 	__dev_notify_flags(dev, old_flags);
4652 	return ret;
4653 }
4654 EXPORT_SYMBOL(dev_change_flags);
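
/*
 * Example (illustrative): bringing an interface administratively up from
 * kernel code goes through dev_change_flags() so that notifiers and the
 * routing socket see the change, exactly like SIOCSIFFLAGS from user space.
 * foo_if_up() is hypothetical.
 */
#if 0
static int foo_if_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif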
4655 
4656 /**
4657  *	dev_set_mtu - Change maximum transfer unit
4658  *	@dev: device
4659  *	@new_mtu: new transfer unit
4660  *
4661  *	Change the maximum transfer size of the network device.
4662  */
4663 int dev_set_mtu(struct net_device *dev, int new_mtu)
4664 {
4665 	const struct net_device_ops *ops = dev->netdev_ops;
4666 	int err;
4667 
4668 	if (new_mtu == dev->mtu)
4669 		return 0;
4670 
4671 	/*	MTU must be positive.	 */
4672 	if (new_mtu < 0)
4673 		return -EINVAL;
4674 
4675 	if (!netif_device_present(dev))
4676 		return -ENODEV;
4677 
4678 	err = 0;
4679 	if (ops->ndo_change_mtu)
4680 		err = ops->ndo_change_mtu(dev, new_mtu);
4681 	else
4682 		dev->mtu = new_mtu;
4683 
4684 	if (!err && dev->flags & IFF_UP)
4685 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4686 	return err;
4687 }
4688 EXPORT_SYMBOL(dev_set_mtu);
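
/*
 * Example (illustrative): changing the MTU from kernel code.  RTNL serializes
 * this against SIOCSIFMTU and the NETDEV_CHANGEMTU notifiers; 9000 is just an
 * example jumbo value and foo_set_jumbo() is hypothetical.
 */
#if 0
static int foo_set_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}
#endif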
4689 
4690 /**
4691  *	dev_set_group - Change group this device belongs to
4692  *	@dev: device
4693  *	@new_group: group this device should belong to
4694  */
4695 void dev_set_group(struct net_device *dev, int new_group)
4696 {
4697 	dev->group = new_group;
4698 }
4699 EXPORT_SYMBOL(dev_set_group);
4700 
4701 /**
4702  *	dev_set_mac_address - Change Media Access Control Address
4703  *	@dev: device
4704  *	@sa: new address
4705  *
4706  *	Change the hardware (MAC) address of the device
4707  */
4708 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4709 {
4710 	const struct net_device_ops *ops = dev->netdev_ops;
4711 	int err;
4712 
4713 	if (!ops->ndo_set_mac_address)
4714 		return -EOPNOTSUPP;
4715 	if (sa->sa_family != dev->type)
4716 		return -EINVAL;
4717 	if (!netif_device_present(dev))
4718 		return -ENODEV;
4719 	err = ops->ndo_set_mac_address(dev, sa);
4720 	if (!err)
4721 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4722 	return err;
4723 }
4724 EXPORT_SYMBOL(dev_set_mac_address);
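
/*
 * Example (illustrative): setting a new hardware address from kernel code.
 * The sockaddr family must match dev->type (e.g. ARPHRD_ETHER); foo_set_mac()
 * is hypothetical.
 */
#if 0
static int foo_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif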
4725 
4726 /*
4727  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4728  */
4729 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4730 {
4731 	int err;
4732 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4733 
4734 	if (!dev)
4735 		return -ENODEV;
4736 
4737 	switch (cmd) {
4738 	case SIOCGIFFLAGS:	/* Get interface flags */
4739 		ifr->ifr_flags = (short) dev_get_flags(dev);
4740 		return 0;
4741 
4742 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4743 				   (currently unused) */
4744 		ifr->ifr_metric = 0;
4745 		return 0;
4746 
4747 	case SIOCGIFMTU:	/* Get the MTU of a device */
4748 		ifr->ifr_mtu = dev->mtu;
4749 		return 0;
4750 
4751 	case SIOCGIFHWADDR:
4752 		if (!dev->addr_len)
4753 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4754 		else
4755 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4756 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4757 		ifr->ifr_hwaddr.sa_family = dev->type;
4758 		return 0;
4759 
4760 	case SIOCGIFSLAVE:
4761 		err = -EINVAL;
4762 		break;
4763 
4764 	case SIOCGIFMAP:
4765 		ifr->ifr_map.mem_start = dev->mem_start;
4766 		ifr->ifr_map.mem_end   = dev->mem_end;
4767 		ifr->ifr_map.base_addr = dev->base_addr;
4768 		ifr->ifr_map.irq       = dev->irq;
4769 		ifr->ifr_map.dma       = dev->dma;
4770 		ifr->ifr_map.port      = dev->if_port;
4771 		return 0;
4772 
4773 	case SIOCGIFINDEX:
4774 		ifr->ifr_ifindex = dev->ifindex;
4775 		return 0;
4776 
4777 	case SIOCGIFTXQLEN:
4778 		ifr->ifr_qlen = dev->tx_queue_len;
4779 		return 0;
4780 
4781 	default:
4782 		/* dev_ioctl() should ensure this case
4783 		 * is never reached
4784 		 */
4785 		WARN_ON(1);
4786 		err = -EINVAL;
4787 		break;
4788 
4789 	}
4790 	return err;
4791 }
4792 
4793 /*
4794  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4795  */
4796 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4797 {
4798 	int err;
4799 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4800 	const struct net_device_ops *ops;
4801 
4802 	if (!dev)
4803 		return -ENODEV;
4804 
4805 	ops = dev->netdev_ops;
4806 
4807 	switch (cmd) {
4808 	case SIOCSIFFLAGS:	/* Set interface flags */
4809 		return dev_change_flags(dev, ifr->ifr_flags);
4810 
4811 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4812 				   (currently unused) */
4813 		return -EOPNOTSUPP;
4814 
4815 	case SIOCSIFMTU:	/* Set the MTU of a device */
4816 		return dev_set_mtu(dev, ifr->ifr_mtu);
4817 
4818 	case SIOCSIFHWADDR:
4819 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4820 
4821 	case SIOCSIFHWBROADCAST:
4822 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4823 			return -EINVAL;
4824 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4825 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4826 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4827 		return 0;
4828 
4829 	case SIOCSIFMAP:
4830 		if (ops->ndo_set_config) {
4831 			if (!netif_device_present(dev))
4832 				return -ENODEV;
4833 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4834 		}
4835 		return -EOPNOTSUPP;
4836 
4837 	case SIOCADDMULTI:
4838 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4839 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4840 			return -EINVAL;
4841 		if (!netif_device_present(dev))
4842 			return -ENODEV;
4843 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4844 
4845 	case SIOCDELMULTI:
4846 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4847 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4848 			return -EINVAL;
4849 		if (!netif_device_present(dev))
4850 			return -ENODEV;
4851 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4852 
4853 	case SIOCSIFTXQLEN:
4854 		if (ifr->ifr_qlen < 0)
4855 			return -EINVAL;
4856 		dev->tx_queue_len = ifr->ifr_qlen;
4857 		return 0;
4858 
4859 	case SIOCSIFNAME:
4860 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4861 		return dev_change_name(dev, ifr->ifr_newname);
4862 
4863 	/*
4864 	 *	Unknown or private ioctl
4865 	 */
4866 	default:
4867 		if ((cmd >= SIOCDEVPRIVATE &&
4868 		    cmd <= SIOCDEVPRIVATE + 15) ||
4869 		    cmd == SIOCBONDENSLAVE ||
4870 		    cmd == SIOCBONDRELEASE ||
4871 		    cmd == SIOCBONDSETHWADDR ||
4872 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4873 		    cmd == SIOCBONDINFOQUERY ||
4874 		    cmd == SIOCBONDCHANGEACTIVE ||
4875 		    cmd == SIOCGMIIPHY ||
4876 		    cmd == SIOCGMIIREG ||
4877 		    cmd == SIOCSMIIREG ||
4878 		    cmd == SIOCBRADDIF ||
4879 		    cmd == SIOCBRDELIF ||
4880 		    cmd == SIOCSHWTSTAMP ||
4881 		    cmd == SIOCWANDEV) {
4882 			err = -EOPNOTSUPP;
4883 			if (ops->ndo_do_ioctl) {
4884 				if (netif_device_present(dev))
4885 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4886 				else
4887 					err = -ENODEV;
4888 			}
4889 		} else
4890 			err = -EINVAL;
4891 
4892 	}
4893 	return err;
4894 }
4895 
4896 /*
4897  *	This function handles all "interface"-type I/O control requests. The actual
4898  *	'doing' part of this is dev_ifsioc above.
4899  */
4900 
4901 /**
4902  *	dev_ioctl	-	network device ioctl
4903  *	@net: the applicable net namespace
4904  *	@cmd: command to issue
4905  *	@arg: pointer to a struct ifreq in user space
4906  *
4907  *	Issue ioctl functions to devices. This is normally called by the
4908  *	user space syscall interfaces but can sometimes be useful for
4909  *	other purposes. The return value is the return from the syscall if
4910  *	positive or a negative errno code on error.
4911  */
4912 
4913 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4914 {
4915 	struct ifreq ifr;
4916 	int ret;
4917 	char *colon;
4918 
4919 	/* One special case: SIOCGIFCONF takes ifconf argument
4920 	   and requires shared lock, because it sleeps writing
4921 	   to user space.
4922 	 */
4923 
4924 	if (cmd == SIOCGIFCONF) {
4925 		rtnl_lock();
4926 		ret = dev_ifconf(net, (char __user *) arg);
4927 		rtnl_unlock();
4928 		return ret;
4929 	}
4930 	if (cmd == SIOCGIFNAME)
4931 		return dev_ifname(net, (struct ifreq __user *)arg);
4932 
4933 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4934 		return -EFAULT;
4935 
4936 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4937 
4938 	colon = strchr(ifr.ifr_name, ':');
4939 	if (colon)
4940 		*colon = 0;
4941 
4942 	/*
4943 	 *	See which interface the caller is talking about.
4944 	 */
4945 
4946 	switch (cmd) {
4947 	/*
4948 	 *	These ioctl calls:
4949 	 *	- can be done by all.
4950 	 *	- atomic and do not require locking.
4951 	 *	- return a value
4952 	 */
4953 	case SIOCGIFFLAGS:
4954 	case SIOCGIFMETRIC:
4955 	case SIOCGIFMTU:
4956 	case SIOCGIFHWADDR:
4957 	case SIOCGIFSLAVE:
4958 	case SIOCGIFMAP:
4959 	case SIOCGIFINDEX:
4960 	case SIOCGIFTXQLEN:
4961 		dev_load(net, ifr.ifr_name);
4962 		rcu_read_lock();
4963 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4964 		rcu_read_unlock();
4965 		if (!ret) {
4966 			if (colon)
4967 				*colon = ':';
4968 			if (copy_to_user(arg, &ifr,
4969 					 sizeof(struct ifreq)))
4970 				ret = -EFAULT;
4971 		}
4972 		return ret;
4973 
4974 	case SIOCETHTOOL:
4975 		dev_load(net, ifr.ifr_name);
4976 		rtnl_lock();
4977 		ret = dev_ethtool(net, &ifr);
4978 		rtnl_unlock();
4979 		if (!ret) {
4980 			if (colon)
4981 				*colon = ':';
4982 			if (copy_to_user(arg, &ifr,
4983 					 sizeof(struct ifreq)))
4984 				ret = -EFAULT;
4985 		}
4986 		return ret;
4987 
4988 	/*
4989 	 *	These ioctl calls:
4990 	 *	- require superuser power.
4991 	 *	- require strict serialization.
4992 	 *	- return a value
4993 	 */
4994 	case SIOCGMIIPHY:
4995 	case SIOCGMIIREG:
4996 	case SIOCSIFNAME:
4997 		if (!capable(CAP_NET_ADMIN))
4998 			return -EPERM;
4999 		dev_load(net, ifr.ifr_name);
5000 		rtnl_lock();
5001 		ret = dev_ifsioc(net, &ifr, cmd);
5002 		rtnl_unlock();
5003 		if (!ret) {
5004 			if (colon)
5005 				*colon = ':';
5006 			if (copy_to_user(arg, &ifr,
5007 					 sizeof(struct ifreq)))
5008 				ret = -EFAULT;
5009 		}
5010 		return ret;
5011 
5012 	/*
5013 	 *	These ioctl calls:
5014 	 *	- require superuser power.
5015 	 *	- require strict serialization.
5016 	 *	- do not return a value
5017 	 */
5018 	case SIOCSIFFLAGS:
5019 	case SIOCSIFMETRIC:
5020 	case SIOCSIFMTU:
5021 	case SIOCSIFMAP:
5022 	case SIOCSIFHWADDR:
5023 	case SIOCSIFSLAVE:
5024 	case SIOCADDMULTI:
5025 	case SIOCDELMULTI:
5026 	case SIOCSIFHWBROADCAST:
5027 	case SIOCSIFTXQLEN:
5028 	case SIOCSMIIREG:
5029 	case SIOCBONDENSLAVE:
5030 	case SIOCBONDRELEASE:
5031 	case SIOCBONDSETHWADDR:
5032 	case SIOCBONDCHANGEACTIVE:
5033 	case SIOCBRADDIF:
5034 	case SIOCBRDELIF:
5035 	case SIOCSHWTSTAMP:
5036 		if (!capable(CAP_NET_ADMIN))
5037 			return -EPERM;
5038 		/* fall through */
5039 	case SIOCBONDSLAVEINFOQUERY:
5040 	case SIOCBONDINFOQUERY:
5041 		dev_load(net, ifr.ifr_name);
5042 		rtnl_lock();
5043 		ret = dev_ifsioc(net, &ifr, cmd);
5044 		rtnl_unlock();
5045 		return ret;
5046 
5047 	case SIOCGIFMEM:
5048 		/* Get the per device memory space. We can add this but
5049 		 * currently do not support it */
5050 	case SIOCSIFMEM:
5051 		/* Set the per device memory buffer space.
5052 		 * Not applicable in our case */
5053 	case SIOCSIFLINK:
5054 		return -EINVAL;
5055 
5056 	/*
5057 	 *	Unknown or private ioctl.
5058 	 */
5059 	default:
5060 		if (cmd == SIOCWANDEV ||
5061 		    (cmd >= SIOCDEVPRIVATE &&
5062 		     cmd <= SIOCDEVPRIVATE + 15)) {
5063 			dev_load(net, ifr.ifr_name);
5064 			rtnl_lock();
5065 			ret = dev_ifsioc(net, &ifr, cmd);
5066 			rtnl_unlock();
5067 			if (!ret && copy_to_user(arg, &ifr,
5068 						 sizeof(struct ifreq)))
5069 				ret = -EFAULT;
5070 			return ret;
5071 		}
5072 		/* Take care of Wireless Extensions */
5073 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5074 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5075 		return -EINVAL;
5076 	}
5077 }
5078 
5079 
5080 /**
5081  *	dev_new_index	-	allocate an ifindex
5082  *	@net: the applicable net namespace
5083  *
5084  *	Returns a suitable unique value for a new device interface
5085  *	number.  The caller must hold the rtnl semaphore or the
5086  *	dev_base_lock to be sure it remains unique.
5087  */
5088 static int dev_new_index(struct net *net)
5089 {
5090 	static int ifindex;
5091 	for (;;) {
5092 		if (++ifindex <= 0)
5093 			ifindex = 1;
5094 		if (!__dev_get_by_index(net, ifindex))
5095 			return ifindex;
5096 	}
5097 }
5098 
5099 /* Delayed registration/unregisteration */
5100 static LIST_HEAD(net_todo_list);
5101 
5102 static void net_set_todo(struct net_device *dev)
5103 {
5104 	list_add_tail(&dev->todo_list, &net_todo_list);
5105 }
5106 
5107 static void rollback_registered_many(struct list_head *head)
5108 {
5109 	struct net_device *dev, *tmp;
5110 
5111 	BUG_ON(dev_boot_phase);
5112 	ASSERT_RTNL();
5113 
5114 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5115 		/* Some devices call unregister without ever having been
5116 		 * registered, as part of initialization unwind. Remove those
5117 		 * devices and proceed with the remaining ones.
5118 		 */
5119 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5120 			pr_debug("unregister_netdevice: device %s/%p never "
5121 				 "was registered\n", dev->name, dev);
5122 
5123 			WARN_ON(1);
5124 			list_del(&dev->unreg_list);
5125 			continue;
5126 		}
5127 
5128 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5129 	}
5130 
5131 	/* If device is running, close it first. */
5132 	dev_close_many(head);
5133 
5134 	list_for_each_entry(dev, head, unreg_list) {
5135 		/* And unlink it from device chain. */
5136 		unlist_netdevice(dev);
5137 
5138 		dev->reg_state = NETREG_UNREGISTERING;
5139 	}
5140 
5141 	synchronize_net();
5142 
5143 	list_for_each_entry(dev, head, unreg_list) {
5144 		/* Shutdown queueing discipline. */
5145 		dev_shutdown(dev);
5146 
5147 
5148 		/* Notify protocols that we are about to destroy
5149 		   this device. They should clean up all of their state.
5150 		*/
5151 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5152 
5153 		if (!dev->rtnl_link_ops ||
5154 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5155 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5156 
5157 		/*
5158 		 *	Flush the unicast and multicast chains
5159 		 */
5160 		dev_uc_flush(dev);
5161 		dev_mc_flush(dev);
5162 
5163 		if (dev->netdev_ops->ndo_uninit)
5164 			dev->netdev_ops->ndo_uninit(dev);
5165 
5166 		/* Notifier chain MUST detach us from master device. */
5167 		WARN_ON(dev->master);
5168 
5169 		/* Remove entries from kobject tree */
5170 		netdev_unregister_kobject(dev);
5171 	}
5172 
5173 	/* Process any work delayed until the end of the batch */
5174 	dev = list_first_entry(head, struct net_device, unreg_list);
5175 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5176 
5177 	rcu_barrier();
5178 
5179 	list_for_each_entry(dev, head, unreg_list)
5180 		dev_put(dev);
5181 }
5182 
5183 static void rollback_registered(struct net_device *dev)
5184 {
5185 	LIST_HEAD(single);
5186 
5187 	list_add(&dev->unreg_list, &single);
5188 	rollback_registered_many(&single);
5189 	list_del(&single);
5190 }
5191 
5192 u32 netdev_fix_features(struct net_device *dev, u32 features)
5193 {
5194 	/* Fix illegal checksum combinations */
5195 	if ((features & NETIF_F_HW_CSUM) &&
5196 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5197 		netdev_info(dev, "mixed HW and IP checksum settings.\n");
5198 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5199 	}
5200 
5201 	if ((features & NETIF_F_NO_CSUM) &&
5202 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5203 		netdev_info(dev, "mixed no checksumming and other settings.\n");
5204 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5205 	}
5206 
5207 	/* Fix illegal SG+CSUM combinations. */
5208 	if ((features & NETIF_F_SG) &&
5209 	    !(features & NETIF_F_ALL_CSUM)) {
5210 		netdev_info(dev,
5211 			    "Dropping NETIF_F_SG since no checksum feature.\n");
5212 		features &= ~NETIF_F_SG;
5213 	}
5214 
5215 	/* TSO requires that SG is present as well. */
5216 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5217 		netdev_info(dev, "Dropping TSO features since no SG feature.\n");
5218 		features &= ~NETIF_F_ALL_TSO;
5219 	}
5220 
5221 	/* TSO ECN requires that TSO is present as well. */
5222 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5223 		features &= ~NETIF_F_TSO_ECN;
5224 
5225 	/* Software GSO depends on SG. */
5226 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5227 		netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5228 		features &= ~NETIF_F_GSO;
5229 	}
5230 
5231 	/* UFO needs SG and checksumming */
5232 	if (features & NETIF_F_UFO) {
5233 		/* maybe split UFO into V4 and V6? */
5234 		if (!((features & NETIF_F_GEN_CSUM) ||
5235 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5236 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5237 			netdev_info(dev,
5238 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5239 			features &= ~NETIF_F_UFO;
5240 		}
5241 
5242 		if (!(features & NETIF_F_SG)) {
5243 			netdev_info(dev,
5244 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5245 			features &= ~NETIF_F_UFO;
5246 		}
5247 	}
5248 
5249 	return features;
5250 }
5251 EXPORT_SYMBOL(netdev_fix_features);
5252 
5253 int __netdev_update_features(struct net_device *dev)
5254 {
5255 	u32 features;
5256 	int err = 0;
5257 
5258 	ASSERT_RTNL();
5259 
5260 	features = netdev_get_wanted_features(dev);
5261 
5262 	if (dev->netdev_ops->ndo_fix_features)
5263 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5264 
5265 	/* driver might be less strict about feature dependencies */
5266 	features = netdev_fix_features(dev, features);
5267 
5268 	if (dev->features == features)
5269 		return 0;
5270 
5271 	netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5272 		dev->features, features);
5273 
5274 	if (dev->netdev_ops->ndo_set_features)
5275 		err = dev->netdev_ops->ndo_set_features(dev, features);
5276 
5277 	if (unlikely(err < 0)) {
5278 		netdev_err(dev,
5279 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5280 			err, features, dev->features);
5281 		return -1;
5282 	}
5283 
5284 	if (!err)
5285 		dev->features = features;
5286 
5287 	return 1;
5288 }
5289 
5290 void netdev_update_features(struct net_device *dev)
5291 {
5292 	if (__netdev_update_features(dev))
5293 		netdev_features_change(dev);
5294 }
5295 EXPORT_SYMBOL(netdev_update_features);
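
/*
 * Example (illustrative driver sketch): the negotiation the two helpers above
 * implement.  A driver advertises capabilities in hw_features, can veto
 * combinations in ndo_fix_features(), and calls netdev_update_features()
 * (under RTNL) whenever a hardware condition changes.  The jumbo-frame
 * restriction and all foo_* names are hypothetical.
 */
#if 0
static u32 foo_fix_features(struct net_device *dev, u32 features)
{
	/* Hypothetical hardware cannot checksum jumbo frames. */
	if (dev->mtu > 1500)
		features &= ~NETIF_F_IP_CSUM;
	return features;
}

static int foo_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	netdev_update_features(dev);	/* re-run the fixups */
	return 0;
}

/* Hooked up as .ndo_fix_features and .ndo_change_mtu respectively. */
#endif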
5296 
5297 /**
5298  *	netif_stacked_transfer_operstate -	transfer operstate
5299  *	@rootdev: the root or lower level device to transfer state from
5300  *	@dev: the device to transfer operstate to
5301  *
5302  *	Transfer operational state from root to device. This is normally
5303  *	called when a stacking relationship exists between the root
5304  *	device and the device (a leaf device).
5305  */
5306 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5307 					struct net_device *dev)
5308 {
5309 	if (rootdev->operstate == IF_OPER_DORMANT)
5310 		netif_dormant_on(dev);
5311 	else
5312 		netif_dormant_off(dev);
5313 
5314 	if (netif_carrier_ok(rootdev)) {
5315 		if (!netif_carrier_ok(dev))
5316 			netif_carrier_on(dev);
5317 	} else {
5318 		if (netif_carrier_ok(dev))
5319 			netif_carrier_off(dev);
5320 	}
5321 }
5322 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
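
/*
 * Example (illustrative sketch): a vlan/macvlan style stacked device
 * mirroring its lower device's operstate from a netdevice notifier.  The
 * foo_find_upper() lookup is hypothetical; in this kernel generation the
 * notifier's @ptr is the net_device itself.
 */
#if 0
static struct net_device *foo_find_upper(struct net_device *lower)
{
	return NULL;	/* hypothetical lookup of the device stacked on @lower */
}

static int foo_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct net_device *upper = foo_find_upper(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);
	return NOTIFY_DONE;
}
#endif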
5323 
5324 #ifdef CONFIG_RPS
5325 static int netif_alloc_rx_queues(struct net_device *dev)
5326 {
5327 	unsigned int i, count = dev->num_rx_queues;
5328 	struct netdev_rx_queue *rx;
5329 
5330 	BUG_ON(count < 1);
5331 
5332 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5333 	if (!rx) {
5334 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5335 		return -ENOMEM;
5336 	}
5337 	dev->_rx = rx;
5338 
5339 	for (i = 0; i < count; i++)
5340 		rx[i].dev = dev;
5341 	return 0;
5342 }
5343 #endif
5344 
5345 static void netdev_init_one_queue(struct net_device *dev,
5346 				  struct netdev_queue *queue, void *_unused)
5347 {
5348 	/* Initialize queue lock */
5349 	spin_lock_init(&queue->_xmit_lock);
5350 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5351 	queue->xmit_lock_owner = -1;
5352 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5353 	queue->dev = dev;
5354 }
5355 
5356 static int netif_alloc_netdev_queues(struct net_device *dev)
5357 {
5358 	unsigned int count = dev->num_tx_queues;
5359 	struct netdev_queue *tx;
5360 
5361 	BUG_ON(count < 1);
5362 
5363 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5364 	if (!tx) {
5365 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5366 		       count);
5367 		return -ENOMEM;
5368 	}
5369 	dev->_tx = tx;
5370 
5371 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5372 	spin_lock_init(&dev->tx_global_lock);
5373 
5374 	return 0;
5375 }
5376 
5377 /**
5378  *	register_netdevice	- register a network device
5379  *	@dev: device to register
5380  *
5381  *	Take a completed network device structure and add it to the kernel
5382  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5383  *	chain. 0 is returned on success. A negative errno code is returned
5384  *	on a failure to set up the device, or if the name is a duplicate.
5385  *
5386  *	Callers must hold the rtnl semaphore. You may want
5387  *	register_netdev() instead of this.
5388  *
5389  *	BUGS:
5390  *	The locking appears insufficient to guarantee two parallel registers
5391  *	will not get the same name.
5392  */
5393 
5394 int register_netdevice(struct net_device *dev)
5395 {
5396 	int ret;
5397 	struct net *net = dev_net(dev);
5398 
5399 	BUG_ON(dev_boot_phase);
5400 	ASSERT_RTNL();
5401 
5402 	might_sleep();
5403 
5404 	/* When net_devices are persistent, this will be fatal. */
5405 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5406 	BUG_ON(!net);
5407 
5408 	spin_lock_init(&dev->addr_list_lock);
5409 	netdev_set_addr_lockdep_class(dev);
5410 
5411 	dev->iflink = -1;
5412 
5413 	/* Init, if this function is available */
5414 	if (dev->netdev_ops->ndo_init) {
5415 		ret = dev->netdev_ops->ndo_init(dev);
5416 		if (ret) {
5417 			if (ret > 0)
5418 				ret = -EIO;
5419 			goto out;
5420 		}
5421 	}
5422 
5423 	ret = dev_get_valid_name(dev, dev->name, 0);
5424 	if (ret)
5425 		goto err_uninit;
5426 
5427 	dev->ifindex = dev_new_index(net);
5428 	if (dev->iflink == -1)
5429 		dev->iflink = dev->ifindex;
5430 
5431 	/* Transfer changeable features to wanted_features and enable
5432 	 * software offloads (GSO and GRO).
5433 	 */
5434 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5435 	dev->features |= NETIF_F_SOFT_FEATURES;
5436 	dev->wanted_features = dev->features & dev->hw_features;
5437 
5438 	/* Avoid warning from netdev_fix_features() for GSO without SG */
5439 	if (!(dev->wanted_features & NETIF_F_SG)) {
5440 		dev->wanted_features &= ~NETIF_F_GSO;
5441 		dev->features &= ~NETIF_F_GSO;
5442 	}
5443 
5444 	/* Turn on no cache copy if HW is doing checksum */
5445 	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5446 	if ((dev->features & NETIF_F_ALL_CSUM) &&
5447 	    !(dev->features & NETIF_F_NO_CSUM)) {
5448 		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5449 		dev->features |= NETIF_F_NOCACHE_COPY;
5450 	}
5451 
5452 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5453 	 * vlan_dev_init() will do the dev->features check, so these features
5454 	 * are enabled only if supported by underlying device.
5455 	 */
5456 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5457 
5458 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5459 	ret = notifier_to_errno(ret);
5460 	if (ret)
5461 		goto err_uninit;
5462 
5463 	ret = netdev_register_kobject(dev);
5464 	if (ret)
5465 		goto err_uninit;
5466 	dev->reg_state = NETREG_REGISTERED;
5467 
5468 	__netdev_update_features(dev);
5469 
5470 	/*
5471 	 *	Default initial state at registration is that the
5472 	 *	device is present.
5473 	 */
5474 
5475 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5476 
5477 	dev_init_scheduler(dev);
5478 	dev_hold(dev);
5479 	list_netdevice(dev);
5480 
5481 	/* Notify protocols that a new device appeared. */
5482 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5483 	ret = notifier_to_errno(ret);
5484 	if (ret) {
5485 		rollback_registered(dev);
5486 		dev->reg_state = NETREG_UNREGISTERED;
5487 	}
5488 	/*
5489 	 *	Prevent userspace races by waiting until the network
5490 	 *	device is fully setup before sending notifications.
5491 	 */
5492 	if (!dev->rtnl_link_ops ||
5493 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5494 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5495 
5496 out:
5497 	return ret;
5498 
5499 err_uninit:
5500 	if (dev->netdev_ops->ndo_uninit)
5501 		dev->netdev_ops->ndo_uninit(dev);
5502 	goto out;
5503 }
5504 EXPORT_SYMBOL(register_netdevice);
5505 
5506 /**
5507  *	init_dummy_netdev	- init a dummy network device for NAPI
5508  *	@dev: device to init
5509  *
5510  *	This takes a network device structure and initializes the minimum
5511  *	number of fields so it can be used to schedule NAPI polls without
5512  *	registering a full-blown interface. This is to be used by drivers
5513  *	that need to tie several hardware interfaces to a single NAPI
5514  *	poll scheduler due to HW limitations.
5515  */
5516 int init_dummy_netdev(struct net_device *dev)
5517 {
5518 	/* Clear everything. Note we don't initialize spinlocks
5519 	 * as they aren't supposed to be taken by any of the
5520 	 * NAPI code and this dummy netdev is supposed to be
5521 	 * only ever used for NAPI polls
5522 	 */
5523 	memset(dev, 0, sizeof(struct net_device));
5524 
5525 	/* make sure we BUG if trying to hit standard
5526 	 * register/unregister code path
5527 	 */
5528 	dev->reg_state = NETREG_DUMMY;
5529 
5530 	/* NAPI wants this */
5531 	INIT_LIST_HEAD(&dev->napi_list);
5532 
5533 	/* a dummy interface is started by default */
5534 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5535 	set_bit(__LINK_STATE_START, &dev->state);
5536 
5537 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5538 	 * because users of this 'device' don't need to change
5539 	 * its refcount.
5540 	 */
5541 
5542 	return 0;
5543 }
5544 EXPORT_SYMBOL_GPL(init_dummy_netdev);
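
/*
 * Example (illustrative): a driver with several hardware channels but no
 * net_device of its own hanging its NAPI context off a dummy netdev, which is
 * exactly the use case described above.  The foo_* names are hypothetical.
 */
#if 0
static struct net_device foo_napi_dev;
static struct napi_struct foo_napi;

static int foo_poll(struct napi_struct *napi, int budget)
{
	/* Hypothetical: service the shared hardware here. */
	napi_complete(napi);
	return 0;
}

static int __init foo_init(void)
{
	init_dummy_netdev(&foo_napi_dev);
	netif_napi_add(&foo_napi_dev, &foo_napi, foo_poll, 64);
	napi_enable(&foo_napi);
	return 0;
}
#endif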
5545 
5546 
5547 /**
5548  *	register_netdev	- register a network device
5549  *	@dev: device to register
5550  *
5551  *	Take a completed network device structure and add it to the kernel
5552  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5553  *	chain. 0 is returned on success. A negative errno code is returned
5554  *	on a failure to set up the device, or if the name is a duplicate.
5555  *
5556  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5557  *	and expands the device name if you passed a format string to
5558  *	alloc_netdev.
5559  */
5560 int register_netdev(struct net_device *dev)
5561 {
5562 	int err;
5563 
5564 	rtnl_lock();
5565 
5566 	/*
5567 	 * If the name is a format string the caller wants us to do a
5568 	 * name allocation.
5569 	 */
5570 	if (strchr(dev->name, '%')) {
5571 		err = dev_alloc_name(dev, dev->name);
5572 		if (err < 0)
5573 			goto out;
5574 	}
5575 
5576 	err = register_netdevice(dev);
5577 out:
5578 	rtnl_unlock();
5579 	return err;
5580 }
5581 EXPORT_SYMBOL(register_netdev);
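
/*
 * Example (illustrative sketch): the usual registration lifecycle seen from a
 * driver.  alloc_etherdev() is the Ethernet wrapper around alloc_netdev_mqs();
 * struct foo_priv, foo_netdev_ops and both functions are hypothetical.
 */
#if 0
struct foo_priv {
	int dummy;
};

static const struct net_device_ops foo_netdev_ops = {
	/* .ndo_open, .ndo_stop, .ndo_start_xmit, ... in a real driver */
};

static int foo_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct foo_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &foo_netdev_ops;

	err = register_netdev(dev);	/* takes RTNL, expands "eth%d" */
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}

static void foo_remove(struct net_device *dev)
{
	unregister_netdev(dev);	/* returns once all references are gone */
	free_netdev(dev);
}
#endif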
5582 
5583 int netdev_refcnt_read(const struct net_device *dev)
5584 {
5585 	int i, refcnt = 0;
5586 
5587 	for_each_possible_cpu(i)
5588 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5589 	return refcnt;
5590 }
5591 EXPORT_SYMBOL(netdev_refcnt_read);
5592 
5593 /*
5594  * netdev_wait_allrefs - wait until all references are gone.
5595  *
5596  * This is called when unregistering network devices.
5597  *
5598  * Any protocol or device that holds a reference should register
5599  * for netdevice notification, and clean up and put back the
5600  * reference if it receives an UNREGISTER event.
5601  * We can get stuck here if buggy protocols don't correctly
5602  * call dev_put.
5603  */
5604 static void netdev_wait_allrefs(struct net_device *dev)
5605 {
5606 	unsigned long rebroadcast_time, warning_time;
5607 	int refcnt;
5608 
5609 	linkwatch_forget_dev(dev);
5610 
5611 	rebroadcast_time = warning_time = jiffies;
5612 	refcnt = netdev_refcnt_read(dev);
5613 
5614 	while (refcnt != 0) {
5615 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5616 			rtnl_lock();
5617 
5618 			/* Rebroadcast unregister notification */
5619 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5620 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5621 			 * should have already handled it the first time */
5622 
5623 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5624 				     &dev->state)) {
5625 				/* We must not have linkwatch events
5626 				 * pending on unregister. If this
5627 				 * happens, we simply run the queue
5628 				 * unscheduled, resulting in a noop
5629 				 * for this device.
5630 				 */
5631 				linkwatch_run_queue();
5632 			}
5633 
5634 			__rtnl_unlock();
5635 
5636 			rebroadcast_time = jiffies;
5637 		}
5638 
5639 		msleep(250);
5640 
5641 		refcnt = netdev_refcnt_read(dev);
5642 
5643 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5644 			printk(KERN_EMERG "unregister_netdevice: "
5645 			       "waiting for %s to become free. Usage "
5646 			       "count = %d\n",
5647 			       dev->name, refcnt);
5648 			warning_time = jiffies;
5649 		}
5650 	}
5651 }
5652 
5653 /* The sequence is:
5654  *
5655  *	rtnl_lock();
5656  *	...
5657  *	register_netdevice(x1);
5658  *	register_netdevice(x2);
5659  *	...
5660  *	unregister_netdevice(y1);
5661  *	unregister_netdevice(y2);
5662  *      ...
5663  *	rtnl_unlock();
5664  *	free_netdev(y1);
5665  *	free_netdev(y2);
5666  *
5667  * We are invoked by rtnl_unlock().
5668  * This allows us to deal with problems:
5669  * 1) We can delete sysfs objects which invoke hotplug
5670  *    without deadlocking with linkwatch via keventd.
5671  * 2) Since we run with the RTNL semaphore not held, we can sleep
5672  *    safely in order to wait for the netdev refcnt to drop to zero.
5673  *
5674  * We must not return until all unregister events added during
5675  * the interval the lock was held have been completed.
5676  */
5677 void netdev_run_todo(void)
5678 {
5679 	struct list_head list;
5680 
5681 	/* Snapshot list, allow later requests */
5682 	list_replace_init(&net_todo_list, &list);
5683 
5684 	__rtnl_unlock();
5685 
5686 	while (!list_empty(&list)) {
5687 		struct net_device *dev
5688 			= list_first_entry(&list, struct net_device, todo_list);
5689 		list_del(&dev->todo_list);
5690 
5691 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5692 			printk(KERN_ERR "network todo '%s' but state %d\n",
5693 			       dev->name, dev->reg_state);
5694 			dump_stack();
5695 			continue;
5696 		}
5697 
5698 		dev->reg_state = NETREG_UNREGISTERED;
5699 
5700 		on_each_cpu(flush_backlog, dev, 1);
5701 
5702 		netdev_wait_allrefs(dev);
5703 
5704 		/* paranoia */
5705 		BUG_ON(netdev_refcnt_read(dev));
5706 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5707 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5708 		WARN_ON(dev->dn_ptr);
5709 
5710 		if (dev->destructor)
5711 			dev->destructor(dev);
5712 
5713 		/* Free network device */
5714 		kobject_put(&dev->dev.kobj);
5715 	}
5716 }
5717 
5718 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5719  * fields in the same order, with only the type differing.
5720  */
5721 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5722 				    const struct net_device_stats *netdev_stats)
5723 {
5724 #if BITS_PER_LONG == 64
5725 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5726 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5727 #else
5728 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5729 	const unsigned long *src = (const unsigned long *)netdev_stats;
5730 	u64 *dst = (u64 *)stats64;
5731 
5732 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5733 		     sizeof(*stats64) / sizeof(u64));
5734 	for (i = 0; i < n; i++)
5735 		dst[i] = src[i];
5736 #endif
5737 }
5738 
5739 /**
5740  *	dev_get_stats	- get network device statistics
5741  *	@dev: device to get statistics from
5742  *	@storage: place to store stats
5743  *
5744  *	Get network statistics from device. Return @storage.
5745  *	The device driver may provide its own method by setting
5746  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5747  *	otherwise the internal statistics structure is used.
5748  */
5749 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5750 					struct rtnl_link_stats64 *storage)
5751 {
5752 	const struct net_device_ops *ops = dev->netdev_ops;
5753 
5754 	if (ops->ndo_get_stats64) {
5755 		memset(storage, 0, sizeof(*storage));
5756 		ops->ndo_get_stats64(dev, storage);
5757 	} else if (ops->ndo_get_stats) {
5758 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5759 	} else {
5760 		netdev_stats_to_stats64(storage, &dev->stats);
5761 	}
5762 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5763 	return storage;
5764 }
5765 EXPORT_SYMBOL(dev_get_stats);
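
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller of dev_get_stats().  On-stack storage is passed in and the
 * returned pointer is used, which works whether the driver provides
 * ndo_get_stats64, ndo_get_stats, or neither.
 */
static void example_log_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

	netdev_info(dev, "rx %llu packets, tx %llu packets\n",
		    (unsigned long long)stats->rx_packets,
		    (unsigned long long)stats->tx_packets);
}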
5766 
5767 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5768 {
5769 	struct netdev_queue *queue = dev_ingress_queue(dev);
5770 
5771 #ifdef CONFIG_NET_CLS_ACT
5772 	if (queue)
5773 		return queue;
5774 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5775 	if (!queue)
5776 		return NULL;
5777 	netdev_init_one_queue(dev, queue, NULL);
5778 	queue->qdisc = &noop_qdisc;
5779 	queue->qdisc_sleeping = &noop_qdisc;
5780 	rcu_assign_pointer(dev->ingress_queue, queue);
5781 #endif
5782 	return queue;
5783 }
5784 
5785 /**
5786  *	alloc_netdev_mqs - allocate network device
5787  *	@sizeof_priv:	size of private data to allocate space for
5788  *	@name:		device name format string
5789  *	@setup:		callback to initialize device
5790  *	@txqs:		the number of TX subqueues to allocate
5791  *	@rxqs:		the number of RX subqueues to allocate
5792  *
5793  *	Allocates a struct net_device with private data area for driver use
5794  *	and performs basic initialization.  Also allocates subquue structs
5795  *	and performs basic initialization.  Also allocates subqueue structs
5796  */
5797 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5798 		void (*setup)(struct net_device *),
5799 		unsigned int txqs, unsigned int rxqs)
5800 {
5801 	struct net_device *dev;
5802 	size_t alloc_size;
5803 	struct net_device *p;
5804 
5805 	BUG_ON(strlen(name) >= sizeof(dev->name));
5806 
5807 	if (txqs < 1) {
5808 		pr_err("alloc_netdev: Unable to allocate device "
5809 		       "with zero queues.\n");
5810 		return NULL;
5811 	}
5812 
5813 #ifdef CONFIG_RPS
5814 	if (rxqs < 1) {
5815 		pr_err("alloc_netdev: Unable to allocate device "
5816 		       "with zero RX queues.\n");
5817 		return NULL;
5818 	}
5819 #endif
5820 
5821 	alloc_size = sizeof(struct net_device);
5822 	if (sizeof_priv) {
5823 		/* ensure 32-byte alignment of private area */
5824 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5825 		alloc_size += sizeof_priv;
5826 	}
5827 	/* ensure 32-byte alignment of whole construct */
5828 	alloc_size += NETDEV_ALIGN - 1;
5829 
5830 	p = kzalloc(alloc_size, GFP_KERNEL);
5831 	if (!p) {
5832 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5833 		return NULL;
5834 	}
5835 
5836 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5837 	dev->padded = (char *)dev - (char *)p;
5838 
5839 	dev->pcpu_refcnt = alloc_percpu(int);
5840 	if (!dev->pcpu_refcnt)
5841 		goto free_p;
5842 
5843 	if (dev_addr_init(dev))
5844 		goto free_pcpu;
5845 
5846 	dev_mc_init(dev);
5847 	dev_uc_init(dev);
5848 
5849 	dev_net_set(dev, &init_net);
5850 
5851 	dev->gso_max_size = GSO_MAX_SIZE;
5852 
5853 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5854 	dev->ethtool_ntuple_list.count = 0;
5855 	INIT_LIST_HEAD(&dev->napi_list);
5856 	INIT_LIST_HEAD(&dev->unreg_list);
5857 	INIT_LIST_HEAD(&dev->link_watch_list);
5858 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5859 	setup(dev);
5860 
5861 	dev->num_tx_queues = txqs;
5862 	dev->real_num_tx_queues = txqs;
5863 	if (netif_alloc_netdev_queues(dev))
5864 		goto free_all;
5865 
5866 #ifdef CONFIG_RPS
5867 	dev->num_rx_queues = rxqs;
5868 	dev->real_num_rx_queues = rxqs;
5869 	if (netif_alloc_rx_queues(dev))
5870 		goto free_all;
5871 #endif
5872 
5873 	strcpy(dev->name, name);
5874 	dev->group = INIT_NETDEV_GROUP;
5875 	return dev;
5876 
5877 free_all:
5878 	free_netdev(dev);
5879 	return NULL;
5880 
5881 free_pcpu:
5882 	free_percpu(dev->pcpu_refcnt);
5883 	kfree(dev->_tx);
5884 #ifdef CONFIG_RPS
5885 	kfree(dev->_rx);
5886 #endif
5887 
5888 free_p:
5889 	kfree(p);
5890 	return NULL;
5891 }
5892 EXPORT_SYMBOL(alloc_netdev_mqs);
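
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * driver ("example_priv" and "ex%d" are made-up names) allocating an
 * Ethernet-style device with a private area and four TX/RX queue
 * pairs.  The "%d" in the name is resolved at registration time.
 */
struct example_priv {
	int dummy;
};

static struct net_device *example_alloc(void)
{
	struct net_device *dev;
	struct example_priv *priv;

	dev = alloc_netdev_mqs(sizeof(struct example_priv), "ex%d",
			       ether_setup, 4, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	priv->dummy = 0;
	return dev;
}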
5893 
5894 /**
5895  *	free_netdev - free network device
5896  *	@dev: device
5897  *
5898  *	This function does the last stage of destroying an allocated device
5899  * 	interface. The reference to the device object is released.
5900  *	If this is the last reference then it will be freed.
5901  */
5902 void free_netdev(struct net_device *dev)
5903 {
5904 	struct napi_struct *p, *n;
5905 
5906 	release_net(dev_net(dev));
5907 
5908 	kfree(dev->_tx);
5909 #ifdef CONFIG_RPS
5910 	kfree(dev->_rx);
5911 #endif
5912 
5913 	kfree(rcu_dereference_raw(dev->ingress_queue));
5914 
5915 	/* Flush device addresses */
5916 	dev_addr_flush(dev);
5917 
5918 	/* Clear ethtool n-tuple list */
5919 	ethtool_ntuple_flush(dev);
5920 
5921 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5922 		netif_napi_del(p);
5923 
5924 	free_percpu(dev->pcpu_refcnt);
5925 	dev->pcpu_refcnt = NULL;
5926 
5927 	/*  Compatibility with error handling in drivers */
5928 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5929 		kfree((char *)dev - dev->padded);
5930 		return;
5931 	}
5932 
5933 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5934 	dev->reg_state = NETREG_RELEASED;
5935 
5936 	/* will free via device release */
5937 	put_device(&dev->dev);
5938 }
5939 EXPORT_SYMBOL(free_netdev);
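
/*
 * Illustrative sketch, not part of the original file, of the
 * "error handling in drivers" case above: if register_netdev() fails,
 * dev->reg_state is still NETREG_UNINITIALIZED, so free_netdev()
 * simply frees the allocation.
 */
static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area, for brevity */
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}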
5940 
5941 /**
5942  *	synchronize_net -  Synchronize with packet receive processing
5943  *
5944  *	Wait for packets currently being received to be done.
5945  *	Does not block later packets from starting.
5946  */
5947 void synchronize_net(void)
5948 {
5949 	might_sleep();
5950 	synchronize_rcu();
5951 }
5952 EXPORT_SYMBOL(synchronize_net);
5953 
5954 /**
5955  *	unregister_netdevice_queue - remove device from the kernel
5956  *	@dev: device
5957  *	@head: list
5958  *
5959  *	This function shuts down a device interface and removes it
5960  *	from the kernel tables.
5961  *	If @head is not NULL, the device is queued to be unregistered later.
5962  *
5963  *	Callers must hold the rtnl semaphore.  You may want
5964  *	unregister_netdev() instead of this.
5965  */
5966 
5967 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5968 {
5969 	ASSERT_RTNL();
5970 
5971 	if (head) {
5972 		list_move_tail(&dev->unreg_list, head);
5973 	} else {
5974 		rollback_registered(dev);
5975 		/* Finish processing unregister after unlock */
5976 		net_set_todo(dev);
5977 	}
5978 }
5979 EXPORT_SYMBOL(unregister_netdevice_queue);
5980 
5981 /**
5982  *	unregister_netdevice_many - unregister many devices
5983  *	@head: list of devices
5984  */
5985 void unregister_netdevice_many(struct list_head *head)
5986 {
5987 	struct net_device *dev;
5988 
5989 	if (!list_empty(head)) {
5990 		rollback_registered_many(head);
5991 		list_for_each_entry(dev, head, unreg_list)
5992 			net_set_todo(dev);
5993 	}
5994 }
5995 EXPORT_SYMBOL(unregister_netdevice_many);
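
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * helper batching several devices onto one list, so that per-batch
 * costs in rollback_registered_many() (such as the RCU grace period)
 * are paid once rather than per device.
 */
static void example_unregister_batch(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();

	/* As in the single-device case, freeing happens after the unlock
	 * (unless each device's ->destructor already calls free_netdev()).
	 */
	for (i = 0; i < n; i++)
		free_netdev(devs[i]);
}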
5996 
5997 /**
5998  *	unregister_netdev - remove device from the kernel
5999  *	@dev: device
6000  *
6001  *	This function shuts down a device interface and removes it
6002  *	from the kernel tables.
6003  *
6004  *	This is just a wrapper for unregister_netdevice that takes
6005  *	the rtnl semaphore.  In general you want to use this and not
6006  *	unregister_netdevice.
6007  */
6008 void unregister_netdev(struct net_device *dev)
6009 {
6010 	rtnl_lock();
6011 	unregister_netdevice(dev);
6012 	rtnl_unlock();
6013 }
6014 EXPORT_SYMBOL(unregister_netdev);
6015 
6016 /**
6017  *	dev_change_net_namespace - move device to a different network namespace
6018  *	@dev: device
6019  *	@net: network namespace
6020  *	@pat: If not NULL, name pattern to try if the current device name
6021  *	      is already taken in the destination network namespace.
6022  *
6023  *	This function shuts down a device interface and moves it
6024  *	to a new network namespace. On success 0 is returned, on
6025  *	a failure a negative errno code is returned.
6026  *
6027  *	Callers must hold the rtnl semaphore.
6028  */
6029 
6030 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6031 {
6032 	int err;
6033 
6034 	ASSERT_RTNL();
6035 
6036 	/* Don't allow namespace local devices to be moved. */
6037 	err = -EINVAL;
6038 	if (dev->features & NETIF_F_NETNS_LOCAL)
6039 		goto out;
6040 
6041 	/* Ensure the device has been registered */
6042 	err = -EINVAL;
6043 	if (dev->reg_state != NETREG_REGISTERED)
6044 		goto out;
6045 
6046 	/* Get out if there is nothing to do */
6047 	err = 0;
6048 	if (net_eq(dev_net(dev), net))
6049 		goto out;
6050 
6051 	/* Pick the destination device name, and ensure
6052 	 * we can use it in the destination network namespace.
6053 	 */
6054 	err = -EEXIST;
6055 	if (__dev_get_by_name(net, dev->name)) {
6056 		/* We get here if we can't use the current device name */
6057 		if (!pat)
6058 			goto out;
6059 		if (dev_get_valid_name(dev, pat, 1))
6060 			goto out;
6061 	}
6062 
6063 	/*
6064 	 * And now a mini version of register_netdevice and unregister_netdevice.
6065 	 */
6066 
6067 	/* If device is running close it first. */
6068 	dev_close(dev);
6069 
6070 	/* And unlink it from device chain */
6071 	err = -ENODEV;
6072 	unlist_netdevice(dev);
6073 
6074 	synchronize_net();
6075 
6076 	/* Shutdown queueing discipline. */
6077 	dev_shutdown(dev);
6078 
6079 	/* Notify protocols that we are about to destroy
6080 	   this device. They should clean up all of their state.
6081 
6082 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6083 	   This is intentional, so that 8021q and macvlan know
6084 	   the device is just moving and can keep their slaves up.
6085 	*/
6086 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6087 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6088 
6089 	/*
6090 	 *	Flush the unicast and multicast chains
6091 	 */
6092 	dev_uc_flush(dev);
6093 	dev_mc_flush(dev);
6094 
6095 	/* Actually switch the network namespace */
6096 	dev_net_set(dev, net);
6097 
6098 	/* If there is an ifindex conflict assign a new one */
6099 	if (__dev_get_by_index(net, dev->ifindex)) {
6100 		int iflink = (dev->iflink == dev->ifindex);
6101 		dev->ifindex = dev_new_index(net);
6102 		if (iflink)
6103 			dev->iflink = dev->ifindex;
6104 	}
6105 
6106 	/* Fixup kobjects */
6107 	err = device_rename(&dev->dev, dev->name);
6108 	WARN_ON(err);
6109 
6110 	/* Add the device back in the hashes */
6111 	list_netdevice(dev);
6112 
6113 	/* Notify protocols, that a new device appeared. */
6114 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6115 
6116 	/*
6117 	 *	Prevent userspace races by waiting until the network
6118 	 *	device is fully set up before sending notifications.
6119 	 */
6120 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6121 
6122 	synchronize_net();
6123 	err = 0;
6124 out:
6125 	return err;
6126 }
6127 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
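
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * caller moving a device into another namespace under the RTNL lock,
 * passing a "%d" fallback pattern in case the current name is already
 * taken in the destination namespace.
 */
static int example_move_dev(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "moved%d");
	rtnl_unlock();

	return err;
}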
6128 
6129 static int dev_cpu_callback(struct notifier_block *nfb,
6130 			    unsigned long action,
6131 			    void *ocpu)
6132 {
6133 	struct sk_buff **list_skb;
6134 	struct sk_buff *skb;
6135 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6136 	struct softnet_data *sd, *oldsd;
6137 
6138 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6139 		return NOTIFY_OK;
6140 
6141 	local_irq_disable();
6142 	cpu = smp_processor_id();
6143 	sd = &per_cpu(softnet_data, cpu);
6144 	oldsd = &per_cpu(softnet_data, oldcpu);
6145 
6146 	/* Find end of our completion_queue. */
6147 	list_skb = &sd->completion_queue;
6148 	while (*list_skb)
6149 		list_skb = &(*list_skb)->next;
6150 	/* Append completion queue from offline CPU. */
6151 	*list_skb = oldsd->completion_queue;
6152 	oldsd->completion_queue = NULL;
6153 
6154 	/* Append output queue from offline CPU. */
6155 	if (oldsd->output_queue) {
6156 		*sd->output_queue_tailp = oldsd->output_queue;
6157 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6158 		oldsd->output_queue = NULL;
6159 		oldsd->output_queue_tailp = &oldsd->output_queue;
6160 	}
6161 
6162 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6163 	local_irq_enable();
6164 
6165 	/* Process offline CPU's input_pkt_queue */
6166 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6167 		netif_rx(skb);
6168 		input_queue_head_incr(oldsd);
6169 	}
6170 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6171 		netif_rx(skb);
6172 		input_queue_head_incr(oldsd);
6173 	}
6174 
6175 	return NOTIFY_OK;
6176 }
6177 
6178 
6179 /**
6180  *	netdev_increment_features - increment feature set by one
6181  *	@all: current feature set
6182  *	@one: new feature set
6183  *	@mask: mask feature set
6184  *
6185  *	Computes a new feature set after adding a device with feature set
6186  *	@one to the master device with current feature set @all.  Will not
6187  *	enable anything that is off in @mask. Returns the new feature set.
6188  */
6189 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6190 {
6191 	if (mask & NETIF_F_GEN_CSUM)
6192 		mask |= NETIF_F_ALL_CSUM;
6193 	mask |= NETIF_F_VLAN_CHALLENGED;
6194 
6195 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6196 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6197 
6198 	/* If device needs checksumming, downgrade to it. */
6199 	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6200 		all &= ~NETIF_F_NO_CSUM;
6201 
6202 	/* If one device supports hw checksumming, set for all. */
6203 	if (all & NETIF_F_GEN_CSUM)
6204 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6205 
6206 	return all;
6207 }
6208 EXPORT_SYMBOL(netdev_increment_features);
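
/*
 * Illustrative sketch, not part of the original file: roughly how an
 * aggregating device (bridge/bonding style) might recompute its own
 * feature set from its lower devices.  Names are hypothetical.
 */
static u32 example_master_features(struct net_device *lowers[], int n, u32 mask)
{
	u32 features = mask;
	int i;

	if (!n)
		return features;

	features &= ~NETIF_F_ONE_FOR_ALL;
	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     lowers[i]->features, mask);

	return features;
}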
6209 
6210 static struct hlist_head *netdev_create_hash(void)
6211 {
6212 	int i;
6213 	struct hlist_head *hash;
6214 
6215 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6216 	if (hash != NULL)
6217 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6218 			INIT_HLIST_HEAD(&hash[i]);
6219 
6220 	return hash;
6221 }
6222 
6223 /* Initialize per network namespace state */
6224 static int __net_init netdev_init(struct net *net)
6225 {
6226 	INIT_LIST_HEAD(&net->dev_base_head);
6227 
6228 	net->dev_name_head = netdev_create_hash();
6229 	if (net->dev_name_head == NULL)
6230 		goto err_name;
6231 
6232 	net->dev_index_head = netdev_create_hash();
6233 	if (net->dev_index_head == NULL)
6234 		goto err_idx;
6235 
6236 	return 0;
6237 
6238 err_idx:
6239 	kfree(net->dev_name_head);
6240 err_name:
6241 	return -ENOMEM;
6242 }
6243 
6244 /**
6245  *	netdev_drivername - network driver for the device
6246  *	@dev: network device
6247  *	@buffer: buffer for resulting name
6248  *	@len: size of buffer
6249  *
6250  *	Determine network driver for device.
6251  */
6252 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6253 {
6254 	const struct device_driver *driver;
6255 	const struct device *parent;
6256 
6257 	if (len <= 0 || !buffer)
6258 		return buffer;
6259 	buffer[0] = 0;
6260 
6261 	parent = dev->dev.parent;
6262 
6263 	if (!parent)
6264 		return buffer;
6265 
6266 	driver = parent->driver;
6267 	if (driver && driver->name)
6268 		strlcpy(buffer, driver->name, len);
6269 	return buffer;
6270 }
6271 
6272 static int __netdev_printk(const char *level, const struct net_device *dev,
6273 			   struct va_format *vaf)
6274 {
6275 	int r;
6276 
6277 	if (dev && dev->dev.parent)
6278 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6279 			       netdev_name(dev), vaf);
6280 	else if (dev)
6281 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6282 	else
6283 		r = printk("%s(NULL net_device): %pV", level, vaf);
6284 
6285 	return r;
6286 }
6287 
6288 int netdev_printk(const char *level, const struct net_device *dev,
6289 		  const char *format, ...)
6290 {
6291 	struct va_format vaf;
6292 	va_list args;
6293 	int r;
6294 
6295 	va_start(args, format);
6296 
6297 	vaf.fmt = format;
6298 	vaf.va = &args;
6299 
6300 	r = __netdev_printk(level, dev, &vaf);
6301 	va_end(args);
6302 
6303 	return r;
6304 }
6305 EXPORT_SYMBOL(netdev_printk);
6306 
6307 #define define_netdev_printk_level(func, level)			\
6308 int func(const struct net_device *dev, const char *fmt, ...)	\
6309 {								\
6310 	int r;							\
6311 	struct va_format vaf;					\
6312 	va_list args;						\
6313 								\
6314 	va_start(args, fmt);					\
6315 								\
6316 	vaf.fmt = fmt;						\
6317 	vaf.va = &args;						\
6318 								\
6319 	r = __netdev_printk(level, dev, &vaf);			\
6320 	va_end(args);						\
6321 								\
6322 	return r;						\
6323 }								\
6324 EXPORT_SYMBOL(func);
6325 
6326 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6327 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6328 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6329 define_netdev_printk_level(netdev_err, KERN_ERR);
6330 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6331 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6332 define_netdev_printk_level(netdev_info, KERN_INFO);
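
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * driver using the level helpers generated above.  Messages are
 * prefixed with the bus/driver and interface name when available.
 */
static void example_report_link(struct net_device *dev, bool up, unsigned int mbps)
{
	if (up)
		netdev_info(dev, "link up, %u Mb/s\n", mbps);
	else
		netdev_warn(dev, "link down\n");
}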
6333 
6334 static void __net_exit netdev_exit(struct net *net)
6335 {
6336 	kfree(net->dev_name_head);
6337 	kfree(net->dev_index_head);
6338 }
6339 
6340 static struct pernet_operations __net_initdata netdev_net_ops = {
6341 	.init = netdev_init,
6342 	.exit = netdev_exit,
6343 };
6344 
6345 static void __net_exit default_device_exit(struct net *net)
6346 {
6347 	struct net_device *dev, *aux;
6348 	/*
6349 	 * Push all migratable network devices back to the
6350 	 * initial network namespace
6351 	 */
6352 	rtnl_lock();
6353 	for_each_netdev_safe(net, dev, aux) {
6354 		int err;
6355 		char fb_name[IFNAMSIZ];
6356 
6357 		/* Ignore unmoveable devices (i.e. loopback) */
6358 		if (dev->features & NETIF_F_NETNS_LOCAL)
6359 			continue;
6360 
6361 		/* Leave virtual devices for the generic cleanup */
6362 		if (dev->rtnl_link_ops)
6363 			continue;
6364 
6365 		/* Push remaining network devices to init_net */
6366 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6367 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6368 		if (err) {
6369 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6370 				__func__, dev->name, err);
6371 			BUG();
6372 		}
6373 	}
6374 	rtnl_unlock();
6375 }
6376 
6377 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6378 {
6379 	/* At exit, all network devices must be removed from a network
6380 	 * namespace.  Do this in the reverse order of registration.
6381 	 * Do this across as many network namespaces as possible to
6382 	 * improve batching efficiency.
6383 	 */
6384 	struct net_device *dev;
6385 	struct net *net;
6386 	LIST_HEAD(dev_kill_list);
6387 
6388 	rtnl_lock();
6389 	list_for_each_entry(net, net_list, exit_list) {
6390 		for_each_netdev_reverse(net, dev) {
6391 			if (dev->rtnl_link_ops)
6392 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6393 			else
6394 				unregister_netdevice_queue(dev, &dev_kill_list);
6395 		}
6396 	}
6397 	unregister_netdevice_many(&dev_kill_list);
6398 	list_del(&dev_kill_list);
6399 	rtnl_unlock();
6400 }
6401 
6402 static struct pernet_operations __net_initdata default_device_ops = {
6403 	.exit = default_device_exit,
6404 	.exit_batch = default_device_exit_batch,
6405 };
6406 
6407 /*
6408  *	Initialize the DEV module. At boot time this walks the device list and
6409  *	unhooks any devices that fail to initialise (normally hardware not
6410  *	present) and leaves us with a valid list of present and active devices.
6411  *
6412  */
6413 
6414 /*
6415  *	This is called single-threaded during boot, so there is no need
6416  *	to take the rtnl semaphore.
6417  */
6418 static int __init net_dev_init(void)
6419 {
6420 	int i, rc = -ENOMEM;
6421 
6422 	BUG_ON(!dev_boot_phase);
6423 
6424 	if (dev_proc_init())
6425 		goto out;
6426 
6427 	if (netdev_kobject_init())
6428 		goto out;
6429 
6430 	INIT_LIST_HEAD(&ptype_all);
6431 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6432 		INIT_LIST_HEAD(&ptype_base[i]);
6433 
6434 	if (register_pernet_subsys(&netdev_net_ops))
6435 		goto out;
6436 
6437 	/*
6438 	 *	Initialise the packet receive queues.
6439 	 */
6440 
6441 	for_each_possible_cpu(i) {
6442 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6443 
6444 		memset(sd, 0, sizeof(*sd));
6445 		skb_queue_head_init(&sd->input_pkt_queue);
6446 		skb_queue_head_init(&sd->process_queue);
6447 		sd->completion_queue = NULL;
6448 		INIT_LIST_HEAD(&sd->poll_list);
6449 		sd->output_queue = NULL;
6450 		sd->output_queue_tailp = &sd->output_queue;
6451 #ifdef CONFIG_RPS
6452 		sd->csd.func = rps_trigger_softirq;
6453 		sd->csd.info = sd;
6454 		sd->csd.flags = 0;
6455 		sd->cpu = i;
6456 #endif
6457 
6458 		sd->backlog.poll = process_backlog;
6459 		sd->backlog.weight = weight_p;
6460 		sd->backlog.gro_list = NULL;
6461 		sd->backlog.gro_count = 0;
6462 	}
6463 
6464 	dev_boot_phase = 0;
6465 
6466 	/* The loopback device is special: if any other network device
6467 	 * is present in a network namespace, the loopback device must
6468 	 * be present too. Since we now dynamically allocate and free the
6469 	 * loopback device, ensure this invariant is maintained by
6470 	 * keeping the loopback device as the first device on the
6471 	 * list of network devices, ensuring the loopback device
6472 	 * is the first device that appears and the last network device
6473 	 * that disappears.
6474 	 */
6475 	if (register_pernet_device(&loopback_net_ops))
6476 		goto out;
6477 
6478 	if (register_pernet_device(&default_device_ops))
6479 		goto out;
6480 
6481 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6482 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6483 
6484 	hotcpu_notifier(dev_cpu_callback, 0);
6485 	dst_init();
6486 	dev_mcast_init();
6487 	rc = 0;
6488 out:
6489 	return rc;
6490 }
6491 
6492 subsys_initcall(net_dev_init);
6493 
6494 static int __init initialize_hashrnd(void)
6495 {
6496 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6497 	return 0;
6498 }
6499 
6500 late_initcall_sync(initialize_hashrnd);
6501 
6502