xref: /linux-6.15/net/core/dev.c (revision facb4edc)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 /*
145  *	The list of packet types we will receive (as opposed to discard)
146  *	and the routines to invoke.
147  *
148  *	Why 16. Because with 16 the only overlap we get on a hash of the
149  *	low nibble of the protocol value is RARP/SNAP/X.25.
150  *
151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
152  *             sure which should go first, but I bet it won't make much
153  *             difference if we are running VLANs.  The good news is that
154  *             this protocol won't be in the list unless compiled in, so
155  *             the average user (w/out VLANs) will not be adversely affected.
156  *             --BLG
157  *
158  *		0800	IP
159  *		8100    802.1Q VLAN
160  *		0001	802.3
161  *		0002	AX.25
162  *		0004	802.2
163  *		8035	RARP
164  *		0005	SNAP
165  *		0805	X.25
166  *		0806	ARP
167  *		8137	IPX
168  *		0009	Localtalk
169  *		86DD	IPv6
170  */
171 
172 #define PTYPE_HASH_SIZE	(16)
173 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
174 
175 static DEFINE_SPINLOCK(ptype_lock);
176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
177 static struct list_head ptype_all __read_mostly;	/* Taps */
178 
179 /*
180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
181  * semaphore.
182  *
183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
184  *
185  * Writers must hold the rtnl semaphore while they loop through the
186  * dev_base_head list, and hold dev_base_lock for writing when they do the
187  * actual updates.  This allows pure readers to access the list even
188  * while a writer is preparing to update it.
189  *
190  * To put it another way, dev_base_lock is held for writing only to
191  * protect against pure readers; the rtnl semaphore provides the
192  * protection against other writers.
193  *
194  * See, for example usages, register_netdevice() and
195  * unregister_netdevice(), which must be called with the rtnl
196  * semaphore held.
197  */
198 DEFINE_RWLOCK(dev_base_lock);
199 EXPORT_SYMBOL(dev_base_lock);
200 
201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
202 {
203 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
204 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
205 }
206 
207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
208 {
209 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
210 }
211 
212 static inline void rps_lock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_lock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 static inline void rps_unlock(struct softnet_data *sd)
220 {
221 #ifdef CONFIG_RPS
222 	spin_unlock(&sd->input_pkt_queue.lock);
223 #endif
224 }
225 
226 /* Device list insertion */
227 static int list_netdevice(struct net_device *dev)
228 {
229 	struct net *net = dev_net(dev);
230 
231 	ASSERT_RTNL();
232 
233 	write_lock_bh(&dev_base_lock);
234 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
235 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
236 	hlist_add_head_rcu(&dev->index_hlist,
237 			   dev_index_hash(net, dev->ifindex));
238 	write_unlock_bh(&dev_base_lock);
239 	return 0;
240 }
241 
242 /* Device list removal
243  * caller must respect a RCU grace period before freeing/reusing dev
244  */
245 static void unlist_netdevice(struct net_device *dev)
246 {
247 	ASSERT_RTNL();
248 
249 	/* Unlink dev from the device chain */
250 	write_lock_bh(&dev_base_lock);
251 	list_del_rcu(&dev->dev_list);
252 	hlist_del_rcu(&dev->name_hlist);
253 	hlist_del_rcu(&dev->index_hlist);
254 	write_unlock_bh(&dev_base_lock);
255 }
256 
257 /*
258  *	Our notifier list
259  */
260 
261 static RAW_NOTIFIER_HEAD(netdev_chain);
262 
263 /*
264  *	Device drivers call our routines to queue packets here. We empty the
265  *	queue in the local softnet handler.
266  */
267 
268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269 EXPORT_PER_CPU_SYMBOL(softnet_data);
270 
271 #ifdef CONFIG_LOCKDEP
272 /*
273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
274  * according to dev->type
275  */
276 static const unsigned short netdev_lock_type[] =
277 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
278 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
279 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
280 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
281 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
282 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
283 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
284 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
285 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
286 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
287 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
288 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
289 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
290 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
291 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
292 	 ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
308 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
309 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
310 	 "_xmit_VOID", "_xmit_NONE"};
311 
312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 
315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
316 {
317 	int i;
318 
319 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
320 		if (netdev_lock_type[i] == dev_type)
321 			return i;
322 	/* the last key is used by default */
323 	return ARRAY_SIZE(netdev_lock_type) - 1;
324 }
325 
326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
327 						 unsigned short dev_type)
328 {
329 	int i;
330 
331 	i = netdev_lock_pos(dev_type);
332 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
333 				   netdev_lock_name[i]);
334 }
335 
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 	int i;
339 
340 	i = netdev_lock_pos(dev->type);
341 	lockdep_set_class_and_name(&dev->addr_list_lock,
342 				   &netdev_addr_lock_key[i],
343 				   netdev_lock_name[i]);
344 }
345 #else
346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
347 						 unsigned short dev_type)
348 {
349 }
350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351 {
352 }
353 #endif
354 
355 /*******************************************************************************
356 
357 		Protocol management and registration routines
358 
359 *******************************************************************************/
360 
361 /*
362  *	Add a protocol ID to the list. Now that the input handler is
363  *	smarter we can dispense with all the messy stuff that used to be
364  *	here.
365  *
366  *	BEWARE!!! Protocol handlers, mangling input packets,
367  *	MUST BE last in hash buckets and checking protocol handlers
368  *	MUST start from promiscuous ptype_all chain in net_bh.
369  *	It is true now, do not change it.
370  *	Explanation follows: if protocol handler, mangling packet, will
371  *	be the first on list, it is not able to sense, that packet
372  *	is cloned and should be copied-on-write, so that it will
373  *	change it and subsequent readers will get broken packet.
374  *							--ANK (980803)
375  */
376 
377 static inline struct list_head *ptype_head(const struct packet_type *pt)
378 {
379 	if (pt->type == htons(ETH_P_ALL))
380 		return &ptype_all;
381 	else
382 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep therefore it can not
394  *	guarantee all CPU's that are in middle of receiving packets
395  *	will see the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *      The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPU's have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
460 
461 /******************************************************************************
462 
463 		      Device Boot-time Settings Routines
464 
465 *******************************************************************************/
466 
467 /* Boot time configuration table */
468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
469 
470 /**
471  *	netdev_boot_setup_add	- add new setup entry
472  *	@name: name of the device
473  *	@map: configured settings for the device
474  *
475  *	Adds new setup entry to the dev_boot_setup list.  The function
476  *	returns 0 on error and 1 on success.  This is a generic routine to
477  *	all netdevices.
478  */
479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
480 {
481 	struct netdev_boot_setup *s;
482 	int i;
483 
484 	s = dev_boot_setup;
485 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
486 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
487 			memset(s[i].name, 0, sizeof(s[i].name));
488 			strlcpy(s[i].name, name, IFNAMSIZ);
489 			memcpy(&s[i].map, map, sizeof(s[i].map));
490 			break;
491 		}
492 	}
493 
494 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
495 }
496 
497 /**
498  *	netdev_boot_setup_check	- check boot time settings
499  *	@dev: the netdevice
500  *
501  * 	Check boot time settings for the device.
502  *	The found settings are set for the device to be used
503  *	later in the device probing.
504  *	Returns 0 if no settings found, 1 if they are.
505  */
506 int netdev_boot_setup_check(struct net_device *dev)
507 {
508 	struct netdev_boot_setup *s = dev_boot_setup;
509 	int i;
510 
511 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
512 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
513 		    !strcmp(dev->name, s[i].name)) {
514 			dev->irq 	= s[i].map.irq;
515 			dev->base_addr 	= s[i].map.base_addr;
516 			dev->mem_start 	= s[i].map.mem_start;
517 			dev->mem_end 	= s[i].map.mem_end;
518 			return 1;
519 		}
520 	}
521 	return 0;
522 }
523 EXPORT_SYMBOL(netdev_boot_setup_check);
524 
525 
526 /**
527  *	netdev_boot_base	- get address from boot time settings
528  *	@prefix: prefix for network device
529  *	@unit: id for network device
530  *
531  * 	Check boot time settings for the base address of device.
532  *	The found settings are set for the device to be used
533  *	later in the device probing.
534  *	Returns 0 if no settings found.
535  */
536 unsigned long netdev_boot_base(const char *prefix, int unit)
537 {
538 	const struct netdev_boot_setup *s = dev_boot_setup;
539 	char name[IFNAMSIZ];
540 	int i;
541 
542 	sprintf(name, "%s%d", prefix, unit);
543 
544 	/*
545 	 * If device already registered then return base of 1
546 	 * to indicate not to probe for this interface
547 	 */
548 	if (__dev_get_by_name(&init_net, name))
549 		return 1;
550 
551 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
552 		if (!strcmp(name, s[i].name))
553 			return s[i].map.base_addr;
554 	return 0;
555 }
556 
557 /*
558  * Saves at boot time configured settings for any netdevice.
559  */
560 int __init netdev_boot_setup(char *str)
561 {
562 	int ints[5];
563 	struct ifmap map;
564 
565 	str = get_options(str, ARRAY_SIZE(ints), ints);
566 	if (!str || !*str)
567 		return 0;
568 
569 	/* Save settings */
570 	memset(&map, 0, sizeof(map));
571 	if (ints[0] > 0)
572 		map.irq = ints[1];
573 	if (ints[0] > 1)
574 		map.base_addr = ints[2];
575 	if (ints[0] > 2)
576 		map.mem_start = ints[3];
577 	if (ints[0] > 3)
578 		map.mem_end = ints[4];
579 
580 	/* Add new entry to the list */
581 	return netdev_boot_setup_add(str, &map);
582 }
583 
584 __setup("netdev=", netdev_boot_setup);
585 
586 /*******************************************************************************
587 
588 			    Device Interface Subroutines
589 
590 *******************************************************************************/
591 
592 /**
593  *	__dev_get_by_name	- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. Must be called under RTNL semaphore
598  *	or @dev_base_lock. If the name is found a pointer to the device
599  *	is returned. If the name is not found then %NULL is returned. The
600  *	reference counters are not incremented so the caller must be
601  *	careful with locks.
602  */
603 
604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct hlist_node *p;
607 	struct net_device *dev;
608 	struct hlist_head *head = dev_name_hash(net, name);
609 
610 	hlist_for_each_entry(dev, p, head, name_hlist)
611 		if (!strncmp(dev->name, name, IFNAMSIZ))
612 			return dev;
613 
614 	return NULL;
615 }
616 EXPORT_SYMBOL(__dev_get_by_name);
617 
618 /**
619  *	dev_get_by_name_rcu	- find a device by its name
620  *	@net: the applicable net namespace
621  *	@name: name to find
622  *
623  *	Find an interface by name.
624  *	If the name is found a pointer to the device is returned.
625  * 	If the name is not found then %NULL is returned.
626  *	The reference counters are not incremented so the caller must be
627  *	careful with locks. The caller must hold RCU lock.
628  */
629 
630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
631 {
632 	struct hlist_node *p;
633 	struct net_device *dev;
634 	struct hlist_head *head = dev_name_hash(net, name);
635 
636 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
637 		if (!strncmp(dev->name, name, IFNAMSIZ))
638 			return dev;
639 
640 	return NULL;
641 }
642 EXPORT_SYMBOL(dev_get_by_name_rcu);
643 
644 /**
645  *	dev_get_by_name		- find a device by its name
646  *	@net: the applicable net namespace
647  *	@name: name to find
648  *
649  *	Find an interface by name. This can be called from any
650  *	context and does its own locking. The returned handle has
651  *	the usage count incremented and the caller must use dev_put() to
652  *	release it when it is no longer needed. %NULL is returned if no
653  *	matching device is found.
654  */
655 
656 struct net_device *dev_get_by_name(struct net *net, const char *name)
657 {
658 	struct net_device *dev;
659 
660 	rcu_read_lock();
661 	dev = dev_get_by_name_rcu(net, name);
662 	if (dev)
663 		dev_hold(dev);
664 	rcu_read_unlock();
665 	return dev;
666 }
667 EXPORT_SYMBOL(dev_get_by_name);
668 
669 /**
670  *	__dev_get_by_index - find a device by its ifindex
671  *	@net: the applicable net namespace
672  *	@ifindex: index of device
673  *
674  *	Search for an interface by index. Returns %NULL if the device
675  *	is not found or a pointer to the device. The device has not
676  *	had its reference counter increased so the caller must be careful
677  *	about locking. The caller must hold either the RTNL semaphore
678  *	or @dev_base_lock.
679  */
680 
681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
682 {
683 	struct hlist_node *p;
684 	struct net_device *dev;
685 	struct hlist_head *head = dev_index_hash(net, ifindex);
686 
687 	hlist_for_each_entry(dev, p, head, index_hlist)
688 		if (dev->ifindex == ifindex)
689 			return dev;
690 
691 	return NULL;
692 }
693 EXPORT_SYMBOL(__dev_get_by_index);
694 
695 /**
696  *	dev_get_by_index_rcu - find a device by its ifindex
697  *	@net: the applicable net namespace
698  *	@ifindex: index of device
699  *
700  *	Search for an interface by index. Returns %NULL if the device
701  *	is not found or a pointer to the device. The device has not
702  *	had its reference counter increased so the caller must be careful
703  *	about locking. The caller must hold RCU lock.
704  */
705 
706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
707 {
708 	struct hlist_node *p;
709 	struct net_device *dev;
710 	struct hlist_head *head = dev_index_hash(net, ifindex);
711 
712 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
713 		if (dev->ifindex == ifindex)
714 			return dev;
715 
716 	return NULL;
717 }
718 EXPORT_SYMBOL(dev_get_by_index_rcu);
719 
720 
721 /**
722  *	dev_get_by_index - find a device by its ifindex
723  *	@net: the applicable net namespace
724  *	@ifindex: index of device
725  *
726  *	Search for an interface by index. Returns NULL if the device
727  *	is not found or a pointer to the device. The device returned has
728  *	had a reference added and the pointer is safe until the user calls
729  *	dev_put to indicate they have finished with it.
730  */
731 
732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
733 {
734 	struct net_device *dev;
735 
736 	rcu_read_lock();
737 	dev = dev_get_by_index_rcu(net, ifindex);
738 	if (dev)
739 		dev_hold(dev);
740 	rcu_read_unlock();
741 	return dev;
742 }
743 EXPORT_SYMBOL(dev_get_by_index);
744 
745 /**
746  *	dev_getbyhwaddr_rcu - find a device by its hardware address
747  *	@net: the applicable net namespace
748  *	@type: media type of device
749  *	@ha: hardware address
750  *
751  *	Search for an interface by MAC address. Returns NULL if the device
752  *	is not found or a pointer to the device. The caller must hold RCU
753  *	The returned device has not had its ref count increased
754  *	and the caller must therefore be careful about locking
755  *
756  */
757 
758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
759 				       const char *ha)
760 {
761 	struct net_device *dev;
762 
763 	for_each_netdev_rcu(net, dev)
764 		if (dev->type == type &&
765 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
766 			return dev;
767 
768 	return NULL;
769 }
770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
771 
772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
773 {
774 	struct net_device *dev;
775 
776 	ASSERT_RTNL();
777 	for_each_netdev(net, dev)
778 		if (dev->type == type)
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
784 
785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787 	struct net_device *dev, *ret = NULL;
788 
789 	rcu_read_lock();
790 	for_each_netdev_rcu(net, dev)
791 		if (dev->type == type) {
792 			dev_hold(dev);
793 			ret = dev;
794 			break;
795 		}
796 	rcu_read_unlock();
797 	return ret;
798 }
799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
800 
801 /**
802  *	dev_get_by_flags_rcu - find any device with given flags
803  *	@net: the applicable net namespace
804  *	@if_flags: IFF_* values
805  *	@mask: bitmask of bits in if_flags to check
806  *
807  *	Search for any interface with the given flags. Returns NULL if a device
808  *	is not found or a pointer to the device. Must be called inside
809  *	rcu_read_lock(), and result refcount is unchanged.
810  */
811 
812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
813 				    unsigned short mask)
814 {
815 	struct net_device *dev, *ret;
816 
817 	ret = NULL;
818 	for_each_netdev_rcu(net, dev) {
819 		if (((dev->flags ^ if_flags) & mask) == 0) {
820 			ret = dev;
821 			break;
822 		}
823 	}
824 	return ret;
825 }
826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
827 
828 /**
829  *	dev_valid_name - check if name is okay for network device
830  *	@name: name string
831  *
832  *	Network device names need to be valid file names to
833  *	to allow sysfs to work.  We also disallow any kind of
834  *	whitespace.
835  */
836 int dev_valid_name(const char *name)
837 {
838 	if (*name == '\0')
839 		return 0;
840 	if (strlen(name) >= IFNAMSIZ)
841 		return 0;
842 	if (!strcmp(name, ".") || !strcmp(name, ".."))
843 		return 0;
844 
845 	while (*name) {
846 		if (*name == '/' || isspace(*name))
847 			return 0;
848 		name++;
849 	}
850 	return 1;
851 }
852 EXPORT_SYMBOL(dev_valid_name);
853 
854 /**
855  *	__dev_alloc_name - allocate a name for a device
856  *	@net: network namespace to allocate the device name in
857  *	@name: name format string
858  *	@buf:  scratch buffer and result name string
859  *
860  *	Passed a format string - eg "lt%d" it will try and find a suitable
861  *	id. It scans list of devices to build up a free map, then chooses
862  *	the first empty slot. The caller must hold the dev_base or rtnl lock
863  *	while allocating the name and adding the device in order to avoid
864  *	duplicates.
865  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
866  *	Returns the number of the unit assigned or a negative errno code.
867  */
868 
869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
870 {
871 	int i = 0;
872 	const char *p;
873 	const int max_netdevices = 8*PAGE_SIZE;
874 	unsigned long *inuse;
875 	struct net_device *d;
876 
877 	p = strnchr(name, IFNAMSIZ-1, '%');
878 	if (p) {
879 		/*
880 		 * Verify the string as this thing may have come from
881 		 * the user.  There must be either one "%d" and no other "%"
882 		 * characters.
883 		 */
884 		if (p[1] != 'd' || strchr(p + 2, '%'))
885 			return -EINVAL;
886 
887 		/* Use one page as a bit array of possible slots */
888 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
889 		if (!inuse)
890 			return -ENOMEM;
891 
892 		for_each_netdev(net, d) {
893 			if (!sscanf(d->name, name, &i))
894 				continue;
895 			if (i < 0 || i >= max_netdevices)
896 				continue;
897 
898 			/*  avoid cases where sscanf is not exact inverse of printf */
899 			snprintf(buf, IFNAMSIZ, name, i);
900 			if (!strncmp(buf, d->name, IFNAMSIZ))
901 				set_bit(i, inuse);
902 		}
903 
904 		i = find_first_zero_bit(inuse, max_netdevices);
905 		free_page((unsigned long) inuse);
906 	}
907 
908 	if (buf != name)
909 		snprintf(buf, IFNAMSIZ, name, i);
910 	if (!__dev_get_by_name(net, buf))
911 		return i;
912 
913 	/* It is possible to run out of possible slots
914 	 * when the name is long and there isn't enough space left
915 	 * for the digits, or if all bits are used.
916 	 */
917 	return -ENFILE;
918 }
919 
920 /**
921  *	dev_alloc_name - allocate a name for a device
922  *	@dev: device
923  *	@name: name format string
924  *
925  *	Passed a format string - eg "lt%d" it will try and find a suitable
926  *	id. It scans list of devices to build up a free map, then chooses
927  *	the first empty slot. The caller must hold the dev_base or rtnl lock
928  *	while allocating the name and adding the device in order to avoid
929  *	duplicates.
930  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
931  *	Returns the number of the unit assigned or a negative errno code.
932  */
933 
934 int dev_alloc_name(struct net_device *dev, const char *name)
935 {
936 	char buf[IFNAMSIZ];
937 	struct net *net;
938 	int ret;
939 
940 	BUG_ON(!dev_net(dev));
941 	net = dev_net(dev);
942 	ret = __dev_alloc_name(net, name, buf);
943 	if (ret >= 0)
944 		strlcpy(dev->name, buf, IFNAMSIZ);
945 	return ret;
946 }
947 EXPORT_SYMBOL(dev_alloc_name);
948 
949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
950 {
951 	struct net *net;
952 
953 	BUG_ON(!dev_net(dev));
954 	net = dev_net(dev);
955 
956 	if (!dev_valid_name(name))
957 		return -EINVAL;
958 
959 	if (fmt && strchr(name, '%'))
960 		return dev_alloc_name(dev, name);
961 	else if (__dev_get_by_name(net, name))
962 		return -EEXIST;
963 	else if (dev->name != name)
964 		strlcpy(dev->name, name, IFNAMSIZ);
965 
966 	return 0;
967 }
968 
969 /**
970  *	dev_change_name - change name of a device
971  *	@dev: device
972  *	@newname: name (or format string) must be at least IFNAMSIZ
973  *
974  *	Change name of a device, can pass format strings "eth%d".
975  *	for wildcarding.
976  */
977 int dev_change_name(struct net_device *dev, const char *newname)
978 {
979 	char oldname[IFNAMSIZ];
980 	int err = 0;
981 	int ret;
982 	struct net *net;
983 
984 	ASSERT_RTNL();
985 	BUG_ON(!dev_net(dev));
986 
987 	net = dev_net(dev);
988 	if (dev->flags & IFF_UP)
989 		return -EBUSY;
990 
991 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
992 		return 0;
993 
994 	memcpy(oldname, dev->name, IFNAMSIZ);
995 
996 	err = dev_get_valid_name(dev, newname, 1);
997 	if (err < 0)
998 		return err;
999 
1000 rollback:
1001 	ret = device_rename(&dev->dev, dev->name);
1002 	if (ret) {
1003 		memcpy(dev->name, oldname, IFNAMSIZ);
1004 		return ret;
1005 	}
1006 
1007 	write_lock_bh(&dev_base_lock);
1008 	hlist_del(&dev->name_hlist);
1009 	write_unlock_bh(&dev_base_lock);
1010 
1011 	synchronize_rcu();
1012 
1013 	write_lock_bh(&dev_base_lock);
1014 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015 	write_unlock_bh(&dev_base_lock);
1016 
1017 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018 	ret = notifier_to_errno(ret);
1019 
1020 	if (ret) {
1021 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1022 		if (err >= 0) {
1023 			err = ret;
1024 			memcpy(dev->name, oldname, IFNAMSIZ);
1025 			goto rollback;
1026 		} else {
1027 			printk(KERN_ERR
1028 			       "%s: name change rollback failed: %d.\n",
1029 			       dev->name, ret);
1030 		}
1031 	}
1032 
1033 	return err;
1034 }
1035 
1036 /**
1037  *	dev_set_alias - change ifalias of a device
1038  *	@dev: device
1039  *	@alias: name up to IFALIASZ
1040  *	@len: limit of bytes to copy from info
1041  *
1042  *	Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046 	ASSERT_RTNL();
1047 
1048 	if (len >= IFALIASZ)
1049 		return -EINVAL;
1050 
1051 	if (!len) {
1052 		if (dev->ifalias) {
1053 			kfree(dev->ifalias);
1054 			dev->ifalias = NULL;
1055 		}
1056 		return 0;
1057 	}
1058 
1059 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060 	if (!dev->ifalias)
1061 		return -ENOMEM;
1062 
1063 	strlcpy(dev->ifalias, alias, len+1);
1064 	return len;
1065 }
1066 
1067 
1068 /**
1069  *	netdev_features_change - device changes features
1070  *	@dev: device to cause notification
1071  *
1072  *	Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079 
1080 /**
1081  *	netdev_state_change - device changes state
1082  *	@dev: device to cause notification
1083  *
1084  *	Called to indicate a device has changed state. This function calls
1085  *	the notifier chains for netdev_chain and sends a NEWLINK message
1086  *	to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090 	if (dev->flags & IFF_UP) {
1091 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093 	}
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096 
1097 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1098 {
1099 	return call_netdevice_notifiers(event, dev);
1100 }
1101 EXPORT_SYMBOL(netdev_bonding_change);
1102 
1103 /**
1104  *	dev_load 	- load a network module
1105  *	@net: the applicable net namespace
1106  *	@name: name of interface
1107  *
1108  *	If a network interface is not present and the process has suitable
1109  *	privileges this function loads the module. If module loading is not
1110  *	available in this kernel then it becomes a nop.
1111  */
1112 
1113 void dev_load(struct net *net, const char *name)
1114 {
1115 	struct net_device *dev;
1116 
1117 	rcu_read_lock();
1118 	dev = dev_get_by_name_rcu(net, name);
1119 	rcu_read_unlock();
1120 
1121 	if (!dev && capable(CAP_NET_ADMIN))
1122 		request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125 
1126 static int __dev_open(struct net_device *dev)
1127 {
1128 	const struct net_device_ops *ops = dev->netdev_ops;
1129 	int ret;
1130 
1131 	ASSERT_RTNL();
1132 
1133 	/*
1134 	 *	Is it even present?
1135 	 */
1136 	if (!netif_device_present(dev))
1137 		return -ENODEV;
1138 
1139 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140 	ret = notifier_to_errno(ret);
1141 	if (ret)
1142 		return ret;
1143 
1144 	/*
1145 	 *	Call device private open method
1146 	 */
1147 	set_bit(__LINK_STATE_START, &dev->state);
1148 
1149 	if (ops->ndo_validate_addr)
1150 		ret = ops->ndo_validate_addr(dev);
1151 
1152 	if (!ret && ops->ndo_open)
1153 		ret = ops->ndo_open(dev);
1154 
1155 	/*
1156 	 *	If it went open OK then:
1157 	 */
1158 
1159 	if (ret)
1160 		clear_bit(__LINK_STATE_START, &dev->state);
1161 	else {
1162 		/*
1163 		 *	Set the flags.
1164 		 */
1165 		dev->flags |= IFF_UP;
1166 
1167 		/*
1168 		 *	Enable NET_DMA
1169 		 */
1170 		net_dmaengine_get();
1171 
1172 		/*
1173 		 *	Initialize multicasting status
1174 		 */
1175 		dev_set_rx_mode(dev);
1176 
1177 		/*
1178 		 *	Wakeup transmit queue engine
1179 		 */
1180 		dev_activate(dev);
1181 	}
1182 
1183 	return ret;
1184 }
1185 
1186 /**
1187  *	dev_open	- prepare an interface for use.
1188  *	@dev:	device to open
1189  *
1190  *	Takes a device from down to up state. The device's private open
1191  *	function is invoked and then the multicast lists are loaded. Finally
1192  *	the device is moved into the up state and a %NETDEV_UP message is
1193  *	sent to the netdev notifier chain.
1194  *
1195  *	Calling this function on an active interface is a nop. On a failure
1196  *	a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200 	int ret;
1201 
1202 	/*
1203 	 *	Is it already up?
1204 	 */
1205 	if (dev->flags & IFF_UP)
1206 		return 0;
1207 
1208 	/*
1209 	 *	Open device
1210 	 */
1211 	ret = __dev_open(dev);
1212 	if (ret < 0)
1213 		return ret;
1214 
1215 	/*
1216 	 *	... and announce new interface.
1217 	 */
1218 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219 	call_netdevice_notifiers(NETDEV_UP, dev);
1220 
1221 	return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224 
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227 	struct net_device *dev;
1228 
1229 	ASSERT_RTNL();
1230 	might_sleep();
1231 
1232 	list_for_each_entry(dev, head, unreg_list) {
1233 		/*
1234 		 *	Tell people we are going down, so that they can
1235 		 *	prepare to death, when device is still operating.
1236 		 */
1237 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238 
1239 		clear_bit(__LINK_STATE_START, &dev->state);
1240 
1241 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1242 		 * can be even on different cpu. So just clear netif_running().
1243 		 *
1244 		 * dev->stop() will invoke napi_disable() on all of it's
1245 		 * napi_struct instances on this device.
1246 		 */
1247 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248 	}
1249 
1250 	dev_deactivate_many(head);
1251 
1252 	list_for_each_entry(dev, head, unreg_list) {
1253 		const struct net_device_ops *ops = dev->netdev_ops;
1254 
1255 		/*
1256 		 *	Call the device specific close. This cannot fail.
1257 		 *	Only if device is UP
1258 		 *
1259 		 *	We allow it to be called even after a DETACH hot-plug
1260 		 *	event.
1261 		 */
1262 		if (ops->ndo_stop)
1263 			ops->ndo_stop(dev);
1264 
1265 		/*
1266 		 *	Device is now down.
1267 		 */
1268 
1269 		dev->flags &= ~IFF_UP;
1270 
1271 		/*
1272 		 *	Shutdown NET_DMA
1273 		 */
1274 		net_dmaengine_put();
1275 	}
1276 
1277 	return 0;
1278 }
1279 
1280 static int __dev_close(struct net_device *dev)
1281 {
1282 	LIST_HEAD(single);
1283 
1284 	list_add(&dev->unreg_list, &single);
1285 	return __dev_close_many(&single);
1286 }
1287 
1288 int dev_close_many(struct list_head *head)
1289 {
1290 	struct net_device *dev, *tmp;
1291 	LIST_HEAD(tmp_list);
1292 
1293 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294 		if (!(dev->flags & IFF_UP))
1295 			list_move(&dev->unreg_list, &tmp_list);
1296 
1297 	__dev_close_many(head);
1298 
1299 	/*
1300 	 * Tell people we are down
1301 	 */
1302 	list_for_each_entry(dev, head, unreg_list) {
1303 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1305 	}
1306 
1307 	/* rollback_registered_many needs the complete original list */
1308 	list_splice(&tmp_list, head);
1309 	return 0;
1310 }
1311 
1312 /**
1313  *	dev_close - shutdown an interface.
1314  *	@dev: device to shutdown
1315  *
1316  *	This function moves an active device into down state. A
1317  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *	chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323 	LIST_HEAD(single);
1324 
1325 	list_add(&dev->unreg_list, &single);
1326 	dev_close_many(&single);
1327 
1328 	return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331 
1332 
1333 /**
1334  *	dev_disable_lro - disable Large Receive Offload on a device
1335  *	@dev: device
1336  *
1337  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *	called under RTNL.  This is needed if received packets may be
1339  *	forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344 	    dev->ethtool_ops->set_flags) {
1345 		u32 flags = dev->ethtool_ops->get_flags(dev);
1346 		if (flags & ETH_FLAG_LRO) {
1347 			flags &= ~ETH_FLAG_LRO;
1348 			dev->ethtool_ops->set_flags(dev, flags);
1349 		}
1350 	}
1351 	WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354 
1355 
1356 static int dev_boot_phase = 1;
1357 
1358 /*
1359  *	Device change register/unregister. These are not inline or static
1360  *	as we export them to the world.
1361  */
1362 
1363 /**
1364  *	register_netdevice_notifier - register a network notifier block
1365  *	@nb: notifier
1366  *
1367  *	Register a notifier to be called when network device events occur.
1368  *	The notifier passed is linked into the kernel structures and must
1369  *	not be reused until it has been unregistered. A negative errno code
1370  *	is returned on a failure.
1371  *
1372  * 	When registered all registration and up events are replayed
1373  *	to the new notifier to allow device to have a race free
1374  *	view of the network device list.
1375  */
1376 
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379 	struct net_device *dev;
1380 	struct net_device *last;
1381 	struct net *net;
1382 	int err;
1383 
1384 	rtnl_lock();
1385 	err = raw_notifier_chain_register(&netdev_chain, nb);
1386 	if (err)
1387 		goto unlock;
1388 	if (dev_boot_phase)
1389 		goto unlock;
1390 	for_each_net(net) {
1391 		for_each_netdev(net, dev) {
1392 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393 			err = notifier_to_errno(err);
1394 			if (err)
1395 				goto rollback;
1396 
1397 			if (!(dev->flags & IFF_UP))
1398 				continue;
1399 
1400 			nb->notifier_call(nb, NETDEV_UP, dev);
1401 		}
1402 	}
1403 
1404 unlock:
1405 	rtnl_unlock();
1406 	return err;
1407 
1408 rollback:
1409 	last = dev;
1410 	for_each_net(net) {
1411 		for_each_netdev(net, dev) {
1412 			if (dev == last)
1413 				break;
1414 
1415 			if (dev->flags & IFF_UP) {
1416 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1418 			}
1419 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421 		}
1422 	}
1423 
1424 	raw_notifier_chain_unregister(&netdev_chain, nb);
1425 	goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428 
1429 /**
1430  *	unregister_netdevice_notifier - unregister a network notifier block
1431  *	@nb: notifier
1432  *
1433  *	Unregister a notifier previously registered by
1434  *	register_netdevice_notifier(). The notifier is unlinked into the
1435  *	kernel structures and may then be reused. A negative errno code
1436  *	is returned on a failure.
1437  */
1438 
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441 	int err;
1442 
1443 	rtnl_lock();
1444 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445 	rtnl_unlock();
1446 	return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449 
1450 /**
1451  *	call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *	Call all network notifier blocks.  Parameters and return value
1456  *	are as for raw_notifier_call_chain().
1457  */
1458 
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461 	ASSERT_RTNL();
1462 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464 
1465 /* When > 0 there are consumers of rx skb time stamps */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467 
1468 void net_enable_timestamp(void)
1469 {
1470 	atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473 
1474 void net_disable_timestamp(void)
1475 {
1476 	atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479 
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482 	if (atomic_read(&netstamp_needed))
1483 		__net_timestamp(skb);
1484 	else
1485 		skb->tstamp.tv64 = 0;
1486 }
1487 
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491 		__net_timestamp(skb);
1492 }
1493 
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *	NET_RX_SUCCESS	(no congestion)
1502  *	NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514 	skb_orphan(skb);
1515 	nf_reset(skb);
1516 
1517 	if (unlikely(!(dev->flags & IFF_UP) ||
1518 		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519 		atomic_long_inc(&dev->rx_dropped);
1520 		kfree_skb(skb);
1521 		return NET_RX_DROP;
1522 	}
1523 	skb_set_dev(skb, dev);
1524 	skb->tstamp.tv64 = 0;
1525 	skb->pkt_type = PACKET_HOST;
1526 	skb->protocol = eth_type_trans(skb, dev);
1527 	return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530 
1531 static inline int deliver_skb(struct sk_buff *skb,
1532 			      struct packet_type *pt_prev,
1533 			      struct net_device *orig_dev)
1534 {
1535 	atomic_inc(&skb->users);
1536 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538 
1539 /*
1540  *	Support routine. Sends outgoing frames to any network
1541  *	taps currently in use.
1542  */
1543 
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546 	struct packet_type *ptype;
1547 	struct sk_buff *skb2 = NULL;
1548 	struct packet_type *pt_prev = NULL;
1549 
1550 	rcu_read_lock();
1551 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552 		/* Never send packets back to the socket
1553 		 * they originated from - MvS ([email protected])
1554 		 */
1555 		if ((ptype->dev == dev || !ptype->dev) &&
1556 		    (ptype->af_packet_priv == NULL ||
1557 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558 			if (pt_prev) {
1559 				deliver_skb(skb2, pt_prev, skb->dev);
1560 				pt_prev = ptype;
1561 				continue;
1562 			}
1563 
1564 			skb2 = skb_clone(skb, GFP_ATOMIC);
1565 			if (!skb2)
1566 				break;
1567 
1568 			net_timestamp_set(skb2);
1569 
1570 			/* skb->nh should be correctly
1571 			   set by sender, so that the second statement is
1572 			   just protection against buggy protocols.
1573 			 */
1574 			skb_reset_mac_header(skb2);
1575 
1576 			if (skb_network_header(skb2) < skb2->data ||
1577 			    skb2->network_header > skb2->tail) {
1578 				if (net_ratelimit())
1579 					printk(KERN_CRIT "protocol %04x is "
1580 					       "buggy, dev %s\n",
1581 					       ntohs(skb2->protocol),
1582 					       dev->name);
1583 				skb_reset_network_header(skb2);
1584 			}
1585 
1586 			skb2->transport_header = skb2->network_header;
1587 			skb2->pkt_type = PACKET_OUTGOING;
1588 			pt_prev = ptype;
1589 		}
1590 	}
1591 	if (pt_prev)
1592 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593 	rcu_read_unlock();
1594 }
1595 
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602 	int rc;
1603 
1604 	if (txq < 1 || txq > dev->num_tx_queues)
1605 		return -EINVAL;
1606 
1607 	if (dev->reg_state == NETREG_REGISTERED) {
1608 		ASSERT_RTNL();
1609 
1610 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611 						  txq);
1612 		if (rc)
1613 			return rc;
1614 
1615 		if (txq < dev->real_num_tx_queues)
1616 			qdisc_reset_all_tx_gt(dev, txq);
1617 	}
1618 
1619 	dev->real_num_tx_queues = txq;
1620 	return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623 
1624 #ifdef CONFIG_RPS
1625 /**
1626  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1627  *	@dev: Network device
1628  *	@rxq: Actual number of RX queues
1629  *
1630  *	This must be called either with the rtnl_lock held or before
1631  *	registration of the net device.  Returns 0 on success, or a
1632  *	negative error code.  If called before registration, it always
1633  *	succeeds.
1634  */
1635 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636 {
1637 	int rc;
1638 
1639 	if (rxq < 1 || rxq > dev->num_rx_queues)
1640 		return -EINVAL;
1641 
1642 	if (dev->reg_state == NETREG_REGISTERED) {
1643 		ASSERT_RTNL();
1644 
1645 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646 						  rxq);
1647 		if (rc)
1648 			return rc;
1649 	}
1650 
1651 	dev->real_num_rx_queues = rxq;
1652 	return 0;
1653 }
1654 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655 #endif
1656 
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659 	struct softnet_data *sd;
1660 	unsigned long flags;
1661 
1662 	local_irq_save(flags);
1663 	sd = &__get_cpu_var(softnet_data);
1664 	q->next_sched = NULL;
1665 	*sd->output_queue_tailp = q;
1666 	sd->output_queue_tailp = &q->next_sched;
1667 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668 	local_irq_restore(flags);
1669 }
1670 
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674 		__netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677 
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680 	if (atomic_dec_and_test(&skb->users)) {
1681 		struct softnet_data *sd;
1682 		unsigned long flags;
1683 
1684 		local_irq_save(flags);
1685 		sd = &__get_cpu_var(softnet_data);
1686 		skb->next = sd->completion_queue;
1687 		sd->completion_queue = skb;
1688 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689 		local_irq_restore(flags);
1690 	}
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693 
1694 void dev_kfree_skb_any(struct sk_buff *skb)
1695 {
1696 	if (in_irq() || irqs_disabled())
1697 		dev_kfree_skb_irq(skb);
1698 	else
1699 		dev_kfree_skb(skb);
1700 }
1701 EXPORT_SYMBOL(dev_kfree_skb_any);
1702 
1703 
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713 	    netif_running(dev)) {
1714 		netif_tx_stop_all_queues(dev);
1715 	}
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718 
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728 	    netif_running(dev)) {
1729 		netif_tx_wake_all_queues(dev);
1730 		__netdev_watchdog_up(dev);
1731 	}
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734 
1735 /**
1736  * skb_dev_set -- assign a new device to a buffer
1737  * @skb: buffer for the new device
1738  * @dev: network device
1739  *
1740  * If an skb is owned by a device already, we have to reset
1741  * all data private to the namespace a device belongs to
1742  * before assigning it a new device.
1743  */
1744 #ifdef CONFIG_NET_NS
1745 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1746 {
1747 	skb_dst_drop(skb);
1748 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1749 		secpath_reset(skb);
1750 		nf_reset(skb);
1751 		skb_init_secmark(skb);
1752 		skb->mark = 0;
1753 		skb->priority = 0;
1754 		skb->nf_trace = 0;
1755 		skb->ipvs_property = 0;
1756 #ifdef CONFIG_NET_SCHED
1757 		skb->tc_index = 0;
1758 #endif
1759 	}
1760 	skb->dev = dev;
1761 }
1762 EXPORT_SYMBOL(skb_set_dev);
1763 #endif /* CONFIG_NET_NS */
1764 
1765 /*
1766  * Invalidate hardware checksum when packet is to be mangled, and
1767  * complete checksum manually on outgoing path.
1768  */
1769 int skb_checksum_help(struct sk_buff *skb)
1770 {
1771 	__wsum csum;
1772 	int ret = 0, offset;
1773 
1774 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1775 		goto out_set_summed;
1776 
1777 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1778 		/* Let GSO fix up the checksum. */
1779 		goto out_set_summed;
1780 	}
1781 
1782 	offset = skb_checksum_start_offset(skb);
1783 	BUG_ON(offset >= skb_headlen(skb));
1784 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1785 
1786 	offset += skb->csum_offset;
1787 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1788 
1789 	if (skb_cloned(skb) &&
1790 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1791 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1792 		if (ret)
1793 			goto out;
1794 	}
1795 
1796 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1797 out_set_summed:
1798 	skb->ip_summed = CHECKSUM_NONE;
1799 out:
1800 	return ret;
1801 }
1802 EXPORT_SYMBOL(skb_checksum_help);
1803 
1804 /**
1805  *	skb_gso_segment - Perform segmentation on skb.
1806  *	@skb: buffer to segment
1807  *	@features: features for the output path (see dev->features)
1808  *
1809  *	This function segments the given skb and returns a list of segments.
1810  *
1811  *	It may return NULL if the skb requires no segmentation.  This is
1812  *	only possible when GSO is used for verifying header integrity.
1813  */
1814 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1815 {
1816 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1817 	struct packet_type *ptype;
1818 	__be16 type = skb->protocol;
1819 	int vlan_depth = ETH_HLEN;
1820 	int err;
1821 
1822 	while (type == htons(ETH_P_8021Q)) {
1823 		struct vlan_hdr *vh;
1824 
1825 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1826 			return ERR_PTR(-EINVAL);
1827 
1828 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1829 		type = vh->h_vlan_encapsulated_proto;
1830 		vlan_depth += VLAN_HLEN;
1831 	}
1832 
1833 	skb_reset_mac_header(skb);
1834 	skb->mac_len = skb->network_header - skb->mac_header;
1835 	__skb_pull(skb, skb->mac_len);
1836 
1837 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1838 		struct net_device *dev = skb->dev;
1839 		struct ethtool_drvinfo info = {};
1840 
1841 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1842 			dev->ethtool_ops->get_drvinfo(dev, &info);
1843 
1844 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1845 		     info.driver, dev ? dev->features : 0L,
1846 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1847 		     skb->len, skb->data_len, skb->ip_summed);
1848 
1849 		if (skb_header_cloned(skb) &&
1850 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1851 			return ERR_PTR(err);
1852 	}
1853 
1854 	rcu_read_lock();
1855 	list_for_each_entry_rcu(ptype,
1856 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1857 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1858 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1859 				err = ptype->gso_send_check(skb);
1860 				segs = ERR_PTR(err);
1861 				if (err || skb_gso_ok(skb, features))
1862 					break;
1863 				__skb_push(skb, (skb->data -
1864 						 skb_network_header(skb)));
1865 			}
1866 			segs = ptype->gso_segment(skb, features);
1867 			break;
1868 		}
1869 	}
1870 	rcu_read_unlock();
1871 
1872 	__skb_push(skb, skb->data - skb_mac_header(skb));
1873 
1874 	return segs;
1875 }
1876 EXPORT_SYMBOL(skb_gso_segment);
1877 
1878 /* Take action when hardware reception checksum errors are detected. */
1879 #ifdef CONFIG_BUG
1880 void netdev_rx_csum_fault(struct net_device *dev)
1881 {
1882 	if (net_ratelimit()) {
1883 		printk(KERN_ERR "%s: hw csum failure.\n",
1884 			dev ? dev->name : "<unknown>");
1885 		dump_stack();
1886 	}
1887 }
1888 EXPORT_SYMBOL(netdev_rx_csum_fault);
1889 #endif
1890 
1891 /* Actually, we should eliminate this check as soon as we know, that:
1892  * 1. IOMMU is present and allows to map all the memory.
1893  * 2. No high memory really exists on this machine.
1894  */
1895 
1896 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1897 {
1898 #ifdef CONFIG_HIGHMEM
1899 	int i;
1900 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1901 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1902 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1903 				return 1;
1904 	}
1905 
1906 	if (PCI_DMA_BUS_IS_PHYS) {
1907 		struct device *pdev = dev->dev.parent;
1908 
1909 		if (!pdev)
1910 			return 0;
1911 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1912 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1913 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1914 				return 1;
1915 		}
1916 	}
1917 #endif
1918 	return 0;
1919 }
1920 
1921 struct dev_gso_cb {
1922 	void (*destructor)(struct sk_buff *skb);
1923 };
1924 
1925 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1926 
1927 static void dev_gso_skb_destructor(struct sk_buff *skb)
1928 {
1929 	struct dev_gso_cb *cb;
1930 
1931 	do {
1932 		struct sk_buff *nskb = skb->next;
1933 
1934 		skb->next = nskb->next;
1935 		nskb->next = NULL;
1936 		kfree_skb(nskb);
1937 	} while (skb->next);
1938 
1939 	cb = DEV_GSO_CB(skb);
1940 	if (cb->destructor)
1941 		cb->destructor(skb);
1942 }
1943 
1944 /**
1945  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1946  *	@skb: buffer to segment
1947  *	@features: device features as applicable to this skb
1948  *
1949  *	This function segments the given skb and stores the list of segments
1950  *	in skb->next.
1951  */
1952 static int dev_gso_segment(struct sk_buff *skb, int features)
1953 {
1954 	struct sk_buff *segs;
1955 
1956 	segs = skb_gso_segment(skb, features);
1957 
1958 	/* Verifying header integrity only. */
1959 	if (!segs)
1960 		return 0;
1961 
1962 	if (IS_ERR(segs))
1963 		return PTR_ERR(segs);
1964 
1965 	skb->next = segs;
1966 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1967 	skb->destructor = dev_gso_skb_destructor;
1968 
1969 	return 0;
1970 }
1971 
1972 /*
1973  * Try to orphan skb early, right before transmission by the device.
1974  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1975  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1976  */
1977 static inline void skb_orphan_try(struct sk_buff *skb)
1978 {
1979 	struct sock *sk = skb->sk;
1980 
1981 	if (sk && !skb_shinfo(skb)->tx_flags) {
1982 		/* skb_tx_hash() wont be able to get sk.
1983 		 * We copy sk_hash into skb->rxhash
1984 		 */
1985 		if (!skb->rxhash)
1986 			skb->rxhash = sk->sk_hash;
1987 		skb_orphan(skb);
1988 	}
1989 }
1990 
1991 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1992 {
1993 	return ((features & NETIF_F_GEN_CSUM) ||
1994 		((features & NETIF_F_V4_CSUM) &&
1995 		 protocol == htons(ETH_P_IP)) ||
1996 		((features & NETIF_F_V6_CSUM) &&
1997 		 protocol == htons(ETH_P_IPV6)) ||
1998 		((features & NETIF_F_FCOE_CRC) &&
1999 		 protocol == htons(ETH_P_FCOE)));
2000 }
2001 
2002 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2003 {
2004 	if (!can_checksum_protocol(protocol, features)) {
2005 		features &= ~NETIF_F_ALL_CSUM;
2006 		features &= ~NETIF_F_SG;
2007 	} else if (illegal_highdma(skb->dev, skb)) {
2008 		features &= ~NETIF_F_SG;
2009 	}
2010 
2011 	return features;
2012 }
2013 
2014 int netif_skb_features(struct sk_buff *skb)
2015 {
2016 	__be16 protocol = skb->protocol;
2017 	int features = skb->dev->features;
2018 
2019 	if (protocol == htons(ETH_P_8021Q)) {
2020 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2021 		protocol = veh->h_vlan_encapsulated_proto;
2022 	} else if (!vlan_tx_tag_present(skb)) {
2023 		return harmonize_features(skb, protocol, features);
2024 	}
2025 
2026 	features &= skb->dev->vlan_features;
2027 
2028 	if (protocol != htons(ETH_P_8021Q)) {
2029 		return harmonize_features(skb, protocol, features);
2030 	} else {
2031 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2032 				NETIF_F_GEN_CSUM;
2033 		return harmonize_features(skb, protocol, features);
2034 	}
2035 }
2036 EXPORT_SYMBOL(netif_skb_features);
2037 
2038 /*
2039  * Returns true if either:
2040  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2041  *	2. skb is fragmented and the device does not support SG, or if
2042  *	   at least one of fragments is in highmem and device does not
2043  *	   support DMA from it.
2044  */
2045 static inline int skb_needs_linearize(struct sk_buff *skb,
2046 				      int features)
2047 {
2048 	return skb_is_nonlinear(skb) &&
2049 			((skb_has_frag_list(skb) &&
2050 				!(features & NETIF_F_FRAGLIST)) ||
2051 			(skb_shinfo(skb)->nr_frags &&
2052 				!(features & NETIF_F_SG)));
2053 }
2054 
2055 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2056 			struct netdev_queue *txq)
2057 {
2058 	const struct net_device_ops *ops = dev->netdev_ops;
2059 	int rc = NETDEV_TX_OK;
2060 
2061 	if (likely(!skb->next)) {
2062 		int features;
2063 
2064 		/*
2065 		 * If device doesnt need skb->dst, release it right now while
2066 		 * its hot in this cpu cache
2067 		 */
2068 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2069 			skb_dst_drop(skb);
2070 
2071 		if (!list_empty(&ptype_all))
2072 			dev_queue_xmit_nit(skb, dev);
2073 
2074 		skb_orphan_try(skb);
2075 
2076 		features = netif_skb_features(skb);
2077 
2078 		if (vlan_tx_tag_present(skb) &&
2079 		    !(features & NETIF_F_HW_VLAN_TX)) {
2080 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2081 			if (unlikely(!skb))
2082 				goto out;
2083 
2084 			skb->vlan_tci = 0;
2085 		}
2086 
2087 		if (netif_needs_gso(skb, features)) {
2088 			if (unlikely(dev_gso_segment(skb, features)))
2089 				goto out_kfree_skb;
2090 			if (skb->next)
2091 				goto gso;
2092 		} else {
2093 			if (skb_needs_linearize(skb, features) &&
2094 			    __skb_linearize(skb))
2095 				goto out_kfree_skb;
2096 
2097 			/* If packet is not checksummed and device does not
2098 			 * support checksumming for this protocol, complete
2099 			 * checksumming here.
2100 			 */
2101 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2102 				skb_set_transport_header(skb,
2103 					skb_checksum_start_offset(skb));
2104 				if (!(features & NETIF_F_ALL_CSUM) &&
2105 				     skb_checksum_help(skb))
2106 					goto out_kfree_skb;
2107 			}
2108 		}
2109 
2110 		rc = ops->ndo_start_xmit(skb, dev);
2111 		trace_net_dev_xmit(skb, rc);
2112 		if (rc == NETDEV_TX_OK)
2113 			txq_trans_update(txq);
2114 		return rc;
2115 	}
2116 
2117 gso:
2118 	do {
2119 		struct sk_buff *nskb = skb->next;
2120 
2121 		skb->next = nskb->next;
2122 		nskb->next = NULL;
2123 
2124 		/*
2125 		 * If device doesnt need nskb->dst, release it right now while
2126 		 * its hot in this cpu cache
2127 		 */
2128 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2129 			skb_dst_drop(nskb);
2130 
2131 		rc = ops->ndo_start_xmit(nskb, dev);
2132 		trace_net_dev_xmit(nskb, rc);
2133 		if (unlikely(rc != NETDEV_TX_OK)) {
2134 			if (rc & ~NETDEV_TX_MASK)
2135 				goto out_kfree_gso_skb;
2136 			nskb->next = skb->next;
2137 			skb->next = nskb;
2138 			return rc;
2139 		}
2140 		txq_trans_update(txq);
2141 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2142 			return NETDEV_TX_BUSY;
2143 	} while (skb->next);
2144 
2145 out_kfree_gso_skb:
2146 	if (likely(skb->next == NULL))
2147 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2148 out_kfree_skb:
2149 	kfree_skb(skb);
2150 out:
2151 	return rc;
2152 }
2153 
2154 static u32 hashrnd __read_mostly;
2155 
2156 /*
2157  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2158  * to be used as a distribution range.
2159  */
2160 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2161 		  unsigned int num_tx_queues)
2162 {
2163 	u32 hash;
2164 
2165 	if (skb_rx_queue_recorded(skb)) {
2166 		hash = skb_get_rx_queue(skb);
2167 		while (unlikely(hash >= num_tx_queues))
2168 			hash -= num_tx_queues;
2169 		return hash;
2170 	}
2171 
2172 	if (skb->sk && skb->sk->sk_hash)
2173 		hash = skb->sk->sk_hash;
2174 	else
2175 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2176 	hash = jhash_1word(hash, hashrnd);
2177 
2178 	return (u16) (((u64) hash * num_tx_queues) >> 32);
2179 }
2180 EXPORT_SYMBOL(__skb_tx_hash);
2181 
2182 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2183 {
2184 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2185 		if (net_ratelimit()) {
2186 			pr_warning("%s selects TX queue %d, but "
2187 				"real number of TX queues is %d\n",
2188 				dev->name, queue_index, dev->real_num_tx_queues);
2189 		}
2190 		return 0;
2191 	}
2192 	return queue_index;
2193 }
2194 
2195 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2196 {
2197 #ifdef CONFIG_XPS
2198 	struct xps_dev_maps *dev_maps;
2199 	struct xps_map *map;
2200 	int queue_index = -1;
2201 
2202 	rcu_read_lock();
2203 	dev_maps = rcu_dereference(dev->xps_maps);
2204 	if (dev_maps) {
2205 		map = rcu_dereference(
2206 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2207 		if (map) {
2208 			if (map->len == 1)
2209 				queue_index = map->queues[0];
2210 			else {
2211 				u32 hash;
2212 				if (skb->sk && skb->sk->sk_hash)
2213 					hash = skb->sk->sk_hash;
2214 				else
2215 					hash = (__force u16) skb->protocol ^
2216 					    skb->rxhash;
2217 				hash = jhash_1word(hash, hashrnd);
2218 				queue_index = map->queues[
2219 				    ((u64)hash * map->len) >> 32];
2220 			}
2221 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2222 				queue_index = -1;
2223 		}
2224 	}
2225 	rcu_read_unlock();
2226 
2227 	return queue_index;
2228 #else
2229 	return -1;
2230 #endif
2231 }
2232 
2233 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2234 					struct sk_buff *skb)
2235 {
2236 	int queue_index;
2237 	const struct net_device_ops *ops = dev->netdev_ops;
2238 
2239 	if (dev->real_num_tx_queues == 1)
2240 		queue_index = 0;
2241 	else if (ops->ndo_select_queue) {
2242 		queue_index = ops->ndo_select_queue(dev, skb);
2243 		queue_index = dev_cap_txqueue(dev, queue_index);
2244 	} else {
2245 		struct sock *sk = skb->sk;
2246 		queue_index = sk_tx_queue_get(sk);
2247 
2248 		if (queue_index < 0 || skb->ooo_okay ||
2249 		    queue_index >= dev->real_num_tx_queues) {
2250 			int old_index = queue_index;
2251 
2252 			queue_index = get_xps_queue(dev, skb);
2253 			if (queue_index < 0)
2254 				queue_index = skb_tx_hash(dev, skb);
2255 
2256 			if (queue_index != old_index && sk) {
2257 				struct dst_entry *dst =
2258 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2259 
2260 				if (dst && skb_dst(skb) == dst)
2261 					sk_tx_queue_set(sk, queue_index);
2262 			}
2263 		}
2264 	}
2265 
2266 	skb_set_queue_mapping(skb, queue_index);
2267 	return netdev_get_tx_queue(dev, queue_index);
2268 }
2269 
2270 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2271 				 struct net_device *dev,
2272 				 struct netdev_queue *txq)
2273 {
2274 	spinlock_t *root_lock = qdisc_lock(q);
2275 	bool contended = qdisc_is_running(q);
2276 	int rc;
2277 
2278 	/*
2279 	 * Heuristic to force contended enqueues to serialize on a
2280 	 * separate lock before trying to get qdisc main lock.
2281 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2282 	 * and dequeue packets faster.
2283 	 */
2284 	if (unlikely(contended))
2285 		spin_lock(&q->busylock);
2286 
2287 	spin_lock(root_lock);
2288 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2289 		kfree_skb(skb);
2290 		rc = NET_XMIT_DROP;
2291 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2292 		   qdisc_run_begin(q)) {
2293 		/*
2294 		 * This is a work-conserving queue; there are no old skbs
2295 		 * waiting to be sent out; and the qdisc is not running -
2296 		 * xmit the skb directly.
2297 		 */
2298 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2299 			skb_dst_force(skb);
2300 		__qdisc_update_bstats(q, skb->len);
2301 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2302 			if (unlikely(contended)) {
2303 				spin_unlock(&q->busylock);
2304 				contended = false;
2305 			}
2306 			__qdisc_run(q);
2307 		} else
2308 			qdisc_run_end(q);
2309 
2310 		rc = NET_XMIT_SUCCESS;
2311 	} else {
2312 		skb_dst_force(skb);
2313 		rc = qdisc_enqueue_root(skb, q);
2314 		if (qdisc_run_begin(q)) {
2315 			if (unlikely(contended)) {
2316 				spin_unlock(&q->busylock);
2317 				contended = false;
2318 			}
2319 			__qdisc_run(q);
2320 		}
2321 	}
2322 	spin_unlock(root_lock);
2323 	if (unlikely(contended))
2324 		spin_unlock(&q->busylock);
2325 	return rc;
2326 }
2327 
2328 static DEFINE_PER_CPU(int, xmit_recursion);
2329 #define RECURSION_LIMIT 10
2330 
2331 /**
2332  *	dev_queue_xmit - transmit a buffer
2333  *	@skb: buffer to transmit
2334  *
2335  *	Queue a buffer for transmission to a network device. The caller must
2336  *	have set the device and priority and built the buffer before calling
2337  *	this function. The function can be called from an interrupt.
2338  *
2339  *	A negative errno code is returned on a failure. A success does not
2340  *	guarantee the frame will be transmitted as it may be dropped due
2341  *	to congestion or traffic shaping.
2342  *
2343  * -----------------------------------------------------------------------------------
2344  *      I notice this method can also return errors from the queue disciplines,
2345  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2346  *      be positive.
2347  *
2348  *      Regardless of the return value, the skb is consumed, so it is currently
2349  *      difficult to retry a send to this method.  (You can bump the ref count
2350  *      before sending to hold a reference for retry if you are careful.)
2351  *
2352  *      When calling this method, interrupts MUST be enabled.  This is because
2353  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2354  *          --BLG
2355  */
2356 int dev_queue_xmit(struct sk_buff *skb)
2357 {
2358 	struct net_device *dev = skb->dev;
2359 	struct netdev_queue *txq;
2360 	struct Qdisc *q;
2361 	int rc = -ENOMEM;
2362 
2363 	/* Disable soft irqs for various locks below. Also
2364 	 * stops preemption for RCU.
2365 	 */
2366 	rcu_read_lock_bh();
2367 
2368 	txq = dev_pick_tx(dev, skb);
2369 	q = rcu_dereference_bh(txq->qdisc);
2370 
2371 #ifdef CONFIG_NET_CLS_ACT
2372 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2373 #endif
2374 	trace_net_dev_queue(skb);
2375 	if (q->enqueue) {
2376 		rc = __dev_xmit_skb(skb, q, dev, txq);
2377 		goto out;
2378 	}
2379 
2380 	/* The device has no queue. Common case for software devices:
2381 	   loopback, all the sorts of tunnels...
2382 
2383 	   Really, it is unlikely that netif_tx_lock protection is necessary
2384 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2385 	   counters.)
2386 	   However, it is possible, that they rely on protection
2387 	   made by us here.
2388 
2389 	   Check this and shot the lock. It is not prone from deadlocks.
2390 	   Either shot noqueue qdisc, it is even simpler 8)
2391 	 */
2392 	if (dev->flags & IFF_UP) {
2393 		int cpu = smp_processor_id(); /* ok because BHs are off */
2394 
2395 		if (txq->xmit_lock_owner != cpu) {
2396 
2397 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2398 				goto recursion_alert;
2399 
2400 			HARD_TX_LOCK(dev, txq, cpu);
2401 
2402 			if (!netif_tx_queue_stopped(txq)) {
2403 				__this_cpu_inc(xmit_recursion);
2404 				rc = dev_hard_start_xmit(skb, dev, txq);
2405 				__this_cpu_dec(xmit_recursion);
2406 				if (dev_xmit_complete(rc)) {
2407 					HARD_TX_UNLOCK(dev, txq);
2408 					goto out;
2409 				}
2410 			}
2411 			HARD_TX_UNLOCK(dev, txq);
2412 			if (net_ratelimit())
2413 				printk(KERN_CRIT "Virtual device %s asks to "
2414 				       "queue packet!\n", dev->name);
2415 		} else {
2416 			/* Recursion is detected! It is possible,
2417 			 * unfortunately
2418 			 */
2419 recursion_alert:
2420 			if (net_ratelimit())
2421 				printk(KERN_CRIT "Dead loop on virtual device "
2422 				       "%s, fix it urgently!\n", dev->name);
2423 		}
2424 	}
2425 
2426 	rc = -ENETDOWN;
2427 	rcu_read_unlock_bh();
2428 
2429 	kfree_skb(skb);
2430 	return rc;
2431 out:
2432 	rcu_read_unlock_bh();
2433 	return rc;
2434 }
2435 EXPORT_SYMBOL(dev_queue_xmit);
2436 
2437 
2438 /*=======================================================================
2439 			Receiver routines
2440   =======================================================================*/
2441 
2442 int netdev_max_backlog __read_mostly = 1000;
2443 int netdev_tstamp_prequeue __read_mostly = 1;
2444 int netdev_budget __read_mostly = 300;
2445 int weight_p __read_mostly = 64;            /* old backlog weight */
2446 
2447 /* Called with irq disabled */
2448 static inline void ____napi_schedule(struct softnet_data *sd,
2449 				     struct napi_struct *napi)
2450 {
2451 	list_add_tail(&napi->poll_list, &sd->poll_list);
2452 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2453 }
2454 
2455 /*
2456  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2457  * and src/dst port numbers. Returns a non-zero hash number on success
2458  * and 0 on failure.
2459  */
2460 __u32 __skb_get_rxhash(struct sk_buff *skb)
2461 {
2462 	int nhoff, hash = 0, poff;
2463 	struct ipv6hdr *ip6;
2464 	struct iphdr *ip;
2465 	u8 ip_proto;
2466 	u32 addr1, addr2, ihl;
2467 	union {
2468 		u32 v32;
2469 		u16 v16[2];
2470 	} ports;
2471 
2472 	nhoff = skb_network_offset(skb);
2473 
2474 	switch (skb->protocol) {
2475 	case __constant_htons(ETH_P_IP):
2476 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2477 			goto done;
2478 
2479 		ip = (struct iphdr *) (skb->data + nhoff);
2480 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2481 			ip_proto = 0;
2482 		else
2483 			ip_proto = ip->protocol;
2484 		addr1 = (__force u32) ip->saddr;
2485 		addr2 = (__force u32) ip->daddr;
2486 		ihl = ip->ihl;
2487 		break;
2488 	case __constant_htons(ETH_P_IPV6):
2489 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2490 			goto done;
2491 
2492 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2493 		ip_proto = ip6->nexthdr;
2494 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2495 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2496 		ihl = (40 >> 2);
2497 		break;
2498 	default:
2499 		goto done;
2500 	}
2501 
2502 	ports.v32 = 0;
2503 	poff = proto_ports_offset(ip_proto);
2504 	if (poff >= 0) {
2505 		nhoff += ihl * 4 + poff;
2506 		if (pskb_may_pull(skb, nhoff + 4)) {
2507 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2508 			if (ports.v16[1] < ports.v16[0])
2509 				swap(ports.v16[0], ports.v16[1]);
2510 		}
2511 	}
2512 
2513 	/* get a consistent hash (same value on both flow directions) */
2514 	if (addr2 < addr1)
2515 		swap(addr1, addr2);
2516 
2517 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2518 	if (!hash)
2519 		hash = 1;
2520 
2521 done:
2522 	return hash;
2523 }
2524 EXPORT_SYMBOL(__skb_get_rxhash);
2525 
2526 #ifdef CONFIG_RPS
2527 
2528 /* One global table that all flow-based protocols share. */
2529 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2530 EXPORT_SYMBOL(rps_sock_flow_table);
2531 
2532 /*
2533  * get_rps_cpu is called from netif_receive_skb and returns the target
2534  * CPU from the RPS map of the receiving queue for a given skb.
2535  * rcu_read_lock must be held on entry.
2536  */
2537 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2538 		       struct rps_dev_flow **rflowp)
2539 {
2540 	struct netdev_rx_queue *rxqueue;
2541 	struct rps_map *map;
2542 	struct rps_dev_flow_table *flow_table;
2543 	struct rps_sock_flow_table *sock_flow_table;
2544 	int cpu = -1;
2545 	u16 tcpu;
2546 
2547 	if (skb_rx_queue_recorded(skb)) {
2548 		u16 index = skb_get_rx_queue(skb);
2549 		if (unlikely(index >= dev->real_num_rx_queues)) {
2550 			WARN_ONCE(dev->real_num_rx_queues > 1,
2551 				  "%s received packet on queue %u, but number "
2552 				  "of RX queues is %u\n",
2553 				  dev->name, index, dev->real_num_rx_queues);
2554 			goto done;
2555 		}
2556 		rxqueue = dev->_rx + index;
2557 	} else
2558 		rxqueue = dev->_rx;
2559 
2560 	map = rcu_dereference(rxqueue->rps_map);
2561 	if (map) {
2562 		if (map->len == 1) {
2563 			tcpu = map->cpus[0];
2564 			if (cpu_online(tcpu))
2565 				cpu = tcpu;
2566 			goto done;
2567 		}
2568 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2569 		goto done;
2570 	}
2571 
2572 	skb_reset_network_header(skb);
2573 	if (!skb_get_rxhash(skb))
2574 		goto done;
2575 
2576 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2577 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2578 	if (flow_table && sock_flow_table) {
2579 		u16 next_cpu;
2580 		struct rps_dev_flow *rflow;
2581 
2582 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2583 		tcpu = rflow->cpu;
2584 
2585 		next_cpu = sock_flow_table->ents[skb->rxhash &
2586 		    sock_flow_table->mask];
2587 
2588 		/*
2589 		 * If the desired CPU (where last recvmsg was done) is
2590 		 * different from current CPU (one in the rx-queue flow
2591 		 * table entry), switch if one of the following holds:
2592 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2593 		 *   - Current CPU is offline.
2594 		 *   - The current CPU's queue tail has advanced beyond the
2595 		 *     last packet that was enqueued using this table entry.
2596 		 *     This guarantees that all previous packets for the flow
2597 		 *     have been dequeued, thus preserving in order delivery.
2598 		 */
2599 		if (unlikely(tcpu != next_cpu) &&
2600 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2601 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2602 		      rflow->last_qtail)) >= 0)) {
2603 			tcpu = rflow->cpu = next_cpu;
2604 			if (tcpu != RPS_NO_CPU)
2605 				rflow->last_qtail = per_cpu(softnet_data,
2606 				    tcpu).input_queue_head;
2607 		}
2608 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2609 			*rflowp = rflow;
2610 			cpu = tcpu;
2611 			goto done;
2612 		}
2613 	}
2614 
2615 	if (map) {
2616 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2617 
2618 		if (cpu_online(tcpu)) {
2619 			cpu = tcpu;
2620 			goto done;
2621 		}
2622 	}
2623 
2624 done:
2625 	return cpu;
2626 }
2627 
2628 /* Called from hardirq (IPI) context */
2629 static void rps_trigger_softirq(void *data)
2630 {
2631 	struct softnet_data *sd = data;
2632 
2633 	____napi_schedule(sd, &sd->backlog);
2634 	sd->received_rps++;
2635 }
2636 
2637 #endif /* CONFIG_RPS */
2638 
2639 /*
2640  * Check if this softnet_data structure is another cpu one
2641  * If yes, queue it to our IPI list and return 1
2642  * If no, return 0
2643  */
2644 static int rps_ipi_queued(struct softnet_data *sd)
2645 {
2646 #ifdef CONFIG_RPS
2647 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2648 
2649 	if (sd != mysd) {
2650 		sd->rps_ipi_next = mysd->rps_ipi_list;
2651 		mysd->rps_ipi_list = sd;
2652 
2653 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2654 		return 1;
2655 	}
2656 #endif /* CONFIG_RPS */
2657 	return 0;
2658 }
2659 
2660 /*
2661  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2662  * queue (may be a remote CPU queue).
2663  */
2664 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2665 			      unsigned int *qtail)
2666 {
2667 	struct softnet_data *sd;
2668 	unsigned long flags;
2669 
2670 	sd = &per_cpu(softnet_data, cpu);
2671 
2672 	local_irq_save(flags);
2673 
2674 	rps_lock(sd);
2675 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2676 		if (skb_queue_len(&sd->input_pkt_queue)) {
2677 enqueue:
2678 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2679 			input_queue_tail_incr_save(sd, qtail);
2680 			rps_unlock(sd);
2681 			local_irq_restore(flags);
2682 			return NET_RX_SUCCESS;
2683 		}
2684 
2685 		/* Schedule NAPI for backlog device
2686 		 * We can use non atomic operation since we own the queue lock
2687 		 */
2688 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2689 			if (!rps_ipi_queued(sd))
2690 				____napi_schedule(sd, &sd->backlog);
2691 		}
2692 		goto enqueue;
2693 	}
2694 
2695 	sd->dropped++;
2696 	rps_unlock(sd);
2697 
2698 	local_irq_restore(flags);
2699 
2700 	atomic_long_inc(&skb->dev->rx_dropped);
2701 	kfree_skb(skb);
2702 	return NET_RX_DROP;
2703 }
2704 
2705 /**
2706  *	netif_rx	-	post buffer to the network code
2707  *	@skb: buffer to post
2708  *
2709  *	This function receives a packet from a device driver and queues it for
2710  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2711  *	may be dropped during processing for congestion control or by the
2712  *	protocol layers.
2713  *
2714  *	return values:
2715  *	NET_RX_SUCCESS	(no congestion)
2716  *	NET_RX_DROP     (packet was dropped)
2717  *
2718  */
2719 
2720 int netif_rx(struct sk_buff *skb)
2721 {
2722 	int ret;
2723 
2724 	/* if netpoll wants it, pretend we never saw it */
2725 	if (netpoll_rx(skb))
2726 		return NET_RX_DROP;
2727 
2728 	if (netdev_tstamp_prequeue)
2729 		net_timestamp_check(skb);
2730 
2731 	trace_netif_rx(skb);
2732 #ifdef CONFIG_RPS
2733 	{
2734 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2735 		int cpu;
2736 
2737 		preempt_disable();
2738 		rcu_read_lock();
2739 
2740 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2741 		if (cpu < 0)
2742 			cpu = smp_processor_id();
2743 
2744 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2745 
2746 		rcu_read_unlock();
2747 		preempt_enable();
2748 	}
2749 #else
2750 	{
2751 		unsigned int qtail;
2752 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2753 		put_cpu();
2754 	}
2755 #endif
2756 	return ret;
2757 }
2758 EXPORT_SYMBOL(netif_rx);
2759 
2760 int netif_rx_ni(struct sk_buff *skb)
2761 {
2762 	int err;
2763 
2764 	preempt_disable();
2765 	err = netif_rx(skb);
2766 	if (local_softirq_pending())
2767 		do_softirq();
2768 	preempt_enable();
2769 
2770 	return err;
2771 }
2772 EXPORT_SYMBOL(netif_rx_ni);
2773 
2774 static void net_tx_action(struct softirq_action *h)
2775 {
2776 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2777 
2778 	if (sd->completion_queue) {
2779 		struct sk_buff *clist;
2780 
2781 		local_irq_disable();
2782 		clist = sd->completion_queue;
2783 		sd->completion_queue = NULL;
2784 		local_irq_enable();
2785 
2786 		while (clist) {
2787 			struct sk_buff *skb = clist;
2788 			clist = clist->next;
2789 
2790 			WARN_ON(atomic_read(&skb->users));
2791 			trace_kfree_skb(skb, net_tx_action);
2792 			__kfree_skb(skb);
2793 		}
2794 	}
2795 
2796 	if (sd->output_queue) {
2797 		struct Qdisc *head;
2798 
2799 		local_irq_disable();
2800 		head = sd->output_queue;
2801 		sd->output_queue = NULL;
2802 		sd->output_queue_tailp = &sd->output_queue;
2803 		local_irq_enable();
2804 
2805 		while (head) {
2806 			struct Qdisc *q = head;
2807 			spinlock_t *root_lock;
2808 
2809 			head = head->next_sched;
2810 
2811 			root_lock = qdisc_lock(q);
2812 			if (spin_trylock(root_lock)) {
2813 				smp_mb__before_clear_bit();
2814 				clear_bit(__QDISC_STATE_SCHED,
2815 					  &q->state);
2816 				qdisc_run(q);
2817 				spin_unlock(root_lock);
2818 			} else {
2819 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2820 					      &q->state)) {
2821 					__netif_reschedule(q);
2822 				} else {
2823 					smp_mb__before_clear_bit();
2824 					clear_bit(__QDISC_STATE_SCHED,
2825 						  &q->state);
2826 				}
2827 			}
2828 		}
2829 	}
2830 }
2831 
2832 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2833     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2834 /* This hook is defined here for ATM LANE */
2835 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2836 			     unsigned char *addr) __read_mostly;
2837 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2838 #endif
2839 
2840 #ifdef CONFIG_NET_CLS_ACT
2841 /* TODO: Maybe we should just force sch_ingress to be compiled in
2842  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2843  * a compare and 2 stores extra right now if we dont have it on
2844  * but have CONFIG_NET_CLS_ACT
2845  * NOTE: This doesnt stop any functionality; if you dont have
2846  * the ingress scheduler, you just cant add policies on ingress.
2847  *
2848  */
2849 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2850 {
2851 	struct net_device *dev = skb->dev;
2852 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2853 	int result = TC_ACT_OK;
2854 	struct Qdisc *q;
2855 
2856 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2857 		if (net_ratelimit())
2858 			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2859 			       skb->skb_iif, dev->ifindex);
2860 		return TC_ACT_SHOT;
2861 	}
2862 
2863 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2864 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2865 
2866 	q = rxq->qdisc;
2867 	if (q != &noop_qdisc) {
2868 		spin_lock(qdisc_lock(q));
2869 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2870 			result = qdisc_enqueue_root(skb, q);
2871 		spin_unlock(qdisc_lock(q));
2872 	}
2873 
2874 	return result;
2875 }
2876 
2877 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2878 					 struct packet_type **pt_prev,
2879 					 int *ret, struct net_device *orig_dev)
2880 {
2881 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2882 
2883 	if (!rxq || rxq->qdisc == &noop_qdisc)
2884 		goto out;
2885 
2886 	if (*pt_prev) {
2887 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2888 		*pt_prev = NULL;
2889 	}
2890 
2891 	switch (ing_filter(skb, rxq)) {
2892 	case TC_ACT_SHOT:
2893 	case TC_ACT_STOLEN:
2894 		kfree_skb(skb);
2895 		return NULL;
2896 	}
2897 
2898 out:
2899 	skb->tc_verd = 0;
2900 	return skb;
2901 }
2902 #endif
2903 
2904 /**
2905  *	netdev_rx_handler_register - register receive handler
2906  *	@dev: device to register a handler for
2907  *	@rx_handler: receive handler to register
2908  *	@rx_handler_data: data pointer that is used by rx handler
2909  *
2910  *	Register a receive hander for a device. This handler will then be
2911  *	called from __netif_receive_skb. A negative errno code is returned
2912  *	on a failure.
2913  *
2914  *	The caller must hold the rtnl_mutex.
2915  */
2916 int netdev_rx_handler_register(struct net_device *dev,
2917 			       rx_handler_func_t *rx_handler,
2918 			       void *rx_handler_data)
2919 {
2920 	ASSERT_RTNL();
2921 
2922 	if (dev->rx_handler)
2923 		return -EBUSY;
2924 
2925 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2926 	rcu_assign_pointer(dev->rx_handler, rx_handler);
2927 
2928 	return 0;
2929 }
2930 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2931 
2932 /**
2933  *	netdev_rx_handler_unregister - unregister receive handler
2934  *	@dev: device to unregister a handler from
2935  *
2936  *	Unregister a receive hander from a device.
2937  *
2938  *	The caller must hold the rtnl_mutex.
2939  */
2940 void netdev_rx_handler_unregister(struct net_device *dev)
2941 {
2942 
2943 	ASSERT_RTNL();
2944 	rcu_assign_pointer(dev->rx_handler, NULL);
2945 	rcu_assign_pointer(dev->rx_handler_data, NULL);
2946 }
2947 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2948 
2949 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2950 					      struct net_device *master)
2951 {
2952 	if (skb->pkt_type == PACKET_HOST) {
2953 		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2954 
2955 		memcpy(dest, master->dev_addr, ETH_ALEN);
2956 	}
2957 }
2958 
2959 /* On bonding slaves other than the currently active slave, suppress
2960  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2961  * ARP on active-backup slaves with arp_validate enabled.
2962  */
2963 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2964 {
2965 	struct net_device *dev = skb->dev;
2966 
2967 	if (master->priv_flags & IFF_MASTER_ARPMON)
2968 		dev->last_rx = jiffies;
2969 
2970 	if ((master->priv_flags & IFF_MASTER_ALB) &&
2971 	    (master->priv_flags & IFF_BRIDGE_PORT)) {
2972 		/* Do address unmangle. The local destination address
2973 		 * will be always the one master has. Provides the right
2974 		 * functionality in a bridge.
2975 		 */
2976 		skb_bond_set_mac_by_master(skb, master);
2977 	}
2978 
2979 	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2980 		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2981 		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2982 			return 0;
2983 
2984 		if (master->priv_flags & IFF_MASTER_ALB) {
2985 			if (skb->pkt_type != PACKET_BROADCAST &&
2986 			    skb->pkt_type != PACKET_MULTICAST)
2987 				return 0;
2988 		}
2989 		if (master->priv_flags & IFF_MASTER_8023AD &&
2990 		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2991 			return 0;
2992 
2993 		return 1;
2994 	}
2995 	return 0;
2996 }
2997 EXPORT_SYMBOL(__skb_bond_should_drop);
2998 
2999 static int __netif_receive_skb(struct sk_buff *skb)
3000 {
3001 	struct packet_type *ptype, *pt_prev;
3002 	rx_handler_func_t *rx_handler;
3003 	struct net_device *orig_dev;
3004 	struct net_device *master;
3005 	struct net_device *null_or_orig;
3006 	struct net_device *orig_or_bond;
3007 	int ret = NET_RX_DROP;
3008 	__be16 type;
3009 
3010 	if (!netdev_tstamp_prequeue)
3011 		net_timestamp_check(skb);
3012 
3013 	trace_netif_receive_skb(skb);
3014 
3015 	/* if we've gotten here through NAPI, check netpoll */
3016 	if (netpoll_receive_skb(skb))
3017 		return NET_RX_DROP;
3018 
3019 	if (!skb->skb_iif)
3020 		skb->skb_iif = skb->dev->ifindex;
3021 
3022 	/*
3023 	 * bonding note: skbs received on inactive slaves should only
3024 	 * be delivered to pkt handlers that are exact matches.  Also
3025 	 * the deliver_no_wcard flag will be set.  If packet handlers
3026 	 * are sensitive to duplicate packets these skbs will need to
3027 	 * be dropped at the handler.
3028 	 */
3029 	null_or_orig = NULL;
3030 	orig_dev = skb->dev;
3031 	master = ACCESS_ONCE(orig_dev->master);
3032 	if (skb->deliver_no_wcard)
3033 		null_or_orig = orig_dev;
3034 	else if (master) {
3035 		if (skb_bond_should_drop(skb, master)) {
3036 			skb->deliver_no_wcard = 1;
3037 			null_or_orig = orig_dev; /* deliver only exact match */
3038 		} else
3039 			skb->dev = master;
3040 	}
3041 
3042 	__this_cpu_inc(softnet_data.processed);
3043 	skb_reset_network_header(skb);
3044 	skb_reset_transport_header(skb);
3045 	skb->mac_len = skb->network_header - skb->mac_header;
3046 
3047 	pt_prev = NULL;
3048 
3049 	rcu_read_lock();
3050 
3051 #ifdef CONFIG_NET_CLS_ACT
3052 	if (skb->tc_verd & TC_NCLS) {
3053 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3054 		goto ncls;
3055 	}
3056 #endif
3057 
3058 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3059 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3060 		    ptype->dev == orig_dev) {
3061 			if (pt_prev)
3062 				ret = deliver_skb(skb, pt_prev, orig_dev);
3063 			pt_prev = ptype;
3064 		}
3065 	}
3066 
3067 #ifdef CONFIG_NET_CLS_ACT
3068 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3069 	if (!skb)
3070 		goto out;
3071 ncls:
3072 #endif
3073 
3074 	/* Handle special case of bridge or macvlan */
3075 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3076 	if (rx_handler) {
3077 		if (pt_prev) {
3078 			ret = deliver_skb(skb, pt_prev, orig_dev);
3079 			pt_prev = NULL;
3080 		}
3081 		skb = rx_handler(skb);
3082 		if (!skb)
3083 			goto out;
3084 	}
3085 
3086 	if (vlan_tx_tag_present(skb)) {
3087 		if (pt_prev) {
3088 			ret = deliver_skb(skb, pt_prev, orig_dev);
3089 			pt_prev = NULL;
3090 		}
3091 		if (vlan_hwaccel_do_receive(&skb)) {
3092 			ret = __netif_receive_skb(skb);
3093 			goto out;
3094 		} else if (unlikely(!skb))
3095 			goto out;
3096 	}
3097 
3098 	/*
3099 	 * Make sure frames received on VLAN interfaces stacked on
3100 	 * bonding interfaces still make their way to any base bonding
3101 	 * device that may have registered for a specific ptype.  The
3102 	 * handler may have to adjust skb->dev and orig_dev.
3103 	 */
3104 	orig_or_bond = orig_dev;
3105 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3106 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3107 		orig_or_bond = vlan_dev_real_dev(skb->dev);
3108 	}
3109 
3110 	type = skb->protocol;
3111 	list_for_each_entry_rcu(ptype,
3112 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3113 		if (ptype->type == type && (ptype->dev == null_or_orig ||
3114 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
3115 		     ptype->dev == orig_or_bond)) {
3116 			if (pt_prev)
3117 				ret = deliver_skb(skb, pt_prev, orig_dev);
3118 			pt_prev = ptype;
3119 		}
3120 	}
3121 
3122 	if (pt_prev) {
3123 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3124 	} else {
3125 		atomic_long_inc(&skb->dev->rx_dropped);
3126 		kfree_skb(skb);
3127 		/* Jamal, now you will not able to escape explaining
3128 		 * me how you were going to use this. :-)
3129 		 */
3130 		ret = NET_RX_DROP;
3131 	}
3132 
3133 out:
3134 	rcu_read_unlock();
3135 	return ret;
3136 }
3137 
3138 /**
3139  *	netif_receive_skb - process receive buffer from network
3140  *	@skb: buffer to process
3141  *
3142  *	netif_receive_skb() is the main receive data processing function.
3143  *	It always succeeds. The buffer may be dropped during processing
3144  *	for congestion control or by the protocol layers.
3145  *
3146  *	This function may only be called from softirq context and interrupts
3147  *	should be enabled.
3148  *
3149  *	Return values (usually ignored):
3150  *	NET_RX_SUCCESS: no congestion
3151  *	NET_RX_DROP: packet was dropped
3152  */
3153 int netif_receive_skb(struct sk_buff *skb)
3154 {
3155 	if (netdev_tstamp_prequeue)
3156 		net_timestamp_check(skb);
3157 
3158 	if (skb_defer_rx_timestamp(skb))
3159 		return NET_RX_SUCCESS;
3160 
3161 #ifdef CONFIG_RPS
3162 	{
3163 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3164 		int cpu, ret;
3165 
3166 		rcu_read_lock();
3167 
3168 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3169 
3170 		if (cpu >= 0) {
3171 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3172 			rcu_read_unlock();
3173 		} else {
3174 			rcu_read_unlock();
3175 			ret = __netif_receive_skb(skb);
3176 		}
3177 
3178 		return ret;
3179 	}
3180 #else
3181 	return __netif_receive_skb(skb);
3182 #endif
3183 }
3184 EXPORT_SYMBOL(netif_receive_skb);
3185 
3186 /* Network device is going away, flush any packets still pending
3187  * Called with irqs disabled.
3188  */
3189 static void flush_backlog(void *arg)
3190 {
3191 	struct net_device *dev = arg;
3192 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3193 	struct sk_buff *skb, *tmp;
3194 
3195 	rps_lock(sd);
3196 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3197 		if (skb->dev == dev) {
3198 			__skb_unlink(skb, &sd->input_pkt_queue);
3199 			kfree_skb(skb);
3200 			input_queue_head_incr(sd);
3201 		}
3202 	}
3203 	rps_unlock(sd);
3204 
3205 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3206 		if (skb->dev == dev) {
3207 			__skb_unlink(skb, &sd->process_queue);
3208 			kfree_skb(skb);
3209 			input_queue_head_incr(sd);
3210 		}
3211 	}
3212 }
3213 
3214 static int napi_gro_complete(struct sk_buff *skb)
3215 {
3216 	struct packet_type *ptype;
3217 	__be16 type = skb->protocol;
3218 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3219 	int err = -ENOENT;
3220 
3221 	if (NAPI_GRO_CB(skb)->count == 1) {
3222 		skb_shinfo(skb)->gso_size = 0;
3223 		goto out;
3224 	}
3225 
3226 	rcu_read_lock();
3227 	list_for_each_entry_rcu(ptype, head, list) {
3228 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3229 			continue;
3230 
3231 		err = ptype->gro_complete(skb);
3232 		break;
3233 	}
3234 	rcu_read_unlock();
3235 
3236 	if (err) {
3237 		WARN_ON(&ptype->list == head);
3238 		kfree_skb(skb);
3239 		return NET_RX_SUCCESS;
3240 	}
3241 
3242 out:
3243 	return netif_receive_skb(skb);
3244 }
3245 
3246 inline void napi_gro_flush(struct napi_struct *napi)
3247 {
3248 	struct sk_buff *skb, *next;
3249 
3250 	for (skb = napi->gro_list; skb; skb = next) {
3251 		next = skb->next;
3252 		skb->next = NULL;
3253 		napi_gro_complete(skb);
3254 	}
3255 
3256 	napi->gro_count = 0;
3257 	napi->gro_list = NULL;
3258 }
3259 EXPORT_SYMBOL(napi_gro_flush);
3260 
3261 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3262 {
3263 	struct sk_buff **pp = NULL;
3264 	struct packet_type *ptype;
3265 	__be16 type = skb->protocol;
3266 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3267 	int same_flow;
3268 	int mac_len;
3269 	enum gro_result ret;
3270 
3271 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3272 		goto normal;
3273 
3274 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3275 		goto normal;
3276 
3277 	rcu_read_lock();
3278 	list_for_each_entry_rcu(ptype, head, list) {
3279 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3280 			continue;
3281 
3282 		skb_set_network_header(skb, skb_gro_offset(skb));
3283 		mac_len = skb->network_header - skb->mac_header;
3284 		skb->mac_len = mac_len;
3285 		NAPI_GRO_CB(skb)->same_flow = 0;
3286 		NAPI_GRO_CB(skb)->flush = 0;
3287 		NAPI_GRO_CB(skb)->free = 0;
3288 
3289 		pp = ptype->gro_receive(&napi->gro_list, skb);
3290 		break;
3291 	}
3292 	rcu_read_unlock();
3293 
3294 	if (&ptype->list == head)
3295 		goto normal;
3296 
3297 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3298 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3299 
3300 	if (pp) {
3301 		struct sk_buff *nskb = *pp;
3302 
3303 		*pp = nskb->next;
3304 		nskb->next = NULL;
3305 		napi_gro_complete(nskb);
3306 		napi->gro_count--;
3307 	}
3308 
3309 	if (same_flow)
3310 		goto ok;
3311 
3312 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3313 		goto normal;
3314 
3315 	napi->gro_count++;
3316 	NAPI_GRO_CB(skb)->count = 1;
3317 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3318 	skb->next = napi->gro_list;
3319 	napi->gro_list = skb;
3320 	ret = GRO_HELD;
3321 
3322 pull:
3323 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3324 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3325 
3326 		BUG_ON(skb->end - skb->tail < grow);
3327 
3328 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3329 
3330 		skb->tail += grow;
3331 		skb->data_len -= grow;
3332 
3333 		skb_shinfo(skb)->frags[0].page_offset += grow;
3334 		skb_shinfo(skb)->frags[0].size -= grow;
3335 
3336 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3337 			put_page(skb_shinfo(skb)->frags[0].page);
3338 			memmove(skb_shinfo(skb)->frags,
3339 				skb_shinfo(skb)->frags + 1,
3340 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3341 		}
3342 	}
3343 
3344 ok:
3345 	return ret;
3346 
3347 normal:
3348 	ret = GRO_NORMAL;
3349 	goto pull;
3350 }
3351 EXPORT_SYMBOL(dev_gro_receive);
3352 
3353 static inline gro_result_t
3354 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3355 {
3356 	struct sk_buff *p;
3357 
3358 	for (p = napi->gro_list; p; p = p->next) {
3359 		unsigned long diffs;
3360 
3361 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3362 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3363 		diffs |= compare_ether_header(skb_mac_header(p),
3364 					      skb_gro_mac_header(skb));
3365 		NAPI_GRO_CB(p)->same_flow = !diffs;
3366 		NAPI_GRO_CB(p)->flush = 0;
3367 	}
3368 
3369 	return dev_gro_receive(napi, skb);
3370 }
3371 
3372 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3373 {
3374 	switch (ret) {
3375 	case GRO_NORMAL:
3376 		if (netif_receive_skb(skb))
3377 			ret = GRO_DROP;
3378 		break;
3379 
3380 	case GRO_DROP:
3381 	case GRO_MERGED_FREE:
3382 		kfree_skb(skb);
3383 		break;
3384 
3385 	case GRO_HELD:
3386 	case GRO_MERGED:
3387 		break;
3388 	}
3389 
3390 	return ret;
3391 }
3392 EXPORT_SYMBOL(napi_skb_finish);
3393 
3394 void skb_gro_reset_offset(struct sk_buff *skb)
3395 {
3396 	NAPI_GRO_CB(skb)->data_offset = 0;
3397 	NAPI_GRO_CB(skb)->frag0 = NULL;
3398 	NAPI_GRO_CB(skb)->frag0_len = 0;
3399 
3400 	if (skb->mac_header == skb->tail &&
3401 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3402 		NAPI_GRO_CB(skb)->frag0 =
3403 			page_address(skb_shinfo(skb)->frags[0].page) +
3404 			skb_shinfo(skb)->frags[0].page_offset;
3405 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3406 	}
3407 }
3408 EXPORT_SYMBOL(skb_gro_reset_offset);
3409 
3410 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3411 {
3412 	skb_gro_reset_offset(skb);
3413 
3414 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3415 }
3416 EXPORT_SYMBOL(napi_gro_receive);
3417 
3418 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3419 {
3420 	__skb_pull(skb, skb_headlen(skb));
3421 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3422 	skb->vlan_tci = 0;
3423 
3424 	napi->skb = skb;
3425 }
3426 
3427 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3428 {
3429 	struct sk_buff *skb = napi->skb;
3430 
3431 	if (!skb) {
3432 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3433 		if (skb)
3434 			napi->skb = skb;
3435 	}
3436 	return skb;
3437 }
3438 EXPORT_SYMBOL(napi_get_frags);
3439 
3440 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3441 			       gro_result_t ret)
3442 {
3443 	switch (ret) {
3444 	case GRO_NORMAL:
3445 	case GRO_HELD:
3446 		skb->protocol = eth_type_trans(skb, skb->dev);
3447 
3448 		if (ret == GRO_HELD)
3449 			skb_gro_pull(skb, -ETH_HLEN);
3450 		else if (netif_receive_skb(skb))
3451 			ret = GRO_DROP;
3452 		break;
3453 
3454 	case GRO_DROP:
3455 	case GRO_MERGED_FREE:
3456 		napi_reuse_skb(napi, skb);
3457 		break;
3458 
3459 	case GRO_MERGED:
3460 		break;
3461 	}
3462 
3463 	return ret;
3464 }
3465 EXPORT_SYMBOL(napi_frags_finish);
3466 
3467 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3468 {
3469 	struct sk_buff *skb = napi->skb;
3470 	struct ethhdr *eth;
3471 	unsigned int hlen;
3472 	unsigned int off;
3473 
3474 	napi->skb = NULL;
3475 
3476 	skb_reset_mac_header(skb);
3477 	skb_gro_reset_offset(skb);
3478 
3479 	off = skb_gro_offset(skb);
3480 	hlen = off + sizeof(*eth);
3481 	eth = skb_gro_header_fast(skb, off);
3482 	if (skb_gro_header_hard(skb, hlen)) {
3483 		eth = skb_gro_header_slow(skb, hlen, off);
3484 		if (unlikely(!eth)) {
3485 			napi_reuse_skb(napi, skb);
3486 			skb = NULL;
3487 			goto out;
3488 		}
3489 	}
3490 
3491 	skb_gro_pull(skb, sizeof(*eth));
3492 
3493 	/*
3494 	 * This works because the only protocols we care about don't require
3495 	 * special handling.  We'll fix it up properly at the end.
3496 	 */
3497 	skb->protocol = eth->h_proto;
3498 
3499 out:
3500 	return skb;
3501 }
3502 EXPORT_SYMBOL(napi_frags_skb);
3503 
3504 gro_result_t napi_gro_frags(struct napi_struct *napi)
3505 {
3506 	struct sk_buff *skb = napi_frags_skb(napi);
3507 
3508 	if (!skb)
3509 		return GRO_DROP;
3510 
3511 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3512 }
3513 EXPORT_SYMBOL(napi_gro_frags);
3514 
3515 /*
3516  * net_rps_action sends any pending IPI's for rps.
3517  * Note: called with local irq disabled, but exits with local irq enabled.
3518  */
3519 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3520 {
3521 #ifdef CONFIG_RPS
3522 	struct softnet_data *remsd = sd->rps_ipi_list;
3523 
3524 	if (remsd) {
3525 		sd->rps_ipi_list = NULL;
3526 
3527 		local_irq_enable();
3528 
3529 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3530 		while (remsd) {
3531 			struct softnet_data *next = remsd->rps_ipi_next;
3532 
3533 			if (cpu_online(remsd->cpu))
3534 				__smp_call_function_single(remsd->cpu,
3535 							   &remsd->csd, 0);
3536 			remsd = next;
3537 		}
3538 	} else
3539 #endif
3540 		local_irq_enable();
3541 }
3542 
3543 static int process_backlog(struct napi_struct *napi, int quota)
3544 {
3545 	int work = 0;
3546 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3547 
3548 #ifdef CONFIG_RPS
3549 	/* Check if we have pending ipi, its better to send them now,
3550 	 * not waiting net_rx_action() end.
3551 	 */
3552 	if (sd->rps_ipi_list) {
3553 		local_irq_disable();
3554 		net_rps_action_and_irq_enable(sd);
3555 	}
3556 #endif
3557 	napi->weight = weight_p;
3558 	local_irq_disable();
3559 	while (work < quota) {
3560 		struct sk_buff *skb;
3561 		unsigned int qlen;
3562 
3563 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3564 			local_irq_enable();
3565 			__netif_receive_skb(skb);
3566 			local_irq_disable();
3567 			input_queue_head_incr(sd);
3568 			if (++work >= quota) {
3569 				local_irq_enable();
3570 				return work;
3571 			}
3572 		}
3573 
3574 		rps_lock(sd);
3575 		qlen = skb_queue_len(&sd->input_pkt_queue);
3576 		if (qlen)
3577 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3578 						   &sd->process_queue);
3579 
3580 		if (qlen < quota - work) {
3581 			/*
3582 			 * Inline a custom version of __napi_complete().
3583 			 * only current cpu owns and manipulates this napi,
3584 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3585 			 * we can use a plain write instead of clear_bit(),
3586 			 * and we dont need an smp_mb() memory barrier.
3587 			 */
3588 			list_del(&napi->poll_list);
3589 			napi->state = 0;
3590 
3591 			quota = work + qlen;
3592 		}
3593 		rps_unlock(sd);
3594 	}
3595 	local_irq_enable();
3596 
3597 	return work;
3598 }
3599 
3600 /**
3601  * __napi_schedule - schedule for receive
3602  * @n: entry to schedule
3603  *
3604  * The entry's receive function will be scheduled to run
3605  */
3606 void __napi_schedule(struct napi_struct *n)
3607 {
3608 	unsigned long flags;
3609 
3610 	local_irq_save(flags);
3611 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3612 	local_irq_restore(flags);
3613 }
3614 EXPORT_SYMBOL(__napi_schedule);
3615 
3616 void __napi_complete(struct napi_struct *n)
3617 {
3618 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3619 	BUG_ON(n->gro_list);
3620 
3621 	list_del(&n->poll_list);
3622 	smp_mb__before_clear_bit();
3623 	clear_bit(NAPI_STATE_SCHED, &n->state);
3624 }
3625 EXPORT_SYMBOL(__napi_complete);
3626 
3627 void napi_complete(struct napi_struct *n)
3628 {
3629 	unsigned long flags;
3630 
3631 	/*
3632 	 * don't let napi dequeue from the cpu poll list
3633 	 * just in case its running on a different cpu
3634 	 */
3635 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3636 		return;
3637 
3638 	napi_gro_flush(n);
3639 	local_irq_save(flags);
3640 	__napi_complete(n);
3641 	local_irq_restore(flags);
3642 }
3643 EXPORT_SYMBOL(napi_complete);
3644 
3645 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3646 		    int (*poll)(struct napi_struct *, int), int weight)
3647 {
3648 	INIT_LIST_HEAD(&napi->poll_list);
3649 	napi->gro_count = 0;
3650 	napi->gro_list = NULL;
3651 	napi->skb = NULL;
3652 	napi->poll = poll;
3653 	napi->weight = weight;
3654 	list_add(&napi->dev_list, &dev->napi_list);
3655 	napi->dev = dev;
3656 #ifdef CONFIG_NETPOLL
3657 	spin_lock_init(&napi->poll_lock);
3658 	napi->poll_owner = -1;
3659 #endif
3660 	set_bit(NAPI_STATE_SCHED, &napi->state);
3661 }
3662 EXPORT_SYMBOL(netif_napi_add);
3663 
3664 void netif_napi_del(struct napi_struct *napi)
3665 {
3666 	struct sk_buff *skb, *next;
3667 
3668 	list_del_init(&napi->dev_list);
3669 	napi_free_frags(napi);
3670 
3671 	for (skb = napi->gro_list; skb; skb = next) {
3672 		next = skb->next;
3673 		skb->next = NULL;
3674 		kfree_skb(skb);
3675 	}
3676 
3677 	napi->gro_list = NULL;
3678 	napi->gro_count = 0;
3679 }
3680 EXPORT_SYMBOL(netif_napi_del);
3681 
3682 static void net_rx_action(struct softirq_action *h)
3683 {
3684 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3685 	unsigned long time_limit = jiffies + 2;
3686 	int budget = netdev_budget;
3687 	void *have;
3688 
3689 	local_irq_disable();
3690 
3691 	while (!list_empty(&sd->poll_list)) {
3692 		struct napi_struct *n;
3693 		int work, weight;
3694 
3695 		/* If softirq window is exhuasted then punt.
3696 		 * Allow this to run for 2 jiffies since which will allow
3697 		 * an average latency of 1.5/HZ.
3698 		 */
3699 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3700 			goto softnet_break;
3701 
3702 		local_irq_enable();
3703 
3704 		/* Even though interrupts have been re-enabled, this
3705 		 * access is safe because interrupts can only add new
3706 		 * entries to the tail of this list, and only ->poll()
3707 		 * calls can remove this head entry from the list.
3708 		 */
3709 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3710 
3711 		have = netpoll_poll_lock(n);
3712 
3713 		weight = n->weight;
3714 
3715 		/* This NAPI_STATE_SCHED test is for avoiding a race
3716 		 * with netpoll's poll_napi().  Only the entity which
3717 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3718 		 * actually make the ->poll() call.  Therefore we avoid
3719 		 * accidently calling ->poll() when NAPI is not scheduled.
3720 		 */
3721 		work = 0;
3722 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3723 			work = n->poll(n, weight);
3724 			trace_napi_poll(n);
3725 		}
3726 
3727 		WARN_ON_ONCE(work > weight);
3728 
3729 		budget -= work;
3730 
3731 		local_irq_disable();
3732 
3733 		/* Drivers must not modify the NAPI state if they
3734 		 * consume the entire weight.  In such cases this code
3735 		 * still "owns" the NAPI instance and therefore can
3736 		 * move the instance around on the list at-will.
3737 		 */
3738 		if (unlikely(work == weight)) {
3739 			if (unlikely(napi_disable_pending(n))) {
3740 				local_irq_enable();
3741 				napi_complete(n);
3742 				local_irq_disable();
3743 			} else
3744 				list_move_tail(&n->poll_list, &sd->poll_list);
3745 		}
3746 
3747 		netpoll_poll_unlock(have);
3748 	}
3749 out:
3750 	net_rps_action_and_irq_enable(sd);
3751 
3752 #ifdef CONFIG_NET_DMA
3753 	/*
3754 	 * There may not be any more sk_buffs coming right now, so push
3755 	 * any pending DMA copies to hardware
3756 	 */
3757 	dma_issue_pending_all();
3758 #endif
3759 
3760 	return;
3761 
3762 softnet_break:
3763 	sd->time_squeeze++;
3764 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3765 	goto out;
3766 }
3767 
3768 static gifconf_func_t *gifconf_list[NPROTO];
3769 
3770 /**
3771  *	register_gifconf	-	register a SIOCGIF handler
3772  *	@family: Address family
3773  *	@gifconf: Function handler
3774  *
3775  *	Register protocol dependent address dumping routines. The handler
3776  *	that is passed must not be freed or reused until it has been replaced
3777  *	by another handler.
3778  */
3779 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3780 {
3781 	if (family >= NPROTO)
3782 		return -EINVAL;
3783 	gifconf_list[family] = gifconf;
3784 	return 0;
3785 }
3786 EXPORT_SYMBOL(register_gifconf);
3787 
3788 
3789 /*
3790  *	Map an interface index to its name (SIOCGIFNAME)
3791  */
3792 
3793 /*
3794  *	We need this ioctl for efficient implementation of the
3795  *	if_indextoname() function required by the IPv6 API.  Without
3796  *	it, we would have to search all the interfaces to find a
3797  *	match.  --pb
3798  */
3799 
3800 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3801 {
3802 	struct net_device *dev;
3803 	struct ifreq ifr;
3804 
3805 	/*
3806 	 *	Fetch the caller's info block.
3807 	 */
3808 
3809 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3810 		return -EFAULT;
3811 
3812 	rcu_read_lock();
3813 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3814 	if (!dev) {
3815 		rcu_read_unlock();
3816 		return -ENODEV;
3817 	}
3818 
3819 	strcpy(ifr.ifr_name, dev->name);
3820 	rcu_read_unlock();
3821 
3822 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3823 		return -EFAULT;
3824 	return 0;
3825 }
3826 
3827 /*
3828  *	Perform a SIOCGIFCONF call. This structure will change
3829  *	size eventually, and there is nothing I can do about it.
3830  *	Thus we will need a 'compatibility mode'.
3831  */
3832 
3833 static int dev_ifconf(struct net *net, char __user *arg)
3834 {
3835 	struct ifconf ifc;
3836 	struct net_device *dev;
3837 	char __user *pos;
3838 	int len;
3839 	int total;
3840 	int i;
3841 
3842 	/*
3843 	 *	Fetch the caller's info block.
3844 	 */
3845 
3846 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3847 		return -EFAULT;
3848 
3849 	pos = ifc.ifc_buf;
3850 	len = ifc.ifc_len;
3851 
3852 	/*
3853 	 *	Loop over the interfaces, and write an info block for each.
3854 	 */
3855 
3856 	total = 0;
3857 	for_each_netdev(net, dev) {
3858 		for (i = 0; i < NPROTO; i++) {
3859 			if (gifconf_list[i]) {
3860 				int done;
3861 				if (!pos)
3862 					done = gifconf_list[i](dev, NULL, 0);
3863 				else
3864 					done = gifconf_list[i](dev, pos + total,
3865 							       len - total);
3866 				if (done < 0)
3867 					return -EFAULT;
3868 				total += done;
3869 			}
3870 		}
3871 	}
3872 
3873 	/*
3874 	 *	All done.  Write the updated control block back to the caller.
3875 	 */
3876 	ifc.ifc_len = total;
3877 
3878 	/*
3879 	 * 	Both BSD and Solaris return 0 here, so we do too.
3880 	 */
3881 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3882 }
3883 
3884 #ifdef CONFIG_PROC_FS
3885 /*
3886  *	This is invoked by the /proc filesystem handler to display a device
3887  *	in detail.
3888  */
3889 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3890 	__acquires(RCU)
3891 {
3892 	struct net *net = seq_file_net(seq);
3893 	loff_t off;
3894 	struct net_device *dev;
3895 
3896 	rcu_read_lock();
3897 	if (!*pos)
3898 		return SEQ_START_TOKEN;
3899 
3900 	off = 1;
3901 	for_each_netdev_rcu(net, dev)
3902 		if (off++ == *pos)
3903 			return dev;
3904 
3905 	return NULL;
3906 }
3907 
3908 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3909 {
3910 	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3911 				  first_net_device(seq_file_net(seq)) :
3912 				  next_net_device((struct net_device *)v);
3913 
3914 	++*pos;
3915 	return rcu_dereference(dev);
3916 }
3917 
3918 void dev_seq_stop(struct seq_file *seq, void *v)
3919 	__releases(RCU)
3920 {
3921 	rcu_read_unlock();
3922 }
3923 
3924 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3925 {
3926 	struct rtnl_link_stats64 temp;
3927 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3928 
3929 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3930 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3931 		   dev->name, stats->rx_bytes, stats->rx_packets,
3932 		   stats->rx_errors,
3933 		   stats->rx_dropped + stats->rx_missed_errors,
3934 		   stats->rx_fifo_errors,
3935 		   stats->rx_length_errors + stats->rx_over_errors +
3936 		    stats->rx_crc_errors + stats->rx_frame_errors,
3937 		   stats->rx_compressed, stats->multicast,
3938 		   stats->tx_bytes, stats->tx_packets,
3939 		   stats->tx_errors, stats->tx_dropped,
3940 		   stats->tx_fifo_errors, stats->collisions,
3941 		   stats->tx_carrier_errors +
3942 		    stats->tx_aborted_errors +
3943 		    stats->tx_window_errors +
3944 		    stats->tx_heartbeat_errors,
3945 		   stats->tx_compressed);
3946 }
3947 
3948 /*
3949  *	Called from the PROCfs module. This now uses the new arbitrary sized
3950  *	/proc/net interface to create /proc/net/dev
3951  */
3952 static int dev_seq_show(struct seq_file *seq, void *v)
3953 {
3954 	if (v == SEQ_START_TOKEN)
3955 		seq_puts(seq, "Inter-|   Receive                            "
3956 			      "                    |  Transmit\n"
3957 			      " face |bytes    packets errs drop fifo frame "
3958 			      "compressed multicast|bytes    packets errs "
3959 			      "drop fifo colls carrier compressed\n");
3960 	else
3961 		dev_seq_printf_stats(seq, v);
3962 	return 0;
3963 }
3964 
3965 static struct softnet_data *softnet_get_online(loff_t *pos)
3966 {
3967 	struct softnet_data *sd = NULL;
3968 
3969 	while (*pos < nr_cpu_ids)
3970 		if (cpu_online(*pos)) {
3971 			sd = &per_cpu(softnet_data, *pos);
3972 			break;
3973 		} else
3974 			++*pos;
3975 	return sd;
3976 }
3977 
3978 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3979 {
3980 	return softnet_get_online(pos);
3981 }
3982 
3983 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3984 {
3985 	++*pos;
3986 	return softnet_get_online(pos);
3987 }
3988 
3989 static void softnet_seq_stop(struct seq_file *seq, void *v)
3990 {
3991 }
3992 
3993 static int softnet_seq_show(struct seq_file *seq, void *v)
3994 {
3995 	struct softnet_data *sd = v;
3996 
3997 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3998 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
3999 		   0, 0, 0, 0, /* was fastroute */
4000 		   sd->cpu_collision, sd->received_rps);
4001 	return 0;
4002 }
4003 
4004 static const struct seq_operations dev_seq_ops = {
4005 	.start = dev_seq_start,
4006 	.next  = dev_seq_next,
4007 	.stop  = dev_seq_stop,
4008 	.show  = dev_seq_show,
4009 };
4010 
4011 static int dev_seq_open(struct inode *inode, struct file *file)
4012 {
4013 	return seq_open_net(inode, file, &dev_seq_ops,
4014 			    sizeof(struct seq_net_private));
4015 }
4016 
4017 static const struct file_operations dev_seq_fops = {
4018 	.owner	 = THIS_MODULE,
4019 	.open    = dev_seq_open,
4020 	.read    = seq_read,
4021 	.llseek  = seq_lseek,
4022 	.release = seq_release_net,
4023 };
4024 
4025 static const struct seq_operations softnet_seq_ops = {
4026 	.start = softnet_seq_start,
4027 	.next  = softnet_seq_next,
4028 	.stop  = softnet_seq_stop,
4029 	.show  = softnet_seq_show,
4030 };
4031 
4032 static int softnet_seq_open(struct inode *inode, struct file *file)
4033 {
4034 	return seq_open(file, &softnet_seq_ops);
4035 }
4036 
4037 static const struct file_operations softnet_seq_fops = {
4038 	.owner	 = THIS_MODULE,
4039 	.open    = softnet_seq_open,
4040 	.read    = seq_read,
4041 	.llseek  = seq_lseek,
4042 	.release = seq_release,
4043 };
4044 
4045 static void *ptype_get_idx(loff_t pos)
4046 {
4047 	struct packet_type *pt = NULL;
4048 	loff_t i = 0;
4049 	int t;
4050 
4051 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4052 		if (i == pos)
4053 			return pt;
4054 		++i;
4055 	}
4056 
4057 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4058 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4059 			if (i == pos)
4060 				return pt;
4061 			++i;
4062 		}
4063 	}
4064 	return NULL;
4065 }
4066 
4067 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4068 	__acquires(RCU)
4069 {
4070 	rcu_read_lock();
4071 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4072 }
4073 
4074 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4075 {
4076 	struct packet_type *pt;
4077 	struct list_head *nxt;
4078 	int hash;
4079 
4080 	++*pos;
4081 	if (v == SEQ_START_TOKEN)
4082 		return ptype_get_idx(0);
4083 
4084 	pt = v;
4085 	nxt = pt->list.next;
4086 	if (pt->type == htons(ETH_P_ALL)) {
4087 		if (nxt != &ptype_all)
4088 			goto found;
4089 		hash = 0;
4090 		nxt = ptype_base[0].next;
4091 	} else
4092 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4093 
4094 	while (nxt == &ptype_base[hash]) {
4095 		if (++hash >= PTYPE_HASH_SIZE)
4096 			return NULL;
4097 		nxt = ptype_base[hash].next;
4098 	}
4099 found:
4100 	return list_entry(nxt, struct packet_type, list);
4101 }
4102 
4103 static void ptype_seq_stop(struct seq_file *seq, void *v)
4104 	__releases(RCU)
4105 {
4106 	rcu_read_unlock();
4107 }
4108 
4109 static int ptype_seq_show(struct seq_file *seq, void *v)
4110 {
4111 	struct packet_type *pt = v;
4112 
4113 	if (v == SEQ_START_TOKEN)
4114 		seq_puts(seq, "Type Device      Function\n");
4115 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4116 		if (pt->type == htons(ETH_P_ALL))
4117 			seq_puts(seq, "ALL ");
4118 		else
4119 			seq_printf(seq, "%04x", ntohs(pt->type));
4120 
4121 		seq_printf(seq, " %-8s %pF\n",
4122 			   pt->dev ? pt->dev->name : "", pt->func);
4123 	}
4124 
4125 	return 0;
4126 }
4127 
4128 static const struct seq_operations ptype_seq_ops = {
4129 	.start = ptype_seq_start,
4130 	.next  = ptype_seq_next,
4131 	.stop  = ptype_seq_stop,
4132 	.show  = ptype_seq_show,
4133 };
4134 
4135 static int ptype_seq_open(struct inode *inode, struct file *file)
4136 {
4137 	return seq_open_net(inode, file, &ptype_seq_ops,
4138 			sizeof(struct seq_net_private));
4139 }
4140 
4141 static const struct file_operations ptype_seq_fops = {
4142 	.owner	 = THIS_MODULE,
4143 	.open    = ptype_seq_open,
4144 	.read    = seq_read,
4145 	.llseek  = seq_lseek,
4146 	.release = seq_release_net,
4147 };
4148 
4149 
4150 static int __net_init dev_proc_net_init(struct net *net)
4151 {
4152 	int rc = -ENOMEM;
4153 
4154 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4155 		goto out;
4156 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4157 		goto out_dev;
4158 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4159 		goto out_softnet;
4160 
4161 	if (wext_proc_init(net))
4162 		goto out_ptype;
4163 	rc = 0;
4164 out:
4165 	return rc;
4166 out_ptype:
4167 	proc_net_remove(net, "ptype");
4168 out_softnet:
4169 	proc_net_remove(net, "softnet_stat");
4170 out_dev:
4171 	proc_net_remove(net, "dev");
4172 	goto out;
4173 }
4174 
4175 static void __net_exit dev_proc_net_exit(struct net *net)
4176 {
4177 	wext_proc_exit(net);
4178 
4179 	proc_net_remove(net, "ptype");
4180 	proc_net_remove(net, "softnet_stat");
4181 	proc_net_remove(net, "dev");
4182 }
4183 
4184 static struct pernet_operations __net_initdata dev_proc_ops = {
4185 	.init = dev_proc_net_init,
4186 	.exit = dev_proc_net_exit,
4187 };
4188 
4189 static int __init dev_proc_init(void)
4190 {
4191 	return register_pernet_subsys(&dev_proc_ops);
4192 }
4193 #else
4194 #define dev_proc_init() 0
4195 #endif	/* CONFIG_PROC_FS */
4196 
4197 
4198 /**
4199  *	netdev_set_master	-	set up master/slave pair
4200  *	@slave: slave device
4201  *	@master: new master device
4202  *
4203  *	Changes the master device of the slave. Pass %NULL to break the
4204  *	bonding. The caller must hold the RTNL semaphore. On a failure
4205  *	a negative errno code is returned. On success the reference counts
4206  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4207  *	function returns zero.
4208  */
4209 int netdev_set_master(struct net_device *slave, struct net_device *master)
4210 {
4211 	struct net_device *old = slave->master;
4212 
4213 	ASSERT_RTNL();
4214 
4215 	if (master) {
4216 		if (old)
4217 			return -EBUSY;
4218 		dev_hold(master);
4219 	}
4220 
4221 	slave->master = master;
4222 
4223 	if (old) {
4224 		synchronize_net();
4225 		dev_put(old);
4226 	}
4227 	if (master)
4228 		slave->flags |= IFF_SLAVE;
4229 	else
4230 		slave->flags &= ~IFF_SLAVE;
4231 
4232 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4233 	return 0;
4234 }
4235 EXPORT_SYMBOL(netdev_set_master);
4236 
4237 static void dev_change_rx_flags(struct net_device *dev, int flags)
4238 {
4239 	const struct net_device_ops *ops = dev->netdev_ops;
4240 
4241 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4242 		ops->ndo_change_rx_flags(dev, flags);
4243 }
4244 
4245 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4246 {
4247 	unsigned short old_flags = dev->flags;
4248 	uid_t uid;
4249 	gid_t gid;
4250 
4251 	ASSERT_RTNL();
4252 
4253 	dev->flags |= IFF_PROMISC;
4254 	dev->promiscuity += inc;
4255 	if (dev->promiscuity == 0) {
4256 		/*
4257 		 * Avoid overflow.
4258 		 * If inc causes overflow, untouch promisc and return error.
4259 		 */
4260 		if (inc < 0)
4261 			dev->flags &= ~IFF_PROMISC;
4262 		else {
4263 			dev->promiscuity -= inc;
4264 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4265 				"set promiscuity failed, promiscuity feature "
4266 				"of device might be broken.\n", dev->name);
4267 			return -EOVERFLOW;
4268 		}
4269 	}
4270 	if (dev->flags != old_flags) {
4271 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4272 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4273 							       "left");
4274 		if (audit_enabled) {
4275 			current_uid_gid(&uid, &gid);
4276 			audit_log(current->audit_context, GFP_ATOMIC,
4277 				AUDIT_ANOM_PROMISCUOUS,
4278 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4279 				dev->name, (dev->flags & IFF_PROMISC),
4280 				(old_flags & IFF_PROMISC),
4281 				audit_get_loginuid(current),
4282 				uid, gid,
4283 				audit_get_sessionid(current));
4284 		}
4285 
4286 		dev_change_rx_flags(dev, IFF_PROMISC);
4287 	}
4288 	return 0;
4289 }
4290 
4291 /**
4292  *	dev_set_promiscuity	- update promiscuity count on a device
4293  *	@dev: device
4294  *	@inc: modifier
4295  *
4296  *	Add or remove promiscuity from a device. While the count in the device
4297  *	remains above zero the interface remains promiscuous. Once it hits zero
4298  *	the device reverts back to normal filtering operation. A negative inc
4299  *	value is used to drop promiscuity on the device.
4300  *	Return 0 if successful or a negative errno code on error.
4301  */
4302 int dev_set_promiscuity(struct net_device *dev, int inc)
4303 {
4304 	unsigned short old_flags = dev->flags;
4305 	int err;
4306 
4307 	err = __dev_set_promiscuity(dev, inc);
4308 	if (err < 0)
4309 		return err;
4310 	if (dev->flags != old_flags)
4311 		dev_set_rx_mode(dev);
4312 	return err;
4313 }
4314 EXPORT_SYMBOL(dev_set_promiscuity);
4315 
4316 /**
4317  *	dev_set_allmulti	- update allmulti count on a device
4318  *	@dev: device
4319  *	@inc: modifier
4320  *
4321  *	Add or remove reception of all multicast frames to a device. While the
4322  *	count in the device remains above zero the interface remains listening
4323  *	to all interfaces. Once it hits zero the device reverts back to normal
4324  *	filtering operation. A negative @inc value is used to drop the counter
4325  *	when releasing a resource needing all multicasts.
4326  *	Return 0 if successful or a negative errno code on error.
4327  */
4328 
4329 int dev_set_allmulti(struct net_device *dev, int inc)
4330 {
4331 	unsigned short old_flags = dev->flags;
4332 
4333 	ASSERT_RTNL();
4334 
4335 	dev->flags |= IFF_ALLMULTI;
4336 	dev->allmulti += inc;
4337 	if (dev->allmulti == 0) {
4338 		/*
4339 		 * Avoid overflow.
4340 		 * If inc causes overflow, untouch allmulti and return error.
4341 		 */
4342 		if (inc < 0)
4343 			dev->flags &= ~IFF_ALLMULTI;
4344 		else {
4345 			dev->allmulti -= inc;
4346 			printk(KERN_WARNING "%s: allmulti touches roof, "
4347 				"set allmulti failed, allmulti feature of "
4348 				"device might be broken.\n", dev->name);
4349 			return -EOVERFLOW;
4350 		}
4351 	}
4352 	if (dev->flags ^ old_flags) {
4353 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4354 		dev_set_rx_mode(dev);
4355 	}
4356 	return 0;
4357 }
4358 EXPORT_SYMBOL(dev_set_allmulti);
4359 
4360 /*
4361  *	Upload unicast and multicast address lists to device and
4362  *	configure RX filtering. When the device doesn't support unicast
4363  *	filtering it is put in promiscuous mode while unicast addresses
4364  *	are present.
4365  */
4366 void __dev_set_rx_mode(struct net_device *dev)
4367 {
4368 	const struct net_device_ops *ops = dev->netdev_ops;
4369 
4370 	/* dev_open will call this function so the list will stay sane. */
4371 	if (!(dev->flags&IFF_UP))
4372 		return;
4373 
4374 	if (!netif_device_present(dev))
4375 		return;
4376 
4377 	if (ops->ndo_set_rx_mode)
4378 		ops->ndo_set_rx_mode(dev);
4379 	else {
4380 		/* Unicast addresses changes may only happen under the rtnl,
4381 		 * therefore calling __dev_set_promiscuity here is safe.
4382 		 */
4383 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4384 			__dev_set_promiscuity(dev, 1);
4385 			dev->uc_promisc = 1;
4386 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4387 			__dev_set_promiscuity(dev, -1);
4388 			dev->uc_promisc = 0;
4389 		}
4390 
4391 		if (ops->ndo_set_multicast_list)
4392 			ops->ndo_set_multicast_list(dev);
4393 	}
4394 }
4395 
4396 void dev_set_rx_mode(struct net_device *dev)
4397 {
4398 	netif_addr_lock_bh(dev);
4399 	__dev_set_rx_mode(dev);
4400 	netif_addr_unlock_bh(dev);
4401 }
4402 
4403 /**
4404  *	dev_get_flags - get flags reported to userspace
4405  *	@dev: device
4406  *
4407  *	Get the combination of flag bits exported through APIs to userspace.
4408  */
4409 unsigned dev_get_flags(const struct net_device *dev)
4410 {
4411 	unsigned flags;
4412 
4413 	flags = (dev->flags & ~(IFF_PROMISC |
4414 				IFF_ALLMULTI |
4415 				IFF_RUNNING |
4416 				IFF_LOWER_UP |
4417 				IFF_DORMANT)) |
4418 		(dev->gflags & (IFF_PROMISC |
4419 				IFF_ALLMULTI));
4420 
4421 	if (netif_running(dev)) {
4422 		if (netif_oper_up(dev))
4423 			flags |= IFF_RUNNING;
4424 		if (netif_carrier_ok(dev))
4425 			flags |= IFF_LOWER_UP;
4426 		if (netif_dormant(dev))
4427 			flags |= IFF_DORMANT;
4428 	}
4429 
4430 	return flags;
4431 }
4432 EXPORT_SYMBOL(dev_get_flags);
4433 
4434 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4435 {
4436 	int old_flags = dev->flags;
4437 	int ret;
4438 
4439 	ASSERT_RTNL();
4440 
4441 	/*
4442 	 *	Set the flags on our device.
4443 	 */
4444 
4445 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4446 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4447 			       IFF_AUTOMEDIA)) |
4448 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4449 				    IFF_ALLMULTI));
4450 
4451 	/*
4452 	 *	Load in the correct multicast list now the flags have changed.
4453 	 */
4454 
4455 	if ((old_flags ^ flags) & IFF_MULTICAST)
4456 		dev_change_rx_flags(dev, IFF_MULTICAST);
4457 
4458 	dev_set_rx_mode(dev);
4459 
4460 	/*
4461 	 *	Have we downed the interface. We handle IFF_UP ourselves
4462 	 *	according to user attempts to set it, rather than blindly
4463 	 *	setting it.
4464 	 */
4465 
4466 	ret = 0;
4467 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4468 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4469 
4470 		if (!ret)
4471 			dev_set_rx_mode(dev);
4472 	}
4473 
4474 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4475 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4476 
4477 		dev->gflags ^= IFF_PROMISC;
4478 		dev_set_promiscuity(dev, inc);
4479 	}
4480 
4481 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4482 	   is important. Some (broken) drivers set IFF_PROMISC, when
4483 	   IFF_ALLMULTI is requested not asking us and not reporting.
4484 	 */
4485 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4486 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4487 
4488 		dev->gflags ^= IFF_ALLMULTI;
4489 		dev_set_allmulti(dev, inc);
4490 	}
4491 
4492 	return ret;
4493 }
4494 
4495 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4496 {
4497 	unsigned int changes = dev->flags ^ old_flags;
4498 
4499 	if (changes & IFF_UP) {
4500 		if (dev->flags & IFF_UP)
4501 			call_netdevice_notifiers(NETDEV_UP, dev);
4502 		else
4503 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4504 	}
4505 
4506 	if (dev->flags & IFF_UP &&
4507 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4508 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4509 }
4510 
4511 /**
4512  *	dev_change_flags - change device settings
4513  *	@dev: device
4514  *	@flags: device state flags
4515  *
4516  *	Change settings on device based state flags. The flags are
4517  *	in the userspace exported format.
4518  */
4519 int dev_change_flags(struct net_device *dev, unsigned flags)
4520 {
4521 	int ret, changes;
4522 	int old_flags = dev->flags;
4523 
4524 	ret = __dev_change_flags(dev, flags);
4525 	if (ret < 0)
4526 		return ret;
4527 
4528 	changes = old_flags ^ dev->flags;
4529 	if (changes)
4530 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4531 
4532 	__dev_notify_flags(dev, old_flags);
4533 	return ret;
4534 }
4535 EXPORT_SYMBOL(dev_change_flags);
4536 
4537 /**
4538  *	dev_set_mtu - Change maximum transfer unit
4539  *	@dev: device
4540  *	@new_mtu: new transfer unit
4541  *
4542  *	Change the maximum transfer size of the network device.
4543  */
4544 int dev_set_mtu(struct net_device *dev, int new_mtu)
4545 {
4546 	const struct net_device_ops *ops = dev->netdev_ops;
4547 	int err;
4548 
4549 	if (new_mtu == dev->mtu)
4550 		return 0;
4551 
4552 	/*	MTU must be positive.	 */
4553 	if (new_mtu < 0)
4554 		return -EINVAL;
4555 
4556 	if (!netif_device_present(dev))
4557 		return -ENODEV;
4558 
4559 	err = 0;
4560 	if (ops->ndo_change_mtu)
4561 		err = ops->ndo_change_mtu(dev, new_mtu);
4562 	else
4563 		dev->mtu = new_mtu;
4564 
4565 	if (!err && dev->flags & IFF_UP)
4566 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4567 	return err;
4568 }
4569 EXPORT_SYMBOL(dev_set_mtu);
4570 
4571 /**
4572  *	dev_set_mac_address - Change Media Access Control Address
4573  *	@dev: device
4574  *	@sa: new address
4575  *
4576  *	Change the hardware (MAC) address of the device
4577  */
4578 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4579 {
4580 	const struct net_device_ops *ops = dev->netdev_ops;
4581 	int err;
4582 
4583 	if (!ops->ndo_set_mac_address)
4584 		return -EOPNOTSUPP;
4585 	if (sa->sa_family != dev->type)
4586 		return -EINVAL;
4587 	if (!netif_device_present(dev))
4588 		return -ENODEV;
4589 	err = ops->ndo_set_mac_address(dev, sa);
4590 	if (!err)
4591 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4592 	return err;
4593 }
4594 EXPORT_SYMBOL(dev_set_mac_address);
4595 
4596 /*
4597  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4598  */
4599 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4600 {
4601 	int err;
4602 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4603 
4604 	if (!dev)
4605 		return -ENODEV;
4606 
4607 	switch (cmd) {
4608 	case SIOCGIFFLAGS:	/* Get interface flags */
4609 		ifr->ifr_flags = (short) dev_get_flags(dev);
4610 		return 0;
4611 
4612 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4613 				   (currently unused) */
4614 		ifr->ifr_metric = 0;
4615 		return 0;
4616 
4617 	case SIOCGIFMTU:	/* Get the MTU of a device */
4618 		ifr->ifr_mtu = dev->mtu;
4619 		return 0;
4620 
4621 	case SIOCGIFHWADDR:
4622 		if (!dev->addr_len)
4623 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4624 		else
4625 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4626 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4627 		ifr->ifr_hwaddr.sa_family = dev->type;
4628 		return 0;
4629 
4630 	case SIOCGIFSLAVE:
4631 		err = -EINVAL;
4632 		break;
4633 
4634 	case SIOCGIFMAP:
4635 		ifr->ifr_map.mem_start = dev->mem_start;
4636 		ifr->ifr_map.mem_end   = dev->mem_end;
4637 		ifr->ifr_map.base_addr = dev->base_addr;
4638 		ifr->ifr_map.irq       = dev->irq;
4639 		ifr->ifr_map.dma       = dev->dma;
4640 		ifr->ifr_map.port      = dev->if_port;
4641 		return 0;
4642 
4643 	case SIOCGIFINDEX:
4644 		ifr->ifr_ifindex = dev->ifindex;
4645 		return 0;
4646 
4647 	case SIOCGIFTXQLEN:
4648 		ifr->ifr_qlen = dev->tx_queue_len;
4649 		return 0;
4650 
4651 	default:
4652 		/* dev_ioctl() should ensure this case
4653 		 * is never reached
4654 		 */
4655 		WARN_ON(1);
4656 		err = -EINVAL;
4657 		break;
4658 
4659 	}
4660 	return err;
4661 }
4662 
4663 /*
4664  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4665  */
4666 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4667 {
4668 	int err;
4669 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4670 	const struct net_device_ops *ops;
4671 
4672 	if (!dev)
4673 		return -ENODEV;
4674 
4675 	ops = dev->netdev_ops;
4676 
4677 	switch (cmd) {
4678 	case SIOCSIFFLAGS:	/* Set interface flags */
4679 		return dev_change_flags(dev, ifr->ifr_flags);
4680 
4681 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4682 				   (currently unused) */
4683 		return -EOPNOTSUPP;
4684 
4685 	case SIOCSIFMTU:	/* Set the MTU of a device */
4686 		return dev_set_mtu(dev, ifr->ifr_mtu);
4687 
4688 	case SIOCSIFHWADDR:
4689 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4690 
4691 	case SIOCSIFHWBROADCAST:
4692 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4693 			return -EINVAL;
4694 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4695 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4696 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4697 		return 0;
4698 
4699 	case SIOCSIFMAP:
4700 		if (ops->ndo_set_config) {
4701 			if (!netif_device_present(dev))
4702 				return -ENODEV;
4703 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4704 		}
4705 		return -EOPNOTSUPP;
4706 
4707 	case SIOCADDMULTI:
4708 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4709 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4710 			return -EINVAL;
4711 		if (!netif_device_present(dev))
4712 			return -ENODEV;
4713 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4714 
4715 	case SIOCDELMULTI:
4716 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4717 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4718 			return -EINVAL;
4719 		if (!netif_device_present(dev))
4720 			return -ENODEV;
4721 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4722 
4723 	case SIOCSIFTXQLEN:
4724 		if (ifr->ifr_qlen < 0)
4725 			return -EINVAL;
4726 		dev->tx_queue_len = ifr->ifr_qlen;
4727 		return 0;
4728 
4729 	case SIOCSIFNAME:
4730 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4731 		return dev_change_name(dev, ifr->ifr_newname);
4732 
4733 	/*
4734 	 *	Unknown or private ioctl
4735 	 */
4736 	default:
4737 		if ((cmd >= SIOCDEVPRIVATE &&
4738 		    cmd <= SIOCDEVPRIVATE + 15) ||
4739 		    cmd == SIOCBONDENSLAVE ||
4740 		    cmd == SIOCBONDRELEASE ||
4741 		    cmd == SIOCBONDSETHWADDR ||
4742 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4743 		    cmd == SIOCBONDINFOQUERY ||
4744 		    cmd == SIOCBONDCHANGEACTIVE ||
4745 		    cmd == SIOCGMIIPHY ||
4746 		    cmd == SIOCGMIIREG ||
4747 		    cmd == SIOCSMIIREG ||
4748 		    cmd == SIOCBRADDIF ||
4749 		    cmd == SIOCBRDELIF ||
4750 		    cmd == SIOCSHWTSTAMP ||
4751 		    cmd == SIOCWANDEV) {
4752 			err = -EOPNOTSUPP;
4753 			if (ops->ndo_do_ioctl) {
4754 				if (netif_device_present(dev))
4755 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4756 				else
4757 					err = -ENODEV;
4758 			}
4759 		} else
4760 			err = -EINVAL;
4761 
4762 	}
4763 	return err;
4764 }
4765 
4766 /*
4767  *	This function handles all "interface"-type I/O control requests. The actual
4768  *	'doing' part of this is dev_ifsioc above.
4769  */
4770 
4771 /**
4772  *	dev_ioctl	-	network device ioctl
4773  *	@net: the applicable net namespace
4774  *	@cmd: command to issue
4775  *	@arg: pointer to a struct ifreq in user space
4776  *
4777  *	Issue ioctl functions to devices. This is normally called by the
4778  *	user space syscall interfaces but can sometimes be useful for
4779  *	other purposes. The return value is the return from the syscall if
4780  *	positive or a negative errno code on error.
4781  */
4782 
4783 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4784 {
4785 	struct ifreq ifr;
4786 	int ret;
4787 	char *colon;
4788 
4789 	/* One special case: SIOCGIFCONF takes ifconf argument
4790 	   and requires shared lock, because it sleeps writing
4791 	   to user space.
4792 	 */
4793 
4794 	if (cmd == SIOCGIFCONF) {
4795 		rtnl_lock();
4796 		ret = dev_ifconf(net, (char __user *) arg);
4797 		rtnl_unlock();
4798 		return ret;
4799 	}
4800 	if (cmd == SIOCGIFNAME)
4801 		return dev_ifname(net, (struct ifreq __user *)arg);
4802 
4803 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4804 		return -EFAULT;
4805 
4806 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4807 
4808 	colon = strchr(ifr.ifr_name, ':');
4809 	if (colon)
4810 		*colon = 0;
4811 
4812 	/*
4813 	 *	See which interface the caller is talking about.
4814 	 */
4815 
4816 	switch (cmd) {
4817 	/*
4818 	 *	These ioctl calls:
4819 	 *	- can be done by all.
4820 	 *	- atomic and do not require locking.
4821 	 *	- return a value
4822 	 */
4823 	case SIOCGIFFLAGS:
4824 	case SIOCGIFMETRIC:
4825 	case SIOCGIFMTU:
4826 	case SIOCGIFHWADDR:
4827 	case SIOCGIFSLAVE:
4828 	case SIOCGIFMAP:
4829 	case SIOCGIFINDEX:
4830 	case SIOCGIFTXQLEN:
4831 		dev_load(net, ifr.ifr_name);
4832 		rcu_read_lock();
4833 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4834 		rcu_read_unlock();
4835 		if (!ret) {
4836 			if (colon)
4837 				*colon = ':';
4838 			if (copy_to_user(arg, &ifr,
4839 					 sizeof(struct ifreq)))
4840 				ret = -EFAULT;
4841 		}
4842 		return ret;
4843 
4844 	case SIOCETHTOOL:
4845 		dev_load(net, ifr.ifr_name);
4846 		rtnl_lock();
4847 		ret = dev_ethtool(net, &ifr);
4848 		rtnl_unlock();
4849 		if (!ret) {
4850 			if (colon)
4851 				*colon = ':';
4852 			if (copy_to_user(arg, &ifr,
4853 					 sizeof(struct ifreq)))
4854 				ret = -EFAULT;
4855 		}
4856 		return ret;
4857 
4858 	/*
4859 	 *	These ioctl calls:
4860 	 *	- require superuser power.
4861 	 *	- require strict serialization.
4862 	 *	- return a value
4863 	 */
4864 	case SIOCGMIIPHY:
4865 	case SIOCGMIIREG:
4866 	case SIOCSIFNAME:
4867 		if (!capable(CAP_NET_ADMIN))
4868 			return -EPERM;
4869 		dev_load(net, ifr.ifr_name);
4870 		rtnl_lock();
4871 		ret = dev_ifsioc(net, &ifr, cmd);
4872 		rtnl_unlock();
4873 		if (!ret) {
4874 			if (colon)
4875 				*colon = ':';
4876 			if (copy_to_user(arg, &ifr,
4877 					 sizeof(struct ifreq)))
4878 				ret = -EFAULT;
4879 		}
4880 		return ret;
4881 
4882 	/*
4883 	 *	These ioctl calls:
4884 	 *	- require superuser power.
4885 	 *	- require strict serialization.
4886 	 *	- do not return a value
4887 	 */
4888 	case SIOCSIFFLAGS:
4889 	case SIOCSIFMETRIC:
4890 	case SIOCSIFMTU:
4891 	case SIOCSIFMAP:
4892 	case SIOCSIFHWADDR:
4893 	case SIOCSIFSLAVE:
4894 	case SIOCADDMULTI:
4895 	case SIOCDELMULTI:
4896 	case SIOCSIFHWBROADCAST:
4897 	case SIOCSIFTXQLEN:
4898 	case SIOCSMIIREG:
4899 	case SIOCBONDENSLAVE:
4900 	case SIOCBONDRELEASE:
4901 	case SIOCBONDSETHWADDR:
4902 	case SIOCBONDCHANGEACTIVE:
4903 	case SIOCBRADDIF:
4904 	case SIOCBRDELIF:
4905 	case SIOCSHWTSTAMP:
4906 		if (!capable(CAP_NET_ADMIN))
4907 			return -EPERM;
4908 		/* fall through */
4909 	case SIOCBONDSLAVEINFOQUERY:
4910 	case SIOCBONDINFOQUERY:
4911 		dev_load(net, ifr.ifr_name);
4912 		rtnl_lock();
4913 		ret = dev_ifsioc(net, &ifr, cmd);
4914 		rtnl_unlock();
4915 		return ret;
4916 
4917 	case SIOCGIFMEM:
4918 		/* Get the per device memory space. We can add this but
4919 		 * currently do not support it */
4920 	case SIOCSIFMEM:
4921 		/* Set the per device memory buffer space.
4922 		 * Not applicable in our case */
4923 	case SIOCSIFLINK:
4924 		return -EINVAL;
4925 
4926 	/*
4927 	 *	Unknown or private ioctl.
4928 	 */
4929 	default:
4930 		if (cmd == SIOCWANDEV ||
4931 		    (cmd >= SIOCDEVPRIVATE &&
4932 		     cmd <= SIOCDEVPRIVATE + 15)) {
4933 			dev_load(net, ifr.ifr_name);
4934 			rtnl_lock();
4935 			ret = dev_ifsioc(net, &ifr, cmd);
4936 			rtnl_unlock();
4937 			if (!ret && copy_to_user(arg, &ifr,
4938 						 sizeof(struct ifreq)))
4939 				ret = -EFAULT;
4940 			return ret;
4941 		}
4942 		/* Take care of Wireless Extensions */
4943 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4944 			return wext_handle_ioctl(net, &ifr, cmd, arg);
4945 		return -EINVAL;
4946 	}
4947 }
4948 
4949 
4950 /**
4951  *	dev_new_index	-	allocate an ifindex
4952  *	@net: the applicable net namespace
4953  *
4954  *	Returns a suitable unique value for a new device interface
4955  *	number.  The caller must hold the rtnl semaphore or the
4956  *	dev_base_lock to be sure it remains unique.
4957  */
4958 static int dev_new_index(struct net *net)
4959 {
4960 	static int ifindex;
4961 	for (;;) {
4962 		if (++ifindex <= 0)
4963 			ifindex = 1;
4964 		if (!__dev_get_by_index(net, ifindex))
4965 			return ifindex;
4966 	}
4967 }
4968 
4969 /* Delayed registration/unregisteration */
4970 static LIST_HEAD(net_todo_list);
4971 
4972 static void net_set_todo(struct net_device *dev)
4973 {
4974 	list_add_tail(&dev->todo_list, &net_todo_list);
4975 }
4976 
4977 static void rollback_registered_many(struct list_head *head)
4978 {
4979 	struct net_device *dev, *tmp;
4980 
4981 	BUG_ON(dev_boot_phase);
4982 	ASSERT_RTNL();
4983 
4984 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4985 		/* Some devices call without registering
4986 		 * for initialization unwind. Remove those
4987 		 * devices and proceed with the remaining.
4988 		 */
4989 		if (dev->reg_state == NETREG_UNINITIALIZED) {
4990 			pr_debug("unregister_netdevice: device %s/%p never "
4991 				 "was registered\n", dev->name, dev);
4992 
4993 			WARN_ON(1);
4994 			list_del(&dev->unreg_list);
4995 			continue;
4996 		}
4997 
4998 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4999 	}
5000 
5001 	/* If device is running, close it first. */
5002 	dev_close_many(head);
5003 
5004 	list_for_each_entry(dev, head, unreg_list) {
5005 		/* And unlink it from device chain. */
5006 		unlist_netdevice(dev);
5007 
5008 		dev->reg_state = NETREG_UNREGISTERING;
5009 	}
5010 
5011 	synchronize_net();
5012 
5013 	list_for_each_entry(dev, head, unreg_list) {
5014 		/* Shutdown queueing discipline. */
5015 		dev_shutdown(dev);
5016 
5017 
5018 		/* Notify protocols, that we are about to destroy
5019 		   this device. They should clean all the things.
5020 		*/
5021 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5022 
5023 		if (!dev->rtnl_link_ops ||
5024 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5025 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5026 
5027 		/*
5028 		 *	Flush the unicast and multicast chains
5029 		 */
5030 		dev_uc_flush(dev);
5031 		dev_mc_flush(dev);
5032 
5033 		if (dev->netdev_ops->ndo_uninit)
5034 			dev->netdev_ops->ndo_uninit(dev);
5035 
5036 		/* Notifier chain MUST detach us from master device. */
5037 		WARN_ON(dev->master);
5038 
5039 		/* Remove entries from kobject tree */
5040 		netdev_unregister_kobject(dev);
5041 	}
5042 
5043 	/* Process any work delayed until the end of the batch */
5044 	dev = list_first_entry(head, struct net_device, unreg_list);
5045 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5046 
5047 	rcu_barrier();
5048 
5049 	list_for_each_entry(dev, head, unreg_list)
5050 		dev_put(dev);
5051 }
5052 
5053 static void rollback_registered(struct net_device *dev)
5054 {
5055 	LIST_HEAD(single);
5056 
5057 	list_add(&dev->unreg_list, &single);
5058 	rollback_registered_many(&single);
5059 }
5060 
5061 unsigned long netdev_fix_features(unsigned long features, const char *name)
5062 {
5063 	/* Fix illegal SG+CSUM combinations. */
5064 	if ((features & NETIF_F_SG) &&
5065 	    !(features & NETIF_F_ALL_CSUM)) {
5066 		if (name)
5067 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5068 			       "checksum feature.\n", name);
5069 		features &= ~NETIF_F_SG;
5070 	}
5071 
5072 	/* TSO requires that SG is present as well. */
5073 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5074 		if (name)
5075 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5076 			       "SG feature.\n", name);
5077 		features &= ~NETIF_F_TSO;
5078 	}
5079 
5080 	if (features & NETIF_F_UFO) {
5081 		/* maybe split UFO into V4 and V6? */
5082 		if (!((features & NETIF_F_GEN_CSUM) ||
5083 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5084 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5085 			if (name)
5086 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5087 				       "since no checksum offload features.\n",
5088 				       name);
5089 			features &= ~NETIF_F_UFO;
5090 		}
5091 
5092 		if (!(features & NETIF_F_SG)) {
5093 			if (name)
5094 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5095 				       "since no NETIF_F_SG feature.\n", name);
5096 			features &= ~NETIF_F_UFO;
5097 		}
5098 	}
5099 
5100 	return features;
5101 }
5102 EXPORT_SYMBOL(netdev_fix_features);
5103 
5104 /**
5105  *	netif_stacked_transfer_operstate -	transfer operstate
5106  *	@rootdev: the root or lower level device to transfer state from
5107  *	@dev: the device to transfer operstate to
5108  *
5109  *	Transfer operational state from root to device. This is normally
5110  *	called when a stacking relationship exists between the root
5111  *	device and the device(a leaf device).
5112  */
5113 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5114 					struct net_device *dev)
5115 {
5116 	if (rootdev->operstate == IF_OPER_DORMANT)
5117 		netif_dormant_on(dev);
5118 	else
5119 		netif_dormant_off(dev);
5120 
5121 	if (netif_carrier_ok(rootdev)) {
5122 		if (!netif_carrier_ok(dev))
5123 			netif_carrier_on(dev);
5124 	} else {
5125 		if (netif_carrier_ok(dev))
5126 			netif_carrier_off(dev);
5127 	}
5128 }
5129 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5130 
5131 #ifdef CONFIG_RPS
5132 static int netif_alloc_rx_queues(struct net_device *dev)
5133 {
5134 	unsigned int i, count = dev->num_rx_queues;
5135 	struct netdev_rx_queue *rx;
5136 
5137 	BUG_ON(count < 1);
5138 
5139 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5140 	if (!rx) {
5141 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5142 		return -ENOMEM;
5143 	}
5144 	dev->_rx = rx;
5145 
5146 	for (i = 0; i < count; i++)
5147 		rx[i].dev = dev;
5148 	return 0;
5149 }
5150 #endif
5151 
5152 static void netdev_init_one_queue(struct net_device *dev,
5153 				  struct netdev_queue *queue, void *_unused)
5154 {
5155 	/* Initialize queue lock */
5156 	spin_lock_init(&queue->_xmit_lock);
5157 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5158 	queue->xmit_lock_owner = -1;
5159 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5160 	queue->dev = dev;
5161 }
5162 
5163 static int netif_alloc_netdev_queues(struct net_device *dev)
5164 {
5165 	unsigned int count = dev->num_tx_queues;
5166 	struct netdev_queue *tx;
5167 
5168 	BUG_ON(count < 1);
5169 
5170 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5171 	if (!tx) {
5172 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5173 		       count);
5174 		return -ENOMEM;
5175 	}
5176 	dev->_tx = tx;
5177 
5178 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5179 	spin_lock_init(&dev->tx_global_lock);
5180 
5181 	return 0;
5182 }
5183 
5184 /**
5185  *	register_netdevice	- register a network device
5186  *	@dev: device to register
5187  *
5188  *	Take a completed network device structure and add it to the kernel
5189  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5190  *	chain. 0 is returned on success. A negative errno code is returned
5191  *	on a failure to set up the device, or if the name is a duplicate.
5192  *
5193  *	Callers must hold the rtnl semaphore. You may want
5194  *	register_netdev() instead of this.
5195  *
5196  *	BUGS:
5197  *	The locking appears insufficient to guarantee two parallel registers
5198  *	will not get the same name.
5199  */
5200 
5201 int register_netdevice(struct net_device *dev)
5202 {
5203 	int ret;
5204 	struct net *net = dev_net(dev);
5205 
5206 	BUG_ON(dev_boot_phase);
5207 	ASSERT_RTNL();
5208 
5209 	might_sleep();
5210 
5211 	/* When net_device's are persistent, this will be fatal. */
5212 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5213 	BUG_ON(!net);
5214 
5215 	spin_lock_init(&dev->addr_list_lock);
5216 	netdev_set_addr_lockdep_class(dev);
5217 
5218 	dev->iflink = -1;
5219 
5220 	/* Init, if this function is available */
5221 	if (dev->netdev_ops->ndo_init) {
5222 		ret = dev->netdev_ops->ndo_init(dev);
5223 		if (ret) {
5224 			if (ret > 0)
5225 				ret = -EIO;
5226 			goto out;
5227 		}
5228 	}
5229 
5230 	ret = dev_get_valid_name(dev, dev->name, 0);
5231 	if (ret)
5232 		goto err_uninit;
5233 
5234 	dev->ifindex = dev_new_index(net);
5235 	if (dev->iflink == -1)
5236 		dev->iflink = dev->ifindex;
5237 
5238 	/* Fix illegal checksum combinations */
5239 	if ((dev->features & NETIF_F_HW_CSUM) &&
5240 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5241 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5242 		       dev->name);
5243 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5244 	}
5245 
5246 	if ((dev->features & NETIF_F_NO_CSUM) &&
5247 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5248 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5249 		       dev->name);
5250 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5251 	}
5252 
5253 	dev->features = netdev_fix_features(dev->features, dev->name);
5254 
5255 	/* Enable software GSO if SG is supported. */
5256 	if (dev->features & NETIF_F_SG)
5257 		dev->features |= NETIF_F_GSO;
5258 
5259 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5260 	 * vlan_dev_init() will do the dev->features check, so these features
5261 	 * are enabled only if supported by underlying device.
5262 	 */
5263 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5264 
5265 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5266 	ret = notifier_to_errno(ret);
5267 	if (ret)
5268 		goto err_uninit;
5269 
5270 	ret = netdev_register_kobject(dev);
5271 	if (ret)
5272 		goto err_uninit;
5273 	dev->reg_state = NETREG_REGISTERED;
5274 
5275 	/*
5276 	 *	Default initial state at registry is that the
5277 	 *	device is present.
5278 	 */
5279 
5280 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5281 
5282 	dev_init_scheduler(dev);
5283 	dev_hold(dev);
5284 	list_netdevice(dev);
5285 
5286 	/* Notify protocols, that a new device appeared. */
5287 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5288 	ret = notifier_to_errno(ret);
5289 	if (ret) {
5290 		rollback_registered(dev);
5291 		dev->reg_state = NETREG_UNREGISTERED;
5292 	}
5293 	/*
5294 	 *	Prevent userspace races by waiting until the network
5295 	 *	device is fully setup before sending notifications.
5296 	 */
5297 	if (!dev->rtnl_link_ops ||
5298 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5299 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5300 
5301 out:
5302 	return ret;
5303 
5304 err_uninit:
5305 	if (dev->netdev_ops->ndo_uninit)
5306 		dev->netdev_ops->ndo_uninit(dev);
5307 	goto out;
5308 }
5309 EXPORT_SYMBOL(register_netdevice);
5310 
5311 /**
5312  *	init_dummy_netdev	- init a dummy network device for NAPI
5313  *	@dev: device to init
5314  *
5315  *	This takes a network device structure and initialize the minimum
5316  *	amount of fields so it can be used to schedule NAPI polls without
5317  *	registering a full blown interface. This is to be used by drivers
5318  *	that need to tie several hardware interfaces to a single NAPI
5319  *	poll scheduler due to HW limitations.
5320  */
5321 int init_dummy_netdev(struct net_device *dev)
5322 {
5323 	/* Clear everything. Note we don't initialize spinlocks
5324 	 * are they aren't supposed to be taken by any of the
5325 	 * NAPI code and this dummy netdev is supposed to be
5326 	 * only ever used for NAPI polls
5327 	 */
5328 	memset(dev, 0, sizeof(struct net_device));
5329 
5330 	/* make sure we BUG if trying to hit standard
5331 	 * register/unregister code path
5332 	 */
5333 	dev->reg_state = NETREG_DUMMY;
5334 
5335 	/* NAPI wants this */
5336 	INIT_LIST_HEAD(&dev->napi_list);
5337 
5338 	/* a dummy interface is started by default */
5339 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5340 	set_bit(__LINK_STATE_START, &dev->state);
5341 
5342 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5343 	 * because users of this 'device' dont need to change
5344 	 * its refcount.
5345 	 */
5346 
5347 	return 0;
5348 }
5349 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5350 
5351 
5352 /**
5353  *	register_netdev	- register a network device
5354  *	@dev: device to register
5355  *
5356  *	Take a completed network device structure and add it to the kernel
5357  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5358  *	chain. 0 is returned on success. A negative errno code is returned
5359  *	on a failure to set up the device, or if the name is a duplicate.
5360  *
5361  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5362  *	and expands the device name if you passed a format string to
5363  *	alloc_netdev.
5364  */
5365 int register_netdev(struct net_device *dev)
5366 {
5367 	int err;
5368 
5369 	rtnl_lock();
5370 
5371 	/*
5372 	 * If the name is a format string the caller wants us to do a
5373 	 * name allocation.
5374 	 */
5375 	if (strchr(dev->name, '%')) {
5376 		err = dev_alloc_name(dev, dev->name);
5377 		if (err < 0)
5378 			goto out;
5379 	}
5380 
5381 	err = register_netdevice(dev);
5382 out:
5383 	rtnl_unlock();
5384 	return err;
5385 }
5386 EXPORT_SYMBOL(register_netdev);
5387 
5388 int netdev_refcnt_read(const struct net_device *dev)
5389 {
5390 	int i, refcnt = 0;
5391 
5392 	for_each_possible_cpu(i)
5393 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5394 	return refcnt;
5395 }
5396 EXPORT_SYMBOL(netdev_refcnt_read);
5397 
5398 /*
5399  * netdev_wait_allrefs - wait until all references are gone.
5400  *
5401  * This is called when unregistering network devices.
5402  *
5403  * Any protocol or device that holds a reference should register
5404  * for netdevice notification, and cleanup and put back the
5405  * reference if they receive an UNREGISTER event.
5406  * We can get stuck here if buggy protocols don't correctly
5407  * call dev_put.
5408  */
5409 static void netdev_wait_allrefs(struct net_device *dev)
5410 {
5411 	unsigned long rebroadcast_time, warning_time;
5412 	int refcnt;
5413 
5414 	linkwatch_forget_dev(dev);
5415 
5416 	rebroadcast_time = warning_time = jiffies;
5417 	refcnt = netdev_refcnt_read(dev);
5418 
5419 	while (refcnt != 0) {
5420 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5421 			rtnl_lock();
5422 
5423 			/* Rebroadcast unregister notification */
5424 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5425 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5426 			 * should have already handle it the first time */
5427 
5428 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5429 				     &dev->state)) {
5430 				/* We must not have linkwatch events
5431 				 * pending on unregister. If this
5432 				 * happens, we simply run the queue
5433 				 * unscheduled, resulting in a noop
5434 				 * for this device.
5435 				 */
5436 				linkwatch_run_queue();
5437 			}
5438 
5439 			__rtnl_unlock();
5440 
5441 			rebroadcast_time = jiffies;
5442 		}
5443 
5444 		msleep(250);
5445 
5446 		refcnt = netdev_refcnt_read(dev);
5447 
5448 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5449 			printk(KERN_EMERG "unregister_netdevice: "
5450 			       "waiting for %s to become free. Usage "
5451 			       "count = %d\n",
5452 			       dev->name, refcnt);
5453 			warning_time = jiffies;
5454 		}
5455 	}
5456 }
5457 
5458 /* The sequence is:
5459  *
5460  *	rtnl_lock();
5461  *	...
5462  *	register_netdevice(x1);
5463  *	register_netdevice(x2);
5464  *	...
5465  *	unregister_netdevice(y1);
5466  *	unregister_netdevice(y2);
5467  *      ...
5468  *	rtnl_unlock();
5469  *	free_netdev(y1);
5470  *	free_netdev(y2);
5471  *
5472  * We are invoked by rtnl_unlock().
5473  * This allows us to deal with problems:
5474  * 1) We can delete sysfs objects which invoke hotplug
5475  *    without deadlocking with linkwatch via keventd.
5476  * 2) Since we run with the RTNL semaphore not held, we can sleep
5477  *    safely in order to wait for the netdev refcnt to drop to zero.
5478  *
5479  * We must not return until all unregister events added during
5480  * the interval the lock was held have been completed.
5481  */
5482 void netdev_run_todo(void)
5483 {
5484 	struct list_head list;
5485 
5486 	/* Snapshot list, allow later requests */
5487 	list_replace_init(&net_todo_list, &list);
5488 
5489 	__rtnl_unlock();
5490 
5491 	while (!list_empty(&list)) {
5492 		struct net_device *dev
5493 			= list_first_entry(&list, struct net_device, todo_list);
5494 		list_del(&dev->todo_list);
5495 
5496 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5497 			printk(KERN_ERR "network todo '%s' but state %d\n",
5498 			       dev->name, dev->reg_state);
5499 			dump_stack();
5500 			continue;
5501 		}
5502 
5503 		dev->reg_state = NETREG_UNREGISTERED;
5504 
5505 		on_each_cpu(flush_backlog, dev, 1);
5506 
5507 		netdev_wait_allrefs(dev);
5508 
5509 		/* paranoia */
5510 		BUG_ON(netdev_refcnt_read(dev));
5511 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5512 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5513 		WARN_ON(dev->dn_ptr);
5514 
5515 		if (dev->destructor)
5516 			dev->destructor(dev);
5517 
5518 		/* Free network device */
5519 		kobject_put(&dev->dev.kobj);
5520 	}
5521 }
5522 
5523 /**
5524  *	dev_txq_stats_fold - fold tx_queues stats
5525  *	@dev: device to get statistics from
5526  *	@stats: struct rtnl_link_stats64 to hold results
5527  */
5528 void dev_txq_stats_fold(const struct net_device *dev,
5529 			struct rtnl_link_stats64 *stats)
5530 {
5531 	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5532 	unsigned int i;
5533 	struct netdev_queue *txq;
5534 
5535 	for (i = 0; i < dev->num_tx_queues; i++) {
5536 		txq = netdev_get_tx_queue(dev, i);
5537 		spin_lock_bh(&txq->_xmit_lock);
5538 		tx_bytes   += txq->tx_bytes;
5539 		tx_packets += txq->tx_packets;
5540 		tx_dropped += txq->tx_dropped;
5541 		spin_unlock_bh(&txq->_xmit_lock);
5542 	}
5543 	if (tx_bytes || tx_packets || tx_dropped) {
5544 		stats->tx_bytes   = tx_bytes;
5545 		stats->tx_packets = tx_packets;
5546 		stats->tx_dropped = tx_dropped;
5547 	}
5548 }
5549 EXPORT_SYMBOL(dev_txq_stats_fold);
5550 
5551 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5552  * fields in the same order, with only the type differing.
5553  */
5554 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5555 				    const struct net_device_stats *netdev_stats)
5556 {
5557 #if BITS_PER_LONG == 64
5558         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5559         memcpy(stats64, netdev_stats, sizeof(*stats64));
5560 #else
5561 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5562 	const unsigned long *src = (const unsigned long *)netdev_stats;
5563 	u64 *dst = (u64 *)stats64;
5564 
5565 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5566 		     sizeof(*stats64) / sizeof(u64));
5567 	for (i = 0; i < n; i++)
5568 		dst[i] = src[i];
5569 #endif
5570 }
5571 
5572 /**
5573  *	dev_get_stats	- get network device statistics
5574  *	@dev: device to get statistics from
5575  *	@storage: place to store stats
5576  *
5577  *	Get network statistics from device. Return @storage.
5578  *	The device driver may provide its own method by setting
5579  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5580  *	otherwise the internal statistics structure is used.
5581  */
5582 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5583 					struct rtnl_link_stats64 *storage)
5584 {
5585 	const struct net_device_ops *ops = dev->netdev_ops;
5586 
5587 	if (ops->ndo_get_stats64) {
5588 		memset(storage, 0, sizeof(*storage));
5589 		ops->ndo_get_stats64(dev, storage);
5590 	} else if (ops->ndo_get_stats) {
5591 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5592 	} else {
5593 		netdev_stats_to_stats64(storage, &dev->stats);
5594 		dev_txq_stats_fold(dev, storage);
5595 	}
5596 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5597 	return storage;
5598 }
5599 EXPORT_SYMBOL(dev_get_stats);
5600 
5601 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5602 {
5603 	struct netdev_queue *queue = dev_ingress_queue(dev);
5604 
5605 #ifdef CONFIG_NET_CLS_ACT
5606 	if (queue)
5607 		return queue;
5608 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5609 	if (!queue)
5610 		return NULL;
5611 	netdev_init_one_queue(dev, queue, NULL);
5612 	queue->qdisc = &noop_qdisc;
5613 	queue->qdisc_sleeping = &noop_qdisc;
5614 	rcu_assign_pointer(dev->ingress_queue, queue);
5615 #endif
5616 	return queue;
5617 }
5618 
5619 /**
5620  *	alloc_netdev_mq - allocate network device
5621  *	@sizeof_priv:	size of private data to allocate space for
5622  *	@name:		device name format string
5623  *	@setup:		callback to initialize device
5624  *	@queue_count:	the number of subqueues to allocate
5625  *
5626  *	Allocates a struct net_device with private data area for driver use
5627  *	and performs basic initialization.  Also allocates subquue structs
5628  *	for each queue on the device at the end of the netdevice.
5629  */
5630 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5631 		void (*setup)(struct net_device *), unsigned int queue_count)
5632 {
5633 	struct net_device *dev;
5634 	size_t alloc_size;
5635 	struct net_device *p;
5636 
5637 	BUG_ON(strlen(name) >= sizeof(dev->name));
5638 
5639 	if (queue_count < 1) {
5640 		pr_err("alloc_netdev: Unable to allocate device "
5641 		       "with zero queues.\n");
5642 		return NULL;
5643 	}
5644 
5645 	alloc_size = sizeof(struct net_device);
5646 	if (sizeof_priv) {
5647 		/* ensure 32-byte alignment of private area */
5648 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5649 		alloc_size += sizeof_priv;
5650 	}
5651 	/* ensure 32-byte alignment of whole construct */
5652 	alloc_size += NETDEV_ALIGN - 1;
5653 
5654 	p = kzalloc(alloc_size, GFP_KERNEL);
5655 	if (!p) {
5656 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5657 		return NULL;
5658 	}
5659 
5660 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5661 	dev->padded = (char *)dev - (char *)p;
5662 
5663 	dev->pcpu_refcnt = alloc_percpu(int);
5664 	if (!dev->pcpu_refcnt)
5665 		goto free_p;
5666 
5667 	if (dev_addr_init(dev))
5668 		goto free_pcpu;
5669 
5670 	dev_mc_init(dev);
5671 	dev_uc_init(dev);
5672 
5673 	dev_net_set(dev, &init_net);
5674 
5675 	dev->num_tx_queues = queue_count;
5676 	dev->real_num_tx_queues = queue_count;
5677 	if (netif_alloc_netdev_queues(dev))
5678 		goto free_pcpu;
5679 
5680 #ifdef CONFIG_RPS
5681 	dev->num_rx_queues = queue_count;
5682 	dev->real_num_rx_queues = queue_count;
5683 	if (netif_alloc_rx_queues(dev))
5684 		goto free_pcpu;
5685 #endif
5686 
5687 	dev->gso_max_size = GSO_MAX_SIZE;
5688 
5689 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5690 	dev->ethtool_ntuple_list.count = 0;
5691 	INIT_LIST_HEAD(&dev->napi_list);
5692 	INIT_LIST_HEAD(&dev->unreg_list);
5693 	INIT_LIST_HEAD(&dev->link_watch_list);
5694 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5695 	setup(dev);
5696 	strcpy(dev->name, name);
5697 	return dev;
5698 
5699 free_pcpu:
5700 	free_percpu(dev->pcpu_refcnt);
5701 	kfree(dev->_tx);
5702 #ifdef CONFIG_RPS
5703 	kfree(dev->_rx);
5704 #endif
5705 
5706 free_p:
5707 	kfree(p);
5708 	return NULL;
5709 }
5710 EXPORT_SYMBOL(alloc_netdev_mq);
5711 
5712 /**
5713  *	free_netdev - free network device
5714  *	@dev: device
5715  *
5716  *	This function does the last stage of destroying an allocated device
5717  * 	interface. The reference to the device object is released.
5718  *	If this is the last reference then it will be freed.
5719  */
5720 void free_netdev(struct net_device *dev)
5721 {
5722 	struct napi_struct *p, *n;
5723 
5724 	release_net(dev_net(dev));
5725 
5726 	kfree(dev->_tx);
5727 #ifdef CONFIG_RPS
5728 	kfree(dev->_rx);
5729 #endif
5730 
5731 	kfree(rcu_dereference_raw(dev->ingress_queue));
5732 
5733 	/* Flush device addresses */
5734 	dev_addr_flush(dev);
5735 
5736 	/* Clear ethtool n-tuple list */
5737 	ethtool_ntuple_flush(dev);
5738 
5739 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5740 		netif_napi_del(p);
5741 
5742 	free_percpu(dev->pcpu_refcnt);
5743 	dev->pcpu_refcnt = NULL;
5744 
5745 	/*  Compatibility with error handling in drivers */
5746 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5747 		kfree((char *)dev - dev->padded);
5748 		return;
5749 	}
5750 
5751 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5752 	dev->reg_state = NETREG_RELEASED;
5753 
5754 	/* will free via device release */
5755 	put_device(&dev->dev);
5756 }
5757 EXPORT_SYMBOL(free_netdev);
5758 
5759 /**
5760  *	synchronize_net -  Synchronize with packet receive processing
5761  *
5762  *	Wait for packets currently being received to be done.
5763  *	Does not block later packets from starting.
5764  */
5765 void synchronize_net(void)
5766 {
5767 	might_sleep();
5768 	synchronize_rcu();
5769 }
5770 EXPORT_SYMBOL(synchronize_net);
5771 
5772 /**
5773  *	unregister_netdevice_queue - remove device from the kernel
5774  *	@dev: device
5775  *	@head: list
5776  *
5777  *	This function shuts down a device interface and removes it
5778  *	from the kernel tables.
5779  *	If head not NULL, device is queued to be unregistered later.
5780  *
5781  *	Callers must hold the rtnl semaphore.  You may want
5782  *	unregister_netdev() instead of this.
5783  */
5784 
5785 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5786 {
5787 	ASSERT_RTNL();
5788 
5789 	if (head) {
5790 		list_move_tail(&dev->unreg_list, head);
5791 	} else {
5792 		rollback_registered(dev);
5793 		/* Finish processing unregister after unlock */
5794 		net_set_todo(dev);
5795 	}
5796 }
5797 EXPORT_SYMBOL(unregister_netdevice_queue);
5798 
5799 /**
5800  *	unregister_netdevice_many - unregister many devices
5801  *	@head: list of devices
5802  */
5803 void unregister_netdevice_many(struct list_head *head)
5804 {
5805 	struct net_device *dev;
5806 
5807 	if (!list_empty(head)) {
5808 		rollback_registered_many(head);
5809 		list_for_each_entry(dev, head, unreg_list)
5810 			net_set_todo(dev);
5811 	}
5812 }
5813 EXPORT_SYMBOL(unregister_netdevice_many);
5814 
5815 /**
5816  *	unregister_netdev - remove device from the kernel
5817  *	@dev: device
5818  *
5819  *	This function shuts down a device interface and removes it
5820  *	from the kernel tables.
5821  *
5822  *	This is just a wrapper for unregister_netdevice that takes
5823  *	the rtnl semaphore.  In general you want to use this and not
5824  *	unregister_netdevice.
5825  */
5826 void unregister_netdev(struct net_device *dev)
5827 {
5828 	rtnl_lock();
5829 	unregister_netdevice(dev);
5830 	rtnl_unlock();
5831 }
5832 EXPORT_SYMBOL(unregister_netdev);
5833 
5834 /**
5835  *	dev_change_net_namespace - move device to different nethost namespace
5836  *	@dev: device
5837  *	@net: network namespace
5838  *	@pat: If not NULL name pattern to try if the current device name
5839  *	      is already taken in the destination network namespace.
5840  *
5841  *	This function shuts down a device interface and moves it
5842  *	to a new network namespace. On success 0 is returned, on
5843  *	a failure a netagive errno code is returned.
5844  *
5845  *	Callers must hold the rtnl semaphore.
5846  */
5847 
5848 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5849 {
5850 	int err;
5851 
5852 	ASSERT_RTNL();
5853 
5854 	/* Don't allow namespace local devices to be moved. */
5855 	err = -EINVAL;
5856 	if (dev->features & NETIF_F_NETNS_LOCAL)
5857 		goto out;
5858 
5859 	/* Ensure the device has been registrered */
5860 	err = -EINVAL;
5861 	if (dev->reg_state != NETREG_REGISTERED)
5862 		goto out;
5863 
5864 	/* Get out if there is nothing todo */
5865 	err = 0;
5866 	if (net_eq(dev_net(dev), net))
5867 		goto out;
5868 
5869 	/* Pick the destination device name, and ensure
5870 	 * we can use it in the destination network namespace.
5871 	 */
5872 	err = -EEXIST;
5873 	if (__dev_get_by_name(net, dev->name)) {
5874 		/* We get here if we can't use the current device name */
5875 		if (!pat)
5876 			goto out;
5877 		if (dev_get_valid_name(dev, pat, 1))
5878 			goto out;
5879 	}
5880 
5881 	/*
5882 	 * And now a mini version of register_netdevice unregister_netdevice.
5883 	 */
5884 
5885 	/* If device is running close it first. */
5886 	dev_close(dev);
5887 
5888 	/* And unlink it from device chain */
5889 	err = -ENODEV;
5890 	unlist_netdevice(dev);
5891 
5892 	synchronize_net();
5893 
5894 	/* Shutdown queueing discipline. */
5895 	dev_shutdown(dev);
5896 
5897 	/* Notify protocols, that we are about to destroy
5898 	   this device. They should clean all the things.
5899 
5900 	   Note that dev->reg_state stays at NETREG_REGISTERED.
5901 	   This is wanted because this way 8021q and macvlan know
5902 	   the device is just moving and can keep their slaves up.
5903 	*/
5904 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5905 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5906 
5907 	/*
5908 	 *	Flush the unicast and multicast chains
5909 	 */
5910 	dev_uc_flush(dev);
5911 	dev_mc_flush(dev);
5912 
5913 	/* Actually switch the network namespace */
5914 	dev_net_set(dev, net);
5915 
5916 	/* If there is an ifindex conflict assign a new one */
5917 	if (__dev_get_by_index(net, dev->ifindex)) {
5918 		int iflink = (dev->iflink == dev->ifindex);
5919 		dev->ifindex = dev_new_index(net);
5920 		if (iflink)
5921 			dev->iflink = dev->ifindex;
5922 	}
5923 
5924 	/* Fixup kobjects */
5925 	err = device_rename(&dev->dev, dev->name);
5926 	WARN_ON(err);
5927 
5928 	/* Add the device back in the hashes */
5929 	list_netdevice(dev);
5930 
5931 	/* Notify protocols, that a new device appeared. */
5932 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5933 
5934 	/*
5935 	 *	Prevent userspace races by waiting until the network
5936 	 *	device is fully setup before sending notifications.
5937 	 */
5938 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5939 
5940 	synchronize_net();
5941 	err = 0;
5942 out:
5943 	return err;
5944 }
5945 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5946 
5947 static int dev_cpu_callback(struct notifier_block *nfb,
5948 			    unsigned long action,
5949 			    void *ocpu)
5950 {
5951 	struct sk_buff **list_skb;
5952 	struct sk_buff *skb;
5953 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5954 	struct softnet_data *sd, *oldsd;
5955 
5956 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5957 		return NOTIFY_OK;
5958 
5959 	local_irq_disable();
5960 	cpu = smp_processor_id();
5961 	sd = &per_cpu(softnet_data, cpu);
5962 	oldsd = &per_cpu(softnet_data, oldcpu);
5963 
5964 	/* Find end of our completion_queue. */
5965 	list_skb = &sd->completion_queue;
5966 	while (*list_skb)
5967 		list_skb = &(*list_skb)->next;
5968 	/* Append completion queue from offline CPU. */
5969 	*list_skb = oldsd->completion_queue;
5970 	oldsd->completion_queue = NULL;
5971 
5972 	/* Append output queue from offline CPU. */
5973 	if (oldsd->output_queue) {
5974 		*sd->output_queue_tailp = oldsd->output_queue;
5975 		sd->output_queue_tailp = oldsd->output_queue_tailp;
5976 		oldsd->output_queue = NULL;
5977 		oldsd->output_queue_tailp = &oldsd->output_queue;
5978 	}
5979 
5980 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5981 	local_irq_enable();
5982 
5983 	/* Process offline CPU's input_pkt_queue */
5984 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5985 		netif_rx(skb);
5986 		input_queue_head_incr(oldsd);
5987 	}
5988 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5989 		netif_rx(skb);
5990 		input_queue_head_incr(oldsd);
5991 	}
5992 
5993 	return NOTIFY_OK;
5994 }
5995 
5996 
5997 /**
5998  *	netdev_increment_features - increment feature set by one
5999  *	@all: current feature set
6000  *	@one: new feature set
6001  *	@mask: mask feature set
6002  *
6003  *	Computes a new feature set after adding a device with feature set
6004  *	@one to the master device with current feature set @all.  Will not
6005  *	enable anything that is off in @mask. Returns the new feature set.
6006  */
6007 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6008 					unsigned long mask)
6009 {
6010 	/* If device needs checksumming, downgrade to it. */
6011 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6012 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6013 	else if (mask & NETIF_F_ALL_CSUM) {
6014 		/* If one device supports v4/v6 checksumming, set for all. */
6015 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6016 		    !(all & NETIF_F_GEN_CSUM)) {
6017 			all &= ~NETIF_F_ALL_CSUM;
6018 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6019 		}
6020 
6021 		/* If one device supports hw checksumming, set for all. */
6022 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6023 			all &= ~NETIF_F_ALL_CSUM;
6024 			all |= NETIF_F_HW_CSUM;
6025 		}
6026 	}
6027 
6028 	one |= NETIF_F_ALL_CSUM;
6029 
6030 	one |= all & NETIF_F_ONE_FOR_ALL;
6031 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6032 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6033 
6034 	return all;
6035 }
6036 EXPORT_SYMBOL(netdev_increment_features);
6037 
6038 static struct hlist_head *netdev_create_hash(void)
6039 {
6040 	int i;
6041 	struct hlist_head *hash;
6042 
6043 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6044 	if (hash != NULL)
6045 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6046 			INIT_HLIST_HEAD(&hash[i]);
6047 
6048 	return hash;
6049 }
6050 
6051 /* Initialize per network namespace state */
6052 static int __net_init netdev_init(struct net *net)
6053 {
6054 	INIT_LIST_HEAD(&net->dev_base_head);
6055 
6056 	net->dev_name_head = netdev_create_hash();
6057 	if (net->dev_name_head == NULL)
6058 		goto err_name;
6059 
6060 	net->dev_index_head = netdev_create_hash();
6061 	if (net->dev_index_head == NULL)
6062 		goto err_idx;
6063 
6064 	return 0;
6065 
6066 err_idx:
6067 	kfree(net->dev_name_head);
6068 err_name:
6069 	return -ENOMEM;
6070 }
6071 
6072 /**
6073  *	netdev_drivername - network driver for the device
6074  *	@dev: network device
6075  *	@buffer: buffer for resulting name
6076  *	@len: size of buffer
6077  *
6078  *	Determine network driver for device.
6079  */
6080 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6081 {
6082 	const struct device_driver *driver;
6083 	const struct device *parent;
6084 
6085 	if (len <= 0 || !buffer)
6086 		return buffer;
6087 	buffer[0] = 0;
6088 
6089 	parent = dev->dev.parent;
6090 
6091 	if (!parent)
6092 		return buffer;
6093 
6094 	driver = parent->driver;
6095 	if (driver && driver->name)
6096 		strlcpy(buffer, driver->name, len);
6097 	return buffer;
6098 }
6099 
6100 static int __netdev_printk(const char *level, const struct net_device *dev,
6101 			   struct va_format *vaf)
6102 {
6103 	int r;
6104 
6105 	if (dev && dev->dev.parent)
6106 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6107 			       netdev_name(dev), vaf);
6108 	else if (dev)
6109 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6110 	else
6111 		r = printk("%s(NULL net_device): %pV", level, vaf);
6112 
6113 	return r;
6114 }
6115 
6116 int netdev_printk(const char *level, const struct net_device *dev,
6117 		  const char *format, ...)
6118 {
6119 	struct va_format vaf;
6120 	va_list args;
6121 	int r;
6122 
6123 	va_start(args, format);
6124 
6125 	vaf.fmt = format;
6126 	vaf.va = &args;
6127 
6128 	r = __netdev_printk(level, dev, &vaf);
6129 	va_end(args);
6130 
6131 	return r;
6132 }
6133 EXPORT_SYMBOL(netdev_printk);
6134 
6135 #define define_netdev_printk_level(func, level)			\
6136 int func(const struct net_device *dev, const char *fmt, ...)	\
6137 {								\
6138 	int r;							\
6139 	struct va_format vaf;					\
6140 	va_list args;						\
6141 								\
6142 	va_start(args, fmt);					\
6143 								\
6144 	vaf.fmt = fmt;						\
6145 	vaf.va = &args;						\
6146 								\
6147 	r = __netdev_printk(level, dev, &vaf);			\
6148 	va_end(args);						\
6149 								\
6150 	return r;						\
6151 }								\
6152 EXPORT_SYMBOL(func);
6153 
6154 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6155 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6156 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6157 define_netdev_printk_level(netdev_err, KERN_ERR);
6158 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6159 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6160 define_netdev_printk_level(netdev_info, KERN_INFO);
6161 
6162 static void __net_exit netdev_exit(struct net *net)
6163 {
6164 	kfree(net->dev_name_head);
6165 	kfree(net->dev_index_head);
6166 }
6167 
6168 static struct pernet_operations __net_initdata netdev_net_ops = {
6169 	.init = netdev_init,
6170 	.exit = netdev_exit,
6171 };
6172 
6173 static void __net_exit default_device_exit(struct net *net)
6174 {
6175 	struct net_device *dev, *aux;
6176 	/*
6177 	 * Push all migratable network devices back to the
6178 	 * initial network namespace
6179 	 */
6180 	rtnl_lock();
6181 	for_each_netdev_safe(net, dev, aux) {
6182 		int err;
6183 		char fb_name[IFNAMSIZ];
6184 
6185 		/* Ignore unmoveable devices (i.e. loopback) */
6186 		if (dev->features & NETIF_F_NETNS_LOCAL)
6187 			continue;
6188 
6189 		/* Leave virtual devices for the generic cleanup */
6190 		if (dev->rtnl_link_ops)
6191 			continue;
6192 
6193 		/* Push remaing network devices to init_net */
6194 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6195 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6196 		if (err) {
6197 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6198 				__func__, dev->name, err);
6199 			BUG();
6200 		}
6201 	}
6202 	rtnl_unlock();
6203 }
6204 
6205 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6206 {
6207 	/* At exit all network devices most be removed from a network
6208 	 * namespace.  Do this in the reverse order of registeration.
6209 	 * Do this across as many network namespaces as possible to
6210 	 * improve batching efficiency.
6211 	 */
6212 	struct net_device *dev;
6213 	struct net *net;
6214 	LIST_HEAD(dev_kill_list);
6215 
6216 	rtnl_lock();
6217 	list_for_each_entry(net, net_list, exit_list) {
6218 		for_each_netdev_reverse(net, dev) {
6219 			if (dev->rtnl_link_ops)
6220 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6221 			else
6222 				unregister_netdevice_queue(dev, &dev_kill_list);
6223 		}
6224 	}
6225 	unregister_netdevice_many(&dev_kill_list);
6226 	rtnl_unlock();
6227 }
6228 
6229 static struct pernet_operations __net_initdata default_device_ops = {
6230 	.exit = default_device_exit,
6231 	.exit_batch = default_device_exit_batch,
6232 };
6233 
6234 /*
6235  *	Initialize the DEV module. At boot time this walks the device list and
6236  *	unhooks any devices that fail to initialise (normally hardware not
6237  *	present) and leaves us with a valid list of present and active devices.
6238  *
6239  */
6240 
6241 /*
6242  *       This is called single threaded during boot, so no need
6243  *       to take the rtnl semaphore.
6244  */
6245 static int __init net_dev_init(void)
6246 {
6247 	int i, rc = -ENOMEM;
6248 
6249 	BUG_ON(!dev_boot_phase);
6250 
6251 	if (dev_proc_init())
6252 		goto out;
6253 
6254 	if (netdev_kobject_init())
6255 		goto out;
6256 
6257 	INIT_LIST_HEAD(&ptype_all);
6258 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6259 		INIT_LIST_HEAD(&ptype_base[i]);
6260 
6261 	if (register_pernet_subsys(&netdev_net_ops))
6262 		goto out;
6263 
6264 	/*
6265 	 *	Initialise the packet receive queues.
6266 	 */
6267 
6268 	for_each_possible_cpu(i) {
6269 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6270 
6271 		memset(sd, 0, sizeof(*sd));
6272 		skb_queue_head_init(&sd->input_pkt_queue);
6273 		skb_queue_head_init(&sd->process_queue);
6274 		sd->completion_queue = NULL;
6275 		INIT_LIST_HEAD(&sd->poll_list);
6276 		sd->output_queue = NULL;
6277 		sd->output_queue_tailp = &sd->output_queue;
6278 #ifdef CONFIG_RPS
6279 		sd->csd.func = rps_trigger_softirq;
6280 		sd->csd.info = sd;
6281 		sd->csd.flags = 0;
6282 		sd->cpu = i;
6283 #endif
6284 
6285 		sd->backlog.poll = process_backlog;
6286 		sd->backlog.weight = weight_p;
6287 		sd->backlog.gro_list = NULL;
6288 		sd->backlog.gro_count = 0;
6289 	}
6290 
6291 	dev_boot_phase = 0;
6292 
6293 	/* The loopback device is special if any other network devices
6294 	 * is present in a network namespace the loopback device must
6295 	 * be present. Since we now dynamically allocate and free the
6296 	 * loopback device ensure this invariant is maintained by
6297 	 * keeping the loopback device as the first device on the
6298 	 * list of network devices.  Ensuring the loopback devices
6299 	 * is the first device that appears and the last network device
6300 	 * that disappears.
6301 	 */
6302 	if (register_pernet_device(&loopback_net_ops))
6303 		goto out;
6304 
6305 	if (register_pernet_device(&default_device_ops))
6306 		goto out;
6307 
6308 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6309 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6310 
6311 	hotcpu_notifier(dev_cpu_callback, 0);
6312 	dst_init();
6313 	dev_mcast_init();
6314 	rc = 0;
6315 out:
6316 	return rc;
6317 }
6318 
6319 subsys_initcall(net_dev_init);
6320 
6321 static int __init initialize_hashrnd(void)
6322 {
6323 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6324 	return 0;
6325 }
6326 
6327 late_initcall_sync(initialize_hashrnd);
6328 
6329