xref: /linux-6.15/net/core/dev.c (revision 8d98efa8)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 
134 #include "net-sysfs.h"
135 
136 /* Instead of increasing this, you should create a hash table. */
137 #define MAX_GRO_SKBS 8
138 
139 /* This should be increased if a protocol with a bigger head is added. */
140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
141 
142 /*
143  *	The list of packet types we will receive (as opposed to discard)
144  *	and the routines to invoke.
145  *
146  *	Why 16? Because with 16 the only overlap we get on a hash of the
147  *	low nibble of the protocol value is RARP/SNAP/X.25.
148  *
149  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
150  *             sure which should go first, but I bet it won't make much
151  *             difference if we are running VLANs.  The good news is that
152  *             this protocol won't be in the list unless compiled in, so
153  *             the average user (w/out VLANs) will not be adversely affected.
154  *             --BLG
155  *
156  *		0800	IP
157  *		8100    802.1Q VLAN
158  *		0001	802.3
159  *		0002	AX.25
160  *		0004	802.2
161  *		8035	RARP
162  *		0005	SNAP
163  *		0805	X.25
164  *		0806	ARP
165  *		8137	IPX
166  *		0009	Localtalk
167  *		86DD	IPv6
168  */
169 
170 #define PTYPE_HASH_SIZE	(16)
171 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
172 
173 static DEFINE_SPINLOCK(ptype_lock);
174 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
175 static struct list_head ptype_all __read_mostly;	/* Taps */
176 
177 /*
178  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
179  * semaphore.
180  *
181  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
182  *
183  * Writers must hold the rtnl semaphore while they loop through the
184  * dev_base_head list, and hold dev_base_lock for writing when they do the
185  * actual updates.  This allows pure readers to access the list even
186  * while a writer is preparing to update it.
187  *
188  * To put it another way, dev_base_lock is held for writing only to
189  * protect against pure readers; the rtnl semaphore provides the
190  * protection against other writers.
191  *
192  * See, for example usages, register_netdevice() and
193  * unregister_netdevice(), which must be called with the rtnl
194  * semaphore held.
195  */
196 DEFINE_RWLOCK(dev_base_lock);
197 EXPORT_SYMBOL(dev_base_lock);
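
/*
 * Example (illustrative sketch of the read-side rules above; the function
 * name my_count_devices is made up for this example):
 *
 *	static int my_count_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int count = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			count++;
 *		rcu_read_unlock();
 *		return count;
 *	}
 *
 * A pure reader may equally take read_lock(&dev_base_lock) instead of RCU.
 * Writers hold the RTNL and take dev_base_lock for writing only around the
 * actual list update, as list_netdevice()/unlist_netdevice() below do.
 */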
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209 
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 	spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216 
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 	spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223 
224 /* Device list insertion */
225 static int list_netdevice(struct net_device *dev)
226 {
227 	struct net *net = dev_net(dev);
228 
229 	ASSERT_RTNL();
230 
231 	write_lock_bh(&dev_base_lock);
232 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 	hlist_add_head_rcu(&dev->index_hlist,
235 			   dev_index_hash(net, dev->ifindex));
236 	write_unlock_bh(&dev_base_lock);
237 	return 0;
238 }
239 
240 /* Device list removal
241  * caller must respect a RCU grace period before freeing/reusing dev
242  */
243 static void unlist_netdevice(struct net_device *dev)
244 {
245 	ASSERT_RTNL();
246 
247 	/* Unlink dev from the device chain */
248 	write_lock_bh(&dev_base_lock);
249 	list_del_rcu(&dev->dev_list);
250 	hlist_del_rcu(&dev->name_hlist);
251 	hlist_del_rcu(&dev->index_hlist);
252 	write_unlock_bh(&dev_base_lock);
253 }
254 
255 /*
256  *	Our notifier list
257  */
258 
259 static RAW_NOTIFIER_HEAD(netdev_chain);
260 
261 /*
262  *	Device drivers call our routines to queue packets here. We empty the
263  *	queue in the local softnet handler.
264  */
265 
266 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
267 EXPORT_PER_CPU_SYMBOL(softnet_data);
268 
269 #ifdef CONFIG_LOCKDEP
270 /*
271  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
272  * according to dev->type
273  */
274 static const unsigned short netdev_lock_type[] =
275 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
276 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
277 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
278 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
279 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
280 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
281 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
282 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
283 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
284 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
285 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
286 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
287 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
288 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
289 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
290 	 ARPHRD_VOID, ARPHRD_NONE};
291 
292 static const char *const netdev_lock_name[] =
293 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
294 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
295 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
296 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
297 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
298 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
299 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
300 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
301 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
302 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
303 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
304 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
305 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
306 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
307 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
308 	 "_xmit_VOID", "_xmit_NONE"};
309 
310 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
311 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 
313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
314 {
315 	int i;
316 
317 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
318 		if (netdev_lock_type[i] == dev_type)
319 			return i;
320 	/* the last key is used by default */
321 	return ARRAY_SIZE(netdev_lock_type) - 1;
322 }
323 
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 	int i;
328 
329 	i = netdev_lock_pos(dev_type);
330 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
331 				   netdev_lock_name[i]);
332 }
333 
334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
335 {
336 	int i;
337 
338 	i = netdev_lock_pos(dev->type);
339 	lockdep_set_class_and_name(&dev->addr_list_lock,
340 				   &netdev_addr_lock_key[i],
341 				   netdev_lock_name[i]);
342 }
343 #else
344 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
345 						 unsigned short dev_type)
346 {
347 }
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 }
351 #endif
352 
353 /*******************************************************************************
354 
355 		Protocol management and registration routines
356 
357 *******************************************************************************/
358 
359 /*
360  *	Add a protocol ID to the list. Now that the input handler is
361  *	smarter we can dispense with all the messy stuff that used to be
362  *	here.
363  *
364  *	BEWARE!!! Protocol handlers, mangling input packets,
365  *	MUST BE last in hash buckets and checking protocol handlers
366  *	MUST start from promiscuous ptype_all chain in net_bh.
367  *	It is true now, do not change it.
368  *	Explanation follows: if a protocol handler that mangles the packet
369  *	is first on the list, it is not able to sense that the packet
370  *	is cloned and should be copied-on-write, so it will
371  *	change it and subsequent readers will get a broken packet.
372  *							--ANK (980803)
373  */
374 
375 static inline struct list_head *ptype_head(const struct packet_type *pt)
376 {
377 	if (pt->type == htons(ETH_P_ALL))
378 		return &ptype_all;
379 	else
380 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
381 }
382 
383 /**
384  *	dev_add_pack - add packet handler
385  *	@pt: packet type declaration
386  *
387  *	Add a protocol handler to the networking stack. The passed &packet_type
388  *	is linked into kernel lists and may not be freed until it has been
389  *	removed from the kernel lists.
390  *
391  *	This call does not sleep, therefore it cannot
392  *	guarantee that all CPUs that are in the middle of receiving packets
393  *	will see the new packet type (until the next received packet).
394  */
395 
396 void dev_add_pack(struct packet_type *pt)
397 {
398 	struct list_head *head = ptype_head(pt);
399 
400 	spin_lock(&ptype_lock);
401 	list_add_rcu(&pt->list, head);
402 	spin_unlock(&ptype_lock);
403 }
404 EXPORT_SYMBOL(dev_add_pack);
405 
406 /**
407  *	__dev_remove_pack	 - remove packet handler
408  *	@pt: packet type declaration
409  *
410  *	Remove a protocol handler that was previously added to the kernel
411  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
412  *	from the kernel lists and can be freed or reused once this function
413  *	returns.
414  *
415  *      The packet type might still be in use by receivers
416  *	and must not be freed until after all the CPUs have gone
417  *	through a quiescent state.
418  */
419 void __dev_remove_pack(struct packet_type *pt)
420 {
421 	struct list_head *head = ptype_head(pt);
422 	struct packet_type *pt1;
423 
424 	spin_lock(&ptype_lock);
425 
426 	list_for_each_entry(pt1, head, list) {
427 		if (pt == pt1) {
428 			list_del_rcu(&pt->list);
429 			goto out;
430 		}
431 	}
432 
433 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
434 out:
435 	spin_unlock(&ptype_lock);
436 }
437 EXPORT_SYMBOL(__dev_remove_pack);
438 
439 /**
440  *	dev_remove_pack	 - remove packet handler
441  *	@pt: packet type declaration
442  *
443  *	Remove a protocol handler that was previously added to the kernel
444  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
445  *	from the kernel lists and can be freed or reused once this function
446  *	returns.
447  *
448  *	This call sleeps to guarantee that no CPU is looking at the packet
449  *	type after return.
450  */
451 void dev_remove_pack(struct packet_type *pt)
452 {
453 	__dev_remove_pack(pt);
454 
455 	synchronize_net();
456 }
457 EXPORT_SYMBOL(dev_remove_pack);
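
/*
 * Example (illustrative sketch): how a module might register a tap for all
 * packets and later tear it down.  The handler name my_tap_rcv and the use
 * of ETH_P_ALL are assumptions made for this example.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// the tap owns this reference
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),	// every protocol (ptype_all)
 *		.func	= my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		// e.g. from module init
 *	dev_remove_pack(&my_tap);	// from module exit; may sleep
 */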
458 
459 /******************************************************************************
460 
461 		      Device Boot-time Settings Routines
462 
463 *******************************************************************************/
464 
465 /* Boot time configuration table */
466 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
467 
468 /**
469  *	netdev_boot_setup_add	- add new setup entry
470  *	@name: name of the device
471  *	@map: configured settings for the device
472  *
473  *	Adds a new setup entry to the dev_boot_setup list.  The function
474  *	returns 0 on error and 1 on success.  This is a generic routine for
475  *	all netdevices.
476  */
477 static int netdev_boot_setup_add(char *name, struct ifmap *map)
478 {
479 	struct netdev_boot_setup *s;
480 	int i;
481 
482 	s = dev_boot_setup;
483 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
484 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
485 			memset(s[i].name, 0, sizeof(s[i].name));
486 			strlcpy(s[i].name, name, IFNAMSIZ);
487 			memcpy(&s[i].map, map, sizeof(s[i].map));
488 			break;
489 		}
490 	}
491 
492 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
493 }
494 
495 /**
496  *	netdev_boot_setup_check	- check boot time settings
497  *	@dev: the netdevice
498  *
499  * 	Check the boot time settings for the device.
500  *	Any settings found are applied to the device, to be used
501  *	later during device probing.
502  *	Returns 0 if no settings are found, 1 if they are.
503  */
504 int netdev_boot_setup_check(struct net_device *dev)
505 {
506 	struct netdev_boot_setup *s = dev_boot_setup;
507 	int i;
508 
509 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
510 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
511 		    !strcmp(dev->name, s[i].name)) {
512 			dev->irq 	= s[i].map.irq;
513 			dev->base_addr 	= s[i].map.base_addr;
514 			dev->mem_start 	= s[i].map.mem_start;
515 			dev->mem_end 	= s[i].map.mem_end;
516 			return 1;
517 		}
518 	}
519 	return 0;
520 }
521 EXPORT_SYMBOL(netdev_boot_setup_check);
522 
523 
524 /**
525  *	netdev_boot_base	- get address from boot time settings
526  *	@prefix: prefix for network device
527  *	@unit: id for network device
528  *
529  * 	Check the boot time settings for the base address of the device.
530  *	Any settings found are applied to the device, to be used
531  *	later during device probing.
532  *	Returns 0 if no settings are found.
533  */
534 unsigned long netdev_boot_base(const char *prefix, int unit)
535 {
536 	const struct netdev_boot_setup *s = dev_boot_setup;
537 	char name[IFNAMSIZ];
538 	int i;
539 
540 	sprintf(name, "%s%d", prefix, unit);
541 
542 	/*
543 	 * If the device is already registered then return a base of 1
544 	 * to indicate that this interface should not be probed
545 	 */
546 	if (__dev_get_by_name(&init_net, name))
547 		return 1;
548 
549 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
550 		if (!strcmp(name, s[i].name))
551 			return s[i].map.base_addr;
552 	return 0;
553 }
554 
555 /*
556  * Saves the settings configured at boot time for any netdevice.
557  */
558 int __init netdev_boot_setup(char *str)
559 {
560 	int ints[5];
561 	struct ifmap map;
562 
563 	str = get_options(str, ARRAY_SIZE(ints), ints);
564 	if (!str || !*str)
565 		return 0;
566 
567 	/* Save settings */
568 	memset(&map, 0, sizeof(map));
569 	if (ints[0] > 0)
570 		map.irq = ints[1];
571 	if (ints[0] > 1)
572 		map.base_addr = ints[2];
573 	if (ints[0] > 2)
574 		map.mem_start = ints[3];
575 	if (ints[0] > 3)
576 		map.mem_end = ints[4];
577 
578 	/* Add new entry to the list */
579 	return netdev_boot_setup_add(str, &map);
580 }
581 
582 __setup("netdev=", netdev_boot_setup);
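
/*
 * Example (illustrative): given the parsing above, a kernel command line
 * entry such as
 *
 *	netdev=9,0x300,eth0
 *
 * records irq=9 and base_addr=0x300 for the interface named "eth0", to be
 * picked up later by netdev_boot_setup_check().  Up to four integers
 * (irq, base_addr, mem_start, mem_end) may precede the name.
 */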
583 
584 /*******************************************************************************
585 
586 			    Device Interface Subroutines
587 
588 *******************************************************************************/
589 
590 /**
591  *	__dev_get_by_name	- find a device by its name
592  *	@net: the applicable net namespace
593  *	@name: name to find
594  *
595  *	Find an interface by name. Must be called under RTNL semaphore
596  *	or @dev_base_lock. If the name is found a pointer to the device
597  *	is returned. If the name is not found then %NULL is returned. The
598  *	reference counters are not incremented so the caller must be
599  *	careful with locks.
600  */
601 
602 struct net_device *__dev_get_by_name(struct net *net, const char *name)
603 {
604 	struct hlist_node *p;
605 	struct net_device *dev;
606 	struct hlist_head *head = dev_name_hash(net, name);
607 
608 	hlist_for_each_entry(dev, p, head, name_hlist)
609 		if (!strncmp(dev->name, name, IFNAMSIZ))
610 			return dev;
611 
612 	return NULL;
613 }
614 EXPORT_SYMBOL(__dev_get_by_name);
615 
616 /**
617  *	dev_get_by_name_rcu	- find a device by its name
618  *	@net: the applicable net namespace
619  *	@name: name to find
620  *
621  *	Find an interface by name.
622  *	If the name is found a pointer to the device is returned.
623  * 	If the name is not found then %NULL is returned.
624  *	The reference counters are not incremented so the caller must be
625  *	careful with locks. The caller must hold RCU lock.
626  */
627 
628 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
629 {
630 	struct hlist_node *p;
631 	struct net_device *dev;
632 	struct hlist_head *head = dev_name_hash(net, name);
633 
634 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
635 		if (!strncmp(dev->name, name, IFNAMSIZ))
636 			return dev;
637 
638 	return NULL;
639 }
640 EXPORT_SYMBOL(dev_get_by_name_rcu);
641 
642 /**
643  *	dev_get_by_name		- find a device by its name
644  *	@net: the applicable net namespace
645  *	@name: name to find
646  *
647  *	Find an interface by name. This can be called from any
648  *	context and does its own locking. The returned handle has
649  *	the usage count incremented and the caller must use dev_put() to
650  *	release it when it is no longer needed. %NULL is returned if no
651  *	matching device is found.
652  */
653 
654 struct net_device *dev_get_by_name(struct net *net, const char *name)
655 {
656 	struct net_device *dev;
657 
658 	rcu_read_lock();
659 	dev = dev_get_by_name_rcu(net, name);
660 	if (dev)
661 		dev_hold(dev);
662 	rcu_read_unlock();
663 	return dev;
664 }
665 EXPORT_SYMBOL(dev_get_by_name);
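
/*
 * Example (illustrative sketch): the refcounted and the RCU lookup styles.
 * The name "eth0" and the use of &init_net are just examples.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		// use dev here
 *		dev_put(dev);		// drop the reference we took
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	// dev, if non-NULL, is only valid until rcu_read_unlock()
 *	rcu_read_unlock();
 */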
666 
667 /**
668  *	__dev_get_by_index - find a device by its ifindex
669  *	@net: the applicable net namespace
670  *	@ifindex: index of device
671  *
672  *	Search for an interface by index. Returns a pointer to the device,
673  *	or %NULL if the device is not found. The device has not
674  *	had its reference counter increased so the caller must be careful
675  *	about locking. The caller must hold either the RTNL semaphore
676  *	or @dev_base_lock.
677  */
678 
679 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
680 {
681 	struct hlist_node *p;
682 	struct net_device *dev;
683 	struct hlist_head *head = dev_index_hash(net, ifindex);
684 
685 	hlist_for_each_entry(dev, p, head, index_hlist)
686 		if (dev->ifindex == ifindex)
687 			return dev;
688 
689 	return NULL;
690 }
691 EXPORT_SYMBOL(__dev_get_by_index);
692 
693 /**
694  *	dev_get_by_index_rcu - find a device by its ifindex
695  *	@net: the applicable net namespace
696  *	@ifindex: index of device
697  *
698  *	Search for an interface by index. Returns a pointer to the device,
699  *	or %NULL if the device is not found. The device has not
700  *	had its reference counter increased so the caller must be careful
701  *	about locking. The caller must hold RCU lock.
702  */
703 
704 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
705 {
706 	struct hlist_node *p;
707 	struct net_device *dev;
708 	struct hlist_head *head = dev_index_hash(net, ifindex);
709 
710 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
711 		if (dev->ifindex == ifindex)
712 			return dev;
713 
714 	return NULL;
715 }
716 EXPORT_SYMBOL(dev_get_by_index_rcu);
717 
718 
719 /**
720  *	dev_get_by_index - find a device by its ifindex
721  *	@net: the applicable net namespace
722  *	@ifindex: index of device
723  *
724  *	Search for an interface by index. Returns a pointer to the device,
725  *	or NULL if the device is not found. The device returned has
726  *	had a reference added and the pointer is safe until the user calls
727  *	dev_put to indicate they have finished with it.
728  */
729 
730 struct net_device *dev_get_by_index(struct net *net, int ifindex)
731 {
732 	struct net_device *dev;
733 
734 	rcu_read_lock();
735 	dev = dev_get_by_index_rcu(net, ifindex);
736 	if (dev)
737 		dev_hold(dev);
738 	rcu_read_unlock();
739 	return dev;
740 }
741 EXPORT_SYMBOL(dev_get_by_index);
742 
743 /**
744  *	dev_getbyhwaddr - find a device by its hardware address
745  *	@net: the applicable net namespace
746  *	@type: media type of device
747  *	@ha: hardware address
748  *
749  *	Search for an interface by MAC address. Returns a pointer to the
750  *	device, or NULL if the device is not found. The caller must hold the
751  *	rtnl semaphore. The returned device has not had its ref count increased
752  *	and the caller must therefore be careful about locking.
753  *
754  *	BUGS:
755  *	If the API was consistent this would be __dev_get_by_hwaddr
756  */
757 
758 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
759 {
760 	struct net_device *dev;
761 
762 	ASSERT_RTNL();
763 
764 	for_each_netdev(net, dev)
765 		if (dev->type == type &&
766 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
767 			return dev;
768 
769 	return NULL;
770 }
771 EXPORT_SYMBOL(dev_getbyhwaddr);
772 
773 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
774 {
775 	struct net_device *dev;
776 
777 	ASSERT_RTNL();
778 	for_each_netdev(net, dev)
779 		if (dev->type == type)
780 			return dev;
781 
782 	return NULL;
783 }
784 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
785 
786 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
787 {
788 	struct net_device *dev, *ret = NULL;
789 
790 	rcu_read_lock();
791 	for_each_netdev_rcu(net, dev)
792 		if (dev->type == type) {
793 			dev_hold(dev);
794 			ret = dev;
795 			break;
796 		}
797 	rcu_read_unlock();
798 	return ret;
799 }
800 EXPORT_SYMBOL(dev_getfirstbyhwtype);
801 
802 /**
803  *	dev_get_by_flags_rcu - find any device with given flags
804  *	@net: the applicable net namespace
805  *	@if_flags: IFF_* values
806  *	@mask: bitmask of bits in if_flags to check
807  *
808  *	Search for any interface with the given flags. Returns a pointer to the
809  *	device, or NULL if no matching device is found. Must be called inside
810  *	rcu_read_lock(), and the result's refcount is unchanged.
811  */
812 
813 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
814 				    unsigned short mask)
815 {
816 	struct net_device *dev, *ret;
817 
818 	ret = NULL;
819 	for_each_netdev_rcu(net, dev) {
820 		if (((dev->flags ^ if_flags) & mask) == 0) {
821 			ret = dev;
822 			break;
823 		}
824 	}
825 	return ret;
826 }
827 EXPORT_SYMBOL(dev_get_by_flags_rcu);
828 
829 /**
830  *	dev_valid_name - check if name is okay for network device
831  *	@name: name string
832  *
833  *	Network device names need to be valid file names to
834  *	allow sysfs to work.  We also disallow any kind of
835  *	whitespace.
836  */
837 int dev_valid_name(const char *name)
838 {
839 	if (*name == '\0')
840 		return 0;
841 	if (strlen(name) >= IFNAMSIZ)
842 		return 0;
843 	if (!strcmp(name, ".") || !strcmp(name, ".."))
844 		return 0;
845 
846 	while (*name) {
847 		if (*name == '/' || isspace(*name))
848 			return 0;
849 		name++;
850 	}
851 	return 1;
852 }
853 EXPORT_SYMBOL(dev_valid_name);
854 
855 /**
856  *	__dev_alloc_name - allocate a name for a device
857  *	@net: network namespace to allocate the device name in
858  *	@name: name format string
859  *	@buf:  scratch buffer and result name string
860  *
861  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
862  *	id. It scans the list of devices to build up a free map, then chooses
863  *	the first empty slot. The caller must hold the dev_base or rtnl lock
864  *	while allocating the name and adding the device in order to avoid
865  *	duplicates.
866  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
867  *	Returns the number of the unit assigned or a negative errno code.
868  */
869 
870 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
871 {
872 	int i = 0;
873 	const char *p;
874 	const int max_netdevices = 8*PAGE_SIZE;
875 	unsigned long *inuse;
876 	struct net_device *d;
877 
878 	p = strnchr(name, IFNAMSIZ-1, '%');
879 	if (p) {
880 		/*
881 		 * Verify the string as this thing may have come from
882 		 * the user.  There must be either one "%d" and no other "%"
883 		 * characters.
884 		 */
885 		if (p[1] != 'd' || strchr(p + 2, '%'))
886 			return -EINVAL;
887 
888 		/* Use one page as a bit array of possible slots */
889 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
890 		if (!inuse)
891 			return -ENOMEM;
892 
893 		for_each_netdev(net, d) {
894 			if (!sscanf(d->name, name, &i))
895 				continue;
896 			if (i < 0 || i >= max_netdevices)
897 				continue;
898 
899 			/*  avoid cases where sscanf is not exact inverse of printf */
900 			snprintf(buf, IFNAMSIZ, name, i);
901 			if (!strncmp(buf, d->name, IFNAMSIZ))
902 				set_bit(i, inuse);
903 		}
904 
905 		i = find_first_zero_bit(inuse, max_netdevices);
906 		free_page((unsigned long) inuse);
907 	}
908 
909 	if (buf != name)
910 		snprintf(buf, IFNAMSIZ, name, i);
911 	if (!__dev_get_by_name(net, buf))
912 		return i;
913 
914 	/* It is possible to run out of possible slots
915 	 * when the name is long and there isn't enough space left
916 	 * for the digits, or if all bits are used.
917 	 */
918 	return -ENFILE;
919 }
920 
921 /**
922  *	dev_alloc_name - allocate a name for a device
923  *	@dev: device
924  *	@name: name format string
925  *
926  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
927  *	id. It scans the list of devices to build up a free map, then chooses
928  *	the first empty slot. The caller must hold the dev_base or rtnl lock
929  *	while allocating the name and adding the device in order to avoid
930  *	duplicates.
931  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
932  *	Returns the number of the unit assigned or a negative errno code.
933  */
934 
935 int dev_alloc_name(struct net_device *dev, const char *name)
936 {
937 	char buf[IFNAMSIZ];
938 	struct net *net;
939 	int ret;
940 
941 	BUG_ON(!dev_net(dev));
942 	net = dev_net(dev);
943 	ret = __dev_alloc_name(net, name, buf);
944 	if (ret >= 0)
945 		strlcpy(dev->name, buf, IFNAMSIZ);
946 	return ret;
947 }
948 EXPORT_SYMBOL(dev_alloc_name);
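
/*
 * Example (illustrative sketch): a driver naming a new device from a
 * template; "veth%d" is only an example format string.
 *
 *	int unit;
 *
 *	rtnl_lock();
 *	unit = dev_alloc_name(dev, "veth%d");	// picks e.g. "veth0", "veth1", ...
 *	rtnl_unlock();
 *	if (unit < 0)
 *		return unit;		// -EINVAL, -ENFILE or -ENOMEM
 */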
949 
950 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
951 {
952 	struct net *net;
953 
954 	BUG_ON(!dev_net(dev));
955 	net = dev_net(dev);
956 
957 	if (!dev_valid_name(name))
958 		return -EINVAL;
959 
960 	if (fmt && strchr(name, '%'))
961 		return dev_alloc_name(dev, name);
962 	else if (__dev_get_by_name(net, name))
963 		return -EEXIST;
964 	else if (dev->name != name)
965 		strlcpy(dev->name, name, IFNAMSIZ);
966 
967 	return 0;
968 }
969 
970 /**
971  *	dev_change_name - change name of a device
972  *	@dev: device
973  *	@newname: name (or format string) must be at least IFNAMSIZ
974  *
975  *	Change the name of a device; a format string such as "eth%d"
976  *	can be passed for wildcarding.
977  */
978 int dev_change_name(struct net_device *dev, const char *newname)
979 {
980 	char oldname[IFNAMSIZ];
981 	int err = 0;
982 	int ret;
983 	struct net *net;
984 
985 	ASSERT_RTNL();
986 	BUG_ON(!dev_net(dev));
987 
988 	net = dev_net(dev);
989 	if (dev->flags & IFF_UP)
990 		return -EBUSY;
991 
992 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
993 		return 0;
994 
995 	memcpy(oldname, dev->name, IFNAMSIZ);
996 
997 	err = dev_get_valid_name(dev, newname, 1);
998 	if (err < 0)
999 		return err;
1000 
1001 rollback:
1002 	ret = device_rename(&dev->dev, dev->name);
1003 	if (ret) {
1004 		memcpy(dev->name, oldname, IFNAMSIZ);
1005 		return ret;
1006 	}
1007 
1008 	write_lock_bh(&dev_base_lock);
1009 	hlist_del(&dev->name_hlist);
1010 	write_unlock_bh(&dev_base_lock);
1011 
1012 	synchronize_rcu();
1013 
1014 	write_lock_bh(&dev_base_lock);
1015 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1016 	write_unlock_bh(&dev_base_lock);
1017 
1018 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1019 	ret = notifier_to_errno(ret);
1020 
1021 	if (ret) {
1022 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1023 		if (err >= 0) {
1024 			err = ret;
1025 			memcpy(dev->name, oldname, IFNAMSIZ);
1026 			goto rollback;
1027 		} else {
1028 			printk(KERN_ERR
1029 			       "%s: name change rollback failed: %d.\n",
1030 			       dev->name, ret);
1031 		}
1032 	}
1033 
1034 	return err;
1035 }
1036 
1037 /**
1038  *	dev_set_alias - change ifalias of a device
1039  *	@dev: device
1040  *	@alias: name up to IFALIASZ
1041  *	@len: limit of bytes to copy from info
1042  *
1043  *	Set the ifalias for a device.
1044  */
1045 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1046 {
1047 	ASSERT_RTNL();
1048 
1049 	if (len >= IFALIASZ)
1050 		return -EINVAL;
1051 
1052 	if (!len) {
1053 		if (dev->ifalias) {
1054 			kfree(dev->ifalias);
1055 			dev->ifalias = NULL;
1056 		}
1057 		return 0;
1058 	}
1059 
1060 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1061 	if (!dev->ifalias)
1062 		return -ENOMEM;
1063 
1064 	strlcpy(dev->ifalias, alias, len+1);
1065 	return len;
1066 }
1067 
1068 
1069 /**
1070  *	netdev_features_change - device changes features
1071  *	@dev: device to cause notification
1072  *
1073  *	Called to indicate a device has changed features.
1074  */
1075 void netdev_features_change(struct net_device *dev)
1076 {
1077 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1078 }
1079 EXPORT_SYMBOL(netdev_features_change);
1080 
1081 /**
1082  *	netdev_state_change - device changes state
1083  *	@dev: device to cause notification
1084  *
1085  *	Called to indicate a device has changed state. This function calls
1086  *	the notifier chains for netdev_chain and sends a NEWLINK message
1087  *	to the routing socket.
1088  */
1089 void netdev_state_change(struct net_device *dev)
1090 {
1091 	if (dev->flags & IFF_UP) {
1092 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1093 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1094 	}
1095 }
1096 EXPORT_SYMBOL(netdev_state_change);
1097 
1098 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1099 {
1100 	return call_netdevice_notifiers(event, dev);
1101 }
1102 EXPORT_SYMBOL(netdev_bonding_change);
1103 
1104 /**
1105  *	dev_load 	- load a network module
1106  *	@net: the applicable net namespace
1107  *	@name: name of interface
1108  *
1109  *	If a network interface is not present and the process has suitable
1110  *	privileges this function loads the module. If module loading is not
1111  *	available in this kernel then it becomes a nop.
1112  */
1113 
1114 void dev_load(struct net *net, const char *name)
1115 {
1116 	struct net_device *dev;
1117 
1118 	rcu_read_lock();
1119 	dev = dev_get_by_name_rcu(net, name);
1120 	rcu_read_unlock();
1121 
1122 	if (!dev && capable(CAP_NET_ADMIN))
1123 		request_module("%s", name);
1124 }
1125 EXPORT_SYMBOL(dev_load);
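
/*
 * Example (illustrative sketch): an ioctl-style path resolving a name from
 * userspace; net and ifr stand for the caller's namespace and struct ifreq.
 *
 *	dev_load(net, ifr->ifr_name);	// may request_module() the name
 *	rtnl_lock();
 *	dev = __dev_get_by_name(net, ifr->ifr_name);
 *	// act on dev (may be NULL) while holding the RTNL
 *	rtnl_unlock();
 */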
1126 
1127 static int __dev_open(struct net_device *dev)
1128 {
1129 	const struct net_device_ops *ops = dev->netdev_ops;
1130 	int ret;
1131 
1132 	ASSERT_RTNL();
1133 
1134 	/*
1135 	 *	Is it even present?
1136 	 */
1137 	if (!netif_device_present(dev))
1138 		return -ENODEV;
1139 
1140 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1141 	ret = notifier_to_errno(ret);
1142 	if (ret)
1143 		return ret;
1144 
1145 	/*
1146 	 *	Call device private open method
1147 	 */
1148 	set_bit(__LINK_STATE_START, &dev->state);
1149 
1150 	if (ops->ndo_validate_addr)
1151 		ret = ops->ndo_validate_addr(dev);
1152 
1153 	if (!ret && ops->ndo_open)
1154 		ret = ops->ndo_open(dev);
1155 
1156 	/*
1157 	 *	If it went open OK then:
1158 	 */
1159 
1160 	if (ret)
1161 		clear_bit(__LINK_STATE_START, &dev->state);
1162 	else {
1163 		/*
1164 		 *	Set the flags.
1165 		 */
1166 		dev->flags |= IFF_UP;
1167 
1168 		/*
1169 		 *	Enable NET_DMA
1170 		 */
1171 		net_dmaengine_get();
1172 
1173 		/*
1174 		 *	Initialize multicasting status
1175 		 */
1176 		dev_set_rx_mode(dev);
1177 
1178 		/*
1179 		 *	Wakeup transmit queue engine
1180 		 */
1181 		dev_activate(dev);
1182 	}
1183 
1184 	return ret;
1185 }
1186 
1187 /**
1188  *	dev_open	- prepare an interface for use.
1189  *	@dev:	device to open
1190  *
1191  *	Takes a device from down to up state. The device's private open
1192  *	function is invoked and then the multicast lists are loaded. Finally
1193  *	the device is moved into the up state and a %NETDEV_UP message is
1194  *	sent to the netdev notifier chain.
1195  *
1196  *	Calling this function on an active interface is a nop. On a failure
1197  *	a negative errno code is returned.
1198  */
1199 int dev_open(struct net_device *dev)
1200 {
1201 	int ret;
1202 
1203 	/*
1204 	 *	Is it already up?
1205 	 */
1206 	if (dev->flags & IFF_UP)
1207 		return 0;
1208 
1209 	/*
1210 	 *	Open device
1211 	 */
1212 	ret = __dev_open(dev);
1213 	if (ret < 0)
1214 		return ret;
1215 
1216 	/*
1217 	 *	... and announce new interface.
1218 	 */
1219 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220 	call_netdevice_notifiers(NETDEV_UP, dev);
1221 
1222 	return ret;
1223 }
1224 EXPORT_SYMBOL(dev_open);
1225 
1226 static int __dev_close(struct net_device *dev)
1227 {
1228 	const struct net_device_ops *ops = dev->netdev_ops;
1229 
1230 	ASSERT_RTNL();
1231 	might_sleep();
1232 
1233 	/*
1234 	 *	Tell people we are going down, so that they can
1235 	 *	prepare for it while the device is still operating.
1236 	 */
1237 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238 
1239 	clear_bit(__LINK_STATE_START, &dev->state);
1240 
1241 	/* Synchronize to scheduled poll. We cannot touch poll list,
1242 	 * it can even be on a different cpu. So just clear netif_running().
1243 	 *
1244 	 * dev->stop() will invoke napi_disable() on all of its
1245 	 * napi_struct instances on this device.
1246 	 */
1247 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248 
1249 	dev_deactivate(dev);
1250 
1251 	/*
1252 	 *	Call the device specific close. This cannot fail.
1253 	 *	Only if device is UP
1254 	 *
1255 	 *	We allow it to be called even after a DETACH hot-plug
1256 	 *	event.
1257 	 */
1258 	if (ops->ndo_stop)
1259 		ops->ndo_stop(dev);
1260 
1261 	/*
1262 	 *	Device is now down.
1263 	 */
1264 
1265 	dev->flags &= ~IFF_UP;
1266 
1267 	/*
1268 	 *	Shutdown NET_DMA
1269 	 */
1270 	net_dmaengine_put();
1271 
1272 	return 0;
1273 }
1274 
1275 /**
1276  *	dev_close - shutdown an interface.
1277  *	@dev: device to shutdown
1278  *
1279  *	This function moves an active device into down state. A
1280  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1281  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1282  *	chain.
1283  */
1284 int dev_close(struct net_device *dev)
1285 {
1286 	if (!(dev->flags & IFF_UP))
1287 		return 0;
1288 
1289 	__dev_close(dev);
1290 
1291 	/*
1292 	 * Tell people we are down
1293 	 */
1294 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1295 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1296 
1297 	return 0;
1298 }
1299 EXPORT_SYMBOL(dev_close);
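
/*
 * Example (illustrative): both dev_open() and dev_close() expect the caller
 * to hold the RTNL, e.g.:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);	// and later: dev_close(dev)
 *	rtnl_unlock();
 */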
1300 
1301 
1302 /**
1303  *	dev_disable_lro - disable Large Receive Offload on a device
1304  *	@dev: device
1305  *
1306  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1307  *	called under RTNL.  This is needed if received packets may be
1308  *	forwarded to another interface.
1309  */
1310 void dev_disable_lro(struct net_device *dev)
1311 {
1312 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1313 	    dev->ethtool_ops->set_flags) {
1314 		u32 flags = dev->ethtool_ops->get_flags(dev);
1315 		if (flags & ETH_FLAG_LRO) {
1316 			flags &= ~ETH_FLAG_LRO;
1317 			dev->ethtool_ops->set_flags(dev, flags);
1318 		}
1319 	}
1320 	WARN_ON(dev->features & NETIF_F_LRO);
1321 }
1322 EXPORT_SYMBOL(dev_disable_lro);
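
/*
 * Example (illustrative): code that starts forwarding frames received on a
 * device, e.g. when enslaving it to a bridge-like setup, would typically do
 * (slave_dev is a made-up name):
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(slave_dev);	// LRO-merged frames must not be forwarded
 */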
1323 
1324 
1325 static int dev_boot_phase = 1;
1326 
1327 /*
1328  *	Device change register/unregister. These are not inline or static
1329  *	as we export them to the world.
1330  */
1331 
1332 /**
1333  *	register_netdevice_notifier - register a network notifier block
1334  *	@nb: notifier
1335  *
1336  *	Register a notifier to be called when network device events occur.
1337  *	The notifier passed is linked into the kernel structures and must
1338  *	not be reused until it has been unregistered. A negative errno code
1339  *	is returned on a failure.
1340  *
1341  * 	When registered, all registration and up events are replayed
1342  *	to the new notifier to allow it to have a race-free
1343  *	view of the network device list.
1344  */
1345 
1346 int register_netdevice_notifier(struct notifier_block *nb)
1347 {
1348 	struct net_device *dev;
1349 	struct net_device *last;
1350 	struct net *net;
1351 	int err;
1352 
1353 	rtnl_lock();
1354 	err = raw_notifier_chain_register(&netdev_chain, nb);
1355 	if (err)
1356 		goto unlock;
1357 	if (dev_boot_phase)
1358 		goto unlock;
1359 	for_each_net(net) {
1360 		for_each_netdev(net, dev) {
1361 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1362 			err = notifier_to_errno(err);
1363 			if (err)
1364 				goto rollback;
1365 
1366 			if (!(dev->flags & IFF_UP))
1367 				continue;
1368 
1369 			nb->notifier_call(nb, NETDEV_UP, dev);
1370 		}
1371 	}
1372 
1373 unlock:
1374 	rtnl_unlock();
1375 	return err;
1376 
1377 rollback:
1378 	last = dev;
1379 	for_each_net(net) {
1380 		for_each_netdev(net, dev) {
1381 			if (dev == last)
1382 				break;
1383 
1384 			if (dev->flags & IFF_UP) {
1385 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1386 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1387 			}
1388 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1389 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1390 		}
1391 	}
1392 
1393 	raw_notifier_chain_unregister(&netdev_chain, nb);
1394 	goto unlock;
1395 }
1396 EXPORT_SYMBOL(register_netdevice_notifier);
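
/*
 * Example (illustrative sketch): a minimal notifier.  The callback and block
 * names are made up; note that in this file the pointer handed to the
 * callback is the struct net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_GOING_DOWN:
 *			printk(KERN_INFO "%s is going down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);	// replays REGISTER/UP
 */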
1397 
1398 /**
1399  *	unregister_netdevice_notifier - unregister a network notifier block
1400  *	@nb: notifier
1401  *
1402  *	Unregister a notifier previously registered by
1403  *	register_netdevice_notifier(). The notifier is unlinked from the
1404  *	kernel structures and may then be reused. A negative errno code
1405  *	is returned on a failure.
1406  */
1407 
1408 int unregister_netdevice_notifier(struct notifier_block *nb)
1409 {
1410 	int err;
1411 
1412 	rtnl_lock();
1413 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1414 	rtnl_unlock();
1415 	return err;
1416 }
1417 EXPORT_SYMBOL(unregister_netdevice_notifier);
1418 
1419 /**
1420  *	call_netdevice_notifiers - call all network notifier blocks
1421  *      @val: value passed unmodified to notifier function
1422  *      @dev: net_device pointer passed unmodified to notifier function
1423  *
1424  *	Call all network notifier blocks.  Parameters and return value
1425  *	are as for raw_notifier_call_chain().
1426  */
1427 
1428 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1429 {
1430 	ASSERT_RTNL();
1431 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1432 }
1433 
1434 /* When > 0 there are consumers of rx skb time stamps */
1435 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1436 
1437 void net_enable_timestamp(void)
1438 {
1439 	atomic_inc(&netstamp_needed);
1440 }
1441 EXPORT_SYMBOL(net_enable_timestamp);
1442 
1443 void net_disable_timestamp(void)
1444 {
1445 	atomic_dec(&netstamp_needed);
1446 }
1447 EXPORT_SYMBOL(net_disable_timestamp);
1448 
1449 static inline void net_timestamp_set(struct sk_buff *skb)
1450 {
1451 	if (atomic_read(&netstamp_needed))
1452 		__net_timestamp(skb);
1453 	else
1454 		skb->tstamp.tv64 = 0;
1455 }
1456 
1457 static inline void net_timestamp_check(struct sk_buff *skb)
1458 {
1459 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1460 		__net_timestamp(skb);
1461 }
1462 
1463 /**
1464  * dev_forward_skb - loopback an skb to another netif
1465  *
1466  * @dev: destination network device
1467  * @skb: buffer to forward
1468  *
1469  * return values:
1470  *	NET_RX_SUCCESS	(no congestion)
1471  *	NET_RX_DROP     (packet was dropped, but freed)
1472  *
1473  * dev_forward_skb can be used for injecting an skb from the
1474  * start_xmit function of one device into the receive queue
1475  * of another device.
1476  *
1477  * The receiving device may be in another namespace, so
1478  * we have to clear all information in the skb that could
1479  * impact namespace isolation.
1480  */
1481 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1482 {
1483 	skb_orphan(skb);
1484 	nf_reset(skb);
1485 
1486 	if (!(dev->flags & IFF_UP) ||
1487 	    (skb->len > (dev->mtu + dev->hard_header_len))) {
1488 		kfree_skb(skb);
1489 		return NET_RX_DROP;
1490 	}
1491 	skb_set_dev(skb, dev);
1492 	skb->tstamp.tv64 = 0;
1493 	skb->pkt_type = PACKET_HOST;
1494 	skb->protocol = eth_type_trans(skb, dev);
1495 	return netif_rx(skb);
1496 }
1497 EXPORT_SYMBOL_GPL(dev_forward_skb);
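
/*
 * Example (illustrative sketch): the typical user of this helper is a
 * virtual device pair (veth-style) whose transmit routine hands the skb
 * straight to its peer's receive path.  my_priv and peer are made-up names.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		dev_forward_skb(priv->peer, skb);	// frees skb on drop
 *		return NETDEV_TX_OK;
 *	}
 */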
1498 
1499 /*
1500  *	Support routine. Sends outgoing frames to any network
1501  *	taps currently in use.
1502  */
1503 
1504 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1505 {
1506 	struct packet_type *ptype;
1507 
1508 #ifdef CONFIG_NET_CLS_ACT
1509 	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1510 		net_timestamp_set(skb);
1511 #else
1512 	net_timestamp_set(skb);
1513 #endif
1514 
1515 	rcu_read_lock();
1516 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1517 		/* Never send packets back to the socket
1518 		 * they originated from - MvS ([email protected])
1519 		 */
1520 		if ((ptype->dev == dev || !ptype->dev) &&
1521 		    (ptype->af_packet_priv == NULL ||
1522 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1523 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1524 			if (!skb2)
1525 				break;
1526 
1527 			/* skb->nh should be correctly
1528 			   set by sender, so that the second statement is
1529 			   just protection against buggy protocols.
1530 			 */
1531 			skb_reset_mac_header(skb2);
1532 
1533 			if (skb_network_header(skb2) < skb2->data ||
1534 			    skb2->network_header > skb2->tail) {
1535 				if (net_ratelimit())
1536 					printk(KERN_CRIT "protocol %04x is "
1537 					       "buggy, dev %s\n",
1538 					       ntohs(skb2->protocol),
1539 					       dev->name);
1540 				skb_reset_network_header(skb2);
1541 			}
1542 
1543 			skb2->transport_header = skb2->network_header;
1544 			skb2->pkt_type = PACKET_OUTGOING;
1545 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1546 		}
1547 	}
1548 	rcu_read_unlock();
1549 }
1550 
1551 /*
1552  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1553  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1554  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1555 void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1556 {
1557 	unsigned int real_num = dev->real_num_tx_queues;
1558 
1559 	if (unlikely(txq > dev->num_tx_queues))
1560 		;
1561 	else if (txq > real_num)
1562 		dev->real_num_tx_queues = txq;
1563 	else if (txq < real_num) {
1564 		dev->real_num_tx_queues = txq;
1565 		qdisc_reset_all_tx_gt(dev, txq);
1566 	}
1567 }
1568 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
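
/*
 * Example (illustrative): a multiqueue driver that ends up with fewer usable
 * TX rings than it allocated could shrink the active set with something like
 * (active_tx_rings being a driver-specific count <= dev->num_tx_queues):
 *
 *	netif_set_real_num_tx_queues(dev, active_tx_rings);
 */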
1569 
1570 static inline void __netif_reschedule(struct Qdisc *q)
1571 {
1572 	struct softnet_data *sd;
1573 	unsigned long flags;
1574 
1575 	local_irq_save(flags);
1576 	sd = &__get_cpu_var(softnet_data);
1577 	q->next_sched = NULL;
1578 	*sd->output_queue_tailp = q;
1579 	sd->output_queue_tailp = &q->next_sched;
1580 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1581 	local_irq_restore(flags);
1582 }
1583 
1584 void __netif_schedule(struct Qdisc *q)
1585 {
1586 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1587 		__netif_reschedule(q);
1588 }
1589 EXPORT_SYMBOL(__netif_schedule);
1590 
1591 void dev_kfree_skb_irq(struct sk_buff *skb)
1592 {
1593 	if (atomic_dec_and_test(&skb->users)) {
1594 		struct softnet_data *sd;
1595 		unsigned long flags;
1596 
1597 		local_irq_save(flags);
1598 		sd = &__get_cpu_var(softnet_data);
1599 		skb->next = sd->completion_queue;
1600 		sd->completion_queue = skb;
1601 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1602 		local_irq_restore(flags);
1603 	}
1604 }
1605 EXPORT_SYMBOL(dev_kfree_skb_irq);
1606 
1607 void dev_kfree_skb_any(struct sk_buff *skb)
1608 {
1609 	if (in_irq() || irqs_disabled())
1610 		dev_kfree_skb_irq(skb);
1611 	else
1612 		dev_kfree_skb(skb);
1613 }
1614 EXPORT_SYMBOL(dev_kfree_skb_any);
1615 
1616 
1617 /**
1618  * netif_device_detach - mark device as removed
1619  * @dev: network device
1620  *
1621  * Mark device as removed from system and therefore no longer available.
1622  */
1623 void netif_device_detach(struct net_device *dev)
1624 {
1625 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1626 	    netif_running(dev)) {
1627 		netif_tx_stop_all_queues(dev);
1628 	}
1629 }
1630 EXPORT_SYMBOL(netif_device_detach);
1631 
1632 /**
1633  * netif_device_attach - mark device as attached
1634  * @dev: network device
1635  *
1636  * Mark the device as attached to the system and restart it if needed.
1637  */
1638 void netif_device_attach(struct net_device *dev)
1639 {
1640 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1641 	    netif_running(dev)) {
1642 		netif_tx_wake_all_queues(dev);
1643 		__netdev_watchdog_up(dev);
1644 	}
1645 }
1646 EXPORT_SYMBOL(netif_device_attach);
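
/*
 * Example (illustrative sketch): a PCI network driver typically pairs these
 * calls in its power management hooks.  my_suspend/my_resume are made-up
 * names and the chip-specific steps are elided.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);	// stop queues, mark absent
 *		// save chip state, power down
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		// power up, restore chip state
 *		netif_device_attach(dev);	// mark present, wake queues
 *		return 0;
 *	}
 */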
1647 
1648 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1649 {
1650 	return ((features & NETIF_F_GEN_CSUM) ||
1651 		((features & NETIF_F_IP_CSUM) &&
1652 		 protocol == htons(ETH_P_IP)) ||
1653 		((features & NETIF_F_IPV6_CSUM) &&
1654 		 protocol == htons(ETH_P_IPV6)) ||
1655 		((features & NETIF_F_FCOE_CRC) &&
1656 		 protocol == htons(ETH_P_FCOE)));
1657 }
1658 
1659 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1660 {
1661 	if (can_checksum_protocol(dev->features, skb->protocol))
1662 		return true;
1663 
1664 	if (skb->protocol == htons(ETH_P_8021Q)) {
1665 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1666 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1667 					  veh->h_vlan_encapsulated_proto))
1668 			return true;
1669 	}
1670 
1671 	return false;
1672 }
1673 
1674 /**
1675  * skb_set_dev - assign a new device to a buffer
1676  * @skb: buffer for the new device
1677  * @dev: network device
1678  *
1679  * If an skb is owned by a device already, we have to reset
1680  * all data private to the namespace a device belongs to
1681  * before assigning it a new device.
1682  */
1683 #ifdef CONFIG_NET_NS
1684 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1685 {
1686 	skb_dst_drop(skb);
1687 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1688 		secpath_reset(skb);
1689 		nf_reset(skb);
1690 		skb_init_secmark(skb);
1691 		skb->mark = 0;
1692 		skb->priority = 0;
1693 		skb->nf_trace = 0;
1694 		skb->ipvs_property = 0;
1695 #ifdef CONFIG_NET_SCHED
1696 		skb->tc_index = 0;
1697 #endif
1698 	}
1699 	skb->dev = dev;
1700 }
1701 EXPORT_SYMBOL(skb_set_dev);
1702 #endif /* CONFIG_NET_NS */
1703 
1704 /*
1705  * Invalidate hardware checksum when packet is to be mangled, and
1706  * complete checksum manually on outgoing path.
1707  */
1708 int skb_checksum_help(struct sk_buff *skb)
1709 {
1710 	__wsum csum;
1711 	int ret = 0, offset;
1712 
1713 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1714 		goto out_set_summed;
1715 
1716 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1717 		/* Let GSO fix up the checksum. */
1718 		goto out_set_summed;
1719 	}
1720 
1721 	offset = skb->csum_start - skb_headroom(skb);
1722 	BUG_ON(offset >= skb_headlen(skb));
1723 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1724 
1725 	offset += skb->csum_offset;
1726 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1727 
1728 	if (skb_cloned(skb) &&
1729 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1730 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1731 		if (ret)
1732 			goto out;
1733 	}
1734 
1735 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1736 out_set_summed:
1737 	skb->ip_summed = CHECKSUM_NONE;
1738 out:
1739 	return ret;
1740 }
1741 EXPORT_SYMBOL(skb_checksum_help);
1742 
1743 /**
1744  *	skb_gso_segment - Perform segmentation on skb.
1745  *	@skb: buffer to segment
1746  *	@features: features for the output path (see dev->features)
1747  *
1748  *	This function segments the given skb and returns a list of segments.
1749  *
1750  *	It may return NULL if the skb requires no segmentation.  This is
1751  *	only possible when GSO is used for verifying header integrity.
1752  */
1753 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1754 {
1755 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1756 	struct packet_type *ptype;
1757 	__be16 type = skb->protocol;
1758 	int err;
1759 
1760 	skb_reset_mac_header(skb);
1761 	skb->mac_len = skb->network_header - skb->mac_header;
1762 	__skb_pull(skb, skb->mac_len);
1763 
1764 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1765 		struct net_device *dev = skb->dev;
1766 		struct ethtool_drvinfo info = {};
1767 
1768 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1769 			dev->ethtool_ops->get_drvinfo(dev, &info);
1770 
1771 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1772 			"ip_summed=%d",
1773 		     info.driver, dev ? dev->features : 0L,
1774 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1775 		     skb->len, skb->data_len, skb->ip_summed);
1776 
1777 		if (skb_header_cloned(skb) &&
1778 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1779 			return ERR_PTR(err);
1780 	}
1781 
1782 	rcu_read_lock();
1783 	list_for_each_entry_rcu(ptype,
1784 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1785 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1786 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1787 				err = ptype->gso_send_check(skb);
1788 				segs = ERR_PTR(err);
1789 				if (err || skb_gso_ok(skb, features))
1790 					break;
1791 				__skb_push(skb, (skb->data -
1792 						 skb_network_header(skb)));
1793 			}
1794 			segs = ptype->gso_segment(skb, features);
1795 			break;
1796 		}
1797 	}
1798 	rcu_read_unlock();
1799 
1800 	__skb_push(skb, skb->data - skb_mac_header(skb));
1801 
1802 	return segs;
1803 }
1804 EXPORT_SYMBOL(skb_gso_segment);
1805 
1806 /* Take action when hardware reception checksum errors are detected. */
1807 #ifdef CONFIG_BUG
1808 void netdev_rx_csum_fault(struct net_device *dev)
1809 {
1810 	if (net_ratelimit()) {
1811 		printk(KERN_ERR "%s: hw csum failure.\n",
1812 			dev ? dev->name : "<unknown>");
1813 		dump_stack();
1814 	}
1815 }
1816 EXPORT_SYMBOL(netdev_rx_csum_fault);
1817 #endif
1818 
1819 /* Actually, we should eliminate this check as soon as we know that:
1820  * 1. An IOMMU is present and allows us to map all the memory.
1821  * 2. No high memory really exists on this machine.
1822  */
1823 
1824 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1825 {
1826 #ifdef CONFIG_HIGHMEM
1827 	int i;
1828 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1829 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1830 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1831 				return 1;
1832 	}
1833 
1834 	if (PCI_DMA_BUS_IS_PHYS) {
1835 		struct device *pdev = dev->dev.parent;
1836 
1837 		if (!pdev)
1838 			return 0;
1839 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1840 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1841 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1842 				return 1;
1843 		}
1844 	}
1845 #endif
1846 	return 0;
1847 }
1848 
1849 struct dev_gso_cb {
1850 	void (*destructor)(struct sk_buff *skb);
1851 };
1852 
1853 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1854 
1855 static void dev_gso_skb_destructor(struct sk_buff *skb)
1856 {
1857 	struct dev_gso_cb *cb;
1858 
1859 	do {
1860 		struct sk_buff *nskb = skb->next;
1861 
1862 		skb->next = nskb->next;
1863 		nskb->next = NULL;
1864 		kfree_skb(nskb);
1865 	} while (skb->next);
1866 
1867 	cb = DEV_GSO_CB(skb);
1868 	if (cb->destructor)
1869 		cb->destructor(skb);
1870 }
1871 
1872 /**
1873  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1874  *	@skb: buffer to segment
1875  *
1876  *	This function segments the given skb and stores the list of segments
1877  *	in skb->next.
1878  */
1879 static int dev_gso_segment(struct sk_buff *skb)
1880 {
1881 	struct net_device *dev = skb->dev;
1882 	struct sk_buff *segs;
1883 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1884 					 NETIF_F_SG : 0);
1885 
1886 	segs = skb_gso_segment(skb, features);
1887 
1888 	/* Verifying header integrity only. */
1889 	if (!segs)
1890 		return 0;
1891 
1892 	if (IS_ERR(segs))
1893 		return PTR_ERR(segs);
1894 
1895 	skb->next = segs;
1896 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1897 	skb->destructor = dev_gso_skb_destructor;
1898 
1899 	return 0;
1900 }
1901 
1902 /*
1903  * Try to orphan skb early, right before transmission by the device.
1904  * We cannot orphan the skb if a TX timestamp is requested or the sk reference
1905  * is needed at the driver level for other reasons, e.g. see net/can/raw.c.
1906  */
1907 static inline void skb_orphan_try(struct sk_buff *skb)
1908 {
1909 	struct sock *sk = skb->sk;
1910 
1911 	if (sk && !skb_shinfo(skb)->tx_flags) {
1912 		/* skb_tx_hash() won't be able to get at sk,
1913 		 * so copy sk_hash into skb->rxhash.
1914 		 */
1915 		if (!skb->rxhash)
1916 			skb->rxhash = sk->sk_hash;
1917 		skb_orphan(skb);
1918 	}
1919 }
1920 
1921 /*
1922  * Returns true if either:
1923  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
1924  *	2. skb is fragmented and the device does not support SG, or
1925  *	   at least one of the fragments is in highmem and the device
1926  *	   does not support DMA from it.
1927  */
1928 static inline int skb_needs_linearize(struct sk_buff *skb,
1929 				      struct net_device *dev)
1930 {
1931 	return skb_is_nonlinear(skb) &&
1932 	       ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
1933 	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1934 					      illegal_highdma(dev, skb))));
1935 }
1936 
1937 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1938 			struct netdev_queue *txq)
1939 {
1940 	const struct net_device_ops *ops = dev->netdev_ops;
1941 	int rc = NETDEV_TX_OK;
1942 
1943 	if (likely(!skb->next)) {
1944 		if (!list_empty(&ptype_all))
1945 			dev_queue_xmit_nit(skb, dev);
1946 
1947 		/*
1948 		 * If the device doesn't need skb->dst, release it right now
1949 		 * while it's hot in this CPU's cache.
1950 		 */
1951 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1952 			skb_dst_drop(skb);
1953 
1954 		skb_orphan_try(skb);
1955 
1956 		if (netif_needs_gso(dev, skb)) {
1957 			if (unlikely(dev_gso_segment(skb)))
1958 				goto out_kfree_skb;
1959 			if (skb->next)
1960 				goto gso;
1961 		} else {
1962 			if (skb_needs_linearize(skb, dev) &&
1963 			    __skb_linearize(skb))
1964 				goto out_kfree_skb;
1965 
1966 			/* If packet is not checksummed and device does not
1967 			 * support checksumming for this protocol, complete
1968 			 * checksumming here.
1969 			 */
1970 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
1971 				skb_set_transport_header(skb, skb->csum_start -
1972 					      skb_headroom(skb));
1973 				if (!dev_can_checksum(dev, skb) &&
1974 				     skb_checksum_help(skb))
1975 					goto out_kfree_skb;
1976 			}
1977 		}
1978 
1979 		rc = ops->ndo_start_xmit(skb, dev);
1980 		if (rc == NETDEV_TX_OK)
1981 			txq_trans_update(txq);
1982 		return rc;
1983 	}
1984 
1985 gso:
1986 	do {
1987 		struct sk_buff *nskb = skb->next;
1988 
1989 		skb->next = nskb->next;
1990 		nskb->next = NULL;
1991 
1992 		/*
1993 		 * If the device doesn't need nskb->dst, release it right now
1994 		 * while it's hot in this CPU's cache.
1995 		 */
1996 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1997 			skb_dst_drop(nskb);
1998 
1999 		rc = ops->ndo_start_xmit(nskb, dev);
2000 		if (unlikely(rc != NETDEV_TX_OK)) {
2001 			if (rc & ~NETDEV_TX_MASK)
2002 				goto out_kfree_gso_skb;
2003 			nskb->next = skb->next;
2004 			skb->next = nskb;
2005 			return rc;
2006 		}
2007 		txq_trans_update(txq);
2008 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2009 			return NETDEV_TX_BUSY;
2010 	} while (skb->next);
2011 
2012 out_kfree_gso_skb:
2013 	if (likely(skb->next == NULL))
2014 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2015 out_kfree_skb:
2016 	kfree_skb(skb);
2017 	return rc;
2018 }
2019 
2020 static u32 hashrnd __read_mostly;
2021 
2022 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
2023 {
2024 	u32 hash;
2025 
2026 	if (skb_rx_queue_recorded(skb)) {
2027 		hash = skb_get_rx_queue(skb);
2028 		while (unlikely(hash >= dev->real_num_tx_queues))
2029 			hash -= dev->real_num_tx_queues;
2030 		return hash;
2031 	}
2032 
2033 	if (skb->sk && skb->sk->sk_hash)
2034 		hash = skb->sk->sk_hash;
2035 	else
2036 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2037 	hash = jhash_1word(hash, hashrnd);
2038 
2039 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2040 }
2041 EXPORT_SYMBOL(skb_tx_hash);
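
/*
 * Editor's note (illustrative, not part of the original dev.c): the last line
 * of skb_tx_hash() maps a uniform 32-bit hash onto [0, real_num_tx_queues)
 * with a multiply and a shift instead of a modulo:  queue = (hash * n) >> 32.
 * For example, with n = 4 TX queues, hash 0x40000000 (one quarter of the
 * 32-bit range) lands on queue 1 and 0xffffffff lands on queue 3.  A
 * stand-alone restatement of that step:
 */
static inline u16 example_scale_hash(u32 hash, u16 n_queues)
{
	/* equivalent to (hash % n_queues) for uniformly distributed hashes,
	 * but avoids the division */
	return (u16) (((u64) hash * n_queues) >> 32);
}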
2042 
2043 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2044 {
2045 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2046 		if (net_ratelimit()) {
2047 			pr_warning("%s selects TX queue %d, but "
2048 				"real number of TX queues is %d\n",
2049 				dev->name, queue_index, dev->real_num_tx_queues);
2050 		}
2051 		return 0;
2052 	}
2053 	return queue_index;
2054 }
2055 
2056 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2057 					struct sk_buff *skb)
2058 {
2059 	int queue_index;
2060 	const struct net_device_ops *ops = dev->netdev_ops;
2061 
2062 	if (ops->ndo_select_queue) {
2063 		queue_index = ops->ndo_select_queue(dev, skb);
2064 		queue_index = dev_cap_txqueue(dev, queue_index);
2065 	} else {
2066 		struct sock *sk = skb->sk;
2067 		queue_index = sk_tx_queue_get(sk);
2068 		if (queue_index < 0) {
2069 
2070 			queue_index = 0;
2071 			if (dev->real_num_tx_queues > 1)
2072 				queue_index = skb_tx_hash(dev, skb);
2073 
2074 			if (sk) {
2075 				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2076 
2077 				if (dst && skb_dst(skb) == dst)
2078 					sk_tx_queue_set(sk, queue_index);
2079 			}
2080 		}
2081 	}
2082 
2083 	skb_set_queue_mapping(skb, queue_index);
2084 	return netdev_get_tx_queue(dev, queue_index);
2085 }
2086 
2087 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2088 				 struct net_device *dev,
2089 				 struct netdev_queue *txq)
2090 {
2091 	spinlock_t *root_lock = qdisc_lock(q);
2092 	bool contended = qdisc_is_running(q);
2093 	int rc;
2094 
2095 	/*
2096 	 * Heuristic to force contended enqueues to serialize on a
2097 	 * separate lock before trying to get the qdisc main lock.
2098 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2099 	 * and dequeue packets faster.
2100 	 */
2101 	if (unlikely(contended))
2102 		spin_lock(&q->busylock);
2103 
2104 	spin_lock(root_lock);
2105 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2106 		kfree_skb(skb);
2107 		rc = NET_XMIT_DROP;
2108 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2109 		   qdisc_run_begin(q)) {
2110 		/*
2111 		 * This is a work-conserving queue; there are no old skbs
2112 		 * waiting to be sent out; and the qdisc is not running -
2113 		 * xmit the skb directly.
2114 		 */
2115 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2116 			skb_dst_force(skb);
2117 		__qdisc_update_bstats(q, skb->len);
2118 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2119 			if (unlikely(contended)) {
2120 				spin_unlock(&q->busylock);
2121 				contended = false;
2122 			}
2123 			__qdisc_run(q);
2124 		} else
2125 			qdisc_run_end(q);
2126 
2127 		rc = NET_XMIT_SUCCESS;
2128 	} else {
2129 		skb_dst_force(skb);
2130 		rc = qdisc_enqueue_root(skb, q);
2131 		if (qdisc_run_begin(q)) {
2132 			if (unlikely(contended)) {
2133 				spin_unlock(&q->busylock);
2134 				contended = false;
2135 			}
2136 			__qdisc_run(q);
2137 		}
2138 	}
2139 	spin_unlock(root_lock);
2140 	if (unlikely(contended))
2141 		spin_unlock(&q->busylock);
2142 	return rc;
2143 }
2144 
2145 /**
2146  *	dev_queue_xmit - transmit a buffer
2147  *	@skb: buffer to transmit
2148  *
2149  *	Queue a buffer for transmission to a network device. The caller must
2150  *	have set the device and priority and built the buffer before calling
2151  *	this function. The function can be called from an interrupt.
2152  *
2153  *	A negative errno code is returned on a failure. A success does not
2154  *	guarantee the frame will be transmitted as it may be dropped due
2155  *	to congestion or traffic shaping.
2156  *
2157  * -----------------------------------------------------------------------------------
2158  *      I notice this method can also return errors from the queue disciplines,
2159  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2160  *      be positive.
2161  *
2162  *      Regardless of the return value, the skb is consumed, so it is currently
2163  *      difficult to retry a send to this method.  (You can bump the ref count
2164  *      before sending to hold a reference for retry if you are careful.)
2165  *
2166  *      When calling this method, interrupts MUST be enabled.  This is because
2167  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2168  *          --BLG
2169  */
2170 int dev_queue_xmit(struct sk_buff *skb)
2171 {
2172 	struct net_device *dev = skb->dev;
2173 	struct netdev_queue *txq;
2174 	struct Qdisc *q;
2175 	int rc = -ENOMEM;
2176 
2177 	/* Disable soft irqs for various locks below. Also
2178 	 * stops preemption for RCU.
2179 	 */
2180 	rcu_read_lock_bh();
2181 
2182 	txq = dev_pick_tx(dev, skb);
2183 	q = rcu_dereference_bh(txq->qdisc);
2184 
2185 #ifdef CONFIG_NET_CLS_ACT
2186 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2187 #endif
2188 	if (q->enqueue) {
2189 		rc = __dev_xmit_skb(skb, q, dev, txq);
2190 		goto out;
2191 	}
2192 
2193 	/* The device has no queue. Common case for software devices:
2194 	   loopback, all sorts of tunnels...
2195 
2196 	   Really, it is unlikely that netif_tx_lock protection is necessary
2197 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2198 	   counters.)
2199 	   However, it is possible that they rely on the protection
2200 	   we provide here.
2201 
2202 	   Check this and take the lock.  It is not prone to deadlocks.
2203 	   Or just use the noqueue qdisc, which is even simpler 8)
2204 	 */
2205 	if (dev->flags & IFF_UP) {
2206 		int cpu = smp_processor_id(); /* ok because BHs are off */
2207 
2208 		if (txq->xmit_lock_owner != cpu) {
2209 
2210 			HARD_TX_LOCK(dev, txq, cpu);
2211 
2212 			if (!netif_tx_queue_stopped(txq)) {
2213 				rc = dev_hard_start_xmit(skb, dev, txq);
2214 				if (dev_xmit_complete(rc)) {
2215 					HARD_TX_UNLOCK(dev, txq);
2216 					goto out;
2217 				}
2218 			}
2219 			HARD_TX_UNLOCK(dev, txq);
2220 			if (net_ratelimit())
2221 				printk(KERN_CRIT "Virtual device %s asks to "
2222 				       "queue packet!\n", dev->name);
2223 		} else {
2224 			/* Recursion is detected! It is possible,
2225 			 * unfortunately */
2226 			if (net_ratelimit())
2227 				printk(KERN_CRIT "Dead loop on virtual device "
2228 				       "%s, fix it urgently!\n", dev->name);
2229 		}
2230 	}
2231 
2232 	rc = -ENETDOWN;
2233 	rcu_read_unlock_bh();
2234 
2235 	kfree_skb(skb);
2236 	return rc;
2237 out:
2238 	rcu_read_unlock_bh();
2239 	return rc;
2240 }
2241 EXPORT_SYMBOL(dev_queue_xmit);
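
/*
 * Editor's illustrative sketch (not part of the original dev.c): building a
 * raw Ethernet frame and handing it to dev_queue_xmit().  The device name
 * "eth0" and the 0x88b5 (local experimental) EtherType are arbitrary example
 * values; everything else uses standard in-kernel helpers.
 */
static int example_send_raw_frame(struct net *net, const u8 *dst_mac,
				  const u8 *payload, size_t len)
{
	struct net_device *dev;
	struct sk_buff *skb;
	struct ethhdr *eth;

	dev = dev_get_by_name(net, "eth0");		/* example device name */
	if (!dev)
		return -ENODEV;

	skb = alloc_skb(ETH_HLEN + len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
	if (!skb) {
		dev_put(dev);
		return -ENOMEM;
	}
	skb_reserve(skb, LL_RESERVED_SPACE(dev));

	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	memcpy(eth->h_dest, dst_mac, ETH_ALEN);
	memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
	eth->h_proto = htons(0x88b5);			/* example EtherType */

	skb->dev = dev;
	skb->protocol = eth->h_proto;

	/* the skb is consumed regardless of the return value */
	dev_queue_xmit(skb);
	dev_put(dev);
	return 0;
}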
2242 
2243 
2244 /*=======================================================================
2245 			Receiver routines
2246   =======================================================================*/
2247 
2248 int netdev_max_backlog __read_mostly = 1000;
2249 int netdev_tstamp_prequeue __read_mostly = 1;
2250 int netdev_budget __read_mostly = 300;
2251 int weight_p __read_mostly = 64;            /* old backlog weight */
2252 
2253 /* Called with irq disabled */
2254 static inline void ____napi_schedule(struct softnet_data *sd,
2255 				     struct napi_struct *napi)
2256 {
2257 	list_add_tail(&napi->poll_list, &sd->poll_list);
2258 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2259 }
2260 
2261 /*
2262  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2263  * and src/dst port numbers. Returns a non-zero hash number on success
2264  * and 0 on failure.
2265  */
2266 __u32 __skb_get_rxhash(struct sk_buff *skb)
2267 {
2268 	int nhoff, hash = 0, poff;
2269 	struct ipv6hdr *ip6;
2270 	struct iphdr *ip;
2271 	u8 ip_proto;
2272 	u32 addr1, addr2, ihl;
2273 	union {
2274 		u32 v32;
2275 		u16 v16[2];
2276 	} ports;
2277 
2278 	nhoff = skb_network_offset(skb);
2279 
2280 	switch (skb->protocol) {
2281 	case __constant_htons(ETH_P_IP):
2282 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2283 			goto done;
2284 
2285 		ip = (struct iphdr *) (skb->data + nhoff);
2286 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2287 			ip_proto = 0;
2288 		else
2289 			ip_proto = ip->protocol;
2290 		addr1 = (__force u32) ip->saddr;
2291 		addr2 = (__force u32) ip->daddr;
2292 		ihl = ip->ihl;
2293 		break;
2294 	case __constant_htons(ETH_P_IPV6):
2295 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2296 			goto done;
2297 
2298 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2299 		ip_proto = ip6->nexthdr;
2300 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2301 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2302 		ihl = (40 >> 2);
2303 		break;
2304 	default:
2305 		goto done;
2306 	}
2307 
2308 	ports.v32 = 0;
2309 	poff = proto_ports_offset(ip_proto);
2310 	if (poff >= 0) {
2311 		nhoff += ihl * 4 + poff;
2312 		if (pskb_may_pull(skb, nhoff + 4)) {
2313 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2314 			if (ports.v16[1] < ports.v16[0])
2315 				swap(ports.v16[0], ports.v16[1]);
2316 		}
2317 	}
2318 
2319 	/* get a consistent hash (same value on both flow directions) */
2320 	if (addr2 < addr1)
2321 		swap(addr1, addr2);
2322 
2323 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2324 	if (!hash)
2325 		hash = 1;
2326 
2327 done:
2328 	return hash;
2329 }
2330 EXPORT_SYMBOL(__skb_get_rxhash);
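
/*
 * Editor's note (illustrative, not part of the original dev.c): because
 * addr1/addr2 and the two ports are swapped into canonical order before
 * hashing, both directions of a TCP/UDP flow (A:p -> B:q and B:q -> A:p)
 * produce the same rxhash, which is what keeps a flow on one CPU in RPS/RFS.
 * Callers normally go through the caching wrapper:
 */
static inline u32 example_flow_hash(struct sk_buff *skb)
{
	/* skb_get_rxhash() runs __skb_get_rxhash() once and caches the
	 * result in skb->rxhash */
	return skb_get_rxhash(skb);
}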
2331 
2332 #ifdef CONFIG_RPS
2333 
2334 /* One global table that all flow-based protocols share. */
2335 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2336 EXPORT_SYMBOL(rps_sock_flow_table);
2337 
2338 /*
2339  * get_rps_cpu is called from netif_receive_skb and returns the target
2340  * CPU from the RPS map of the receiving queue for a given skb.
2341  * rcu_read_lock must be held on entry.
2342  */
2343 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2344 		       struct rps_dev_flow **rflowp)
2345 {
2346 	struct netdev_rx_queue *rxqueue;
2347 	struct rps_map *map = NULL;
2348 	struct rps_dev_flow_table *flow_table;
2349 	struct rps_sock_flow_table *sock_flow_table;
2350 	int cpu = -1;
2351 	u16 tcpu;
2352 
2353 	if (skb_rx_queue_recorded(skb)) {
2354 		u16 index = skb_get_rx_queue(skb);
2355 		if (unlikely(index >= dev->num_rx_queues)) {
2356 			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2357 				"on queue %u, but number of RX queues is %u\n",
2358 				dev->name, index, dev->num_rx_queues);
2359 			goto done;
2360 		}
2361 		rxqueue = dev->_rx + index;
2362 	} else
2363 		rxqueue = dev->_rx;
2364 
2365 	if (rxqueue->rps_map) {
2366 		map = rcu_dereference(rxqueue->rps_map);
2367 		if (map && map->len == 1) {
2368 			tcpu = map->cpus[0];
2369 			if (cpu_online(tcpu))
2370 				cpu = tcpu;
2371 			goto done;
2372 		}
2373 	} else if (!rxqueue->rps_flow_table) {
2374 		goto done;
2375 	}
2376 
2377 	skb_reset_network_header(skb);
2378 	if (!skb_get_rxhash(skb))
2379 		goto done;
2380 
2381 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2382 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2383 	if (flow_table && sock_flow_table) {
2384 		u16 next_cpu;
2385 		struct rps_dev_flow *rflow;
2386 
2387 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2388 		tcpu = rflow->cpu;
2389 
2390 		next_cpu = sock_flow_table->ents[skb->rxhash &
2391 		    sock_flow_table->mask];
2392 
2393 		/*
2394 		 * If the desired CPU (where last recvmsg was done) is
2395 		 * different from current CPU (one in the rx-queue flow
2396 		 * table entry), switch if one of the following holds:
2397 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2398 		 *   - Current CPU is offline.
2399 		 *   - The current CPU's queue tail has advanced beyond the
2400 		 *     last packet that was enqueued using this table entry.
2401 		 *     This guarantees that all previous packets for the flow
2402 		 *     have been dequeued, thus preserving in order delivery.
2403 		 */
2404 		if (unlikely(tcpu != next_cpu) &&
2405 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2406 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2407 		      rflow->last_qtail)) >= 0)) {
2408 			tcpu = rflow->cpu = next_cpu;
2409 			if (tcpu != RPS_NO_CPU)
2410 				rflow->last_qtail = per_cpu(softnet_data,
2411 				    tcpu).input_queue_head;
2412 		}
2413 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2414 			*rflowp = rflow;
2415 			cpu = tcpu;
2416 			goto done;
2417 		}
2418 	}
2419 
2420 	if (map) {
2421 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2422 
2423 		if (cpu_online(tcpu)) {
2424 			cpu = tcpu;
2425 			goto done;
2426 		}
2427 	}
2428 
2429 done:
2430 	return cpu;
2431 }
2432 
2433 /* Called from hardirq (IPI) context */
2434 static void rps_trigger_softirq(void *data)
2435 {
2436 	struct softnet_data *sd = data;
2437 
2438 	____napi_schedule(sd, &sd->backlog);
2439 	sd->received_rps++;
2440 }
2441 
2442 #endif /* CONFIG_RPS */
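
/*
 * Editor's note (illustrative, not part of the original dev.c): RPS/RFS are
 * configured entirely from user space.  Assuming a device named "eth0" and
 * its first RX queue, a typical setup looks like:
 *
 *   # steer eth0/rx-0 packets onto CPUs 0-3 (bitmask 0xf)
 *   echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus
 *
 *   # enable RFS: size the global socket flow table and the per-queue table
 *   echo 32768 > /proc/sys/net/core/rps_sock_flow_entries
 *   echo 4096  > /sys/class/net/eth0/queues/rx-0/rps_flow_cnt
 *
 * The rps_cpus bitmap populates the rxqueue->rps_map consumed by
 * get_rps_cpu() above; rps_sock_flow_entries allocates the
 * rps_sock_flow_table that recvmsg() updates and get_rps_cpu() consults.
 */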
2443 
2444 /*
2445  * Check whether this softnet_data structure belongs to another CPU.
2446  * If so, queue it on our IPI list and return 1.
2447  * Otherwise return 0.
2448  */
2449 static int rps_ipi_queued(struct softnet_data *sd)
2450 {
2451 #ifdef CONFIG_RPS
2452 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2453 
2454 	if (sd != mysd) {
2455 		sd->rps_ipi_next = mysd->rps_ipi_list;
2456 		mysd->rps_ipi_list = sd;
2457 
2458 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2459 		return 1;
2460 	}
2461 #endif /* CONFIG_RPS */
2462 	return 0;
2463 }
2464 
2465 /*
2466  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2467  * queue (may be a remote CPU queue).
2468  */
2469 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2470 			      unsigned int *qtail)
2471 {
2472 	struct softnet_data *sd;
2473 	unsigned long flags;
2474 
2475 	sd = &per_cpu(softnet_data, cpu);
2476 
2477 	local_irq_save(flags);
2478 
2479 	rps_lock(sd);
2480 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2481 		if (skb_queue_len(&sd->input_pkt_queue)) {
2482 enqueue:
2483 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2484 			input_queue_tail_incr_save(sd, qtail);
2485 			rps_unlock(sd);
2486 			local_irq_restore(flags);
2487 			return NET_RX_SUCCESS;
2488 		}
2489 
2490 		/* Schedule NAPI for the backlog device.
2491 		 * We can use a non-atomic operation since we own the queue lock.
2492 		 */
2493 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2494 			if (!rps_ipi_queued(sd))
2495 				____napi_schedule(sd, &sd->backlog);
2496 		}
2497 		goto enqueue;
2498 	}
2499 
2500 	sd->dropped++;
2501 	rps_unlock(sd);
2502 
2503 	local_irq_restore(flags);
2504 
2505 	kfree_skb(skb);
2506 	return NET_RX_DROP;
2507 }
2508 
2509 /**
2510  *	netif_rx	-	post buffer to the network code
2511  *	@skb: buffer to post
2512  *
2513  *	This function receives a packet from a device driver and queues it for
2514  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2515  *	may be dropped during processing for congestion control or by the
2516  *	protocol layers.
2517  *
2518  *	return values:
2519  *	NET_RX_SUCCESS	(no congestion)
2520  *	NET_RX_DROP     (packet was dropped)
2521  *
2522  */
2523 
2524 int netif_rx(struct sk_buff *skb)
2525 {
2526 	int ret;
2527 
2528 	/* if netpoll wants it, pretend we never saw it */
2529 	if (netpoll_rx(skb))
2530 		return NET_RX_DROP;
2531 
2532 	if (netdev_tstamp_prequeue)
2533 		net_timestamp_check(skb);
2534 
2535 #ifdef CONFIG_RPS
2536 	{
2537 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2538 		int cpu;
2539 
2540 		preempt_disable();
2541 		rcu_read_lock();
2542 
2543 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2544 		if (cpu < 0)
2545 			cpu = smp_processor_id();
2546 
2547 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2548 
2549 		rcu_read_unlock();
2550 		preempt_enable();
2551 	}
2552 #else
2553 	{
2554 		unsigned int qtail;
2555 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2556 		put_cpu();
2557 	}
2558 #endif
2559 	return ret;
2560 }
2561 EXPORT_SYMBOL(netif_rx);
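
/*
 * Editor's illustrative sketch (not part of the original dev.c): the classic
 * non-NAPI receive path in a driver's RX interrupt handler.  The buf/len
 * parameters stand in for the device-specific DMA details.
 */
static void example_rx_interrupt(struct net_device *dev, void *buf, int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);	/* copy frame out of the HW buffer */
	skb->protocol = eth_type_trans(skb, dev);

	/* hand the packet to the stack; enqueue_to_backlog() above queues it
	 * on a per-CPU backlog and raises NET_RX_SOFTIRQ */
	netif_rx(skb);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}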
2562 
2563 int netif_rx_ni(struct sk_buff *skb)
2564 {
2565 	int err;
2566 
2567 	preempt_disable();
2568 	err = netif_rx(skb);
2569 	if (local_softirq_pending())
2570 		do_softirq();
2571 	preempt_enable();
2572 
2573 	return err;
2574 }
2575 EXPORT_SYMBOL(netif_rx_ni);
2576 
2577 static void net_tx_action(struct softirq_action *h)
2578 {
2579 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2580 
2581 	if (sd->completion_queue) {
2582 		struct sk_buff *clist;
2583 
2584 		local_irq_disable();
2585 		clist = sd->completion_queue;
2586 		sd->completion_queue = NULL;
2587 		local_irq_enable();
2588 
2589 		while (clist) {
2590 			struct sk_buff *skb = clist;
2591 			clist = clist->next;
2592 
2593 			WARN_ON(atomic_read(&skb->users));
2594 			__kfree_skb(skb);
2595 		}
2596 	}
2597 
2598 	if (sd->output_queue) {
2599 		struct Qdisc *head;
2600 
2601 		local_irq_disable();
2602 		head = sd->output_queue;
2603 		sd->output_queue = NULL;
2604 		sd->output_queue_tailp = &sd->output_queue;
2605 		local_irq_enable();
2606 
2607 		while (head) {
2608 			struct Qdisc *q = head;
2609 			spinlock_t *root_lock;
2610 
2611 			head = head->next_sched;
2612 
2613 			root_lock = qdisc_lock(q);
2614 			if (spin_trylock(root_lock)) {
2615 				smp_mb__before_clear_bit();
2616 				clear_bit(__QDISC_STATE_SCHED,
2617 					  &q->state);
2618 				qdisc_run(q);
2619 				spin_unlock(root_lock);
2620 			} else {
2621 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2622 					      &q->state)) {
2623 					__netif_reschedule(q);
2624 				} else {
2625 					smp_mb__before_clear_bit();
2626 					clear_bit(__QDISC_STATE_SCHED,
2627 						  &q->state);
2628 				}
2629 			}
2630 		}
2631 	}
2632 }
2633 
2634 static inline int deliver_skb(struct sk_buff *skb,
2635 			      struct packet_type *pt_prev,
2636 			      struct net_device *orig_dev)
2637 {
2638 	atomic_inc(&skb->users);
2639 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2640 }
2641 
2642 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2643     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2644 /* This hook is defined here for ATM LANE */
2645 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2646 			     unsigned char *addr) __read_mostly;
2647 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2648 #endif
2649 
2650 #ifdef CONFIG_NET_CLS_ACT
2651 /* TODO: Maybe we should just force sch_ingress to be compiled in
2652  * whenever CONFIG_NET_CLS_ACT is.  Otherwise we execute a few useless
2653  * instructions (a compare and two extra stores) when sch_ingress is
2654  * not built but CONFIG_NET_CLS_ACT is set.
2655  * NOTE: This doesn't remove any functionality; if you don't have
2656  * the ingress scheduler, you just can't add policies on ingress.
2657  *
2658  */
2659 static int ing_filter(struct sk_buff *skb)
2660 {
2661 	struct net_device *dev = skb->dev;
2662 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2663 	struct netdev_queue *rxq;
2664 	int result = TC_ACT_OK;
2665 	struct Qdisc *q;
2666 
2667 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2668 		if (net_ratelimit())
2669 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2670 			       skb->skb_iif, dev->ifindex);
2671 		return TC_ACT_SHOT;
2672 	}
2673 
2674 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2675 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2676 
2677 	rxq = &dev->rx_queue;
2678 
2679 	q = rxq->qdisc;
2680 	if (q != &noop_qdisc) {
2681 		spin_lock(qdisc_lock(q));
2682 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2683 			result = qdisc_enqueue_root(skb, q);
2684 		spin_unlock(qdisc_lock(q));
2685 	}
2686 
2687 	return result;
2688 }
2689 
2690 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2691 					 struct packet_type **pt_prev,
2692 					 int *ret, struct net_device *orig_dev)
2693 {
2694 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2695 		goto out;
2696 
2697 	if (*pt_prev) {
2698 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2699 		*pt_prev = NULL;
2700 	}
2701 
2702 	switch (ing_filter(skb)) {
2703 	case TC_ACT_SHOT:
2704 	case TC_ACT_STOLEN:
2705 		kfree_skb(skb);
2706 		return NULL;
2707 	}
2708 
2709 out:
2710 	skb->tc_verd = 0;
2711 	return skb;
2712 }
2713 #endif
2714 
2715 /*
2716  * 	netif_nit_deliver - deliver received packets to network taps
2717  * 	@skb: buffer
2718  *
2719  * 	This function is used to deliver incoming packets to network
2720  * 	taps. It should be used when the normal netif_receive_skb path
2721  * 	is bypassed, for example because of VLAN acceleration.
2722  */
2723 void netif_nit_deliver(struct sk_buff *skb)
2724 {
2725 	struct packet_type *ptype;
2726 
2727 	if (list_empty(&ptype_all))
2728 		return;
2729 
2730 	skb_reset_network_header(skb);
2731 	skb_reset_transport_header(skb);
2732 	skb->mac_len = skb->network_header - skb->mac_header;
2733 
2734 	rcu_read_lock();
2735 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2736 		if (!ptype->dev || ptype->dev == skb->dev)
2737 			deliver_skb(skb, ptype, skb->dev);
2738 	}
2739 	rcu_read_unlock();
2740 }
2741 
2742 /**
2743  *	netdev_rx_handler_register - register receive handler
2744  *	@dev: device to register a handler for
2745  *	@rx_handler: receive handler to register
2746  *	@rx_handler_data: data pointer that is used by rx handler
2747  *
2748  *	Register a receive handler for a device. This handler will then be
2749  *	called from __netif_receive_skb. A negative errno code is returned
2750  *	on a failure.
2751  *
2752  *	The caller must hold the rtnl_mutex.
2753  */
2754 int netdev_rx_handler_register(struct net_device *dev,
2755 			       rx_handler_func_t *rx_handler,
2756 			       void *rx_handler_data)
2757 {
2758 	ASSERT_RTNL();
2759 
2760 	if (dev->rx_handler)
2761 		return -EBUSY;
2762 
2763 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2764 	rcu_assign_pointer(dev->rx_handler, rx_handler);
2765 
2766 	return 0;
2767 }
2768 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
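
/*
 * Editor's illustrative sketch (not part of the original dev.c): how a
 * bridge-like module would hook a port device, matching the rx_handler
 * calling convention used by __netif_receive_skb() below (return NULL once
 * the skb has been consumed, or return an skb to let delivery continue).
 * struct example_port and example_deliver() are hypothetical.
 */
struct example_port;					/* hypothetical per-port state */
extern bool example_deliver(struct example_port *port, struct sk_buff *skb);

static struct sk_buff *example_port_rx(struct sk_buff *skb)
{
	/* runs under rcu_read_lock() taken by __netif_receive_skb() */
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (example_deliver(port, skb))
		return NULL;		/* consumed by the example module */

	return skb;			/* fall through to normal delivery */
}

static int example_attach_port(struct net_device *dev, struct example_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_port_rx, port);
	rtnl_unlock();

	return err;
}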
2769 
2770 /**
2771  *	netdev_rx_handler_unregister - unregister receive handler
2772  *	@dev: device to unregister a handler from
2773  *
2774  *	Unregister a receive handler from a device.
2775  *
2776  *	The caller must hold the rtnl_mutex.
2777  */
2778 void netdev_rx_handler_unregister(struct net_device *dev)
2779 {
2780 
2781 	ASSERT_RTNL();
2782 	rcu_assign_pointer(dev->rx_handler, NULL);
2783 	rcu_assign_pointer(dev->rx_handler_data, NULL);
2784 }
2785 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2786 
2787 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2788 					      struct net_device *master)
2789 {
2790 	if (skb->pkt_type == PACKET_HOST) {
2791 		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2792 
2793 		memcpy(dest, master->dev_addr, ETH_ALEN);
2794 	}
2795 }
2796 
2797 /* On bonding slaves other than the currently active slave, suppress
2798  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2799  * ARP on active-backup slaves with arp_validate enabled.
2800  */
2801 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2802 {
2803 	struct net_device *dev = skb->dev;
2804 
2805 	if (master->priv_flags & IFF_MASTER_ARPMON)
2806 		dev->last_rx = jiffies;
2807 
2808 	if ((master->priv_flags & IFF_MASTER_ALB) &&
2809 	    (master->priv_flags & IFF_BRIDGE_PORT)) {
2810 		/* Undo the address mangling. The local destination address
2811 		 * will always be the one the master has. This provides the
2812 		 * right functionality in a bridge.
2813 		 */
2814 		skb_bond_set_mac_by_master(skb, master);
2815 	}
2816 
2817 	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2818 		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2819 		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2820 			return 0;
2821 
2822 		if (master->priv_flags & IFF_MASTER_ALB) {
2823 			if (skb->pkt_type != PACKET_BROADCAST &&
2824 			    skb->pkt_type != PACKET_MULTICAST)
2825 				return 0;
2826 		}
2827 		if (master->priv_flags & IFF_MASTER_8023AD &&
2828 		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2829 			return 0;
2830 
2831 		return 1;
2832 	}
2833 	return 0;
2834 }
2835 EXPORT_SYMBOL(__skb_bond_should_drop);
2836 
2837 static int __netif_receive_skb(struct sk_buff *skb)
2838 {
2839 	struct packet_type *ptype, *pt_prev;
2840 	rx_handler_func_t *rx_handler;
2841 	struct net_device *orig_dev;
2842 	struct net_device *master;
2843 	struct net_device *null_or_orig;
2844 	struct net_device *orig_or_bond;
2845 	int ret = NET_RX_DROP;
2846 	__be16 type;
2847 
2848 	if (!netdev_tstamp_prequeue)
2849 		net_timestamp_check(skb);
2850 
2851 	if (vlan_tx_tag_present(skb))
2852 		vlan_hwaccel_do_receive(skb);
2853 
2854 	/* if we've gotten here through NAPI, check netpoll */
2855 	if (netpoll_receive_skb(skb))
2856 		return NET_RX_DROP;
2857 
2858 	if (!skb->skb_iif)
2859 		skb->skb_iif = skb->dev->ifindex;
2860 
2861 	/*
2862 	 * bonding note: skbs received on inactive slaves should only
2863 	 * be delivered to pkt handlers that are exact matches.  Also
2864 	 * the deliver_no_wcard flag will be set.  If packet handlers
2865 	 * are sensitive to duplicate packets these skbs will need to
2866 	 * be dropped at the handler.  The vlan accel path may have
2867 	 * already set the deliver_no_wcard flag.
2868 	 */
2869 	null_or_orig = NULL;
2870 	orig_dev = skb->dev;
2871 	master = ACCESS_ONCE(orig_dev->master);
2872 	if (skb->deliver_no_wcard)
2873 		null_or_orig = orig_dev;
2874 	else if (master) {
2875 		if (skb_bond_should_drop(skb, master)) {
2876 			skb->deliver_no_wcard = 1;
2877 			null_or_orig = orig_dev; /* deliver only exact match */
2878 		} else
2879 			skb->dev = master;
2880 	}
2881 
2882 	__this_cpu_inc(softnet_data.processed);
2883 	skb_reset_network_header(skb);
2884 	skb_reset_transport_header(skb);
2885 	skb->mac_len = skb->network_header - skb->mac_header;
2886 
2887 	pt_prev = NULL;
2888 
2889 	rcu_read_lock();
2890 
2891 #ifdef CONFIG_NET_CLS_ACT
2892 	if (skb->tc_verd & TC_NCLS) {
2893 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2894 		goto ncls;
2895 	}
2896 #endif
2897 
2898 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2899 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2900 		    ptype->dev == orig_dev) {
2901 			if (pt_prev)
2902 				ret = deliver_skb(skb, pt_prev, orig_dev);
2903 			pt_prev = ptype;
2904 		}
2905 	}
2906 
2907 #ifdef CONFIG_NET_CLS_ACT
2908 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2909 	if (!skb)
2910 		goto out;
2911 ncls:
2912 #endif
2913 
2914 	/* Handle special case of bridge or macvlan */
2915 	rx_handler = rcu_dereference(skb->dev->rx_handler);
2916 	if (rx_handler) {
2917 		if (pt_prev) {
2918 			ret = deliver_skb(skb, pt_prev, orig_dev);
2919 			pt_prev = NULL;
2920 		}
2921 		skb = rx_handler(skb);
2922 		if (!skb)
2923 			goto out;
2924 	}
2925 
2926 	/*
2927 	 * Make sure frames received on VLAN interfaces stacked on
2928 	 * bonding interfaces still make their way to any base bonding
2929 	 * device that may have registered for a specific ptype.  The
2930 	 * handler may have to adjust skb->dev and orig_dev.
2931 	 */
2932 	orig_or_bond = orig_dev;
2933 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2934 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2935 		orig_or_bond = vlan_dev_real_dev(skb->dev);
2936 	}
2937 
2938 	type = skb->protocol;
2939 	list_for_each_entry_rcu(ptype,
2940 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2941 		if (ptype->type == type && (ptype->dev == null_or_orig ||
2942 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
2943 		     ptype->dev == orig_or_bond)) {
2944 			if (pt_prev)
2945 				ret = deliver_skb(skb, pt_prev, orig_dev);
2946 			pt_prev = ptype;
2947 		}
2948 	}
2949 
2950 	if (pt_prev) {
2951 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2952 	} else {
2953 		kfree_skb(skb);
2954 		/* Jamal, now you will not be able to escape explaining
2955 		 * to me how you were going to use this. :-)
2956 		 */
2957 		ret = NET_RX_DROP;
2958 	}
2959 
2960 out:
2961 	rcu_read_unlock();
2962 	return ret;
2963 }
2964 
2965 /**
2966  *	netif_receive_skb - process receive buffer from network
2967  *	@skb: buffer to process
2968  *
2969  *	netif_receive_skb() is the main receive data processing function.
2970  *	It always succeeds. The buffer may be dropped during processing
2971  *	for congestion control or by the protocol layers.
2972  *
2973  *	This function may only be called from softirq context and interrupts
2974  *	should be enabled.
2975  *
2976  *	Return values (usually ignored):
2977  *	NET_RX_SUCCESS: no congestion
2978  *	NET_RX_DROP: packet was dropped
2979  */
2980 int netif_receive_skb(struct sk_buff *skb)
2981 {
2982 	if (netdev_tstamp_prequeue)
2983 		net_timestamp_check(skb);
2984 
2985 	if (skb_defer_rx_timestamp(skb))
2986 		return NET_RX_SUCCESS;
2987 
2988 #ifdef CONFIG_RPS
2989 	{
2990 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2991 		int cpu, ret;
2992 
2993 		rcu_read_lock();
2994 
2995 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2996 
2997 		if (cpu >= 0) {
2998 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2999 			rcu_read_unlock();
3000 		} else {
3001 			rcu_read_unlock();
3002 			ret = __netif_receive_skb(skb);
3003 		}
3004 
3005 		return ret;
3006 	}
3007 #else
3008 	return __netif_receive_skb(skb);
3009 #endif
3010 }
3011 EXPORT_SYMBOL(netif_receive_skb);
3012 
3013 /* Network device is going away; flush any packets still pending.
3014  * Called with irqs disabled.
3015  */
3016 static void flush_backlog(void *arg)
3017 {
3018 	struct net_device *dev = arg;
3019 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3020 	struct sk_buff *skb, *tmp;
3021 
3022 	rps_lock(sd);
3023 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3024 		if (skb->dev == dev) {
3025 			__skb_unlink(skb, &sd->input_pkt_queue);
3026 			kfree_skb(skb);
3027 			input_queue_head_incr(sd);
3028 		}
3029 	}
3030 	rps_unlock(sd);
3031 
3032 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3033 		if (skb->dev == dev) {
3034 			__skb_unlink(skb, &sd->process_queue);
3035 			kfree_skb(skb);
3036 			input_queue_head_incr(sd);
3037 		}
3038 	}
3039 }
3040 
3041 static int napi_gro_complete(struct sk_buff *skb)
3042 {
3043 	struct packet_type *ptype;
3044 	__be16 type = skb->protocol;
3045 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3046 	int err = -ENOENT;
3047 
3048 	if (NAPI_GRO_CB(skb)->count == 1) {
3049 		skb_shinfo(skb)->gso_size = 0;
3050 		goto out;
3051 	}
3052 
3053 	rcu_read_lock();
3054 	list_for_each_entry_rcu(ptype, head, list) {
3055 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3056 			continue;
3057 
3058 		err = ptype->gro_complete(skb);
3059 		break;
3060 	}
3061 	rcu_read_unlock();
3062 
3063 	if (err) {
3064 		WARN_ON(&ptype->list == head);
3065 		kfree_skb(skb);
3066 		return NET_RX_SUCCESS;
3067 	}
3068 
3069 out:
3070 	return netif_receive_skb(skb);
3071 }
3072 
3073 inline void napi_gro_flush(struct napi_struct *napi)
3074 {
3075 	struct sk_buff *skb, *next;
3076 
3077 	for (skb = napi->gro_list; skb; skb = next) {
3078 		next = skb->next;
3079 		skb->next = NULL;
3080 		napi_gro_complete(skb);
3081 	}
3082 
3083 	napi->gro_count = 0;
3084 	napi->gro_list = NULL;
3085 }
3086 EXPORT_SYMBOL(napi_gro_flush);
3087 
3088 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3089 {
3090 	struct sk_buff **pp = NULL;
3091 	struct packet_type *ptype;
3092 	__be16 type = skb->protocol;
3093 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3094 	int same_flow;
3095 	int mac_len;
3096 	enum gro_result ret;
3097 
3098 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3099 		goto normal;
3100 
3101 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3102 		goto normal;
3103 
3104 	rcu_read_lock();
3105 	list_for_each_entry_rcu(ptype, head, list) {
3106 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3107 			continue;
3108 
3109 		skb_set_network_header(skb, skb_gro_offset(skb));
3110 		mac_len = skb->network_header - skb->mac_header;
3111 		skb->mac_len = mac_len;
3112 		NAPI_GRO_CB(skb)->same_flow = 0;
3113 		NAPI_GRO_CB(skb)->flush = 0;
3114 		NAPI_GRO_CB(skb)->free = 0;
3115 
3116 		pp = ptype->gro_receive(&napi->gro_list, skb);
3117 		break;
3118 	}
3119 	rcu_read_unlock();
3120 
3121 	if (&ptype->list == head)
3122 		goto normal;
3123 
3124 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3125 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3126 
3127 	if (pp) {
3128 		struct sk_buff *nskb = *pp;
3129 
3130 		*pp = nskb->next;
3131 		nskb->next = NULL;
3132 		napi_gro_complete(nskb);
3133 		napi->gro_count--;
3134 	}
3135 
3136 	if (same_flow)
3137 		goto ok;
3138 
3139 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3140 		goto normal;
3141 
3142 	napi->gro_count++;
3143 	NAPI_GRO_CB(skb)->count = 1;
3144 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3145 	skb->next = napi->gro_list;
3146 	napi->gro_list = skb;
3147 	ret = GRO_HELD;
3148 
3149 pull:
3150 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3151 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3152 
3153 		BUG_ON(skb->end - skb->tail < grow);
3154 
3155 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3156 
3157 		skb->tail += grow;
3158 		skb->data_len -= grow;
3159 
3160 		skb_shinfo(skb)->frags[0].page_offset += grow;
3161 		skb_shinfo(skb)->frags[0].size -= grow;
3162 
3163 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3164 			put_page(skb_shinfo(skb)->frags[0].page);
3165 			memmove(skb_shinfo(skb)->frags,
3166 				skb_shinfo(skb)->frags + 1,
3167 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3168 		}
3169 	}
3170 
3171 ok:
3172 	return ret;
3173 
3174 normal:
3175 	ret = GRO_NORMAL;
3176 	goto pull;
3177 }
3178 EXPORT_SYMBOL(dev_gro_receive);
3179 
3180 static inline gro_result_t
3181 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3182 {
3183 	struct sk_buff *p;
3184 
3185 	for (p = napi->gro_list; p; p = p->next) {
3186 		unsigned long diffs;
3187 
3188 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3189 		diffs |= compare_ether_header(skb_mac_header(p),
3190 					      skb_gro_mac_header(skb));
3191 		NAPI_GRO_CB(p)->same_flow = !diffs;
3192 		NAPI_GRO_CB(p)->flush = 0;
3193 	}
3194 
3195 	return dev_gro_receive(napi, skb);
3196 }
3197 
3198 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3199 {
3200 	switch (ret) {
3201 	case GRO_NORMAL:
3202 		if (netif_receive_skb(skb))
3203 			ret = GRO_DROP;
3204 		break;
3205 
3206 	case GRO_DROP:
3207 	case GRO_MERGED_FREE:
3208 		kfree_skb(skb);
3209 		break;
3210 
3211 	case GRO_HELD:
3212 	case GRO_MERGED:
3213 		break;
3214 	}
3215 
3216 	return ret;
3217 }
3218 EXPORT_SYMBOL(napi_skb_finish);
3219 
3220 void skb_gro_reset_offset(struct sk_buff *skb)
3221 {
3222 	NAPI_GRO_CB(skb)->data_offset = 0;
3223 	NAPI_GRO_CB(skb)->frag0 = NULL;
3224 	NAPI_GRO_CB(skb)->frag0_len = 0;
3225 
3226 	if (skb->mac_header == skb->tail &&
3227 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3228 		NAPI_GRO_CB(skb)->frag0 =
3229 			page_address(skb_shinfo(skb)->frags[0].page) +
3230 			skb_shinfo(skb)->frags[0].page_offset;
3231 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3232 	}
3233 }
3234 EXPORT_SYMBOL(skb_gro_reset_offset);
3235 
3236 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3237 {
3238 	skb_gro_reset_offset(skb);
3239 
3240 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3241 }
3242 EXPORT_SYMBOL(napi_gro_receive);
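
/*
 * Editor's note (illustrative, not part of the original dev.c): inside a NAPI
 * poll routine a GRO-aware driver simply calls napi_gro_receive() instead of
 * netif_receive_skb(); dev_gro_receive() above then merges the skb into
 * napi->gro_list, holds it, or falls back to the normal path (GRO_NORMAL).
 * example_hw_next_frame() stands in for the device-specific dequeue.
 */
extern struct sk_buff *example_hw_next_frame(struct net_device *dev);	/* hypothetical */

static int example_gro_poll_one(struct napi_struct *napi, struct net_device *dev)
{
	struct sk_buff *skb = example_hw_next_frame(dev);

	if (!skb)
		return 0;

	skb->protocol = eth_type_trans(skb, dev);
	napi_gro_receive(napi, skb);
	return 1;
}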
3243 
3244 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3245 {
3246 	__skb_pull(skb, skb_headlen(skb));
3247 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3248 
3249 	napi->skb = skb;
3250 }
3251 EXPORT_SYMBOL(napi_reuse_skb);
3252 
3253 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3254 {
3255 	struct sk_buff *skb = napi->skb;
3256 
3257 	if (!skb) {
3258 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3259 		if (skb)
3260 			napi->skb = skb;
3261 	}
3262 	return skb;
3263 }
3264 EXPORT_SYMBOL(napi_get_frags);
3265 
3266 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3267 			       gro_result_t ret)
3268 {
3269 	switch (ret) {
3270 	case GRO_NORMAL:
3271 	case GRO_HELD:
3272 		skb->protocol = eth_type_trans(skb, skb->dev);
3273 
3274 		if (ret == GRO_HELD)
3275 			skb_gro_pull(skb, -ETH_HLEN);
3276 		else if (netif_receive_skb(skb))
3277 			ret = GRO_DROP;
3278 		break;
3279 
3280 	case GRO_DROP:
3281 	case GRO_MERGED_FREE:
3282 		napi_reuse_skb(napi, skb);
3283 		break;
3284 
3285 	case GRO_MERGED:
3286 		break;
3287 	}
3288 
3289 	return ret;
3290 }
3291 EXPORT_SYMBOL(napi_frags_finish);
3292 
3293 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3294 {
3295 	struct sk_buff *skb = napi->skb;
3296 	struct ethhdr *eth;
3297 	unsigned int hlen;
3298 	unsigned int off;
3299 
3300 	napi->skb = NULL;
3301 
3302 	skb_reset_mac_header(skb);
3303 	skb_gro_reset_offset(skb);
3304 
3305 	off = skb_gro_offset(skb);
3306 	hlen = off + sizeof(*eth);
3307 	eth = skb_gro_header_fast(skb, off);
3308 	if (skb_gro_header_hard(skb, hlen)) {
3309 		eth = skb_gro_header_slow(skb, hlen, off);
3310 		if (unlikely(!eth)) {
3311 			napi_reuse_skb(napi, skb);
3312 			skb = NULL;
3313 			goto out;
3314 		}
3315 	}
3316 
3317 	skb_gro_pull(skb, sizeof(*eth));
3318 
3319 	/*
3320 	 * This works because the only protocols we care about don't require
3321 	 * special handling.  We'll fix it up properly at the end.
3322 	 */
3323 	skb->protocol = eth->h_proto;
3324 
3325 out:
3326 	return skb;
3327 }
3328 EXPORT_SYMBOL(napi_frags_skb);
3329 
3330 gro_result_t napi_gro_frags(struct napi_struct *napi)
3331 {
3332 	struct sk_buff *skb = napi_frags_skb(napi);
3333 
3334 	if (!skb)
3335 		return GRO_DROP;
3336 
3337 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3338 }
3339 EXPORT_SYMBOL(napi_gro_frags);
3340 
3341 /*
3342  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3343  * Note: called with local irq disabled, but exits with local irq enabled.
3344  */
3345 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3346 {
3347 #ifdef CONFIG_RPS
3348 	struct softnet_data *remsd = sd->rps_ipi_list;
3349 
3350 	if (remsd) {
3351 		sd->rps_ipi_list = NULL;
3352 
3353 		local_irq_enable();
3354 
3355 		/* Send pending IPIs to kick RPS processing on remote CPUs. */
3356 		while (remsd) {
3357 			struct softnet_data *next = remsd->rps_ipi_next;
3358 
3359 			if (cpu_online(remsd->cpu))
3360 				__smp_call_function_single(remsd->cpu,
3361 							   &remsd->csd, 0);
3362 			remsd = next;
3363 		}
3364 	} else
3365 #endif
3366 		local_irq_enable();
3367 }
3368 
3369 static int process_backlog(struct napi_struct *napi, int quota)
3370 {
3371 	int work = 0;
3372 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3373 
3374 #ifdef CONFIG_RPS
3375 	/* Check if we have pending IPIs; it's better to send them now
3376 	 * than to wait until net_rx_action() ends.
3377 	 */
3378 	if (sd->rps_ipi_list) {
3379 		local_irq_disable();
3380 		net_rps_action_and_irq_enable(sd);
3381 	}
3382 #endif
3383 	napi->weight = weight_p;
3384 	local_irq_disable();
3385 	while (work < quota) {
3386 		struct sk_buff *skb;
3387 		unsigned int qlen;
3388 
3389 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3390 			local_irq_enable();
3391 			__netif_receive_skb(skb);
3392 			local_irq_disable();
3393 			input_queue_head_incr(sd);
3394 			if (++work >= quota) {
3395 				local_irq_enable();
3396 				return work;
3397 			}
3398 		}
3399 
3400 		rps_lock(sd);
3401 		qlen = skb_queue_len(&sd->input_pkt_queue);
3402 		if (qlen)
3403 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3404 						   &sd->process_queue);
3405 
3406 		if (qlen < quota - work) {
3407 			/*
3408 			 * Inline a custom version of __napi_complete().
3409 			 * Only the current CPU owns and manipulates this napi,
3410 			 * and NAPI_STATE_SCHED is the only possible flag set on the backlog,
3411 			 * so we can use a plain write instead of clear_bit()
3412 			 * and we don't need an smp_mb() memory barrier.
3413 			 */
3414 			list_del(&napi->poll_list);
3415 			napi->state = 0;
3416 
3417 			quota = work + qlen;
3418 		}
3419 		rps_unlock(sd);
3420 	}
3421 	local_irq_enable();
3422 
3423 	return work;
3424 }
3425 
3426 /**
3427  * __napi_schedule - schedule for receive
3428  * @n: entry to schedule
3429  *
3430  * The entry's receive function will be scheduled to run
3431  */
3432 void __napi_schedule(struct napi_struct *n)
3433 {
3434 	unsigned long flags;
3435 
3436 	local_irq_save(flags);
3437 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3438 	local_irq_restore(flags);
3439 }
3440 EXPORT_SYMBOL(__napi_schedule);
3441 
3442 void __napi_complete(struct napi_struct *n)
3443 {
3444 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3445 	BUG_ON(n->gro_list);
3446 
3447 	list_del(&n->poll_list);
3448 	smp_mb__before_clear_bit();
3449 	clear_bit(NAPI_STATE_SCHED, &n->state);
3450 }
3451 EXPORT_SYMBOL(__napi_complete);
3452 
3453 void napi_complete(struct napi_struct *n)
3454 {
3455 	unsigned long flags;
3456 
3457 	/*
3458 	 * Don't let napi dequeue from the CPU poll list,
3459 	 * just in case it's running on a different CPU.
3460 	 */
3461 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3462 		return;
3463 
3464 	napi_gro_flush(n);
3465 	local_irq_save(flags);
3466 	__napi_complete(n);
3467 	local_irq_restore(flags);
3468 }
3469 EXPORT_SYMBOL(napi_complete);
3470 
3471 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3472 		    int (*poll)(struct napi_struct *, int), int weight)
3473 {
3474 	INIT_LIST_HEAD(&napi->poll_list);
3475 	napi->gro_count = 0;
3476 	napi->gro_list = NULL;
3477 	napi->skb = NULL;
3478 	napi->poll = poll;
3479 	napi->weight = weight;
3480 	list_add(&napi->dev_list, &dev->napi_list);
3481 	napi->dev = dev;
3482 #ifdef CONFIG_NETPOLL
3483 	spin_lock_init(&napi->poll_lock);
3484 	napi->poll_owner = -1;
3485 #endif
3486 	set_bit(NAPI_STATE_SCHED, &napi->state);
3487 }
3488 EXPORT_SYMBOL(netif_napi_add);
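
/*
 * Editor's illustrative sketch (not part of the original dev.c): the minimal
 * contract a driver follows against netif_napi_add()/napi_complete().
 * struct example_priv, the example_hw_*() helpers and the 64-packet weight
 * are hypothetical.
 */
struct example_priv {				/* hypothetical driver private data */
	struct napi_struct napi;
};
extern int example_hw_rx(struct example_priv *priv, int budget);
extern void example_hw_enable_irq(struct example_priv *priv);

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work_done = example_hw_rx(priv, budget);	/* handles <= budget frames */

	if (work_done < budget) {
		/* all pending work done: leave polled mode and re-enable
		 * the device interrupt */
		napi_complete(napi);
		example_hw_enable_irq(priv);
	}
	return work_done;
}

static void example_setup_napi(struct net_device *dev, struct example_priv *priv)
{
	/* napi_enable(&priv->napi) is then called from the driver's ndo_open;
	 * the interrupt handler disables the device interrupt and calls
	 * napi_schedule(&priv->napi), which ends up in ____napi_schedule()
	 * above and raises NET_RX_SOFTIRQ */
	netif_napi_add(dev, &priv->napi, example_poll, 64);
}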
3489 
3490 void netif_napi_del(struct napi_struct *napi)
3491 {
3492 	struct sk_buff *skb, *next;
3493 
3494 	list_del_init(&napi->dev_list);
3495 	napi_free_frags(napi);
3496 
3497 	for (skb = napi->gro_list; skb; skb = next) {
3498 		next = skb->next;
3499 		skb->next = NULL;
3500 		kfree_skb(skb);
3501 	}
3502 
3503 	napi->gro_list = NULL;
3504 	napi->gro_count = 0;
3505 }
3506 EXPORT_SYMBOL(netif_napi_del);
3507 
3508 static void net_rx_action(struct softirq_action *h)
3509 {
3510 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3511 	unsigned long time_limit = jiffies + 2;
3512 	int budget = netdev_budget;
3513 	void *have;
3514 
3515 	local_irq_disable();
3516 
3517 	while (!list_empty(&sd->poll_list)) {
3518 		struct napi_struct *n;
3519 		int work, weight;
3520 
3521 		/* If the softirq window is exhausted then punt.
3522 		 * Allow this to run for 2 jiffies, which gives
3523 		 * an average latency of 1.5/HZ.
3524 		 */
3525 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3526 			goto softnet_break;
3527 
3528 		local_irq_enable();
3529 
3530 		/* Even though interrupts have been re-enabled, this
3531 		 * access is safe because interrupts can only add new
3532 		 * entries to the tail of this list, and only ->poll()
3533 		 * calls can remove this head entry from the list.
3534 		 */
3535 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3536 
3537 		have = netpoll_poll_lock(n);
3538 
3539 		weight = n->weight;
3540 
3541 		/* This NAPI_STATE_SCHED test is for avoiding a race
3542 		 * with netpoll's poll_napi().  Only the entity which
3543 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3544 		 * actually make the ->poll() call.  Therefore we avoid
3545 		 * accidentally calling ->poll() when NAPI is not scheduled.
3546 		 */
3547 		work = 0;
3548 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3549 			work = n->poll(n, weight);
3550 			trace_napi_poll(n);
3551 		}
3552 
3553 		WARN_ON_ONCE(work > weight);
3554 
3555 		budget -= work;
3556 
3557 		local_irq_disable();
3558 
3559 		/* Drivers must not modify the NAPI state if they
3560 		 * consume the entire weight.  In such cases this code
3561 		 * still "owns" the NAPI instance and therefore can
3562 		 * move the instance around on the list at-will.
3563 		 */
3564 		if (unlikely(work == weight)) {
3565 			if (unlikely(napi_disable_pending(n))) {
3566 				local_irq_enable();
3567 				napi_complete(n);
3568 				local_irq_disable();
3569 			} else
3570 				list_move_tail(&n->poll_list, &sd->poll_list);
3571 		}
3572 
3573 		netpoll_poll_unlock(have);
3574 	}
3575 out:
3576 	net_rps_action_and_irq_enable(sd);
3577 
3578 #ifdef CONFIG_NET_DMA
3579 	/*
3580 	 * There may not be any more sk_buffs coming right now, so push
3581 	 * any pending DMA copies to hardware
3582 	 */
3583 	dma_issue_pending_all();
3584 #endif
3585 
3586 	return;
3587 
3588 softnet_break:
3589 	sd->time_squeeze++;
3590 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3591 	goto out;
3592 }
3593 
3594 static gifconf_func_t *gifconf_list[NPROTO];
3595 
3596 /**
3597  *	register_gifconf	-	register a SIOCGIF handler
3598  *	@family: Address family
3599  *	@gifconf: Function handler
3600  *
3601  *	Register protocol dependent address dumping routines. The handler
3602  *	that is passed must not be freed or reused until it has been replaced
3603  *	by another handler.
3604  */
3605 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3606 {
3607 	if (family >= NPROTO)
3608 		return -EINVAL;
3609 	gifconf_list[family] = gifconf;
3610 	return 0;
3611 }
3612 EXPORT_SYMBOL(register_gifconf);
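
/*
 * Editor's illustrative sketch (not part of the original dev.c): a gifconf
 * handler follows the calling convention visible in dev_ifconf() below --
 * when called with a NULL buffer it only reports how many bytes it would
 * need, otherwise it writes ifreq records into the user buffer and returns
 * how many bytes it consumed.  example_fill_ifreq() is hypothetical, and a
 * real handler would typically emit one record per address, not per device.
 */
extern int example_fill_ifreq(struct net_device *dev, char __user *buf);

static int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	if (!buf)
		return sizeof(struct ifreq);	/* size probe only */

	if (len < sizeof(struct ifreq))
		return 0;			/* no room left */

	return example_fill_ifreq(dev, buf) ? -EFAULT : sizeof(struct ifreq);
}

/* registered once at protocol init time, e.g.:
 *	register_gifconf(PF_INET, example_gifconf);
 */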
3613 
3614 
3615 /*
3616  *	Map an interface index to its name (SIOCGIFNAME)
3617  */
3618 
3619 /*
3620  *	We need this ioctl for efficient implementation of the
3621  *	if_indextoname() function required by the IPv6 API.  Without
3622  *	it, we would have to search all the interfaces to find a
3623  *	match.  --pb
3624  */
3625 
3626 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3627 {
3628 	struct net_device *dev;
3629 	struct ifreq ifr;
3630 
3631 	/*
3632 	 *	Fetch the caller's info block.
3633 	 */
3634 
3635 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3636 		return -EFAULT;
3637 
3638 	rcu_read_lock();
3639 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3640 	if (!dev) {
3641 		rcu_read_unlock();
3642 		return -ENODEV;
3643 	}
3644 
3645 	strcpy(ifr.ifr_name, dev->name);
3646 	rcu_read_unlock();
3647 
3648 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3649 		return -EFAULT;
3650 	return 0;
3651 }
3652 
3653 /*
3654  *	Perform a SIOCGIFCONF call. This structure will change
3655  *	size eventually, and there is nothing I can do about it.
3656  *	Thus we will need a 'compatibility mode'.
3657  */
3658 
3659 static int dev_ifconf(struct net *net, char __user *arg)
3660 {
3661 	struct ifconf ifc;
3662 	struct net_device *dev;
3663 	char __user *pos;
3664 	int len;
3665 	int total;
3666 	int i;
3667 
3668 	/*
3669 	 *	Fetch the caller's info block.
3670 	 */
3671 
3672 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3673 		return -EFAULT;
3674 
3675 	pos = ifc.ifc_buf;
3676 	len = ifc.ifc_len;
3677 
3678 	/*
3679 	 *	Loop over the interfaces, and write an info block for each.
3680 	 */
3681 
3682 	total = 0;
3683 	for_each_netdev(net, dev) {
3684 		for (i = 0; i < NPROTO; i++) {
3685 			if (gifconf_list[i]) {
3686 				int done;
3687 				if (!pos)
3688 					done = gifconf_list[i](dev, NULL, 0);
3689 				else
3690 					done = gifconf_list[i](dev, pos + total,
3691 							       len - total);
3692 				if (done < 0)
3693 					return -EFAULT;
3694 				total += done;
3695 			}
3696 		}
3697 	}
3698 
3699 	/*
3700 	 *	All done.  Write the updated control block back to the caller.
3701 	 */
3702 	ifc.ifc_len = total;
3703 
3704 	/*
3705 	 * 	Both BSD and Solaris return 0 here, so we do too.
3706 	 */
3707 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3708 }
3709 
3710 #ifdef CONFIG_PROC_FS
3711 /*
3712  *	This is invoked by the /proc filesystem handler to display a device
3713  *	in detail.
3714  */
3715 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3716 	__acquires(RCU)
3717 {
3718 	struct net *net = seq_file_net(seq);
3719 	loff_t off;
3720 	struct net_device *dev;
3721 
3722 	rcu_read_lock();
3723 	if (!*pos)
3724 		return SEQ_START_TOKEN;
3725 
3726 	off = 1;
3727 	for_each_netdev_rcu(net, dev)
3728 		if (off++ == *pos)
3729 			return dev;
3730 
3731 	return NULL;
3732 }
3733 
3734 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3735 {
3736 	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3737 				  first_net_device(seq_file_net(seq)) :
3738 				  next_net_device((struct net_device *)v);
3739 
3740 	++*pos;
3741 	return rcu_dereference(dev);
3742 }
3743 
3744 void dev_seq_stop(struct seq_file *seq, void *v)
3745 	__releases(RCU)
3746 {
3747 	rcu_read_unlock();
3748 }
3749 
3750 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3751 {
3752 	struct rtnl_link_stats64 temp;
3753 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3754 
3755 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3756 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3757 		   dev->name, stats->rx_bytes, stats->rx_packets,
3758 		   stats->rx_errors,
3759 		   stats->rx_dropped + stats->rx_missed_errors,
3760 		   stats->rx_fifo_errors,
3761 		   stats->rx_length_errors + stats->rx_over_errors +
3762 		    stats->rx_crc_errors + stats->rx_frame_errors,
3763 		   stats->rx_compressed, stats->multicast,
3764 		   stats->tx_bytes, stats->tx_packets,
3765 		   stats->tx_errors, stats->tx_dropped,
3766 		   stats->tx_fifo_errors, stats->collisions,
3767 		   stats->tx_carrier_errors +
3768 		    stats->tx_aborted_errors +
3769 		    stats->tx_window_errors +
3770 		    stats->tx_heartbeat_errors,
3771 		   stats->tx_compressed);
3772 }
3773 
3774 /*
3775  *	Called from the procfs module. This now uses the new arbitrary-sized
3776  *	/proc/net interface to create /proc/net/dev.
3777  */
3778 static int dev_seq_show(struct seq_file *seq, void *v)
3779 {
3780 	if (v == SEQ_START_TOKEN)
3781 		seq_puts(seq, "Inter-|   Receive                            "
3782 			      "                    |  Transmit\n"
3783 			      " face |bytes    packets errs drop fifo frame "
3784 			      "compressed multicast|bytes    packets errs "
3785 			      "drop fifo colls carrier compressed\n");
3786 	else
3787 		dev_seq_printf_stats(seq, v);
3788 	return 0;
3789 }
3790 
3791 static struct softnet_data *softnet_get_online(loff_t *pos)
3792 {
3793 	struct softnet_data *sd = NULL;
3794 
3795 	while (*pos < nr_cpu_ids)
3796 		if (cpu_online(*pos)) {
3797 			sd = &per_cpu(softnet_data, *pos);
3798 			break;
3799 		} else
3800 			++*pos;
3801 	return sd;
3802 }
3803 
3804 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3805 {
3806 	return softnet_get_online(pos);
3807 }
3808 
3809 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3810 {
3811 	++*pos;
3812 	return softnet_get_online(pos);
3813 }
3814 
3815 static void softnet_seq_stop(struct seq_file *seq, void *v)
3816 {
3817 }
3818 
3819 static int softnet_seq_show(struct seq_file *seq, void *v)
3820 {
3821 	struct softnet_data *sd = v;
3822 
3823 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3824 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
3825 		   0, 0, 0, 0, /* was fastroute */
3826 		   sd->cpu_collision, sd->received_rps);
3827 	return 0;
3828 }
3829 
3830 static const struct seq_operations dev_seq_ops = {
3831 	.start = dev_seq_start,
3832 	.next  = dev_seq_next,
3833 	.stop  = dev_seq_stop,
3834 	.show  = dev_seq_show,
3835 };
3836 
3837 static int dev_seq_open(struct inode *inode, struct file *file)
3838 {
3839 	return seq_open_net(inode, file, &dev_seq_ops,
3840 			    sizeof(struct seq_net_private));
3841 }
3842 
3843 static const struct file_operations dev_seq_fops = {
3844 	.owner	 = THIS_MODULE,
3845 	.open    = dev_seq_open,
3846 	.read    = seq_read,
3847 	.llseek  = seq_lseek,
3848 	.release = seq_release_net,
3849 };
3850 
3851 static const struct seq_operations softnet_seq_ops = {
3852 	.start = softnet_seq_start,
3853 	.next  = softnet_seq_next,
3854 	.stop  = softnet_seq_stop,
3855 	.show  = softnet_seq_show,
3856 };
3857 
3858 static int softnet_seq_open(struct inode *inode, struct file *file)
3859 {
3860 	return seq_open(file, &softnet_seq_ops);
3861 }
3862 
3863 static const struct file_operations softnet_seq_fops = {
3864 	.owner	 = THIS_MODULE,
3865 	.open    = softnet_seq_open,
3866 	.read    = seq_read,
3867 	.llseek  = seq_lseek,
3868 	.release = seq_release,
3869 };
3870 
3871 static void *ptype_get_idx(loff_t pos)
3872 {
3873 	struct packet_type *pt = NULL;
3874 	loff_t i = 0;
3875 	int t;
3876 
3877 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3878 		if (i == pos)
3879 			return pt;
3880 		++i;
3881 	}
3882 
3883 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3884 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3885 			if (i == pos)
3886 				return pt;
3887 			++i;
3888 		}
3889 	}
3890 	return NULL;
3891 }
3892 
3893 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3894 	__acquires(RCU)
3895 {
3896 	rcu_read_lock();
3897 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3898 }
3899 
3900 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3901 {
3902 	struct packet_type *pt;
3903 	struct list_head *nxt;
3904 	int hash;
3905 
3906 	++*pos;
3907 	if (v == SEQ_START_TOKEN)
3908 		return ptype_get_idx(0);
3909 
3910 	pt = v;
3911 	nxt = pt->list.next;
3912 	if (pt->type == htons(ETH_P_ALL)) {
3913 		if (nxt != &ptype_all)
3914 			goto found;
3915 		hash = 0;
3916 		nxt = ptype_base[0].next;
3917 	} else
3918 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3919 
3920 	while (nxt == &ptype_base[hash]) {
3921 		if (++hash >= PTYPE_HASH_SIZE)
3922 			return NULL;
3923 		nxt = ptype_base[hash].next;
3924 	}
3925 found:
3926 	return list_entry(nxt, struct packet_type, list);
3927 }
3928 
3929 static void ptype_seq_stop(struct seq_file *seq, void *v)
3930 	__releases(RCU)
3931 {
3932 	rcu_read_unlock();
3933 }
3934 
3935 static int ptype_seq_show(struct seq_file *seq, void *v)
3936 {
3937 	struct packet_type *pt = v;
3938 
3939 	if (v == SEQ_START_TOKEN)
3940 		seq_puts(seq, "Type Device      Function\n");
3941 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3942 		if (pt->type == htons(ETH_P_ALL))
3943 			seq_puts(seq, "ALL ");
3944 		else
3945 			seq_printf(seq, "%04x", ntohs(pt->type));
3946 
3947 		seq_printf(seq, " %-8s %pF\n",
3948 			   pt->dev ? pt->dev->name : "", pt->func);
3949 	}
3950 
3951 	return 0;
3952 }
3953 
3954 static const struct seq_operations ptype_seq_ops = {
3955 	.start = ptype_seq_start,
3956 	.next  = ptype_seq_next,
3957 	.stop  = ptype_seq_stop,
3958 	.show  = ptype_seq_show,
3959 };
3960 
3961 static int ptype_seq_open(struct inode *inode, struct file *file)
3962 {
3963 	return seq_open_net(inode, file, &ptype_seq_ops,
3964 			sizeof(struct seq_net_private));
3965 }
3966 
3967 static const struct file_operations ptype_seq_fops = {
3968 	.owner	 = THIS_MODULE,
3969 	.open    = ptype_seq_open,
3970 	.read    = seq_read,
3971 	.llseek  = seq_lseek,
3972 	.release = seq_release_net,
3973 };
3974 
3975 
3976 static int __net_init dev_proc_net_init(struct net *net)
3977 {
3978 	int rc = -ENOMEM;
3979 
3980 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3981 		goto out;
3982 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3983 		goto out_dev;
3984 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3985 		goto out_softnet;
3986 
3987 	if (wext_proc_init(net))
3988 		goto out_ptype;
3989 	rc = 0;
3990 out:
3991 	return rc;
3992 out_ptype:
3993 	proc_net_remove(net, "ptype");
3994 out_softnet:
3995 	proc_net_remove(net, "softnet_stat");
3996 out_dev:
3997 	proc_net_remove(net, "dev");
3998 	goto out;
3999 }
4000 
4001 static void __net_exit dev_proc_net_exit(struct net *net)
4002 {
4003 	wext_proc_exit(net);
4004 
4005 	proc_net_remove(net, "ptype");
4006 	proc_net_remove(net, "softnet_stat");
4007 	proc_net_remove(net, "dev");
4008 }
4009 
4010 static struct pernet_operations __net_initdata dev_proc_ops = {
4011 	.init = dev_proc_net_init,
4012 	.exit = dev_proc_net_exit,
4013 };
4014 
4015 static int __init dev_proc_init(void)
4016 {
4017 	return register_pernet_subsys(&dev_proc_ops);
4018 }
4019 #else
4020 #define dev_proc_init() 0
4021 #endif	/* CONFIG_PROC_FS */
4022 
4023 
4024 /**
4025  *	netdev_set_master	-	set up master/slave pair
4026  *	@slave: slave device
4027  *	@master: new master device
4028  *
4029  *	Changes the master device of the slave. Pass %NULL to break the
4030  *	bonding. The caller must hold the RTNL semaphore. On a failure
4031  *	a negative errno code is returned. On success the reference counts
4032  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4033  *	function returns zero.
4034  */
4035 int netdev_set_master(struct net_device *slave, struct net_device *master)
4036 {
4037 	struct net_device *old = slave->master;
4038 
4039 	ASSERT_RTNL();
4040 
4041 	if (master) {
4042 		if (old)
4043 			return -EBUSY;
4044 		dev_hold(master);
4045 	}
4046 
4047 	slave->master = master;
4048 
4049 	if (old) {
4050 		synchronize_net();
4051 		dev_put(old);
4052 	}
4053 	if (master)
4054 		slave->flags |= IFF_SLAVE;
4055 	else
4056 		slave->flags &= ~IFF_SLAVE;
4057 
4058 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4059 	return 0;
4060 }
4061 EXPORT_SYMBOL(netdev_set_master);
4062 
4063 static void dev_change_rx_flags(struct net_device *dev, int flags)
4064 {
4065 	const struct net_device_ops *ops = dev->netdev_ops;
4066 
4067 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4068 		ops->ndo_change_rx_flags(dev, flags);
4069 }
4070 
4071 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4072 {
4073 	unsigned short old_flags = dev->flags;
4074 	uid_t uid;
4075 	gid_t gid;
4076 
4077 	ASSERT_RTNL();
4078 
4079 	dev->flags |= IFF_PROMISC;
4080 	dev->promiscuity += inc;
4081 	if (dev->promiscuity == 0) {
4082 		/*
4083 		 * Avoid overflow.
4084 		 * If inc wrapped the counter, undo it and return an error.
4085 		 */
4086 		if (inc < 0)
4087 			dev->flags &= ~IFF_PROMISC;
4088 		else {
4089 			dev->promiscuity -= inc;
4090 			printk(KERN_WARNING "%s: promiscuity counter "
4091 				"overflowed, promiscuity request failed; the "
4092 				"feature may be unreliable on this device.\n", dev->name);
4093 			return -EOVERFLOW;
4094 		}
4095 	}
4096 	if (dev->flags != old_flags) {
4097 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4098 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4099 							       "left");
4100 		if (audit_enabled) {
4101 			current_uid_gid(&uid, &gid);
4102 			audit_log(current->audit_context, GFP_ATOMIC,
4103 				AUDIT_ANOM_PROMISCUOUS,
4104 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4105 				dev->name, (dev->flags & IFF_PROMISC),
4106 				(old_flags & IFF_PROMISC),
4107 				audit_get_loginuid(current),
4108 				uid, gid,
4109 				audit_get_sessionid(current));
4110 		}
4111 
4112 		dev_change_rx_flags(dev, IFF_PROMISC);
4113 	}
4114 	return 0;
4115 }
4116 
4117 /**
4118  *	dev_set_promiscuity	- update promiscuity count on a device
4119  *	@dev: device
4120  *	@inc: modifier
4121  *
4122  *	Add or remove promiscuity from a device. While the count in the device
4123  *	remains above zero the interface remains promiscuous. Once it hits zero
4124  *	the device reverts back to normal filtering operation. A negative inc
4125  *	value is used to drop promiscuity on the device.
4126  *	Return 0 if successful or a negative errno code on error.
4127  */
4128 int dev_set_promiscuity(struct net_device *dev, int inc)
4129 {
4130 	unsigned short old_flags = dev->flags;
4131 	int err;
4132 
4133 	err = __dev_set_promiscuity(dev, inc);
4134 	if (err < 0)
4135 		return err;
4136 	if (dev->flags != old_flags)
4137 		dev_set_rx_mode(dev);
4138 	return err;
4139 }
4140 EXPORT_SYMBOL(dev_set_promiscuity);
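
/*
 * Illustrative sketch only (hypothetical caller, not taken from any
 * in-tree code): a packet-capture component keeps the interface
 * promiscuous for the lifetime of a session by pairing a +1 with a
 * later -1, with the RTNL semaphore held around each call:
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */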
4141 
4142 /**
4143  *	dev_set_allmulti	- update allmulti count on a device
4144  *	@dev: device
4145  *	@inc: modifier
4146  *
4147  *	Add or remove reception of all multicast frames to a device. While the
4148  *	count in the device remains above zero the interface keeps listening
4149  *	to all multicast frames. Once it hits zero the device reverts to normal
4150  *	filtering operation. A negative @inc value is used to drop the counter
4151  *	when releasing a resource needing all multicasts.
4152  *	Return 0 if successful or a negative errno code on error.
4153  */
4154 
4155 int dev_set_allmulti(struct net_device *dev, int inc)
4156 {
4157 	unsigned short old_flags = dev->flags;
4158 
4159 	ASSERT_RTNL();
4160 
4161 	dev->flags |= IFF_ALLMULTI;
4162 	dev->allmulti += inc;
4163 	if (dev->allmulti == 0) {
4164 		/*
4165 		 * Avoid overflow.
4166 		 * If inc wrapped the counter, undo it and return an error.
4167 		 */
4168 		if (inc < 0)
4169 			dev->flags &= ~IFF_ALLMULTI;
4170 		else {
4171 			dev->allmulti -= inc;
4172 			printk(KERN_WARNING "%s: allmulti counter "
4173 				"overflowed, allmulti request failed; the "
4174 				"feature may be unreliable on this device.\n", dev->name);
4175 			return -EOVERFLOW;
4176 		}
4177 	}
4178 	if (dev->flags ^ old_flags) {
4179 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4180 		dev_set_rx_mode(dev);
4181 	}
4182 	return 0;
4183 }
4184 EXPORT_SYMBOL(dev_set_allmulti);
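
/*
 * Illustrative sketch only (hypothetical caller): a component that needs
 * every multicast frame, e.g. a multicast-routing style user, takes a
 * reference on allmulti and drops it when done, under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_set_allmulti(dev, 1);
 *	...
 *	dev_set_allmulti(dev, -1);
 *	rtnl_unlock();
 */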
4185 
4186 /*
4187  *	Upload unicast and multicast address lists to device and
4188  *	configure RX filtering. When the device doesn't support unicast
4189  *	filtering it is put in promiscuous mode while unicast addresses
4190  *	are present.
4191  */
4192 void __dev_set_rx_mode(struct net_device *dev)
4193 {
4194 	const struct net_device_ops *ops = dev->netdev_ops;
4195 
4196 	/* dev_open will call this function so the list will stay sane. */
4197 	if (!(dev->flags&IFF_UP))
4198 		return;
4199 
4200 	if (!netif_device_present(dev))
4201 		return;
4202 
4203 	if (ops->ndo_set_rx_mode)
4204 		ops->ndo_set_rx_mode(dev);
4205 	else {
4206 		/* Unicast address changes may only happen under the rtnl,
4207 		 * therefore calling __dev_set_promiscuity here is safe.
4208 		 */
4209 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4210 			__dev_set_promiscuity(dev, 1);
4211 			dev->uc_promisc = 1;
4212 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4213 			__dev_set_promiscuity(dev, -1);
4214 			dev->uc_promisc = 0;
4215 		}
4216 
4217 		if (ops->ndo_set_multicast_list)
4218 			ops->ndo_set_multicast_list(dev);
4219 	}
4220 }
4221 
4222 void dev_set_rx_mode(struct net_device *dev)
4223 {
4224 	netif_addr_lock_bh(dev);
4225 	__dev_set_rx_mode(dev);
4226 	netif_addr_unlock_bh(dev);
4227 }
4228 
4229 /**
4230  *	dev_get_flags - get flags reported to userspace
4231  *	@dev: device
4232  *
4233  *	Get the combination of flag bits exported through APIs to userspace.
4234  */
4235 unsigned dev_get_flags(const struct net_device *dev)
4236 {
4237 	unsigned flags;
4238 
4239 	flags = (dev->flags & ~(IFF_PROMISC |
4240 				IFF_ALLMULTI |
4241 				IFF_RUNNING |
4242 				IFF_LOWER_UP |
4243 				IFF_DORMANT)) |
4244 		(dev->gflags & (IFF_PROMISC |
4245 				IFF_ALLMULTI));
4246 
4247 	if (netif_running(dev)) {
4248 		if (netif_oper_up(dev))
4249 			flags |= IFF_RUNNING;
4250 		if (netif_carrier_ok(dev))
4251 			flags |= IFF_LOWER_UP;
4252 		if (netif_dormant(dev))
4253 			flags |= IFF_DORMANT;
4254 	}
4255 
4256 	return flags;
4257 }
4258 EXPORT_SYMBOL(dev_get_flags);
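
/*
 * Illustrative sketch: checking whether an interface is administratively
 * up and operationally running, using the userspace view of the flags
 * (this is what SIOCGIFFLAGS reports via dev_ifsioc_locked() below):
 *
 *	unsigned flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		...the device is up and oper-up...
 */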
4259 
4260 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4261 {
4262 	int old_flags = dev->flags;
4263 	int ret;
4264 
4265 	ASSERT_RTNL();
4266 
4267 	/*
4268 	 *	Set the flags on our device.
4269 	 */
4270 
4271 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4272 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4273 			       IFF_AUTOMEDIA)) |
4274 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4275 				    IFF_ALLMULTI));
4276 
4277 	/*
4278 	 *	Load in the correct multicast list now the flags have changed.
4279 	 */
4280 
4281 	if ((old_flags ^ flags) & IFF_MULTICAST)
4282 		dev_change_rx_flags(dev, IFF_MULTICAST);
4283 
4284 	dev_set_rx_mode(dev);
4285 
4286 	/*
4287 	 *	Has the interface been brought up or down?  We handle IFF_UP ourselves
4288 	 *	according to user attempts to set it, rather than blindly
4289 	 *	setting it.
4290 	 */
4291 
4292 	ret = 0;
4293 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4294 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4295 
4296 		if (!ret)
4297 			dev_set_rx_mode(dev);
4298 	}
4299 
4300 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4301 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4302 
4303 		dev->gflags ^= IFF_PROMISC;
4304 		dev_set_promiscuity(dev, inc);
4305 	}
4306 
4307 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4308 	   is important. Some (broken) drivers set IFF_PROMISC themselves when
4309 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4310 	 */
4311 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4312 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4313 
4314 		dev->gflags ^= IFF_ALLMULTI;
4315 		dev_set_allmulti(dev, inc);
4316 	}
4317 
4318 	return ret;
4319 }
4320 
4321 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4322 {
4323 	unsigned int changes = dev->flags ^ old_flags;
4324 
4325 	if (changes & IFF_UP) {
4326 		if (dev->flags & IFF_UP)
4327 			call_netdevice_notifiers(NETDEV_UP, dev);
4328 		else
4329 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4330 	}
4331 
4332 	if (dev->flags & IFF_UP &&
4333 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4334 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4335 }
4336 
4337 /**
4338  *	dev_change_flags - change device settings
4339  *	@dev: device
4340  *	@flags: device state flags
4341  *
4342  *	Change settings on a device based on state flags. The flags are
4343  *	in the userspace-exported format.
4344  */
4345 int dev_change_flags(struct net_device *dev, unsigned flags)
4346 {
4347 	int ret, changes;
4348 	int old_flags = dev->flags;
4349 
4350 	ret = __dev_change_flags(dev, flags);
4351 	if (ret < 0)
4352 		return ret;
4353 
4354 	changes = old_flags ^ dev->flags;
4355 	if (changes)
4356 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4357 
4358 	__dev_notify_flags(dev, old_flags);
4359 	return ret;
4360 }
4361 EXPORT_SYMBOL(dev_change_flags);
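
/*
 * Illustrative sketch only (hypothetical caller): bringing an interface
 * up from inside the kernel with userspace-format flags, much as the
 * SIOCSIFFLAGS path in dev_ifsioc() does.  RTNL must be held:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */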
4362 
4363 /**
4364  *	dev_set_mtu - Change maximum transfer unit
4365  *	@dev: device
4366  *	@new_mtu: new transfer unit
4367  *
4368  *	Change the maximum transfer size of the network device.
4369  */
4370 int dev_set_mtu(struct net_device *dev, int new_mtu)
4371 {
4372 	const struct net_device_ops *ops = dev->netdev_ops;
4373 	int err;
4374 
4375 	if (new_mtu == dev->mtu)
4376 		return 0;
4377 
4378 	/*	MTU must be positive.	 */
4379 	if (new_mtu < 0)
4380 		return -EINVAL;
4381 
4382 	if (!netif_device_present(dev))
4383 		return -ENODEV;
4384 
4385 	err = 0;
4386 	if (ops->ndo_change_mtu)
4387 		err = ops->ndo_change_mtu(dev, new_mtu);
4388 	else
4389 		dev->mtu = new_mtu;
4390 
4391 	if (!err && dev->flags & IFF_UP)
4392 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4393 	return err;
4394 }
4395 EXPORT_SYMBOL(dev_set_mtu);
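
/*
 * Illustrative sketch only: this is the in-kernel equivalent of the
 * SIOCSIFMTU handling in dev_ifsioc() below; a hypothetical caller
 * holding RTNL would do:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 1500);
 *	rtnl_unlock();
 */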
4396 
4397 /**
4398  *	dev_set_mac_address - Change Media Access Control Address
4399  *	@dev: device
4400  *	@sa: new address
4401  *
4402  *	Change the hardware (MAC) address of the device
4403  */
4404 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4405 {
4406 	const struct net_device_ops *ops = dev->netdev_ops;
4407 	int err;
4408 
4409 	if (!ops->ndo_set_mac_address)
4410 		return -EOPNOTSUPP;
4411 	if (sa->sa_family != dev->type)
4412 		return -EINVAL;
4413 	if (!netif_device_present(dev))
4414 		return -ENODEV;
4415 	err = ops->ndo_set_mac_address(dev, sa);
4416 	if (!err)
4417 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4418 	return err;
4419 }
4420 EXPORT_SYMBOL(dev_set_mac_address);
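
/*
 * Illustrative sketch only (hypothetical caller): changing the hardware
 * address of an Ethernet-type device.  sa_family must match dev->type
 * and RTNL must be held; new_addr is a hypothetical buffer of
 * dev->addr_len bytes:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */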
4421 
4422 /*
4423  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4424  */
4425 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4426 {
4427 	int err;
4428 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4429 
4430 	if (!dev)
4431 		return -ENODEV;
4432 
4433 	switch (cmd) {
4434 	case SIOCGIFFLAGS:	/* Get interface flags */
4435 		ifr->ifr_flags = (short) dev_get_flags(dev);
4436 		return 0;
4437 
4438 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4439 				   (currently unused) */
4440 		ifr->ifr_metric = 0;
4441 		return 0;
4442 
4443 	case SIOCGIFMTU:	/* Get the MTU of a device */
4444 		ifr->ifr_mtu = dev->mtu;
4445 		return 0;
4446 
4447 	case SIOCGIFHWADDR:
4448 		if (!dev->addr_len)
4449 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4450 		else
4451 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4452 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4453 		ifr->ifr_hwaddr.sa_family = dev->type;
4454 		return 0;
4455 
4456 	case SIOCGIFSLAVE:
4457 		err = -EINVAL;
4458 		break;
4459 
4460 	case SIOCGIFMAP:
4461 		ifr->ifr_map.mem_start = dev->mem_start;
4462 		ifr->ifr_map.mem_end   = dev->mem_end;
4463 		ifr->ifr_map.base_addr = dev->base_addr;
4464 		ifr->ifr_map.irq       = dev->irq;
4465 		ifr->ifr_map.dma       = dev->dma;
4466 		ifr->ifr_map.port      = dev->if_port;
4467 		return 0;
4468 
4469 	case SIOCGIFINDEX:
4470 		ifr->ifr_ifindex = dev->ifindex;
4471 		return 0;
4472 
4473 	case SIOCGIFTXQLEN:
4474 		ifr->ifr_qlen = dev->tx_queue_len;
4475 		return 0;
4476 
4477 	default:
4478 		/* dev_ioctl() should ensure this case
4479 		 * is never reached
4480 		 */
4481 		WARN_ON(1);
4482 		err = -EINVAL;
4483 		break;
4484 
4485 	}
4486 	return err;
4487 }
4488 
4489 /*
4490  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4491  */
4492 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4493 {
4494 	int err;
4495 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4496 	const struct net_device_ops *ops;
4497 
4498 	if (!dev)
4499 		return -ENODEV;
4500 
4501 	ops = dev->netdev_ops;
4502 
4503 	switch (cmd) {
4504 	case SIOCSIFFLAGS:	/* Set interface flags */
4505 		return dev_change_flags(dev, ifr->ifr_flags);
4506 
4507 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4508 				   (currently unused) */
4509 		return -EOPNOTSUPP;
4510 
4511 	case SIOCSIFMTU:	/* Set the MTU of a device */
4512 		return dev_set_mtu(dev, ifr->ifr_mtu);
4513 
4514 	case SIOCSIFHWADDR:
4515 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4516 
4517 	case SIOCSIFHWBROADCAST:
4518 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4519 			return -EINVAL;
4520 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4521 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4522 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4523 		return 0;
4524 
4525 	case SIOCSIFMAP:
4526 		if (ops->ndo_set_config) {
4527 			if (!netif_device_present(dev))
4528 				return -ENODEV;
4529 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4530 		}
4531 		return -EOPNOTSUPP;
4532 
4533 	case SIOCADDMULTI:
4534 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4535 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4536 			return -EINVAL;
4537 		if (!netif_device_present(dev))
4538 			return -ENODEV;
4539 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4540 
4541 	case SIOCDELMULTI:
4542 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4543 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4544 			return -EINVAL;
4545 		if (!netif_device_present(dev))
4546 			return -ENODEV;
4547 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4548 
4549 	case SIOCSIFTXQLEN:
4550 		if (ifr->ifr_qlen < 0)
4551 			return -EINVAL;
4552 		dev->tx_queue_len = ifr->ifr_qlen;
4553 		return 0;
4554 
4555 	case SIOCSIFNAME:
4556 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4557 		return dev_change_name(dev, ifr->ifr_newname);
4558 
4559 	/*
4560 	 *	Unknown or private ioctl
4561 	 */
4562 	default:
4563 		if ((cmd >= SIOCDEVPRIVATE &&
4564 		    cmd <= SIOCDEVPRIVATE + 15) ||
4565 		    cmd == SIOCBONDENSLAVE ||
4566 		    cmd == SIOCBONDRELEASE ||
4567 		    cmd == SIOCBONDSETHWADDR ||
4568 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4569 		    cmd == SIOCBONDINFOQUERY ||
4570 		    cmd == SIOCBONDCHANGEACTIVE ||
4571 		    cmd == SIOCGMIIPHY ||
4572 		    cmd == SIOCGMIIREG ||
4573 		    cmd == SIOCSMIIREG ||
4574 		    cmd == SIOCBRADDIF ||
4575 		    cmd == SIOCBRDELIF ||
4576 		    cmd == SIOCSHWTSTAMP ||
4577 		    cmd == SIOCWANDEV) {
4578 			err = -EOPNOTSUPP;
4579 			if (ops->ndo_do_ioctl) {
4580 				if (netif_device_present(dev))
4581 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4582 				else
4583 					err = -ENODEV;
4584 			}
4585 		} else
4586 			err = -EINVAL;
4587 
4588 	}
4589 	return err;
4590 }
4591 
4592 /*
4593  *	This function handles all "interface"-type I/O control requests. The actual
4594  *	'doing' part of this is dev_ifsioc above.
4595  */
4596 
4597 /**
4598  *	dev_ioctl	-	network device ioctl
4599  *	@net: the applicable net namespace
4600  *	@cmd: command to issue
4601  *	@arg: pointer to a struct ifreq in user space
4602  *
4603  *	Issue ioctl functions to devices. This is normally called by the
4604  *	user space syscall interfaces but can sometimes be useful for
4605  *	other purposes. The return value is the return from the syscall if
4606  *	positive or a negative errno code on error.
4607  */
4608 
4609 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4610 {
4611 	struct ifreq ifr;
4612 	int ret;
4613 	char *colon;
4614 
4615 	/* One special case: SIOCGIFCONF takes an ifconf argument
4616 	   and requires a shared lock, because it sleeps writing
4617 	   to user space.
4618 	 */
4619 
4620 	if (cmd == SIOCGIFCONF) {
4621 		rtnl_lock();
4622 		ret = dev_ifconf(net, (char __user *) arg);
4623 		rtnl_unlock();
4624 		return ret;
4625 	}
4626 	if (cmd == SIOCGIFNAME)
4627 		return dev_ifname(net, (struct ifreq __user *)arg);
4628 
4629 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4630 		return -EFAULT;
4631 
4632 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4633 
4634 	colon = strchr(ifr.ifr_name, ':');
4635 	if (colon)
4636 		*colon = 0;
4637 
4638 	/*
4639 	 *	See which interface the caller is talking about.
4640 	 */
4641 
4642 	switch (cmd) {
4643 	/*
4644 	 *	These ioctl calls:
4645 	 *	- can be done by all.
4646 	 *	- atomic and do not require locking.
4647 	 *	- return a value
4648 	 */
4649 	case SIOCGIFFLAGS:
4650 	case SIOCGIFMETRIC:
4651 	case SIOCGIFMTU:
4652 	case SIOCGIFHWADDR:
4653 	case SIOCGIFSLAVE:
4654 	case SIOCGIFMAP:
4655 	case SIOCGIFINDEX:
4656 	case SIOCGIFTXQLEN:
4657 		dev_load(net, ifr.ifr_name);
4658 		rcu_read_lock();
4659 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4660 		rcu_read_unlock();
4661 		if (!ret) {
4662 			if (colon)
4663 				*colon = ':';
4664 			if (copy_to_user(arg, &ifr,
4665 					 sizeof(struct ifreq)))
4666 				ret = -EFAULT;
4667 		}
4668 		return ret;
4669 
4670 	case SIOCETHTOOL:
4671 		dev_load(net, ifr.ifr_name);
4672 		rtnl_lock();
4673 		ret = dev_ethtool(net, &ifr);
4674 		rtnl_unlock();
4675 		if (!ret) {
4676 			if (colon)
4677 				*colon = ':';
4678 			if (copy_to_user(arg, &ifr,
4679 					 sizeof(struct ifreq)))
4680 				ret = -EFAULT;
4681 		}
4682 		return ret;
4683 
4684 	/*
4685 	 *	These ioctl calls:
4686 	 *	- require superuser power.
4687 	 *	- require strict serialization.
4688 	 *	- return a value
4689 	 */
4690 	case SIOCGMIIPHY:
4691 	case SIOCGMIIREG:
4692 	case SIOCSIFNAME:
4693 		if (!capable(CAP_NET_ADMIN))
4694 			return -EPERM;
4695 		dev_load(net, ifr.ifr_name);
4696 		rtnl_lock();
4697 		ret = dev_ifsioc(net, &ifr, cmd);
4698 		rtnl_unlock();
4699 		if (!ret) {
4700 			if (colon)
4701 				*colon = ':';
4702 			if (copy_to_user(arg, &ifr,
4703 					 sizeof(struct ifreq)))
4704 				ret = -EFAULT;
4705 		}
4706 		return ret;
4707 
4708 	/*
4709 	 *	These ioctl calls:
4710 	 *	- require superuser power.
4711 	 *	- require strict serialization.
4712 	 *	- do not return a value
4713 	 */
4714 	case SIOCSIFFLAGS:
4715 	case SIOCSIFMETRIC:
4716 	case SIOCSIFMTU:
4717 	case SIOCSIFMAP:
4718 	case SIOCSIFHWADDR:
4719 	case SIOCSIFSLAVE:
4720 	case SIOCADDMULTI:
4721 	case SIOCDELMULTI:
4722 	case SIOCSIFHWBROADCAST:
4723 	case SIOCSIFTXQLEN:
4724 	case SIOCSMIIREG:
4725 	case SIOCBONDENSLAVE:
4726 	case SIOCBONDRELEASE:
4727 	case SIOCBONDSETHWADDR:
4728 	case SIOCBONDCHANGEACTIVE:
4729 	case SIOCBRADDIF:
4730 	case SIOCBRDELIF:
4731 	case SIOCSHWTSTAMP:
4732 		if (!capable(CAP_NET_ADMIN))
4733 			return -EPERM;
4734 		/* fall through */
4735 	case SIOCBONDSLAVEINFOQUERY:
4736 	case SIOCBONDINFOQUERY:
4737 		dev_load(net, ifr.ifr_name);
4738 		rtnl_lock();
4739 		ret = dev_ifsioc(net, &ifr, cmd);
4740 		rtnl_unlock();
4741 		return ret;
4742 
4743 	case SIOCGIFMEM:
4744 		/* Get the per device memory space. We can add this but
4745 		 * currently do not support it */
4746 	case SIOCSIFMEM:
4747 		/* Set the per device memory buffer space.
4748 		 * Not applicable in our case */
4749 	case SIOCSIFLINK:
4750 		return -EINVAL;
4751 
4752 	/*
4753 	 *	Unknown or private ioctl.
4754 	 */
4755 	default:
4756 		if (cmd == SIOCWANDEV ||
4757 		    (cmd >= SIOCDEVPRIVATE &&
4758 		     cmd <= SIOCDEVPRIVATE + 15)) {
4759 			dev_load(net, ifr.ifr_name);
4760 			rtnl_lock();
4761 			ret = dev_ifsioc(net, &ifr, cmd);
4762 			rtnl_unlock();
4763 			if (!ret && copy_to_user(arg, &ifr,
4764 						 sizeof(struct ifreq)))
4765 				ret = -EFAULT;
4766 			return ret;
4767 		}
4768 		/* Take care of Wireless Extensions */
4769 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4770 			return wext_handle_ioctl(net, &ifr, cmd, arg);
4771 		return -EINVAL;
4772 	}
4773 }
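
/*
 * Illustrative sketch of the userspace side of the simple "get" ioctls
 * dispatched above (hypothetical program, not part of the kernel):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu=%d\n", ifr.ifr_mtu);
 */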
4774 
4775 
4776 /**
4777  *	dev_new_index	-	allocate an ifindex
4778  *	@net: the applicable net namespace
4779  *
4780  *	Returns a suitable unique value for a new device interface
4781  *	number.  The caller must hold the rtnl semaphore or the
4782  *	dev_base_lock to be sure it remains unique.
4783  */
4784 static int dev_new_index(struct net *net)
4785 {
4786 	static int ifindex;
4787 	for (;;) {
4788 		if (++ifindex <= 0)
4789 			ifindex = 1;
4790 		if (!__dev_get_by_index(net, ifindex))
4791 			return ifindex;
4792 	}
4793 }
4794 
4795 /* Delayed registration/unregistration */
4796 static LIST_HEAD(net_todo_list);
4797 
4798 static void net_set_todo(struct net_device *dev)
4799 {
4800 	list_add_tail(&dev->todo_list, &net_todo_list);
4801 }
4802 
4803 static void rollback_registered_many(struct list_head *head)
4804 {
4805 	struct net_device *dev, *tmp;
4806 
4807 	BUG_ON(dev_boot_phase);
4808 	ASSERT_RTNL();
4809 
4810 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4811 		/* Some devices reach here without ever having been
4812 		 * registered, during unwind of a failed initialization.
4813 		 * Remove those devices and proceed with the remaining.
4814 		 */
4815 		if (dev->reg_state == NETREG_UNINITIALIZED) {
4816 			pr_debug("unregister_netdevice: device %s/%p never "
4817 				 "was registered\n", dev->name, dev);
4818 
4819 			WARN_ON(1);
4820 			list_del(&dev->unreg_list);
4821 			continue;
4822 		}
4823 
4824 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4825 
4826 		/* If device is running, close it first. */
4827 		dev_close(dev);
4828 
4829 		/* And unlink it from device chain. */
4830 		unlist_netdevice(dev);
4831 
4832 		dev->reg_state = NETREG_UNREGISTERING;
4833 	}
4834 
4835 	synchronize_net();
4836 
4837 	list_for_each_entry(dev, head, unreg_list) {
4838 		/* Shutdown queueing discipline. */
4839 		dev_shutdown(dev);
4840 
4841 
4842 		/* Notify protocols that we are about to destroy
4843 		   this device. They should clean up all of their state.
4844 		*/
4845 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4846 
4847 		if (!dev->rtnl_link_ops ||
4848 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4849 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4850 
4851 		/*
4852 		 *	Flush the unicast and multicast chains
4853 		 */
4854 		dev_uc_flush(dev);
4855 		dev_mc_flush(dev);
4856 
4857 		if (dev->netdev_ops->ndo_uninit)
4858 			dev->netdev_ops->ndo_uninit(dev);
4859 
4860 		/* Notifier chain MUST detach us from master device. */
4861 		WARN_ON(dev->master);
4862 
4863 		/* Remove entries from kobject tree */
4864 		netdev_unregister_kobject(dev);
4865 	}
4866 
4867 	/* Process any work delayed until the end of the batch */
4868 	dev = list_first_entry(head, struct net_device, unreg_list);
4869 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4870 
4871 	rcu_barrier();
4872 
4873 	list_for_each_entry(dev, head, unreg_list)
4874 		dev_put(dev);
4875 }
4876 
4877 static void rollback_registered(struct net_device *dev)
4878 {
4879 	LIST_HEAD(single);
4880 
4881 	list_add(&dev->unreg_list, &single);
4882 	rollback_registered_many(&single);
4883 }
4884 
4885 static void __netdev_init_queue_locks_one(struct net_device *dev,
4886 					  struct netdev_queue *dev_queue,
4887 					  void *_unused)
4888 {
4889 	spin_lock_init(&dev_queue->_xmit_lock);
4890 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4891 	dev_queue->xmit_lock_owner = -1;
4892 }
4893 
4894 static void netdev_init_queue_locks(struct net_device *dev)
4895 {
4896 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4897 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4898 }
4899 
4900 unsigned long netdev_fix_features(unsigned long features, const char *name)
4901 {
4902 	/* Fix illegal SG+CSUM combinations. */
4903 	if ((features & NETIF_F_SG) &&
4904 	    !(features & NETIF_F_ALL_CSUM)) {
4905 		if (name)
4906 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4907 			       "checksum feature.\n", name);
4908 		features &= ~NETIF_F_SG;
4909 	}
4910 
4911 	/* TSO requires that SG is present as well. */
4912 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4913 		if (name)
4914 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4915 			       "SG feature.\n", name);
4916 		features &= ~NETIF_F_TSO;
4917 	}
4918 
4919 	if (features & NETIF_F_UFO) {
4920 		if (!(features & NETIF_F_GEN_CSUM)) {
4921 			if (name)
4922 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4923 				       "since no NETIF_F_HW_CSUM feature.\n",
4924 				       name);
4925 			features &= ~NETIF_F_UFO;
4926 		}
4927 
4928 		if (!(features & NETIF_F_SG)) {
4929 			if (name)
4930 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4931 				       "since no NETIF_F_SG feature.\n", name);
4932 			features &= ~NETIF_F_UFO;
4933 		}
4934 	}
4935 
4936 	return features;
4937 }
4938 EXPORT_SYMBOL(netdev_fix_features);
4939 
4940 /**
4941  *	netif_stacked_transfer_operstate -	transfer operstate
4942  *	@rootdev: the root or lower level device to transfer state from
4943  *	@dev: the device to transfer operstate to
4944  *
4945  *	Transfer operational state from root to device. This is normally
4946  *	called when a stacking relationship exists between the root
4947  *	device and the device (a leaf device).
4948  */
4949 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4950 					struct net_device *dev)
4951 {
4952 	if (rootdev->operstate == IF_OPER_DORMANT)
4953 		netif_dormant_on(dev);
4954 	else
4955 		netif_dormant_off(dev);
4956 
4957 	if (netif_carrier_ok(rootdev)) {
4958 		if (!netif_carrier_ok(dev))
4959 			netif_carrier_on(dev);
4960 	} else {
4961 		if (netif_carrier_ok(dev))
4962 			netif_carrier_off(dev);
4963 	}
4964 }
4965 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
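
/*
 * Illustrative sketch only (hypothetical stacked driver): a vlan- or
 * macvlan-like upper device typically mirrors the lower device's state
 * from its netdevice notifier:
 *
 *	case NETDEV_CHANGE:
 *		netif_stacked_transfer_operstate(lowerdev, upperdev);
 *		break;
 */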
4966 
4967 static int netif_alloc_rx_queues(struct net_device *dev)
4968 {
4969 #ifdef CONFIG_RPS
4970 	unsigned int i, count = dev->num_rx_queues;
4971 
4972 	if (count) {
4973 		struct netdev_rx_queue *rx;
4974 
4975 		rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
4976 		if (!rx) {
4977 			pr_err("netdev: Unable to allocate %u rx queues.\n",
4978 			       count);
4979 			return -ENOMEM;
4980 		}
4981 		dev->_rx = rx;
4982 		atomic_set(&rx->count, count);
4983 
4984 		/*
4985 		 * Set a pointer to the first element in the array, which holds the
4986 		 * reference count.
4987 		 */
4988 		for (i = 0; i < count; i++)
4989 			rx[i].first = rx;
4990 	}
4991 #endif
4992 	return 0;
4993 }
4994 
4995 /**
4996  *	register_netdevice	- register a network device
4997  *	@dev: device to register
4998  *
4999  *	Take a completed network device structure and add it to the kernel
5000  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5001  *	chain. 0 is returned on success. A negative errno code is returned
5002  *	on a failure to set up the device, or if the name is a duplicate.
5003  *
5004  *	Callers must hold the rtnl semaphore. You may want
5005  *	register_netdev() instead of this.
5006  *
5007  *	BUGS:
5008  *	The locking appears insufficient to guarantee two parallel registers
5009  *	will not get the same name.
5010  */
5011 
5012 int register_netdevice(struct net_device *dev)
5013 {
5014 	int ret;
5015 	struct net *net = dev_net(dev);
5016 
5017 	BUG_ON(dev_boot_phase);
5018 	ASSERT_RTNL();
5019 
5020 	might_sleep();
5021 
5022 	/* When net_device structures are persistent, this will be fatal. */
5023 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5024 	BUG_ON(!net);
5025 
5026 	spin_lock_init(&dev->addr_list_lock);
5027 	netdev_set_addr_lockdep_class(dev);
5028 	netdev_init_queue_locks(dev);
5029 
5030 	dev->iflink = -1;
5031 
5032 	ret = netif_alloc_rx_queues(dev);
5033 	if (ret)
5034 		goto out;
5035 
5036 	/* Init, if this function is available */
5037 	if (dev->netdev_ops->ndo_init) {
5038 		ret = dev->netdev_ops->ndo_init(dev);
5039 		if (ret) {
5040 			if (ret > 0)
5041 				ret = -EIO;
5042 			goto out;
5043 		}
5044 	}
5045 
5046 	ret = dev_get_valid_name(dev, dev->name, 0);
5047 	if (ret)
5048 		goto err_uninit;
5049 
5050 	dev->ifindex = dev_new_index(net);
5051 	if (dev->iflink == -1)
5052 		dev->iflink = dev->ifindex;
5053 
5054 	/* Fix illegal checksum combinations */
5055 	if ((dev->features & NETIF_F_HW_CSUM) &&
5056 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5057 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5058 		       dev->name);
5059 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5060 	}
5061 
5062 	if ((dev->features & NETIF_F_NO_CSUM) &&
5063 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5064 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5065 		       dev->name);
5066 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5067 	}
5068 
5069 	dev->features = netdev_fix_features(dev->features, dev->name);
5070 
5071 	/* Enable software GSO if SG is supported. */
5072 	if (dev->features & NETIF_F_SG)
5073 		dev->features |= NETIF_F_GSO;
5074 
5075 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5076 	 * vlan_dev_init() will do the dev->features check, so these features
5077 	 * are enabled only if supported by the underlying device.
5078 	 */
5079 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5080 
5081 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5082 	ret = notifier_to_errno(ret);
5083 	if (ret)
5084 		goto err_uninit;
5085 
5086 	ret = netdev_register_kobject(dev);
5087 	if (ret)
5088 		goto err_uninit;
5089 	dev->reg_state = NETREG_REGISTERED;
5090 
5091 	/*
5092 	 *	Default initial state at registration is that the
5093 	 *	device is present.
5094 	 */
5095 
5096 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5097 
5098 	dev_init_scheduler(dev);
5099 	dev_hold(dev);
5100 	list_netdevice(dev);
5101 
5102 	/* Notify protocols, that a new device appeared. */
5103 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5104 	ret = notifier_to_errno(ret);
5105 	if (ret) {
5106 		rollback_registered(dev);
5107 		dev->reg_state = NETREG_UNREGISTERED;
5108 	}
5109 	/*
5110 	 *	Prevent userspace races by waiting until the network
5111 	 *	device is fully set up before sending notifications.
5112 	 */
5113 	if (!dev->rtnl_link_ops ||
5114 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5115 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5116 
5117 out:
5118 	return ret;
5119 
5120 err_uninit:
5121 	if (dev->netdev_ops->ndo_uninit)
5122 		dev->netdev_ops->ndo_uninit(dev);
5123 	goto out;
5124 }
5125 EXPORT_SYMBOL(register_netdevice);
5126 
5127 /**
5128  *	init_dummy_netdev	- init a dummy network device for NAPI
5129  *	@dev: device to init
5130  *
5131  *	This takes a network device structure and initializes the minimum
5132  *	set of fields so it can be used to schedule NAPI polls without
5133  *	registering a full-blown interface. This is to be used by drivers
5134  *	that need to tie several hardware interfaces to a single NAPI
5135  *	poll scheduler due to HW limitations.
5136  */
5137 int init_dummy_netdev(struct net_device *dev)
5138 {
5139 	/* Clear everything. Note we don't initialize spinlocks
5140 	 * as they aren't supposed to be taken by any of the
5141 	 * NAPI code and this dummy netdev is supposed to be
5142 	 * only ever used for NAPI polls
5143 	 */
5144 	memset(dev, 0, sizeof(struct net_device));
5145 
5146 	/* make sure we BUG if trying to hit standard
5147 	 * register/unregister code path
5148 	 */
5149 	dev->reg_state = NETREG_DUMMY;
5150 
5151 	/* initialize the ref count */
5152 	atomic_set(&dev->refcnt, 1);
5153 
5154 	/* NAPI wants this */
5155 	INIT_LIST_HEAD(&dev->napi_list);
5156 
5157 	/* a dummy interface is started by default */
5158 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5159 	set_bit(__LINK_STATE_START, &dev->state);
5160 
5161 	return 0;
5162 }
5163 EXPORT_SYMBOL_GPL(init_dummy_netdev);
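
/*
 * Illustrative sketch only (hypothetical driver; my_poll and the priv
 * layout are made up): hardware that needs an extra NAPI context with
 * no real net_device behind it can hang one off a dummy netdev:
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, my_poll, 64);
 *	napi_enable(&priv->napi);
 */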
5164 
5165 
5166 /**
5167  *	register_netdev	- register a network device
5168  *	@dev: device to register
5169  *
5170  *	Take a completed network device structure and add it to the kernel
5171  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5172  *	chain. 0 is returned on success. A negative errno code is returned
5173  *	on a failure to set up the device, or if the name is a duplicate.
5174  *
5175  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5176  *	and expands the device name if you passed a format string to
5177  *	alloc_netdev.
5178  */
5179 int register_netdev(struct net_device *dev)
5180 {
5181 	int err;
5182 
5183 	rtnl_lock();
5184 
5185 	/*
5186 	 * If the name is a format string the caller wants us to do a
5187 	 * name allocation.
5188 	 */
5189 	if (strchr(dev->name, '%')) {
5190 		err = dev_alloc_name(dev, dev->name);
5191 		if (err < 0)
5192 			goto out;
5193 	}
5194 
5195 	err = register_netdevice(dev);
5196 out:
5197 	rtnl_unlock();
5198 	return err;
5199 }
5200 EXPORT_SYMBOL(register_netdev);
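
/*
 * Illustrative sketch only (hypothetical driver; struct my_priv,
 * "myif%d" and my_setup are made up): the usual lifecycle around
 * alloc_netdev_mq(), register_netdev() and the teardown helpers
 * further below:
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "myif%d",
 *			      my_setup, 1);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *	...
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */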
5201 
5202 /*
5203  * netdev_wait_allrefs - wait until all references are gone.
5204  *
5205  * This is called when unregistering network devices.
5206  *
5207  * Any protocol or device that holds a reference should register
5208  * for netdevice notification, and clean up and put back the
5209  * reference if they receive an UNREGISTER event.
5210  * We can get stuck here if buggy protocols don't correctly
5211  * call dev_put.
5212  */
5213 static void netdev_wait_allrefs(struct net_device *dev)
5214 {
5215 	unsigned long rebroadcast_time, warning_time;
5216 
5217 	linkwatch_forget_dev(dev);
5218 
5219 	rebroadcast_time = warning_time = jiffies;
5220 	while (atomic_read(&dev->refcnt) != 0) {
5221 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5222 			rtnl_lock();
5223 
5224 			/* Rebroadcast unregister notification */
5225 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5227 			 * should have already handled it the first time */
5228 
5229 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5230 				     &dev->state)) {
5231 				/* We must not have linkwatch events
5232 				 * pending on unregister. If this
5233 				 * happens, we simply run the queue
5234 				 * unscheduled, resulting in a noop
5235 				 * for this device.
5236 				 */
5237 				linkwatch_run_queue();
5238 			}
5239 
5240 			__rtnl_unlock();
5241 
5242 			rebroadcast_time = jiffies;
5243 		}
5244 
5245 		msleep(250);
5246 
5247 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5248 			printk(KERN_EMERG "unregister_netdevice: "
5249 			       "waiting for %s to become free. Usage "
5250 			       "count = %d\n",
5251 			       dev->name, atomic_read(&dev->refcnt));
5252 			warning_time = jiffies;
5253 		}
5254 	}
5255 }
5256 
5257 /* The sequence is:
5258  *
5259  *	rtnl_lock();
5260  *	...
5261  *	register_netdevice(x1);
5262  *	register_netdevice(x2);
5263  *	...
5264  *	unregister_netdevice(y1);
5265  *	unregister_netdevice(y2);
5266  *      ...
5267  *	rtnl_unlock();
5268  *	free_netdev(y1);
5269  *	free_netdev(y2);
5270  *
5271  * We are invoked by rtnl_unlock().
5272  * This allows us to deal with problems:
5273  * 1) We can delete sysfs objects which invoke hotplug
5274  *    without deadlocking with linkwatch via keventd.
5275  * 2) Since we run with the RTNL semaphore not held, we can sleep
5276  *    safely in order to wait for the netdev refcnt to drop to zero.
5277  *
5278  * We must not return until all unregister events added during
5279  * the interval the lock was held have been completed.
5280  */
5281 void netdev_run_todo(void)
5282 {
5283 	struct list_head list;
5284 
5285 	/* Snapshot list, allow later requests */
5286 	list_replace_init(&net_todo_list, &list);
5287 
5288 	__rtnl_unlock();
5289 
5290 	while (!list_empty(&list)) {
5291 		struct net_device *dev
5292 			= list_first_entry(&list, struct net_device, todo_list);
5293 		list_del(&dev->todo_list);
5294 
5295 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5296 			printk(KERN_ERR "network todo '%s' but state %d\n",
5297 			       dev->name, dev->reg_state);
5298 			dump_stack();
5299 			continue;
5300 		}
5301 
5302 		dev->reg_state = NETREG_UNREGISTERED;
5303 
5304 		on_each_cpu(flush_backlog, dev, 1);
5305 
5306 		netdev_wait_allrefs(dev);
5307 
5308 		/* paranoia */
5309 		BUG_ON(atomic_read(&dev->refcnt));
5310 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5311 		WARN_ON(dev->ip6_ptr);
5312 		WARN_ON(dev->dn_ptr);
5313 
5314 		if (dev->destructor)
5315 			dev->destructor(dev);
5316 
5317 		/* Free network device */
5318 		kobject_put(&dev->dev.kobj);
5319 	}
5320 }
5321 
5322 /**
5323  *	dev_txq_stats_fold - fold tx_queues stats
5324  *	@dev: device to get statistics from
5325  *	@stats: struct rtnl_link_stats64 to hold results
5326  */
5327 void dev_txq_stats_fold(const struct net_device *dev,
5328 			struct rtnl_link_stats64 *stats)
5329 {
5330 	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5331 	unsigned int i;
5332 	struct netdev_queue *txq;
5333 
5334 	for (i = 0; i < dev->num_tx_queues; i++) {
5335 		txq = netdev_get_tx_queue(dev, i);
5336 		spin_lock_bh(&txq->_xmit_lock);
5337 		tx_bytes   += txq->tx_bytes;
5338 		tx_packets += txq->tx_packets;
5339 		tx_dropped += txq->tx_dropped;
5340 		spin_unlock_bh(&txq->_xmit_lock);
5341 	}
5342 	if (tx_bytes || tx_packets || tx_dropped) {
5343 		stats->tx_bytes   = tx_bytes;
5344 		stats->tx_packets = tx_packets;
5345 		stats->tx_dropped = tx_dropped;
5346 	}
5347 }
5348 EXPORT_SYMBOL(dev_txq_stats_fold);
5349 
5350 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5351  * fields in the same order, with only the type differing.
5352  */
5353 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5354 				    const struct net_device_stats *netdev_stats)
5355 {
5356 #if BITS_PER_LONG == 64
5357 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5358 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5359 #else
5360 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5361 	const unsigned long *src = (const unsigned long *)netdev_stats;
5362 	u64 *dst = (u64 *)stats64;
5363 
5364 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5365 		     sizeof(*stats64) / sizeof(u64));
5366 	for (i = 0; i < n; i++)
5367 		dst[i] = src[i];
5368 #endif
5369 }
5370 
5371 /**
5372  *	dev_get_stats	- get network device statistics
5373  *	@dev: device to get statistics from
5374  *	@storage: place to store stats
5375  *
5376  *	Get network statistics from device. Return @storage.
5377  *	The device driver may provide its own method by setting
5378  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5379  *	otherwise the internal statistics structure is used.
5380  */
5381 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5382 					struct rtnl_link_stats64 *storage)
5383 {
5384 	const struct net_device_ops *ops = dev->netdev_ops;
5385 
5386 	if (ops->ndo_get_stats64) {
5387 		memset(storage, 0, sizeof(*storage));
5388 		return ops->ndo_get_stats64(dev, storage);
5389 	}
5390 	if (ops->ndo_get_stats) {
5391 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5392 		return storage;
5393 	}
5394 	netdev_stats_to_stats64(storage, &dev->stats);
5395 	dev_txq_stats_fold(dev, storage);
5396 	return storage;
5397 }
5398 EXPORT_SYMBOL(dev_get_stats);
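
/*
 * Illustrative sketch: this is how dev_seq_printf_stats() above consumes
 * the statistics; any other reader follows the same pattern:
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	...use stats->rx_bytes, stats->tx_packets, etc...
 */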
5399 
5400 static void netdev_init_one_queue(struct net_device *dev,
5401 				  struct netdev_queue *queue,
5402 				  void *_unused)
5403 {
5404 	queue->dev = dev;
5405 }
5406 
5407 static void netdev_init_queues(struct net_device *dev)
5408 {
5409 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5410 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5411 	spin_lock_init(&dev->tx_global_lock);
5412 }
5413 
5414 /**
5415  *	alloc_netdev_mq - allocate network device
5416  *	@sizeof_priv:	size of private data to allocate space for
5417  *	@name:		device name format string
5418  *	@setup:		callback to initialize device
5419  *	@queue_count:	the number of subqueues to allocate
5420  *
5421  *	Allocates a struct net_device with private data area for driver use
5422  *	and performs basic initialization.  Also allocates subqueue structs
5423  *	for each queue on the device at the end of the netdevice.
5424  */
5425 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5426 		void (*setup)(struct net_device *), unsigned int queue_count)
5427 {
5428 	struct netdev_queue *tx;
5429 	struct net_device *dev;
5430 	size_t alloc_size;
5431 	struct net_device *p;
5432 
5433 	BUG_ON(strlen(name) >= sizeof(dev->name));
5434 
5435 	alloc_size = sizeof(struct net_device);
5436 	if (sizeof_priv) {
5437 		/* ensure 32-byte alignment of private area */
5438 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5439 		alloc_size += sizeof_priv;
5440 	}
5441 	/* ensure 32-byte alignment of whole construct */
5442 	alloc_size += NETDEV_ALIGN - 1;
5443 
5444 	p = kzalloc(alloc_size, GFP_KERNEL);
5445 	if (!p) {
5446 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5447 		return NULL;
5448 	}
5449 
5450 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5451 	if (!tx) {
5452 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5453 		       "tx qdiscs.\n");
5454 		goto free_p;
5455 	}
5456 
5457 
5458 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5459 	dev->padded = (char *)dev - (char *)p;
5460 
5461 	if (dev_addr_init(dev))
5462 		goto free_tx;
5463 
5464 	dev_mc_init(dev);
5465 	dev_uc_init(dev);
5466 
5467 	dev_net_set(dev, &init_net);
5468 
5469 	dev->_tx = tx;
5470 	dev->num_tx_queues = queue_count;
5471 	dev->real_num_tx_queues = queue_count;
5472 
5473 #ifdef CONFIG_RPS
5474 	dev->num_rx_queues = queue_count;
5475 #endif
5476 
5477 	dev->gso_max_size = GSO_MAX_SIZE;
5478 
5479 	netdev_init_queues(dev);
5480 
5481 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5482 	dev->ethtool_ntuple_list.count = 0;
5483 	INIT_LIST_HEAD(&dev->napi_list);
5484 	INIT_LIST_HEAD(&dev->unreg_list);
5485 	INIT_LIST_HEAD(&dev->link_watch_list);
5486 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5487 	setup(dev);
5488 	strcpy(dev->name, name);
5489 	return dev;
5490 
5491 free_tx:
5492 	kfree(tx);
5493 free_p:
5494 	kfree(p);
5495 	return NULL;
5496 }
5497 EXPORT_SYMBOL(alloc_netdev_mq);
5498 
5499 /**
5500  *	free_netdev - free network device
5501  *	@dev: device
5502  *
5503  *	This function does the last stage of destroying an allocated device
5504  * 	interface. The reference to the device object is released.
5505  *	If this is the last reference then it will be freed.
5506  */
5507 void free_netdev(struct net_device *dev)
5508 {
5509 	struct napi_struct *p, *n;
5510 
5511 	release_net(dev_net(dev));
5512 
5513 	kfree(dev->_tx);
5514 
5515 	/* Flush device addresses */
5516 	dev_addr_flush(dev);
5517 
5518 	/* Clear ethtool n-tuple list */
5519 	ethtool_ntuple_flush(dev);
5520 
5521 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5522 		netif_napi_del(p);
5523 
5524 	/*  Compatibility with error handling in drivers */
5525 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5526 		kfree((char *)dev - dev->padded);
5527 		return;
5528 	}
5529 
5530 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5531 	dev->reg_state = NETREG_RELEASED;
5532 
5533 	/* will free via device release */
5534 	put_device(&dev->dev);
5535 }
5536 EXPORT_SYMBOL(free_netdev);
5537 
5538 /**
5539  *	synchronize_net -  Synchronize with packet receive processing
5540  *
5541  *	Wait for packets currently being received to be done.
5542  *	Does not block later packets from starting.
5543  */
5544 void synchronize_net(void)
5545 {
5546 	might_sleep();
5547 	synchronize_rcu();
5548 }
5549 EXPORT_SYMBOL(synchronize_net);
5550 
5551 /**
5552  *	unregister_netdevice_queue - remove device from the kernel
5553  *	@dev: device
5554  *	@head: list
5555  *
5556  *	This function shuts down a device interface and removes it
5557  *	from the kernel tables.
5558  *	If @head is not %NULL, the device is queued to be unregistered later.
5559  *
5560  *	Callers must hold the rtnl semaphore.  You may want
5561  *	unregister_netdev() instead of this.
5562  */
5563 
5564 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5565 {
5566 	ASSERT_RTNL();
5567 
5568 	if (head) {
5569 		list_move_tail(&dev->unreg_list, head);
5570 	} else {
5571 		rollback_registered(dev);
5572 		/* Finish processing unregister after unlock */
5573 		net_set_todo(dev);
5574 	}
5575 }
5576 EXPORT_SYMBOL(unregister_netdevice_queue);
5577 
5578 /**
5579  *	unregister_netdevice_many - unregister many devices
5580  *	@head: list of devices
5581  */
5582 void unregister_netdevice_many(struct list_head *head)
5583 {
5584 	struct net_device *dev;
5585 
5586 	if (!list_empty(head)) {
5587 		rollback_registered_many(head);
5588 		list_for_each_entry(dev, head, unreg_list)
5589 			net_set_todo(dev);
5590 	}
5591 }
5592 EXPORT_SYMBOL(unregister_netdevice_many);
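
/*
 * Illustrative sketch: batching several unregistrations under a single
 * RTNL critical section, which is what rollback_registered_many() is
 * designed for:
 *
 *	LIST_HEAD(list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &list);
 *	unregister_netdevice_queue(dev2, &list);
 *	unregister_netdevice_many(&list);
 *	rtnl_unlock();
 */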
5593 
5594 /**
5595  *	unregister_netdev - remove device from the kernel
5596  *	@dev: device
5597  *
5598  *	This function shuts down a device interface and removes it
5599  *	from the kernel tables.
5600  *
5601  *	This is just a wrapper for unregister_netdevice that takes
5602  *	the rtnl semaphore.  In general you want to use this and not
5603  *	unregister_netdevice.
5604  */
5605 void unregister_netdev(struct net_device *dev)
5606 {
5607 	rtnl_lock();
5608 	unregister_netdevice(dev);
5609 	rtnl_unlock();
5610 }
5611 EXPORT_SYMBOL(unregister_netdev);
5612 
5613 /**
5614  *	dev_change_net_namespace - move device to a different network namespace
5615  *	@dev: device
5616  *	@net: network namespace
5617  *	@pat: If not NULL name pattern to try if the current device name
5618  *	      is already taken in the destination network namespace.
5619  *
5620  *	This function shuts down a device interface and moves it
5621  *	to a new network namespace. On success 0 is returned, on
5622  *	a failure a negative errno code is returned.
5623  *
5624  *	Callers must hold the rtnl semaphore.
5625  */
5626 
5627 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5628 {
5629 	int err;
5630 
5631 	ASSERT_RTNL();
5632 
5633 	/* Don't allow namespace local devices to be moved. */
5634 	err = -EINVAL;
5635 	if (dev->features & NETIF_F_NETNS_LOCAL)
5636 		goto out;
5637 
5638 	/* Ensure the device has been registered */
5639 	err = -EINVAL;
5640 	if (dev->reg_state != NETREG_REGISTERED)
5641 		goto out;
5642 
5643 	/* Get out if there is nothing to do */
5644 	err = 0;
5645 	if (net_eq(dev_net(dev), net))
5646 		goto out;
5647 
5648 	/* Pick the destination device name, and ensure
5649 	 * we can use it in the destination network namespace.
5650 	 */
5651 	err = -EEXIST;
5652 	if (__dev_get_by_name(net, dev->name)) {
5653 		/* We get here if we can't use the current device name */
5654 		if (!pat)
5655 			goto out;
5656 		if (dev_get_valid_name(dev, pat, 1))
5657 			goto out;
5658 	}
5659 
5660 	/*
5661 	 * And now a mini version of register_netdevice() and unregister_netdevice().
5662 	 */
5663 
5664 	/* If device is running close it first. */
5665 	dev_close(dev);
5666 
5667 	/* And unlink it from device chain */
5668 	err = -ENODEV;
5669 	unlist_netdevice(dev);
5670 
5671 	synchronize_net();
5672 
5673 	/* Shutdown queueing discipline. */
5674 	dev_shutdown(dev);
5675 
5676 	/* Notify protocols that we are about to destroy
5677 	   this device. They should clean up all of their state.
5678 
5679 	   Note that dev->reg_state stays at NETREG_REGISTERED.
5680 	   This is desired so that 8021q and macvlan know the
5681 	   device is just moving and can keep their slaves up.
5682 	*/
5683 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5684 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5685 
5686 	/*
5687 	 *	Flush the unicast and multicast chains
5688 	 */
5689 	dev_uc_flush(dev);
5690 	dev_mc_flush(dev);
5691 
5692 	/* Actually switch the network namespace */
5693 	dev_net_set(dev, net);
5694 
5695 	/* If there is an ifindex conflict assign a new one */
5696 	if (__dev_get_by_index(net, dev->ifindex)) {
5697 		int iflink = (dev->iflink == dev->ifindex);
5698 		dev->ifindex = dev_new_index(net);
5699 		if (iflink)
5700 			dev->iflink = dev->ifindex;
5701 	}
5702 
5703 	/* Fixup kobjects */
5704 	err = device_rename(&dev->dev, dev->name);
5705 	WARN_ON(err);
5706 
5707 	/* Add the device back in the hashes */
5708 	list_netdevice(dev);
5709 
5710 	/* Notify protocols, that a new device appeared. */
5711 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5712 
5713 	/*
5714 	 *	Prevent userspace races by waiting until the network
5715 	 *	device is fully set up before sending notifications.
5716 	 */
5717 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5718 
5719 	synchronize_net();
5720 	err = 0;
5721 out:
5722 	return err;
5723 }
5724 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5725 
5726 static int dev_cpu_callback(struct notifier_block *nfb,
5727 			    unsigned long action,
5728 			    void *ocpu)
5729 {
5730 	struct sk_buff **list_skb;
5731 	struct sk_buff *skb;
5732 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5733 	struct softnet_data *sd, *oldsd;
5734 
5735 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5736 		return NOTIFY_OK;
5737 
5738 	local_irq_disable();
5739 	cpu = smp_processor_id();
5740 	sd = &per_cpu(softnet_data, cpu);
5741 	oldsd = &per_cpu(softnet_data, oldcpu);
5742 
5743 	/* Find end of our completion_queue. */
5744 	list_skb = &sd->completion_queue;
5745 	while (*list_skb)
5746 		list_skb = &(*list_skb)->next;
5747 	/* Append completion queue from offline CPU. */
5748 	*list_skb = oldsd->completion_queue;
5749 	oldsd->completion_queue = NULL;
5750 
5751 	/* Append output queue from offline CPU. */
5752 	if (oldsd->output_queue) {
5753 		*sd->output_queue_tailp = oldsd->output_queue;
5754 		sd->output_queue_tailp = oldsd->output_queue_tailp;
5755 		oldsd->output_queue = NULL;
5756 		oldsd->output_queue_tailp = &oldsd->output_queue;
5757 	}
5758 
5759 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5760 	local_irq_enable();
5761 
5762 	/* Process offline CPU's input_pkt_queue */
5763 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5764 		netif_rx(skb);
5765 		input_queue_head_incr(oldsd);
5766 	}
5767 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5768 		netif_rx(skb);
5769 		input_queue_head_incr(oldsd);
5770 	}
5771 
5772 	return NOTIFY_OK;
5773 }
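/*
 * Illustrative sketch (not part of the original source): the general
 * shape of a CPU_DEAD notifier such as dev_cpu_callback() above.  The
 * per-cpu counter and function name are hypothetical; the real callback
 * is wired up below in net_dev_init() via hotcpu_notifier().
 */
#if 0	/* example only */
static DEFINE_PER_CPU(unsigned long, example_drops);

static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *ocpu)
{
	unsigned int cpu, oldcpu = (unsigned long)ocpu;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	/* Fold the dead CPU's state into this CPU's copy. */
	cpu = get_cpu();
	per_cpu(example_drops, cpu) += per_cpu(example_drops, oldcpu);
	per_cpu(example_drops, oldcpu) = 0;
	put_cpu();

	return NOTIFY_OK;
}
#endif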
5774 
5775 
5776 /**
5777  *	netdev_increment_features - add one device's features to a feature set
5778  *	@all: current feature set
5779  *	@one: new feature set
5780  *	@mask: mask feature set
5781  *
5782  *	Computes a new feature set after adding a device with feature set
5783  *	@one to the master device with current feature set @all.  Will not
5784  *	enable anything that is off in @mask. Returns the new feature set.
5785  */
5786 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5787 					unsigned long mask)
5788 {
5789 	/* If device needs checksumming, downgrade to it. */
5790 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5791 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5792 	else if (mask & NETIF_F_ALL_CSUM) {
5793 		/* If one device supports v4/v6 checksumming, set for all. */
5794 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5795 		    !(all & NETIF_F_GEN_CSUM)) {
5796 			all &= ~NETIF_F_ALL_CSUM;
5797 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5798 		}
5799 
5800 		/* If one device supports hw checksumming, set for all. */
5801 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5802 			all &= ~NETIF_F_ALL_CSUM;
5803 			all |= NETIF_F_HW_CSUM;
5804 		}
5805 	}
5806 
5807 	one |= NETIF_F_ALL_CSUM;
5808 
5809 	one |= all & NETIF_F_ONE_FOR_ALL;
5810 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5811 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5812 
5813 	return all;
5814 }
5815 EXPORT_SYMBOL(netdev_increment_features);
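/*
 * Illustrative sketch (not part of the original source): how a master
 * driver (bonding-style) could recompute its feature set from a group of
 * slave devices using netdev_increment_features().  The array-based
 * iteration and function name are hypothetical; real drivers walk their
 * own slave lists and may apply additional masks.
 */
#if 0	/* example only */
static unsigned long example_compute_features(struct net_device *master,
					      struct net_device *slaves[],
					      int nr_slaves)
{
	unsigned long features = master->features & ~NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < nr_slaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}
#endif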
5816 
5817 static struct hlist_head *netdev_create_hash(void)
5818 {
5819 	int i;
5820 	struct hlist_head *hash;
5821 
5822 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5823 	if (hash != NULL)
5824 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5825 			INIT_HLIST_HEAD(&hash[i]);
5826 
5827 	return hash;
5828 }
5829 
5830 /* Initialize per network namespace state */
5831 static int __net_init netdev_init(struct net *net)
5832 {
5833 	INIT_LIST_HEAD(&net->dev_base_head);
5834 
5835 	net->dev_name_head = netdev_create_hash();
5836 	if (net->dev_name_head == NULL)
5837 		goto err_name;
5838 
5839 	net->dev_index_head = netdev_create_hash();
5840 	if (net->dev_index_head == NULL)
5841 		goto err_idx;
5842 
5843 	return 0;
5844 
5845 err_idx:
5846 	kfree(net->dev_name_head);
5847 err_name:
5848 	return -ENOMEM;
5849 }
5850 
5851 /**
5852  *	netdev_drivername - network driver name for the device
5853  *	@dev: network device
5854  *	@buffer: buffer for resulting name
5855  *	@len: size of buffer
5856  *
5857  *	Determine the name of the network driver handling the device.
5858  */
5859 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5860 {
5861 	const struct device_driver *driver;
5862 	const struct device *parent;
5863 
5864 	if (len <= 0 || !buffer)
5865 		return buffer;
5866 	buffer[0] = 0;
5867 
5868 	parent = dev->dev.parent;
5869 
5870 	if (!parent)
5871 		return buffer;
5872 
5873 	driver = parent->driver;
5874 	if (driver && driver->name)
5875 		strlcpy(buffer, driver->name, len);
5876 	return buffer;
5877 }
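/*
 * Illustrative sketch (not part of the original source): a typical use of
 * netdev_drivername() when logging about a device, similar in spirit to
 * the tx watchdog in the qdisc code.  The function name and message here
 * are made up.
 */
#if 0	/* example only */
static void example_report_stall(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_WARNING "%s (%s): transmit appears stalled\n",
	       dev->name, netdev_drivername(dev, drivername, 64));
}
#endif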
5878 
5879 static int __netdev_printk(const char *level, const struct net_device *dev,
5880 			   struct va_format *vaf)
5881 {
5882 	int r;
5883 
5884 	if (dev && dev->dev.parent)
5885 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
5886 			       netdev_name(dev), vaf);
5887 	else if (dev)
5888 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5889 	else
5890 		r = printk("%s(NULL net_device): %pV", level, vaf);
5891 
5892 	return r;
5893 }
5894 
5895 int netdev_printk(const char *level, const struct net_device *dev,
5896 		  const char *format, ...)
5897 {
5898 	struct va_format vaf;
5899 	va_list args;
5900 	int r;
5901 
5902 	va_start(args, format);
5903 
5904 	vaf.fmt = format;
5905 	vaf.va = &args;
5906 
5907 	r = __netdev_printk(level, dev, &vaf);
5908 	va_end(args);
5909 
5910 	return r;
5911 }
5912 EXPORT_SYMBOL(netdev_printk);
5913 
5914 #define define_netdev_printk_level(func, level)			\
5915 int func(const struct net_device *dev, const char *fmt, ...)	\
5916 {								\
5917 	int r;							\
5918 	struct va_format vaf;					\
5919 	va_list args;						\
5920 								\
5921 	va_start(args, fmt);					\
5922 								\
5923 	vaf.fmt = fmt;						\
5924 	vaf.va = &args;						\
5925 								\
5926 	r = __netdev_printk(level, dev, &vaf);			\
5927 	va_end(args);						\
5928 								\
5929 	return r;						\
5930 }								\
5931 EXPORT_SYMBOL(func);
5932 
5933 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5934 define_netdev_printk_level(netdev_alert, KERN_ALERT);
5935 define_netdev_printk_level(netdev_crit, KERN_CRIT);
5936 define_netdev_printk_level(netdev_err, KERN_ERR);
5937 define_netdev_printk_level(netdev_warn, KERN_WARNING);
5938 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
5939 define_netdev_printk_level(netdev_info, KERN_INFO);
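/*
 * Illustrative sketch (not part of the original source): drivers call the
 * level helpers generated above instead of raw printk() so that messages
 * are consistently prefixed with the bus/driver/device name.  The function
 * name and messages here are made up.
 */
#if 0	/* example only */
static void example_log_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
#endif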
5940 
5941 static void __net_exit netdev_exit(struct net *net)
5942 {
5943 	kfree(net->dev_name_head);
5944 	kfree(net->dev_index_head);
5945 }
5946 
5947 static struct pernet_operations __net_initdata netdev_net_ops = {
5948 	.init = netdev_init,
5949 	.exit = netdev_exit,
5950 };
5951 
5952 static void __net_exit default_device_exit(struct net *net)
5953 {
5954 	struct net_device *dev, *aux;
5955 	/*
5956 	 * Push all migratable network devices back to the
5957 	 * initial network namespace
5958 	 */
5959 	rtnl_lock();
5960 	for_each_netdev_safe(net, dev, aux) {
5961 		int err;
5962 		char fb_name[IFNAMSIZ];
5963 
5964 		/* Ignore unmovable devices (e.g. loopback) */
5965 		if (dev->features & NETIF_F_NETNS_LOCAL)
5966 			continue;
5967 
5968 		/* Leave virtual devices for the generic cleanup */
5969 		if (dev->rtnl_link_ops)
5970 			continue;
5971 
5972 		/* Push remaining network devices to init_net */
5973 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5974 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5975 		if (err) {
5976 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5977 				__func__, dev->name, err);
5978 			BUG();
5979 		}
5980 	}
5981 	rtnl_unlock();
5982 }
5983 
5984 static void __net_exit default_device_exit_batch(struct list_head *net_list)
5985 {
5986 	/* At exit, all network devices must be removed from a network
5987 	 * namespace.  Do this in the reverse order of registration.
5988 	 * Do this across as many network namespaces as possible to
5989 	 * improve batching efficiency.
5990 	 */
5991 	struct net_device *dev;
5992 	struct net *net;
5993 	LIST_HEAD(dev_kill_list);
5994 
5995 	rtnl_lock();
5996 	list_for_each_entry(net, net_list, exit_list) {
5997 		for_each_netdev_reverse(net, dev) {
5998 			if (dev->rtnl_link_ops)
5999 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6000 			else
6001 				unregister_netdevice_queue(dev, &dev_kill_list);
6002 		}
6003 	}
6004 	unregister_netdevice_many(&dev_kill_list);
6005 	rtnl_unlock();
6006 }
6007 
6008 static struct pernet_operations __net_initdata default_device_ops = {
6009 	.exit = default_device_exit,
6010 	.exit_batch = default_device_exit_batch,
6011 };
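/*
 * Illustrative sketch (not part of the original source): the pernet
 * registration pattern followed by netdev_net_ops and default_device_ops
 * above.  The ops, hooks, and init function below are hypothetical; a
 * real subsystem registers from its own init path, much as net_dev_init()
 * does further down.
 */
#if 0	/* example only */
static int __net_init example_net_init(struct net *net)
{
	/* Allocate and attach per-namespace state here. */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* Tear down per-namespace state here. */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_subsys_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}
#endif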
6012 
6013 /*
6014  *	Initialize the DEV module. At boot time this walks the device list and
6015  *	unhooks any devices that fail to initialise (normally hardware not
6016  *	present) and leaves us with a valid list of present and active devices.
6017  *
6018  */
6019 
6020 /*
6021  *       This is called single-threaded during boot, so there is no need
6022  *       to take the rtnl semaphore.
6023  */
6024 static int __init net_dev_init(void)
6025 {
6026 	int i, rc = -ENOMEM;
6027 
6028 	BUG_ON(!dev_boot_phase);
6029 
6030 	if (dev_proc_init())
6031 		goto out;
6032 
6033 	if (netdev_kobject_init())
6034 		goto out;
6035 
6036 	INIT_LIST_HEAD(&ptype_all);
6037 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6038 		INIT_LIST_HEAD(&ptype_base[i]);
6039 
6040 	if (register_pernet_subsys(&netdev_net_ops))
6041 		goto out;
6042 
6043 	/*
6044 	 *	Initialise the packet receive queues.
6045 	 */
6046 
6047 	for_each_possible_cpu(i) {
6048 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6049 
6050 		memset(sd, 0, sizeof(*sd));
6051 		skb_queue_head_init(&sd->input_pkt_queue);
6052 		skb_queue_head_init(&sd->process_queue);
6053 		sd->completion_queue = NULL;
6054 		INIT_LIST_HEAD(&sd->poll_list);
6055 		sd->output_queue = NULL;
6056 		sd->output_queue_tailp = &sd->output_queue;
6057 #ifdef CONFIG_RPS
6058 		sd->csd.func = rps_trigger_softirq;
6059 		sd->csd.info = sd;
6060 		sd->csd.flags = 0;
6061 		sd->cpu = i;
6062 #endif
6063 
6064 		sd->backlog.poll = process_backlog;
6065 		sd->backlog.weight = weight_p;
6066 		sd->backlog.gro_list = NULL;
6067 		sd->backlog.gro_count = 0;
6068 	}
6069 
6070 	dev_boot_phase = 0;
6071 
6072 	/* The loopback device is special: if any other network device
6073 	 * is present in a network namespace, the loopback device must
6074 	 * be present too.  Since we now dynamically allocate and free
6075 	 * the loopback device, ensure this invariant is maintained by
6076 	 * keeping the loopback device as the first device on the
6077 	 * list of network devices, so that it is the first device
6078 	 * that appears in a namespace and the last network device
6079 	 * that disappears.
6080 	 */
6081 	if (register_pernet_device(&loopback_net_ops))
6082 		goto out;
6083 
6084 	if (register_pernet_device(&default_device_ops))
6085 		goto out;
6086 
6087 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6088 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6089 
6090 	hotcpu_notifier(dev_cpu_callback, 0);
6091 	dst_init();
6092 	dev_mcast_init();
6093 	rc = 0;
6094 out:
6095 	return rc;
6096 }
6097 
6098 subsys_initcall(net_dev_init);
6099 
6100 static int __init initialize_hashrnd(void)
6101 {
6102 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6103 	return 0;
6104 }
6105 
6106 late_initcall_sync(initialize_hashrnd);
6107 
6108