xref: /linux-6.15/net/core/dev.c (revision f7ae8d59)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16? Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * See, for example usages, register_netdevice() and
196  * unregister_netdevice(), which must be called with the rtnl
197  * semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
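
/*
 * A minimal read-side sketch of the rules above, walking the device list
 * under RCU only.  The surrounding context ("net" and what is done with
 * each device) is assumed.
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev) {
 *		...	(dev is only guaranteed to stay valid inside the
 *			 read-side section; dev_hold() it if it must be
 *			 used after rcu_read_unlock())
 *	}
 *	rcu_read_unlock();
 */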
201 
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207 
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212 
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 	spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219 
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 	spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226 
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 	struct net *net = dev_net(dev);
231 
232 	ASSERT_RTNL();
233 
234 	write_lock_bh(&dev_base_lock);
235 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 	hlist_add_head_rcu(&dev->index_hlist,
238 			   dev_index_hash(net, dev->ifindex));
239 	write_unlock_bh(&dev_base_lock);
240 	return 0;
241 }
242 
243 /* Device list removal
244  * caller must respect a RCU grace period before freeing/reusing dev
245  */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 	ASSERT_RTNL();
249 
250 	/* Unlink dev from the device chain */
251 	write_lock_bh(&dev_base_lock);
252 	list_del_rcu(&dev->dev_list);
253 	hlist_del_rcu(&dev->name_hlist);
254 	hlist_del_rcu(&dev->index_hlist);
255 	write_unlock_bh(&dev_base_lock);
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 	 ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 	 "_xmit_VOID", "_xmit_NONE"};
312 
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315 
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 	int i;
319 
320 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 		if (netdev_lock_type[i] == dev_type)
322 			return i;
323 	/* the last key is used by default */
324 	return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326 
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 	int i;
331 
332 	i = netdev_lock_pos(dev_type);
333 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 				   netdev_lock_name[i]);
335 }
336 
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 	int i;
340 
341 	i = netdev_lock_pos(dev->type);
342 	lockdep_set_class_and_name(&dev->addr_list_lock,
343 				   &netdev_addr_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 						 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355 
356 /*******************************************************************************
357 
358 		Protocol management and registration routines
359 
360 *******************************************************************************/
361 
362 /*
363  *	Add a protocol ID to the list. Now that the input handler is
364  *	smarter we can dispense with all the messy stuff that used to be
365  *	here.
366  *
367  *	BEWARE!!! Protocol handlers that mangle input packets
368  *	MUST BE last in the hash buckets, and protocol handlers that
369  *	only inspect packets MUST start from the promiscuous ptype_all
370  *	chain in net_bh. That is true now; do not change it.
371  *	Explanation: if a handler that mangles the packet were first
372  *	on the list, it could not tell that the packet is cloned and
373  *	should be copied-on-write, so it would modify it in place and
374  *	subsequent readers would get a broken packet.
375  *							--ANK (980803)
376  */
377 
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 	if (pt->type == htons(ETH_P_ALL))
381 		return &ptype_all;
382 	else
383 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep, therefore it cannot
395  *	guarantee that all CPUs that are in the middle of receiving packets
396  *	will see the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
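
/*
 * A minimal usage sketch: a protocol module registers a receive hook for
 * its ethertype at init time and removes it on exit.  "my_proto_rcv" (a
 * function with the packet_type ->func signature) and ETH_P_MYPROTO are
 * assumed names, not real symbols.
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_MYPROTO),
 *		.func	= my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	at module init
 *	...
 *	dev_remove_pack(&my_ptype);	at module exit
 */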
408 
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *      The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPUs have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
461 
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds a new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine for
478  *	all netdevices.
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  * 	Check boot time settings for the device.
503  *	Any settings found are applied to the device so that they can be
504  *	used later during device probing.
505  *	Returns 0 if no settings are found, 1 if they are.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  * 	Check boot time settings for the base address of the device.
533  *	Returns 1 if a device with this name is already registered (so it
534  *	should not be probed again), the configured base address if one is
535  *	found, and 0 otherwise.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves the settings configured at boot time for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
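
/*
 * The "netdev=" boot parameter is parsed as a list of integers followed by
 * the device name, matching the ints[] usage above: irq, base I/O address,
 * memory start, memory end, then the name.  For example (values purely
 * illustrative):
 *
 *	netdev=5,0x300,eth0
 *
 * would record irq 5 and I/O base 0x300 for a device that will later be
 * registered as "eth0".
 */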
586 
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under the RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
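
/*
 * A minimal usage sketch: look a device up by name from process context
 * and drop the reference when done.  The namespace and name ("eth0" in
 * init_net) are assumed for illustration.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		...	use dev ...
 *		dev_put(dev);
 *	}
 */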
669 
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns a pointer to the device,
676  *	or %NULL if it is not found. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns a pointer to the device,
702  *	or %NULL if it is not found. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns a pointer to the device,
728  *	or NULL if it is not found. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745 
746 /**
747  *	dev_getbyhwaddr_rcu - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns a pointer to the
753  *	device, or NULL if it is not found.
754  *	The caller must hold RCU or RTNL.
755  *	The returned device has not had its ref count increased
756  *	and the caller must therefore be careful about locking
757  *
758  */
759 
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 				       const char *ha)
762 {
763 	struct net_device *dev;
764 
765 	for_each_netdev_rcu(net, dev)
766 		if (dev->type == type &&
767 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
768 			return dev;
769 
770 	return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
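
/*
 * A minimal usage sketch, assuming an Ethernet MAC address in "addr"
 * (ETH_ALEN bytes): the lookup itself takes no reference, so one is taken
 * with dev_hold() while still inside the RCU read-side section.
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, addr);
 *	if (dev)
 *		dev_hold(dev);
 *	rcu_read_unlock();
 */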
773 
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 	struct net_device *dev;
777 
778 	ASSERT_RTNL();
779 	for_each_netdev(net, dev)
780 		if (dev->type == type)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786 
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 	struct net_device *dev, *ret = NULL;
790 
791 	rcu_read_lock();
792 	for_each_netdev_rcu(net, dev)
793 		if (dev->type == type) {
794 			dev_hold(dev);
795 			ret = dev;
796 			break;
797 		}
798 	rcu_read_unlock();
799 	return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802 
803 /**
804  *	dev_get_by_flags_rcu - find any device with given flags
805  *	@net: the applicable net namespace
806  *	@if_flags: IFF_* values
807  *	@mask: bitmask of bits in if_flags to check
808  *
809  *	Search for any interface with the given flags. Returns a pointer to
810  *	the first matching device, or NULL if none is found. Must be called
811  *	inside rcu_read_lock(), and the result's refcount is unchanged.
812  */
813 
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 				    unsigned short mask)
816 {
817 	struct net_device *dev, *ret;
818 
819 	ret = NULL;
820 	for_each_netdev_rcu(net, dev) {
821 		if (((dev->flags ^ if_flags) & mask) == 0) {
822 			ret = dev;
823 			break;
824 		}
825 	}
826 	return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829 
830 /**
831  *	dev_valid_name - check if name is okay for network device
832  *	@name: name string
833  *
834  *	Network device names need to be valid file names
835  *	to allow sysfs to work.  We also disallow any kind of
836  *	whitespace.
837  */
838 int dev_valid_name(const char *name)
839 {
840 	if (*name == '\0')
841 		return 0;
842 	if (strlen(name) >= IFNAMSIZ)
843 		return 0;
844 	if (!strcmp(name, ".") || !strcmp(name, ".."))
845 		return 0;
846 
847 	while (*name) {
848 		if (*name == '/' || isspace(*name))
849 			return 0;
850 		name++;
851 	}
852 	return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855 
856 /**
857  *	__dev_alloc_name - allocate a name for a device
858  *	@net: network namespace to allocate the device name in
859  *	@name: name format string
860  *	@buf:  scratch buffer and result name string
861  *
862  *	Passed a format string - eg "lt%d" - it will try to find a suitable
863  *	id. It scans the list of devices to build up a free map, then chooses
864  *	the first empty slot. The caller must hold the dev_base or rtnl lock
865  *	while allocating the name and adding the device in order to avoid
866  *	duplicates.
867  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868  *	Returns the number of the unit assigned or a negative errno code.
869  */
870 
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 	int i = 0;
874 	const char *p;
875 	const int max_netdevices = 8*PAGE_SIZE;
876 	unsigned long *inuse;
877 	struct net_device *d;
878 
879 	p = strnchr(name, IFNAMSIZ-1, '%');
880 	if (p) {
881 		/*
882 		 * Verify the string as this thing may have come from
883 		 * the user.  There must be exactly one "%d" and no other "%"
884 		 * characters.
885 		 */
886 		if (p[1] != 'd' || strchr(p + 2, '%'))
887 			return -EINVAL;
888 
889 		/* Use one page as a bit array of possible slots */
890 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 		if (!inuse)
892 			return -ENOMEM;
893 
894 		for_each_netdev(net, d) {
895 			if (!sscanf(d->name, name, &i))
896 				continue;
897 			if (i < 0 || i >= max_netdevices)
898 				continue;
899 
900 			/*  avoid cases where sscanf is not exact inverse of printf */
901 			snprintf(buf, IFNAMSIZ, name, i);
902 			if (!strncmp(buf, d->name, IFNAMSIZ))
903 				set_bit(i, inuse);
904 		}
905 
906 		i = find_first_zero_bit(inuse, max_netdevices);
907 		free_page((unsigned long) inuse);
908 	}
909 
910 	if (buf != name)
911 		snprintf(buf, IFNAMSIZ, name, i);
912 	if (!__dev_get_by_name(net, buf))
913 		return i;
914 
915 	/* It is possible to run out of possible slots
916 	 * when the name is long and there isn't enough space left
917 	 * for the digits, or if all bits are used.
918 	 */
919 	return -ENFILE;
920 }
921 
922 /**
923  *	dev_alloc_name - allocate a name for a device
924  *	@dev: device
925  *	@name: name format string
926  *
927  *	Passed a format string - eg "lt%d" - it will try to find a suitable
928  *	id. It scans the list of devices to build up a free map, then chooses
929  *	the first empty slot. The caller must hold the dev_base or rtnl lock
930  *	while allocating the name and adding the device in order to avoid
931  *	duplicates.
932  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933  *	Returns the number of the unit assigned or a negative errno code.
934  */
935 
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 	char buf[IFNAMSIZ];
939 	struct net *net;
940 	int ret;
941 
942 	BUG_ON(!dev_net(dev));
943 	net = dev_net(dev);
944 	ret = __dev_alloc_name(net, name, buf);
945 	if (ret >= 0)
946 		strlcpy(dev->name, buf, IFNAMSIZ);
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
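
/*
 * A minimal usage sketch: before registration a driver can ask for the
 * first free unit of a name pattern ("dummy%d" here is just an example;
 * "out_free" is an assumed error label).
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out_free;
 *	...	on success dev->name now holds e.g. "dummy0" and err is
 *		the unit number that was assigned ...
 */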
950 
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
952 {
953 	struct net *net;
954 
955 	BUG_ON(!dev_net(dev));
956 	net = dev_net(dev);
957 
958 	if (!dev_valid_name(name))
959 		return -EINVAL;
960 
961 	if (fmt && strchr(name, '%'))
962 		return dev_alloc_name(dev, name);
963 	else if (__dev_get_by_name(net, name))
964 		return -EEXIST;
965 	else if (dev->name != name)
966 		strlcpy(dev->name, name, IFNAMSIZ);
967 
968 	return 0;
969 }
970 
971 /**
972  *	dev_change_name - change name of a device
973  *	@dev: device
974  *	@newname: name (or format string) must be at least IFNAMSIZ
975  *
976  *	Change the name of a device. A format string such as "eth%d"
977  *	may be passed for wildcarding.
978  */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 	char oldname[IFNAMSIZ];
982 	int err = 0;
983 	int ret;
984 	struct net *net;
985 
986 	ASSERT_RTNL();
987 	BUG_ON(!dev_net(dev));
988 
989 	net = dev_net(dev);
990 	if (dev->flags & IFF_UP)
991 		return -EBUSY;
992 
993 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 		return 0;
995 
996 	memcpy(oldname, dev->name, IFNAMSIZ);
997 
998 	err = dev_get_valid_name(dev, newname, 1);
999 	if (err < 0)
1000 		return err;
1001 
1002 rollback:
1003 	ret = device_rename(&dev->dev, dev->name);
1004 	if (ret) {
1005 		memcpy(dev->name, oldname, IFNAMSIZ);
1006 		return ret;
1007 	}
1008 
1009 	write_lock_bh(&dev_base_lock);
1010 	hlist_del(&dev->name_hlist);
1011 	write_unlock_bh(&dev_base_lock);
1012 
1013 	synchronize_rcu();
1014 
1015 	write_lock_bh(&dev_base_lock);
1016 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 	write_unlock_bh(&dev_base_lock);
1018 
1019 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 	ret = notifier_to_errno(ret);
1021 
1022 	if (ret) {
1023 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1024 		if (err >= 0) {
1025 			err = ret;
1026 			memcpy(dev->name, oldname, IFNAMSIZ);
1027 			goto rollback;
1028 		} else {
1029 			printk(KERN_ERR
1030 			       "%s: name change rollback failed: %d.\n",
1031 			       dev->name, ret);
1032 		}
1033 	}
1034 
1035 	return err;
1036 }
1037 
1038 /**
1039  *	dev_set_alias - change ifalias of a device
1040  *	@dev: device
1041  *	@alias: name up to IFALIASZ
1042  *	@len: limit of bytes to copy from info
1043  *
1044  *	Set ifalias for a device.
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 	ASSERT_RTNL();
1049 
1050 	if (len >= IFALIASZ)
1051 		return -EINVAL;
1052 
1053 	if (!len) {
1054 		if (dev->ifalias) {
1055 			kfree(dev->ifalias);
1056 			dev->ifalias = NULL;
1057 		}
1058 		return 0;
1059 	}
1060 
1061 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 	if (!dev->ifalias)
1063 		return -ENOMEM;
1064 
1065 	strlcpy(dev->ifalias, alias, len+1);
1066 	return len;
1067 }
1068 
1069 
1070 /**
1071  *	netdev_features_change - device changes features
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081 
1082 /**
1083  *	netdev_state_change - device changes state
1084  *	@dev: device to cause notification
1085  *
1086  *	Called to indicate a device has changed state. This function calls
1087  *	the notifier chains for netdev_chain and sends a NEWLINK message
1088  *	to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 	if (dev->flags & IFF_UP) {
1093 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 	}
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098 
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 	return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104 
1105 /**
1106  *	dev_load 	- load a network module
1107  *	@net: the applicable net namespace
1108  *	@name: name of interface
1109  *
1110  *	If a network interface is not present and the process has suitable
1111  *	privileges, this function loads the module. If module loading is not
1112  *	available in this kernel then it becomes a nop.
1113  */
1114 
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 	struct net_device *dev;
1118 
1119 	rcu_read_lock();
1120 	dev = dev_get_by_name_rcu(net, name);
1121 	rcu_read_unlock();
1122 
1123 	if (!dev && capable(CAP_NET_ADMIN))
1124 		request_module("%s", name);
1125 }
1126 EXPORT_SYMBOL(dev_load);
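
/*
 * A minimal usage sketch in the style of the ioctl path: try to autoload
 * a module for a not-yet-present interface name before looking it up.
 * "ifr" is an assumed struct ifreq copied in from userspace.
 *
 *	dev_load(net, ifr->ifr_name);
 *	rtnl_lock();
 *	dev = __dev_get_by_name(net, ifr->ifr_name);
 *	...
 *	rtnl_unlock();
 */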
1127 
1128 static int __dev_open(struct net_device *dev)
1129 {
1130 	const struct net_device_ops *ops = dev->netdev_ops;
1131 	int ret;
1132 
1133 	ASSERT_RTNL();
1134 
1135 	/*
1136 	 *	Is it even present?
1137 	 */
1138 	if (!netif_device_present(dev))
1139 		return -ENODEV;
1140 
1141 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1142 	ret = notifier_to_errno(ret);
1143 	if (ret)
1144 		return ret;
1145 
1146 	/*
1147 	 *	Call device private open method
1148 	 */
1149 	set_bit(__LINK_STATE_START, &dev->state);
1150 
1151 	if (ops->ndo_validate_addr)
1152 		ret = ops->ndo_validate_addr(dev);
1153 
1154 	if (!ret && ops->ndo_open)
1155 		ret = ops->ndo_open(dev);
1156 
1157 	/*
1158 	 *	If it went open OK then:
1159 	 */
1160 
1161 	if (ret)
1162 		clear_bit(__LINK_STATE_START, &dev->state);
1163 	else {
1164 		/*
1165 		 *	Set the flags.
1166 		 */
1167 		dev->flags |= IFF_UP;
1168 
1169 		/*
1170 		 *	Enable NET_DMA
1171 		 */
1172 		net_dmaengine_get();
1173 
1174 		/*
1175 		 *	Initialize multicasting status
1176 		 */
1177 		dev_set_rx_mode(dev);
1178 
1179 		/*
1180 		 *	Wakeup transmit queue engine
1181 		 */
1182 		dev_activate(dev);
1183 	}
1184 
1185 	return ret;
1186 }
1187 
1188 /**
1189  *	dev_open	- prepare an interface for use.
1190  *	@dev:	device to open
1191  *
1192  *	Takes a device from down to up state. The device's private open
1193  *	function is invoked and then the multicast lists are loaded. Finally
1194  *	the device is moved into the up state and a %NETDEV_UP message is
1195  *	sent to the netdev notifier chain.
1196  *
1197  *	Calling this function on an active interface is a nop. On a failure
1198  *	a negative errno code is returned.
1199  */
1200 int dev_open(struct net_device *dev)
1201 {
1202 	int ret;
1203 
1204 	/*
1205 	 *	Is it already up?
1206 	 */
1207 	if (dev->flags & IFF_UP)
1208 		return 0;
1209 
1210 	/*
1211 	 *	Open device
1212 	 */
1213 	ret = __dev_open(dev);
1214 	if (ret < 0)
1215 		return ret;
1216 
1217 	/*
1218 	 *	... and announce new interface.
1219 	 */
1220 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221 	call_netdevice_notifiers(NETDEV_UP, dev);
1222 
1223 	return ret;
1224 }
1225 EXPORT_SYMBOL(dev_open);
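
/*
 * A minimal usage sketch: both dev_open() and dev_close() expect the rtnl
 * semaphore to be held by the caller.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */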
1226 
1227 static int __dev_close_many(struct list_head *head)
1228 {
1229 	struct net_device *dev;
1230 
1231 	ASSERT_RTNL();
1232 	might_sleep();
1233 
1234 	list_for_each_entry(dev, head, unreg_list) {
1235 		/*
1236 		 *	Tell people we are going down, so that they can
1237 		 *	prepare for it while the device is still operating.
1238 		 */
1239 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1240 
1241 		clear_bit(__LINK_STATE_START, &dev->state);
1242 
1243 		/* Synchronize with any scheduled poll. We cannot touch the poll
1244 		 * list; it may even be on a different cpu. So just clear netif_running().
1245 		 *
1246 		 * dev->stop() will invoke napi_disable() on all of its
1247 		 * napi_struct instances on this device.
1248 		 */
1249 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1250 	}
1251 
1252 	dev_deactivate_many(head);
1253 
1254 	list_for_each_entry(dev, head, unreg_list) {
1255 		const struct net_device_ops *ops = dev->netdev_ops;
1256 
1257 		/*
1258 		 *	Call the device specific close. This cannot fail.
1259 		 *	It is only called while the device is UP.
1260 		 *
1261 		 *	We allow it to be called even after a DETACH hot-plug
1262 		 *	event.
1263 		 */
1264 		if (ops->ndo_stop)
1265 			ops->ndo_stop(dev);
1266 
1267 		/*
1268 		 *	Device is now down.
1269 		 */
1270 
1271 		dev->flags &= ~IFF_UP;
1272 
1273 		/*
1274 		 *	Shutdown NET_DMA
1275 		 */
1276 		net_dmaengine_put();
1277 	}
1278 
1279 	return 0;
1280 }
1281 
1282 static int __dev_close(struct net_device *dev)
1283 {
1284 	int retval;
1285 	LIST_HEAD(single);
1286 
1287 	list_add(&dev->unreg_list, &single);
1288 	retval = __dev_close_many(&single);
1289 	list_del(&single);
1290 	return retval;
1291 }
1292 
1293 static int dev_close_many(struct list_head *head)
1294 {
1295 	struct net_device *dev, *tmp;
1296 	LIST_HEAD(tmp_list);
1297 
1298 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1299 		if (!(dev->flags & IFF_UP))
1300 			list_move(&dev->unreg_list, &tmp_list);
1301 
1302 	__dev_close_many(head);
1303 
1304 	/*
1305 	 * Tell people we are down
1306 	 */
1307 	list_for_each_entry(dev, head, unreg_list) {
1308 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1309 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1310 	}
1311 
1312 	/* rollback_registered_many needs the complete original list */
1313 	list_splice(&tmp_list, head);
1314 	return 0;
1315 }
1316 
1317 /**
1318  *	dev_close - shutdown an interface.
1319  *	@dev: device to shutdown
1320  *
1321  *	This function moves an active device into down state. A
1322  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1323  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1324  *	chain.
1325  */
1326 int dev_close(struct net_device *dev)
1327 {
1328 	LIST_HEAD(single);
1329 
1330 	list_add(&dev->unreg_list, &single);
1331 	dev_close_many(&single);
1332 	list_del(&single);
1333 	return 0;
1334 }
1335 EXPORT_SYMBOL(dev_close);
1336 
1337 
1338 /**
1339  *	dev_disable_lro - disable Large Receive Offload on a device
1340  *	@dev: device
1341  *
1342  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1343  *	called under RTNL.  This is needed if received packets may be
1344  *	forwarded to another interface.
1345  */
1346 void dev_disable_lro(struct net_device *dev)
1347 {
1348 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1349 	    dev->ethtool_ops->set_flags) {
1350 		u32 flags = dev->ethtool_ops->get_flags(dev);
1351 		if (flags & ETH_FLAG_LRO) {
1352 			flags &= ~ETH_FLAG_LRO;
1353 			dev->ethtool_ops->set_flags(dev, flags);
1354 		}
1355 	}
1356 	WARN_ON(dev->features & NETIF_F_LRO);
1357 }
1358 EXPORT_SYMBOL(dev_disable_lro);
1359 
1360 
1361 static int dev_boot_phase = 1;
1362 
1363 /*
1364  *	Device change register/unregister. These are not inline or static
1365  *	as we export them to the world.
1366  */
1367 
1368 /**
1369  *	register_netdevice_notifier - register a network notifier block
1370  *	@nb: notifier
1371  *
1372  *	Register a notifier to be called when network device events occur.
1373  *	The notifier passed is linked into the kernel structures and must
1374  *	not be reused until it has been unregistered. A negative errno code
1375  *	is returned on a failure.
1376  *
1377  * 	When registered, all registration and up events are replayed
1378  *	to the new notifier so that it has a race-free
1379  *	view of the network device list.
1380  */
1381 
1382 int register_netdevice_notifier(struct notifier_block *nb)
1383 {
1384 	struct net_device *dev;
1385 	struct net_device *last;
1386 	struct net *net;
1387 	int err;
1388 
1389 	rtnl_lock();
1390 	err = raw_notifier_chain_register(&netdev_chain, nb);
1391 	if (err)
1392 		goto unlock;
1393 	if (dev_boot_phase)
1394 		goto unlock;
1395 	for_each_net(net) {
1396 		for_each_netdev(net, dev) {
1397 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1398 			err = notifier_to_errno(err);
1399 			if (err)
1400 				goto rollback;
1401 
1402 			if (!(dev->flags & IFF_UP))
1403 				continue;
1404 
1405 			nb->notifier_call(nb, NETDEV_UP, dev);
1406 		}
1407 	}
1408 
1409 unlock:
1410 	rtnl_unlock();
1411 	return err;
1412 
1413 rollback:
1414 	last = dev;
1415 	for_each_net(net) {
1416 		for_each_netdev(net, dev) {
1417 			if (dev == last)
1418 				break;
1419 
1420 			if (dev->flags & IFF_UP) {
1421 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1422 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1423 			}
1424 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1425 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1426 		}
1427 	}
1428 
1429 	raw_notifier_chain_unregister(&netdev_chain, nb);
1430 	goto unlock;
1431 }
1432 EXPORT_SYMBOL(register_netdevice_notifier);
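
/*
 * A minimal usage sketch: a subsystem watching device state.  The callback
 * and block names are assumed; note that the notifier data pointer passed
 * by call_netdevice_notifiers() below is the struct net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			...	break;
 *		case NETDEV_GOING_DOWN:
 *			...	break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */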
1433 
1434 /**
1435  *	unregister_netdevice_notifier - unregister a network notifier block
1436  *	@nb: notifier
1437  *
1438  *	Unregister a notifier previously registered by
1439  *	register_netdevice_notifier(). The notifier is unlinked from the
1440  *	kernel structures and may then be reused. A negative errno code
1441  *	is returned on a failure.
1442  */
1443 
1444 int unregister_netdevice_notifier(struct notifier_block *nb)
1445 {
1446 	int err;
1447 
1448 	rtnl_lock();
1449 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1450 	rtnl_unlock();
1451 	return err;
1452 }
1453 EXPORT_SYMBOL(unregister_netdevice_notifier);
1454 
1455 /**
1456  *	call_netdevice_notifiers - call all network notifier blocks
1457  *      @val: value passed unmodified to notifier function
1458  *      @dev: net_device pointer passed unmodified to notifier function
1459  *
1460  *	Call all network notifier blocks.  Parameters and return value
1461  *	are as for raw_notifier_call_chain().
1462  */
1463 
1464 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1465 {
1466 	ASSERT_RTNL();
1467 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1468 }
1469 
1470 /* When > 0 there are consumers of rx skb time stamps */
1471 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1472 
1473 void net_enable_timestamp(void)
1474 {
1475 	atomic_inc(&netstamp_needed);
1476 }
1477 EXPORT_SYMBOL(net_enable_timestamp);
1478 
1479 void net_disable_timestamp(void)
1480 {
1481 	atomic_dec(&netstamp_needed);
1482 }
1483 EXPORT_SYMBOL(net_disable_timestamp);
1484 
1485 static inline void net_timestamp_set(struct sk_buff *skb)
1486 {
1487 	if (atomic_read(&netstamp_needed))
1488 		__net_timestamp(skb);
1489 	else
1490 		skb->tstamp.tv64 = 0;
1491 }
1492 
1493 static inline void net_timestamp_check(struct sk_buff *skb)
1494 {
1495 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1496 		__net_timestamp(skb);
1497 }
1498 
1499 /**
1500  * dev_forward_skb - loopback an skb to another netif
1501  *
1502  * @dev: destination network device
1503  * @skb: buffer to forward
1504  *
1505  * return values:
1506  *	NET_RX_SUCCESS	(no congestion)
1507  *	NET_RX_DROP     (packet was dropped, but freed)
1508  *
1509  * dev_forward_skb can be used for injecting an skb from the
1510  * start_xmit function of one device into the receive queue
1511  * of another device.
1512  *
1513  * The receiving device may be in another namespace, so
1514  * we have to clear all information in the skb that could
1515  * impact namespace isolation.
1516  */
1517 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1518 {
1519 	skb_orphan(skb);
1520 	nf_reset(skb);
1521 
1522 	if (unlikely(!(dev->flags & IFF_UP) ||
1523 		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1524 		atomic_long_inc(&dev->rx_dropped);
1525 		kfree_skb(skb);
1526 		return NET_RX_DROP;
1527 	}
1528 	skb_set_dev(skb, dev);
1529 	skb->tstamp.tv64 = 0;
1530 	skb->pkt_type = PACKET_HOST;
1531 	skb->protocol = eth_type_trans(skb, dev);
1532 	return netif_rx(skb);
1533 }
1534 EXPORT_SYMBOL_GPL(dev_forward_skb);
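
/*
 * A minimal usage sketch: a veth-style driver handing a transmitted skb to
 * its peer device.  How the driver finds "peer" is assumed here;
 * dev_forward_skb() consumes the skb in both the success and the drop case.
 *
 *	static netdev_tx_t my_pair_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = ...;
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */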
1535 
1536 static inline int deliver_skb(struct sk_buff *skb,
1537 			      struct packet_type *pt_prev,
1538 			      struct net_device *orig_dev)
1539 {
1540 	atomic_inc(&skb->users);
1541 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1542 }
1543 
1544 /*
1545  *	Support routine. Sends outgoing frames to any network
1546  *	taps currently in use.
1547  */
1548 
1549 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1550 {
1551 	struct packet_type *ptype;
1552 	struct sk_buff *skb2 = NULL;
1553 	struct packet_type *pt_prev = NULL;
1554 
1555 	rcu_read_lock();
1556 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1557 		/* Never send packets back to the socket
1558 		 * they originated from - MvS ([email protected])
1559 		 */
1560 		if ((ptype->dev == dev || !ptype->dev) &&
1561 		    (ptype->af_packet_priv == NULL ||
1562 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1563 			if (pt_prev) {
1564 				deliver_skb(skb2, pt_prev, skb->dev);
1565 				pt_prev = ptype;
1566 				continue;
1567 			}
1568 
1569 			skb2 = skb_clone(skb, GFP_ATOMIC);
1570 			if (!skb2)
1571 				break;
1572 
1573 			net_timestamp_set(skb2);
1574 
1575 			/* skb->nh should be correctly
1576 			   set by sender, so that the second statement is
1577 			   just protection against buggy protocols.
1578 			 */
1579 			skb_reset_mac_header(skb2);
1580 
1581 			if (skb_network_header(skb2) < skb2->data ||
1582 			    skb2->network_header > skb2->tail) {
1583 				if (net_ratelimit())
1584 					printk(KERN_CRIT "protocol %04x is "
1585 					       "buggy, dev %s\n",
1586 					       ntohs(skb2->protocol),
1587 					       dev->name);
1588 				skb_reset_network_header(skb2);
1589 			}
1590 
1591 			skb2->transport_header = skb2->network_header;
1592 			skb2->pkt_type = PACKET_OUTGOING;
1593 			pt_prev = ptype;
1594 		}
1595 	}
1596 	if (pt_prev)
1597 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1598 	rcu_read_unlock();
1599 }
1600 
1601 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1602  * @dev: Network device
1603  * @txq: number of queues available
1604  *
1605  * If real_num_tx_queues is changed, the tc mappings may no longer be
1606  * valid. To resolve this, verify that each tc mapping is still valid and,
1607  * if it is not, reset the mapping; with no priorities mapping to that
1608  * offset/count pair it will no longer be used. In the worst case, when
1609  * TC0 itself is invalid, nothing can be done, so priority mappings are
1610  * disabled entirely. It is expected that drivers will fix this mapping,
1611  * if they can, before calling netif_set_real_num_tx_queues.
1612  */
1613 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1614 {
1615 	int i;
1616 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1617 
1618 	/* If TC0 is invalidated disable TC mapping */
1619 	if (tc->offset + tc->count > txq) {
1620 		pr_warning("Number of in use tx queues changed "
1621 			   "invalidating tc mappings. Priority "
1622 			   "traffic classification disabled!\n");
1623 		dev->num_tc = 0;
1624 		return;
1625 	}
1626 
1627 	/* Invalidated prio to tc mappings set to TC0 */
1628 	for (i = 1; i < TC_BITMASK + 1; i++) {
1629 		int q = netdev_get_prio_tc_map(dev, i);
1630 
1631 		tc = &dev->tc_to_txq[q];
1632 		if (tc->offset + tc->count > txq) {
1633 			pr_warning("Number of in use tx queues "
1634 				   "changed. Priority %i to tc "
1635 				   "mapping %i is no longer valid "
1636 				   "setting map to 0\n",
1637 				   i, q);
1638 			netdev_set_prio_tc_map(dev, i, 0);
1639 		}
1640 	}
1641 }
1642 
1643 /*
1644  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1645  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1646  */
1647 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1648 {
1649 	int rc;
1650 
1651 	if (txq < 1 || txq > dev->num_tx_queues)
1652 		return -EINVAL;
1653 
1654 	if (dev->reg_state == NETREG_REGISTERED ||
1655 	    dev->reg_state == NETREG_UNREGISTERING) {
1656 		ASSERT_RTNL();
1657 
1658 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1659 						  txq);
1660 		if (rc)
1661 			return rc;
1662 
1663 		if (dev->num_tc)
1664 			netif_setup_tc(dev, txq);
1665 
1666 		if (txq < dev->real_num_tx_queues)
1667 			qdisc_reset_all_tx_gt(dev, txq);
1668 	}
1669 
1670 	dev->real_num_tx_queues = txq;
1671 	return 0;
1672 }
1673 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
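
/*
 * A minimal usage sketch: a multiqueue driver that allocated num_tx_queues
 * up front but discovered at probe/open time that the hardware only
 * supports "hw_txqs" of them (an assumed driver variable).  For a device
 * that is already registered this must run under rtnl_lock(); the RX-side
 * counterpart netif_set_real_num_rx_queues() below is used the same way.
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, hw_txqs);
 *	rtnl_unlock();
 */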
1674 
1675 #ifdef CONFIG_RPS
1676 /**
1677  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1678  *	@dev: Network device
1679  *	@rxq: Actual number of RX queues
1680  *
1681  *	This must be called either with the rtnl_lock held or before
1682  *	registration of the net device.  Returns 0 on success, or a
1683  *	negative error code.  If called before registration, it always
1684  *	succeeds.
1685  */
1686 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1687 {
1688 	int rc;
1689 
1690 	if (rxq < 1 || rxq > dev->num_rx_queues)
1691 		return -EINVAL;
1692 
1693 	if (dev->reg_state == NETREG_REGISTERED) {
1694 		ASSERT_RTNL();
1695 
1696 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1697 						  rxq);
1698 		if (rc)
1699 			return rc;
1700 	}
1701 
1702 	dev->real_num_rx_queues = rxq;
1703 	return 0;
1704 }
1705 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1706 #endif
1707 
1708 static inline void __netif_reschedule(struct Qdisc *q)
1709 {
1710 	struct softnet_data *sd;
1711 	unsigned long flags;
1712 
1713 	local_irq_save(flags);
1714 	sd = &__get_cpu_var(softnet_data);
1715 	q->next_sched = NULL;
1716 	*sd->output_queue_tailp = q;
1717 	sd->output_queue_tailp = &q->next_sched;
1718 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1719 	local_irq_restore(flags);
1720 }
1721 
1722 void __netif_schedule(struct Qdisc *q)
1723 {
1724 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1725 		__netif_reschedule(q);
1726 }
1727 EXPORT_SYMBOL(__netif_schedule);
1728 
1729 void dev_kfree_skb_irq(struct sk_buff *skb)
1730 {
1731 	if (atomic_dec_and_test(&skb->users)) {
1732 		struct softnet_data *sd;
1733 		unsigned long flags;
1734 
1735 		local_irq_save(flags);
1736 		sd = &__get_cpu_var(softnet_data);
1737 		skb->next = sd->completion_queue;
1738 		sd->completion_queue = skb;
1739 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1740 		local_irq_restore(flags);
1741 	}
1742 }
1743 EXPORT_SYMBOL(dev_kfree_skb_irq);
1744 
1745 void dev_kfree_skb_any(struct sk_buff *skb)
1746 {
1747 	if (in_irq() || irqs_disabled())
1748 		dev_kfree_skb_irq(skb);
1749 	else
1750 		dev_kfree_skb(skb);
1751 }
1752 EXPORT_SYMBOL(dev_kfree_skb_any);
1753 
1754 
1755 /**
1756  * netif_device_detach - mark device as removed
1757  * @dev: network device
1758  *
1759  * Mark device as removed from system and therefore no longer available.
1760  */
1761 void netif_device_detach(struct net_device *dev)
1762 {
1763 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1764 	    netif_running(dev)) {
1765 		netif_tx_stop_all_queues(dev);
1766 	}
1767 }
1768 EXPORT_SYMBOL(netif_device_detach);
1769 
1770 /**
1771  * netif_device_attach - mark device as attached
1772  * @dev: network device
1773  *
1774  * Mark device as attached to the system and restart it if needed.
1775  */
1776 void netif_device_attach(struct net_device *dev)
1777 {
1778 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1779 	    netif_running(dev)) {
1780 		netif_tx_wake_all_queues(dev);
1781 		__netdev_watchdog_up(dev);
1782 	}
1783 }
1784 EXPORT_SYMBOL(netif_device_attach);
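
/*
 * A minimal usage sketch of the detach/attach pair in a driver's legacy
 * PCI power management hooks (function names assumed):
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		...	stop DMA, save state, power the chip down ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		...	power the chip up and reprogram it ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */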
1785 
1786 /**
1787  * skb_dev_set -- assign a new device to a buffer
1788  * @skb: buffer for the new device
1789  * @dev: network device
1790  *
1791  * If an skb is owned by a device already, we have to reset
1792  * all data private to the namespace the device belongs to
1793  * before assigning it a new device.
1794  */
1795 #ifdef CONFIG_NET_NS
1796 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1797 {
1798 	skb_dst_drop(skb);
1799 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1800 		secpath_reset(skb);
1801 		nf_reset(skb);
1802 		skb_init_secmark(skb);
1803 		skb->mark = 0;
1804 		skb->priority = 0;
1805 		skb->nf_trace = 0;
1806 		skb->ipvs_property = 0;
1807 #ifdef CONFIG_NET_SCHED
1808 		skb->tc_index = 0;
1809 #endif
1810 	}
1811 	skb->dev = dev;
1812 }
1813 EXPORT_SYMBOL(skb_set_dev);
1814 #endif /* CONFIG_NET_NS */
1815 
1816 /*
1817  * Invalidate hardware checksum when packet is to be mangled, and
1818  * complete checksum manually on outgoing path.
1819  */
1820 int skb_checksum_help(struct sk_buff *skb)
1821 {
1822 	__wsum csum;
1823 	int ret = 0, offset;
1824 
1825 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1826 		goto out_set_summed;
1827 
1828 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1829 		/* Let GSO fix up the checksum. */
1830 		goto out_set_summed;
1831 	}
1832 
1833 	offset = skb_checksum_start_offset(skb);
1834 	BUG_ON(offset >= skb_headlen(skb));
1835 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1836 
1837 	offset += skb->csum_offset;
1838 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1839 
1840 	if (skb_cloned(skb) &&
1841 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1842 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1843 		if (ret)
1844 			goto out;
1845 	}
1846 
1847 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1848 out_set_summed:
1849 	skb->ip_summed = CHECKSUM_NONE;
1850 out:
1851 	return ret;
1852 }
1853 EXPORT_SYMBOL(skb_checksum_help);
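
/*
 * A minimal usage sketch: a driver whose hardware cannot checksum a
 * particular CHECKSUM_PARTIAL packet falls back to doing it in software
 * before handing the frame to the NIC.  my_hw_can_csum() stands in for
 * whatever hardware-capability test the driver uses; "drop" is an assumed
 * error label.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */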
1854 
1855 /**
1856  *	skb_gso_segment - Perform segmentation on skb.
1857  *	@skb: buffer to segment
1858  *	@features: features for the output path (see dev->features)
1859  *
1860  *	This function segments the given skb and returns a list of segments.
1861  *
1862  *	It may return NULL if the skb requires no segmentation.  This is
1863  *	only possible when GSO is used for verifying header integrity.
1864  */
1865 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1866 {
1867 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1868 	struct packet_type *ptype;
1869 	__be16 type = skb->protocol;
1870 	int vlan_depth = ETH_HLEN;
1871 	int err;
1872 
1873 	while (type == htons(ETH_P_8021Q)) {
1874 		struct vlan_hdr *vh;
1875 
1876 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1877 			return ERR_PTR(-EINVAL);
1878 
1879 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1880 		type = vh->h_vlan_encapsulated_proto;
1881 		vlan_depth += VLAN_HLEN;
1882 	}
1883 
1884 	skb_reset_mac_header(skb);
1885 	skb->mac_len = skb->network_header - skb->mac_header;
1886 	__skb_pull(skb, skb->mac_len);
1887 
1888 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1889 		struct net_device *dev = skb->dev;
1890 		struct ethtool_drvinfo info = {};
1891 
1892 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1893 			dev->ethtool_ops->get_drvinfo(dev, &info);
1894 
1895 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1896 		     info.driver, dev ? dev->features : 0L,
1897 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1898 		     skb->len, skb->data_len, skb->ip_summed);
1899 
1900 		if (skb_header_cloned(skb) &&
1901 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1902 			return ERR_PTR(err);
1903 	}
1904 
1905 	rcu_read_lock();
1906 	list_for_each_entry_rcu(ptype,
1907 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1908 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1909 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1910 				err = ptype->gso_send_check(skb);
1911 				segs = ERR_PTR(err);
1912 				if (err || skb_gso_ok(skb, features))
1913 					break;
1914 				__skb_push(skb, (skb->data -
1915 						 skb_network_header(skb)));
1916 			}
1917 			segs = ptype->gso_segment(skb, features);
1918 			break;
1919 		}
1920 	}
1921 	rcu_read_unlock();
1922 
1923 	__skb_push(skb, skb->data - skb_mac_header(skb));
1924 
1925 	return segs;
1926 }
1927 EXPORT_SYMBOL(skb_gso_segment);
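
/*
 * Illustrative sketch (hypothetical code): a caller doing GSO in software
 * segments the skb and transmits each resulting segment, much as
 * dev_gso_segment() below chains them for the xmit path.  foo_xmit_one()
 * is an assumed per-segment transmit helper.
 *
 *	segs = skb_gso_segment(skb, 0);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (!segs)
 *		return foo_xmit_one(skb);
 *	kfree_skb(skb);
 *	do {
 *		struct sk_buff *nskb = segs;
 *
 *		segs = segs->next;
 *		nskb->next = NULL;
 *		foo_xmit_one(nskb);
 *	} while (segs);
 */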
1928 
1929 /* Take action when hardware reception checksum errors are detected. */
1930 #ifdef CONFIG_BUG
1931 void netdev_rx_csum_fault(struct net_device *dev)
1932 {
1933 	if (net_ratelimit()) {
1934 		printk(KERN_ERR "%s: hw csum failure.\n",
1935 			dev ? dev->name : "<unknown>");
1936 		dump_stack();
1937 	}
1938 }
1939 EXPORT_SYMBOL(netdev_rx_csum_fault);
1940 #endif
1941 
1942 /* Actually, we should eliminate this check as soon as we know that:
1943  * 1. An IOMMU is present and is able to map all of the memory.
1944  * 2. No high memory really exists on this machine.
1945  */
1946 
1947 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1948 {
1949 #ifdef CONFIG_HIGHMEM
1950 	int i;
1951 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1952 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1953 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1954 				return 1;
1955 	}
1956 
1957 	if (PCI_DMA_BUS_IS_PHYS) {
1958 		struct device *pdev = dev->dev.parent;
1959 
1960 		if (!pdev)
1961 			return 0;
1962 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1963 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1964 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1965 				return 1;
1966 		}
1967 	}
1968 #endif
1969 	return 0;
1970 }
1971 
1972 struct dev_gso_cb {
1973 	void (*destructor)(struct sk_buff *skb);
1974 };
1975 
1976 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1977 
1978 static void dev_gso_skb_destructor(struct sk_buff *skb)
1979 {
1980 	struct dev_gso_cb *cb;
1981 
1982 	do {
1983 		struct sk_buff *nskb = skb->next;
1984 
1985 		skb->next = nskb->next;
1986 		nskb->next = NULL;
1987 		kfree_skb(nskb);
1988 	} while (skb->next);
1989 
1990 	cb = DEV_GSO_CB(skb);
1991 	if (cb->destructor)
1992 		cb->destructor(skb);
1993 }
1994 
1995 /**
1996  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1997  *	@skb: buffer to segment
1998  *	@features: device features as applicable to this skb
1999  *
2000  *	This function segments the given skb and stores the list of segments
2001  *	in skb->next.
2002  */
2003 static int dev_gso_segment(struct sk_buff *skb, int features)
2004 {
2005 	struct sk_buff *segs;
2006 
2007 	segs = skb_gso_segment(skb, features);
2008 
2009 	/* Verifying header integrity only. */
2010 	if (!segs)
2011 		return 0;
2012 
2013 	if (IS_ERR(segs))
2014 		return PTR_ERR(segs);
2015 
2016 	skb->next = segs;
2017 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2018 	skb->destructor = dev_gso_skb_destructor;
2019 
2020 	return 0;
2021 }
2022 
2023 /*
2024  * Try to orphan skb early, right before transmission by the device.
2025  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2026  * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2027  */
2028 static inline void skb_orphan_try(struct sk_buff *skb)
2029 {
2030 	struct sock *sk = skb->sk;
2031 
2032 	if (sk && !skb_shinfo(skb)->tx_flags) {
2033 		/* skb_tx_hash() won't be able to get the sk,
2034 		 * so we copy sk_hash into skb->rxhash.
2035 		 */
2036 		if (!skb->rxhash)
2037 			skb->rxhash = sk->sk_hash;
2038 		skb_orphan(skb);
2039 	}
2040 }
2041 
2042 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2043 {
2044 	return ((features & NETIF_F_GEN_CSUM) ||
2045 		((features & NETIF_F_V4_CSUM) &&
2046 		 protocol == htons(ETH_P_IP)) ||
2047 		((features & NETIF_F_V6_CSUM) &&
2048 		 protocol == htons(ETH_P_IPV6)) ||
2049 		((features & NETIF_F_FCOE_CRC) &&
2050 		 protocol == htons(ETH_P_FCOE)));
2051 }
2052 
2053 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2054 {
2055 	if (!can_checksum_protocol(features, protocol)) {
2056 		features &= ~NETIF_F_ALL_CSUM;
2057 		features &= ~NETIF_F_SG;
2058 	} else if (illegal_highdma(skb->dev, skb)) {
2059 		features &= ~NETIF_F_SG;
2060 	}
2061 
2062 	return features;
2063 }
2064 
2065 u32 netif_skb_features(struct sk_buff *skb)
2066 {
2067 	__be16 protocol = skb->protocol;
2068 	u32 features = skb->dev->features;
2069 
2070 	if (protocol == htons(ETH_P_8021Q)) {
2071 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2072 		protocol = veh->h_vlan_encapsulated_proto;
2073 	} else if (!vlan_tx_tag_present(skb)) {
2074 		return harmonize_features(skb, protocol, features);
2075 	}
2076 
2077 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2078 
2079 	if (protocol != htons(ETH_P_8021Q)) {
2080 		return harmonize_features(skb, protocol, features);
2081 	} else {
2082 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2083 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2084 		return harmonize_features(skb, protocol, features);
2085 	}
2086 }
2087 EXPORT_SYMBOL(netif_skb_features);
2088 
2089 /*
2090  * Returns true if either:
2091  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2092  *	2. skb is fragmented and the device does not support SG, or if
2093  *	   at least one of the fragments is in highmem and the device
2094  *	   does not support DMA from it.
2095  */
2096 static inline int skb_needs_linearize(struct sk_buff *skb,
2097 				      int features)
2098 {
2099 	return skb_is_nonlinear(skb) &&
2100 			((skb_has_frag_list(skb) &&
2101 				!(features & NETIF_F_FRAGLIST)) ||
2102 			(skb_shinfo(skb)->nr_frags &&
2103 				!(features & NETIF_F_SG)));
2104 }
2105 
2106 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2107 			struct netdev_queue *txq)
2108 {
2109 	const struct net_device_ops *ops = dev->netdev_ops;
2110 	int rc = NETDEV_TX_OK;
2111 
2112 	if (likely(!skb->next)) {
2113 		u32 features;
2114 
2115 		/*
2116 		 * If the device doesn't need skb->dst, release it right now
2117 		 * while it's hot in this CPU's cache.
2118 		 */
2119 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2120 			skb_dst_drop(skb);
2121 
2122 		if (!list_empty(&ptype_all))
2123 			dev_queue_xmit_nit(skb, dev);
2124 
2125 		skb_orphan_try(skb);
2126 
2127 		features = netif_skb_features(skb);
2128 
2129 		if (vlan_tx_tag_present(skb) &&
2130 		    !(features & NETIF_F_HW_VLAN_TX)) {
2131 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2132 			if (unlikely(!skb))
2133 				goto out;
2134 
2135 			skb->vlan_tci = 0;
2136 		}
2137 
2138 		if (netif_needs_gso(skb, features)) {
2139 			if (unlikely(dev_gso_segment(skb, features)))
2140 				goto out_kfree_skb;
2141 			if (skb->next)
2142 				goto gso;
2143 		} else {
2144 			if (skb_needs_linearize(skb, features) &&
2145 			    __skb_linearize(skb))
2146 				goto out_kfree_skb;
2147 
2148 			/* If packet is not checksummed and device does not
2149 			 * support checksumming for this protocol, complete
2150 			 * checksumming here.
2151 			 */
2152 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2153 				skb_set_transport_header(skb,
2154 					skb_checksum_start_offset(skb));
2155 				if (!(features & NETIF_F_ALL_CSUM) &&
2156 				     skb_checksum_help(skb))
2157 					goto out_kfree_skb;
2158 			}
2159 		}
2160 
2161 		rc = ops->ndo_start_xmit(skb, dev);
2162 		trace_net_dev_xmit(skb, rc);
2163 		if (rc == NETDEV_TX_OK)
2164 			txq_trans_update(txq);
2165 		return rc;
2166 	}
2167 
2168 gso:
2169 	do {
2170 		struct sk_buff *nskb = skb->next;
2171 
2172 		skb->next = nskb->next;
2173 		nskb->next = NULL;
2174 
2175 		/*
2176 		 * If the device doesn't need nskb->dst, release it right now
2177 		 * while it's hot in this CPU's cache.
2178 		 */
2179 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2180 			skb_dst_drop(nskb);
2181 
2182 		rc = ops->ndo_start_xmit(nskb, dev);
2183 		trace_net_dev_xmit(nskb, rc);
2184 		if (unlikely(rc != NETDEV_TX_OK)) {
2185 			if (rc & ~NETDEV_TX_MASK)
2186 				goto out_kfree_gso_skb;
2187 			nskb->next = skb->next;
2188 			skb->next = nskb;
2189 			return rc;
2190 		}
2191 		txq_trans_update(txq);
2192 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2193 			return NETDEV_TX_BUSY;
2194 	} while (skb->next);
2195 
2196 out_kfree_gso_skb:
2197 	if (likely(skb->next == NULL))
2198 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2199 out_kfree_skb:
2200 	kfree_skb(skb);
2201 out:
2202 	return rc;
2203 }
2204 
2205 static u32 hashrnd __read_mostly;
2206 
2207 /*
2208  * Returns a Tx hash based on the given packet descriptor and the number
2209  * of Tx queues to be used as a distribution range.
2210  */
2211 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2212 		  unsigned int num_tx_queues)
2213 {
2214 	u32 hash;
2215 	u16 qoffset = 0;
2216 	u16 qcount = num_tx_queues;
2217 
2218 	if (skb_rx_queue_recorded(skb)) {
2219 		hash = skb_get_rx_queue(skb);
2220 		while (unlikely(hash >= num_tx_queues))
2221 			hash -= num_tx_queues;
2222 		return hash;
2223 	}
2224 
2225 	if (dev->num_tc) {
2226 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2227 		qoffset = dev->tc_to_txq[tc].offset;
2228 		qcount = dev->tc_to_txq[tc].count;
2229 	}
2230 
2231 	if (skb->sk && skb->sk->sk_hash)
2232 		hash = skb->sk->sk_hash;
2233 	else
2234 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2235 	hash = jhash_1word(hash, hashrnd);
2236 
2237 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2238 }
2239 EXPORT_SYMBOL(__skb_tx_hash);
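
/*
 * Worked example of the final scaling step above (illustrative note, not
 * from the original source): the 32-bit hash is mapped onto qcount queues
 * with a multiply-and-shift instead of a modulo,
 * queue = qoffset + (((u64)hash * qcount) >> 32).  For hash = 0x80000000
 * and qcount = 8 this gives (0x400000000ULL >> 32) = 4, so hash values are
 * spread uniformly over [qoffset, qoffset + qcount) with no division.
 */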
2240 
2241 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2242 {
2243 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2244 		if (net_ratelimit()) {
2245 			pr_warning("%s selects TX queue %d, but "
2246 				"real number of TX queues is %d\n",
2247 				dev->name, queue_index, dev->real_num_tx_queues);
2248 		}
2249 		return 0;
2250 	}
2251 	return queue_index;
2252 }
2253 
2254 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2255 {
2256 #ifdef CONFIG_XPS
2257 	struct xps_dev_maps *dev_maps;
2258 	struct xps_map *map;
2259 	int queue_index = -1;
2260 
2261 	rcu_read_lock();
2262 	dev_maps = rcu_dereference(dev->xps_maps);
2263 	if (dev_maps) {
2264 		map = rcu_dereference(
2265 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2266 		if (map) {
2267 			if (map->len == 1)
2268 				queue_index = map->queues[0];
2269 			else {
2270 				u32 hash;
2271 				if (skb->sk && skb->sk->sk_hash)
2272 					hash = skb->sk->sk_hash;
2273 				else
2274 					hash = (__force u16) skb->protocol ^
2275 					    skb->rxhash;
2276 				hash = jhash_1word(hash, hashrnd);
2277 				queue_index = map->queues[
2278 				    ((u64)hash * map->len) >> 32];
2279 			}
2280 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2281 				queue_index = -1;
2282 		}
2283 	}
2284 	rcu_read_unlock();
2285 
2286 	return queue_index;
2287 #else
2288 	return -1;
2289 #endif
2290 }
2291 
2292 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2293 					struct sk_buff *skb)
2294 {
2295 	int queue_index;
2296 	const struct net_device_ops *ops = dev->netdev_ops;
2297 
2298 	if (dev->real_num_tx_queues == 1)
2299 		queue_index = 0;
2300 	else if (ops->ndo_select_queue) {
2301 		queue_index = ops->ndo_select_queue(dev, skb);
2302 		queue_index = dev_cap_txqueue(dev, queue_index);
2303 	} else {
2304 		struct sock *sk = skb->sk;
2305 		queue_index = sk_tx_queue_get(sk);
2306 
2307 		if (queue_index < 0 || skb->ooo_okay ||
2308 		    queue_index >= dev->real_num_tx_queues) {
2309 			int old_index = queue_index;
2310 
2311 			queue_index = get_xps_queue(dev, skb);
2312 			if (queue_index < 0)
2313 				queue_index = skb_tx_hash(dev, skb);
2314 
2315 			if (queue_index != old_index && sk) {
2316 				struct dst_entry *dst =
2317 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2318 
2319 				if (dst && skb_dst(skb) == dst)
2320 					sk_tx_queue_set(sk, queue_index);
2321 			}
2322 		}
2323 	}
2324 
2325 	skb_set_queue_mapping(skb, queue_index);
2326 	return netdev_get_tx_queue(dev, queue_index);
2327 }
2328 
2329 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2330 				 struct net_device *dev,
2331 				 struct netdev_queue *txq)
2332 {
2333 	spinlock_t *root_lock = qdisc_lock(q);
2334 	bool contended;
2335 	int rc;
2336 
2337 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2338 	qdisc_calculate_pkt_len(skb, q);
2339 	/*
2340 	 * Heuristic to force contended enqueues to serialize on a
2341 	 * separate lock before trying to get the qdisc main lock.
2342 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more
2343 	 * often and dequeue packets faster.
2344 	 */
2345 	contended = qdisc_is_running(q);
2346 	if (unlikely(contended))
2347 		spin_lock(&q->busylock);
2348 
2349 	spin_lock(root_lock);
2350 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2351 		kfree_skb(skb);
2352 		rc = NET_XMIT_DROP;
2353 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2354 		   qdisc_run_begin(q)) {
2355 		/*
2356 		 * This is a work-conserving queue; there are no old skbs
2357 		 * waiting to be sent out; and the qdisc is not running -
2358 		 * xmit the skb directly.
2359 		 */
2360 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2361 			skb_dst_force(skb);
2362 
2363 		qdisc_bstats_update(q, skb);
2364 
2365 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2366 			if (unlikely(contended)) {
2367 				spin_unlock(&q->busylock);
2368 				contended = false;
2369 			}
2370 			__qdisc_run(q);
2371 		} else
2372 			qdisc_run_end(q);
2373 
2374 		rc = NET_XMIT_SUCCESS;
2375 	} else {
2376 		skb_dst_force(skb);
2377 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2378 		if (qdisc_run_begin(q)) {
2379 			if (unlikely(contended)) {
2380 				spin_unlock(&q->busylock);
2381 				contended = false;
2382 			}
2383 			__qdisc_run(q);
2384 		}
2385 	}
2386 	spin_unlock(root_lock);
2387 	if (unlikely(contended))
2388 		spin_unlock(&q->busylock);
2389 	return rc;
2390 }
2391 
2392 static DEFINE_PER_CPU(int, xmit_recursion);
2393 #define RECURSION_LIMIT 10
2394 
2395 /**
2396  *	dev_queue_xmit - transmit a buffer
2397  *	@skb: buffer to transmit
2398  *
2399  *	Queue a buffer for transmission to a network device. The caller must
2400  *	have set the device and priority and built the buffer before calling
2401  *	this function. The function can be called from an interrupt.
2402  *
2403  *	A negative errno code is returned on a failure. A success does not
2404  *	guarantee the frame will be transmitted as it may be dropped due
2405  *	to congestion or traffic shaping.
2406  *
2407  * -----------------------------------------------------------------------------------
2408  *      I notice this method can also return errors from the queue disciplines,
2409  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2410  *      be positive.
2411  *
2412  *      Regardless of the return value, the skb is consumed, so it is currently
2413  *      difficult to retry a send to this method.  (You can bump the ref count
2414  *      before sending to hold a reference for retry if you are careful.)
2415  *
2416  *      When calling this method, interrupts MUST be enabled.  This is because
2417  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2418  *          --BLG
2419  */
2420 int dev_queue_xmit(struct sk_buff *skb)
2421 {
2422 	struct net_device *dev = skb->dev;
2423 	struct netdev_queue *txq;
2424 	struct Qdisc *q;
2425 	int rc = -ENOMEM;
2426 
2427 	/* Disable soft irqs for various locks below. Also
2428 	 * stops preemption for RCU.
2429 	 */
2430 	rcu_read_lock_bh();
2431 
2432 	txq = dev_pick_tx(dev, skb);
2433 	q = rcu_dereference_bh(txq->qdisc);
2434 
2435 #ifdef CONFIG_NET_CLS_ACT
2436 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2437 #endif
2438 	trace_net_dev_queue(skb);
2439 	if (q->enqueue) {
2440 		rc = __dev_xmit_skb(skb, q, dev, txq);
2441 		goto out;
2442 	}
2443 
2444 	/* The device has no queue. Common case for software devices:
2445 	   loopback, all sorts of tunnels...
2446 
2447 	   Really, it is unlikely that netif_tx_lock protection is necessary
2448 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
2449 	   counters.)
2450 	   However, it is possible that they rely on the protection
2451 	   provided by us here.
2452 
2453 	   Check this and take the lock. It is not prone to deadlocks.
2454 	   Or take the noqueue qdisc shortcut, it is even simpler 8)
2455 	 */
2456 	if (dev->flags & IFF_UP) {
2457 		int cpu = smp_processor_id(); /* ok because BHs are off */
2458 
2459 		if (txq->xmit_lock_owner != cpu) {
2460 
2461 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2462 				goto recursion_alert;
2463 
2464 			HARD_TX_LOCK(dev, txq, cpu);
2465 
2466 			if (!netif_tx_queue_stopped(txq)) {
2467 				__this_cpu_inc(xmit_recursion);
2468 				rc = dev_hard_start_xmit(skb, dev, txq);
2469 				__this_cpu_dec(xmit_recursion);
2470 				if (dev_xmit_complete(rc)) {
2471 					HARD_TX_UNLOCK(dev, txq);
2472 					goto out;
2473 				}
2474 			}
2475 			HARD_TX_UNLOCK(dev, txq);
2476 			if (net_ratelimit())
2477 				printk(KERN_CRIT "Virtual device %s asks to "
2478 				       "queue packet!\n", dev->name);
2479 		} else {
2480 			/* Recursion is detected! It is possible,
2481 			/* Recursion detected! It is possible,
2482 			 * unfortunately.
2483 recursion_alert:
2484 			if (net_ratelimit())
2485 				printk(KERN_CRIT "Dead loop on virtual device "
2486 				       "%s, fix it urgently!\n", dev->name);
2487 		}
2488 	}
2489 
2490 	rc = -ENETDOWN;
2491 	rcu_read_unlock_bh();
2492 
2493 	kfree_skb(skb);
2494 	return rc;
2495 out:
2496 	rcu_read_unlock_bh();
2497 	return rc;
2498 }
2499 EXPORT_SYMBOL(dev_queue_xmit);
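
/*
 * Illustrative sketch (hypothetical code): a minimal sender fills in the
 * fields the comment above requires and then hands the skb off; the skb
 * is consumed regardless of the return value.  dev and dest are assumed
 * to be set up by the caller.
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	if (dev_hard_header(skb, dev, ETH_P_802_2, dest, NULL, skb->len) < 0) {
 *		kfree_skb(skb);
 *		return -EINVAL;
 *	}
 *	return dev_queue_xmit(skb);
 */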
2500 
2501 
2502 /*=======================================================================
2503 			Receiver routines
2504   =======================================================================*/
2505 
2506 int netdev_max_backlog __read_mostly = 1000;
2507 int netdev_tstamp_prequeue __read_mostly = 1;
2508 int netdev_budget __read_mostly = 300;
2509 int weight_p __read_mostly = 64;            /* old backlog weight */
2510 
2511 /* Called with irq disabled */
2512 static inline void ____napi_schedule(struct softnet_data *sd,
2513 				     struct napi_struct *napi)
2514 {
2515 	list_add_tail(&napi->poll_list, &sd->poll_list);
2516 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2517 }
2518 
2519 /*
2520  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2521  * and src/dst port numbers. Returns a non-zero hash number on success
2522  * and 0 on failure.
2523  */
2524 __u32 __skb_get_rxhash(struct sk_buff *skb)
2525 {
2526 	int nhoff, hash = 0, poff;
2527 	struct ipv6hdr *ip6;
2528 	struct iphdr *ip;
2529 	u8 ip_proto;
2530 	u32 addr1, addr2, ihl;
2531 	union {
2532 		u32 v32;
2533 		u16 v16[2];
2534 	} ports;
2535 
2536 	nhoff = skb_network_offset(skb);
2537 
2538 	switch (skb->protocol) {
2539 	case __constant_htons(ETH_P_IP):
2540 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2541 			goto done;
2542 
2543 		ip = (struct iphdr *) (skb->data + nhoff);
2544 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2545 			ip_proto = 0;
2546 		else
2547 			ip_proto = ip->protocol;
2548 		addr1 = (__force u32) ip->saddr;
2549 		addr2 = (__force u32) ip->daddr;
2550 		ihl = ip->ihl;
2551 		break;
2552 	case __constant_htons(ETH_P_IPV6):
2553 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2554 			goto done;
2555 
2556 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2557 		ip_proto = ip6->nexthdr;
2558 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2559 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2560 		ihl = (40 >> 2);
2561 		break;
2562 	default:
2563 		goto done;
2564 	}
2565 
2566 	ports.v32 = 0;
2567 	poff = proto_ports_offset(ip_proto);
2568 	if (poff >= 0) {
2569 		nhoff += ihl * 4 + poff;
2570 		if (pskb_may_pull(skb, nhoff + 4)) {
2571 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2572 			if (ports.v16[1] < ports.v16[0])
2573 				swap(ports.v16[0], ports.v16[1]);
2574 		}
2575 	}
2576 
2577 	/* get a consistent hash (same value on both flow directions) */
2578 	if (addr2 < addr1)
2579 		swap(addr1, addr2);
2580 
2581 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2582 	if (!hash)
2583 		hash = 1;
2584 
2585 done:
2586 	return hash;
2587 }
2588 EXPORT_SYMBOL(__skb_get_rxhash);
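
/*
 * Illustrative note (not part of the original file): callers normally go
 * through skb_get_rxhash(), which computes this hash lazily and caches it
 * in skb->rxhash; a driver that obtains a hash from hardware can store it
 * in skb->rxhash itself, in which case this software fallback never runs.
 *
 *	u32 hash = skb_get_rxhash(skb);
 */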
2589 
2590 #ifdef CONFIG_RPS
2591 
2592 /* One global table that all flow-based protocols share. */
2593 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2594 EXPORT_SYMBOL(rps_sock_flow_table);
2595 
2596 static struct rps_dev_flow *
2597 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2598 	    struct rps_dev_flow *rflow, u16 next_cpu)
2599 {
2600 	u16 tcpu;
2601 
2602 	tcpu = rflow->cpu = next_cpu;
2603 	if (tcpu != RPS_NO_CPU) {
2604 #ifdef CONFIG_RFS_ACCEL
2605 		struct netdev_rx_queue *rxqueue;
2606 		struct rps_dev_flow_table *flow_table;
2607 		struct rps_dev_flow *old_rflow;
2608 		u32 flow_id;
2609 		u16 rxq_index;
2610 		int rc;
2611 
2612 		/* Should we steer this flow to a different hardware queue? */
2613 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2614 		    !(dev->features & NETIF_F_NTUPLE))
2615 			goto out;
2616 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2617 		if (rxq_index == skb_get_rx_queue(skb))
2618 			goto out;
2619 
2620 		rxqueue = dev->_rx + rxq_index;
2621 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2622 		if (!flow_table)
2623 			goto out;
2624 		flow_id = skb->rxhash & flow_table->mask;
2625 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2626 							rxq_index, flow_id);
2627 		if (rc < 0)
2628 			goto out;
2629 		old_rflow = rflow;
2630 		rflow = &flow_table->flows[flow_id];
2631 		rflow->cpu = next_cpu;
2632 		rflow->filter = rc;
2633 		if (old_rflow->filter == rflow->filter)
2634 			old_rflow->filter = RPS_NO_FILTER;
2635 	out:
2636 #endif
2637 		rflow->last_qtail =
2638 			per_cpu(softnet_data, tcpu).input_queue_head;
2639 	}
2640 
2641 	return rflow;
2642 }
2643 
2644 /*
2645  * get_rps_cpu is called from netif_receive_skb and returns the target
2646  * CPU from the RPS map of the receiving queue for a given skb.
2647  * rcu_read_lock must be held on entry.
2648  */
2649 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2650 		       struct rps_dev_flow **rflowp)
2651 {
2652 	struct netdev_rx_queue *rxqueue;
2653 	struct rps_map *map;
2654 	struct rps_dev_flow_table *flow_table;
2655 	struct rps_sock_flow_table *sock_flow_table;
2656 	int cpu = -1;
2657 	u16 tcpu;
2658 
2659 	if (skb_rx_queue_recorded(skb)) {
2660 		u16 index = skb_get_rx_queue(skb);
2661 		if (unlikely(index >= dev->real_num_rx_queues)) {
2662 			WARN_ONCE(dev->real_num_rx_queues > 1,
2663 				  "%s received packet on queue %u, but number "
2664 				  "of RX queues is %u\n",
2665 				  dev->name, index, dev->real_num_rx_queues);
2666 			goto done;
2667 		}
2668 		rxqueue = dev->_rx + index;
2669 	} else
2670 		rxqueue = dev->_rx;
2671 
2672 	map = rcu_dereference(rxqueue->rps_map);
2673 	if (map) {
2674 		if (map->len == 1 &&
2675 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2676 			tcpu = map->cpus[0];
2677 			if (cpu_online(tcpu))
2678 				cpu = tcpu;
2679 			goto done;
2680 		}
2681 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2682 		goto done;
2683 	}
2684 
2685 	skb_reset_network_header(skb);
2686 	if (!skb_get_rxhash(skb))
2687 		goto done;
2688 
2689 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2690 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2691 	if (flow_table && sock_flow_table) {
2692 		u16 next_cpu;
2693 		struct rps_dev_flow *rflow;
2694 
2695 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2696 		tcpu = rflow->cpu;
2697 
2698 		next_cpu = sock_flow_table->ents[skb->rxhash &
2699 		    sock_flow_table->mask];
2700 
2701 		/*
2702 		 * If the desired CPU (where last recvmsg was done) is
2703 		 * different from current CPU (one in the rx-queue flow
2704 		 * table entry), switch if one of the following holds:
2705 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2706 		 *   - Current CPU is offline.
2707 		 *   - The current CPU's queue tail has advanced beyond the
2708 		 *     last packet that was enqueued using this table entry.
2709 		 *     This guarantees that all previous packets for the flow
2710 		 *     have been dequeued, thus preserving in order delivery.
2711 		 */
2712 		if (unlikely(tcpu != next_cpu) &&
2713 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2714 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2715 		      rflow->last_qtail)) >= 0))
2716 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2717 
2718 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2719 			*rflowp = rflow;
2720 			cpu = tcpu;
2721 			goto done;
2722 		}
2723 	}
2724 
2725 	if (map) {
2726 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2727 
2728 		if (cpu_online(tcpu)) {
2729 			cpu = tcpu;
2730 			goto done;
2731 		}
2732 	}
2733 
2734 done:
2735 	return cpu;
2736 }
2737 
2738 #ifdef CONFIG_RFS_ACCEL
2739 
2740 /**
2741  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2742  * @dev: Device on which the filter was set
2743  * @rxq_index: RX queue index
2744  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2745  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2746  *
2747  * Drivers that implement ndo_rx_flow_steer() should periodically call
2748  * this function for each installed filter and remove the filters for
2749  * which it returns %true.
2750  */
2751 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2752 			 u32 flow_id, u16 filter_id)
2753 {
2754 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2755 	struct rps_dev_flow_table *flow_table;
2756 	struct rps_dev_flow *rflow;
2757 	bool expire = true;
2758 	int cpu;
2759 
2760 	rcu_read_lock();
2761 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2762 	if (flow_table && flow_id <= flow_table->mask) {
2763 		rflow = &flow_table->flows[flow_id];
2764 		cpu = ACCESS_ONCE(rflow->cpu);
2765 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2766 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2767 			   rflow->last_qtail) <
2768 		     (int)(10 * flow_table->mask)))
2769 			expire = false;
2770 	}
2771 	rcu_read_unlock();
2772 	return expire;
2773 }
2774 EXPORT_SYMBOL(rps_may_expire_flow);
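
/*
 * Illustrative sketch (hypothetical driver code): the periodic scan the
 * comment above asks for.  struct foo_filter and the foo_* helpers are
 * assumptions made for the example only.
 *
 *	static void foo_expire_rfs_filters(struct foo_adapter *adapter)
 *	{
 *		struct foo_filter *f, *tmp;
 *
 *		list_for_each_entry_safe(f, tmp, &adapter->rfs_filters, list) {
 *			if (rps_may_expire_flow(adapter->netdev, f->rxq_index,
 *						f->flow_id, f->filter_id)) {
 *				foo_remove_hw_filter(adapter, f);
 *				list_del(&f->list);
 *				kfree(f);
 *			}
 *		}
 *	}
 */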
2775 
2776 #endif /* CONFIG_RFS_ACCEL */
2777 
2778 /* Called from hardirq (IPI) context */
2779 static void rps_trigger_softirq(void *data)
2780 {
2781 	struct softnet_data *sd = data;
2782 
2783 	____napi_schedule(sd, &sd->backlog);
2784 	sd->received_rps++;
2785 }
2786 
2787 #endif /* CONFIG_RPS */
2788 
2789 /*
2790  * Check whether this softnet_data structure belongs to another CPU.
2791  * If yes, queue it to our IPI list and return 1.
2792  * If no, return 0.
2793  */
2794 static int rps_ipi_queued(struct softnet_data *sd)
2795 {
2796 #ifdef CONFIG_RPS
2797 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2798 
2799 	if (sd != mysd) {
2800 		sd->rps_ipi_next = mysd->rps_ipi_list;
2801 		mysd->rps_ipi_list = sd;
2802 
2803 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2804 		return 1;
2805 	}
2806 #endif /* CONFIG_RPS */
2807 	return 0;
2808 }
2809 
2810 /*
2811  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2812  * queue (may be a remote CPU queue).
2813  */
2814 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2815 			      unsigned int *qtail)
2816 {
2817 	struct softnet_data *sd;
2818 	unsigned long flags;
2819 
2820 	sd = &per_cpu(softnet_data, cpu);
2821 
2822 	local_irq_save(flags);
2823 
2824 	rps_lock(sd);
2825 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2826 		if (skb_queue_len(&sd->input_pkt_queue)) {
2827 enqueue:
2828 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2829 			input_queue_tail_incr_save(sd, qtail);
2830 			rps_unlock(sd);
2831 			local_irq_restore(flags);
2832 			return NET_RX_SUCCESS;
2833 		}
2834 
2835 		/* Schedule NAPI for the backlog device.
2836 		 * We can use a non-atomic operation since we own the queue lock.
2837 		 */
2838 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2839 			if (!rps_ipi_queued(sd))
2840 				____napi_schedule(sd, &sd->backlog);
2841 		}
2842 		goto enqueue;
2843 	}
2844 
2845 	sd->dropped++;
2846 	rps_unlock(sd);
2847 
2848 	local_irq_restore(flags);
2849 
2850 	atomic_long_inc(&skb->dev->rx_dropped);
2851 	kfree_skb(skb);
2852 	return NET_RX_DROP;
2853 }
2854 
2855 /**
2856  *	netif_rx	-	post buffer to the network code
2857  *	@skb: buffer to post
2858  *
2859  *	This function receives a packet from a device driver and queues it for
2860  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2861  *	may be dropped during processing for congestion control or by the
2862  *	protocol layers.
2863  *
2864  *	return values:
2865  *	NET_RX_SUCCESS	(no congestion)
2866  *	NET_RX_DROP     (packet was dropped)
2867  *
2868  */
2869 
2870 int netif_rx(struct sk_buff *skb)
2871 {
2872 	int ret;
2873 
2874 	/* if netpoll wants it, pretend we never saw it */
2875 	if (netpoll_rx(skb))
2876 		return NET_RX_DROP;
2877 
2878 	if (netdev_tstamp_prequeue)
2879 		net_timestamp_check(skb);
2880 
2881 	trace_netif_rx(skb);
2882 #ifdef CONFIG_RPS
2883 	{
2884 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2885 		int cpu;
2886 
2887 		preempt_disable();
2888 		rcu_read_lock();
2889 
2890 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2891 		if (cpu < 0)
2892 			cpu = smp_processor_id();
2893 
2894 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2895 
2896 		rcu_read_unlock();
2897 		preempt_enable();
2898 	}
2899 #else
2900 	{
2901 		unsigned int qtail;
2902 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2903 		put_cpu();
2904 	}
2905 #endif
2906 	return ret;
2907 }
2908 EXPORT_SYMBOL(netif_rx);
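
/*
 * Illustrative sketch (hypothetical driver code): a legacy, non-NAPI
 * driver typically calls netif_rx() from its interrupt handler once the
 * frame has been copied into an skb.  rx_buf and len are assumed to come
 * from the hardware receive ring.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, len);
 *	if (unlikely(!skb)) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, len), rx_buf, len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */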
2909 
2910 int netif_rx_ni(struct sk_buff *skb)
2911 {
2912 	int err;
2913 
2914 	preempt_disable();
2915 	err = netif_rx(skb);
2916 	if (local_softirq_pending())
2917 		do_softirq();
2918 	preempt_enable();
2919 
2920 	return err;
2921 }
2922 EXPORT_SYMBOL(netif_rx_ni);
2923 
2924 static void net_tx_action(struct softirq_action *h)
2925 {
2926 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2927 
2928 	if (sd->completion_queue) {
2929 		struct sk_buff *clist;
2930 
2931 		local_irq_disable();
2932 		clist = sd->completion_queue;
2933 		sd->completion_queue = NULL;
2934 		local_irq_enable();
2935 
2936 		while (clist) {
2937 			struct sk_buff *skb = clist;
2938 			clist = clist->next;
2939 
2940 			WARN_ON(atomic_read(&skb->users));
2941 			trace_kfree_skb(skb, net_tx_action);
2942 			__kfree_skb(skb);
2943 		}
2944 	}
2945 
2946 	if (sd->output_queue) {
2947 		struct Qdisc *head;
2948 
2949 		local_irq_disable();
2950 		head = sd->output_queue;
2951 		sd->output_queue = NULL;
2952 		sd->output_queue_tailp = &sd->output_queue;
2953 		local_irq_enable();
2954 
2955 		while (head) {
2956 			struct Qdisc *q = head;
2957 			spinlock_t *root_lock;
2958 
2959 			head = head->next_sched;
2960 
2961 			root_lock = qdisc_lock(q);
2962 			if (spin_trylock(root_lock)) {
2963 				smp_mb__before_clear_bit();
2964 				clear_bit(__QDISC_STATE_SCHED,
2965 					  &q->state);
2966 				qdisc_run(q);
2967 				spin_unlock(root_lock);
2968 			} else {
2969 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2970 					      &q->state)) {
2971 					__netif_reschedule(q);
2972 				} else {
2973 					smp_mb__before_clear_bit();
2974 					clear_bit(__QDISC_STATE_SCHED,
2975 						  &q->state);
2976 				}
2977 			}
2978 		}
2979 	}
2980 }
2981 
2982 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2983     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2984 /* This hook is defined here for ATM LANE */
2985 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2986 			     unsigned char *addr) __read_mostly;
2987 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2988 #endif
2989 
2990 #ifdef CONFIG_NET_CLS_ACT
2991 /* TODO: Maybe we should just force sch_ingress to be compiled in
2992  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2993  * instructions (a compare and two extra stores) right now if we don't
2994  * have it enabled but do have CONFIG_NET_CLS_ACT.
2995  * NOTE: This doesn't remove any functionality; if you don't have
2996  * the ingress scheduler, you just can't add policies on ingress.
2997  *
2998  */
2999 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3000 {
3001 	struct net_device *dev = skb->dev;
3002 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3003 	int result = TC_ACT_OK;
3004 	struct Qdisc *q;
3005 
3006 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3007 		if (net_ratelimit())
3008 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3009 			       skb->skb_iif, dev->ifindex);
3010 		return TC_ACT_SHOT;
3011 	}
3012 
3013 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3014 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3015 
3016 	q = rxq->qdisc;
3017 	if (q != &noop_qdisc) {
3018 		spin_lock(qdisc_lock(q));
3019 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3020 			result = qdisc_enqueue_root(skb, q);
3021 		spin_unlock(qdisc_lock(q));
3022 	}
3023 
3024 	return result;
3025 }
3026 
3027 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3028 					 struct packet_type **pt_prev,
3029 					 int *ret, struct net_device *orig_dev)
3030 {
3031 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3032 
3033 	if (!rxq || rxq->qdisc == &noop_qdisc)
3034 		goto out;
3035 
3036 	if (*pt_prev) {
3037 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3038 		*pt_prev = NULL;
3039 	}
3040 
3041 	switch (ing_filter(skb, rxq)) {
3042 	case TC_ACT_SHOT:
3043 	case TC_ACT_STOLEN:
3044 		kfree_skb(skb);
3045 		return NULL;
3046 	}
3047 
3048 out:
3049 	skb->tc_verd = 0;
3050 	return skb;
3051 }
3052 #endif
3053 
3054 /**
3055  *	netdev_rx_handler_register - register receive handler
3056  *	@dev: device to register a handler for
3057  *	@rx_handler: receive handler to register
3058  *	@rx_handler_data: data pointer that is used by rx handler
3059  *
3060  *	Register a receive handler for a device. This handler will then be
3061  *	called from __netif_receive_skb. A negative errno code is returned
3062  *	on a failure.
3063  *
3064  *	The caller must hold the rtnl_mutex.
3065  */
3066 int netdev_rx_handler_register(struct net_device *dev,
3067 			       rx_handler_func_t *rx_handler,
3068 			       void *rx_handler_data)
3069 {
3070 	ASSERT_RTNL();
3071 
3072 	if (dev->rx_handler)
3073 		return -EBUSY;
3074 
3075 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3076 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3077 
3078 	return 0;
3079 }
3080 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
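
/*
 * Illustrative sketch (hypothetical code): a bridge-like stacking driver
 * registers its handler for a slave device while holding the rtnl lock,
 * as required above.  foo_handle_frame and foo_port are assumptions made
 * for the example.
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(slave_dev, foo_handle_frame,
 *					 foo_port);
 *	rtnl_unlock();
 *	if (err)
 *		goto err_no_handler;
 */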
3081 
3082 /**
3083  *	netdev_rx_handler_unregister - unregister receive handler
3084  *	@dev: device to unregister a handler from
3085  *
3086  *	Unregister a receive handler from a device.
3087  *
3088  *	The caller must hold the rtnl_mutex.
3089  */
3090 void netdev_rx_handler_unregister(struct net_device *dev)
3091 {
3092 
3093 	ASSERT_RTNL();
3094 	rcu_assign_pointer(dev->rx_handler, NULL);
3095 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3096 }
3097 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3098 
3099 static void vlan_on_bond_hook(struct sk_buff *skb)
3100 {
3101 	/*
3102 	 * Make sure ARP frames received on VLAN interfaces stacked on
3103 	 * bonding interfaces still make their way to any base bonding
3104 	 * device that may have registered for a specific ptype.
3105 	 */
3106 	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3107 	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3108 	    skb->protocol == htons(ETH_P_ARP)) {
3109 		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3110 
3111 		if (!skb2)
3112 			return;
3113 		skb2->dev = vlan_dev_real_dev(skb->dev);
3114 		netif_rx(skb2);
3115 	}
3116 }
3117 
3118 static int __netif_receive_skb(struct sk_buff *skb)
3119 {
3120 	struct packet_type *ptype, *pt_prev;
3121 	rx_handler_func_t *rx_handler;
3122 	struct net_device *orig_dev;
3123 	struct net_device *null_or_dev;
3124 	int ret = NET_RX_DROP;
3125 	__be16 type;
3126 
3127 	if (!netdev_tstamp_prequeue)
3128 		net_timestamp_check(skb);
3129 
3130 	trace_netif_receive_skb(skb);
3131 
3132 	/* if we've gotten here through NAPI, check netpoll */
3133 	if (netpoll_receive_skb(skb))
3134 		return NET_RX_DROP;
3135 
3136 	if (!skb->skb_iif)
3137 		skb->skb_iif = skb->dev->ifindex;
3138 	orig_dev = skb->dev;
3139 
3140 	skb_reset_network_header(skb);
3141 	skb_reset_transport_header(skb);
3142 	skb->mac_len = skb->network_header - skb->mac_header;
3143 
3144 	pt_prev = NULL;
3145 
3146 	rcu_read_lock();
3147 
3148 another_round:
3149 
3150 	__this_cpu_inc(softnet_data.processed);
3151 
3152 #ifdef CONFIG_NET_CLS_ACT
3153 	if (skb->tc_verd & TC_NCLS) {
3154 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3155 		goto ncls;
3156 	}
3157 #endif
3158 
3159 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3160 		if (!ptype->dev || ptype->dev == skb->dev) {
3161 			if (pt_prev)
3162 				ret = deliver_skb(skb, pt_prev, orig_dev);
3163 			pt_prev = ptype;
3164 		}
3165 	}
3166 
3167 #ifdef CONFIG_NET_CLS_ACT
3168 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3169 	if (!skb)
3170 		goto out;
3171 ncls:
3172 #endif
3173 
3174 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3175 	if (rx_handler) {
3176 		struct net_device *prev_dev;
3177 
3178 		if (pt_prev) {
3179 			ret = deliver_skb(skb, pt_prev, orig_dev);
3180 			pt_prev = NULL;
3181 		}
3182 		prev_dev = skb->dev;
3183 		skb = rx_handler(skb);
3184 		if (!skb)
3185 			goto out;
3186 		if (skb->dev != prev_dev)
3187 			goto another_round;
3188 	}
3189 
3190 	if (vlan_tx_tag_present(skb)) {
3191 		if (pt_prev) {
3192 			ret = deliver_skb(skb, pt_prev, orig_dev);
3193 			pt_prev = NULL;
3194 		}
3195 		if (vlan_hwaccel_do_receive(&skb)) {
3196 			ret = __netif_receive_skb(skb);
3197 			goto out;
3198 		} else if (unlikely(!skb))
3199 			goto out;
3200 	}
3201 
3202 	vlan_on_bond_hook(skb);
3203 
3204 	/* deliver only exact match when indicated */
3205 	null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
3206 
3207 	type = skb->protocol;
3208 	list_for_each_entry_rcu(ptype,
3209 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3210 		if (ptype->type == type &&
3211 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3212 		     ptype->dev == orig_dev)) {
3213 			if (pt_prev)
3214 				ret = deliver_skb(skb, pt_prev, orig_dev);
3215 			pt_prev = ptype;
3216 		}
3217 	}
3218 
3219 	if (pt_prev) {
3220 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3221 	} else {
3222 		atomic_long_inc(&skb->dev->rx_dropped);
3223 		kfree_skb(skb);
3224 		/* Jamal, now you will not be able to escape explaining
3225 		 * to me how you were going to use this. :-)
3226 		 */
3227 		ret = NET_RX_DROP;
3228 	}
3229 
3230 out:
3231 	rcu_read_unlock();
3232 	return ret;
3233 }
3234 
3235 /**
3236  *	netif_receive_skb - process receive buffer from network
3237  *	@skb: buffer to process
3238  *
3239  *	netif_receive_skb() is the main receive data processing function.
3240  *	It always succeeds. The buffer may be dropped during processing
3241  *	for congestion control or by the protocol layers.
3242  *
3243  *	This function may only be called from softirq context and interrupts
3244  *	should be enabled.
3245  *
3246  *	Return values (usually ignored):
3247  *	NET_RX_SUCCESS: no congestion
3248  *	NET_RX_DROP: packet was dropped
3249  */
3250 int netif_receive_skb(struct sk_buff *skb)
3251 {
3252 	if (netdev_tstamp_prequeue)
3253 		net_timestamp_check(skb);
3254 
3255 	if (skb_defer_rx_timestamp(skb))
3256 		return NET_RX_SUCCESS;
3257 
3258 #ifdef CONFIG_RPS
3259 	{
3260 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3261 		int cpu, ret;
3262 
3263 		rcu_read_lock();
3264 
3265 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3266 
3267 		if (cpu >= 0) {
3268 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3269 			rcu_read_unlock();
3270 		} else {
3271 			rcu_read_unlock();
3272 			ret = __netif_receive_skb(skb);
3273 		}
3274 
3275 		return ret;
3276 	}
3277 #else
3278 	return __netif_receive_skb(skb);
3279 #endif
3280 }
3281 EXPORT_SYMBOL(netif_receive_skb);
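
/*
 * Illustrative sketch (hypothetical driver code): a NAPI driver that does
 * not use GRO delivers each completed frame from its ->poll() callback,
 * in softirq context with interrupts enabled as required above.
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	netif_receive_skb(skb);
 */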
3282 
3283 /* Network device is going away; flush any packets still pending.
3284  * Called with irqs disabled.
3285  */
3286 static void flush_backlog(void *arg)
3287 {
3288 	struct net_device *dev = arg;
3289 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3290 	struct sk_buff *skb, *tmp;
3291 
3292 	rps_lock(sd);
3293 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3294 		if (skb->dev == dev) {
3295 			__skb_unlink(skb, &sd->input_pkt_queue);
3296 			kfree_skb(skb);
3297 			input_queue_head_incr(sd);
3298 		}
3299 	}
3300 	rps_unlock(sd);
3301 
3302 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3303 		if (skb->dev == dev) {
3304 			__skb_unlink(skb, &sd->process_queue);
3305 			kfree_skb(skb);
3306 			input_queue_head_incr(sd);
3307 		}
3308 	}
3309 }
3310 
3311 static int napi_gro_complete(struct sk_buff *skb)
3312 {
3313 	struct packet_type *ptype;
3314 	__be16 type = skb->protocol;
3315 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3316 	int err = -ENOENT;
3317 
3318 	if (NAPI_GRO_CB(skb)->count == 1) {
3319 		skb_shinfo(skb)->gso_size = 0;
3320 		goto out;
3321 	}
3322 
3323 	rcu_read_lock();
3324 	list_for_each_entry_rcu(ptype, head, list) {
3325 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3326 			continue;
3327 
3328 		err = ptype->gro_complete(skb);
3329 		break;
3330 	}
3331 	rcu_read_unlock();
3332 
3333 	if (err) {
3334 		WARN_ON(&ptype->list == head);
3335 		kfree_skb(skb);
3336 		return NET_RX_SUCCESS;
3337 	}
3338 
3339 out:
3340 	return netif_receive_skb(skb);
3341 }
3342 
3343 inline void napi_gro_flush(struct napi_struct *napi)
3344 {
3345 	struct sk_buff *skb, *next;
3346 
3347 	for (skb = napi->gro_list; skb; skb = next) {
3348 		next = skb->next;
3349 		skb->next = NULL;
3350 		napi_gro_complete(skb);
3351 	}
3352 
3353 	napi->gro_count = 0;
3354 	napi->gro_list = NULL;
3355 }
3356 EXPORT_SYMBOL(napi_gro_flush);
3357 
3358 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3359 {
3360 	struct sk_buff **pp = NULL;
3361 	struct packet_type *ptype;
3362 	__be16 type = skb->protocol;
3363 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3364 	int same_flow;
3365 	int mac_len;
3366 	enum gro_result ret;
3367 
3368 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3369 		goto normal;
3370 
3371 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3372 		goto normal;
3373 
3374 	rcu_read_lock();
3375 	list_for_each_entry_rcu(ptype, head, list) {
3376 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3377 			continue;
3378 
3379 		skb_set_network_header(skb, skb_gro_offset(skb));
3380 		mac_len = skb->network_header - skb->mac_header;
3381 		skb->mac_len = mac_len;
3382 		NAPI_GRO_CB(skb)->same_flow = 0;
3383 		NAPI_GRO_CB(skb)->flush = 0;
3384 		NAPI_GRO_CB(skb)->free = 0;
3385 
3386 		pp = ptype->gro_receive(&napi->gro_list, skb);
3387 		break;
3388 	}
3389 	rcu_read_unlock();
3390 
3391 	if (&ptype->list == head)
3392 		goto normal;
3393 
3394 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3395 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3396 
3397 	if (pp) {
3398 		struct sk_buff *nskb = *pp;
3399 
3400 		*pp = nskb->next;
3401 		nskb->next = NULL;
3402 		napi_gro_complete(nskb);
3403 		napi->gro_count--;
3404 	}
3405 
3406 	if (same_flow)
3407 		goto ok;
3408 
3409 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3410 		goto normal;
3411 
3412 	napi->gro_count++;
3413 	NAPI_GRO_CB(skb)->count = 1;
3414 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3415 	skb->next = napi->gro_list;
3416 	napi->gro_list = skb;
3417 	ret = GRO_HELD;
3418 
3419 pull:
3420 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3421 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3422 
3423 		BUG_ON(skb->end - skb->tail < grow);
3424 
3425 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3426 
3427 		skb->tail += grow;
3428 		skb->data_len -= grow;
3429 
3430 		skb_shinfo(skb)->frags[0].page_offset += grow;
3431 		skb_shinfo(skb)->frags[0].size -= grow;
3432 
3433 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3434 			put_page(skb_shinfo(skb)->frags[0].page);
3435 			memmove(skb_shinfo(skb)->frags,
3436 				skb_shinfo(skb)->frags + 1,
3437 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3438 		}
3439 	}
3440 
3441 ok:
3442 	return ret;
3443 
3444 normal:
3445 	ret = GRO_NORMAL;
3446 	goto pull;
3447 }
3448 EXPORT_SYMBOL(dev_gro_receive);
3449 
3450 static inline gro_result_t
3451 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3452 {
3453 	struct sk_buff *p;
3454 
3455 	for (p = napi->gro_list; p; p = p->next) {
3456 		unsigned long diffs;
3457 
3458 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3459 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3460 		diffs |= compare_ether_header(skb_mac_header(p),
3461 					      skb_gro_mac_header(skb));
3462 		NAPI_GRO_CB(p)->same_flow = !diffs;
3463 		NAPI_GRO_CB(p)->flush = 0;
3464 	}
3465 
3466 	return dev_gro_receive(napi, skb);
3467 }
3468 
3469 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3470 {
3471 	switch (ret) {
3472 	case GRO_NORMAL:
3473 		if (netif_receive_skb(skb))
3474 			ret = GRO_DROP;
3475 		break;
3476 
3477 	case GRO_DROP:
3478 	case GRO_MERGED_FREE:
3479 		kfree_skb(skb);
3480 		break;
3481 
3482 	case GRO_HELD:
3483 	case GRO_MERGED:
3484 		break;
3485 	}
3486 
3487 	return ret;
3488 }
3489 EXPORT_SYMBOL(napi_skb_finish);
3490 
3491 void skb_gro_reset_offset(struct sk_buff *skb)
3492 {
3493 	NAPI_GRO_CB(skb)->data_offset = 0;
3494 	NAPI_GRO_CB(skb)->frag0 = NULL;
3495 	NAPI_GRO_CB(skb)->frag0_len = 0;
3496 
3497 	if (skb->mac_header == skb->tail &&
3498 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3499 		NAPI_GRO_CB(skb)->frag0 =
3500 			page_address(skb_shinfo(skb)->frags[0].page) +
3501 			skb_shinfo(skb)->frags[0].page_offset;
3502 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3503 	}
3504 }
3505 EXPORT_SYMBOL(skb_gro_reset_offset);
3506 
3507 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3508 {
3509 	skb_gro_reset_offset(skb);
3510 
3511 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3512 }
3513 EXPORT_SYMBOL(napi_gro_receive);
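
/*
 * Illustrative sketch (hypothetical driver code): GRO-aware drivers
 * substitute napi_gro_receive() for netif_receive_skb() in their ->poll()
 * routine so that consecutive packets of the same flow can be merged
 * before reaching the protocol layers.  adapter->napi is an assumed
 * per-device NAPI instance.
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(&adapter->napi, skb);
 */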
3514 
3515 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3516 {
3517 	__skb_pull(skb, skb_headlen(skb));
3518 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3519 	skb->vlan_tci = 0;
3520 	skb->dev = napi->dev;
3521 	skb->skb_iif = 0;
3522 
3523 	napi->skb = skb;
3524 }
3525 
3526 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3527 {
3528 	struct sk_buff *skb = napi->skb;
3529 
3530 	if (!skb) {
3531 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3532 		if (skb)
3533 			napi->skb = skb;
3534 	}
3535 	return skb;
3536 }
3537 EXPORT_SYMBOL(napi_get_frags);
3538 
3539 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3540 			       gro_result_t ret)
3541 {
3542 	switch (ret) {
3543 	case GRO_NORMAL:
3544 	case GRO_HELD:
3545 		skb->protocol = eth_type_trans(skb, skb->dev);
3546 
3547 		if (ret == GRO_HELD)
3548 			skb_gro_pull(skb, -ETH_HLEN);
3549 		else if (netif_receive_skb(skb))
3550 			ret = GRO_DROP;
3551 		break;
3552 
3553 	case GRO_DROP:
3554 	case GRO_MERGED_FREE:
3555 		napi_reuse_skb(napi, skb);
3556 		break;
3557 
3558 	case GRO_MERGED:
3559 		break;
3560 	}
3561 
3562 	return ret;
3563 }
3564 EXPORT_SYMBOL(napi_frags_finish);
3565 
3566 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3567 {
3568 	struct sk_buff *skb = napi->skb;
3569 	struct ethhdr *eth;
3570 	unsigned int hlen;
3571 	unsigned int off;
3572 
3573 	napi->skb = NULL;
3574 
3575 	skb_reset_mac_header(skb);
3576 	skb_gro_reset_offset(skb);
3577 
3578 	off = skb_gro_offset(skb);
3579 	hlen = off + sizeof(*eth);
3580 	eth = skb_gro_header_fast(skb, off);
3581 	if (skb_gro_header_hard(skb, hlen)) {
3582 		eth = skb_gro_header_slow(skb, hlen, off);
3583 		if (unlikely(!eth)) {
3584 			napi_reuse_skb(napi, skb);
3585 			skb = NULL;
3586 			goto out;
3587 		}
3588 	}
3589 
3590 	skb_gro_pull(skb, sizeof(*eth));
3591 
3592 	/*
3593 	 * This works because the only protocols we care about don't require
3594 	 * special handling.  We'll fix it up properly at the end.
3595 	 */
3596 	skb->protocol = eth->h_proto;
3597 
3598 out:
3599 	return skb;
3600 }
3601 EXPORT_SYMBOL(napi_frags_skb);
3602 
3603 gro_result_t napi_gro_frags(struct napi_struct *napi)
3604 {
3605 	struct sk_buff *skb = napi_frags_skb(napi);
3606 
3607 	if (!skb)
3608 		return GRO_DROP;
3609 
3610 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3611 }
3612 EXPORT_SYMBOL(napi_gro_frags);
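
/*
 * Illustrative sketch (hypothetical driver code): drivers that receive
 * directly into pages obtain a header-less skb with napi_get_frags(),
 * attach the pages as fragments and feed the result to napi_gro_frags().
 * rx_page and rx_len are assumptions here.
 *
 *	skb = napi_get_frags(napi);
 *	if (unlikely(!skb))
 *		return;
 *	skb_fill_page_desc(skb, 0, rx_page, 0, rx_len);
 *	skb->len += rx_len;
 *	skb->data_len += rx_len;
 *	skb->truesize += rx_len;
 *	napi_gro_frags(napi);
 */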
3613 
3614 /*
3615  * net_rps_action sends any pending IPIs for RPS.
3616  * Note: called with local irq disabled, but exits with local irq enabled.
3617  */
3618 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3619 {
3620 #ifdef CONFIG_RPS
3621 	struct softnet_data *remsd = sd->rps_ipi_list;
3622 
3623 	if (remsd) {
3624 		sd->rps_ipi_list = NULL;
3625 
3626 		local_irq_enable();
3627 
3628 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3629 		while (remsd) {
3630 			struct softnet_data *next = remsd->rps_ipi_next;
3631 
3632 			if (cpu_online(remsd->cpu))
3633 				__smp_call_function_single(remsd->cpu,
3634 							   &remsd->csd, 0);
3635 			remsd = next;
3636 		}
3637 	} else
3638 #endif
3639 		local_irq_enable();
3640 }
3641 
3642 static int process_backlog(struct napi_struct *napi, int quota)
3643 {
3644 	int work = 0;
3645 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3646 
3647 #ifdef CONFIG_RPS
3648 	/* Check if we have pending IPIs; it is better to send them now
3649 	 * rather than waiting for net_rx_action() to end.
3650 	 */
3651 	if (sd->rps_ipi_list) {
3652 		local_irq_disable();
3653 		net_rps_action_and_irq_enable(sd);
3654 	}
3655 #endif
3656 	napi->weight = weight_p;
3657 	local_irq_disable();
3658 	while (work < quota) {
3659 		struct sk_buff *skb;
3660 		unsigned int qlen;
3661 
3662 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3663 			local_irq_enable();
3664 			__netif_receive_skb(skb);
3665 			local_irq_disable();
3666 			input_queue_head_incr(sd);
3667 			if (++work >= quota) {
3668 				local_irq_enable();
3669 				return work;
3670 			}
3671 		}
3672 
3673 		rps_lock(sd);
3674 		qlen = skb_queue_len(&sd->input_pkt_queue);
3675 		if (qlen)
3676 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3677 						   &sd->process_queue);
3678 
3679 		if (qlen < quota - work) {
3680 			/*
3681 			 * Inline a custom version of __napi_complete().
3682 			 * Only the current CPU owns and manipulates this napi,
3683 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3684 			 * so we can use a plain write instead of clear_bit(),
3685 			 * and we don't need an smp_mb() memory barrier.
3686 			 */
3687 			list_del(&napi->poll_list);
3688 			napi->state = 0;
3689 
3690 			quota = work + qlen;
3691 		}
3692 		rps_unlock(sd);
3693 	}
3694 	local_irq_enable();
3695 
3696 	return work;
3697 }
3698 
3699 /**
3700  * __napi_schedule - schedule for receive
3701  * @n: entry to schedule
3702  *
3703  * The entry's receive function will be scheduled to run
3704  */
3705 void __napi_schedule(struct napi_struct *n)
3706 {
3707 	unsigned long flags;
3708 
3709 	local_irq_save(flags);
3710 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3711 	local_irq_restore(flags);
3712 }
3713 EXPORT_SYMBOL(__napi_schedule);
3714 
3715 void __napi_complete(struct napi_struct *n)
3716 {
3717 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3718 	BUG_ON(n->gro_list);
3719 
3720 	list_del(&n->poll_list);
3721 	smp_mb__before_clear_bit();
3722 	clear_bit(NAPI_STATE_SCHED, &n->state);
3723 }
3724 EXPORT_SYMBOL(__napi_complete);
3725 
3726 void napi_complete(struct napi_struct *n)
3727 {
3728 	unsigned long flags;
3729 
3730 	/*
3731 	 * Don't let napi dequeue from the CPU poll list
3732 	 * just in case it's running on a different CPU.
3733 	 */
3734 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3735 		return;
3736 
3737 	napi_gro_flush(n);
3738 	local_irq_save(flags);
3739 	__napi_complete(n);
3740 	local_irq_restore(flags);
3741 }
3742 EXPORT_SYMBOL(napi_complete);
3743 
3744 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3745 		    int (*poll)(struct napi_struct *, int), int weight)
3746 {
3747 	INIT_LIST_HEAD(&napi->poll_list);
3748 	napi->gro_count = 0;
3749 	napi->gro_list = NULL;
3750 	napi->skb = NULL;
3751 	napi->poll = poll;
3752 	napi->weight = weight;
3753 	list_add(&napi->dev_list, &dev->napi_list);
3754 	napi->dev = dev;
3755 #ifdef CONFIG_NETPOLL
3756 	spin_lock_init(&napi->poll_lock);
3757 	napi->poll_owner = -1;
3758 #endif
3759 	set_bit(NAPI_STATE_SCHED, &napi->state);
3760 }
3761 EXPORT_SYMBOL(netif_napi_add);
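
/*
 * Illustrative sketch (hypothetical driver code): NAPI is wired up once at
 * probe time and completed from ->poll() when less than the budget was
 * consumed.  The foo_* names are assumptions for the example.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_adapter *adapter =
 *			container_of(napi, struct foo_adapter, napi);
 *		int work_done = foo_clean_rx(adapter, budget);
 *
 *		if (work_done < budget) {
 *			napi_complete(napi);
 *			foo_enable_irq(adapter);
 *		}
 *		return work_done;
 *	}
 *
 *	In probe:	netif_napi_add(netdev, &adapter->napi, foo_poll, 64);
 *	In ndo_open:	napi_enable(&adapter->napi);
 */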
3762 
3763 void netif_napi_del(struct napi_struct *napi)
3764 {
3765 	struct sk_buff *skb, *next;
3766 
3767 	list_del_init(&napi->dev_list);
3768 	napi_free_frags(napi);
3769 
3770 	for (skb = napi->gro_list; skb; skb = next) {
3771 		next = skb->next;
3772 		skb->next = NULL;
3773 		kfree_skb(skb);
3774 	}
3775 
3776 	napi->gro_list = NULL;
3777 	napi->gro_count = 0;
3778 }
3779 EXPORT_SYMBOL(netif_napi_del);
3780 
3781 static void net_rx_action(struct softirq_action *h)
3782 {
3783 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3784 	unsigned long time_limit = jiffies + 2;
3785 	int budget = netdev_budget;
3786 	void *have;
3787 
3788 	local_irq_disable();
3789 
3790 	while (!list_empty(&sd->poll_list)) {
3791 		struct napi_struct *n;
3792 		int work, weight;
3793 
3794 		/* If the softirq window is exhausted then punt.
3795 		 * Allow this to run for 2 jiffies, which will allow
3796 		 * an average latency of 1.5/HZ.
3797 		 */
3798 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3799 			goto softnet_break;
3800 
3801 		local_irq_enable();
3802 
3803 		/* Even though interrupts have been re-enabled, this
3804 		 * access is safe because interrupts can only add new
3805 		 * entries to the tail of this list, and only ->poll()
3806 		 * calls can remove this head entry from the list.
3807 		 */
3808 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3809 
3810 		have = netpoll_poll_lock(n);
3811 
3812 		weight = n->weight;
3813 
3814 		/* This NAPI_STATE_SCHED test is for avoiding a race
3815 		 * with netpoll's poll_napi().  Only the entity which
3816 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3817 		 * actually make the ->poll() call.  Therefore we avoid
3818 		 * accidentally calling ->poll() when NAPI is not scheduled.
3819 		 */
3820 		work = 0;
3821 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3822 			work = n->poll(n, weight);
3823 			trace_napi_poll(n);
3824 		}
3825 
3826 		WARN_ON_ONCE(work > weight);
3827 
3828 		budget -= work;
3829 
3830 		local_irq_disable();
3831 
3832 		/* Drivers must not modify the NAPI state if they
3833 		 * consume the entire weight.  In such cases this code
3834 		 * still "owns" the NAPI instance and therefore can
3835 		 * move the instance around on the list at will.
3836 		 */
3837 		if (unlikely(work == weight)) {
3838 			if (unlikely(napi_disable_pending(n))) {
3839 				local_irq_enable();
3840 				napi_complete(n);
3841 				local_irq_disable();
3842 			} else
3843 				list_move_tail(&n->poll_list, &sd->poll_list);
3844 		}
3845 
3846 		netpoll_poll_unlock(have);
3847 	}
3848 out:
3849 	net_rps_action_and_irq_enable(sd);
3850 
3851 #ifdef CONFIG_NET_DMA
3852 	/*
3853 	 * There may not be any more sk_buffs coming right now, so push
3854 	 * any pending DMA copies to hardware
3855 	 */
3856 	dma_issue_pending_all();
3857 #endif
3858 
3859 	return;
3860 
3861 softnet_break:
3862 	sd->time_squeeze++;
3863 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3864 	goto out;
3865 }
3866 
3867 static gifconf_func_t *gifconf_list[NPROTO];
3868 
3869 /**
3870  *	register_gifconf	-	register a SIOCGIF handler
3871  *	@family: Address family
3872  *	@gifconf: Function handler
3873  *
3874  *	Register protocol dependent address dumping routines. The handler
3875  *	that is passed must not be freed or reused until it has been replaced
3876  *	by another handler.
3877  */
3878 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3879 {
3880 	if (family >= NPROTO)
3881 		return -EINVAL;
3882 	gifconf_list[family] = gifconf;
3883 	return 0;
3884 }
3885 EXPORT_SYMBOL(register_gifconf);
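
/*
 * Illustrative sketch (not part of this file): how an address family could
 * hook into SIOCGIFCONF.  my_family_gifconf() and MY_FAMILY are hypothetical
 * (for instance, IPv4 registers its own handler for PF_INET).  Per the
 * gifconf_func_t contract used by dev_ifconf() below, the handler writes one
 * ifreq per address owned by the device into the user buffer and returns the
 * number of bytes used, or the size needed when the buffer is NULL.
 *
 *	static int my_family_gifconf(struct net_device *dev,
 *				     char __user *buf, int len);
 *
 *	static int __init my_family_init(void)
 *	{
 *		return register_gifconf(MY_FAMILY, my_family_gifconf);
 *	}
 */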
3886 
3887 
3888 /*
3889  *	Map an interface index to its name (SIOCGIFNAME)
3890  */
3891 
3892 /*
3893  *	We need this ioctl for efficient implementation of the
3894  *	if_indextoname() function required by the IPv6 API.  Without
3895  *	it, we would have to search all the interfaces to find a
3896  *	match.  --pb
3897  */
3898 
3899 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3900 {
3901 	struct net_device *dev;
3902 	struct ifreq ifr;
3903 
3904 	/*
3905 	 *	Fetch the caller's info block.
3906 	 */
3907 
3908 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3909 		return -EFAULT;
3910 
3911 	rcu_read_lock();
3912 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3913 	if (!dev) {
3914 		rcu_read_unlock();
3915 		return -ENODEV;
3916 	}
3917 
3918 	strcpy(ifr.ifr_name, dev->name);
3919 	rcu_read_unlock();
3920 
3921 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3922 		return -EFAULT;
3923 	return 0;
3924 }
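
/*
 * Illustrative user-space sketch (not part of this file): resolving an
 * ifindex to a name via SIOCGIFNAME, which is what if_indextoname()
 * ultimately relies on.  Needs the usual <sys/socket.h>, <sys/ioctl.h> and
 * <net/if.h> headers; error handling is omitted for brevity.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = 2;
 *	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("ifindex 2 is %s\n", ifr.ifr_name);
 *	close(fd);
 */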
3925 
3926 /*
3927  *	Perform a SIOCGIFCONF call. This structure will change
3928  *	size eventually, and there is nothing I can do about it.
3929  *	Thus we will need a 'compatibility mode'.
3930  */
3931 
3932 static int dev_ifconf(struct net *net, char __user *arg)
3933 {
3934 	struct ifconf ifc;
3935 	struct net_device *dev;
3936 	char __user *pos;
3937 	int len;
3938 	int total;
3939 	int i;
3940 
3941 	/*
3942 	 *	Fetch the caller's info block.
3943 	 */
3944 
3945 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3946 		return -EFAULT;
3947 
3948 	pos = ifc.ifc_buf;
3949 	len = ifc.ifc_len;
3950 
3951 	/*
3952 	 *	Loop over the interfaces, and write an info block for each.
3953 	 */
3954 
3955 	total = 0;
3956 	for_each_netdev(net, dev) {
3957 		for (i = 0; i < NPROTO; i++) {
3958 			if (gifconf_list[i]) {
3959 				int done;
3960 				if (!pos)
3961 					done = gifconf_list[i](dev, NULL, 0);
3962 				else
3963 					done = gifconf_list[i](dev, pos + total,
3964 							       len - total);
3965 				if (done < 0)
3966 					return -EFAULT;
3967 				total += done;
3968 			}
3969 		}
3970 	}
3971 
3972 	/*
3973 	 *	All done.  Write the updated control block back to the caller.
3974 	 */
3975 	ifc.ifc_len = total;
3976 
3977 	/*
3978 	 * 	Both BSD and Solaris return 0 here, so we do too.
3979 	 */
3980 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3981 }
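
/*
 * Illustrative user-space sketch (not part of this file): a typical
 * SIOCGIFCONF call.  As dev_ifconf() above supports, passing a NULL ifc_req
 * first returns the required length in ifc_len; a fixed array is used here
 * for brevity.
 *
 *	struct ifreq reqs[32];
 *	struct ifconf ifc;
 *	int i, n, fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = sizeof(reqs);
 *	ifc.ifc_req = reqs;
 *	if (ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
 *		n = ifc.ifc_len / sizeof(struct ifreq);
 *		for (i = 0; i < n; i++)
 *			printf("%s\n", reqs[i].ifr_name);
 *	}
 *	close(fd);
 */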
3982 
3983 #ifdef CONFIG_PROC_FS
3984 /*
3985  *	This is invoked by the /proc filesystem handler to display a device
3986  *	in detail.
3987  */
3988 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3989 	__acquires(RCU)
3990 {
3991 	struct net *net = seq_file_net(seq);
3992 	loff_t off;
3993 	struct net_device *dev;
3994 
3995 	rcu_read_lock();
3996 	if (!*pos)
3997 		return SEQ_START_TOKEN;
3998 
3999 	off = 1;
4000 	for_each_netdev_rcu(net, dev)
4001 		if (off++ == *pos)
4002 			return dev;
4003 
4004 	return NULL;
4005 }
4006 
4007 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4008 {
4009 	struct net_device *dev = v;
4010 
4011 	if (v == SEQ_START_TOKEN)
4012 		dev = first_net_device_rcu(seq_file_net(seq));
4013 	else
4014 		dev = next_net_device_rcu(dev);
4015 
4016 	++*pos;
4017 	return dev;
4018 }
4019 
4020 void dev_seq_stop(struct seq_file *seq, void *v)
4021 	__releases(RCU)
4022 {
4023 	rcu_read_unlock();
4024 }
4025 
4026 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4027 {
4028 	struct rtnl_link_stats64 temp;
4029 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4030 
4031 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4032 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4033 		   dev->name, stats->rx_bytes, stats->rx_packets,
4034 		   stats->rx_errors,
4035 		   stats->rx_dropped + stats->rx_missed_errors,
4036 		   stats->rx_fifo_errors,
4037 		   stats->rx_length_errors + stats->rx_over_errors +
4038 		    stats->rx_crc_errors + stats->rx_frame_errors,
4039 		   stats->rx_compressed, stats->multicast,
4040 		   stats->tx_bytes, stats->tx_packets,
4041 		   stats->tx_errors, stats->tx_dropped,
4042 		   stats->tx_fifo_errors, stats->collisions,
4043 		   stats->tx_carrier_errors +
4044 		    stats->tx_aborted_errors +
4045 		    stats->tx_window_errors +
4046 		    stats->tx_heartbeat_errors,
4047 		   stats->tx_compressed);
4048 }
4049 
4050 /*
4051  *	Called from the PROCfs module. This now uses the new arbitrary sized
4052  *	/proc/net interface to create /proc/net/dev
4053  */
4054 static int dev_seq_show(struct seq_file *seq, void *v)
4055 {
4056 	if (v == SEQ_START_TOKEN)
4057 		seq_puts(seq, "Inter-|   Receive                            "
4058 			      "                    |  Transmit\n"
4059 			      " face |bytes    packets errs drop fifo frame "
4060 			      "compressed multicast|bytes    packets errs "
4061 			      "drop fifo colls carrier compressed\n");
4062 	else
4063 		dev_seq_printf_stats(seq, v);
4064 	return 0;
4065 }
4066 
4067 static struct softnet_data *softnet_get_online(loff_t *pos)
4068 {
4069 	struct softnet_data *sd = NULL;
4070 
4071 	while (*pos < nr_cpu_ids)
4072 		if (cpu_online(*pos)) {
4073 			sd = &per_cpu(softnet_data, *pos);
4074 			break;
4075 		} else
4076 			++*pos;
4077 	return sd;
4078 }
4079 
4080 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4081 {
4082 	return softnet_get_online(pos);
4083 }
4084 
4085 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4086 {
4087 	++*pos;
4088 	return softnet_get_online(pos);
4089 }
4090 
4091 static void softnet_seq_stop(struct seq_file *seq, void *v)
4092 {
4093 }
4094 
4095 static int softnet_seq_show(struct seq_file *seq, void *v)
4096 {
4097 	struct softnet_data *sd = v;
4098 
4099 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4100 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4101 		   0, 0, 0, 0, /* was fastroute */
4102 		   sd->cpu_collision, sd->received_rps);
4103 	return 0;
4104 }
4105 
4106 static const struct seq_operations dev_seq_ops = {
4107 	.start = dev_seq_start,
4108 	.next  = dev_seq_next,
4109 	.stop  = dev_seq_stop,
4110 	.show  = dev_seq_show,
4111 };
4112 
4113 static int dev_seq_open(struct inode *inode, struct file *file)
4114 {
4115 	return seq_open_net(inode, file, &dev_seq_ops,
4116 			    sizeof(struct seq_net_private));
4117 }
4118 
4119 static const struct file_operations dev_seq_fops = {
4120 	.owner	 = THIS_MODULE,
4121 	.open    = dev_seq_open,
4122 	.read    = seq_read,
4123 	.llseek  = seq_lseek,
4124 	.release = seq_release_net,
4125 };
4126 
4127 static const struct seq_operations softnet_seq_ops = {
4128 	.start = softnet_seq_start,
4129 	.next  = softnet_seq_next,
4130 	.stop  = softnet_seq_stop,
4131 	.show  = softnet_seq_show,
4132 };
4133 
4134 static int softnet_seq_open(struct inode *inode, struct file *file)
4135 {
4136 	return seq_open(file, &softnet_seq_ops);
4137 }
4138 
4139 static const struct file_operations softnet_seq_fops = {
4140 	.owner	 = THIS_MODULE,
4141 	.open    = softnet_seq_open,
4142 	.read    = seq_read,
4143 	.llseek  = seq_lseek,
4144 	.release = seq_release,
4145 };
4146 
4147 static void *ptype_get_idx(loff_t pos)
4148 {
4149 	struct packet_type *pt = NULL;
4150 	loff_t i = 0;
4151 	int t;
4152 
4153 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4154 		if (i == pos)
4155 			return pt;
4156 		++i;
4157 	}
4158 
4159 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4160 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4161 			if (i == pos)
4162 				return pt;
4163 			++i;
4164 		}
4165 	}
4166 	return NULL;
4167 }
4168 
4169 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4170 	__acquires(RCU)
4171 {
4172 	rcu_read_lock();
4173 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4174 }
4175 
4176 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4177 {
4178 	struct packet_type *pt;
4179 	struct list_head *nxt;
4180 	int hash;
4181 
4182 	++*pos;
4183 	if (v == SEQ_START_TOKEN)
4184 		return ptype_get_idx(0);
4185 
4186 	pt = v;
4187 	nxt = pt->list.next;
4188 	if (pt->type == htons(ETH_P_ALL)) {
4189 		if (nxt != &ptype_all)
4190 			goto found;
4191 		hash = 0;
4192 		nxt = ptype_base[0].next;
4193 	} else
4194 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4195 
4196 	while (nxt == &ptype_base[hash]) {
4197 		if (++hash >= PTYPE_HASH_SIZE)
4198 			return NULL;
4199 		nxt = ptype_base[hash].next;
4200 	}
4201 found:
4202 	return list_entry(nxt, struct packet_type, list);
4203 }
4204 
4205 static void ptype_seq_stop(struct seq_file *seq, void *v)
4206 	__releases(RCU)
4207 {
4208 	rcu_read_unlock();
4209 }
4210 
4211 static int ptype_seq_show(struct seq_file *seq, void *v)
4212 {
4213 	struct packet_type *pt = v;
4214 
4215 	if (v == SEQ_START_TOKEN)
4216 		seq_puts(seq, "Type Device      Function\n");
4217 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4218 		if (pt->type == htons(ETH_P_ALL))
4219 			seq_puts(seq, "ALL ");
4220 		else
4221 			seq_printf(seq, "%04x", ntohs(pt->type));
4222 
4223 		seq_printf(seq, " %-8s %pF\n",
4224 			   pt->dev ? pt->dev->name : "", pt->func);
4225 	}
4226 
4227 	return 0;
4228 }
4229 
4230 static const struct seq_operations ptype_seq_ops = {
4231 	.start = ptype_seq_start,
4232 	.next  = ptype_seq_next,
4233 	.stop  = ptype_seq_stop,
4234 	.show  = ptype_seq_show,
4235 };
4236 
4237 static int ptype_seq_open(struct inode *inode, struct file *file)
4238 {
4239 	return seq_open_net(inode, file, &ptype_seq_ops,
4240 			sizeof(struct seq_net_private));
4241 }
4242 
4243 static const struct file_operations ptype_seq_fops = {
4244 	.owner	 = THIS_MODULE,
4245 	.open    = ptype_seq_open,
4246 	.read    = seq_read,
4247 	.llseek  = seq_lseek,
4248 	.release = seq_release_net,
4249 };
4250 
4251 
4252 static int __net_init dev_proc_net_init(struct net *net)
4253 {
4254 	int rc = -ENOMEM;
4255 
4256 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4257 		goto out;
4258 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4259 		goto out_dev;
4260 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4261 		goto out_softnet;
4262 
4263 	if (wext_proc_init(net))
4264 		goto out_ptype;
4265 	rc = 0;
4266 out:
4267 	return rc;
4268 out_ptype:
4269 	proc_net_remove(net, "ptype");
4270 out_softnet:
4271 	proc_net_remove(net, "softnet_stat");
4272 out_dev:
4273 	proc_net_remove(net, "dev");
4274 	goto out;
4275 }
4276 
4277 static void __net_exit dev_proc_net_exit(struct net *net)
4278 {
4279 	wext_proc_exit(net);
4280 
4281 	proc_net_remove(net, "ptype");
4282 	proc_net_remove(net, "softnet_stat");
4283 	proc_net_remove(net, "dev");
4284 }
4285 
4286 static struct pernet_operations __net_initdata dev_proc_ops = {
4287 	.init = dev_proc_net_init,
4288 	.exit = dev_proc_net_exit,
4289 };
4290 
4291 static int __init dev_proc_init(void)
4292 {
4293 	return register_pernet_subsys(&dev_proc_ops);
4294 }
4295 #else
4296 #define dev_proc_init() 0
4297 #endif	/* CONFIG_PROC_FS */
4298 
4299 
4300 /**
4301  *	netdev_set_master	-	set up master pointer
4302  *	@slave: slave device
4303  *	@master: new master device
4304  *
4305  *	Changes the master device of the slave. Pass %NULL to break the
4306  *	bonding. The caller must hold the RTNL semaphore. On a failure
4307  *	a negative errno code is returned. On success the reference counts
4308  *	are adjusted and the function returns zero.
4309  */
4310 int netdev_set_master(struct net_device *slave, struct net_device *master)
4311 {
4312 	struct net_device *old = slave->master;
4313 
4314 	ASSERT_RTNL();
4315 
4316 	if (master) {
4317 		if (old)
4318 			return -EBUSY;
4319 		dev_hold(master);
4320 	}
4321 
4322 	slave->master = master;
4323 
4324 	if (old) {
4325 		synchronize_net();
4326 		dev_put(old);
4327 	}
4328 	return 0;
4329 }
4330 EXPORT_SYMBOL(netdev_set_master);
4331 
4332 /**
4333  *	netdev_set_bond_master	-	set up bonding master/slave pair
4334  *	@slave: slave device
4335  *	@master: new master device
4336  *
4337  *	Changes the master device of the slave. Pass %NULL to break the
4338  *	bonding. The caller must hold the RTNL semaphore. On a failure
4339  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4340  *	to the routing socket and the function returns zero.
4341  */
4342 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4343 {
4344 	int err;
4345 
4346 	ASSERT_RTNL();
4347 
4348 	err = netdev_set_master(slave, master);
4349 	if (err)
4350 		return err;
4351 	if (master)
4352 		slave->flags |= IFF_SLAVE;
4353 	else
4354 		slave->flags &= ~IFF_SLAVE;
4355 
4356 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4357 	return 0;
4358 }
4359 EXPORT_SYMBOL(netdev_set_bond_master);
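
/*
 * Illustrative sketch (not part of this file): how a bonding-style driver
 * could use netdev_set_bond_master() while enslaving and releasing a device.
 * my_enslave() and my_release() are hypothetical; both callers must hold
 * RTNL, as documented above.
 *
 *	static int my_enslave(struct net_device *bond, struct net_device *slave)
 *	{
 *		ASSERT_RTNL();
 *		return netdev_set_bond_master(slave, bond);
 *	}
 *
 *	static void my_release(struct net_device *slave)
 *	{
 *		ASSERT_RTNL();
 *		netdev_set_bond_master(slave, NULL);
 *	}
 */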
4360 
4361 static void dev_change_rx_flags(struct net_device *dev, int flags)
4362 {
4363 	const struct net_device_ops *ops = dev->netdev_ops;
4364 
4365 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4366 		ops->ndo_change_rx_flags(dev, flags);
4367 }
4368 
4369 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4370 {
4371 	unsigned short old_flags = dev->flags;
4372 	uid_t uid;
4373 	gid_t gid;
4374 
4375 	ASSERT_RTNL();
4376 
4377 	dev->flags |= IFF_PROMISC;
4378 	dev->promiscuity += inc;
4379 	if (dev->promiscuity == 0) {
4380 		/*
4381 		 * Avoid overflow.
4382 		 * If inc causes an overflow, leave promisc untouched and return an error.
4383 		 */
4384 		if (inc < 0)
4385 			dev->flags &= ~IFF_PROMISC;
4386 		else {
4387 			dev->promiscuity -= inc;
4388 			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4389 				"cannot set promiscuity; the promiscuity feature "
4390 				"of the device might be broken.\n", dev->name);
4391 			return -EOVERFLOW;
4392 		}
4393 	}
4394 	if (dev->flags != old_flags) {
4395 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4396 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4397 							       "left");
4398 		if (audit_enabled) {
4399 			current_uid_gid(&uid, &gid);
4400 			audit_log(current->audit_context, GFP_ATOMIC,
4401 				AUDIT_ANOM_PROMISCUOUS,
4402 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4403 				dev->name, (dev->flags & IFF_PROMISC),
4404 				(old_flags & IFF_PROMISC),
4405 				audit_get_loginuid(current),
4406 				uid, gid,
4407 				audit_get_sessionid(current));
4408 		}
4409 
4410 		dev_change_rx_flags(dev, IFF_PROMISC);
4411 	}
4412 	return 0;
4413 }
4414 
4415 /**
4416  *	dev_set_promiscuity	- update promiscuity count on a device
4417  *	@dev: device
4418  *	@inc: modifier
4419  *
4420  *	Add or remove promiscuity from a device. While the count in the device
4421  *	remains above zero the interface remains promiscuous. Once it hits zero
4422  *	the device reverts to normal filtering operation. A negative @inc
4423  *	value is used to drop promiscuity on the device.
4424  *	Return 0 if successful or a negative errno code on error.
4425  */
4426 int dev_set_promiscuity(struct net_device *dev, int inc)
4427 {
4428 	unsigned short old_flags = dev->flags;
4429 	int err;
4430 
4431 	err = __dev_set_promiscuity(dev, inc);
4432 	if (err < 0)
4433 		return err;
4434 	if (dev->flags != old_flags)
4435 		dev_set_rx_mode(dev);
4436 	return err;
4437 }
4438 EXPORT_SYMBOL(dev_set_promiscuity);
4439 
4440 /**
4441  *	dev_set_allmulti	- update allmulti count on a device
4442  *	@dev: device
4443  *	@inc: modifier
4444  *
4445  *	Add or remove reception of all multicast frames to a device. While the
4446  *	count in the device remains above zero the interface keeps listening
4447  *	to all multicast frames. Once it hits zero the device reverts to normal
4448  *	filtering operation. A negative @inc value is used to drop the counter
4449  *	when releasing a resource needing all multicasts.
4450  *	Return 0 if successful or a negative errno code on error.
4451  */
4452 
4453 int dev_set_allmulti(struct net_device *dev, int inc)
4454 {
4455 	unsigned short old_flags = dev->flags;
4456 
4457 	ASSERT_RTNL();
4458 
4459 	dev->flags |= IFF_ALLMULTI;
4460 	dev->allmulti += inc;
4461 	if (dev->allmulti == 0) {
4462 		/*
4463 		 * Avoid overflow.
4464 		 * If inc causes an overflow, leave allmulti untouched and return an error.
4465 		 */
4466 		if (inc < 0)
4467 			dev->flags &= ~IFF_ALLMULTI;
4468 		else {
4469 			dev->allmulti -= inc;
4470 			printk(KERN_WARNING "%s: allmulti counter overflowed, "
4471 				"cannot set allmulti; the allmulti feature of "
4472 				"the device might be broken.\n", dev->name);
4473 			return -EOVERFLOW;
4474 		}
4475 	}
4476 	if (dev->flags ^ old_flags) {
4477 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4478 		dev_set_rx_mode(dev);
4479 	}
4480 	return 0;
4481 }
4482 EXPORT_SYMBOL(dev_set_allmulti);
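
/*
 * Illustrative sketch (not part of this file): dev_set_promiscuity() and
 * dev_set_allmulti() are reference counted, so every +1 must eventually be
 * balanced by a -1.  my_capture_start() and my_capture_stop() are
 * hypothetical callers running under RTNL.
 *
 *	static int my_capture_start(struct net_device *dev)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = dev_set_promiscuity(dev, 1);
 *		if (err)
 *			return err;
 *		err = dev_set_allmulti(dev, 1);
 *		if (err)
 *			dev_set_promiscuity(dev, -1);
 *		return err;
 *	}
 *
 *	static void my_capture_stop(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		dev_set_allmulti(dev, -1);
 *		dev_set_promiscuity(dev, -1);
 *	}
 */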
4483 
4484 /*
4485  *	Upload unicast and multicast address lists to device and
4486  *	configure RX filtering. When the device doesn't support unicast
4487  *	filtering it is put in promiscuous mode while unicast addresses
4488  *	are present.
4489  */
4490 void __dev_set_rx_mode(struct net_device *dev)
4491 {
4492 	const struct net_device_ops *ops = dev->netdev_ops;
4493 
4494 	/* dev_open will call this function so the list will stay sane. */
4495 	if (!(dev->flags&IFF_UP))
4496 		return;
4497 
4498 	if (!netif_device_present(dev))
4499 		return;
4500 
4501 	if (ops->ndo_set_rx_mode)
4502 		ops->ndo_set_rx_mode(dev);
4503 	else {
4504 		/* Unicast address changes may only happen under the rtnl,
4505 		 * therefore calling __dev_set_promiscuity here is safe.
4506 		 */
4507 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4508 			__dev_set_promiscuity(dev, 1);
4509 			dev->uc_promisc = 1;
4510 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4511 			__dev_set_promiscuity(dev, -1);
4512 			dev->uc_promisc = 0;
4513 		}
4514 
4515 		if (ops->ndo_set_multicast_list)
4516 			ops->ndo_set_multicast_list(dev);
4517 	}
4518 }
4519 
4520 void dev_set_rx_mode(struct net_device *dev)
4521 {
4522 	netif_addr_lock_bh(dev);
4523 	__dev_set_rx_mode(dev);
4524 	netif_addr_unlock_bh(dev);
4525 }
4526 
4527 /**
4528  *	dev_get_flags - get flags reported to userspace
4529  *	@dev: device
4530  *
4531  *	Get the combination of flag bits exported through APIs to userspace.
4532  */
4533 unsigned dev_get_flags(const struct net_device *dev)
4534 {
4535 	unsigned flags;
4536 
4537 	flags = (dev->flags & ~(IFF_PROMISC |
4538 				IFF_ALLMULTI |
4539 				IFF_RUNNING |
4540 				IFF_LOWER_UP |
4541 				IFF_DORMANT)) |
4542 		(dev->gflags & (IFF_PROMISC |
4543 				IFF_ALLMULTI));
4544 
4545 	if (netif_running(dev)) {
4546 		if (netif_oper_up(dev))
4547 			flags |= IFF_RUNNING;
4548 		if (netif_carrier_ok(dev))
4549 			flags |= IFF_LOWER_UP;
4550 		if (netif_dormant(dev))
4551 			flags |= IFF_DORMANT;
4552 	}
4553 
4554 	return flags;
4555 }
4556 EXPORT_SYMBOL(dev_get_flags);
4557 
4558 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4559 {
4560 	int old_flags = dev->flags;
4561 	int ret;
4562 
4563 	ASSERT_RTNL();
4564 
4565 	/*
4566 	 *	Set the flags on our device.
4567 	 */
4568 
4569 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4570 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4571 			       IFF_AUTOMEDIA)) |
4572 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4573 				    IFF_ALLMULTI));
4574 
4575 	/*
4576 	 *	Load in the correct multicast list now that the flags have changed.
4577 	 */
4578 
4579 	if ((old_flags ^ flags) & IFF_MULTICAST)
4580 		dev_change_rx_flags(dev, IFF_MULTICAST);
4581 
4582 	dev_set_rx_mode(dev);
4583 
4584 	/*
4585 	 *	Have we downed the interface? We handle IFF_UP ourselves
4586 	 *	according to user attempts to set it, rather than blindly
4587 	 *	setting it.
4588 	 */
4589 
4590 	ret = 0;
4591 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4592 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4593 
4594 		if (!ret)
4595 			dev_set_rx_mode(dev);
4596 	}
4597 
4598 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4599 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4600 
4601 		dev->gflags ^= IFF_PROMISC;
4602 		dev_set_promiscuity(dev, inc);
4603 	}
4604 
4605 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4606 	   is important. Some (broken) drivers set IFF_PROMISC when
4607 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4608 	 */
4609 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4610 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4611 
4612 		dev->gflags ^= IFF_ALLMULTI;
4613 		dev_set_allmulti(dev, inc);
4614 	}
4615 
4616 	return ret;
4617 }
4618 
4619 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4620 {
4621 	unsigned int changes = dev->flags ^ old_flags;
4622 
4623 	if (changes & IFF_UP) {
4624 		if (dev->flags & IFF_UP)
4625 			call_netdevice_notifiers(NETDEV_UP, dev);
4626 		else
4627 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4628 	}
4629 
4630 	if (dev->flags & IFF_UP &&
4631 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4632 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4633 }
4634 
4635 /**
4636  *	dev_change_flags - change device settings
4637  *	@dev: device
4638  *	@flags: device state flags
4639  *
4640  *	Change settings on a device based on the given state flags. The flags are
4641  *	in the userspace exported format.
4642  */
4643 int dev_change_flags(struct net_device *dev, unsigned flags)
4644 {
4645 	int ret, changes;
4646 	int old_flags = dev->flags;
4647 
4648 	ret = __dev_change_flags(dev, flags);
4649 	if (ret < 0)
4650 		return ret;
4651 
4652 	changes = old_flags ^ dev->flags;
4653 	if (changes)
4654 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4655 
4656 	__dev_notify_flags(dev, old_flags);
4657 	return ret;
4658 }
4659 EXPORT_SYMBOL(dev_change_flags);
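
/*
 * Illustrative sketch (not part of this file): bringing an interface up from
 * kernel code with dev_change_flags(), which expects the userspace flag
 * format and must be called under RTNL.  my_bring_up() is hypothetical.
 *
 *	static int my_bring_up(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_change_flags(dev, dev->flags | IFF_UP);
 *		rtnl_unlock();
 *		return err;
 *	}
 */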
4660 
4661 /**
4662  *	dev_set_mtu - Change maximum transfer unit
4663  *	@dev: device
4664  *	@new_mtu: new transfer unit
4665  *
4666  *	Change the maximum transfer size of the network device.
4667  */
4668 int dev_set_mtu(struct net_device *dev, int new_mtu)
4669 {
4670 	const struct net_device_ops *ops = dev->netdev_ops;
4671 	int err;
4672 
4673 	if (new_mtu == dev->mtu)
4674 		return 0;
4675 
4676 	/*	MTU must be positive.	 */
4677 	if (new_mtu < 0)
4678 		return -EINVAL;
4679 
4680 	if (!netif_device_present(dev))
4681 		return -ENODEV;
4682 
4683 	err = 0;
4684 	if (ops->ndo_change_mtu)
4685 		err = ops->ndo_change_mtu(dev, new_mtu);
4686 	else
4687 		dev->mtu = new_mtu;
4688 
4689 	if (!err && dev->flags & IFF_UP)
4690 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4691 	return err;
4692 }
4693 EXPORT_SYMBOL(dev_set_mtu);
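
/*
 * Illustrative sketch (not part of this file): adjusting the MTU of a lower
 * device from a stacking driver.  Callers are expected to hold RTNL, as
 * dev_ifsioc() below does, so the NETDEV_CHANGEMTU notification is
 * serialized; my_set_lower_mtu() is hypothetical.
 *
 *	static int my_set_lower_mtu(struct net_device *lower, int mtu)
 *	{
 *		ASSERT_RTNL();
 *		return dev_set_mtu(lower, mtu);
 *	}
 */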
4694 
4695 /**
4696  *	dev_set_group - Change group this device belongs to
4697  *	@dev: device
4698  *	@new_group: group this device should belong to
4699  */
4700 void dev_set_group(struct net_device *dev, int new_group)
4701 {
4702 	dev->group = new_group;
4703 }
4704 EXPORT_SYMBOL(dev_set_group);
4705 
4706 /**
4707  *	dev_set_mac_address - Change Media Access Control Address
4708  *	@dev: device
4709  *	@sa: new address
4710  *
4711  *	Change the hardware (MAC) address of the device
4712  */
4713 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4714 {
4715 	const struct net_device_ops *ops = dev->netdev_ops;
4716 	int err;
4717 
4718 	if (!ops->ndo_set_mac_address)
4719 		return -EOPNOTSUPP;
4720 	if (sa->sa_family != dev->type)
4721 		return -EINVAL;
4722 	if (!netif_device_present(dev))
4723 		return -ENODEV;
4724 	err = ops->ndo_set_mac_address(dev, sa);
4725 	if (!err)
4726 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4727 	return err;
4728 }
4729 EXPORT_SYMBOL(dev_set_mac_address);
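
/*
 * Illustrative sketch (not part of this file): setting a MAC address from
 * kernel code.  The sockaddr's sa_family must match dev->type (ARPHRD_ETHER
 * for Ethernet) and the caller holds RTNL; my_set_mac() and newaddr are
 * hypothetical.
 *
 *	static int my_set_mac(struct net_device *dev, const u8 *newaddr)
 *	{
 *		struct sockaddr sa;
 *
 *		sa.sa_family = dev->type;
 *		memcpy(sa.sa_data, newaddr, dev->addr_len);
 *		return dev_set_mac_address(dev, &sa);
 *	}
 */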
4730 
4731 /*
4732  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4733  */
4734 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4735 {
4736 	int err;
4737 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4738 
4739 	if (!dev)
4740 		return -ENODEV;
4741 
4742 	switch (cmd) {
4743 	case SIOCGIFFLAGS:	/* Get interface flags */
4744 		ifr->ifr_flags = (short) dev_get_flags(dev);
4745 		return 0;
4746 
4747 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4748 				   (currently unused) */
4749 		ifr->ifr_metric = 0;
4750 		return 0;
4751 
4752 	case SIOCGIFMTU:	/* Get the MTU of a device */
4753 		ifr->ifr_mtu = dev->mtu;
4754 		return 0;
4755 
4756 	case SIOCGIFHWADDR:
4757 		if (!dev->addr_len)
4758 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4759 		else
4760 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4761 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4762 		ifr->ifr_hwaddr.sa_family = dev->type;
4763 		return 0;
4764 
4765 	case SIOCGIFSLAVE:
4766 		err = -EINVAL;
4767 		break;
4768 
4769 	case SIOCGIFMAP:
4770 		ifr->ifr_map.mem_start = dev->mem_start;
4771 		ifr->ifr_map.mem_end   = dev->mem_end;
4772 		ifr->ifr_map.base_addr = dev->base_addr;
4773 		ifr->ifr_map.irq       = dev->irq;
4774 		ifr->ifr_map.dma       = dev->dma;
4775 		ifr->ifr_map.port      = dev->if_port;
4776 		return 0;
4777 
4778 	case SIOCGIFINDEX:
4779 		ifr->ifr_ifindex = dev->ifindex;
4780 		return 0;
4781 
4782 	case SIOCGIFTXQLEN:
4783 		ifr->ifr_qlen = dev->tx_queue_len;
4784 		return 0;
4785 
4786 	default:
4787 		/* dev_ioctl() should ensure this case
4788 		 * is never reached
4789 		 */
4790 		WARN_ON(1);
4791 		err = -EINVAL;
4792 		break;
4793 
4794 	}
4795 	return err;
4796 }
4797 
4798 /*
4799  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4800  */
4801 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4802 {
4803 	int err;
4804 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4805 	const struct net_device_ops *ops;
4806 
4807 	if (!dev)
4808 		return -ENODEV;
4809 
4810 	ops = dev->netdev_ops;
4811 
4812 	switch (cmd) {
4813 	case SIOCSIFFLAGS:	/* Set interface flags */
4814 		return dev_change_flags(dev, ifr->ifr_flags);
4815 
4816 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4817 				   (currently unused) */
4818 		return -EOPNOTSUPP;
4819 
4820 	case SIOCSIFMTU:	/* Set the MTU of a device */
4821 		return dev_set_mtu(dev, ifr->ifr_mtu);
4822 
4823 	case SIOCSIFHWADDR:
4824 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4825 
4826 	case SIOCSIFHWBROADCAST:
4827 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4828 			return -EINVAL;
4829 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4830 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4831 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4832 		return 0;
4833 
4834 	case SIOCSIFMAP:
4835 		if (ops->ndo_set_config) {
4836 			if (!netif_device_present(dev))
4837 				return -ENODEV;
4838 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4839 		}
4840 		return -EOPNOTSUPP;
4841 
4842 	case SIOCADDMULTI:
4843 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4844 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4845 			return -EINVAL;
4846 		if (!netif_device_present(dev))
4847 			return -ENODEV;
4848 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4849 
4850 	case SIOCDELMULTI:
4851 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4852 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4853 			return -EINVAL;
4854 		if (!netif_device_present(dev))
4855 			return -ENODEV;
4856 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4857 
4858 	case SIOCSIFTXQLEN:
4859 		if (ifr->ifr_qlen < 0)
4860 			return -EINVAL;
4861 		dev->tx_queue_len = ifr->ifr_qlen;
4862 		return 0;
4863 
4864 	case SIOCSIFNAME:
4865 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4866 		return dev_change_name(dev, ifr->ifr_newname);
4867 
4868 	/*
4869 	 *	Unknown or private ioctl
4870 	 */
4871 	default:
4872 		if ((cmd >= SIOCDEVPRIVATE &&
4873 		    cmd <= SIOCDEVPRIVATE + 15) ||
4874 		    cmd == SIOCBONDENSLAVE ||
4875 		    cmd == SIOCBONDRELEASE ||
4876 		    cmd == SIOCBONDSETHWADDR ||
4877 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4878 		    cmd == SIOCBONDINFOQUERY ||
4879 		    cmd == SIOCBONDCHANGEACTIVE ||
4880 		    cmd == SIOCGMIIPHY ||
4881 		    cmd == SIOCGMIIREG ||
4882 		    cmd == SIOCSMIIREG ||
4883 		    cmd == SIOCBRADDIF ||
4884 		    cmd == SIOCBRDELIF ||
4885 		    cmd == SIOCSHWTSTAMP ||
4886 		    cmd == SIOCWANDEV) {
4887 			err = -EOPNOTSUPP;
4888 			if (ops->ndo_do_ioctl) {
4889 				if (netif_device_present(dev))
4890 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4891 				else
4892 					err = -ENODEV;
4893 			}
4894 		} else
4895 			err = -EINVAL;
4896 
4897 	}
4898 	return err;
4899 }
4900 
4901 /*
4902  *	This function handles all "interface"-type I/O control requests. The actual
4903  *	'doing' part of this is dev_ifsioc above.
4904  */
4905 
4906 /**
4907  *	dev_ioctl	-	network device ioctl
4908  *	@net: the applicable net namespace
4909  *	@cmd: command to issue
4910  *	@arg: pointer to a struct ifreq in user space
4911  *
4912  *	Issue ioctl functions to devices. This is normally called by the
4913  *	user space syscall interfaces but can sometimes be useful for
4914  *	other purposes. The return value is the return from the syscall if
4915  *	positive or a negative errno code on error.
4916  */
4917 
4918 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4919 {
4920 	struct ifreq ifr;
4921 	int ret;
4922 	char *colon;
4923 
4924 	/* One special case: SIOCGIFCONF takes an ifconf argument
4925 	   and requires a shared lock, because it may sleep while writing
4926 	   to user space.
4927 	 */
4928 
4929 	if (cmd == SIOCGIFCONF) {
4930 		rtnl_lock();
4931 		ret = dev_ifconf(net, (char __user *) arg);
4932 		rtnl_unlock();
4933 		return ret;
4934 	}
4935 	if (cmd == SIOCGIFNAME)
4936 		return dev_ifname(net, (struct ifreq __user *)arg);
4937 
4938 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4939 		return -EFAULT;
4940 
4941 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4942 
4943 	colon = strchr(ifr.ifr_name, ':');
4944 	if (colon)
4945 		*colon = 0;
4946 
4947 	/*
4948 	 *	See which interface the caller is talking about.
4949 	 */
4950 
4951 	switch (cmd) {
4952 	/*
4953 	 *	These ioctl calls:
4954 	 *	- can be done by all.
4955 	 *	- atomic and do not require locking.
4956 	 *	- return a value
4957 	 */
4958 	case SIOCGIFFLAGS:
4959 	case SIOCGIFMETRIC:
4960 	case SIOCGIFMTU:
4961 	case SIOCGIFHWADDR:
4962 	case SIOCGIFSLAVE:
4963 	case SIOCGIFMAP:
4964 	case SIOCGIFINDEX:
4965 	case SIOCGIFTXQLEN:
4966 		dev_load(net, ifr.ifr_name);
4967 		rcu_read_lock();
4968 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4969 		rcu_read_unlock();
4970 		if (!ret) {
4971 			if (colon)
4972 				*colon = ':';
4973 			if (copy_to_user(arg, &ifr,
4974 					 sizeof(struct ifreq)))
4975 				ret = -EFAULT;
4976 		}
4977 		return ret;
4978 
4979 	case SIOCETHTOOL:
4980 		dev_load(net, ifr.ifr_name);
4981 		rtnl_lock();
4982 		ret = dev_ethtool(net, &ifr);
4983 		rtnl_unlock();
4984 		if (!ret) {
4985 			if (colon)
4986 				*colon = ':';
4987 			if (copy_to_user(arg, &ifr,
4988 					 sizeof(struct ifreq)))
4989 				ret = -EFAULT;
4990 		}
4991 		return ret;
4992 
4993 	/*
4994 	 *	These ioctl calls:
4995 	 *	- require superuser power.
4996 	 *	- require strict serialization.
4997 	 *	- return a value
4998 	 */
4999 	case SIOCGMIIPHY:
5000 	case SIOCGMIIREG:
5001 	case SIOCSIFNAME:
5002 		if (!capable(CAP_NET_ADMIN))
5003 			return -EPERM;
5004 		dev_load(net, ifr.ifr_name);
5005 		rtnl_lock();
5006 		ret = dev_ifsioc(net, &ifr, cmd);
5007 		rtnl_unlock();
5008 		if (!ret) {
5009 			if (colon)
5010 				*colon = ':';
5011 			if (copy_to_user(arg, &ifr,
5012 					 sizeof(struct ifreq)))
5013 				ret = -EFAULT;
5014 		}
5015 		return ret;
5016 
5017 	/*
5018 	 *	These ioctl calls:
5019 	 *	- require superuser power.
5020 	 *	- require strict serialization.
5021 	 *	- do not return a value
5022 	 */
5023 	case SIOCSIFFLAGS:
5024 	case SIOCSIFMETRIC:
5025 	case SIOCSIFMTU:
5026 	case SIOCSIFMAP:
5027 	case SIOCSIFHWADDR:
5028 	case SIOCSIFSLAVE:
5029 	case SIOCADDMULTI:
5030 	case SIOCDELMULTI:
5031 	case SIOCSIFHWBROADCAST:
5032 	case SIOCSIFTXQLEN:
5033 	case SIOCSMIIREG:
5034 	case SIOCBONDENSLAVE:
5035 	case SIOCBONDRELEASE:
5036 	case SIOCBONDSETHWADDR:
5037 	case SIOCBONDCHANGEACTIVE:
5038 	case SIOCBRADDIF:
5039 	case SIOCBRDELIF:
5040 	case SIOCSHWTSTAMP:
5041 		if (!capable(CAP_NET_ADMIN))
5042 			return -EPERM;
5043 		/* fall through */
5044 	case SIOCBONDSLAVEINFOQUERY:
5045 	case SIOCBONDINFOQUERY:
5046 		dev_load(net, ifr.ifr_name);
5047 		rtnl_lock();
5048 		ret = dev_ifsioc(net, &ifr, cmd);
5049 		rtnl_unlock();
5050 		return ret;
5051 
5052 	case SIOCGIFMEM:
5053 		/* Get the per device memory space. We can add this but
5054 		 * currently do not support it */
5055 	case SIOCSIFMEM:
5056 		/* Set the per device memory buffer space.
5057 		 * Not applicable in our case */
5058 	case SIOCSIFLINK:
5059 		return -EINVAL;
5060 
5061 	/*
5062 	 *	Unknown or private ioctl.
5063 	 */
5064 	default:
5065 		if (cmd == SIOCWANDEV ||
5066 		    (cmd >= SIOCDEVPRIVATE &&
5067 		     cmd <= SIOCDEVPRIVATE + 15)) {
5068 			dev_load(net, ifr.ifr_name);
5069 			rtnl_lock();
5070 			ret = dev_ifsioc(net, &ifr, cmd);
5071 			rtnl_unlock();
5072 			if (!ret && copy_to_user(arg, &ifr,
5073 						 sizeof(struct ifreq)))
5074 				ret = -EFAULT;
5075 			return ret;
5076 		}
5077 		/* Take care of Wireless Extensions */
5078 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5079 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5080 		return -EINVAL;
5081 	}
5082 }
5083 
5084 
5085 /**
5086  *	dev_new_index	-	allocate an ifindex
5087  *	@net: the applicable net namespace
5088  *
5089  *	Returns a suitable unique value for a new device interface
5090  *	number.  The caller must hold the rtnl semaphore or the
5091  *	dev_base_lock to be sure it remains unique.
5092  */
5093 static int dev_new_index(struct net *net)
5094 {
5095 	static int ifindex;
5096 	for (;;) {
5097 		if (++ifindex <= 0)
5098 			ifindex = 1;
5099 		if (!__dev_get_by_index(net, ifindex))
5100 			return ifindex;
5101 	}
5102 }
5103 
5104 /* Delayed registration/unregistration */
5105 static LIST_HEAD(net_todo_list);
5106 
5107 static void net_set_todo(struct net_device *dev)
5108 {
5109 	list_add_tail(&dev->todo_list, &net_todo_list);
5110 }
5111 
5112 static void rollback_registered_many(struct list_head *head)
5113 {
5114 	struct net_device *dev, *tmp;
5115 
5116 	BUG_ON(dev_boot_phase);
5117 	ASSERT_RTNL();
5118 
5119 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5120 		/* Some devices call this without ever having been
5121 		 * registered, to unwind a failed initialization. Remove
5122 		 * those devices and proceed with the remaining ones.
5123 		 */
5124 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5125 			pr_debug("unregister_netdevice: device %s/%p never "
5126 				 "was registered\n", dev->name, dev);
5127 
5128 			WARN_ON(1);
5129 			list_del(&dev->unreg_list);
5130 			continue;
5131 		}
5132 
5133 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5134 	}
5135 
5136 	/* If device is running, close it first. */
5137 	dev_close_many(head);
5138 
5139 	list_for_each_entry(dev, head, unreg_list) {
5140 		/* And unlink it from device chain. */
5141 		unlist_netdevice(dev);
5142 
5143 		dev->reg_state = NETREG_UNREGISTERING;
5144 	}
5145 
5146 	synchronize_net();
5147 
5148 	list_for_each_entry(dev, head, unreg_list) {
5149 		/* Shutdown queueing discipline. */
5150 		dev_shutdown(dev);
5151 
5152 
5153 		/* Notify protocols that we are about to destroy
5154 		   this device. They should clean up all of their state.
5155 		*/
5156 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5157 
5158 		if (!dev->rtnl_link_ops ||
5159 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5160 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5161 
5162 		/*
5163 		 *	Flush the unicast and multicast chains
5164 		 */
5165 		dev_uc_flush(dev);
5166 		dev_mc_flush(dev);
5167 
5168 		if (dev->netdev_ops->ndo_uninit)
5169 			dev->netdev_ops->ndo_uninit(dev);
5170 
5171 		/* Notifier chain MUST detach us from master device. */
5172 		WARN_ON(dev->master);
5173 
5174 		/* Remove entries from kobject tree */
5175 		netdev_unregister_kobject(dev);
5176 	}
5177 
5178 	/* Process any work delayed until the end of the batch */
5179 	dev = list_first_entry(head, struct net_device, unreg_list);
5180 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5181 
5182 	rcu_barrier();
5183 
5184 	list_for_each_entry(dev, head, unreg_list)
5185 		dev_put(dev);
5186 }
5187 
5188 static void rollback_registered(struct net_device *dev)
5189 {
5190 	LIST_HEAD(single);
5191 
5192 	list_add(&dev->unreg_list, &single);
5193 	rollback_registered_many(&single);
5194 	list_del(&single);
5195 }
5196 
5197 u32 netdev_fix_features(struct net_device *dev, u32 features)
5198 {
5199 	/* Fix illegal checksum combinations */
5200 	if ((features & NETIF_F_HW_CSUM) &&
5201 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5202 		netdev_info(dev, "mixed HW and IP checksum settings.\n");
5203 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5204 	}
5205 
5206 	if ((features & NETIF_F_NO_CSUM) &&
5207 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5208 		netdev_info(dev, "mixed no checksumming and other settings.\n");
5209 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5210 	}
5211 
5212 	/* Fix illegal SG+CSUM combinations. */
5213 	if ((features & NETIF_F_SG) &&
5214 	    !(features & NETIF_F_ALL_CSUM)) {
5215 		netdev_info(dev,
5216 			    "Dropping NETIF_F_SG since no checksum feature.\n");
5217 		features &= ~NETIF_F_SG;
5218 	}
5219 
5220 	/* TSO requires that SG is present as well. */
5221 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5222 		netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5223 		features &= ~NETIF_F_TSO;
5224 	}
5225 
5226 	/* Software GSO depends on SG. */
5227 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5228 		netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5229 		features &= ~NETIF_F_GSO;
5230 	}
5231 
5232 	/* UFO needs SG and checksumming */
5233 	if (features & NETIF_F_UFO) {
5234 		/* maybe split UFO into V4 and V6? */
5235 		if (!((features & NETIF_F_GEN_CSUM) ||
5236 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5237 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5238 			netdev_info(dev,
5239 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5240 			features &= ~NETIF_F_UFO;
5241 		}
5242 
5243 		if (!(features & NETIF_F_SG)) {
5244 			netdev_info(dev,
5245 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5246 			features &= ~NETIF_F_UFO;
5247 		}
5248 	}
5249 
5250 	return features;
5251 }
5252 EXPORT_SYMBOL(netdev_fix_features);
5253 
5254 void netdev_update_features(struct net_device *dev)
5255 {
5256 	u32 features;
5257 	int err = 0;
5258 
5259 	features = netdev_get_wanted_features(dev);
5260 
5261 	if (dev->netdev_ops->ndo_fix_features)
5262 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5263 
5264 	/* driver might be less strict about feature dependencies */
5265 	features = netdev_fix_features(dev, features);
5266 
5267 	if (dev->features == features)
5268 		return;
5269 
5270 	netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5271 		dev->features, features);
5272 
5273 	if (dev->netdev_ops->ndo_set_features)
5274 		err = dev->netdev_ops->ndo_set_features(dev, features);
5275 
5276 	if (!err)
5277 		dev->features = features;
5278 	else if (err < 0)
5279 		netdev_err(dev,
5280 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5281 			err, features, dev->features);
5282 }
5283 EXPORT_SYMBOL(netdev_update_features);
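
/*
 * Illustrative sketch (not part of this file): the usual division of labour
 * around these helpers.  A driver expresses device-specific constraints in
 * ndo_fix_features() and calls netdev_update_features() whenever those
 * constraints change (e.g. from its ndo_change_mtu()); the core then re-runs
 * ndo_fix_features(), netdev_fix_features() and ndo_set_features().
 * my_fix_features(), my_change_mtu() and MY_MAX_CSUM_MTU are hypothetical.
 *
 *	static u32 my_fix_features(struct net_device *dev, u32 features)
 *	{
 *		if (dev->mtu > MY_MAX_CSUM_MTU)
 *			features &= ~NETIF_F_IP_CSUM;
 *		return features;
 *	}
 *
 *	static int my_change_mtu(struct net_device *dev, int new_mtu)
 *	{
 *		dev->mtu = new_mtu;
 *		netdev_update_features(dev);
 *		return 0;
 *	}
 */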
5284 
5285 /**
5286  *	netif_stacked_transfer_operstate -	transfer operstate
5287  *	@rootdev: the root or lower level device to transfer state from
5288  *	@dev: the device to transfer operstate to
5289  *
5290  *	Transfer operational state from root to device. This is normally
5291  *	called when a stacking relationship exists between the root
5292  *	device and the device (a leaf device).
5293  */
5294 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5295 					struct net_device *dev)
5296 {
5297 	if (rootdev->operstate == IF_OPER_DORMANT)
5298 		netif_dormant_on(dev);
5299 	else
5300 		netif_dormant_off(dev);
5301 
5302 	if (netif_carrier_ok(rootdev)) {
5303 		if (!netif_carrier_ok(dev))
5304 			netif_carrier_on(dev);
5305 	} else {
5306 		if (netif_carrier_ok(dev))
5307 			netif_carrier_off(dev);
5308 	}
5309 }
5310 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5311 
5312 #ifdef CONFIG_RPS
5313 static int netif_alloc_rx_queues(struct net_device *dev)
5314 {
5315 	unsigned int i, count = dev->num_rx_queues;
5316 	struct netdev_rx_queue *rx;
5317 
5318 	BUG_ON(count < 1);
5319 
5320 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5321 	if (!rx) {
5322 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5323 		return -ENOMEM;
5324 	}
5325 	dev->_rx = rx;
5326 
5327 	for (i = 0; i < count; i++)
5328 		rx[i].dev = dev;
5329 	return 0;
5330 }
5331 #endif
5332 
5333 static void netdev_init_one_queue(struct net_device *dev,
5334 				  struct netdev_queue *queue, void *_unused)
5335 {
5336 	/* Initialize queue lock */
5337 	spin_lock_init(&queue->_xmit_lock);
5338 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5339 	queue->xmit_lock_owner = -1;
5340 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5341 	queue->dev = dev;
5342 }
5343 
5344 static int netif_alloc_netdev_queues(struct net_device *dev)
5345 {
5346 	unsigned int count = dev->num_tx_queues;
5347 	struct netdev_queue *tx;
5348 
5349 	BUG_ON(count < 1);
5350 
5351 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5352 	if (!tx) {
5353 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5354 		       count);
5355 		return -ENOMEM;
5356 	}
5357 	dev->_tx = tx;
5358 
5359 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5360 	spin_lock_init(&dev->tx_global_lock);
5361 
5362 	return 0;
5363 }
5364 
5365 /**
5366  *	register_netdevice	- register a network device
5367  *	@dev: device to register
5368  *
5369  *	Take a completed network device structure and add it to the kernel
5370  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5371  *	chain. 0 is returned on success. A negative errno code is returned
5372  *	on a failure to set up the device, or if the name is a duplicate.
5373  *
5374  *	Callers must hold the rtnl semaphore. You may want
5375  *	register_netdev() instead of this.
5376  *
5377  *	BUGS:
5378  *	The locking appears insufficient to guarantee two parallel registers
5379  *	will not get the same name.
5380  */
5381 
5382 int register_netdevice(struct net_device *dev)
5383 {
5384 	int ret;
5385 	struct net *net = dev_net(dev);
5386 
5387 	BUG_ON(dev_boot_phase);
5388 	ASSERT_RTNL();
5389 
5390 	might_sleep();
5391 
5392 	/* When net_device's are persistent, this will be fatal. */
5393 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5394 	BUG_ON(!net);
5395 
5396 	spin_lock_init(&dev->addr_list_lock);
5397 	netdev_set_addr_lockdep_class(dev);
5398 
5399 	dev->iflink = -1;
5400 
5401 	/* Init, if this function is available */
5402 	if (dev->netdev_ops->ndo_init) {
5403 		ret = dev->netdev_ops->ndo_init(dev);
5404 		if (ret) {
5405 			if (ret > 0)
5406 				ret = -EIO;
5407 			goto out;
5408 		}
5409 	}
5410 
5411 	ret = dev_get_valid_name(dev, dev->name, 0);
5412 	if (ret)
5413 		goto err_uninit;
5414 
5415 	dev->ifindex = dev_new_index(net);
5416 	if (dev->iflink == -1)
5417 		dev->iflink = dev->ifindex;
5418 
5419 	/* Transfer changeable features to wanted_features and enable
5420 	 * software offloads (GSO and GRO).
5421 	 */
5422 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5423 	dev->features |= NETIF_F_SOFT_FEATURES;
5424 	dev->wanted_features = dev->features & dev->hw_features;
5425 
5426 	/* Avoid warning from netdev_fix_features() for GSO without SG */
5427 	if (!(dev->wanted_features & NETIF_F_SG)) {
5428 		dev->wanted_features &= ~NETIF_F_GSO;
5429 		dev->features &= ~NETIF_F_GSO;
5430 	}
5431 
5432 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5433 	 * vlan_dev_init() will do the dev->features check, so these features
5434 	 * are enabled only if supported by underlying device.
5435 	 */
5436 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5437 
5438 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5439 	ret = notifier_to_errno(ret);
5440 	if (ret)
5441 		goto err_uninit;
5442 
5443 	ret = netdev_register_kobject(dev);
5444 	if (ret)
5445 		goto err_uninit;
5446 	dev->reg_state = NETREG_REGISTERED;
5447 
5448 	netdev_update_features(dev);
5449 
5450 	/*
5451 	 *	Default initial state at registration is that the
5452 	 *	device is present.
5453 	 */
5454 
5455 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5456 
5457 	dev_init_scheduler(dev);
5458 	dev_hold(dev);
5459 	list_netdevice(dev);
5460 
5461 	/* Notify protocols that a new device appeared. */
5462 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5463 	ret = notifier_to_errno(ret);
5464 	if (ret) {
5465 		rollback_registered(dev);
5466 		dev->reg_state = NETREG_UNREGISTERED;
5467 	}
5468 	/*
5469 	 *	Prevent userspace races by waiting until the network
5470 	 *	device is fully setup before sending notifications.
5471 	 */
5472 	if (!dev->rtnl_link_ops ||
5473 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5474 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5475 
5476 out:
5477 	return ret;
5478 
5479 err_uninit:
5480 	if (dev->netdev_ops->ndo_uninit)
5481 		dev->netdev_ops->ndo_uninit(dev);
5482 	goto out;
5483 }
5484 EXPORT_SYMBOL(register_netdevice);
5485 
5486 /**
5487  *	init_dummy_netdev	- init a dummy network device for NAPI
5488  *	@dev: device to init
5489  *
5490  *	This takes a network device structure and initializes the minimum
5491  *	set of fields so that it can be used to schedule NAPI polls without
5492  *	registering a full-blown interface. This is to be used by drivers
5493  *	that need to tie several hardware interfaces to a single NAPI
5494  *	poll scheduler due to HW limitations.
5495  */
5496 int init_dummy_netdev(struct net_device *dev)
5497 {
5498 	/* Clear everything. Note we don't initialize spinlocks
5499 	 * as they aren't supposed to be taken by any of the
5500 	 * NAPI code and this dummy netdev is supposed to be
5501 	 * used only for NAPI polls.
5502 	 */
5503 	memset(dev, 0, sizeof(struct net_device));
5504 
5505 	/* make sure we BUG if trying to hit standard
5506 	 * register/unregister code path
5507 	 */
5508 	dev->reg_state = NETREG_DUMMY;
5509 
5510 	/* NAPI wants this */
5511 	INIT_LIST_HEAD(&dev->napi_list);
5512 
5513 	/* a dummy interface is started by default */
5514 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5515 	set_bit(__LINK_STATE_START, &dev->state);
5516 
5517 	/* Note: we don't allocate pcpu_refcnt for dummy devices,
5518 	 * because users of this 'device' don't need to change
5519 	 * its refcount.
5520 	 */
5521 
5522 	return 0;
5523 }
5524 EXPORT_SYMBOL_GPL(init_dummy_netdev);
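
/*
 * Illustrative sketch (not part of this file): a driver whose hardware has
 * no natural struct net_device (or several front-ends sharing one poll
 * routine) can embed a dummy netdev purely as a NAPI anchor.  struct my_hw
 * and my_poll() are hypothetical.
 *
 *	struct my_hw {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	static void my_hw_init_napi(struct my_hw *hw)
 *	{
 *		init_dummy_netdev(&hw->napi_dev);
 *		netif_napi_add(&hw->napi_dev, &hw->napi, my_poll, 64);
 *		napi_enable(&hw->napi);
 *	}
 */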
5525 
5526 
5527 /**
5528  *	register_netdev	- register a network device
5529  *	@dev: device to register
5530  *
5531  *	Take a completed network device structure and add it to the kernel
5532  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5533  *	chain. 0 is returned on success. A negative errno code is returned
5534  *	on a failure to set up the device, or if the name is a duplicate.
5535  *
5536  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5537  *	and expands the device name if you passed a format string to
5538  *	alloc_netdev.
5539  */
5540 int register_netdev(struct net_device *dev)
5541 {
5542 	int err;
5543 
5544 	rtnl_lock();
5545 
5546 	/*
5547 	 * If the name is a format string the caller wants us to do a
5548 	 * name allocation.
5549 	 */
5550 	if (strchr(dev->name, '%')) {
5551 		err = dev_alloc_name(dev, dev->name);
5552 		if (err < 0)
5553 			goto out;
5554 	}
5555 
5556 	err = register_netdevice(dev);
5557 out:
5558 	rtnl_unlock();
5559 	return err;
5560 }
5561 EXPORT_SYMBOL(register_netdev);
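
/*
 * Illustrative sketch (not part of this file): the common allocate/register/
 * unregister/free life cycle for an Ethernet driver.  struct my_priv,
 * my_netdev_ops, my_attach() and my_detach() are hypothetical.
 *
 *	static int my_attach(void)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct my_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *
 *		dev->netdev_ops = &my_netdev_ops;
 *		random_ether_addr(dev->dev_addr);
 *
 *		err = register_netdev(dev);
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 *
 *	static void my_detach(struct net_device *dev)
 *	{
 *		unregister_netdev(dev);
 *		free_netdev(dev);
 *	}
 */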
5562 
5563 int netdev_refcnt_read(const struct net_device *dev)
5564 {
5565 	int i, refcnt = 0;
5566 
5567 	for_each_possible_cpu(i)
5568 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5569 	return refcnt;
5570 }
5571 EXPORT_SYMBOL(netdev_refcnt_read);
5572 
5573 /*
5574  * netdev_wait_allrefs - wait until all references are gone.
5575  *
5576  * This is called when unregistering network devices.
5577  *
5578  * Any protocol or device that holds a reference should register
5579  * for netdevice notification, and cleanup and put back the
5580  * reference if they receive an UNREGISTER event.
5581  * We can get stuck here if buggy protocols don't correctly
5582  * call dev_put.
5583  */
5584 static void netdev_wait_allrefs(struct net_device *dev)
5585 {
5586 	unsigned long rebroadcast_time, warning_time;
5587 	int refcnt;
5588 
5589 	linkwatch_forget_dev(dev);
5590 
5591 	rebroadcast_time = warning_time = jiffies;
5592 	refcnt = netdev_refcnt_read(dev);
5593 
5594 	while (refcnt != 0) {
5595 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5596 			rtnl_lock();
5597 
5598 			/* Rebroadcast unregister notification */
5599 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5600 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5601 			 * should have already handled it the first time */
5602 
5603 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5604 				     &dev->state)) {
5605 				/* We must not have linkwatch events
5606 				 * pending on unregister. If this
5607 				 * happens, we simply run the queue
5608 				 * unscheduled, resulting in a noop
5609 				 * for this device.
5610 				 */
5611 				linkwatch_run_queue();
5612 			}
5613 
5614 			__rtnl_unlock();
5615 
5616 			rebroadcast_time = jiffies;
5617 		}
5618 
5619 		msleep(250);
5620 
5621 		refcnt = netdev_refcnt_read(dev);
5622 
5623 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5624 			printk(KERN_EMERG "unregister_netdevice: "
5625 			       "waiting for %s to become free. Usage "
5626 			       "count = %d\n",
5627 			       dev->name, refcnt);
5628 			warning_time = jiffies;
5629 		}
5630 	}
5631 }
5632 
5633 /* The sequence is:
5634  *
5635  *	rtnl_lock();
5636  *	...
5637  *	register_netdevice(x1);
5638  *	register_netdevice(x2);
5639  *	...
5640  *	unregister_netdevice(y1);
5641  *	unregister_netdevice(y2);
5642  *      ...
5643  *	rtnl_unlock();
5644  *	free_netdev(y1);
5645  *	free_netdev(y2);
5646  *
5647  * We are invoked by rtnl_unlock().
5648  * This allows us to deal with problems:
5649  * 1) We can delete sysfs objects which invoke hotplug
5650  *    without deadlocking with linkwatch via keventd.
5651  * 2) Since we run with the RTNL semaphore not held, we can sleep
5652  *    safely in order to wait for the netdev refcnt to drop to zero.
5653  *
5654  * We must not return until all unregister events added during
5655  * the interval the lock was held have been completed.
5656  */
5657 void netdev_run_todo(void)
5658 {
5659 	struct list_head list;
5660 
5661 	/* Snapshot list, allow later requests */
5662 	list_replace_init(&net_todo_list, &list);
5663 
5664 	__rtnl_unlock();
5665 
5666 	while (!list_empty(&list)) {
5667 		struct net_device *dev
5668 			= list_first_entry(&list, struct net_device, todo_list);
5669 		list_del(&dev->todo_list);
5670 
5671 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5672 			printk(KERN_ERR "network todo '%s' but state %d\n",
5673 			       dev->name, dev->reg_state);
5674 			dump_stack();
5675 			continue;
5676 		}
5677 
5678 		dev->reg_state = NETREG_UNREGISTERED;
5679 
5680 		on_each_cpu(flush_backlog, dev, 1);
5681 
5682 		netdev_wait_allrefs(dev);
5683 
5684 		/* paranoia */
5685 		BUG_ON(netdev_refcnt_read(dev));
5686 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5687 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5688 		WARN_ON(dev->dn_ptr);
5689 
5690 		if (dev->destructor)
5691 			dev->destructor(dev);
5692 
5693 		/* Free network device */
5694 		kobject_put(&dev->dev.kobj);
5695 	}
5696 }
5697 
5698 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5699  * fields in the same order, with only the type differing.
5700  */
5701 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5702 				    const struct net_device_stats *netdev_stats)
5703 {
5704 #if BITS_PER_LONG == 64
5705 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5706 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5707 #else
5708 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5709 	const unsigned long *src = (const unsigned long *)netdev_stats;
5710 	u64 *dst = (u64 *)stats64;
5711 
5712 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5713 		     sizeof(*stats64) / sizeof(u64));
5714 	for (i = 0; i < n; i++)
5715 		dst[i] = src[i];
5716 #endif
5717 }
5718 
5719 /**
5720  *	dev_get_stats	- get network device statistics
5721  *	@dev: device to get statistics from
5722  *	@storage: place to store stats
5723  *
5724  *	Get network statistics from device. Return @storage.
5725  *	The device driver may provide its own method by setting
5726  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5727  *	otherwise the internal statistics structure is used.
5728  */
5729 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5730 					struct rtnl_link_stats64 *storage)
5731 {
5732 	const struct net_device_ops *ops = dev->netdev_ops;
5733 
5734 	if (ops->ndo_get_stats64) {
5735 		memset(storage, 0, sizeof(*storage));
5736 		ops->ndo_get_stats64(dev, storage);
5737 	} else if (ops->ndo_get_stats) {
5738 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5739 	} else {
5740 		netdev_stats_to_stats64(storage, &dev->stats);
5741 	}
5742 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5743 	return storage;
5744 }
5745 EXPORT_SYMBOL(dev_get_stats);
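
/*
 * Illustrative sketch (not part of this file): a driver-side
 * ndo_get_stats64() as consumed by dev_get_stats() above.  @storage arrives
 * zeroed and the hook returns it; struct my_priv and its counters are
 * hypothetical.
 *
 *	static struct rtnl_link_stats64 *
 *	my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		storage->rx_packets = priv->rx_packets;
 *		storage->rx_bytes   = priv->rx_bytes;
 *		storage->tx_packets = priv->tx_packets;
 *		storage->tx_bytes   = priv->tx_bytes;
 *		return storage;
 *	}
 */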
5746 
5747 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5748 {
5749 	struct netdev_queue *queue = dev_ingress_queue(dev);
5750 
5751 #ifdef CONFIG_NET_CLS_ACT
5752 	if (queue)
5753 		return queue;
5754 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5755 	if (!queue)
5756 		return NULL;
5757 	netdev_init_one_queue(dev, queue, NULL);
5758 	queue->qdisc = &noop_qdisc;
5759 	queue->qdisc_sleeping = &noop_qdisc;
5760 	rcu_assign_pointer(dev->ingress_queue, queue);
5761 #endif
5762 	return queue;
5763 }
5764 
5765 /**
5766  *	alloc_netdev_mqs - allocate network device
5767  *	@sizeof_priv:	size of private data to allocate space for
5768  *	@name:		device name format string
5769  *	@setup:		callback to initialize device
5770  *	@txqs:		the number of TX subqueues to allocate
5771  *	@rxqs:		the number of RX subqueues to allocate
5772  *
5773  *	Allocates a struct net_device with private data area for driver use
5774  *	and performs basic initialization.  Also allocates subqueue structs
5775  *	for each queue on the device.
5776  */
5777 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5778 		void (*setup)(struct net_device *),
5779 		unsigned int txqs, unsigned int rxqs)
5780 {
5781 	struct net_device *dev;
5782 	size_t alloc_size;
5783 	struct net_device *p;
5784 
5785 	BUG_ON(strlen(name) >= sizeof(dev->name));
5786 
5787 	if (txqs < 1) {
5788 		pr_err("alloc_netdev: Unable to allocate device "
5789 		       "with zero queues.\n");
5790 		return NULL;
5791 	}
5792 
5793 #ifdef CONFIG_RPS
5794 	if (rxqs < 1) {
5795 		pr_err("alloc_netdev: Unable to allocate device "
5796 		       "with zero RX queues.\n");
5797 		return NULL;
5798 	}
5799 #endif
5800 
5801 	alloc_size = sizeof(struct net_device);
5802 	if (sizeof_priv) {
5803 		/* ensure 32-byte alignment of private area */
5804 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5805 		alloc_size += sizeof_priv;
5806 	}
5807 	/* ensure 32-byte alignment of whole construct */
5808 	alloc_size += NETDEV_ALIGN - 1;
5809 
5810 	p = kzalloc(alloc_size, GFP_KERNEL);
5811 	if (!p) {
5812 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5813 		return NULL;
5814 	}
5815 
5816 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5817 	dev->padded = (char *)dev - (char *)p;
5818 
5819 	dev->pcpu_refcnt = alloc_percpu(int);
5820 	if (!dev->pcpu_refcnt)
5821 		goto free_p;
5822 
5823 	if (dev_addr_init(dev))
5824 		goto free_pcpu;
5825 
5826 	dev_mc_init(dev);
5827 	dev_uc_init(dev);
5828 
5829 	dev_net_set(dev, &init_net);
5830 
5831 	dev->gso_max_size = GSO_MAX_SIZE;
5832 
5833 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5834 	dev->ethtool_ntuple_list.count = 0;
5835 	INIT_LIST_HEAD(&dev->napi_list);
5836 	INIT_LIST_HEAD(&dev->unreg_list);
5837 	INIT_LIST_HEAD(&dev->link_watch_list);
5838 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5839 	setup(dev);
5840 
5841 	dev->num_tx_queues = txqs;
5842 	dev->real_num_tx_queues = txqs;
5843 	if (netif_alloc_netdev_queues(dev))
5844 		goto free_all;
5845 
5846 #ifdef CONFIG_RPS
5847 	dev->num_rx_queues = rxqs;
5848 	dev->real_num_rx_queues = rxqs;
5849 	if (netif_alloc_rx_queues(dev))
5850 		goto free_all;
5851 #endif
5852 
5853 	strcpy(dev->name, name);
5854 	dev->group = INIT_NETDEV_GROUP;
5855 	return dev;
5856 
5857 free_all:
5858 	free_netdev(dev);
5859 	return NULL;
5860 
5861 free_pcpu:
5862 	free_percpu(dev->pcpu_refcnt);
5863 	kfree(dev->_tx);
5864 #ifdef CONFIG_RPS
5865 	kfree(dev->_rx);
5866 #endif
5867 
5868 free_p:
5869 	kfree(p);
5870 	return NULL;
5871 }
5872 EXPORT_SYMBOL(alloc_netdev_mqs);
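
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * how a driver might allocate a multiqueue device with alloc_netdev_mqs().
 * struct example_priv, example_alloc() and the queue counts are
 * hypothetical; ether_setup() is the usual setup callback for
 * Ethernet-like devices.
 */
struct example_priv {
	int example_field;
};

static struct net_device *example_alloc(void)
{
	struct net_device *dev;
	struct example_priv *priv;

	/* 4 TX and 4 RX queues; the name is an "eth%d"-style format string */
	dev = alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
			       ether_setup, 4, 4);
	if (!dev)
		return NULL;

	/* the zeroed private area sits after the aligned net_device */
	priv = netdev_priv(dev);
	priv->example_field = 1;
	return dev;
}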
5873 
5874 /**
5875  *	free_netdev - free network device
5876  *	@dev: device
5877  *
5878  *	This function does the last stage of destroying an allocated device
5879  * 	interface. The reference to the device object is released.
5880  *	If this is the last reference then it will be freed.
5881  */
5882 void free_netdev(struct net_device *dev)
5883 {
5884 	struct napi_struct *p, *n;
5885 
5886 	release_net(dev_net(dev));
5887 
5888 	kfree(dev->_tx);
5889 #ifdef CONFIG_RPS
5890 	kfree(dev->_rx);
5891 #endif
5892 
5893 	kfree(rcu_dereference_raw(dev->ingress_queue));
5894 
5895 	/* Flush device addresses */
5896 	dev_addr_flush(dev);
5897 
5898 	/* Clear ethtool n-tuple list */
5899 	ethtool_ntuple_flush(dev);
5900 
5901 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5902 		netif_napi_del(p);
5903 
5904 	free_percpu(dev->pcpu_refcnt);
5905 	dev->pcpu_refcnt = NULL;
5906 
5907 	/*  Compatibility with error handling in drivers */
5908 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5909 		kfree((char *)dev - dev->padded);
5910 		return;
5911 	}
5912 
5913 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5914 	dev->reg_state = NETREG_RELEASED;
5915 
5916 	/* will free via device release */
5917 	put_device(&dev->dev);
5918 }
5919 EXPORT_SYMBOL(free_netdev);
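
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * the usual pairing of free_netdev() with a failed registration.
 * example_probe() and the "example%d" name are hypothetical.
 */
static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(0, "example%d", ether_setup);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		/* free_netdev() is the correct cleanup for any
		 * register_netdev() failure; see the reg_state
		 * handling above.
		 */
		free_netdev(dev);
		return err;
	}
	return 0;
}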
5920 
5921 /**
5922  *	synchronize_net -  Synchronize with packet receive processing
5923  *
5924  *	Wait for packets currently being received to be done.
5925  *	Does not block later packets from starting.
5926  */
5927 void synchronize_net(void)
5928 {
5929 	might_sleep();
5930 	synchronize_rcu();
5931 }
5932 EXPORT_SYMBOL(synchronize_net);
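
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * the common synchronize_net() pattern.  After unhooking a packet handler
 * with __dev_remove_pack(), wait for in-flight receive paths to finish
 * before freeing the handler's state.  example_teardown() and
 * example_state are hypothetical.
 */
static void example_teardown(struct packet_type *pt, void *example_state)
{
	__dev_remove_pack(pt);	/* unhook; does not wait for receivers */
	synchronize_net();	/* wait out current RX softirq users */
	kfree(example_state);	/* now safe to free handler state */
}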
5933 
5934 /**
5935  *	unregister_netdevice_queue - remove device from the kernel
5936  *	@dev: device
5937  *	@head: list
5938  *
5939  *	This function shuts down a device interface and removes it
5940  *	from the kernel tables.
5941  *	If head is not NULL, the device is queued to be unregistered later.
5942  *
5943  *	Callers must hold the rtnl semaphore.  You may want
5944  *	unregister_netdev() instead of this.
5945  */
5946 
5947 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5948 {
5949 	ASSERT_RTNL();
5950 
5951 	if (head) {
5952 		list_move_tail(&dev->unreg_list, head);
5953 	} else {
5954 		rollback_registered(dev);
5955 		/* Finish processing unregister after unlock */
5956 		net_set_todo(dev);
5957 	}
5958 }
5959 EXPORT_SYMBOL(unregister_netdevice_queue);
5960 
5961 /**
5962  *	unregister_netdevice_many - unregister many devices
5963  *	@head: list of devices
5964  */
5965 void unregister_netdevice_many(struct list_head *head)
5966 {
5967 	struct net_device *dev;
5968 
5969 	if (!list_empty(head)) {
5970 		rollback_registered_many(head);
5971 		list_for_each_entry(dev, head, unreg_list)
5972 			net_set_todo(dev);
5973 	}
5974 }
5975 EXPORT_SYMBOL(unregister_netdevice_many);
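
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * batching several unregisters so the notifier and RCU work in
 * rollback_registered_many() is shared across the whole group.
 * example_unregister_batch() is hypothetical; RTNL must be held by the
 * caller.
 */
static void example_unregister_batch(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	ASSERT_RTNL();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);

	/* one pass finishes the whole batch */
	unregister_netdevice_many(&kill_list);
}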
5976 
5977 /**
5978  *	unregister_netdev - remove device from the kernel
5979  *	@dev: device
5980  *
5981  *	This function shuts down a device interface and removes it
5982  *	from the kernel tables.
5983  *
5984  *	This is just a wrapper for unregister_netdevice that takes
5985  *	the rtnl semaphore.  In general you want to use this and not
5986  *	unregister_netdevice.
5987  */
5988 void unregister_netdev(struct net_device *dev)
5989 {
5990 	rtnl_lock();
5991 	unregister_netdevice(dev);
5992 	rtnl_unlock();
5993 }
5994 EXPORT_SYMBOL(unregister_netdev);
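
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * the canonical driver teardown order.  example_remove() is hypothetical.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);	/* takes and releases the rtnl semaphore */
	free_netdev(dev);	/* drop the reference held since allocation */
}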
5995 
5996 /**
5997  *	dev_change_net_namespace - move device to a different network namespace
5998  *	@dev: device
5999  *	@net: network namespace
6000  *	@pat: If not NULL, name pattern to try if the current device name
6001  *	      is already taken in the destination network namespace.
6002  *
6003  *	This function shuts down a device interface and moves it
6004  *	to a new network namespace. On success 0 is returned, on
6005  *	failure a negative errno code is returned.
6006  *
6007  *	Callers must hold the rtnl semaphore.
6008  */
6009 
6010 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6011 {
6012 	int err;
6013 
6014 	ASSERT_RTNL();
6015 
6016 	/* Don't allow namespace local devices to be moved. */
6017 	err = -EINVAL;
6018 	if (dev->features & NETIF_F_NETNS_LOCAL)
6019 		goto out;
6020 
6021 	/* Ensure the device has been registered */
6022 	err = -EINVAL;
6023 	if (dev->reg_state != NETREG_REGISTERED)
6024 		goto out;
6025 
6026 	/* Get out if there is nothing to do */
6027 	err = 0;
6028 	if (net_eq(dev_net(dev), net))
6029 		goto out;
6030 
6031 	/* Pick the destination device name, and ensure
6032 	 * we can use it in the destination network namespace.
6033 	 */
6034 	err = -EEXIST;
6035 	if (__dev_get_by_name(net, dev->name)) {
6036 		/* We get here if we can't use the current device name */
6037 		if (!pat)
6038 			goto out;
6039 		if (dev_get_valid_name(dev, pat, 1))
6040 			goto out;
6041 	}
6042 
6043 	/*
6044 	 * And now a mini version of register_netdevice and unregister_netdevice.
6045 	 */
6046 
6047 	/* If device is running close it first. */
6048 	dev_close(dev);
6049 
6050 	/* And unlink it from device chain */
6051 	err = -ENODEV;
6052 	unlist_netdevice(dev);
6053 
6054 	synchronize_net();
6055 
6056 	/* Shutdown queueing discipline. */
6057 	dev_shutdown(dev);
6058 
6059 	/* Notify protocols that we are about to destroy
6060 	   this device. They should clean up all of their state.
6061 
6062 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6063 	   This is intentional: this way 8021q and macvlan know
6064 	   the device is just moving and can keep their slaves up.
6065 	*/
6066 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6067 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6068 
6069 	/*
6070 	 *	Flush the unicast and multicast chains
6071 	 */
6072 	dev_uc_flush(dev);
6073 	dev_mc_flush(dev);
6074 
6075 	/* Actually switch the network namespace */
6076 	dev_net_set(dev, net);
6077 
6078 	/* If there is an ifindex conflict assign a new one */
6079 	if (__dev_get_by_index(net, dev->ifindex)) {
6080 		int iflink = (dev->iflink == dev->ifindex);
6081 		dev->ifindex = dev_new_index(net);
6082 		if (iflink)
6083 			dev->iflink = dev->ifindex;
6084 	}
6085 
6086 	/* Fixup kobjects */
6087 	err = device_rename(&dev->dev, dev->name);
6088 	WARN_ON(err);
6089 
6090 	/* Add the device back in the hashes */
6091 	list_netdevice(dev);
6092 
6093 	/* Notify protocols, that a new device appeared. */
6094 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6095 
6096 	/*
6097 	 *	Prevent userspace races by waiting until the network
6098 	 *	device is fully set up before sending notifications.
6099 	 */
6100 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6101 
6102 	synchronize_net();
6103 	err = 0;
6104 out:
6105 	return err;
6106 }
6107 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
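
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * moving a device into another namespace under RTNL, falling back to a
 * "dev%d"-style name if the current name is already taken there, much like
 * default_device_exit() below.  example_move() is hypothetical.
 */
static int example_move(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}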
6108 
6109 static int dev_cpu_callback(struct notifier_block *nfb,
6110 			    unsigned long action,
6111 			    void *ocpu)
6112 {
6113 	struct sk_buff **list_skb;
6114 	struct sk_buff *skb;
6115 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6116 	struct softnet_data *sd, *oldsd;
6117 
6118 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6119 		return NOTIFY_OK;
6120 
6121 	local_irq_disable();
6122 	cpu = smp_processor_id();
6123 	sd = &per_cpu(softnet_data, cpu);
6124 	oldsd = &per_cpu(softnet_data, oldcpu);
6125 
6126 	/* Find end of our completion_queue. */
6127 	list_skb = &sd->completion_queue;
6128 	while (*list_skb)
6129 		list_skb = &(*list_skb)->next;
6130 	/* Append completion queue from offline CPU. */
6131 	*list_skb = oldsd->completion_queue;
6132 	oldsd->completion_queue = NULL;
6133 
6134 	/* Append output queue from offline CPU. */
6135 	if (oldsd->output_queue) {
6136 		*sd->output_queue_tailp = oldsd->output_queue;
6137 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6138 		oldsd->output_queue = NULL;
6139 		oldsd->output_queue_tailp = &oldsd->output_queue;
6140 	}
6141 
6142 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6143 	local_irq_enable();
6144 
6145 	/* Process offline CPU's input_pkt_queue */
6146 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6147 		netif_rx(skb);
6148 		input_queue_head_incr(oldsd);
6149 	}
6150 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6151 		netif_rx(skb);
6152 		input_queue_head_incr(oldsd);
6153 	}
6154 
6155 	return NOTIFY_OK;
6156 }
6157 
6158 
6159 /**
6160  *	netdev_increment_features - increment feature set by one
6161  *	@all: current feature set
6162  *	@one: new feature set
6163  *	@mask: mask feature set
6164  *
6165  *	Computes a new feature set after adding a device with feature set
6166  *	@one to the master device with current feature set @all.  Will not
6167  *	enable anything that is off in @mask. Returns the new feature set.
6168  */
6169 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6170 {
6171 	/* If device needs checksumming, downgrade to it. */
6172 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6173 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6174 	else if (mask & NETIF_F_ALL_CSUM) {
6175 		/* If one device supports v4/v6 checksumming, set for all. */
6176 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6177 		    !(all & NETIF_F_GEN_CSUM)) {
6178 			all &= ~NETIF_F_ALL_CSUM;
6179 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6180 		}
6181 
6182 		/* If one device supports hw checksumming, set for all. */
6183 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6184 			all &= ~NETIF_F_ALL_CSUM;
6185 			all |= NETIF_F_HW_CSUM;
6186 		}
6187 	}
6188 
6189 	one |= NETIF_F_ALL_CSUM;
6190 
6191 	one |= all & NETIF_F_ONE_FOR_ALL;
6192 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6193 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6194 
6195 	return all;
6196 }
6197 EXPORT_SYMBOL(netdev_increment_features);
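
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * how a master device might fold each slave's feature set into its own, in
 * the style of the bonding driver.  example_compute_features(), the slave
 * array and the seed value are hypothetical.
 */
static u32 example_compute_features(u32 mask, struct net_device *slaves[], int n)
{
	u32 features = mask;
	int i;

	/* fold each slave into the accumulated feature set */
	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);
	return features;
}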
6198 
6199 static struct hlist_head *netdev_create_hash(void)
6200 {
6201 	int i;
6202 	struct hlist_head *hash;
6203 
6204 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6205 	if (hash != NULL)
6206 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6207 			INIT_HLIST_HEAD(&hash[i]);
6208 
6209 	return hash;
6210 }
6211 
6212 /* Initialize per network namespace state */
6213 static int __net_init netdev_init(struct net *net)
6214 {
6215 	INIT_LIST_HEAD(&net->dev_base_head);
6216 
6217 	net->dev_name_head = netdev_create_hash();
6218 	if (net->dev_name_head == NULL)
6219 		goto err_name;
6220 
6221 	net->dev_index_head = netdev_create_hash();
6222 	if (net->dev_index_head == NULL)
6223 		goto err_idx;
6224 
6225 	return 0;
6226 
6227 err_idx:
6228 	kfree(net->dev_name_head);
6229 err_name:
6230 	return -ENOMEM;
6231 }
6232 
6233 /**
6234  *	netdev_drivername - network driver for the device
6235  *	@dev: network device
6236  *	@buffer: buffer for resulting name
6237  *	@len: size of buffer
6238  *
6239  *	Determine network driver for device.
6240  */
6241 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6242 {
6243 	const struct device_driver *driver;
6244 	const struct device *parent;
6245 
6246 	if (len <= 0 || !buffer)
6247 		return buffer;
6248 	buffer[0] = 0;
6249 
6250 	parent = dev->dev.parent;
6251 
6252 	if (!parent)
6253 		return buffer;
6254 
6255 	driver = parent->driver;
6256 	if (driver && driver->name)
6257 		strlcpy(buffer, driver->name, len);
6258 	return buffer;
6259 }
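
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * using netdev_drivername() to name the responsible driver in a log
 * message, in the style of the TX watchdog.  example_report() is
 * hypothetical.
 */
static void example_report(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_WARNING "%s (%s): transmit queue timed out\n",
	       dev->name,
	       netdev_drivername(dev, drivername, sizeof(drivername)));
}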
6260 
6261 static int __netdev_printk(const char *level, const struct net_device *dev,
6262 			   struct va_format *vaf)
6263 {
6264 	int r;
6265 
6266 	if (dev && dev->dev.parent)
6267 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6268 			       netdev_name(dev), vaf);
6269 	else if (dev)
6270 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6271 	else
6272 		r = printk("%s(NULL net_device): %pV", level, vaf);
6273 
6274 	return r;
6275 }
6276 
6277 int netdev_printk(const char *level, const struct net_device *dev,
6278 		  const char *format, ...)
6279 {
6280 	struct va_format vaf;
6281 	va_list args;
6282 	int r;
6283 
6284 	va_start(args, format);
6285 
6286 	vaf.fmt = format;
6287 	vaf.va = &args;
6288 
6289 	r = __netdev_printk(level, dev, &vaf);
6290 	va_end(args);
6291 
6292 	return r;
6293 }
6294 EXPORT_SYMBOL(netdev_printk);
6295 
6296 #define define_netdev_printk_level(func, level)			\
6297 int func(const struct net_device *dev, const char *fmt, ...)	\
6298 {								\
6299 	int r;							\
6300 	struct va_format vaf;					\
6301 	va_list args;						\
6302 								\
6303 	va_start(args, fmt);					\
6304 								\
6305 	vaf.fmt = fmt;						\
6306 	vaf.va = &args;						\
6307 								\
6308 	r = __netdev_printk(level, dev, &vaf);			\
6309 	va_end(args);						\
6310 								\
6311 	return r;						\
6312 }								\
6313 EXPORT_SYMBOL(func);
6314 
6315 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6316 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6317 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6318 define_netdev_printk_level(netdev_err, KERN_ERR);
6319 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6320 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6321 define_netdev_printk_level(netdev_info, KERN_INFO);
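
/*
 * Illustrative sketch (editorial addition, not part of this source file):
 * the per-level helpers generated above prefix driver messages with device
 * information and the interface name.  example_log() and example_err are
 * hypothetical.
 */
static void example_log(struct net_device *dev, int example_err)
{
	if (example_err)
		netdev_err(dev, "reset failed, error %d\n", example_err);
	else
		netdev_info(dev, "link is up\n");
}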
6322 
6323 static void __net_exit netdev_exit(struct net *net)
6324 {
6325 	kfree(net->dev_name_head);
6326 	kfree(net->dev_index_head);
6327 }
6328 
6329 static struct pernet_operations __net_initdata netdev_net_ops = {
6330 	.init = netdev_init,
6331 	.exit = netdev_exit,
6332 };
6333 
6334 static void __net_exit default_device_exit(struct net *net)
6335 {
6336 	struct net_device *dev, *aux;
6337 	/*
6338 	 * Push all migratable network devices back to the
6339 	 * initial network namespace
6340 	 */
6341 	rtnl_lock();
6342 	for_each_netdev_safe(net, dev, aux) {
6343 		int err;
6344 		char fb_name[IFNAMSIZ];
6345 
6346 		/* Ignore unmovable devices (e.g. loopback) */
6347 		if (dev->features & NETIF_F_NETNS_LOCAL)
6348 			continue;
6349 
6350 		/* Leave virtual devices for the generic cleanup */
6351 		if (dev->rtnl_link_ops)
6352 			continue;
6353 
6354 		/* Push remaining network devices to init_net */
6355 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6356 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6357 		if (err) {
6358 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6359 				__func__, dev->name, err);
6360 			BUG();
6361 		}
6362 	}
6363 	rtnl_unlock();
6364 }
6365 
6366 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6367 {
6368 	/* At exit all network devices must be removed from a network
6369 	 * namespace.  Do this in the reverse order of registration.
6370 	 * Do this across as many network namespaces as possible to
6371 	 * improve batching efficiency.
6372 	 */
6373 	struct net_device *dev;
6374 	struct net *net;
6375 	LIST_HEAD(dev_kill_list);
6376 
6377 	rtnl_lock();
6378 	list_for_each_entry(net, net_list, exit_list) {
6379 		for_each_netdev_reverse(net, dev) {
6380 			if (dev->rtnl_link_ops)
6381 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6382 			else
6383 				unregister_netdevice_queue(dev, &dev_kill_list);
6384 		}
6385 	}
6386 	unregister_netdevice_many(&dev_kill_list);
6387 	list_del(&dev_kill_list);
6388 	rtnl_unlock();
6389 }
6390 
6391 static struct pernet_operations __net_initdata default_device_ops = {
6392 	.exit = default_device_exit,
6393 	.exit_batch = default_device_exit_batch,
6394 };
6395 
6396 /*
6397  *	Initialize the DEV module. At boot time this walks the device list and
6398  *	unhooks any devices that fail to initialise (normally hardware not
6399  *	present) and leaves us with a valid list of present and active devices.
6400  *
6401  */
6402 
6403 /*
6404  *       This is called single threaded during boot, so no need
6405  *       to take the rtnl semaphore.
6406  */
6407 static int __init net_dev_init(void)
6408 {
6409 	int i, rc = -ENOMEM;
6410 
6411 	BUG_ON(!dev_boot_phase);
6412 
6413 	if (dev_proc_init())
6414 		goto out;
6415 
6416 	if (netdev_kobject_init())
6417 		goto out;
6418 
6419 	INIT_LIST_HEAD(&ptype_all);
6420 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6421 		INIT_LIST_HEAD(&ptype_base[i]);
6422 
6423 	if (register_pernet_subsys(&netdev_net_ops))
6424 		goto out;
6425 
6426 	/*
6427 	 *	Initialise the packet receive queues.
6428 	 */
6429 
6430 	for_each_possible_cpu(i) {
6431 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6432 
6433 		memset(sd, 0, sizeof(*sd));
6434 		skb_queue_head_init(&sd->input_pkt_queue);
6435 		skb_queue_head_init(&sd->process_queue);
6436 		sd->completion_queue = NULL;
6437 		INIT_LIST_HEAD(&sd->poll_list);
6438 		sd->output_queue = NULL;
6439 		sd->output_queue_tailp = &sd->output_queue;
6440 #ifdef CONFIG_RPS
6441 		sd->csd.func = rps_trigger_softirq;
6442 		sd->csd.info = sd;
6443 		sd->csd.flags = 0;
6444 		sd->cpu = i;
6445 #endif
6446 
6447 		sd->backlog.poll = process_backlog;
6448 		sd->backlog.weight = weight_p;
6449 		sd->backlog.gro_list = NULL;
6450 		sd->backlog.gro_count = 0;
6451 	}
6452 
6453 	dev_boot_phase = 0;
6454 
6455 	/* The loopback device is special: if any other network device
6456 	 * is present in a network namespace, the loopback device must
6457 	 * be present as well. Since we now dynamically allocate and
6458 	 * free the loopback device, maintain this invariant by
6459 	 * keeping the loopback device as the first device on the
6460 	 * list of network devices, ensuring that the loopback device
6461 	 * is the first device that appears and the last network
6462 	 * device that disappears.
6463 	 */
6464 	if (register_pernet_device(&loopback_net_ops))
6465 		goto out;
6466 
6467 	if (register_pernet_device(&default_device_ops))
6468 		goto out;
6469 
6470 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6471 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6472 
6473 	hotcpu_notifier(dev_cpu_callback, 0);
6474 	dst_init();
6475 	dev_mcast_init();
6476 	rc = 0;
6477 out:
6478 	return rc;
6479 }
6480 
6481 subsys_initcall(net_dev_init);
6482 
6483 static int __init initialize_hashrnd(void)
6484 {
6485 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6486 	return 0;
6487 }
6488 
6489 late_initcall_sync(initialize_hashrnd);
6490 
6491