xref: /linux-6.15/net/core/dev.c (revision 2e4c77be)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16? Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
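/*
 * A worked example of the bucket arithmetic, using values from the table
 * above: IPv4 frames (0x0800) hash to bucket 0x0800 & PTYPE_HASH_MASK = 0x0
 * and ARP (0x0806) to bucket 0x6, while RARP (0x8035), SNAP (0x0005) and
 * X.25 (0x0805) all land in bucket 0x5 - the overlap the comment refers to.
 */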
168 
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * For example usages, see register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
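/*
 * A sketch of the "pure reader" pattern described above, assuming a
 * struct net *net is in scope; the list is walked without taking a
 * reference on any entry:
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(net, dev) {
 *		...
 *	}
 *	read_unlock(&dev_base_lock);
 *
 * dev_get_by_flags() further down in this file is a real instance of this
 * pattern.
 */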
195 
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
212 {
213 	struct net *net = dev_net(dev);
214 
215 	ASSERT_RTNL();
216 
217 	write_lock_bh(&dev_base_lock);
218 	list_add_tail(&dev->dev_list, &net->dev_base_head);
219 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
220 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 	ASSERT_RTNL();
229 
230 	/* Unlink dev from the device chain */
231 	write_lock_bh(&dev_base_lock);
232 	list_del(&dev->dev_list);
233 	hlist_del(&dev->name_hlist);
234 	hlist_del(&dev->index_hlist);
235 	write_unlock_bh(&dev_base_lock);
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 }
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 }
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers that mangle input packets
345  *	MUST BE last in the hash buckets, and handlers that merely
346  *	inspect packets MUST start from the promiscuous ptype_all chain
347  *	in net_bh.  This is true now; do not change it.
348  *	Explanation follows: if a protocol handler that mangles packets
349  *	were first on the list, it could not tell that the packet is
350  *	cloned and should be copied-on-write, so it would change the
351  *	packet in place and subsequent readers would get a broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep, therefore it cannot
364  *	guarantee that all CPUs that are in the middle of receiving packets
365  *	will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
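/*
 * A minimal sketch of registering and removing a handler with the calls
 * above and below; the handler my_rcv and the packet_type instance are
 * hypothetical and not part of this file:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype = {
 *		.type	= __constant_htons(ETH_P_IP),
 *		.func	= my_rcv,
 *	};
 *
 * A module would then call dev_add_pack(&my_ptype) from its init routine
 * and dev_remove_pack(&my_ptype) from its exit routine.
 */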
381 
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPUs have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432 	__dev_remove_pack(pt);
433 
434 	synchronize_net();
435 }
436 
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds a new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine for
453  *	all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot time settings for the device.
478  *	Any settings found are applied to the device for use
479  *	later during device probing.
480  *	Returns 0 if no settings are found, 1 if they are.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot time settings for the base address of the device.
507  *	Returns the configured base address if settings are found, 1 if
508  *	the device is already registered (so it should not be probed),
509  *	or 0 if no settings are found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If device already registered then return base of 1
521 	 * to indicate not to probe for this interface
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves the settings configured at boot time for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
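/*
 * The boot parameter parsed above takes up to four integers followed by the
 * interface name, in the order netdev_boot_setup() fills in struct ifmap:
 * irq, base_addr, mem_start, mem_end.  A hypothetical command line entry for
 * an ISA-style card might therefore look like
 *
 *	netdev=5,0x340,0,0,eth1
 *
 * which records irq 5 and I/O base 0x340 for eth1, to be picked up later by
 * netdev_boot_setup_check() during probing.
 */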
560 
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
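/*
 * A short usage sketch for the refcounted lookup above; the device name
 * "eth0" is just an example:
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 *
 * Callers that already hold the RTNL or dev_base_lock can use
 * __dev_get_by_name() instead and skip the hold/put pair.
 */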
615 
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns a pointer to the device,
622  *	or %NULL if it is not found. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns a pointer to the device,
648  *	or %NULL if it is not found. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns a pointer to the
672  *	device, or %NULL if it is not found. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count
674  *	increased and the caller must therefore be careful about locking.
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *
730  *	Search for any interface with the given flags. Returns a pointer to
731  *	the first matching device, or %NULL if none is found. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
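/*
 * Example of the flags lookup above, picking out any interface that is
 * currently up and is not a loopback device (a sketch; error handling
 * elided):
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_flags(net, IFF_UP, IFF_UP | IFF_LOOPBACK);
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 *
 * The mask selects which flag bits are compared; if_flags gives the value
 * those bits must have.
 */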
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
777 
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" - it will try to find a suitable
785  *	id. It scans list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
794 {
795 	int i = 0;
796 	const char *p;
797 	const int max_netdevices = 8*PAGE_SIZE;
798 	unsigned long *inuse;
799 	struct net_device *d;
800 
801 	p = strnchr(name, IFNAMSIZ-1, '%');
802 	if (p) {
803 		/*
804 		 * Verify the string as this thing may have come from
805 		 * the user.  There must be either one "%d" and no other "%"
806 		 * characters.
807 		 */
808 		if (p[1] != 'd' || strchr(p + 2, '%'))
809 			return -EINVAL;
810 
811 		/* Use one page as a bit array of possible slots */
812 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
813 		if (!inuse)
814 			return -ENOMEM;
815 
816 		for_each_netdev(net, d) {
817 			if (!sscanf(d->name, name, &i))
818 				continue;
819 			if (i < 0 || i >= max_netdevices)
820 				continue;
821 
822 			/*  avoid cases where sscanf is not exact inverse of printf */
823 			snprintf(buf, IFNAMSIZ, name, i);
824 			if (!strncmp(buf, d->name, IFNAMSIZ))
825 				set_bit(i, inuse);
826 		}
827 
828 		i = find_first_zero_bit(inuse, max_netdevices);
829 		free_page((unsigned long) inuse);
830 	}
831 
832 	snprintf(buf, IFNAMSIZ, name, i);
833 	if (!__dev_get_by_name(net, buf))
834 		return i;
835 
836 	/* It is possible to run out of possible slots
837 	 * when the name is long and there isn't enough space left
838 	 * for the digits, or if all bits are used.
839 	 */
840 	return -ENFILE;
841 }
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" - it will try to find a suitable
849  *	id. It scans list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
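/*
 * Typical use is from a driver or from dev_change_name() below, with a
 * pattern such as "eth%d"; a sketch:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success dev->name has been filled in with the first free unit
 * (eth0, eth1, ...) and err holds the unit number that was assigned.
 */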
870 
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *	Change the name of a device. Format strings such as "eth%d" may
877  *	be passed for wildcarding.
878  *	for wildcarding.
879  */
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	}
907 	else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
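/*
 * A sketch of renaming an interface from a caller that holds the RTNL; the
 * target name "lan0" is only an example.  The device must be down,
 * otherwise -EBUSY is returned:
 *
 *	rtnl_lock();
 *	err = dev_change_name(dev, "lan0");
 *	rtnl_unlock();
 */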
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from @alias
952  *
953  *	Set ifalias for a device.
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
985 void netdev_features_change(struct net_device *dev)
986 {
987 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
988 }
989 EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Initialize multicasting status
1092 		 */
1093 		dev_set_rx_mode(dev);
1094 
1095 		/*
1096 		 *	Wakeup transmit queue engine
1097 		 */
1098 		dev_activate(dev);
1099 
1100 		/*
1101 		 *	... and announce new interface.
1102 		 */
1103 		call_netdevice_notifiers(NETDEV_UP, dev);
1104 	}
1105 
1106 	return ret;
1107 }
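/*
 * dev_open() and dev_close() below are the in-kernel equivalent of
 * "ifconfig up/down" and must run under the RTNL, as the ASSERT_RTNL()
 * above enforces.  A minimal sketch:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	...
 *	dev_close(dev);
 *	rtnl_unlock();
 */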
1108 
1109 /**
1110  *	dev_close - shutdown an interface.
1111  *	@dev: device to shutdown
1112  *
1113  *	This function moves an active device into down state. A
1114  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1115  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1116  *	chain.
1117  */
1118 int dev_close(struct net_device *dev)
1119 {
1120 	const struct net_device_ops *ops = dev->netdev_ops;
1121 	ASSERT_RTNL();
1122 
1123 	might_sleep();
1124 
1125 	if (!(dev->flags & IFF_UP))
1126 		return 0;
1127 
1128 	/*
1129 	 *	Tell people we are going down, so that they can
1130 	 *	prepare for it while the device is still operating.
1131 	 */
1132 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1133 
1134 	clear_bit(__LINK_STATE_START, &dev->state);
1135 
1136 	/* Synchronize to the scheduled poll. We cannot touch the poll list;
1137 	 * it may even be on a different cpu. So just clear netif_running().
1138 	 *
1139 	 * dev->stop() will invoke napi_disable() on all of its
1140 	 * napi_struct instances on this device.
1141 	 */
1142 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1143 
1144 	dev_deactivate(dev);
1145 
1146 	/*
1147 	 *	Call the device specific close. This cannot fail and is
1148 	 *	only done if the device is UP.
1149 	 *
1150 	 *	We allow it to be called even after a DETACH hot-plug
1151 	 *	event.
1152 	 */
1153 	if (ops->ndo_stop)
1154 		ops->ndo_stop(dev);
1155 
1156 	/*
1157 	 *	Device is now down.
1158 	 */
1159 
1160 	dev->flags &= ~IFF_UP;
1161 
1162 	/*
1163 	 * Tell people we are down
1164 	 */
1165 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1166 
1167 	return 0;
1168 }
1169 
1170 
1171 /**
1172  *	dev_disable_lro - disable Large Receive Offload on a device
1173  *	@dev: device
1174  *
1175  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1176  *	called under RTNL.  This is needed if received packets may be
1177  *	forwarded to another interface.
1178  */
1179 void dev_disable_lro(struct net_device *dev)
1180 {
1181 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1182 	    dev->ethtool_ops->set_flags) {
1183 		u32 flags = dev->ethtool_ops->get_flags(dev);
1184 		if (flags & ETH_FLAG_LRO) {
1185 			flags &= ~ETH_FLAG_LRO;
1186 			dev->ethtool_ops->set_flags(dev, flags);
1187 		}
1188 	}
1189 	WARN_ON(dev->features & NETIF_F_LRO);
1190 }
1191 EXPORT_SYMBOL(dev_disable_lro);
1192 
1193 
1194 static int dev_boot_phase = 1;
1195 
1196 /*
1197  *	Device change register/unregister. These are not inline or static
1198  *	as we export them to the world.
1199  */
1200 
1201 /**
1202  *	register_netdevice_notifier - register a network notifier block
1203  *	@nb: notifier
1204  *
1205  *	Register a notifier to be called when network device events occur.
1206  *	The notifier passed is linked into the kernel structures and must
1207  *	not be reused until it has been unregistered. A negative errno code
1208  *	is returned on a failure.
1209  *
1210  * 	When registered, all registration and up events are replayed
1211  *	to the new notifier to give it a race-free
1212  *	view of the network device list.
1213  */
1214 
1215 int register_netdevice_notifier(struct notifier_block *nb)
1216 {
1217 	struct net_device *dev;
1218 	struct net_device *last;
1219 	struct net *net;
1220 	int err;
1221 
1222 	rtnl_lock();
1223 	err = raw_notifier_chain_register(&netdev_chain, nb);
1224 	if (err)
1225 		goto unlock;
1226 	if (dev_boot_phase)
1227 		goto unlock;
1228 	for_each_net(net) {
1229 		for_each_netdev(net, dev) {
1230 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1231 			err = notifier_to_errno(err);
1232 			if (err)
1233 				goto rollback;
1234 
1235 			if (!(dev->flags & IFF_UP))
1236 				continue;
1237 
1238 			nb->notifier_call(nb, NETDEV_UP, dev);
1239 		}
1240 	}
1241 
1242 unlock:
1243 	rtnl_unlock();
1244 	return err;
1245 
1246 rollback:
1247 	last = dev;
1248 	for_each_net(net) {
1249 		for_each_netdev(net, dev) {
1250 			if (dev == last)
1251 				break;
1252 
1253 			if (dev->flags & IFF_UP) {
1254 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1255 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1256 			}
1257 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1258 		}
1259 	}
1260 
1261 	raw_notifier_chain_unregister(&netdev_chain, nb);
1262 	goto unlock;
1263 }
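/*
 * A minimal notifier sketch; the callback and block names are hypothetical.
 * With this chain the notifier data pointer is the struct net_device itself:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_DOWN:
 *			printk(KERN_INFO "%s is down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */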
1264 
1265 /**
1266  *	unregister_netdevice_notifier - unregister a network notifier block
1267  *	@nb: notifier
1268  *
1269  *	Unregister a notifier previously registered by
1270  *	register_netdevice_notifier(). The notifier is unlinked from the
1271  *	kernel structures and may then be reused. A negative errno code
1272  *	is returned on a failure.
1273  */
1274 
1275 int unregister_netdevice_notifier(struct notifier_block *nb)
1276 {
1277 	int err;
1278 
1279 	rtnl_lock();
1280 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1281 	rtnl_unlock();
1282 	return err;
1283 }
1284 
1285 /**
1286  *	call_netdevice_notifiers - call all network notifier blocks
1287  *      @val: value passed unmodified to notifier function
1288  *      @dev: net_device pointer passed unmodified to notifier function
1289  *
1290  *	Call all network notifier blocks.  Parameters and return value
1291  *	are as for raw_notifier_call_chain().
1292  */
1293 
1294 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1295 {
1296 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1297 }
1298 
1299 /* When > 0 there are consumers of rx skb time stamps */
1300 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1301 
1302 void net_enable_timestamp(void)
1303 {
1304 	atomic_inc(&netstamp_needed);
1305 }
1306 
1307 void net_disable_timestamp(void)
1308 {
1309 	atomic_dec(&netstamp_needed);
1310 }
1311 
1312 static inline void net_timestamp(struct sk_buff *skb)
1313 {
1314 	if (atomic_read(&netstamp_needed))
1315 		__net_timestamp(skb);
1316 	else
1317 		skb->tstamp.tv64 = 0;
1318 }
1319 
1320 /*
1321  *	Support routine. Sends outgoing frames to any network
1322  *	taps currently in use.
1323  */
1324 
1325 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1326 {
1327 	struct packet_type *ptype;
1328 
1329 	net_timestamp(skb);
1330 
1331 	rcu_read_lock();
1332 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1333 		/* Never send packets back to the socket
1334 		 * they originated from - MvS ([email protected])
1335 		 */
1336 		if ((ptype->dev == dev || !ptype->dev) &&
1337 		    (ptype->af_packet_priv == NULL ||
1338 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1339 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1340 			if (!skb2)
1341 				break;
1342 
1343 			/* skb->nh (the network header) should be correctly
1344 			   set by the sender, so the check below is
1345 			   just protection against buggy protocols.
1346 			 */
1347 			skb_reset_mac_header(skb2);
1348 
1349 			if (skb_network_header(skb2) < skb2->data ||
1350 			    skb2->network_header > skb2->tail) {
1351 				if (net_ratelimit())
1352 					printk(KERN_CRIT "protocol %04x is "
1353 					       "buggy, dev %s\n",
1354 					       skb2->protocol, dev->name);
1355 				skb_reset_network_header(skb2);
1356 			}
1357 
1358 			skb2->transport_header = skb2->network_header;
1359 			skb2->pkt_type = PACKET_OUTGOING;
1360 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1361 		}
1362 	}
1363 	rcu_read_unlock();
1364 }
1365 
1366 
1367 static inline void __netif_reschedule(struct Qdisc *q)
1368 {
1369 	struct softnet_data *sd;
1370 	unsigned long flags;
1371 
1372 	local_irq_save(flags);
1373 	sd = &__get_cpu_var(softnet_data);
1374 	q->next_sched = sd->output_queue;
1375 	sd->output_queue = q;
1376 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1377 	local_irq_restore(flags);
1378 }
1379 
1380 void __netif_schedule(struct Qdisc *q)
1381 {
1382 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1383 		__netif_reschedule(q);
1384 }
1385 EXPORT_SYMBOL(__netif_schedule);
1386 
1387 void dev_kfree_skb_irq(struct sk_buff *skb)
1388 {
1389 	if (atomic_dec_and_test(&skb->users)) {
1390 		struct softnet_data *sd;
1391 		unsigned long flags;
1392 
1393 		local_irq_save(flags);
1394 		sd = &__get_cpu_var(softnet_data);
1395 		skb->next = sd->completion_queue;
1396 		sd->completion_queue = skb;
1397 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1398 		local_irq_restore(flags);
1399 	}
1400 }
1401 EXPORT_SYMBOL(dev_kfree_skb_irq);
1402 
1403 void dev_kfree_skb_any(struct sk_buff *skb)
1404 {
1405 	if (in_irq() || irqs_disabled())
1406 		dev_kfree_skb_irq(skb);
1407 	else
1408 		dev_kfree_skb(skb);
1409 }
1410 EXPORT_SYMBOL(dev_kfree_skb_any);
1411 
1412 
1413 /**
1414  * netif_device_detach - mark device as removed
1415  * @dev: network device
1416  *
1417  * Mark device as removed from system and therefore no longer available.
1418  */
1419 void netif_device_detach(struct net_device *dev)
1420 {
1421 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1422 	    netif_running(dev)) {
1423 		netif_stop_queue(dev);
1424 	}
1425 }
1426 EXPORT_SYMBOL(netif_device_detach);
1427 
1428 /**
1429  * netif_device_attach - mark device as attached
1430  * @dev: network device
1431  *
1432  * Mark device as attached to the system and restart if needed.
1433  */
1434 void netif_device_attach(struct net_device *dev)
1435 {
1436 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1437 	    netif_running(dev)) {
1438 		netif_wake_queue(dev);
1439 		__netdev_watchdog_up(dev);
1440 	}
1441 }
1442 EXPORT_SYMBOL(netif_device_attach);
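/*
 * The detach/attach pair above is typically used around power management
 * events in a driver; a sketch of a PCI driver's suspend/resume handlers
 * (names hypothetical, hardware details elided):
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(netdev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */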
1443 
1444 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1445 {
1446 	return ((features & NETIF_F_GEN_CSUM) ||
1447 		((features & NETIF_F_IP_CSUM) &&
1448 		 protocol == htons(ETH_P_IP)) ||
1449 		((features & NETIF_F_IPV6_CSUM) &&
1450 		 protocol == htons(ETH_P_IPV6)));
1451 }
1452 
1453 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1454 {
1455 	if (can_checksum_protocol(dev->features, skb->protocol))
1456 		return true;
1457 
1458 	if (skb->protocol == htons(ETH_P_8021Q)) {
1459 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1460 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1461 					  veh->h_vlan_encapsulated_proto))
1462 			return true;
1463 	}
1464 
1465 	return false;
1466 }
1467 
1468 /*
1469  * Invalidate hardware checksum when packet is to be mangled, and
1470  * complete checksum manually on outgoing path.
1471  */
1472 int skb_checksum_help(struct sk_buff *skb)
1473 {
1474 	__wsum csum;
1475 	int ret = 0, offset;
1476 
1477 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1478 		goto out_set_summed;
1479 
1480 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1481 		/* Let GSO fix up the checksum. */
1482 		goto out_set_summed;
1483 	}
1484 
1485 	offset = skb->csum_start - skb_headroom(skb);
1486 	BUG_ON(offset >= skb_headlen(skb));
1487 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1488 
1489 	offset += skb->csum_offset;
1490 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1491 
1492 	if (skb_cloned(skb) &&
1493 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1494 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1495 		if (ret)
1496 			goto out;
1497 	}
1498 
1499 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1500 out_set_summed:
1501 	skb->ip_summed = CHECKSUM_NONE;
1502 out:
1503 	return ret;
1504 }
1505 
1506 /**
1507  *	skb_gso_segment - Perform segmentation on skb.
1508  *	@skb: buffer to segment
1509  *	@features: features for the output path (see dev->features)
1510  *
1511  *	This function segments the given skb and returns a list of segments.
1512  *
1513  *	It may return NULL if the skb requires no segmentation.  This is
1514  *	only possible when GSO is used for verifying header integrity.
1515  */
1516 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1517 {
1518 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1519 	struct packet_type *ptype;
1520 	__be16 type = skb->protocol;
1521 	int err;
1522 
1523 	skb_reset_mac_header(skb);
1524 	skb->mac_len = skb->network_header - skb->mac_header;
1525 	__skb_pull(skb, skb->mac_len);
1526 
1527 	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1528 		if (skb_header_cloned(skb) &&
1529 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1530 			return ERR_PTR(err);
1531 	}
1532 
1533 	rcu_read_lock();
1534 	list_for_each_entry_rcu(ptype,
1535 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1536 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1537 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1538 				err = ptype->gso_send_check(skb);
1539 				segs = ERR_PTR(err);
1540 				if (err || skb_gso_ok(skb, features))
1541 					break;
1542 				__skb_push(skb, (skb->data -
1543 						 skb_network_header(skb)));
1544 			}
1545 			segs = ptype->gso_segment(skb, features);
1546 			break;
1547 		}
1548 	}
1549 	rcu_read_unlock();
1550 
1551 	__skb_push(skb, skb->data - skb_mac_header(skb));
1552 
1553 	return segs;
1554 }
1555 
1556 EXPORT_SYMBOL(skb_gso_segment);
1557 
1558 /* Take action when hardware reception checksum errors are detected. */
1559 #ifdef CONFIG_BUG
1560 void netdev_rx_csum_fault(struct net_device *dev)
1561 {
1562 	if (net_ratelimit()) {
1563 		printk(KERN_ERR "%s: hw csum failure.\n",
1564 			dev ? dev->name : "<unknown>");
1565 		dump_stack();
1566 	}
1567 }
1568 EXPORT_SYMBOL(netdev_rx_csum_fault);
1569 #endif
1570 
1571 /* Actually, we should eliminate this check as soon as we know that:
1572  * 1. An IOMMU is present and can map all of memory.
1573  * 2. No high memory really exists on this machine.
1574  */
1575 
1576 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1577 {
1578 #ifdef CONFIG_HIGHMEM
1579 	int i;
1580 
1581 	if (dev->features & NETIF_F_HIGHDMA)
1582 		return 0;
1583 
1584 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1585 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1586 			return 1;
1587 
1588 #endif
1589 	return 0;
1590 }
1591 
1592 struct dev_gso_cb {
1593 	void (*destructor)(struct sk_buff *skb);
1594 };
1595 
1596 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1597 
1598 static void dev_gso_skb_destructor(struct sk_buff *skb)
1599 {
1600 	struct dev_gso_cb *cb;
1601 
1602 	do {
1603 		struct sk_buff *nskb = skb->next;
1604 
1605 		skb->next = nskb->next;
1606 		nskb->next = NULL;
1607 		kfree_skb(nskb);
1608 	} while (skb->next);
1609 
1610 	cb = DEV_GSO_CB(skb);
1611 	if (cb->destructor)
1612 		cb->destructor(skb);
1613 }
1614 
1615 /**
1616  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1617  *	@skb: buffer to segment
1618  *
1619  *	This function segments the given skb and stores the list of segments
1620  *	in skb->next.
1621  */
1622 static int dev_gso_segment(struct sk_buff *skb)
1623 {
1624 	struct net_device *dev = skb->dev;
1625 	struct sk_buff *segs;
1626 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1627 					 NETIF_F_SG : 0);
1628 
1629 	segs = skb_gso_segment(skb, features);
1630 
1631 	/* Verifying header integrity only. */
1632 	if (!segs)
1633 		return 0;
1634 
1635 	if (IS_ERR(segs))
1636 		return PTR_ERR(segs);
1637 
1638 	skb->next = segs;
1639 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1640 	skb->destructor = dev_gso_skb_destructor;
1641 
1642 	return 0;
1643 }
1644 
1645 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1646 			struct netdev_queue *txq)
1647 {
1648 	const struct net_device_ops *ops = dev->netdev_ops;
1649 
1650 	prefetch(&dev->netdev_ops->ndo_start_xmit);
1651 	if (likely(!skb->next)) {
1652 		if (!list_empty(&ptype_all))
1653 			dev_queue_xmit_nit(skb, dev);
1654 
1655 		if (netif_needs_gso(dev, skb)) {
1656 			if (unlikely(dev_gso_segment(skb)))
1657 				goto out_kfree_skb;
1658 			if (skb->next)
1659 				goto gso;
1660 		}
1661 
1662 		return ops->ndo_start_xmit(skb, dev);
1663 	}
1664 
1665 gso:
1666 	do {
1667 		struct sk_buff *nskb = skb->next;
1668 		int rc;
1669 
1670 		skb->next = nskb->next;
1671 		nskb->next = NULL;
1672 		rc = ops->ndo_start_xmit(nskb, dev);
1673 		if (unlikely(rc)) {
1674 			nskb->next = skb->next;
1675 			skb->next = nskb;
1676 			return rc;
1677 		}
1678 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1679 			return NETDEV_TX_BUSY;
1680 	} while (skb->next);
1681 
1682 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1683 
1684 out_kfree_skb:
1685 	kfree_skb(skb);
1686 	return 0;
1687 }
1688 
1689 static u32 simple_tx_hashrnd;
1690 static int simple_tx_hashrnd_initialized = 0;
1691 
1692 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1693 {
1694 	u32 addr1, addr2, ports;
1695 	u32 hash, ihl;
1696 	u8 ip_proto = 0;
1697 
1698 	if (unlikely(!simple_tx_hashrnd_initialized)) {
1699 		get_random_bytes(&simple_tx_hashrnd, 4);
1700 		simple_tx_hashrnd_initialized = 1;
1701 	}
1702 
1703 	switch (skb->protocol) {
1704 	case htons(ETH_P_IP):
1705 		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1706 			ip_proto = ip_hdr(skb)->protocol;
1707 		addr1 = ip_hdr(skb)->saddr;
1708 		addr2 = ip_hdr(skb)->daddr;
1709 		ihl = ip_hdr(skb)->ihl;
1710 		break;
1711 	case htons(ETH_P_IPV6):
1712 		ip_proto = ipv6_hdr(skb)->nexthdr;
1713 		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1714 		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1715 		ihl = (40 >> 2);
1716 		break;
1717 	default:
1718 		return 0;
1719 	}
1720 
1721 
1722 	switch (ip_proto) {
1723 	case IPPROTO_TCP:
1724 	case IPPROTO_UDP:
1725 	case IPPROTO_DCCP:
1726 	case IPPROTO_ESP:
1727 	case IPPROTO_AH:
1728 	case IPPROTO_SCTP:
1729 	case IPPROTO_UDPLITE:
1730 		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1731 		break;
1732 
1733 	default:
1734 		ports = 0;
1735 		break;
1736 	}
1737 
1738 	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1739 
1740 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1741 }
1742 
1743 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1744 					struct sk_buff *skb)
1745 {
1746 	const struct net_device_ops *ops = dev->netdev_ops;
1747 	u16 queue_index = 0;
1748 
1749 	if (ops->ndo_select_queue)
1750 		queue_index = ops->ndo_select_queue(dev, skb);
1751 	else if (dev->real_num_tx_queues > 1)
1752 		queue_index = simple_tx_hash(dev, skb);
1753 
1754 	skb_set_queue_mapping(skb, queue_index);
1755 	return netdev_get_tx_queue(dev, queue_index);
1756 }
1757 
1758 /**
1759  *	dev_queue_xmit - transmit a buffer
1760  *	@skb: buffer to transmit
1761  *
1762  *	Queue a buffer for transmission to a network device. The caller must
1763  *	have set the device and priority and built the buffer before calling
1764  *	this function. The function can be called from an interrupt.
1765  *
1766  *	A negative errno code is returned on a failure. A success does not
1767  *	guarantee the frame will be transmitted as it may be dropped due
1768  *	to congestion or traffic shaping.
1769  *
1770  * -----------------------------------------------------------------------------------
1771  *      I notice this method can also return errors from the queue disciplines,
1772  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1773  *      be positive.
1774  *
1775  *      Regardless of the return value, the skb is consumed, so it is currently
1776  *      difficult to retry a send to this method.  (You can bump the ref count
1777  *      before sending to hold a reference for retry if you are careful.)
1778  *
1779  *      When calling this method, interrupts MUST be enabled.  This is because
1780  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1781  *          --BLG
1782  */
1783 int dev_queue_xmit(struct sk_buff *skb)
1784 {
1785 	struct net_device *dev = skb->dev;
1786 	struct netdev_queue *txq;
1787 	struct Qdisc *q;
1788 	int rc = -ENOMEM;
1789 
1790 	/* GSO will handle the following emulations directly. */
1791 	if (netif_needs_gso(dev, skb))
1792 		goto gso;
1793 
1794 	if (skb_shinfo(skb)->frag_list &&
1795 	    !(dev->features & NETIF_F_FRAGLIST) &&
1796 	    __skb_linearize(skb))
1797 		goto out_kfree_skb;
1798 
1799 	/* A fragmented skb is linearized if the device does not support SG,
1800 	 * or if at least one of the fragments is in highmem and the device
1801 	 * does not support DMA from it.
1802 	 */
1803 	if (skb_shinfo(skb)->nr_frags &&
1804 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1805 	    __skb_linearize(skb))
1806 		goto out_kfree_skb;
1807 
1808 	/* If packet is not checksummed and device does not support
1809 	 * checksumming for this protocol, complete checksumming here.
1810 	 */
1811 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1812 		skb_set_transport_header(skb, skb->csum_start -
1813 					      skb_headroom(skb));
1814 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1815 			goto out_kfree_skb;
1816 	}
1817 
1818 gso:
1819 	/* Disable soft irqs for various locks below. Also
1820 	 * stops preemption for RCU.
1821 	 */
1822 	rcu_read_lock_bh();
1823 
1824 	txq = dev_pick_tx(dev, skb);
1825 	q = rcu_dereference(txq->qdisc);
1826 
1827 #ifdef CONFIG_NET_CLS_ACT
1828 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1829 #endif
1830 	if (q->enqueue) {
1831 		spinlock_t *root_lock = qdisc_lock(q);
1832 
1833 		spin_lock(root_lock);
1834 
1835 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1836 			kfree_skb(skb);
1837 			rc = NET_XMIT_DROP;
1838 		} else {
1839 			rc = qdisc_enqueue_root(skb, q);
1840 			qdisc_run(q);
1841 		}
1842 		spin_unlock(root_lock);
1843 
1844 		goto out;
1845 	}
1846 
1847 	/* The device has no queue. This is the common case for software
1848 	   devices: loopback, all sorts of tunnels...
1849 
1850 	   Really, it is unlikely that netif_tx_lock protection is necessary
1851 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1852 	   counters.)
1853 	   However, it is possible that they rely on the protection
1854 	   we provide here.
1855 
1856 	   Check this and take the lock. It is not prone to deadlocks.
1857 	   Either take the noqueue qdisc path, it is even simpler 8)
1858 	 */
1859 	if (dev->flags & IFF_UP) {
1860 		int cpu = smp_processor_id(); /* ok because BHs are off */
1861 
1862 		if (txq->xmit_lock_owner != cpu) {
1863 
1864 			HARD_TX_LOCK(dev, txq, cpu);
1865 
1866 			if (!netif_tx_queue_stopped(txq)) {
1867 				rc = 0;
1868 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1869 					HARD_TX_UNLOCK(dev, txq);
1870 					goto out;
1871 				}
1872 			}
1873 			HARD_TX_UNLOCK(dev, txq);
1874 			if (net_ratelimit())
1875 				printk(KERN_CRIT "Virtual device %s asks to "
1876 				       "queue packet!\n", dev->name);
1877 		} else {
1878 			/* Recursion is detected! It is possible,
1879 			 * unfortunately */
1880 			if (net_ratelimit())
1881 				printk(KERN_CRIT "Dead loop on virtual device "
1882 				       "%s, fix it urgently!\n", dev->name);
1883 		}
1884 	}
1885 
1886 	rc = -ENETDOWN;
1887 	rcu_read_unlock_bh();
1888 
1889 out_kfree_skb:
1890 	kfree_skb(skb);
1891 	return rc;
1892 out:
1893 	rcu_read_unlock_bh();
1894 	return rc;
1895 }
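
/*
 * Illustrative sketch (not from the original file): a minimal caller of
 * dev_queue_xmit().  example_send_frame() and its payload handling are
 * hypothetical, and building the link-layer header is omitted; the point
 * is that the skb must be fully set up (dev, protocol, data) and that it
 * is consumed on success and failure alike, so the caller must not touch
 * it afterwards.
 */
#if 0	/* usage sketch, not compiled */
static int example_send_frame(struct net_device *dev,
			      const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), data, len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* assuming an IP payload */

	/* Consumes the skb; returns a negative errno or a NET_XMIT_* code. */
	return dev_queue_xmit(skb);
}
#endif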
1896 
1897 
1898 /*=======================================================================
1899 			Receiver routines
1900   =======================================================================*/
1901 
1902 int netdev_max_backlog __read_mostly = 1000;
1903 int netdev_budget __read_mostly = 300;
1904 int weight_p __read_mostly = 64;            /* old backlog weight */
1905 
1906 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1907 
1908 
1909 /**
1910  *	netif_rx	-	post buffer to the network code
1911  *	@skb: buffer to post
1912  *
1913  *	This function receives a packet from a device driver and queues it for
1914  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1915  *	may be dropped during processing for congestion control or by the
1916  *	protocol layers.
1917  *
1918  *	return values:
1919  *	NET_RX_SUCCESS	(no congestion)
1920  *	NET_RX_DROP     (packet was dropped)
1921  *
1922  */
1923 
1924 int netif_rx(struct sk_buff *skb)
1925 {
1926 	struct softnet_data *queue;
1927 	unsigned long flags;
1928 
1929 	/* if netpoll wants it, pretend we never saw it */
1930 	if (netpoll_rx(skb))
1931 		return NET_RX_DROP;
1932 
1933 	if (!skb->tstamp.tv64)
1934 		net_timestamp(skb);
1935 
1936 	/*
1937 	 * The code is arranged so that the path is shortest when
1938 	 * the CPU is congested but still operating.
1939 	 */
1940 	local_irq_save(flags);
1941 	queue = &__get_cpu_var(softnet_data);
1942 
1943 	__get_cpu_var(netdev_rx_stat).total++;
1944 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1945 		if (queue->input_pkt_queue.qlen) {
1946 enqueue:
1947 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1948 			local_irq_restore(flags);
1949 			return NET_RX_SUCCESS;
1950 		}
1951 
1952 		napi_schedule(&queue->backlog);
1953 		goto enqueue;
1954 	}
1955 
1956 	__get_cpu_var(netdev_rx_stat).dropped++;
1957 	local_irq_restore(flags);
1958 
1959 	kfree_skb(skb);
1960 	return NET_RX_DROP;
1961 }
1962 
1963 int netif_rx_ni(struct sk_buff *skb)
1964 {
1965 	int err;
1966 
1967 	preempt_disable();
1968 	err = netif_rx(skb);
1969 	if (local_softirq_pending())
1970 		do_softirq();
1971 	preempt_enable();
1972 
1973 	return err;
1974 }
1975 
1976 EXPORT_SYMBOL(netif_rx_ni);
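
/*
 * Illustrative sketch (not from the original file): how a non-NAPI driver
 * typically posts a received frame from its interrupt handler.
 * example_rx_one() and the copied-in frame are hypothetical; the essential
 * steps are wrapping the data in an skb, setting skb->protocol via
 * eth_type_trans() and calling netif_rx().  Process-context callers use
 * netif_rx_ni() instead so the raised softirq gets a chance to run.
 */
#if 0	/* usage sketch, not compiled */
static void example_rx_one(struct net_device *dev,
			   const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	skb_reserve(skb, NET_IP_ALIGN);
	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);	/* "always succeeds"; the stack may still drop it */
}
#endif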
1977 
1978 static void net_tx_action(struct softirq_action *h)
1979 {
1980 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1981 
1982 	if (sd->completion_queue) {
1983 		struct sk_buff *clist;
1984 
1985 		local_irq_disable();
1986 		clist = sd->completion_queue;
1987 		sd->completion_queue = NULL;
1988 		local_irq_enable();
1989 
1990 		while (clist) {
1991 			struct sk_buff *skb = clist;
1992 			clist = clist->next;
1993 
1994 			WARN_ON(atomic_read(&skb->users));
1995 			__kfree_skb(skb);
1996 		}
1997 	}
1998 
1999 	if (sd->output_queue) {
2000 		struct Qdisc *head;
2001 
2002 		local_irq_disable();
2003 		head = sd->output_queue;
2004 		sd->output_queue = NULL;
2005 		local_irq_enable();
2006 
2007 		while (head) {
2008 			struct Qdisc *q = head;
2009 			spinlock_t *root_lock;
2010 
2011 			head = head->next_sched;
2012 
2013 			root_lock = qdisc_lock(q);
2014 			if (spin_trylock(root_lock)) {
2015 				smp_mb__before_clear_bit();
2016 				clear_bit(__QDISC_STATE_SCHED,
2017 					  &q->state);
2018 				qdisc_run(q);
2019 				spin_unlock(root_lock);
2020 			} else {
2021 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2022 					      &q->state)) {
2023 					__netif_reschedule(q);
2024 				} else {
2025 					smp_mb__before_clear_bit();
2026 					clear_bit(__QDISC_STATE_SCHED,
2027 						  &q->state);
2028 				}
2029 			}
2030 		}
2031 	}
2032 }
2033 
2034 static inline int deliver_skb(struct sk_buff *skb,
2035 			      struct packet_type *pt_prev,
2036 			      struct net_device *orig_dev)
2037 {
2038 	atomic_inc(&skb->users);
2039 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2040 }
2041 
2042 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2043 /* These hooks are defined here for ATM */
2044 struct net_bridge;
2045 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2046 						unsigned char *addr);
2047 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2048 
2049 /*
2050  * If the bridge module is loaded, call the bridging hook.
2051  * Returns NULL if the packet was consumed.
2052  */
2053 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2054 					struct sk_buff *skb) __read_mostly;
2055 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2056 					    struct packet_type **pt_prev, int *ret,
2057 					    struct net_device *orig_dev)
2058 {
2059 	struct net_bridge_port *port;
2060 
2061 	if (skb->pkt_type == PACKET_LOOPBACK ||
2062 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2063 		return skb;
2064 
2065 	if (*pt_prev) {
2066 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2067 		*pt_prev = NULL;
2068 	}
2069 
2070 	return br_handle_frame_hook(port, skb);
2071 }
2072 #else
2073 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2074 #endif
2075 
2076 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2077 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2078 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2079 
2080 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2081 					     struct packet_type **pt_prev,
2082 					     int *ret,
2083 					     struct net_device *orig_dev)
2084 {
2085 	if (skb->dev->macvlan_port == NULL)
2086 		return skb;
2087 
2088 	if (*pt_prev) {
2089 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2090 		*pt_prev = NULL;
2091 	}
2092 	return macvlan_handle_frame_hook(skb);
2093 }
2094 #else
2095 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2096 #endif
2097 
2098 #ifdef CONFIG_NET_CLS_ACT
2099 /* TODO: Maybe we should just force sch_ingress to be compiled in
2100  * whenever CONFIG_NET_CLS_ACT is?  Otherwise we currently pay for a few
2101  * useless instructions (a compare and two extra stores) when the ingress
2102  * scheduler is not configured but CONFIG_NET_CLS_ACT is.
2103  * NOTE: This doesn't remove any functionality; if you don't have
2104  * the ingress scheduler, you just can't add policies on ingress.
2105  *
2106  */
2107 static int ing_filter(struct sk_buff *skb)
2108 {
2109 	struct net_device *dev = skb->dev;
2110 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2111 	struct netdev_queue *rxq;
2112 	int result = TC_ACT_OK;
2113 	struct Qdisc *q;
2114 
2115 	if (MAX_RED_LOOP < ttl++) {
2116 		printk(KERN_WARNING
2117 		       "Redir loop detected Dropping packet (%d->%d)\n",
2118 		       skb->iif, dev->ifindex);
2119 		return TC_ACT_SHOT;
2120 	}
2121 
2122 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2123 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2124 
2125 	rxq = &dev->rx_queue;
2126 
2127 	q = rxq->qdisc;
2128 	if (q != &noop_qdisc) {
2129 		spin_lock(qdisc_lock(q));
2130 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2131 			result = qdisc_enqueue_root(skb, q);
2132 		spin_unlock(qdisc_lock(q));
2133 	}
2134 
2135 	return result;
2136 }
2137 
2138 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2139 					 struct packet_type **pt_prev,
2140 					 int *ret, struct net_device *orig_dev)
2141 {
2142 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2143 		goto out;
2144 
2145 	if (*pt_prev) {
2146 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2147 		*pt_prev = NULL;
2148 	} else {
2149 		/* Huh? Why does turning on AF_PACKET affect this? */
2150 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2151 	}
2152 
2153 	switch (ing_filter(skb)) {
2154 	case TC_ACT_SHOT:
2155 	case TC_ACT_STOLEN:
2156 		kfree_skb(skb);
2157 		return NULL;
2158 	}
2159 
2160 out:
2161 	skb->tc_verd = 0;
2162 	return skb;
2163 }
2164 #endif
2165 
2166 /*
2167  * 	netif_nit_deliver - deliver received packets to network taps
2168  * 	@skb: buffer
2169  *
2170  * 	This function is used to deliver incoming packets to network
2171  * 	taps. It should be used when the normal netif_receive_skb path
2172  * 	is bypassed, for example because of VLAN acceleration.
2173  */
2174 void netif_nit_deliver(struct sk_buff *skb)
2175 {
2176 	struct packet_type *ptype;
2177 
2178 	if (list_empty(&ptype_all))
2179 		return;
2180 
2181 	skb_reset_network_header(skb);
2182 	skb_reset_transport_header(skb);
2183 	skb->mac_len = skb->network_header - skb->mac_header;
2184 
2185 	rcu_read_lock();
2186 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2187 		if (!ptype->dev || ptype->dev == skb->dev)
2188 			deliver_skb(skb, ptype, skb->dev);
2189 	}
2190 	rcu_read_unlock();
2191 }
2192 
2193 /**
2194  *	netif_receive_skb - process receive buffer from network
2195  *	@skb: buffer to process
2196  *
2197  *	netif_receive_skb() is the main receive data processing function.
2198  *	It always succeeds. The buffer may be dropped during processing
2199  *	for congestion control or by the protocol layers.
2200  *
2201  *	This function may only be called from softirq context and interrupts
2202  *	should be enabled.
2203  *
2204  *	Return values (usually ignored):
2205  *	NET_RX_SUCCESS: no congestion
2206  *	NET_RX_DROP: packet was dropped
2207  */
2208 int netif_receive_skb(struct sk_buff *skb)
2209 {
2210 	struct packet_type *ptype, *pt_prev;
2211 	struct net_device *orig_dev;
2212 	struct net_device *null_or_orig;
2213 	int ret = NET_RX_DROP;
2214 	__be16 type;
2215 
2216 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2217 		return NET_RX_SUCCESS;
2218 
2219 	/* if we've gotten here through NAPI, check netpoll */
2220 	if (netpoll_receive_skb(skb))
2221 		return NET_RX_DROP;
2222 
2223 	if (!skb->tstamp.tv64)
2224 		net_timestamp(skb);
2225 
2226 	if (!skb->iif)
2227 		skb->iif = skb->dev->ifindex;
2228 
2229 	null_or_orig = NULL;
2230 	orig_dev = skb->dev;
2231 	if (orig_dev->master) {
2232 		if (skb_bond_should_drop(skb))
2233 			null_or_orig = orig_dev; /* deliver only exact match */
2234 		else
2235 			skb->dev = orig_dev->master;
2236 	}
2237 
2238 	__get_cpu_var(netdev_rx_stat).total++;
2239 
2240 	skb_reset_network_header(skb);
2241 	skb_reset_transport_header(skb);
2242 	skb->mac_len = skb->network_header - skb->mac_header;
2243 
2244 	pt_prev = NULL;
2245 
2246 	rcu_read_lock();
2247 
2248 	/* Don't receive packets in an exiting network namespace */
2249 	if (!net_alive(dev_net(skb->dev))) {
2250 		kfree_skb(skb);
2251 		goto out;
2252 	}
2253 
2254 #ifdef CONFIG_NET_CLS_ACT
2255 	if (skb->tc_verd & TC_NCLS) {
2256 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2257 		goto ncls;
2258 	}
2259 #endif
2260 
2261 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2262 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2263 		    ptype->dev == orig_dev) {
2264 			if (pt_prev)
2265 				ret = deliver_skb(skb, pt_prev, orig_dev);
2266 			pt_prev = ptype;
2267 		}
2268 	}
2269 
2270 #ifdef CONFIG_NET_CLS_ACT
2271 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2272 	if (!skb)
2273 		goto out;
2274 ncls:
2275 #endif
2276 
2277 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2278 	if (!skb)
2279 		goto out;
2280 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2281 	if (!skb)
2282 		goto out;
2283 
2284 	type = skb->protocol;
2285 	list_for_each_entry_rcu(ptype,
2286 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2287 		if (ptype->type == type &&
2288 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2289 		     ptype->dev == orig_dev)) {
2290 			if (pt_prev)
2291 				ret = deliver_skb(skb, pt_prev, orig_dev);
2292 			pt_prev = ptype;
2293 		}
2294 	}
2295 
2296 	if (pt_prev) {
2297 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2298 	} else {
2299 		kfree_skb(skb);
2300 		/* Jamal, now you will not be able to escape explaining
2301 		 * to me how you were going to use this. :-)
2302 		 */
2303 		ret = NET_RX_DROP;
2304 	}
2305 
2306 out:
2307 	rcu_read_unlock();
2308 	return ret;
2309 }
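
/*
 * Illustrative sketch (not from the original file): the ptype delivery
 * loops above hand frames to handlers registered with dev_add_pack().
 * The EtherType 0x88b5 (IEEE 802 local experimental) and the handler
 * below are hypothetical.
 */
#if 0	/* usage sketch, not compiled */
static int example_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* skb->data points at the network header here; we own the skb. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pkt_type = {
	.type = __constant_htons(0x88b5),
	.func = example_pkt_rcv,
	/* .dev left NULL: receive from every device */
};

/* dev_add_pack(&example_pkt_type) at init,
 * dev_remove_pack(&example_pkt_type) at exit. */
#endif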
2310 
2311 /* Network device is going away, flush any packets still pending  */
2312 static void flush_backlog(void *arg)
2313 {
2314 	struct net_device *dev = arg;
2315 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2316 	struct sk_buff *skb, *tmp;
2317 
2318 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2319 		if (skb->dev == dev) {
2320 			__skb_unlink(skb, &queue->input_pkt_queue);
2321 			kfree_skb(skb);
2322 		}
2323 }
2324 
2325 static int napi_gro_complete(struct sk_buff *skb)
2326 {
2327 	struct packet_type *ptype;
2328 	__be16 type = skb->protocol;
2329 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2330 	int err = -ENOENT;
2331 
2332 	if (NAPI_GRO_CB(skb)->count == 1)
2333 		goto out;
2334 
2335 	rcu_read_lock();
2336 	list_for_each_entry_rcu(ptype, head, list) {
2337 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2338 			continue;
2339 
2340 		err = ptype->gro_complete(skb);
2341 		break;
2342 	}
2343 	rcu_read_unlock();
2344 
2345 	if (err) {
2346 		WARN_ON(&ptype->list == head);
2347 		kfree_skb(skb);
2348 		return NET_RX_SUCCESS;
2349 	}
2350 
2351 out:
2352 	skb_shinfo(skb)->gso_size = 0;
2353 	__skb_push(skb, -skb_network_offset(skb));
2354 	return netif_receive_skb(skb);
2355 }
2356 
2357 void napi_gro_flush(struct napi_struct *napi)
2358 {
2359 	struct sk_buff *skb, *next;
2360 
2361 	for (skb = napi->gro_list; skb; skb = next) {
2362 		next = skb->next;
2363 		skb->next = NULL;
2364 		napi_gro_complete(skb);
2365 	}
2366 
2367 	napi->gro_list = NULL;
2368 }
2369 EXPORT_SYMBOL(napi_gro_flush);
2370 
2371 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2372 {
2373 	struct sk_buff **pp = NULL;
2374 	struct packet_type *ptype;
2375 	__be16 type = skb->protocol;
2376 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2377 	int count = 0;
2378 	int same_flow;
2379 	int mac_len;
2380 	int free;
2381 
2382 	if (!(skb->dev->features & NETIF_F_GRO))
2383 		goto normal;
2384 
2385 	rcu_read_lock();
2386 	list_for_each_entry_rcu(ptype, head, list) {
2387 		struct sk_buff *p;
2388 
2389 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2390 			continue;
2391 
2392 		skb_reset_network_header(skb);
2393 		mac_len = skb->network_header - skb->mac_header;
2394 		skb->mac_len = mac_len;
2395 		NAPI_GRO_CB(skb)->same_flow = 0;
2396 		NAPI_GRO_CB(skb)->flush = 0;
2397 		NAPI_GRO_CB(skb)->free = 0;
2398 
2399 		for (p = napi->gro_list; p; p = p->next) {
2400 			count++;
2401 
2402 			if (!NAPI_GRO_CB(p)->same_flow)
2403 				continue;
2404 
2405 			if (p->mac_len != mac_len ||
2406 			    memcmp(skb_mac_header(p), skb_mac_header(skb),
2407 				   mac_len))
2408 				NAPI_GRO_CB(p)->same_flow = 0;
2409 		}
2410 
2411 		pp = ptype->gro_receive(&napi->gro_list, skb);
2412 		break;
2413 	}
2414 	rcu_read_unlock();
2415 
2416 	if (&ptype->list == head)
2417 		goto normal;
2418 
2419 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2420 	free = NAPI_GRO_CB(skb)->free;
2421 
2422 	if (pp) {
2423 		struct sk_buff *nskb = *pp;
2424 
2425 		*pp = nskb->next;
2426 		nskb->next = NULL;
2427 		napi_gro_complete(nskb);
2428 		count--;
2429 	}
2430 
2431 	if (same_flow)
2432 		goto ok;
2433 
2434 	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
2435 		__skb_push(skb, -skb_network_offset(skb));
2436 		goto normal;
2437 	}
2438 
2439 	NAPI_GRO_CB(skb)->count = 1;
2440 	skb_shinfo(skb)->gso_size = skb->len;
2441 	skb->next = napi->gro_list;
2442 	napi->gro_list = skb;
2443 
2444 ok:
2445 	return free;
2446 
2447 normal:
2448 	return -1;
2449 }
2450 EXPORT_SYMBOL(dev_gro_receive);
2451 
2452 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2453 {
2454 	struct sk_buff *p;
2455 
2456 	for (p = napi->gro_list; p; p = p->next) {
2457 		NAPI_GRO_CB(p)->same_flow = 1;
2458 		NAPI_GRO_CB(p)->flush = 0;
2459 	}
2460 
2461 	return dev_gro_receive(napi, skb);
2462 }
2463 
2464 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2465 {
2466 	switch (__napi_gro_receive(napi, skb)) {
2467 	case -1:
2468 		return netif_receive_skb(skb);
2469 
2470 	case 1:
2471 		kfree_skb(skb);
2472 		break;
2473 	}
2474 
2475 	return NET_RX_SUCCESS;
2476 }
2477 EXPORT_SYMBOL(napi_gro_receive);
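
/*
 * Illustrative sketch (not from the original file): a GRO-aware driver
 * substitutes napi_gro_receive() for netif_receive_skb() in its poll
 * routine and advertises NETIF_F_GRO; the merging itself happens here in
 * dev_gro_receive().  example_rx_frame() and the adapter layout are
 * hypothetical.
 */
#if 0	/* usage sketch, not compiled */
static int example_poll_gro(struct napi_struct *napi, int budget)
{
	struct example_adapter *adapter =
		container_of(napi, struct example_adapter, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = example_rx_frame(adapter);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, adapter->netdev);
		napi_gro_receive(napi, skb);
		work_done++;
	}

	if (work_done < budget)
		napi_complete(napi);
	return work_done;
}
#endif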
2478 
2479 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2480 {
2481 	skb_shinfo(skb)->nr_frags = 0;
2482 
2483 	skb->len -= skb->data_len;
2484 	skb->truesize -= skb->data_len;
2485 	skb->data_len = 0;
2486 
2487 	__skb_pull(skb, skb_headlen(skb));
2488 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2489 
2490 	napi->skb = skb;
2491 }
2492 EXPORT_SYMBOL(napi_reuse_skb);
2493 
2494 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2495 				  struct napi_gro_fraginfo *info)
2496 {
2497 	struct net_device *dev = napi->dev;
2498 	struct sk_buff *skb = napi->skb;
2499 
2500 	napi->skb = NULL;
2501 
2502 	if (!skb) {
2503 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2504 		if (!skb)
2505 			goto out;
2506 
2507 		skb_reserve(skb, NET_IP_ALIGN);
2508 	}
2509 
2510 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2511 	skb_shinfo(skb)->nr_frags = info->nr_frags;
2512 	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2513 
2514 	skb->data_len = info->len;
2515 	skb->len += info->len;
2516 	skb->truesize += info->len;
2517 
2518 	if (!pskb_may_pull(skb, ETH_HLEN)) {
2519 		napi_reuse_skb(napi, skb);
2520 		goto out;
2521 	}
2522 
2523 	skb->protocol = eth_type_trans(skb, dev);
2524 
2525 	skb->ip_summed = info->ip_summed;
2526 	skb->csum = info->csum;
2527 
2528 out:
2529 	return skb;
2530 }
2531 EXPORT_SYMBOL(napi_fraginfo_skb);
2532 
2533 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2534 {
2535 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2536 	int err = NET_RX_DROP;
2537 
2538 	if (!skb)
2539 		goto out;
2540 
2541 	err = NET_RX_SUCCESS;
2542 
2543 	switch (__napi_gro_receive(napi, skb)) {
2544 	case -1:
2545 		return netif_receive_skb(skb);
2546 
2547 	case 0:
2548 		goto out;
2549 	}
2550 
2551 	napi_reuse_skb(napi, skb);
2552 
2553 out:
2554 	return err;
2555 }
2556 EXPORT_SYMBOL(napi_gro_frags);
2557 
2558 static int process_backlog(struct napi_struct *napi, int quota)
2559 {
2560 	int work = 0;
2561 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2562 	unsigned long start_time = jiffies;
2563 
2564 	napi->weight = weight_p;
2565 	do {
2566 		struct sk_buff *skb;
2567 
2568 		local_irq_disable();
2569 		skb = __skb_dequeue(&queue->input_pkt_queue);
2570 		if (!skb) {
2571 			__napi_complete(napi);
2572 			local_irq_enable();
2573 			break;
2574 		}
2575 		local_irq_enable();
2576 
2577 		napi_gro_receive(napi, skb);
2578 	} while (++work < quota && jiffies == start_time);
2579 
2580 	napi_gro_flush(napi);
2581 
2582 	return work;
2583 }
2584 
2585 /**
2586  * __napi_schedule - schedule for receive
2587  * @n: entry to schedule
2588  *
2589  * The entry's receive function will be scheduled to run
2590  */
2591 void __napi_schedule(struct napi_struct *n)
2592 {
2593 	unsigned long flags;
2594 
2595 	local_irq_save(flags);
2596 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2597 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2598 	local_irq_restore(flags);
2599 }
2600 EXPORT_SYMBOL(__napi_schedule);
2601 
2602 void __napi_complete(struct napi_struct *n)
2603 {
2604 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2605 	BUG_ON(n->gro_list);
2606 
2607 	list_del(&n->poll_list);
2608 	smp_mb__before_clear_bit();
2609 	clear_bit(NAPI_STATE_SCHED, &n->state);
2610 }
2611 EXPORT_SYMBOL(__napi_complete);
2612 
2613 void napi_complete(struct napi_struct *n)
2614 {
2615 	unsigned long flags;
2616 
2617 	/*
2618 	 * Don't let NAPI dequeue from the CPU poll list,
2619 	 * just in case it's running on a different CPU.
2620 	 */
2621 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2622 		return;
2623 
2624 	napi_gro_flush(n);
2625 	local_irq_save(flags);
2626 	__napi_complete(n);
2627 	local_irq_restore(flags);
2628 }
2629 EXPORT_SYMBOL(napi_complete);
2630 
2631 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2632 		    int (*poll)(struct napi_struct *, int), int weight)
2633 {
2634 	INIT_LIST_HEAD(&napi->poll_list);
2635 	napi->gro_list = NULL;
2636 	napi->skb = NULL;
2637 	napi->poll = poll;
2638 	napi->weight = weight;
2639 	list_add(&napi->dev_list, &dev->napi_list);
2640 	napi->dev = dev;
2641 #ifdef CONFIG_NETPOLL
2642 	spin_lock_init(&napi->poll_lock);
2643 	napi->poll_owner = -1;
2644 #endif
2645 	set_bit(NAPI_STATE_SCHED, &napi->state);
2646 }
2647 EXPORT_SYMBOL(netif_napi_add);
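
/*
 * Illustrative sketch (not from the original file): the usual NAPI wiring
 * in a driver.  example_adapter, example_clean_rx() and the IRQ helpers
 * are hypothetical; netif_napi_add(), napi_schedule() and napi_complete()
 * are the real API implemented in this file.
 */
#if 0	/* usage sketch, not compiled */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_adapter *adapter =
		container_of(napi, struct example_adapter, napi);
	int work_done = example_clean_rx(adapter, budget);

	if (work_done < budget) {
		napi_complete(napi);
		example_enable_rx_irq(adapter);
	}
	return work_done;
}

static irqreturn_t example_intr(int irq, void *dev_id)
{
	struct example_adapter *adapter = dev_id;

	example_disable_rx_irq(adapter);
	napi_schedule(&adapter->napi);
	return IRQ_HANDLED;
}

/* at probe time: netif_napi_add(netdev, &adapter->napi, example_poll, 64); */
#endif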
2648 
2649 void netif_napi_del(struct napi_struct *napi)
2650 {
2651 	struct sk_buff *skb, *next;
2652 
2653 	list_del_init(&napi->dev_list);
2654 	kfree(napi->skb);
2655 
2656 	for (skb = napi->gro_list; skb; skb = next) {
2657 		next = skb->next;
2658 		skb->next = NULL;
2659 		kfree_skb(skb);
2660 	}
2661 
2662 	napi->gro_list = NULL;
2663 }
2664 EXPORT_SYMBOL(netif_napi_del);
2665 
2666 
2667 static void net_rx_action(struct softirq_action *h)
2668 {
2669 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2670 	unsigned long time_limit = jiffies + 2;
2671 	int budget = netdev_budget;
2672 	void *have;
2673 
2674 	local_irq_disable();
2675 
2676 	while (!list_empty(list)) {
2677 		struct napi_struct *n;
2678 		int work, weight;
2679 
2680 		/* If the softirq window is exhausted then punt.
2681 		 * Allowing this to run for 2 jiffies gives an average
2682 		 * latency of 1.5/HZ.
2683 		 */
2684 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2685 			goto softnet_break;
2686 
2687 		local_irq_enable();
2688 
2689 		/* Even though interrupts have been re-enabled, this
2690 		 * access is safe because interrupts can only add new
2691 		 * entries to the tail of this list, and only ->poll()
2692 		 * calls can remove this head entry from the list.
2693 		 */
2694 		n = list_entry(list->next, struct napi_struct, poll_list);
2695 
2696 		have = netpoll_poll_lock(n);
2697 
2698 		weight = n->weight;
2699 
2700 		/* This NAPI_STATE_SCHED test is for avoiding a race
2701 		 * with netpoll's poll_napi().  Only the entity which
2702 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2703 		 * actually make the ->poll() call.  Therefore we avoid
2704 		 * accidentally calling ->poll() when NAPI is not scheduled.
2705 		 */
2706 		work = 0;
2707 		if (test_bit(NAPI_STATE_SCHED, &n->state))
2708 			work = n->poll(n, weight);
2709 
2710 		WARN_ON_ONCE(work > weight);
2711 
2712 		budget -= work;
2713 
2714 		local_irq_disable();
2715 
2716 		/* Drivers must not modify the NAPI state if they
2717 		 * consume the entire weight.  In such cases this code
2718 		 * still "owns" the NAPI instance and therefore can
2719 		 * move the instance around on the list at-will.
2720 		 */
2721 		if (unlikely(work == weight)) {
2722 			if (unlikely(napi_disable_pending(n)))
2723 				__napi_complete(n);
2724 			else
2725 				list_move_tail(&n->poll_list, list);
2726 		}
2727 
2728 		netpoll_poll_unlock(have);
2729 	}
2730 out:
2731 	local_irq_enable();
2732 
2733 #ifdef CONFIG_NET_DMA
2734 	/*
2735 	 * There may not be any more sk_buffs coming right now, so push
2736 	 * any pending DMA copies to hardware
2737 	 */
2738 	dma_issue_pending_all();
2739 #endif
2740 
2741 	return;
2742 
2743 softnet_break:
2744 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2745 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2746 	goto out;
2747 }
2748 
2749 static gifconf_func_t * gifconf_list [NPROTO];
2750 
2751 /**
2752  *	register_gifconf	-	register a SIOCGIF handler
2753  *	@family: Address family
2754  *	@gifconf: Function handler
2755  *
2756  *	Register protocol dependent address dumping routines. The handler
2757  *	that is passed must not be freed or reused until it has been replaced
2758  *	by another handler.
2759  */
2760 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2761 {
2762 	if (family >= NPROTO)
2763 		return -EINVAL;
2764 	gifconf_list[family] = gifconf;
2765 	return 0;
2766 }
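
/*
 * Illustrative sketch (not from the original file): an address family
 * registers its SIOCGIFCONF helper once at init time (AF_INET does this
 * with inet_gifconf()).  When called with a NULL buffer the handler only
 * reports how much space it would need; PF_EXAMPLE and the stub body
 * below are hypothetical.
 */
#if 0	/* usage sketch, not compiled */
static int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	if (!buf)
		return 0;	/* size-probing pass: bytes we would write */
	return 0;		/* normal pass: bytes actually written */
}

static int __init example_proto_init(void)
{
	return register_gifconf(PF_EXAMPLE, example_gifconf);
}
#endif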
2767 
2768 
2769 /*
2770  *	Map an interface index to its name (SIOCGIFNAME)
2771  */
2772 
2773 /*
2774  *	We need this ioctl for efficient implementation of the
2775  *	if_indextoname() function required by the IPv6 API.  Without
2776  *	it, we would have to search all the interfaces to find a
2777  *	match.  --pb
2778  */
2779 
2780 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2781 {
2782 	struct net_device *dev;
2783 	struct ifreq ifr;
2784 
2785 	/*
2786 	 *	Fetch the caller's info block.
2787 	 */
2788 
2789 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2790 		return -EFAULT;
2791 
2792 	read_lock(&dev_base_lock);
2793 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2794 	if (!dev) {
2795 		read_unlock(&dev_base_lock);
2796 		return -ENODEV;
2797 	}
2798 
2799 	strcpy(ifr.ifr_name, dev->name);
2800 	read_unlock(&dev_base_lock);
2801 
2802 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2803 		return -EFAULT;
2804 	return 0;
2805 }
2806 
2807 /*
2808  *	Perform a SIOCGIFCONF call. This structure will change
2809  *	size eventually, and there is nothing I can do about it.
2810  *	Thus we will need a 'compatibility mode'.
2811  */
2812 
2813 static int dev_ifconf(struct net *net, char __user *arg)
2814 {
2815 	struct ifconf ifc;
2816 	struct net_device *dev;
2817 	char __user *pos;
2818 	int len;
2819 	int total;
2820 	int i;
2821 
2822 	/*
2823 	 *	Fetch the caller's info block.
2824 	 */
2825 
2826 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2827 		return -EFAULT;
2828 
2829 	pos = ifc.ifc_buf;
2830 	len = ifc.ifc_len;
2831 
2832 	/*
2833 	 *	Loop over the interfaces, and write an info block for each.
2834 	 */
2835 
2836 	total = 0;
2837 	for_each_netdev(net, dev) {
2838 		for (i = 0; i < NPROTO; i++) {
2839 			if (gifconf_list[i]) {
2840 				int done;
2841 				if (!pos)
2842 					done = gifconf_list[i](dev, NULL, 0);
2843 				else
2844 					done = gifconf_list[i](dev, pos + total,
2845 							       len - total);
2846 				if (done < 0)
2847 					return -EFAULT;
2848 				total += done;
2849 			}
2850 		}
2851 	}
2852 
2853 	/*
2854 	 *	All done.  Write the updated control block back to the caller.
2855 	 */
2856 	ifc.ifc_len = total;
2857 
2858 	/*
2859 	 * 	Both BSD and Solaris return 0 here, so we do too.
2860 	 */
2861 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2862 }
2863 
2864 #ifdef CONFIG_PROC_FS
2865 /*
2866  *	This is invoked by the /proc filesystem handler to display a device
2867  *	in detail.
2868  */
2869 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2870 	__acquires(dev_base_lock)
2871 {
2872 	struct net *net = seq_file_net(seq);
2873 	loff_t off;
2874 	struct net_device *dev;
2875 
2876 	read_lock(&dev_base_lock);
2877 	if (!*pos)
2878 		return SEQ_START_TOKEN;
2879 
2880 	off = 1;
2881 	for_each_netdev(net, dev)
2882 		if (off++ == *pos)
2883 			return dev;
2884 
2885 	return NULL;
2886 }
2887 
2888 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2889 {
2890 	struct net *net = seq_file_net(seq);
2891 	++*pos;
2892 	return v == SEQ_START_TOKEN ?
2893 		first_net_device(net) : next_net_device((struct net_device *)v);
2894 }
2895 
2896 void dev_seq_stop(struct seq_file *seq, void *v)
2897 	__releases(dev_base_lock)
2898 {
2899 	read_unlock(&dev_base_lock);
2900 }
2901 
2902 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2903 {
2904 	const struct net_device_stats *stats = dev_get_stats(dev);
2905 
2906 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2907 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2908 		   dev->name, stats->rx_bytes, stats->rx_packets,
2909 		   stats->rx_errors,
2910 		   stats->rx_dropped + stats->rx_missed_errors,
2911 		   stats->rx_fifo_errors,
2912 		   stats->rx_length_errors + stats->rx_over_errors +
2913 		    stats->rx_crc_errors + stats->rx_frame_errors,
2914 		   stats->rx_compressed, stats->multicast,
2915 		   stats->tx_bytes, stats->tx_packets,
2916 		   stats->tx_errors, stats->tx_dropped,
2917 		   stats->tx_fifo_errors, stats->collisions,
2918 		   stats->tx_carrier_errors +
2919 		    stats->tx_aborted_errors +
2920 		    stats->tx_window_errors +
2921 		    stats->tx_heartbeat_errors,
2922 		   stats->tx_compressed);
2923 }
2924 
2925 /*
2926  *	Called from the PROCfs module. This now uses the new arbitrary sized
2927  *	/proc/net interface to create /proc/net/dev
2928  */
2929 static int dev_seq_show(struct seq_file *seq, void *v)
2930 {
2931 	if (v == SEQ_START_TOKEN)
2932 		seq_puts(seq, "Inter-|   Receive                            "
2933 			      "                    |  Transmit\n"
2934 			      " face |bytes    packets errs drop fifo frame "
2935 			      "compressed multicast|bytes    packets errs "
2936 			      "drop fifo colls carrier compressed\n");
2937 	else
2938 		dev_seq_printf_stats(seq, v);
2939 	return 0;
2940 }
2941 
2942 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2943 {
2944 	struct netif_rx_stats *rc = NULL;
2945 
2946 	while (*pos < nr_cpu_ids)
2947 		if (cpu_online(*pos)) {
2948 			rc = &per_cpu(netdev_rx_stat, *pos);
2949 			break;
2950 		} else
2951 			++*pos;
2952 	return rc;
2953 }
2954 
2955 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2956 {
2957 	return softnet_get_online(pos);
2958 }
2959 
2960 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2961 {
2962 	++*pos;
2963 	return softnet_get_online(pos);
2964 }
2965 
2966 static void softnet_seq_stop(struct seq_file *seq, void *v)
2967 {
2968 }
2969 
2970 static int softnet_seq_show(struct seq_file *seq, void *v)
2971 {
2972 	struct netif_rx_stats *s = v;
2973 
2974 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2975 		   s->total, s->dropped, s->time_squeeze, 0,
2976 		   0, 0, 0, 0, /* was fastroute */
2977 		   s->cpu_collision );
2978 	return 0;
2979 }
2980 
2981 static const struct seq_operations dev_seq_ops = {
2982 	.start = dev_seq_start,
2983 	.next  = dev_seq_next,
2984 	.stop  = dev_seq_stop,
2985 	.show  = dev_seq_show,
2986 };
2987 
2988 static int dev_seq_open(struct inode *inode, struct file *file)
2989 {
2990 	return seq_open_net(inode, file, &dev_seq_ops,
2991 			    sizeof(struct seq_net_private));
2992 }
2993 
2994 static const struct file_operations dev_seq_fops = {
2995 	.owner	 = THIS_MODULE,
2996 	.open    = dev_seq_open,
2997 	.read    = seq_read,
2998 	.llseek  = seq_lseek,
2999 	.release = seq_release_net,
3000 };
3001 
3002 static const struct seq_operations softnet_seq_ops = {
3003 	.start = softnet_seq_start,
3004 	.next  = softnet_seq_next,
3005 	.stop  = softnet_seq_stop,
3006 	.show  = softnet_seq_show,
3007 };
3008 
3009 static int softnet_seq_open(struct inode *inode, struct file *file)
3010 {
3011 	return seq_open(file, &softnet_seq_ops);
3012 }
3013 
3014 static const struct file_operations softnet_seq_fops = {
3015 	.owner	 = THIS_MODULE,
3016 	.open    = softnet_seq_open,
3017 	.read    = seq_read,
3018 	.llseek  = seq_lseek,
3019 	.release = seq_release,
3020 };
3021 
3022 static void *ptype_get_idx(loff_t pos)
3023 {
3024 	struct packet_type *pt = NULL;
3025 	loff_t i = 0;
3026 	int t;
3027 
3028 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3029 		if (i == pos)
3030 			return pt;
3031 		++i;
3032 	}
3033 
3034 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3035 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3036 			if (i == pos)
3037 				return pt;
3038 			++i;
3039 		}
3040 	}
3041 	return NULL;
3042 }
3043 
3044 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3045 	__acquires(RCU)
3046 {
3047 	rcu_read_lock();
3048 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3049 }
3050 
3051 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3052 {
3053 	struct packet_type *pt;
3054 	struct list_head *nxt;
3055 	int hash;
3056 
3057 	++*pos;
3058 	if (v == SEQ_START_TOKEN)
3059 		return ptype_get_idx(0);
3060 
3061 	pt = v;
3062 	nxt = pt->list.next;
3063 	if (pt->type == htons(ETH_P_ALL)) {
3064 		if (nxt != &ptype_all)
3065 			goto found;
3066 		hash = 0;
3067 		nxt = ptype_base[0].next;
3068 	} else
3069 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3070 
3071 	while (nxt == &ptype_base[hash]) {
3072 		if (++hash >= PTYPE_HASH_SIZE)
3073 			return NULL;
3074 		nxt = ptype_base[hash].next;
3075 	}
3076 found:
3077 	return list_entry(nxt, struct packet_type, list);
3078 }
3079 
3080 static void ptype_seq_stop(struct seq_file *seq, void *v)
3081 	__releases(RCU)
3082 {
3083 	rcu_read_unlock();
3084 }
3085 
3086 static int ptype_seq_show(struct seq_file *seq, void *v)
3087 {
3088 	struct packet_type *pt = v;
3089 
3090 	if (v == SEQ_START_TOKEN)
3091 		seq_puts(seq, "Type Device      Function\n");
3092 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3093 		if (pt->type == htons(ETH_P_ALL))
3094 			seq_puts(seq, "ALL ");
3095 		else
3096 			seq_printf(seq, "%04x", ntohs(pt->type));
3097 
3098 		seq_printf(seq, " %-8s %pF\n",
3099 			   pt->dev ? pt->dev->name : "", pt->func);
3100 	}
3101 
3102 	return 0;
3103 }
3104 
3105 static const struct seq_operations ptype_seq_ops = {
3106 	.start = ptype_seq_start,
3107 	.next  = ptype_seq_next,
3108 	.stop  = ptype_seq_stop,
3109 	.show  = ptype_seq_show,
3110 };
3111 
3112 static int ptype_seq_open(struct inode *inode, struct file *file)
3113 {
3114 	return seq_open_net(inode, file, &ptype_seq_ops,
3115 			sizeof(struct seq_net_private));
3116 }
3117 
3118 static const struct file_operations ptype_seq_fops = {
3119 	.owner	 = THIS_MODULE,
3120 	.open    = ptype_seq_open,
3121 	.read    = seq_read,
3122 	.llseek  = seq_lseek,
3123 	.release = seq_release_net,
3124 };
3125 
3126 
3127 static int __net_init dev_proc_net_init(struct net *net)
3128 {
3129 	int rc = -ENOMEM;
3130 
3131 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3132 		goto out;
3133 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3134 		goto out_dev;
3135 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3136 		goto out_softnet;
3137 
3138 	if (wext_proc_init(net))
3139 		goto out_ptype;
3140 	rc = 0;
3141 out:
3142 	return rc;
3143 out_ptype:
3144 	proc_net_remove(net, "ptype");
3145 out_softnet:
3146 	proc_net_remove(net, "softnet_stat");
3147 out_dev:
3148 	proc_net_remove(net, "dev");
3149 	goto out;
3150 }
3151 
3152 static void __net_exit dev_proc_net_exit(struct net *net)
3153 {
3154 	wext_proc_exit(net);
3155 
3156 	proc_net_remove(net, "ptype");
3157 	proc_net_remove(net, "softnet_stat");
3158 	proc_net_remove(net, "dev");
3159 }
3160 
3161 static struct pernet_operations __net_initdata dev_proc_ops = {
3162 	.init = dev_proc_net_init,
3163 	.exit = dev_proc_net_exit,
3164 };
3165 
3166 static int __init dev_proc_init(void)
3167 {
3168 	return register_pernet_subsys(&dev_proc_ops);
3169 }
3170 #else
3171 #define dev_proc_init() 0
3172 #endif	/* CONFIG_PROC_FS */
3173 
3174 
3175 /**
3176  *	netdev_set_master	-	set up master/slave pair
3177  *	@slave: slave device
3178  *	@master: new master device
3179  *
3180  *	Changes the master device of the slave. Pass %NULL to break the
3181  *	bonding. The caller must hold the RTNL semaphore. On a failure
3182  *	a negative errno code is returned. On success the reference counts
3183  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3184  *	function returns zero.
3185  */
3186 int netdev_set_master(struct net_device *slave, struct net_device *master)
3187 {
3188 	struct net_device *old = slave->master;
3189 
3190 	ASSERT_RTNL();
3191 
3192 	if (master) {
3193 		if (old)
3194 			return -EBUSY;
3195 		dev_hold(master);
3196 	}
3197 
3198 	slave->master = master;
3199 
3200 	synchronize_net();
3201 
3202 	if (old)
3203 		dev_put(old);
3204 
3205 	if (master)
3206 		slave->flags |= IFF_SLAVE;
3207 	else
3208 		slave->flags &= ~IFF_SLAVE;
3209 
3210 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3211 	return 0;
3212 }
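
/*
 * Illustrative sketch (not from the original file): how a bonding-style
 * driver would pair and unpair devices with netdev_set_master().  The
 * function names are hypothetical; the RTNL requirement is real.
 */
#if 0	/* usage sketch, not compiled */
static int example_enslave(struct net_device *bond_dev,
			   struct net_device *slave_dev)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave_dev, bond_dev);
	rtnl_unlock();
	return err;
}

static void example_release(struct net_device *slave_dev)
{
	rtnl_lock();
	netdev_set_master(slave_dev, NULL);	/* break the pairing */
	rtnl_unlock();
}
#endif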
3213 
3214 static void dev_change_rx_flags(struct net_device *dev, int flags)
3215 {
3216 	const struct net_device_ops *ops = dev->netdev_ops;
3217 
3218 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3219 		ops->ndo_change_rx_flags(dev, flags);
3220 }
3221 
3222 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3223 {
3224 	unsigned short old_flags = dev->flags;
3225 	uid_t uid;
3226 	gid_t gid;
3227 
3228 	ASSERT_RTNL();
3229 
3230 	dev->flags |= IFF_PROMISC;
3231 	dev->promiscuity += inc;
3232 	if (dev->promiscuity == 0) {
3233 		/*
3234 		 * Avoid overflow.  If inc causes an overflow, leave the
3235 		 * promiscuity count untouched and return an error.
3236 		 */
3237 		if (inc < 0)
3238 			dev->flags &= ~IFF_PROMISC;
3239 		else {
3240 			dev->promiscuity -= inc;
3241 			printk(KERN_WARNING "%s: promiscuity touches roof, "
3242 				"set promiscuity failed, promiscuity feature "
3243 				"of device might be broken.\n", dev->name);
3244 			return -EOVERFLOW;
3245 		}
3246 	}
3247 	if (dev->flags != old_flags) {
3248 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3249 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3250 							       "left");
3251 		if (audit_enabled) {
3252 			current_uid_gid(&uid, &gid);
3253 			audit_log(current->audit_context, GFP_ATOMIC,
3254 				AUDIT_ANOM_PROMISCUOUS,
3255 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3256 				dev->name, (dev->flags & IFF_PROMISC),
3257 				(old_flags & IFF_PROMISC),
3258 				audit_get_loginuid(current),
3259 				uid, gid,
3260 				audit_get_sessionid(current));
3261 		}
3262 
3263 		dev_change_rx_flags(dev, IFF_PROMISC);
3264 	}
3265 	return 0;
3266 }
3267 
3268 /**
3269  *	dev_set_promiscuity	- update promiscuity count on a device
3270  *	@dev: device
3271  *	@inc: modifier
3272  *
3273  *	Add or remove promiscuity from a device. While the count in the device
3274  *	remains above zero the interface remains promiscuous. Once it hits zero
3275  *	the device reverts back to normal filtering operation. A negative inc
3276  *	value is used to drop promiscuity on the device.
3277  *	Return 0 if successful or a negative errno code on error.
3278  */
3279 int dev_set_promiscuity(struct net_device *dev, int inc)
3280 {
3281 	unsigned short old_flags = dev->flags;
3282 	int err;
3283 
3284 	err = __dev_set_promiscuity(dev, inc);
3285 	if (err < 0)
3286 		return err;
3287 	if (dev->flags != old_flags)
3288 		dev_set_rx_mode(dev);
3289 	return err;
3290 }
3291 
3292 /**
3293  *	dev_set_allmulti	- update allmulti count on a device
3294  *	@dev: device
3295  *	@inc: modifier
3296  *
3297  *	Add or remove reception of all multicast frames on a device. While the
3298  *	count in the device remains above zero the interface keeps listening
3299  *	to all multicast frames. Once it hits zero the device reverts back to
3300  *	normal filtering operation. A negative @inc value is used to drop the counter
3301  *	when releasing a resource needing all multicasts.
3302  *	Return 0 if successful or a negative errno code on error.
3303  */
3304 
3305 int dev_set_allmulti(struct net_device *dev, int inc)
3306 {
3307 	unsigned short old_flags = dev->flags;
3308 
3309 	ASSERT_RTNL();
3310 
3311 	dev->flags |= IFF_ALLMULTI;
3312 	dev->allmulti += inc;
3313 	if (dev->allmulti == 0) {
3314 		/*
3315 		 * Avoid overflow.  If inc causes an overflow, leave the
3316 		 * allmulti count untouched and return an error.
3317 		 */
3318 		if (inc < 0)
3319 			dev->flags &= ~IFF_ALLMULTI;
3320 		else {
3321 			dev->allmulti -= inc;
3322 			printk(KERN_WARNING "%s: allmulti touches roof, "
3323 				"set allmulti failed, allmulti feature of "
3324 				"device might be broken.\n", dev->name);
3325 			return -EOVERFLOW;
3326 		}
3327 	}
3328 	if (dev->flags ^ old_flags) {
3329 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3330 		dev_set_rx_mode(dev);
3331 	}
3332 	return 0;
3333 }
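
/*
 * Illustrative sketch (not from the original file): promiscuity and
 * allmulti are reference counts, so every +1 must be paired with a later
 * -1 instead of writing the flag bits directly.  example_sniff_start()
 * and example_sniff_stop() are hypothetical names.
 */
#if 0	/* usage sketch, not compiled */
static int example_sniff_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	if (!err)
		err = dev_set_allmulti(dev, 1);
	rtnl_unlock();
	return err;
}

static void example_sniff_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_allmulti(dev, -1);
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}
#endif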
3334 
3335 /*
3336  *	Upload unicast and multicast address lists to device and
3337  *	configure RX filtering. When the device doesn't support unicast
3338  *	filtering it is put in promiscuous mode while unicast addresses
3339  *	are present.
3340  */
3341 void __dev_set_rx_mode(struct net_device *dev)
3342 {
3343 	const struct net_device_ops *ops = dev->netdev_ops;
3344 
3345 	/* dev_open will call this function so the list will stay sane. */
3346 	if (!(dev->flags&IFF_UP))
3347 		return;
3348 
3349 	if (!netif_device_present(dev))
3350 		return;
3351 
3352 	if (ops->ndo_set_rx_mode)
3353 		ops->ndo_set_rx_mode(dev);
3354 	else {
3355 		/* Unicast address changes may only happen under the rtnl,
3356 		 * therefore calling __dev_set_promiscuity here is safe.
3357 		 */
3358 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3359 			__dev_set_promiscuity(dev, 1);
3360 			dev->uc_promisc = 1;
3361 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3362 			__dev_set_promiscuity(dev, -1);
3363 			dev->uc_promisc = 0;
3364 		}
3365 
3366 		if (ops->ndo_set_multicast_list)
3367 			ops->ndo_set_multicast_list(dev);
3368 	}
3369 }
3370 
3371 void dev_set_rx_mode(struct net_device *dev)
3372 {
3373 	netif_addr_lock_bh(dev);
3374 	__dev_set_rx_mode(dev);
3375 	netif_addr_unlock_bh(dev);
3376 }
3377 
3378 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3379 		      void *addr, int alen, int glbl)
3380 {
3381 	struct dev_addr_list *da;
3382 
3383 	for (; (da = *list) != NULL; list = &da->next) {
3384 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3385 		    alen == da->da_addrlen) {
3386 			if (glbl) {
3387 				int old_glbl = da->da_gusers;
3388 				da->da_gusers = 0;
3389 				if (old_glbl == 0)
3390 					break;
3391 			}
3392 			if (--da->da_users)
3393 				return 0;
3394 
3395 			*list = da->next;
3396 			kfree(da);
3397 			(*count)--;
3398 			return 0;
3399 		}
3400 	}
3401 	return -ENOENT;
3402 }
3403 
3404 int __dev_addr_add(struct dev_addr_list **list, int *count,
3405 		   void *addr, int alen, int glbl)
3406 {
3407 	struct dev_addr_list *da;
3408 
3409 	for (da = *list; da != NULL; da = da->next) {
3410 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3411 		    da->da_addrlen == alen) {
3412 			if (glbl) {
3413 				int old_glbl = da->da_gusers;
3414 				da->da_gusers = 1;
3415 				if (old_glbl)
3416 					return 0;
3417 			}
3418 			da->da_users++;
3419 			return 0;
3420 		}
3421 	}
3422 
3423 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3424 	if (da == NULL)
3425 		return -ENOMEM;
3426 	memcpy(da->da_addr, addr, alen);
3427 	da->da_addrlen = alen;
3428 	da->da_users = 1;
3429 	da->da_gusers = glbl ? 1 : 0;
3430 	da->next = *list;
3431 	*list = da;
3432 	(*count)++;
3433 	return 0;
3434 }
3435 
3436 /**
3437  *	dev_unicast_delete	- Release secondary unicast address.
3438  *	@dev: device
3439  *	@addr: address to delete
3440  *	@alen: length of @addr
3441  *
3442  *	Release reference to a secondary unicast address and remove it
3443  *	from the device if the reference count drops to zero.
3444  *
3445  * 	The caller must hold the rtnl_mutex.
3446  */
3447 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3448 {
3449 	int err;
3450 
3451 	ASSERT_RTNL();
3452 
3453 	netif_addr_lock_bh(dev);
3454 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3455 	if (!err)
3456 		__dev_set_rx_mode(dev);
3457 	netif_addr_unlock_bh(dev);
3458 	return err;
3459 }
3460 EXPORT_SYMBOL(dev_unicast_delete);
3461 
3462 /**
3463  *	dev_unicast_add		- add a secondary unicast address
3464  *	@dev: device
3465  *	@addr: address to add
3466  *	@alen: length of @addr
3467  *
3468  *	Add a secondary unicast address to the device or increase
3469  *	the reference count if it already exists.
3470  *
3471  *	The caller must hold the rtnl_mutex.
3472  */
3473 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3474 {
3475 	int err;
3476 
3477 	ASSERT_RTNL();
3478 
3479 	netif_addr_lock_bh(dev);
3480 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3481 	if (!err)
3482 		__dev_set_rx_mode(dev);
3483 	netif_addr_unlock_bh(dev);
3484 	return err;
3485 }
3486 EXPORT_SYMBOL(dev_unicast_add);
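
/*
 * Illustrative sketch (not from the original file): a driver that must
 * also receive on an extra MAC address takes a reference on it with
 * dev_unicast_add() and drops it again later; the address below is an
 * arbitrary locally administered example.  Both calls need the rtnl_mutex.
 */
#if 0	/* usage sketch, not compiled */
static const unsigned char example_extra_mac[ETH_ALEN] = {
	0x02, 0x00, 0x00, 0x00, 0x00, 0x01
};

static int example_listen_on_extra_mac(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, (void *)example_extra_mac, ETH_ALEN);
	rtnl_unlock();
	return err;
}

static void example_stop_listening(struct net_device *dev)
{
	rtnl_lock();
	dev_unicast_delete(dev, (void *)example_extra_mac, ETH_ALEN);
	rtnl_unlock();
}
#endif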
3487 
3488 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3489 		    struct dev_addr_list **from, int *from_count)
3490 {
3491 	struct dev_addr_list *da, *next;
3492 	int err = 0;
3493 
3494 	da = *from;
3495 	while (da != NULL) {
3496 		next = da->next;
3497 		if (!da->da_synced) {
3498 			err = __dev_addr_add(to, to_count,
3499 					     da->da_addr, da->da_addrlen, 0);
3500 			if (err < 0)
3501 				break;
3502 			da->da_synced = 1;
3503 			da->da_users++;
3504 		} else if (da->da_users == 1) {
3505 			__dev_addr_delete(to, to_count,
3506 					  da->da_addr, da->da_addrlen, 0);
3507 			__dev_addr_delete(from, from_count,
3508 					  da->da_addr, da->da_addrlen, 0);
3509 		}
3510 		da = next;
3511 	}
3512 	return err;
3513 }
3514 
3515 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3516 		       struct dev_addr_list **from, int *from_count)
3517 {
3518 	struct dev_addr_list *da, *next;
3519 
3520 	da = *from;
3521 	while (da != NULL) {
3522 		next = da->next;
3523 		if (da->da_synced) {
3524 			__dev_addr_delete(to, to_count,
3525 					  da->da_addr, da->da_addrlen, 0);
3526 			da->da_synced = 0;
3527 			__dev_addr_delete(from, from_count,
3528 					  da->da_addr, da->da_addrlen, 0);
3529 		}
3530 		da = next;
3531 	}
3532 }
3533 
3534 /**
3535  *	dev_unicast_sync - Synchronize device's unicast list to another device
3536  *	@to: destination device
3537  *	@from: source device
3538  *
3539  *	Add newly added addresses to the destination device and release
3540  *	addresses that have no users left. The source device must be
3541  *	locked by netif_tx_lock_bh.
3542  *
3543  *	This function is intended to be called from the dev->set_rx_mode
3544  *	function of layered software devices.
3545  */
3546 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3547 {
3548 	int err = 0;
3549 
3550 	netif_addr_lock_bh(to);
3551 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3552 			      &from->uc_list, &from->uc_count);
3553 	if (!err)
3554 		__dev_set_rx_mode(to);
3555 	netif_addr_unlock_bh(to);
3556 	return err;
3557 }
3558 EXPORT_SYMBOL(dev_unicast_sync);
3559 
3560 /**
3561  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3562  *	@to: destination device
3563  *	@from: source device
3564  *
3565  *	Remove all addresses that were added to the destination device by
3566  *	dev_unicast_sync(). This function is intended to be called from the
3567  *	dev->stop function of layered software devices.
3568  */
3569 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3570 {
3571 	netif_addr_lock_bh(from);
3572 	netif_addr_lock(to);
3573 
3574 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3575 			  &from->uc_list, &from->uc_count);
3576 	__dev_set_rx_mode(to);
3577 
3578 	netif_addr_unlock(to);
3579 	netif_addr_unlock_bh(from);
3580 }
3581 EXPORT_SYMBOL(dev_unicast_unsync);
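
/*
 * Illustrative sketch (not from the original file): a stacked device
 * (VLAN, macvlan and the like) keeps its lower device's unicast filter
 * in step by syncing from its set_rx_mode callback and unsyncing on stop.
 * example_priv() and the upper/lower naming are hypothetical.
 */
#if 0	/* usage sketch, not compiled */
static void example_set_rx_mode(struct net_device *upper)
{
	struct net_device *lower = example_priv(upper)->lowerdev;

	dev_unicast_sync(lower, upper);
}

static int example_stop(struct net_device *upper)
{
	struct net_device *lower = example_priv(upper)->lowerdev;

	dev_unicast_unsync(lower, upper);
	return 0;
}
#endif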
3582 
3583 static void __dev_addr_discard(struct dev_addr_list **list)
3584 {
3585 	struct dev_addr_list *tmp;
3586 
3587 	while (*list != NULL) {
3588 		tmp = *list;
3589 		*list = tmp->next;
3590 		if (tmp->da_users > tmp->da_gusers)
3591 			printk("__dev_addr_discard: address leakage! "
3592 			       "da_users=%d\n", tmp->da_users);
3593 		kfree(tmp);
3594 	}
3595 }
3596 
3597 static void dev_addr_discard(struct net_device *dev)
3598 {
3599 	netif_addr_lock_bh(dev);
3600 
3601 	__dev_addr_discard(&dev->uc_list);
3602 	dev->uc_count = 0;
3603 
3604 	__dev_addr_discard(&dev->mc_list);
3605 	dev->mc_count = 0;
3606 
3607 	netif_addr_unlock_bh(dev);
3608 }
3609 
3610 /**
3611  *	dev_get_flags - get flags reported to userspace
3612  *	@dev: device
3613  *
3614  *	Get the combination of flag bits exported through APIs to userspace.
3615  */
3616 unsigned dev_get_flags(const struct net_device *dev)
3617 {
3618 	unsigned flags;
3619 
3620 	flags = (dev->flags & ~(IFF_PROMISC |
3621 				IFF_ALLMULTI |
3622 				IFF_RUNNING |
3623 				IFF_LOWER_UP |
3624 				IFF_DORMANT)) |
3625 		(dev->gflags & (IFF_PROMISC |
3626 				IFF_ALLMULTI));
3627 
3628 	if (netif_running(dev)) {
3629 		if (netif_oper_up(dev))
3630 			flags |= IFF_RUNNING;
3631 		if (netif_carrier_ok(dev))
3632 			flags |= IFF_LOWER_UP;
3633 		if (netif_dormant(dev))
3634 			flags |= IFF_DORMANT;
3635 	}
3636 
3637 	return flags;
3638 }
3639 
3640 /**
3641  *	dev_change_flags - change device settings
3642  *	@dev: device
3643  *	@flags: device state flags
3644  *
3645  *	Change settings on device based state flags. The flags are
3646  *	in the userspace exported format.
3647  */
3648 int dev_change_flags(struct net_device *dev, unsigned flags)
3649 {
3650 	int ret, changes;
3651 	int old_flags = dev->flags;
3652 
3653 	ASSERT_RTNL();
3654 
3655 	/*
3656 	 *	Set the flags on our device.
3657 	 */
3658 
3659 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3660 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3661 			       IFF_AUTOMEDIA)) |
3662 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3663 				    IFF_ALLMULTI));
3664 
3665 	/*
3666 	 *	Load in the correct multicast list now the flags have changed.
3667 	 */
3668 
3669 	if ((old_flags ^ flags) & IFF_MULTICAST)
3670 		dev_change_rx_flags(dev, IFF_MULTICAST);
3671 
3672 	dev_set_rx_mode(dev);
3673 
3674 	/*
3675 	 *	Have we downed the interface?  We handle IFF_UP ourselves
3676 	 *	according to user attempts to set it, rather than blindly
3677 	 *	setting it.
3678 	 */
3679 
3680 	ret = 0;
3681 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3682 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3683 
3684 		if (!ret)
3685 			dev_set_rx_mode(dev);
3686 	}
3687 
3688 	if (dev->flags & IFF_UP &&
3689 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3690 					  IFF_VOLATILE)))
3691 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3692 
3693 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3694 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3695 		dev->gflags ^= IFF_PROMISC;
3696 		dev_set_promiscuity(dev, inc);
3697 	}
3698 
3699 	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3700 	   is important. Some (broken) drivers set IFF_PROMISC when
3701 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
3702 	 */
3703 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3704 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3705 		dev->gflags ^= IFF_ALLMULTI;
3706 		dev_set_allmulti(dev, inc);
3707 	}
3708 
3709 	/* Exclude state transition flags, already notified */
3710 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3711 	if (changes)
3712 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3713 
3714 	return ret;
3715 }
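
/*
 * Illustrative sketch (not from the original file): bringing an interface
 * up from kernel code by editing its userspace-visible flags, which is
 * what the SIOCSIFFLAGS handler further below ends up doing.  The lookup
 * against init_net and the device name are hypothetical.
 */
#if 0	/* usage sketch, not compiled */
static int example_bring_up(const char *name)
{
	struct net_device *dev = dev_get_by_name(&init_net, name);
	int err = -ENODEV;

	if (dev) {
		rtnl_lock();
		err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
		rtnl_unlock();
		dev_put(dev);
	}
	return err;
}
#endif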
3716 
3717 /**
3718  *	dev_set_mtu - Change maximum transfer unit
3719  *	@dev: device
3720  *	@new_mtu: new transfer unit
3721  *
3722  *	Change the maximum transfer size of the network device.
3723  */
3724 int dev_set_mtu(struct net_device *dev, int new_mtu)
3725 {
3726 	const struct net_device_ops *ops = dev->netdev_ops;
3727 	int err;
3728 
3729 	if (new_mtu == dev->mtu)
3730 		return 0;
3731 
3732 	/*	MTU must be positive.	 */
3733 	if (new_mtu < 0)
3734 		return -EINVAL;
3735 
3736 	if (!netif_device_present(dev))
3737 		return -ENODEV;
3738 
3739 	err = 0;
3740 	if (ops->ndo_change_mtu)
3741 		err = ops->ndo_change_mtu(dev, new_mtu);
3742 	else
3743 		dev->mtu = new_mtu;
3744 
3745 	if (!err && dev->flags & IFF_UP)
3746 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3747 	return err;
3748 }
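
/*
 * Illustrative sketch (not from the original file): callers such as the
 * SIOCSIFMTU handler below invoke dev_set_mtu() under RTNL; 9000 is just
 * an arbitrary jumbo-frame example.
 */
#if 0	/* usage sketch, not compiled */
static int example_set_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);	/* the driver may still reject it */
	rtnl_unlock();
	return err;
}
#endif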
3749 
3750 /**
3751  *	dev_set_mac_address - Change Media Access Control Address
3752  *	@dev: device
3753  *	@sa: new address
3754  *
3755  *	Change the hardware (MAC) address of the device
3756  */
3757 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3758 {
3759 	const struct net_device_ops *ops = dev->netdev_ops;
3760 	int err;
3761 
3762 	if (!ops->ndo_set_mac_address)
3763 		return -EOPNOTSUPP;
3764 	if (sa->sa_family != dev->type)
3765 		return -EINVAL;
3766 	if (!netif_device_present(dev))
3767 		return -ENODEV;
3768 	err = ops->ndo_set_mac_address(dev, sa);
3769 	if (!err)
3770 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3771 	return err;
3772 }
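
/*
 * Illustrative sketch (not from the original file): changing a MAC address
 * programmatically mirrors the SIOCSIFHWADDR path below.  The address
 * bytes are an arbitrary locally administered example; sa_family must
 * match dev->type (ARPHRD_ETHER for Ethernet devices).
 */
#if 0	/* usage sketch, not compiled */
static int example_set_mac(struct net_device *dev)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, "\x02\x00\x00\x00\x00\x02", ETH_ALEN);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif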
3773 
3774 /*
3775  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3776  */
3777 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3778 {
3779 	int err;
3780 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3781 
3782 	if (!dev)
3783 		return -ENODEV;
3784 
3785 	switch (cmd) {
3786 		case SIOCGIFFLAGS:	/* Get interface flags */
3787 			ifr->ifr_flags = dev_get_flags(dev);
3788 			return 0;
3789 
3790 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3791 					   (currently unused) */
3792 			ifr->ifr_metric = 0;
3793 			return 0;
3794 
3795 		case SIOCGIFMTU:	/* Get the MTU of a device */
3796 			ifr->ifr_mtu = dev->mtu;
3797 			return 0;
3798 
3799 		case SIOCGIFHWADDR:
3800 			if (!dev->addr_len)
3801 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3802 			else
3803 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3804 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3805 			ifr->ifr_hwaddr.sa_family = dev->type;
3806 			return 0;
3807 
3808 		case SIOCGIFSLAVE:
3809 			err = -EINVAL;
3810 			break;
3811 
3812 		case SIOCGIFMAP:
3813 			ifr->ifr_map.mem_start = dev->mem_start;
3814 			ifr->ifr_map.mem_end   = dev->mem_end;
3815 			ifr->ifr_map.base_addr = dev->base_addr;
3816 			ifr->ifr_map.irq       = dev->irq;
3817 			ifr->ifr_map.dma       = dev->dma;
3818 			ifr->ifr_map.port      = dev->if_port;
3819 			return 0;
3820 
3821 		case SIOCGIFINDEX:
3822 			ifr->ifr_ifindex = dev->ifindex;
3823 			return 0;
3824 
3825 		case SIOCGIFTXQLEN:
3826 			ifr->ifr_qlen = dev->tx_queue_len;
3827 			return 0;
3828 
3829 		default:
3830 			/* dev_ioctl() should ensure this case
3831 			 * is never reached
3832 			 */
3833 			WARN_ON(1);
3834 			err = -EINVAL;
3835 			break;
3836 
3837 	}
3838 	return err;
3839 }
3840 
3841 /*
3842  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3843  */
3844 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3845 {
3846 	int err;
3847 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3848 	const struct net_device_ops *ops;
3849 
3850 	if (!dev)
3851 		return -ENODEV;
3852 
3853 	ops = dev->netdev_ops;
3854 
3855 	switch (cmd) {
3856 		case SIOCSIFFLAGS:	/* Set interface flags */
3857 			return dev_change_flags(dev, ifr->ifr_flags);
3858 
3859 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3860 					   (currently unused) */
3861 			return -EOPNOTSUPP;
3862 
3863 		case SIOCSIFMTU:	/* Set the MTU of a device */
3864 			return dev_set_mtu(dev, ifr->ifr_mtu);
3865 
3866 		case SIOCSIFHWADDR:
3867 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3868 
3869 		case SIOCSIFHWBROADCAST:
3870 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3871 				return -EINVAL;
3872 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3873 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3874 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3875 			return 0;
3876 
3877 		case SIOCSIFMAP:
3878 			if (ops->ndo_set_config) {
3879 				if (!netif_device_present(dev))
3880 					return -ENODEV;
3881 				return ops->ndo_set_config(dev, &ifr->ifr_map);
3882 			}
3883 			return -EOPNOTSUPP;
3884 
3885 		case SIOCADDMULTI:
3886 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3887 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3888 				return -EINVAL;
3889 			if (!netif_device_present(dev))
3890 				return -ENODEV;
3891 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3892 					  dev->addr_len, 1);
3893 
3894 		case SIOCDELMULTI:
3895 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3896 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3897 				return -EINVAL;
3898 			if (!netif_device_present(dev))
3899 				return -ENODEV;
3900 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3901 					     dev->addr_len, 1);
3902 
3903 		case SIOCSIFTXQLEN:
3904 			if (ifr->ifr_qlen < 0)
3905 				return -EINVAL;
3906 			dev->tx_queue_len = ifr->ifr_qlen;
3907 			return 0;
3908 
3909 		case SIOCSIFNAME:
3910 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3911 			return dev_change_name(dev, ifr->ifr_newname);
3912 
3913 		/*
3914 		 *	Unknown or private ioctl
3915 		 */
3916 
3917 		default:
3918 			if ((cmd >= SIOCDEVPRIVATE &&
3919 			    cmd <= SIOCDEVPRIVATE + 15) ||
3920 			    cmd == SIOCBONDENSLAVE ||
3921 			    cmd == SIOCBONDRELEASE ||
3922 			    cmd == SIOCBONDSETHWADDR ||
3923 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3924 			    cmd == SIOCBONDINFOQUERY ||
3925 			    cmd == SIOCBONDCHANGEACTIVE ||
3926 			    cmd == SIOCGMIIPHY ||
3927 			    cmd == SIOCGMIIREG ||
3928 			    cmd == SIOCSMIIREG ||
3929 			    cmd == SIOCBRADDIF ||
3930 			    cmd == SIOCBRDELIF ||
3931 			    cmd == SIOCWANDEV) {
3932 				err = -EOPNOTSUPP;
3933 				if (ops->ndo_do_ioctl) {
3934 					if (netif_device_present(dev))
3935 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3936 					else
3937 						err = -ENODEV;
3938 				}
3939 			} else
3940 				err = -EINVAL;
3941 
3942 	}
3943 	return err;
3944 }
3945 
3946 /*
3947  *	This function handles all "interface"-type I/O control requests. The actual
3948  *	'doing' part of this is dev_ifsioc above.
3949  */
3950 
3951 /**
3952  *	dev_ioctl	-	network device ioctl
3953  *	@net: the applicable net namespace
3954  *	@cmd: command to issue
3955  *	@arg: pointer to a struct ifreq in user space
3956  *
3957  *	Issue ioctl functions to devices. This is normally called by the
3958  *	user space syscall interfaces but can sometimes be useful for
3959  *	other purposes. The return value is the return from the syscall if
3960  *	positive or a negative errno code on error.
3961  */
3962 
3963 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3964 {
3965 	struct ifreq ifr;
3966 	int ret;
3967 	char *colon;
3968 
3969 	/* One special case: SIOCGIFCONF takes an ifconf argument
3970 	   and requires a shared lock, because it sleeps writing
3971 	   to user space.
3972 	 */
3973 
3974 	if (cmd == SIOCGIFCONF) {
3975 		rtnl_lock();
3976 		ret = dev_ifconf(net, (char __user *) arg);
3977 		rtnl_unlock();
3978 		return ret;
3979 	}
3980 	if (cmd == SIOCGIFNAME)
3981 		return dev_ifname(net, (struct ifreq __user *)arg);
3982 
3983 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3984 		return -EFAULT;
3985 
3986 	ifr.ifr_name[IFNAMSIZ-1] = 0;
3987 
3988 	colon = strchr(ifr.ifr_name, ':');
3989 	if (colon)
3990 		*colon = 0;
3991 
3992 	/*
3993 	 *	See which interface the caller is talking about.
3994 	 */
3995 
3996 	switch (cmd) {
3997 		/*
3998 		 *	These ioctl calls:
3999 		 *	- can be done by all.
4000 		 *	- atomic and do not require locking.
4001 		 *	- return a value
4002 		 */
4003 		case SIOCGIFFLAGS:
4004 		case SIOCGIFMETRIC:
4005 		case SIOCGIFMTU:
4006 		case SIOCGIFHWADDR:
4007 		case SIOCGIFSLAVE:
4008 		case SIOCGIFMAP:
4009 		case SIOCGIFINDEX:
4010 		case SIOCGIFTXQLEN:
4011 			dev_load(net, ifr.ifr_name);
4012 			read_lock(&dev_base_lock);
4013 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4014 			read_unlock(&dev_base_lock);
4015 			if (!ret) {
4016 				if (colon)
4017 					*colon = ':';
4018 				if (copy_to_user(arg, &ifr,
4019 						 sizeof(struct ifreq)))
4020 					ret = -EFAULT;
4021 			}
4022 			return ret;
4023 
4024 		case SIOCETHTOOL:
4025 			dev_load(net, ifr.ifr_name);
4026 			rtnl_lock();
4027 			ret = dev_ethtool(net, &ifr);
4028 			rtnl_unlock();
4029 			if (!ret) {
4030 				if (colon)
4031 					*colon = ':';
4032 				if (copy_to_user(arg, &ifr,
4033 						 sizeof(struct ifreq)))
4034 					ret = -EFAULT;
4035 			}
4036 			return ret;
4037 
4038 		/*
4039 		 *	These ioctl calls:
4040 		 *	- require superuser power.
4041 		 *	- require strict serialization.
4042 		 *	- return a value
4043 		 */
4044 		case SIOCGMIIPHY:
4045 		case SIOCGMIIREG:
4046 		case SIOCSIFNAME:
4047 			if (!capable(CAP_NET_ADMIN))
4048 				return -EPERM;
4049 			dev_load(net, ifr.ifr_name);
4050 			rtnl_lock();
4051 			ret = dev_ifsioc(net, &ifr, cmd);
4052 			rtnl_unlock();
4053 			if (!ret) {
4054 				if (colon)
4055 					*colon = ':';
4056 				if (copy_to_user(arg, &ifr,
4057 						 sizeof(struct ifreq)))
4058 					ret = -EFAULT;
4059 			}
4060 			return ret;
4061 
4062 		/*
4063 		 *	These ioctl calls:
4064 		 *	- require superuser power.
4065 		 *	- require strict serialization.
4066 		 *	- do not return a value
4067 		 */
4068 		case SIOCSIFFLAGS:
4069 		case SIOCSIFMETRIC:
4070 		case SIOCSIFMTU:
4071 		case SIOCSIFMAP:
4072 		case SIOCSIFHWADDR:
4073 		case SIOCSIFSLAVE:
4074 		case SIOCADDMULTI:
4075 		case SIOCDELMULTI:
4076 		case SIOCSIFHWBROADCAST:
4077 		case SIOCSIFTXQLEN:
4078 		case SIOCSMIIREG:
4079 		case SIOCBONDENSLAVE:
4080 		case SIOCBONDRELEASE:
4081 		case SIOCBONDSETHWADDR:
4082 		case SIOCBONDCHANGEACTIVE:
4083 		case SIOCBRADDIF:
4084 		case SIOCBRDELIF:
4085 			if (!capable(CAP_NET_ADMIN))
4086 				return -EPERM;
4087 			/* fall through */
4088 		case SIOCBONDSLAVEINFOQUERY:
4089 		case SIOCBONDINFOQUERY:
4090 			dev_load(net, ifr.ifr_name);
4091 			rtnl_lock();
4092 			ret = dev_ifsioc(net, &ifr, cmd);
4093 			rtnl_unlock();
4094 			return ret;
4095 
4096 		case SIOCGIFMEM:
4097 			/* Get the per device memory space. We can add this but
4098 			 * currently do not support it */
4099 		case SIOCSIFMEM:
4100 			/* Set the per device memory buffer space.
4101 			 * Not applicable in our case */
4102 		case SIOCSIFLINK:
4103 			return -EINVAL;
4104 
4105 		/*
4106 		 *	Unknown or private ioctl.
4107 		 */
4108 		default:
4109 			if (cmd == SIOCWANDEV ||
4110 			    (cmd >= SIOCDEVPRIVATE &&
4111 			     cmd <= SIOCDEVPRIVATE + 15)) {
4112 				dev_load(net, ifr.ifr_name);
4113 				rtnl_lock();
4114 				ret = dev_ifsioc(net, &ifr, cmd);
4115 				rtnl_unlock();
4116 				if (!ret && copy_to_user(arg, &ifr,
4117 							 sizeof(struct ifreq)))
4118 					ret = -EFAULT;
4119 				return ret;
4120 			}
4121 			/* Take care of Wireless Extensions */
4122 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4123 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4124 			return -EINVAL;
4125 	}
4126 }
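
/*
 * Illustrative sketch (not part of the original file): the privileged
 * counterpart from user space.  Setting the MTU requires CAP_NET_ADMIN and
 * is routed by dev_ioctl() to dev_ifsioc() under rtnl_lock():
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	ifr.ifr_mtu = 9000;
 *	if (ioctl(fd, SIOCSIFMTU, &ifr) < 0)
 *		perror("SIOCSIFMTU");
 */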
4127 
4128 
4129 /**
4130  *	dev_new_index	-	allocate an ifindex
4131  *	@net: the applicable net namespace
4132  *
4133  *	Returns a suitable unique value for a new device interface
4134  *	number.  The caller must hold the rtnl semaphore or the
4135  *	dev_base_lock to be sure it remains unique.
4136  */
4137 static int dev_new_index(struct net *net)
4138 {
4139 	static int ifindex;
4140 	for (;;) {
4141 		if (++ifindex <= 0)
4142 			ifindex = 1;
4143 		if (!__dev_get_by_index(net, ifindex))
4144 			return ifindex;
4145 	}
4146 }
4147 
4148 /* Delayed registration/unregistration */
4149 static LIST_HEAD(net_todo_list);
4150 
4151 static void net_set_todo(struct net_device *dev)
4152 {
4153 	list_add_tail(&dev->todo_list, &net_todo_list);
4154 }
4155 
4156 static void rollback_registered(struct net_device *dev)
4157 {
4158 	BUG_ON(dev_boot_phase);
4159 	ASSERT_RTNL();
4160 
4161 	/* Some devices call this without ever having registered, as part of initialization unwind. */
4162 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4163 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4164 				  "was registered\n", dev->name, dev);
4165 
4166 		WARN_ON(1);
4167 		return;
4168 	}
4169 
4170 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4171 
4172 	/* If device is running, close it first. */
4173 	dev_close(dev);
4174 
4175 	/* And unlink it from device chain. */
4176 	unlist_netdevice(dev);
4177 
4178 	dev->reg_state = NETREG_UNREGISTERING;
4179 
4180 	synchronize_net();
4181 
4182 	/* Shutdown queueing discipline. */
4183 	dev_shutdown(dev);
4184 
4185 
4186 	/* Notify protocols, that we are about to destroy
4187 	   this device. They should clean all the things.
4188 	*/
4189 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4190 
4191 	/*
4192 	 *	Flush the unicast and multicast chains
4193 	 */
4194 	dev_addr_discard(dev);
4195 
4196 	if (dev->netdev_ops->ndo_uninit)
4197 		dev->netdev_ops->ndo_uninit(dev);
4198 
4199 	/* Notifier chain MUST detach us from master device. */
4200 	WARN_ON(dev->master);
4201 
4202 	/* Remove entries from kobject tree */
4203 	netdev_unregister_kobject(dev);
4204 
4205 	synchronize_net();
4206 
4207 	dev_put(dev);
4208 }
4209 
4210 static void __netdev_init_queue_locks_one(struct net_device *dev,
4211 					  struct netdev_queue *dev_queue,
4212 					  void *_unused)
4213 {
4214 	spin_lock_init(&dev_queue->_xmit_lock);
4215 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4216 	dev_queue->xmit_lock_owner = -1;
4217 }
4218 
4219 static void netdev_init_queue_locks(struct net_device *dev)
4220 {
4221 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4222 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4223 }
4224 
4225 unsigned long netdev_fix_features(unsigned long features, const char *name)
4226 {
4227 	/* Fix illegal SG+CSUM combinations. */
4228 	if ((features & NETIF_F_SG) &&
4229 	    !(features & NETIF_F_ALL_CSUM)) {
4230 		if (name)
4231 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4232 			       "checksum feature.\n", name);
4233 		features &= ~NETIF_F_SG;
4234 	}
4235 
4236 	/* TSO requires that SG is present as well. */
4237 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4238 		if (name)
4239 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4240 			       "SG feature.\n", name);
4241 		features &= ~NETIF_F_TSO;
4242 	}
4243 
4244 	if (features & NETIF_F_UFO) {
4245 		if (!(features & NETIF_F_GEN_CSUM)) {
4246 			if (name)
4247 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4248 				       "since no NETIF_F_HW_CSUM feature.\n",
4249 				       name);
4250 			features &= ~NETIF_F_UFO;
4251 		}
4252 
4253 		if (!(features & NETIF_F_SG)) {
4254 			if (name)
4255 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4256 				       "since no NETIF_F_SG feature.\n", name);
4257 			features &= ~NETIF_F_UFO;
4258 		}
4259 	}
4260 
4261 	return features;
4262 }
4263 EXPORT_SYMBOL(netdev_fix_features);
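
/*
 * Illustrative sketch (not part of the original file): a driver that
 * advertises SG and TSO but no checksum offload can let the helper above
 * drop the inconsistent bits before registration.  "example_fixup_features"
 * is a hypothetical name, not an existing API.
 */
static inline void example_fixup_features(struct net_device *dev)
{
	/* features as a hypothetical probe routine might have set them */
	dev->features |= NETIF_F_SG | NETIF_F_TSO;

	/* SG without any checksum feature is invalid, so this clears SG
	 * and, in turn, TSO, logging a notice tagged with the device name.
	 */
	dev->features = netdev_fix_features(dev->features, dev->name);
}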
4264 
4265 /**
4266  *	register_netdevice	- register a network device
4267  *	@dev: device to register
4268  *
4269  *	Take a completed network device structure and add it to the kernel
4270  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4271  *	chain. 0 is returned on success. A negative errno code is returned
4272  *	on a failure to set up the device, or if the name is a duplicate.
4273  *
4274  *	Callers must hold the rtnl semaphore. You may want
4275  *	register_netdev() instead of this.
4276  *
4277  *	BUGS:
4278  *	The locking appears insufficient to guarantee two parallel registers
4279  *	will not get the same name.
4280  */
4281 
4282 int register_netdevice(struct net_device *dev)
4283 {
4284 	struct hlist_head *head;
4285 	struct hlist_node *p;
4286 	int ret;
4287 	struct net *net = dev_net(dev);
4288 
4289 	BUG_ON(dev_boot_phase);
4290 	ASSERT_RTNL();
4291 
4292 	might_sleep();
4293 
4294 	/* When net_devices are persistent, this will be fatal. */
4295 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4296 	BUG_ON(!net);
4297 
4298 	spin_lock_init(&dev->addr_list_lock);
4299 	netdev_set_addr_lockdep_class(dev);
4300 	netdev_init_queue_locks(dev);
4301 
4302 	dev->iflink = -1;
4303 
4304 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4305 	/* Netdevice_ops API compatibility support.
4306 	 * This is temporary until all network devices are converted.
4307 	 */
4308 	if (dev->netdev_ops) {
4309 		const struct net_device_ops *ops = dev->netdev_ops;
4310 
4311 		dev->init = ops->ndo_init;
4312 		dev->uninit = ops->ndo_uninit;
4313 		dev->open = ops->ndo_open;
4314 		dev->change_rx_flags = ops->ndo_change_rx_flags;
4315 		dev->set_rx_mode = ops->ndo_set_rx_mode;
4316 		dev->set_multicast_list = ops->ndo_set_multicast_list;
4317 		dev->set_mac_address = ops->ndo_set_mac_address;
4318 		dev->validate_addr = ops->ndo_validate_addr;
4319 		dev->do_ioctl = ops->ndo_do_ioctl;
4320 		dev->set_config = ops->ndo_set_config;
4321 		dev->change_mtu = ops->ndo_change_mtu;
4322 		dev->tx_timeout = ops->ndo_tx_timeout;
4323 		dev->get_stats = ops->ndo_get_stats;
4324 		dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4325 		dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4326 		dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4327 #ifdef CONFIG_NET_POLL_CONTROLLER
4328 		dev->poll_controller = ops->ndo_poll_controller;
4329 #endif
4330 	} else {
4331 		char drivername[64];
4332 		pr_info("%s (%s): not using net_device_ops yet\n",
4333 			dev->name, netdev_drivername(dev, drivername, 64));
4334 
4335 		/* This works only because net_device_ops and the
4336 		   compatibility structure are the same. */
4337 		dev->netdev_ops = (void *) &(dev->init);
4338 	}
4339 #endif
4340 
4341 	/* Init, if this function is available */
4342 	if (dev->netdev_ops->ndo_init) {
4343 		ret = dev->netdev_ops->ndo_init(dev);
4344 		if (ret) {
4345 			if (ret > 0)
4346 				ret = -EIO;
4347 			goto out;
4348 		}
4349 	}
4350 
4351 	if (!dev_valid_name(dev->name)) {
4352 		ret = -EINVAL;
4353 		goto err_uninit;
4354 	}
4355 
4356 	dev->ifindex = dev_new_index(net);
4357 	if (dev->iflink == -1)
4358 		dev->iflink = dev->ifindex;
4359 
4360 	/* Check for existence of name */
4361 	head = dev_name_hash(net, dev->name);
4362 	hlist_for_each(p, head) {
4363 		struct net_device *d
4364 			= hlist_entry(p, struct net_device, name_hlist);
4365 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4366 			ret = -EEXIST;
4367 			goto err_uninit;
4368 		}
4369 	}
4370 
4371 	/* Fix illegal checksum combinations */
4372 	if ((dev->features & NETIF_F_HW_CSUM) &&
4373 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4374 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4375 		       dev->name);
4376 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4377 	}
4378 
4379 	if ((dev->features & NETIF_F_NO_CSUM) &&
4380 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4381 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4382 		       dev->name);
4383 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4384 	}
4385 
4386 	dev->features = netdev_fix_features(dev->features, dev->name);
4387 
4388 	/* Enable software GSO if SG is supported. */
4389 	if (dev->features & NETIF_F_SG)
4390 		dev->features |= NETIF_F_GSO;
4391 
4392 	netdev_initialize_kobject(dev);
4393 	ret = netdev_register_kobject(dev);
4394 	if (ret)
4395 		goto err_uninit;
4396 	dev->reg_state = NETREG_REGISTERED;
4397 
4398 	/*
4399 	 *	Default initial state at registration is that the
4400 	 *	device is present.
4401 	 */
4402 
4403 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4404 
4405 	dev_init_scheduler(dev);
4406 	dev_hold(dev);
4407 	list_netdevice(dev);
4408 
4409 	/* Notify protocols, that a new device appeared. */
4410 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4411 	ret = notifier_to_errno(ret);
4412 	if (ret) {
4413 		rollback_registered(dev);
4414 		dev->reg_state = NETREG_UNREGISTERED;
4415 	}
4416 
4417 out:
4418 	return ret;
4419 
4420 err_uninit:
4421 	if (dev->netdev_ops->ndo_uninit)
4422 		dev->netdev_ops->ndo_uninit(dev);
4423 	goto out;
4424 }
4425 
4426 /**
4427  *	register_netdev	- register a network device
4428  *	@dev: device to register
4429  *
4430  *	Take a completed network device structure and add it to the kernel
4431  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4432  *	chain. 0 is returned on success. A negative errno code is returned
4433  *	on a failure to set up the device, or if the name is a duplicate.
4434  *
4435  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4436  *	and expands the device name if you passed a format string to
4437  *	alloc_netdev.
4438  */
4439 int register_netdev(struct net_device *dev)
4440 {
4441 	int err;
4442 
4443 	rtnl_lock();
4444 
4445 	/*
4446 	 * If the name is a format string, the caller wants us to
4447 	 * allocate a name for it.
4448 	 */
4449 	if (strchr(dev->name, '%')) {
4450 		err = dev_alloc_name(dev, dev->name);
4451 		if (err < 0)
4452 			goto out;
4453 	}
4454 
4455 	err = register_netdevice(dev);
4456 out:
4457 	rtnl_unlock();
4458 	return err;
4459 }
4460 EXPORT_SYMBOL(register_netdev);
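
/*
 * Illustrative sketch (not part of the original file): typical probe-time
 * use of the wrapper above.  "example_setup" and "example_register" are
 * hypothetical driver-side names.
 */
static void example_setup(struct net_device *dev)
{
	/* a real driver would also set netdev_ops, MTU, header ops, etc.,
	 * or simply use ether_setup() for an Ethernet-like device
	 */
	dev->tx_queue_len = 1000;
}

static inline struct net_device *example_register(void)
{
	struct net_device *dev;
	int err;

	/* "eth%d" is a format string; register_netdev() picks the unit */
	dev = alloc_netdev_mq(0, "eth%d", example_setup, 1);
	if (!dev)
		return NULL;

	err = register_netdev(dev);	/* takes and releases the RTNL */
	if (err) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}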
4461 
4462 /*
4463  * netdev_wait_allrefs - wait until all references are gone.
4464  *
4465  * This is called when unregistering network devices.
4466  *
4467  * Any protocol or device that holds a reference should register
4468  * for netdevice notification, and cleanup and put back the
4469  * reference if they receive an UNREGISTER event.
4470  * We can get stuck here if buggy protocols don't correctly
4471  * call dev_put.
4472  */
4473 static void netdev_wait_allrefs(struct net_device *dev)
4474 {
4475 	unsigned long rebroadcast_time, warning_time;
4476 
4477 	rebroadcast_time = warning_time = jiffies;
4478 	while (atomic_read(&dev->refcnt) != 0) {
4479 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4480 			rtnl_lock();
4481 
4482 			/* Rebroadcast unregister notification */
4483 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4484 
4485 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4486 				     &dev->state)) {
4487 				/* We must not have linkwatch events
4488 				 * pending on unregister. If this
4489 				 * happens, we simply run the queue
4490 				 * unscheduled, resulting in a noop
4491 				 * for this device.
4492 				 */
4493 				linkwatch_run_queue();
4494 			}
4495 
4496 			__rtnl_unlock();
4497 
4498 			rebroadcast_time = jiffies;
4499 		}
4500 
4501 		msleep(250);
4502 
4503 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4504 			printk(KERN_EMERG "unregister_netdevice: "
4505 			       "waiting for %s to become free. Usage "
4506 			       "count = %d\n",
4507 			       dev->name, atomic_read(&dev->refcnt));
4508 			warning_time = jiffies;
4509 		}
4510 	}
4511 }
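
/*
 * Illustrative sketch (not part of the original file): how a subsystem that
 * keeps a long-lived dev_hold() reference can honour the contract described
 * above, dropping the reference from its netdevice notifier so that
 * netdev_wait_allrefs() can finish.  Names prefixed "example_" are
 * hypothetical.
 */
static struct net_device *example_cached_dev;

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		example_cached_dev = NULL;
		dev_put(dev);		/* give the reference back */
	}
	return NOTIFY_DONE;
}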
4512 
4513 /* The sequence is:
4514  *
4515  *	rtnl_lock();
4516  *	...
4517  *	register_netdevice(x1);
4518  *	register_netdevice(x2);
4519  *	...
4520  *	unregister_netdevice(y1);
4521  *	unregister_netdevice(y2);
4522  *      ...
4523  *	rtnl_unlock();
4524  *	free_netdev(y1);
4525  *	free_netdev(y2);
4526  *
4527  * We are invoked by rtnl_unlock().
4528  * This allows us to deal with problems:
4529  * 1) We can delete sysfs objects which invoke hotplug
4530  *    without deadlocking with linkwatch via keventd.
4531  * 2) Since we run with the RTNL semaphore not held, we can sleep
4532  *    safely in order to wait for the netdev refcnt to drop to zero.
4533  *
4534  * We must not return until all unregister events added during
4535  * the interval the lock was held have been completed.
4536  */
4537 void netdev_run_todo(void)
4538 {
4539 	struct list_head list;
4540 
4541 	/* Snapshot list, allow later requests */
4542 	list_replace_init(&net_todo_list, &list);
4543 
4544 	__rtnl_unlock();
4545 
4546 	while (!list_empty(&list)) {
4547 		struct net_device *dev
4548 			= list_entry(list.next, struct net_device, todo_list);
4549 		list_del(&dev->todo_list);
4550 
4551 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4552 			printk(KERN_ERR "network todo '%s' but state %d\n",
4553 			       dev->name, dev->reg_state);
4554 			dump_stack();
4555 			continue;
4556 		}
4557 
4558 		dev->reg_state = NETREG_UNREGISTERED;
4559 
4560 		on_each_cpu(flush_backlog, dev, 1);
4561 
4562 		netdev_wait_allrefs(dev);
4563 
4564 		/* paranoia */
4565 		BUG_ON(atomic_read(&dev->refcnt));
4566 		WARN_ON(dev->ip_ptr);
4567 		WARN_ON(dev->ip6_ptr);
4568 		WARN_ON(dev->dn_ptr);
4569 
4570 		if (dev->destructor)
4571 			dev->destructor(dev);
4572 
4573 		/* Free network device */
4574 		kobject_put(&dev->dev.kobj);
4575 	}
4576 }
4577 
4578 /**
4579  *	dev_get_stats	- get network device statistics
4580  *	@dev: device to get statistics from
4581  *
4582  *	Get network statistics from device. The device driver may provide
4583  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
4584  *	the internal statistics structure is used.
4585  */
4586 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4587 {
4588 	const struct net_device_ops *ops = dev->netdev_ops;
4589 
4590 	if (ops->ndo_get_stats)
4591 		return ops->ndo_get_stats(dev);
4592 	else
4593 		return &dev->stats;
4594 }
4595 EXPORT_SYMBOL(dev_get_stats);
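
/*
 * Illustrative sketch (not part of the original file): a caller that already
 * holds a reference to the device can read the counters like this.
 * "example_tx_bytes" is a hypothetical name.
 */
static inline unsigned long example_tx_bytes(struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	/* either the driver's ndo_get_stats() result or &dev->stats */
	return stats->tx_bytes;
}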
4596 
4597 static void netdev_init_one_queue(struct net_device *dev,
4598 				  struct netdev_queue *queue,
4599 				  void *_unused)
4600 {
4601 	queue->dev = dev;
4602 }
4603 
4604 static void netdev_init_queues(struct net_device *dev)
4605 {
4606 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4607 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4608 	spin_lock_init(&dev->tx_global_lock);
4609 }
4610 
4611 /**
4612  *	alloc_netdev_mq - allocate network device
4613  *	@sizeof_priv:	size of private data to allocate space for
4614  *	@name:		device name format string
4615  *	@setup:		callback to initialize device
4616  *	@queue_count:	the number of subqueues to allocate
4617  *
4618  *	Allocates a struct net_device with private data area for driver use
4619  *	and performs basic initialization.  Also allocates subqueue structs
4620  *	for each transmit queue on the device.
4621  */
4622 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4623 		void (*setup)(struct net_device *), unsigned int queue_count)
4624 {
4625 	struct netdev_queue *tx;
4626 	struct net_device *dev;
4627 	size_t alloc_size;
4628 	void *p;
4629 
4630 	BUG_ON(strlen(name) >= sizeof(dev->name));
4631 
4632 	alloc_size = sizeof(struct net_device);
4633 	if (sizeof_priv) {
4634 		/* ensure 32-byte alignment of private area */
4635 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4636 		alloc_size += sizeof_priv;
4637 	}
4638 	/* ensure 32-byte alignment of whole construct */
4639 	alloc_size += NETDEV_ALIGN_CONST;
4640 
4641 	p = kzalloc(alloc_size, GFP_KERNEL);
4642 	if (!p) {
4643 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4644 		return NULL;
4645 	}
4646 
4647 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4648 	if (!tx) {
4649 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4650 		       "tx qdiscs.\n");
4651 		kfree(p);
4652 		return NULL;
4653 	}
4654 
4655 	dev = (struct net_device *)
4656 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4657 	dev->padded = (char *)dev - (char *)p;
4658 	dev_net_set(dev, &init_net);
4659 
4660 	dev->_tx = tx;
4661 	dev->num_tx_queues = queue_count;
4662 	dev->real_num_tx_queues = queue_count;
4663 
4664 	dev->gso_max_size = GSO_MAX_SIZE;
4665 
4666 	netdev_init_queues(dev);
4667 
4668 	INIT_LIST_HEAD(&dev->napi_list);
4669 	setup(dev);
4670 	strcpy(dev->name, name);
4671 	return dev;
4672 }
4673 EXPORT_SYMBOL(alloc_netdev_mq);
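
/*
 * Illustrative sketch (not part of the original file): allocating a
 * two-queue device with a private area and reaching that area through
 * netdev_priv().  "struct example_priv" is hypothetical, and the setup
 * callback reuses the hypothetical example_setup() sketched earlier in
 * this file.
 */
struct example_priv {
	spinlock_t lock;
	unsigned long rx_drops;
};

static inline struct net_device *example_alloc(void)
{
	struct net_device *dev;
	struct example_priv *priv;

	dev = alloc_netdev_mq(sizeof(struct example_priv), "ex%d",
			      example_setup, 2);
	if (!dev)
		return NULL;

	/* the private area lives in the 32-byte aligned tail of the block */
	priv = netdev_priv(dev);
	spin_lock_init(&priv->lock);
	priv->rx_drops = 0;
	return dev;
}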
4674 
4675 /**
4676  *	free_netdev - free network device
4677  *	@dev: device
4678  *
4679  *	This function does the last stage of destroying an allocated device
4680  * 	interface. The reference to the device object is released.
4681  *	If this is the last reference then it will be freed.
4682  */
4683 void free_netdev(struct net_device *dev)
4684 {
4685 	struct napi_struct *p, *n;
4686 
4687 	release_net(dev_net(dev));
4688 
4689 	kfree(dev->_tx);
4690 
4691 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4692 		netif_napi_del(p);
4693 
4694 	/*  Compatibility with error handling in drivers */
4695 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4696 		kfree((char *)dev - dev->padded);
4697 		return;
4698 	}
4699 
4700 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4701 	dev->reg_state = NETREG_RELEASED;
4702 
4703 	/* will free via device release */
4704 	put_device(&dev->dev);
4705 }
4706 
4707 /**
4708  *	synchronize_net -  Synchronize with packet receive processing
4709  *
4710  *	Wait for packets currently being received to be done.
4711  *	Does not block later packets from starting.
4712  */
4713 void synchronize_net(void)
4714 {
4715 	might_sleep();
4716 	synchronize_rcu();
4717 }
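
/*
 * Illustrative sketch (not part of the original file): the usual pattern
 * around synchronize_net() - unpublish an object that receive paths may
 * still be using under rcu_read_lock(), wait, and only then free or reuse
 * it.  "example_retire_handler" is a hypothetical name.
 */
static inline void example_retire_handler(struct packet_type *pt)
{
	__dev_remove_pack(pt);	/* unhook; receivers may still hold it */
	synchronize_net();	/* wait for in-flight receive processing */
	/* *pt is no longer referenced by the receive path */
}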
4718 
4719 /**
4720  *	unregister_netdevice - remove device from the kernel
4721  *	@dev: device
4722  *
4723  *	This function shuts down a device interface and removes it
4724  *	from the kernel tables.
4725  *
4726  *	Callers must hold the rtnl semaphore.  You may want
4727  *	unregister_netdev() instead of this.
4728  */
4729 
4730 void unregister_netdevice(struct net_device *dev)
4731 {
4732 	ASSERT_RTNL();
4733 
4734 	rollback_registered(dev);
4735 	/* Finish processing unregister after unlock */
4736 	net_set_todo(dev);
4737 }
4738 
4739 /**
4740  *	unregister_netdev - remove device from the kernel
4741  *	@dev: device
4742  *
4743  *	This function shuts down a device interface and removes it
4744  *	from the kernel tables.
4745  *
4746  *	This is just a wrapper for unregister_netdevice that takes
4747  *	the rtnl semaphore.  In general you want to use this and not
4748  *	unregister_netdevice.
4749  */
4750 void unregister_netdev(struct net_device *dev)
4751 {
4752 	rtnl_lock();
4753 	unregister_netdevice(dev);
4754 	rtnl_unlock();
4755 }
4756 
4757 EXPORT_SYMBOL(unregister_netdev);
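
/*
 * Illustrative sketch (not part of the original file): typical remove-time
 * teardown pairing the wrapper above with free_netdev().
 * "example_teardown" is a hypothetical name.
 */
static inline void example_teardown(struct net_device *dev)
{
	unregister_netdev(dev);	/* takes the RTNL and runs the todo list */
	free_netdev(dev);	/* releases the final device reference */
}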
4758 
4759 /**
4760  *	dev_change_net_namespace - move device to a different network namespace
4761  *	@dev: device
4762  *	@net: network namespace
4763  *	@pat: If not NULL, name pattern to try if the current device name
4764  *	      is already taken in the destination network namespace.
4765  *
4766  *	This function shuts down a device interface and moves it
4767  *	to a new network namespace. On success 0 is returned, on
4768  *	a failure a negative errno code is returned.
4769  *
4770  *	Callers must hold the rtnl semaphore.
4771  */
4772 
4773 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4774 {
4775 	char buf[IFNAMSIZ];
4776 	const char *destname;
4777 	int err;
4778 
4779 	ASSERT_RTNL();
4780 
4781 	/* Don't allow namespace local devices to be moved. */
4782 	err = -EINVAL;
4783 	if (dev->features & NETIF_F_NETNS_LOCAL)
4784 		goto out;
4785 
4786 #ifdef CONFIG_SYSFS
4787 	/* Don't allow real devices to be moved when sysfs
4788 	 * is enabled.
4789 	 */
4790 	err = -EINVAL;
4791 	if (dev->dev.parent)
4792 		goto out;
4793 #endif
4794 
4795 	/* Ensure the device has been registered */
4796 	err = -EINVAL;
4797 	if (dev->reg_state != NETREG_REGISTERED)
4798 		goto out;
4799 
4800 	/* Get out if there is nothing to do */
4801 	err = 0;
4802 	if (net_eq(dev_net(dev), net))
4803 		goto out;
4804 
4805 	/* Pick the destination device name, and ensure
4806 	 * we can use it in the destination network namespace.
4807 	 */
4808 	err = -EEXIST;
4809 	destname = dev->name;
4810 	if (__dev_get_by_name(net, destname)) {
4811 		/* We get here if we can't use the current device name */
4812 		if (!pat)
4813 			goto out;
4814 		if (!dev_valid_name(pat))
4815 			goto out;
4816 		if (strchr(pat, '%')) {
4817 			if (__dev_alloc_name(net, pat, buf) < 0)
4818 				goto out;
4819 			destname = buf;
4820 		} else
4821 			destname = pat;
4822 		if (__dev_get_by_name(net, destname))
4823 			goto out;
4824 	}
4825 
4826 	/*
4827 	 * And now a mini version of register_netdevice and unregister_netdevice.
4828 	 */
4829 
4830 	/* If device is running close it first. */
4831 	dev_close(dev);
4832 
4833 	/* And unlink it from device chain */
4834 	err = -ENODEV;
4835 	unlist_netdevice(dev);
4836 
4837 	synchronize_net();
4838 
4839 	/* Shutdown queueing discipline. */
4840 	dev_shutdown(dev);
4841 
4842 	/* Notify protocols, that we are about to destroy
4843 	   this device. They should clean all the things.
4844 	*/
4845 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4846 
4847 	/*
4848 	 *	Flush the unicast and multicast chains
4849 	 */
4850 	dev_addr_discard(dev);
4851 
4852 	netdev_unregister_kobject(dev);
4853 
4854 	/* Actually switch the network namespace */
4855 	dev_net_set(dev, net);
4856 
4857 	/* Assign the new device name */
4858 	if (destname != dev->name)
4859 		strcpy(dev->name, destname);
4860 
4861 	/* If there is an ifindex conflict assign a new one */
4862 	if (__dev_get_by_index(net, dev->ifindex)) {
4863 		int iflink = (dev->iflink == dev->ifindex);
4864 		dev->ifindex = dev_new_index(net);
4865 		if (iflink)
4866 			dev->iflink = dev->ifindex;
4867 	}
4868 
4869 	/* Fixup kobjects */
4870 	err = netdev_register_kobject(dev);
4871 	WARN_ON(err);
4872 
4873 	/* Add the device back in the hashes */
4874 	list_netdevice(dev);
4875 
4876 	/* Notify protocols, that a new device appeared. */
4877 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4878 
4879 	synchronize_net();
4880 	err = 0;
4881 out:
4882 	return err;
4883 }
4884 
4885 static int dev_cpu_callback(struct notifier_block *nfb,
4886 			    unsigned long action,
4887 			    void *ocpu)
4888 {
4889 	struct sk_buff **list_skb;
4890 	struct Qdisc **list_net;
4891 	struct sk_buff *skb;
4892 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4893 	struct softnet_data *sd, *oldsd;
4894 
4895 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4896 		return NOTIFY_OK;
4897 
4898 	local_irq_disable();
4899 	cpu = smp_processor_id();
4900 	sd = &per_cpu(softnet_data, cpu);
4901 	oldsd = &per_cpu(softnet_data, oldcpu);
4902 
4903 	/* Find end of our completion_queue. */
4904 	list_skb = &sd->completion_queue;
4905 	while (*list_skb)
4906 		list_skb = &(*list_skb)->next;
4907 	/* Append completion queue from offline CPU. */
4908 	*list_skb = oldsd->completion_queue;
4909 	oldsd->completion_queue = NULL;
4910 
4911 	/* Find end of our output_queue. */
4912 	list_net = &sd->output_queue;
4913 	while (*list_net)
4914 		list_net = &(*list_net)->next_sched;
4915 	/* Append output queue from offline CPU. */
4916 	*list_net = oldsd->output_queue;
4917 	oldsd->output_queue = NULL;
4918 
4919 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4920 	local_irq_enable();
4921 
4922 	/* Process offline CPU's input_pkt_queue */
4923 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4924 		netif_rx(skb);
4925 
4926 	return NOTIFY_OK;
4927 }
4928 
4929 
4930 /**
4931  *	netdev_increment_features - increment feature set by one
4932  *	@all: current feature set
4933  *	@one: new feature set
4934  *	@mask: mask feature set
4935  *
4936  *	Computes a new feature set after adding a device with feature set
4937  *	@one to the master device with current feature set @all.  Will not
4938  *	enable anything that is off in @mask. Returns the new feature set.
4939  */
4940 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
4941 					unsigned long mask)
4942 {
4943 	/* If device needs checksumming, downgrade to it. */
4944 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4945 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
4946 	else if (mask & NETIF_F_ALL_CSUM) {
4947 		/* If one device supports v4/v6 checksumming, set for all. */
4948 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
4949 		    !(all & NETIF_F_GEN_CSUM)) {
4950 			all &= ~NETIF_F_ALL_CSUM;
4951 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
4952 		}
4953 
4954 		/* If one device supports hw checksumming, set for all. */
4955 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
4956 			all &= ~NETIF_F_ALL_CSUM;
4957 			all |= NETIF_F_HW_CSUM;
4958 		}
4959 	}
4960 
4961 	one |= NETIF_F_ALL_CSUM;
4962 
4963 	one |= all & NETIF_F_ONE_FOR_ALL;
4964 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
4965 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
4966 
4967 	return all;
4968 }
4969 EXPORT_SYMBOL(netdev_increment_features);
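
/*
 * Illustrative sketch (not part of the original file): how a bonding-style
 * master could fold a newly enslaved device's features into its own, as
 * described above.  The mask passed here (NETIF_F_ONE_FOR_ALL) is only an
 * example choice; "example_refresh_features" is a hypothetical name.
 */
static inline void example_refresh_features(struct net_device *master,
					    struct net_device *slave)
{
	master->features = netdev_increment_features(master->features,
						     slave->features,
						     NETIF_F_ONE_FOR_ALL);
}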
4970 
4971 static struct hlist_head *netdev_create_hash(void)
4972 {
4973 	int i;
4974 	struct hlist_head *hash;
4975 
4976 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4977 	if (hash != NULL)
4978 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
4979 			INIT_HLIST_HEAD(&hash[i]);
4980 
4981 	return hash;
4982 }
4983 
4984 /* Initialize per network namespace state */
4985 static int __net_init netdev_init(struct net *net)
4986 {
4987 	INIT_LIST_HEAD(&net->dev_base_head);
4988 
4989 	net->dev_name_head = netdev_create_hash();
4990 	if (net->dev_name_head == NULL)
4991 		goto err_name;
4992 
4993 	net->dev_index_head = netdev_create_hash();
4994 	if (net->dev_index_head == NULL)
4995 		goto err_idx;
4996 
4997 	return 0;
4998 
4999 err_idx:
5000 	kfree(net->dev_name_head);
5001 err_name:
5002 	return -ENOMEM;
5003 }
5004 
5005 /**
5006  *	netdev_drivername - network driver for the device
5007  *	@dev: network device
5008  *	@buffer: buffer for resulting name
5009  *	@len: size of buffer
5010  *
5011  *	Determine network driver for device.
5012  */
5013 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5014 {
5015 	const struct device_driver *driver;
5016 	const struct device *parent;
5017 
5018 	if (len <= 0 || !buffer)
5019 		return buffer;
5020 	buffer[0] = 0;
5021 
5022 	parent = dev->dev.parent;
5023 
5024 	if (!parent)
5025 		return buffer;
5026 
5027 	driver = parent->driver;
5028 	if (driver && driver->name)
5029 		strlcpy(buffer, driver->name, len);
5030 	return buffer;
5031 }
5032 
5033 static void __net_exit netdev_exit(struct net *net)
5034 {
5035 	kfree(net->dev_name_head);
5036 	kfree(net->dev_index_head);
5037 }
5038 
5039 static struct pernet_operations __net_initdata netdev_net_ops = {
5040 	.init = netdev_init,
5041 	.exit = netdev_exit,
5042 };
5043 
5044 static void __net_exit default_device_exit(struct net *net)
5045 {
5046 	struct net_device *dev;
5047 	/*
5048 	 * Push all migratable network devices back to the
5049 	 * initial network namespace
5050 	 */
5051 	rtnl_lock();
5052 restart:
5053 	for_each_netdev(net, dev) {
5054 		int err;
5055 		char fb_name[IFNAMSIZ];
5056 
5057 		/* Ignore unmovable devices (e.g. loopback) */
5058 		if (dev->features & NETIF_F_NETNS_LOCAL)
5059 			continue;
5060 
5061 		/* Delete virtual devices */
5062 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5063 			dev->rtnl_link_ops->dellink(dev);
5064 			goto restart;
5065 		}
5066 
5067 		/* Push remaining network devices to init_net */
5068 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5069 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5070 		if (err) {
5071 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5072 				__func__, dev->name, err);
5073 			BUG();
5074 		}
5075 		goto restart;
5076 	}
5077 	rtnl_unlock();
5078 }
5079 
5080 static struct pernet_operations __net_initdata default_device_ops = {
5081 	.exit = default_device_exit,
5082 };
5083 
5084 /*
5085  *	Initialize the DEV module. At boot time this walks the device list and
5086  *	unhooks any devices that fail to initialise (normally hardware not
5087  *	present) and leaves us with a valid list of present and active devices.
5088  *
5089  */
5090 
5091 /*
5092  *       This is called single threaded during boot, so no need
5093  *       to take the rtnl semaphore.
5094  */
5095 static int __init net_dev_init(void)
5096 {
5097 	int i, rc = -ENOMEM;
5098 
5099 	BUG_ON(!dev_boot_phase);
5100 
5101 	if (dev_proc_init())
5102 		goto out;
5103 
5104 	if (netdev_kobject_init())
5105 		goto out;
5106 
5107 	INIT_LIST_HEAD(&ptype_all);
5108 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5109 		INIT_LIST_HEAD(&ptype_base[i]);
5110 
5111 	if (register_pernet_subsys(&netdev_net_ops))
5112 		goto out;
5113 
5114 	/*
5115 	 *	Initialise the packet receive queues.
5116 	 */
5117 
5118 	for_each_possible_cpu(i) {
5119 		struct softnet_data *queue;
5120 
5121 		queue = &per_cpu(softnet_data, i);
5122 		skb_queue_head_init(&queue->input_pkt_queue);
5123 		queue->completion_queue = NULL;
5124 		INIT_LIST_HEAD(&queue->poll_list);
5125 
5126 		queue->backlog.poll = process_backlog;
5127 		queue->backlog.weight = weight_p;
5128 		queue->backlog.gro_list = NULL;
5129 	}
5130 
5131 	dev_boot_phase = 0;
5132 
5133 	/* The loopback device is special: if any other network device
5134 	 * is present in a network namespace, the loopback device must
5135 	 * be present too. Since we now dynamically allocate and free the
5136 	 * loopback device, ensure this invariant is maintained by
5137 	 * keeping the loopback device as the first device on the
5138 	 * list of network devices.  This ensures the loopback device
5139 	 * is the first device that appears and the last network device
5140 	 * that disappears.
5141 	 */
5142 	if (register_pernet_device(&loopback_net_ops))
5143 		goto out;
5144 
5145 	if (register_pernet_device(&default_device_ops))
5146 		goto out;
5147 
5148 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5149 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5150 
5151 	hotcpu_notifier(dev_cpu_callback, 0);
5152 	dst_init();
5153 	dev_mcast_init();
5154 	#ifdef CONFIG_NET_DMA
5155 	dmaengine_get();
5156 	#endif
5157 	rc = 0;
5158 out:
5159 	return rc;
5160 }
5161 
5162 subsys_initcall(net_dev_init);
5163 
5164 EXPORT_SYMBOL(__dev_get_by_index);
5165 EXPORT_SYMBOL(__dev_get_by_name);
5166 EXPORT_SYMBOL(__dev_remove_pack);
5167 EXPORT_SYMBOL(dev_valid_name);
5168 EXPORT_SYMBOL(dev_add_pack);
5169 EXPORT_SYMBOL(dev_alloc_name);
5170 EXPORT_SYMBOL(dev_close);
5171 EXPORT_SYMBOL(dev_get_by_flags);
5172 EXPORT_SYMBOL(dev_get_by_index);
5173 EXPORT_SYMBOL(dev_get_by_name);
5174 EXPORT_SYMBOL(dev_open);
5175 EXPORT_SYMBOL(dev_queue_xmit);
5176 EXPORT_SYMBOL(dev_remove_pack);
5177 EXPORT_SYMBOL(dev_set_allmulti);
5178 EXPORT_SYMBOL(dev_set_promiscuity);
5179 EXPORT_SYMBOL(dev_change_flags);
5180 EXPORT_SYMBOL(dev_set_mtu);
5181 EXPORT_SYMBOL(dev_set_mac_address);
5182 EXPORT_SYMBOL(free_netdev);
5183 EXPORT_SYMBOL(netdev_boot_setup_check);
5184 EXPORT_SYMBOL(netdev_set_master);
5185 EXPORT_SYMBOL(netdev_state_change);
5186 EXPORT_SYMBOL(netif_receive_skb);
5187 EXPORT_SYMBOL(netif_rx);
5188 EXPORT_SYMBOL(register_gifconf);
5189 EXPORT_SYMBOL(register_netdevice);
5190 EXPORT_SYMBOL(register_netdevice_notifier);
5191 EXPORT_SYMBOL(skb_checksum_help);
5192 EXPORT_SYMBOL(synchronize_net);
5193 EXPORT_SYMBOL(unregister_netdevice);
5194 EXPORT_SYMBOL(unregister_netdevice_notifier);
5195 EXPORT_SYMBOL(net_enable_timestamp);
5196 EXPORT_SYMBOL(net_disable_timestamp);
5197 EXPORT_SYMBOL(dev_get_flags);
5198 
5199 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5200 EXPORT_SYMBOL(br_handle_frame_hook);
5201 EXPORT_SYMBOL(br_fdb_get_hook);
5202 EXPORT_SYMBOL(br_fdb_put_hook);
5203 #endif
5204 
5205 EXPORT_SYMBOL(dev_load);
5206 
5207 EXPORT_PER_CPU_SYMBOL(softnet_data);
5208