xref: /linux-6.15/net/core/dev.c (revision 7fcab099)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <[email protected]>
12  *				Mark Evans, <[email protected]>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <[email protected]>
16  *		Alan Cox <[email protected]>
17  *		David Hinds <[email protected]>
18  *		Alexey Kuznetsov <[email protected]>
19  *		Adam Sulmicki <[email protected]>
20  *              Pekka Riikonen <[email protected]>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 
131 #include "net-sysfs.h"
132 
133 /* Instead of increasing this, you should create a hash table. */
134 #define MAX_GRO_SKBS 8
135 
136 /* This should be increased if a protocol with a bigger head is added. */
137 #define GRO_MAX_HEAD (MAX_HEADER + 128)
138 
139 /*
140  *	The list of packet types we will receive (as opposed to discard)
141  *	and the routines to invoke.
142  *
143  *	Why 16? Because with 16 the only overlap we get on a hash of the
144  *	low nibble of the protocol value is RARP/SNAP/X.25.
145  *
146  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
147  *             sure which should go first, but I bet it won't make much
148  *             difference if we are running VLANs.  The good news is that
149  *             this protocol won't be in the list unless compiled in, so
150  *             the average user (w/out VLANs) will not be adversely affected.
151  *             --BLG
152  *
153  *		0800	IP
154  *		8100    802.1Q VLAN
155  *		0001	802.3
156  *		0002	AX.25
157  *		0004	802.2
158  *		8035	RARP
159  *		0005	SNAP
160  *		0805	X.25
161  *		0806	ARP
162  *		8137	IPX
163  *		0009	Localtalk
164  *		86DD	IPv6
165  */
166 
167 #define PTYPE_HASH_SIZE	(16)
168 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
169 
170 static DEFINE_SPINLOCK(ptype_lock);
171 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
172 static struct list_head ptype_all __read_mostly;	/* Taps */
173 
174 /*
175  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
176  * semaphore.
177  *
178  * Pure readers hold dev_base_lock for reading.
179  *
180  * Writers must hold the rtnl semaphore while they loop through the
181  * dev_base_head list, and hold dev_base_lock for writing when they do the
182  * actual updates.  This allows pure readers to access the list even
183  * while a writer is preparing to update it.
184  *
185  * To put it another way, dev_base_lock is held for writing only to
186  * protect against pure readers; the rtnl semaphore provides the
187  * protection against other writers.
188  *
189  * For example usages, see register_netdevice() and
190  * unregister_netdevice(), which must be called with the rtnl
191  * semaphore held.
192  */
193 DEFINE_RWLOCK(dev_base_lock);
194 
195 EXPORT_SYMBOL(dev_base_lock);
196 
197 #define NETDEV_HASHBITS	8
198 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
199 
200 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
201 {
202 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
203 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
204 }
205 
206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 {
208 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
209 }
210 
211 /* Device list insertion */
212 static int list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev_net(dev);
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
222 	write_unlock_bh(&dev_base_lock);
223 	return 0;
224 }
225 
226 /* Device list removal */
227 static void unlist_netdevice(struct net_device *dev)
228 {
229 	ASSERT_RTNL();
230 
231 	/* Unlink dev from the device chain */
232 	write_lock_bh(&dev_base_lock);
233 	list_del(&dev->dev_list);
234 	hlist_del(&dev->name_hlist);
235 	hlist_del(&dev->index_hlist);
236 	write_unlock_bh(&dev_base_lock);
237 }
238 
239 /*
240  *	Our notifier list
241  */
242 
243 static RAW_NOTIFIER_HEAD(netdev_chain);
244 
245 /*
246  *	Device drivers call our routines to queue packets here. We empty the
247  *	queue in the local softnet handler.
248  */
249 
250 DEFINE_PER_CPU(struct softnet_data, softnet_data);
251 
252 #ifdef CONFIG_LOCKDEP
253 /*
254  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
255  * according to dev->type
256  */
257 static const unsigned short netdev_lock_type[] =
258 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
259 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
260 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
261 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
262 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
263 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
264 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
265 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
266 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
267 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
268 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
269 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
270 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
271 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
272 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_IEEE802154_PHY,
273 	 ARPHRD_VOID, ARPHRD_NONE};
274 
275 static const char *netdev_lock_name[] =
276 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
277 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
278 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
279 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
280 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
281 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
282 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
283 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
284 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
285 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
286 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
287 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
288 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
289 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
290 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_IEEE802154_PHY",
291 	 "_xmit_VOID", "_xmit_NONE"};
292 
293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
295 
296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
297 {
298 	int i;
299 
300 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
301 		if (netdev_lock_type[i] == dev_type)
302 			return i;
303 	/* the last key is used by default */
304 	return ARRAY_SIZE(netdev_lock_type) - 1;
305 }
306 
307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
308 						 unsigned short dev_type)
309 {
310 	int i;
311 
312 	i = netdev_lock_pos(dev_type);
313 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
314 				   netdev_lock_name[i]);
315 }
316 
317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
318 {
319 	int i;
320 
321 	i = netdev_lock_pos(dev->type);
322 	lockdep_set_class_and_name(&dev->addr_list_lock,
323 				   &netdev_addr_lock_key[i],
324 				   netdev_lock_name[i]);
325 }
326 #else
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 }
331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
332 {
333 }
334 #endif
335 
336 /*******************************************************************************
337 
338 		Protocol management and registration routines
339 
340 *******************************************************************************/
341 
342 /*
343  *	Add a protocol ID to the list. Now that the input handler is
344  *	smarter we can dispense with all the messy stuff that used to be
345  *	here.
346  *
347  *	BEWARE!!! Protocol handlers that mangle input packets
348  *	MUST BE last in the hash buckets, and the walk over protocol handlers
349  *	MUST start from the promiscuous ptype_all chain in net_bh.
350  *	This is true now; do not change it.
351  *	Explanation: if a packet-mangling protocol handler were
352  *	first on the list, it could not detect that the packet
353  *	is cloned and must be copied-on-write, so it would
354  *	modify the clone and subsequent readers would see a broken packet.
355  *							--ANK (980803)
356  */
357 
358 /**
359  *	dev_add_pack - add packet handler
360  *	@pt: packet type declaration
361  *
362  *	Add a protocol handler to the networking stack. The passed &packet_type
363  *	is linked into kernel lists and may not be freed until it has been
364  *	removed from the kernel lists.
365  *
366  *	This call does not sleep, therefore it cannot
367  *	guarantee that all CPUs that are in the middle of receiving packets
368  *	will see the new packet type (until the next received packet).
369  */
370 
371 void dev_add_pack(struct packet_type *pt)
372 {
373 	int hash;
374 
375 	spin_lock_bh(&ptype_lock);
376 	if (pt->type == htons(ETH_P_ALL))
377 		list_add_rcu(&pt->list, &ptype_all);
378 	else {
379 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
380 		list_add_rcu(&pt->list, &ptype_base[hash]);
381 	}
382 	spin_unlock_bh(&ptype_lock);
383 }
384 
385 /**
386  *	__dev_remove_pack	 - remove packet handler
387  *	@pt: packet type declaration
388  *
389  *	Remove a protocol handler that was previously added to the kernel
390  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
391  *	from the kernel lists and can be freed or reused once this function
392  *	returns.
393  *
394  *      The packet type might still be in use by receivers
395  *	and must not be freed until after all the CPUs have gone
396  *	through a quiescent state.
397  */
398 void __dev_remove_pack(struct packet_type *pt)
399 {
400 	struct list_head *head;
401 	struct packet_type *pt1;
402 
403 	spin_lock_bh(&ptype_lock);
404 
405 	if (pt->type == htons(ETH_P_ALL))
406 		head = &ptype_all;
407 	else
408 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
409 
410 	list_for_each_entry(pt1, head, list) {
411 		if (pt == pt1) {
412 			list_del_rcu(&pt->list);
413 			goto out;
414 		}
415 	}
416 
417 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
418 out:
419 	spin_unlock_bh(&ptype_lock);
420 }
421 /**
422  *	dev_remove_pack	 - remove packet handler
423  *	@pt: packet type declaration
424  *
425  *	Remove a protocol handler that was previously added to the kernel
426  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
427  *	from the kernel lists and can be freed or reused once this function
428  *	returns.
429  *
430  *	This call sleeps to guarantee that no CPU is looking at the packet
431  *	type after return.
432  */
433 void dev_remove_pack(struct packet_type *pt)
434 {
435 	__dev_remove_pack(pt);
436 
437 	synchronize_net();
438 }
439 
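/*
 * Illustrative usage sketch: how a separate module might hook a protocol
 * with dev_add_pack()/dev_remove_pack().  The EtherType 0x88B5 (IEEE local
 * experimental) and the names example_pkt_rcv/example_pt are hypothetical.
 */
static int example_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns the skb and must consume or free it. */
	printk(KERN_DEBUG "example: %u byte frame on %s\n", skb->len, dev->name);
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pt __read_mostly = {
	.type	= __constant_htons(0x88B5),
	.func	= example_pkt_rcv,	/* .dev left NULL: match all devices */
};

static int __init example_pt_init(void)
{
	dev_add_pack(&example_pt);
	return 0;
}

static void __exit example_pt_exit(void)
{
	/* dev_remove_pack() sleeps in synchronize_net() before returning. */
	dev_remove_pack(&example_pt);
}

module_init(example_pt_init);
module_exit(example_pt_exit);
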
440 /******************************************************************************
441 
442 		      Device Boot-time Settings Routines
443 
444 *******************************************************************************/
445 
446 /* Boot time configuration table */
447 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
448 
449 /**
450  *	netdev_boot_setup_add	- add new setup entry
451  *	@name: name of the device
452  *	@map: configured settings for the device
453  *
454  *	Adds a new setup entry to the dev_boot_setup list.  The function
455  *	returns 0 on error and 1 on success.  This is a generic routine for
456  *	all netdevices.
457  */
458 static int netdev_boot_setup_add(char *name, struct ifmap *map)
459 {
460 	struct netdev_boot_setup *s;
461 	int i;
462 
463 	s = dev_boot_setup;
464 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
465 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
466 			memset(s[i].name, 0, sizeof(s[i].name));
467 			strlcpy(s[i].name, name, IFNAMSIZ);
468 			memcpy(&s[i].map, map, sizeof(s[i].map));
469 			break;
470 		}
471 	}
472 
473 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
474 }
475 
476 /**
477  *	netdev_boot_setup_check	- check boot time settings
478  *	@dev: the netdevice
479  *
480  * 	Check boot time settings for the device.
481  *	Any settings found are applied to the device for use
482  *	later during device probing.
483  *	Returns 0 if no settings were found, 1 if they were.
484  */
485 int netdev_boot_setup_check(struct net_device *dev)
486 {
487 	struct netdev_boot_setup *s = dev_boot_setup;
488 	int i;
489 
490 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
491 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
492 		    !strcmp(dev->name, s[i].name)) {
493 			dev->irq 	= s[i].map.irq;
494 			dev->base_addr 	= s[i].map.base_addr;
495 			dev->mem_start 	= s[i].map.mem_start;
496 			dev->mem_end 	= s[i].map.mem_end;
497 			return 1;
498 		}
499 	}
500 	return 0;
501 }
502 
503 
504 /**
505  *	netdev_boot_base	- get address from boot time settings
506  *	@prefix: prefix for network device
507  *	@unit: id for network device
508  *
509  * 	Check boot time settings for the base address of the device.
510  *	Returns the configured base address, 1 if the device is already
511  *	registered (so that it will not be probed), or 0 if no settings
512  *	were found.
513  */
514 unsigned long netdev_boot_base(const char *prefix, int unit)
515 {
516 	const struct netdev_boot_setup *s = dev_boot_setup;
517 	char name[IFNAMSIZ];
518 	int i;
519 
520 	sprintf(name, "%s%d", prefix, unit);
521 
522 	/*
523 	 * If device already registered then return base of 1
524 	 * to indicate not to probe for this interface
525 	 */
526 	if (__dev_get_by_name(&init_net, name))
527 		return 1;
528 
529 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
530 		if (!strcmp(name, s[i].name))
531 			return s[i].map.base_addr;
532 	return 0;
533 }
534 
535 /*
536  * Saves the settings configured at boot time for any netdevice.
537  */
538 int __init netdev_boot_setup(char *str)
539 {
540 	int ints[5];
541 	struct ifmap map;
542 
543 	str = get_options(str, ARRAY_SIZE(ints), ints);
544 	if (!str || !*str)
545 		return 0;
546 
547 	/* Save settings */
548 	memset(&map, 0, sizeof(map));
549 	if (ints[0] > 0)
550 		map.irq = ints[1];
551 	if (ints[0] > 1)
552 		map.base_addr = ints[2];
553 	if (ints[0] > 2)
554 		map.mem_start = ints[3];
555 	if (ints[0] > 3)
556 		map.mem_end = ints[4];
557 
558 	/* Add new entry to the list */
559 	return netdev_boot_setup_add(str, &map);
560 }
561 
562 __setup("netdev=", netdev_boot_setup);
563 
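/*
 * Illustrative example of the boot parameter handled above.  The documented
 * format is "netdev=<irq>,<io>,<mem_start>,<mem_end>,<name>", so a command
 * line containing
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * records irq 5 and I/O base 0x340 for "eth0"; netdev_boot_setup_check()
 * later copies them into the net_device when the driver probes it.
 */
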
564 /*******************************************************************************
565 
566 			    Device Interface Subroutines
567 
568 *******************************************************************************/
569 
570 /**
571  *	__dev_get_by_name	- find a device by its name
572  *	@net: the applicable net namespace
573  *	@name: name to find
574  *
575  *	Find an interface by name. Must be called under RTNL semaphore
576  *	or @dev_base_lock. If the name is found a pointer to the device
577  *	is returned. If the name is not found then %NULL is returned. The
578  *	reference counters are not incremented so the caller must be
579  *	careful with locks.
580  */
581 
582 struct net_device *__dev_get_by_name(struct net *net, const char *name)
583 {
584 	struct hlist_node *p;
585 
586 	hlist_for_each(p, dev_name_hash(net, name)) {
587 		struct net_device *dev
588 			= hlist_entry(p, struct net_device, name_hlist);
589 		if (!strncmp(dev->name, name, IFNAMSIZ))
590 			return dev;
591 	}
592 	return NULL;
593 }
594 
595 /**
596  *	dev_get_by_name		- find a device by its name
597  *	@net: the applicable net namespace
598  *	@name: name to find
599  *
600  *	Find an interface by name. This can be called from any
601  *	context and does its own locking. The returned handle has
602  *	the usage count incremented and the caller must use dev_put() to
603  *	release it when it is no longer needed. %NULL is returned if no
604  *	matching device is found.
605  */
606 
607 struct net_device *dev_get_by_name(struct net *net, const char *name)
608 {
609 	struct net_device *dev;
610 
611 	read_lock(&dev_base_lock);
612 	dev = __dev_get_by_name(net, name);
613 	if (dev)
614 		dev_hold(dev);
615 	read_unlock(&dev_base_lock);
616 	return dev;
617 }
618 
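/*
 * Illustrative sketch: looking up a device by name from process context and
 * dropping the reference when done.  The name "eth0" and the helper
 * example_get_mtu() are hypothetical.
 */
static int example_get_mtu(struct net *net)
{
	struct net_device *dev;
	int mtu;

	dev = dev_get_by_name(net, "eth0");	/* takes a reference */
	if (!dev)
		return -ENODEV;
	mtu = dev->mtu;
	dev_put(dev);				/* release the reference */
	return mtu;
}
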
619 /**
620  *	__dev_get_by_index - find a device by its ifindex
621  *	@net: the applicable net namespace
622  *	@ifindex: index of device
623  *
624  *	Search for an interface by index. Returns a pointer to the device,
625  *	or %NULL if it is not found. The device has not
626  *	had its reference counter increased so the caller must be careful
627  *	about locking. The caller must hold either the RTNL semaphore
628  *	or @dev_base_lock.
629  */
630 
631 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
632 {
633 	struct hlist_node *p;
634 
635 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
636 		struct net_device *dev
637 			= hlist_entry(p, struct net_device, index_hlist);
638 		if (dev->ifindex == ifindex)
639 			return dev;
640 	}
641 	return NULL;
642 }
643 
644 
645 /**
646  *	dev_get_by_index - find a device by its ifindex
647  *	@net: the applicable net namespace
648  *	@ifindex: index of device
649  *
650  *	Search for an interface by index. Returns a pointer to the device,
651  *	or NULL if it is not found. The device returned has
652  *	had a reference added and the pointer is safe until the user calls
653  *	dev_put to indicate they have finished with it.
654  */
655 
656 struct net_device *dev_get_by_index(struct net *net, int ifindex)
657 {
658 	struct net_device *dev;
659 
660 	read_lock(&dev_base_lock);
661 	dev = __dev_get_by_index(net, ifindex);
662 	if (dev)
663 		dev_hold(dev);
664 	read_unlock(&dev_base_lock);
665 	return dev;
666 }
667 
668 /**
669  *	dev_getbyhwaddr - find a device by its hardware address
670  *	@net: the applicable net namespace
671  *	@type: media type of device
672  *	@ha: hardware address
673  *
674  *	Search for an interface by MAC address. Returns a pointer to the
675  *	device, or NULL if it is not found. The caller must hold the
676  *	rtnl semaphore. The returned device has not had its ref count increased
677  *	and the caller must therefore be careful about locking
678  *
679  *	BUGS:
680  *	If the API was consistent this would be __dev_get_by_hwaddr
681  */
682 
683 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
684 {
685 	struct net_device *dev;
686 
687 	ASSERT_RTNL();
688 
689 	for_each_netdev(net, dev)
690 		if (dev->type == type &&
691 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
692 			return dev;
693 
694 	return NULL;
695 }
696 
697 EXPORT_SYMBOL(dev_getbyhwaddr);
698 
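/*
 * Illustrative sketch: resolving an ifindex from a MAC address with
 * dev_getbyhwaddr().  RTNL must be held around the call and the returned
 * pointer is only valid while it is held; example_ifindex_by_mac() is a
 * hypothetical helper.
 */
static int example_ifindex_by_mac(struct net *net, const char *mac)
{
	struct net_device *dev;
	int ifindex = -ENODEV;

	rtnl_lock();
	dev = dev_getbyhwaddr(net, ARPHRD_ETHER, (char *)mac);
	if (dev)
		ifindex = dev->ifindex;	/* use dev only while RTNL is held */
	rtnl_unlock();
	return ifindex;
}
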
699 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
700 {
701 	struct net_device *dev;
702 
703 	ASSERT_RTNL();
704 	for_each_netdev(net, dev)
705 		if (dev->type == type)
706 			return dev;
707 
708 	return NULL;
709 }
710 
711 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
712 
713 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
714 {
715 	struct net_device *dev;
716 
717 	rtnl_lock();
718 	dev = __dev_getfirstbyhwtype(net, type);
719 	if (dev)
720 		dev_hold(dev);
721 	rtnl_unlock();
722 	return dev;
723 }
724 
725 EXPORT_SYMBOL(dev_getfirstbyhwtype);
726 
727 /**
728  *	dev_get_by_flags - find any device with given flags
729  *	@net: the applicable net namespace
730  *	@if_flags: IFF_* values
731  *	@mask: bitmask of bits in if_flags to check
732  *
733  *	Search for any interface with the given flags. Returns a pointer to
734  *	the device, or NULL if none is found. The device returned has
735  *	had a reference added and the pointer is safe until the user calls
736  *	dev_put to indicate they have finished with it.
737  */
738 
739 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
740 {
741 	struct net_device *dev, *ret;
742 
743 	ret = NULL;
744 	read_lock(&dev_base_lock);
745 	for_each_netdev(net, dev) {
746 		if (((dev->flags ^ if_flags) & mask) == 0) {
747 			dev_hold(dev);
748 			ret = dev;
749 			break;
750 		}
751 	}
752 	read_unlock(&dev_base_lock);
753 	return ret;
754 }
755 
756 /**
757  *	dev_valid_name - check if name is okay for network device
758  *	@name: name string
759  *
760  *	Network device names need to be valid file names
761  *	to allow sysfs to work.  We also disallow any kind of
762  *	whitespace.
763  */
764 int dev_valid_name(const char *name)
765 {
766 	if (*name == '\0')
767 		return 0;
768 	if (strlen(name) >= IFNAMSIZ)
769 		return 0;
770 	if (!strcmp(name, ".") || !strcmp(name, ".."))
771 		return 0;
772 
773 	while (*name) {
774 		if (*name == '/' || isspace(*name))
775 			return 0;
776 		name++;
777 	}
778 	return 1;
779 }
780 
781 /**
782  *	__dev_alloc_name - allocate a name for a device
783  *	@net: network namespace to allocate the device name in
784  *	@name: name format string
785  *	@buf:  scratch buffer and result name string
786  *
787  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
788  *	id. It scans the list of devices to build up a free map, then chooses
789  *	the first empty slot. The caller must hold the dev_base or rtnl lock
790  *	while allocating the name and adding the device in order to avoid
791  *	duplicates.
792  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
793  *	Returns the number of the unit assigned or a negative errno code.
794  */
795 
796 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
797 {
798 	int i = 0;
799 	const char *p;
800 	const int max_netdevices = 8*PAGE_SIZE;
801 	unsigned long *inuse;
802 	struct net_device *d;
803 
804 	p = strnchr(name, IFNAMSIZ-1, '%');
805 	if (p) {
806 		/*
807 		 * Verify the string as this thing may have come from
808 		 * the user.  There must be exactly one "%d" and no other "%"
809 		 * characters.
810 		 */
811 		if (p[1] != 'd' || strchr(p + 2, '%'))
812 			return -EINVAL;
813 
814 		/* Use one page as a bit array of possible slots */
815 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
816 		if (!inuse)
817 			return -ENOMEM;
818 
819 		for_each_netdev(net, d) {
820 			if (!sscanf(d->name, name, &i))
821 				continue;
822 			if (i < 0 || i >= max_netdevices)
823 				continue;
824 
825 			/*  avoid cases where sscanf is not exact inverse of printf */
826 			snprintf(buf, IFNAMSIZ, name, i);
827 			if (!strncmp(buf, d->name, IFNAMSIZ))
828 				set_bit(i, inuse);
829 		}
830 
831 		i = find_first_zero_bit(inuse, max_netdevices);
832 		free_page((unsigned long) inuse);
833 	}
834 
835 	snprintf(buf, IFNAMSIZ, name, i);
836 	if (!__dev_get_by_name(net, buf))
837 		return i;
838 
839 	/* It is possible to run out of possible slots
840 	 * when the name is long and there isn't enough space left
841 	 * for the digits, or if all bits are used.
842 	 */
843 	return -ENFILE;
844 }
845 
846 /**
847  *	dev_alloc_name - allocate a name for a device
848  *	@dev: device
849  *	@name: name format string
850  *
851  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
852  *	id. It scans the list of devices to build up a free map, then chooses
853  *	the first empty slot. The caller must hold the dev_base or rtnl lock
854  *	while allocating the name and adding the device in order to avoid
855  *	duplicates.
856  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
857  *	Returns the number of the unit assigned or a negative errno code.
858  */
859 
860 int dev_alloc_name(struct net_device *dev, const char *name)
861 {
862 	char buf[IFNAMSIZ];
863 	struct net *net;
864 	int ret;
865 
866 	BUG_ON(!dev_net(dev));
867 	net = dev_net(dev);
868 	ret = __dev_alloc_name(net, name, buf);
869 	if (ret >= 0)
870 		strlcpy(dev->name, buf, IFNAMSIZ);
871 	return ret;
872 }
873 
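/*
 * Illustrative sketch: asking for the next free "foo%d" name before
 * registration.  register_netdev() normally does this implicitly when
 * dev->name contains '%'; the explicit call and the "foo%d" template are
 * shown here only as an example.
 */
static int example_name_device(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();
	unit = dev_alloc_name(dev, "foo%d");	/* fills dev->name on success */
	if (unit < 0)
		return unit;			/* -EINVAL, -ENOMEM or -ENFILE */
	printk(KERN_DEBUG "%s: assigned unit %d\n", dev->name, unit);
	return 0;
}
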
874 
875 /**
876  *	dev_change_name - change name of a device
877  *	@dev: device
878  *	@newname: name (or format string) must be at least IFNAMSIZ
879  *
880  *	Change the name of a device. A format string such as "eth%d"
881  *	can be passed for wildcarding.
882  */
883 int dev_change_name(struct net_device *dev, const char *newname)
884 {
885 	char oldname[IFNAMSIZ];
886 	int err = 0;
887 	int ret;
888 	struct net *net;
889 
890 	ASSERT_RTNL();
891 	BUG_ON(!dev_net(dev));
892 
893 	net = dev_net(dev);
894 	if (dev->flags & IFF_UP)
895 		return -EBUSY;
896 
897 	if (!dev_valid_name(newname))
898 		return -EINVAL;
899 
900 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
901 		return 0;
902 
903 	memcpy(oldname, dev->name, IFNAMSIZ);
904 
905 	if (strchr(newname, '%')) {
906 		err = dev_alloc_name(dev, newname);
907 		if (err < 0)
908 			return err;
909 	}
910 	else if (__dev_get_by_name(net, newname))
911 		return -EEXIST;
912 	else
913 		strlcpy(dev->name, newname, IFNAMSIZ);
914 
915 rollback:
916 	/* For now only devices in the initial network namespace
917 	 * are in sysfs.
918 	 */
919 	if (net == &init_net) {
920 		ret = device_rename(&dev->dev, dev->name);
921 		if (ret) {
922 			memcpy(dev->name, oldname, IFNAMSIZ);
923 			return ret;
924 		}
925 	}
926 
927 	write_lock_bh(&dev_base_lock);
928 	hlist_del(&dev->name_hlist);
929 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
930 	write_unlock_bh(&dev_base_lock);
931 
932 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
933 	ret = notifier_to_errno(ret);
934 
935 	if (ret) {
936 		if (err) {
937 			printk(KERN_ERR
938 			       "%s: name change rollback failed: %d.\n",
939 			       dev->name, ret);
940 		} else {
941 			err = ret;
942 			memcpy(dev->name, oldname, IFNAMSIZ);
943 			goto rollback;
944 		}
945 	}
946 
947 	return err;
948 }
949 
950 /**
951  *	dev_set_alias - change ifalias of a device
952  *	@dev: device
953  *	@alias: name up to IFALIASZ
954  *	@len: limit of bytes to copy from info
955  *
956  *	Set ifalias for a device.
957  */
958 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
959 {
960 	ASSERT_RTNL();
961 
962 	if (len >= IFALIASZ)
963 		return -EINVAL;
964 
965 	if (!len) {
966 		if (dev->ifalias) {
967 			kfree(dev->ifalias);
968 			dev->ifalias = NULL;
969 		}
970 		return 0;
971 	}
972 
973 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
974 	if (!dev->ifalias)
975 		return -ENOMEM;
976 
977 	strlcpy(dev->ifalias, alias, len+1);
978 	return len;
979 }
980 
981 
982 /**
983  *	netdev_features_change - device changes features
984  *	@dev: device to cause notification
985  *
986  *	Called to indicate a device has changed features.
987  */
988 void netdev_features_change(struct net_device *dev)
989 {
990 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
991 }
992 EXPORT_SYMBOL(netdev_features_change);
993 
994 /**
995  *	netdev_state_change - device changes state
996  *	@dev: device to cause notification
997  *
998  *	Called to indicate a device has changed state. This function calls
999  *	the notifier chains for netdev_chain and sends a NEWLINK message
1000  *	to the routing socket.
1001  */
1002 void netdev_state_change(struct net_device *dev)
1003 {
1004 	if (dev->flags & IFF_UP) {
1005 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1006 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1007 	}
1008 }
1009 
1010 void netdev_bonding_change(struct net_device *dev)
1011 {
1012 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1013 }
1014 EXPORT_SYMBOL(netdev_bonding_change);
1015 
1016 /**
1017  *	dev_load 	- load a network module
1018  *	@net: the applicable net namespace
1019  *	@name: name of interface
1020  *
1021  *	If a network interface is not present and the process has suitable
1022  *	privileges this function loads the module. If module loading is not
1023  *	available in this kernel then it becomes a nop.
1024  */
1025 
1026 void dev_load(struct net *net, const char *name)
1027 {
1028 	struct net_device *dev;
1029 
1030 	read_lock(&dev_base_lock);
1031 	dev = __dev_get_by_name(net, name);
1032 	read_unlock(&dev_base_lock);
1033 
1034 	if (!dev && capable(CAP_SYS_MODULE))
1035 		request_module("%s", name);
1036 }
1037 
1038 /**
1039  *	dev_open	- prepare an interface for use.
1040  *	@dev:	device to open
1041  *
1042  *	Takes a device from down to up state. The device's private open
1043  *	function is invoked and then the multicast lists are loaded. Finally
1044  *	the device is moved into the up state and a %NETDEV_UP message is
1045  *	sent to the netdev notifier chain.
1046  *
1047  *	Calling this function on an active interface is a nop. On a failure
1048  *	a negative errno code is returned.
1049  */
1050 int dev_open(struct net_device *dev)
1051 {
1052 	const struct net_device_ops *ops = dev->netdev_ops;
1053 	int ret;
1054 
1055 	ASSERT_RTNL();
1056 
1057 	/*
1058 	 *	Is it already up?
1059 	 */
1060 
1061 	if (dev->flags & IFF_UP)
1062 		return 0;
1063 
1064 	/*
1065 	 *	Is it even present?
1066 	 */
1067 	if (!netif_device_present(dev))
1068 		return -ENODEV;
1069 
1070 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1071 	ret = notifier_to_errno(ret);
1072 	if (ret)
1073 		return ret;
1074 
1075 	/*
1076 	 *	Call device private open method
1077 	 */
1078 	set_bit(__LINK_STATE_START, &dev->state);
1079 
1080 	if (ops->ndo_validate_addr)
1081 		ret = ops->ndo_validate_addr(dev);
1082 
1083 	if (!ret && ops->ndo_open)
1084 		ret = ops->ndo_open(dev);
1085 
1086 	/*
1087 	 *	If it went open OK then:
1088 	 */
1089 
1090 	if (ret)
1091 		clear_bit(__LINK_STATE_START, &dev->state);
1092 	else {
1093 		/*
1094 		 *	Set the flags.
1095 		 */
1096 		dev->flags |= IFF_UP;
1097 
1098 		/*
1099 		 *	Enable NET_DMA
1100 		 */
1101 		net_dmaengine_get();
1102 
1103 		/*
1104 		 *	Initialize multicasting status
1105 		 */
1106 		dev_set_rx_mode(dev);
1107 
1108 		/*
1109 		 *	Wakeup transmit queue engine
1110 		 */
1111 		dev_activate(dev);
1112 
1113 		/*
1114 		 *	... and announce new interface.
1115 		 */
1116 		call_netdevice_notifiers(NETDEV_UP, dev);
1117 	}
1118 
1119 	return ret;
1120 }
1121 
1122 /**
1123  *	dev_close - shutdown an interface.
1124  *	@dev: device to shutdown
1125  *
1126  *	This function moves an active device into down state. A
1127  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1128  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1129  *	chain.
1130  */
1131 int dev_close(struct net_device *dev)
1132 {
1133 	const struct net_device_ops *ops = dev->netdev_ops;
1134 	ASSERT_RTNL();
1135 
1136 	might_sleep();
1137 
1138 	if (!(dev->flags & IFF_UP))
1139 		return 0;
1140 
1141 	/*
1142 	 *	Tell people we are going down, so that they can
1143 	 *	prepare for it while the device is still operating.
1144 	 */
1145 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1146 
1147 	clear_bit(__LINK_STATE_START, &dev->state);
1148 
1149 	/* Synchronize to the scheduled poll. We cannot touch the poll list;
1150 	 * it may even be in use on a different CPU. So just clear netif_running().
1151 	 *
1152 	 * dev->stop() will invoke napi_disable() on all of its
1153 	 * napi_struct instances on this device.
1154 	 */
1155 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1156 
1157 	dev_deactivate(dev);
1158 
1159 	/*
1160 	 *	Call the device specific close. This cannot fail and is
1161 	 *	only done if the device is UP.
1162 	 *
1163 	 *	We allow it to be called even after a DETACH hot-plug
1164 	 *	event.
1165 	 */
1166 	if (ops->ndo_stop)
1167 		ops->ndo_stop(dev);
1168 
1169 	/*
1170 	 *	Device is now down.
1171 	 */
1172 
1173 	dev->flags &= ~IFF_UP;
1174 
1175 	/*
1176 	 * Tell people we are down
1177 	 */
1178 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1179 
1180 	/*
1181 	 *	Shutdown NET_DMA
1182 	 */
1183 	net_dmaengine_put();
1184 
1185 	return 0;
1186 }
1187 
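/*
 * Illustrative sketch: administratively cycling an interface from kernel
 * code.  Both dev_open() and dev_close() require RTNL; the name "eth0" and
 * example_cycle_interface() are hypothetical.
 */
static int example_cycle_interface(struct net *net)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, "eth0");	/* no refcount needed under RTNL */
	if (dev) {
		err = dev_open(dev);		/* nop if already IFF_UP */
		if (!err)
			dev_close(dev);
	}
	rtnl_unlock();
	return err;
}
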
1188 
1189 /**
1190  *	dev_disable_lro - disable Large Receive Offload on a device
1191  *	@dev: device
1192  *
1193  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1194  *	called under RTNL.  This is needed if received packets may be
1195  *	forwarded to another interface.
1196  */
1197 void dev_disable_lro(struct net_device *dev)
1198 {
1199 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1200 	    dev->ethtool_ops->set_flags) {
1201 		u32 flags = dev->ethtool_ops->get_flags(dev);
1202 		if (flags & ETH_FLAG_LRO) {
1203 			flags &= ~ETH_FLAG_LRO;
1204 			dev->ethtool_ops->set_flags(dev, flags);
1205 		}
1206 	}
1207 	WARN_ON(dev->features & NETIF_F_LRO);
1208 }
1209 EXPORT_SYMBOL(dev_disable_lro);
1210 
1211 
1212 static int dev_boot_phase = 1;
1213 
1214 /*
1215  *	Device change register/unregister. These are not inline or static
1216  *	as we export them to the world.
1217  */
1218 
1219 /**
1220  *	register_netdevice_notifier - register a network notifier block
1221  *	@nb: notifier
1222  *
1223  *	Register a notifier to be called when network device events occur.
1224  *	The notifier passed is linked into the kernel structures and must
1225  *	not be reused until it has been unregistered. A negative errno code
1226  *	is returned on a failure.
1227  *
1228  * 	When registered, all registration and up events are replayed
1229  *	to the new notifier, giving it a race-free
1230  *	view of the network device list.
1231  */
1232 
1233 int register_netdevice_notifier(struct notifier_block *nb)
1234 {
1235 	struct net_device *dev;
1236 	struct net_device *last;
1237 	struct net *net;
1238 	int err;
1239 
1240 	rtnl_lock();
1241 	err = raw_notifier_chain_register(&netdev_chain, nb);
1242 	if (err)
1243 		goto unlock;
1244 	if (dev_boot_phase)
1245 		goto unlock;
1246 	for_each_net(net) {
1247 		for_each_netdev(net, dev) {
1248 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1249 			err = notifier_to_errno(err);
1250 			if (err)
1251 				goto rollback;
1252 
1253 			if (!(dev->flags & IFF_UP))
1254 				continue;
1255 
1256 			nb->notifier_call(nb, NETDEV_UP, dev);
1257 		}
1258 	}
1259 
1260 unlock:
1261 	rtnl_unlock();
1262 	return err;
1263 
1264 rollback:
1265 	last = dev;
1266 	for_each_net(net) {
1267 		for_each_netdev(net, dev) {
1268 			if (dev == last)
1269 				break;
1270 
1271 			if (dev->flags & IFF_UP) {
1272 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1273 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1274 			}
1275 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1276 		}
1277 	}
1278 
1279 	raw_notifier_chain_unregister(&netdev_chain, nb);
1280 	goto unlock;
1281 }
1282 
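/*
 * Illustrative sketch: a notifier that logs interfaces going up or down.
 * In this kernel the callback's third argument is the net_device itself;
 * example_netdev_event and example_netdev_nb are hypothetical names.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		printk(KERN_INFO "%s is up\n", dev->name);
		break;
	case NETDEV_DOWN:
		printk(KERN_INFO "%s is down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/*
 * Registering with register_netdevice_notifier(&example_netdev_nb) replays
 * NETDEV_REGISTER and NETDEV_UP for already existing devices, as described
 * above; unregister_netdevice_notifier() removes it again.
 */
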
1283 /**
1284  *	unregister_netdevice_notifier - unregister a network notifier block
1285  *	@nb: notifier
1286  *
1287  *	Unregister a notifier previously registered by
1288  *	register_netdevice_notifier(). The notifier is unlinked from the
1289  *	kernel structures and may then be reused. A negative errno code
1290  *	is returned on a failure.
1291  */
1292 
1293 int unregister_netdevice_notifier(struct notifier_block *nb)
1294 {
1295 	int err;
1296 
1297 	rtnl_lock();
1298 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1299 	rtnl_unlock();
1300 	return err;
1301 }
1302 
1303 /**
1304  *	call_netdevice_notifiers - call all network notifier blocks
1305  *      @val: value passed unmodified to notifier function
1306  *      @dev: net_device pointer passed unmodified to notifier function
1307  *
1308  *	Call all network notifier blocks.  Parameters and return value
1309  *	are as for raw_notifier_call_chain().
1310  */
1311 
1312 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1313 {
1314 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1315 }
1316 
1317 /* When > 0 there are consumers of rx skb time stamps */
1318 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1319 
1320 void net_enable_timestamp(void)
1321 {
1322 	atomic_inc(&netstamp_needed);
1323 }
1324 
1325 void net_disable_timestamp(void)
1326 {
1327 	atomic_dec(&netstamp_needed);
1328 }
1329 
1330 static inline void net_timestamp(struct sk_buff *skb)
1331 {
1332 	if (atomic_read(&netstamp_needed))
1333 		__net_timestamp(skb);
1334 	else
1335 		skb->tstamp.tv64 = 0;
1336 }
1337 
1338 /*
1339  *	Support routine. Sends outgoing frames to any network
1340  *	taps currently in use.
1341  */
1342 
1343 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1344 {
1345 	struct packet_type *ptype;
1346 
1347 #ifdef CONFIG_NET_CLS_ACT
1348 	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1349 		net_timestamp(skb);
1350 #else
1351 	net_timestamp(skb);
1352 #endif
1353 
1354 	rcu_read_lock();
1355 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1356 		/* Never send packets back to the socket
1357 		 * they originated from - MvS ([email protected])
1358 		 */
1359 		if ((ptype->dev == dev || !ptype->dev) &&
1360 		    (ptype->af_packet_priv == NULL ||
1361 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1362 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1363 			if (!skb2)
1364 				break;
1365 
1366 			/* skb->nh should be correctly
1367 			   set by the sender, so that the check below is
1368 			   just protection against buggy protocols.
1369 			 */
1370 			skb_reset_mac_header(skb2);
1371 
1372 			if (skb_network_header(skb2) < skb2->data ||
1373 			    skb2->network_header > skb2->tail) {
1374 				if (net_ratelimit())
1375 					printk(KERN_CRIT "protocol %04x is "
1376 					       "buggy, dev %s\n",
1377 					       skb2->protocol, dev->name);
1378 				skb_reset_network_header(skb2);
1379 			}
1380 
1381 			skb2->transport_header = skb2->network_header;
1382 			skb2->pkt_type = PACKET_OUTGOING;
1383 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1384 		}
1385 	}
1386 	rcu_read_unlock();
1387 }
1388 
1389 
1390 static inline void __netif_reschedule(struct Qdisc *q)
1391 {
1392 	struct softnet_data *sd;
1393 	unsigned long flags;
1394 
1395 	local_irq_save(flags);
1396 	sd = &__get_cpu_var(softnet_data);
1397 	q->next_sched = sd->output_queue;
1398 	sd->output_queue = q;
1399 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1400 	local_irq_restore(flags);
1401 }
1402 
1403 void __netif_schedule(struct Qdisc *q)
1404 {
1405 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1406 		__netif_reschedule(q);
1407 }
1408 EXPORT_SYMBOL(__netif_schedule);
1409 
1410 void dev_kfree_skb_irq(struct sk_buff *skb)
1411 {
1412 	if (atomic_dec_and_test(&skb->users)) {
1413 		struct softnet_data *sd;
1414 		unsigned long flags;
1415 
1416 		local_irq_save(flags);
1417 		sd = &__get_cpu_var(softnet_data);
1418 		skb->next = sd->completion_queue;
1419 		sd->completion_queue = skb;
1420 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1421 		local_irq_restore(flags);
1422 	}
1423 }
1424 EXPORT_SYMBOL(dev_kfree_skb_irq);
1425 
1426 void dev_kfree_skb_any(struct sk_buff *skb)
1427 {
1428 	if (in_irq() || irqs_disabled())
1429 		dev_kfree_skb_irq(skb);
1430 	else
1431 		dev_kfree_skb(skb);
1432 }
1433 EXPORT_SYMBOL(dev_kfree_skb_any);
1434 
1435 
1436 /**
1437  * netif_device_detach - mark device as removed
1438  * @dev: network device
1439  *
1440  * Mark device as removed from system and therefore no longer available.
1441  */
1442 void netif_device_detach(struct net_device *dev)
1443 {
1444 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1445 	    netif_running(dev)) {
1446 		netif_tx_stop_all_queues(dev);
1447 	}
1448 }
1449 EXPORT_SYMBOL(netif_device_detach);
1450 
1451 /**
1452  * netif_device_attach - mark device as attached
1453  * @dev: network device
1454  *
1455  * Mark device as attached from system and restart if needed.
1456  */
1457 void netif_device_attach(struct net_device *dev)
1458 {
1459 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1460 	    netif_running(dev)) {
1461 		netif_tx_wake_all_queues(dev);
1462 		__netdev_watchdog_up(dev);
1463 	}
1464 }
1465 EXPORT_SYMBOL(netif_device_attach);
1466 
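/*
 * Illustrative sketch: how a PCI network driver typically pairs
 * netif_device_detach()/netif_device_attach() around suspend and resume.
 * example_suspend/example_resume and the omitted hardware steps are
 * hypothetical driver-side code, not part of the core.
 */
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	netif_device_detach(dev);	/* stops the TX queues if running */
	/* ... stop DMA, save state, power the hardware down ... */
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	/* ... power the hardware up and restore state ... */
	netif_device_attach(dev);	/* wakes the TX queues and watchdog */
	return 0;
}
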
1467 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1468 {
1469 	return ((features & NETIF_F_GEN_CSUM) ||
1470 		((features & NETIF_F_IP_CSUM) &&
1471 		 protocol == htons(ETH_P_IP)) ||
1472 		((features & NETIF_F_IPV6_CSUM) &&
1473 		 protocol == htons(ETH_P_IPV6)) ||
1474 		((features & NETIF_F_FCOE_CRC) &&
1475 		 protocol == htons(ETH_P_FCOE)));
1476 }
1477 
1478 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1479 {
1480 	if (can_checksum_protocol(dev->features, skb->protocol))
1481 		return true;
1482 
1483 	if (skb->protocol == htons(ETH_P_8021Q)) {
1484 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1485 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1486 					  veh->h_vlan_encapsulated_proto))
1487 			return true;
1488 	}
1489 
1490 	return false;
1491 }
1492 
1493 /*
1494  * Invalidate hardware checksum when packet is to be mangled, and
1495  * complete checksum manually on outgoing path.
1496  */
1497 int skb_checksum_help(struct sk_buff *skb)
1498 {
1499 	__wsum csum;
1500 	int ret = 0, offset;
1501 
1502 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1503 		goto out_set_summed;
1504 
1505 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1506 		/* Let GSO fix up the checksum. */
1507 		goto out_set_summed;
1508 	}
1509 
1510 	offset = skb->csum_start - skb_headroom(skb);
1511 	BUG_ON(offset >= skb_headlen(skb));
1512 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1513 
1514 	offset += skb->csum_offset;
1515 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1516 
1517 	if (skb_cloned(skb) &&
1518 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1519 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1520 		if (ret)
1521 			goto out;
1522 	}
1523 
1524 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1525 out_set_summed:
1526 	skb->ip_summed = CHECKSUM_NONE;
1527 out:
1528 	return ret;
1529 }
1530 
1531 /**
1532  *	skb_gso_segment - Perform segmentation on skb.
1533  *	@skb: buffer to segment
1534  *	@features: features for the output path (see dev->features)
1535  *
1536  *	This function segments the given skb and returns a list of segments.
1537  *
1538  *	It may return NULL if the skb requires no segmentation.  This is
1539  *	only possible when GSO is used for verifying header integrity.
1540  */
1541 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1542 {
1543 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1544 	struct packet_type *ptype;
1545 	__be16 type = skb->protocol;
1546 	int err;
1547 
1548 	skb_reset_mac_header(skb);
1549 	skb->mac_len = skb->network_header - skb->mac_header;
1550 	__skb_pull(skb, skb->mac_len);
1551 
1552 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1553 		struct net_device *dev = skb->dev;
1554 		struct ethtool_drvinfo info = {};
1555 
1556 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1557 			dev->ethtool_ops->get_drvinfo(dev, &info);
1558 
1559 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1560 			"ip_summed=%d",
1561 		     info.driver, dev ? dev->features : 0L,
1562 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1563 		     skb->len, skb->data_len, skb->ip_summed);
1564 
1565 		if (skb_header_cloned(skb) &&
1566 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1567 			return ERR_PTR(err);
1568 	}
1569 
1570 	rcu_read_lock();
1571 	list_for_each_entry_rcu(ptype,
1572 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1573 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1574 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1575 				err = ptype->gso_send_check(skb);
1576 				segs = ERR_PTR(err);
1577 				if (err || skb_gso_ok(skb, features))
1578 					break;
1579 				__skb_push(skb, (skb->data -
1580 						 skb_network_header(skb)));
1581 			}
1582 			segs = ptype->gso_segment(skb, features);
1583 			break;
1584 		}
1585 	}
1586 	rcu_read_unlock();
1587 
1588 	__skb_push(skb, skb->data - skb_mac_header(skb));
1589 
1590 	return segs;
1591 }
1592 
1593 EXPORT_SYMBOL(skb_gso_segment);
1594 
1595 /* Take action when hardware reception checksum errors are detected. */
1596 #ifdef CONFIG_BUG
1597 void netdev_rx_csum_fault(struct net_device *dev)
1598 {
1599 	if (net_ratelimit()) {
1600 		printk(KERN_ERR "%s: hw csum failure.\n",
1601 			dev ? dev->name : "<unknown>");
1602 		dump_stack();
1603 	}
1604 }
1605 EXPORT_SYMBOL(netdev_rx_csum_fault);
1606 #endif
1607 
1608 /* Actually, we should eliminate this check as soon as we know that:
1609  * 1. An IOMMU is present and can map all the memory.
1610  * 2. No high memory really exists on this machine.
1611  */
1612 
1613 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1614 {
1615 #ifdef CONFIG_HIGHMEM
1616 	int i;
1617 
1618 	if (dev->features & NETIF_F_HIGHDMA)
1619 		return 0;
1620 
1621 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1622 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1623 			return 1;
1624 
1625 #endif
1626 	return 0;
1627 }
1628 
1629 struct dev_gso_cb {
1630 	void (*destructor)(struct sk_buff *skb);
1631 };
1632 
1633 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1634 
1635 static void dev_gso_skb_destructor(struct sk_buff *skb)
1636 {
1637 	struct dev_gso_cb *cb;
1638 
1639 	do {
1640 		struct sk_buff *nskb = skb->next;
1641 
1642 		skb->next = nskb->next;
1643 		nskb->next = NULL;
1644 		kfree_skb(nskb);
1645 	} while (skb->next);
1646 
1647 	cb = DEV_GSO_CB(skb);
1648 	if (cb->destructor)
1649 		cb->destructor(skb);
1650 }
1651 
1652 /**
1653  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1654  *	@skb: buffer to segment
1655  *
1656  *	This function segments the given skb and stores the list of segments
1657  *	in skb->next.
1658  */
1659 static int dev_gso_segment(struct sk_buff *skb)
1660 {
1661 	struct net_device *dev = skb->dev;
1662 	struct sk_buff *segs;
1663 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1664 					 NETIF_F_SG : 0);
1665 
1666 	segs = skb_gso_segment(skb, features);
1667 
1668 	/* Verifying header integrity only. */
1669 	if (!segs)
1670 		return 0;
1671 
1672 	if (IS_ERR(segs))
1673 		return PTR_ERR(segs);
1674 
1675 	skb->next = segs;
1676 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1677 	skb->destructor = dev_gso_skb_destructor;
1678 
1679 	return 0;
1680 }
1681 
1682 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1683 			struct netdev_queue *txq)
1684 {
1685 	const struct net_device_ops *ops = dev->netdev_ops;
1686 	int rc;
1687 
1688 	if (likely(!skb->next)) {
1689 		if (!list_empty(&ptype_all))
1690 			dev_queue_xmit_nit(skb, dev);
1691 
1692 		if (netif_needs_gso(dev, skb)) {
1693 			if (unlikely(dev_gso_segment(skb)))
1694 				goto out_kfree_skb;
1695 			if (skb->next)
1696 				goto gso;
1697 		}
1698 
1699 		/*
1700 		 * If the device doesn't need skb->dst, release it right now while
1701 		 * it's hot in this CPU's cache.
1702 		 */
1703 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1704 			skb_dst_drop(skb);
1705 
1706 		rc = ops->ndo_start_xmit(skb, dev);
1707 		if (rc == 0)
1708 			txq_trans_update(txq);
1709 		/*
1710 		 * TODO: if skb_orphan() was called by
1711 		 * dev->hard_start_xmit() (for example, the unmodified
1712 		 * igb driver does that; bnx2 doesn't), then
1713 		 * skb_tx_software_timestamp() will be unable to send
1714 		 * back the time stamp.
1715 		 *
1716 		 * How can this be prevented? Always create another
1717 		 * reference to the socket before calling
1718 		 * dev->hard_start_xmit()? Prevent that skb_orphan()
1719 		 * does anything in dev->hard_start_xmit() by clearing
1720 		 * the skb destructor before the call and restoring it
1721 		 * afterwards, then doing the skb_orphan() ourselves?
1722 		 */
1723 		return rc;
1724 	}
1725 
1726 gso:
1727 	do {
1728 		struct sk_buff *nskb = skb->next;
1729 
1730 		skb->next = nskb->next;
1731 		nskb->next = NULL;
1732 		rc = ops->ndo_start_xmit(nskb, dev);
1733 		if (unlikely(rc)) {
1734 			nskb->next = skb->next;
1735 			skb->next = nskb;
1736 			return rc;
1737 		}
1738 		txq_trans_update(txq);
1739 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1740 			return NETDEV_TX_BUSY;
1741 	} while (skb->next);
1742 
1743 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1744 
1745 out_kfree_skb:
1746 	kfree_skb(skb);
1747 	return 0;
1748 }
1749 
1750 static u32 skb_tx_hashrnd;
1751 
1752 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1753 {
1754 	u32 hash;
1755 
1756 	if (skb_rx_queue_recorded(skb)) {
1757 		hash = skb_get_rx_queue(skb);
1758 		while (unlikely (hash >= dev->real_num_tx_queues))
1759 			hash -= dev->real_num_tx_queues;
1760 		return hash;
1761 	}
1762 
1763 	if (skb->sk && skb->sk->sk_hash)
1764 		hash = skb->sk->sk_hash;
1765 	else
1766 		hash = skb->protocol;
1767 
1768 	hash = jhash_1word(hash, skb_tx_hashrnd);
1769 
1770 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1771 }
1772 EXPORT_SYMBOL(skb_tx_hash);
1773 
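/*
 * Worked example for the scaling step above: with real_num_tx_queues == 4
 * and a 32-bit hash value of 0xC0000000, the result is
 *	((u64)0xC0000000 * 4) >> 32 == 3,
 * i.e. the hash space [0, 2^32) is spread evenly over the queues without
 * a modulo operation.
 */
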
1774 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1775 					struct sk_buff *skb)
1776 {
1777 	const struct net_device_ops *ops = dev->netdev_ops;
1778 	u16 queue_index = 0;
1779 
1780 	if (ops->ndo_select_queue)
1781 		queue_index = ops->ndo_select_queue(dev, skb);
1782 	else if (dev->real_num_tx_queues > 1)
1783 		queue_index = skb_tx_hash(dev, skb);
1784 
1785 	skb_set_queue_mapping(skb, queue_index);
1786 	return netdev_get_tx_queue(dev, queue_index);
1787 }
1788 
1789 /**
1790  *	dev_queue_xmit - transmit a buffer
1791  *	@skb: buffer to transmit
1792  *
1793  *	Queue a buffer for transmission to a network device. The caller must
1794  *	have set the device and priority and built the buffer before calling
1795  *	this function. The function can be called from an interrupt.
1796  *
1797  *	A negative errno code is returned on a failure. A success does not
1798  *	guarantee the frame will be transmitted as it may be dropped due
1799  *	to congestion or traffic shaping.
1800  *
1801  * -----------------------------------------------------------------------------------
1802  *      I notice this method can also return errors from the queue disciplines,
1803  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1804  *      be positive.
1805  *
1806  *      Regardless of the return value, the skb is consumed, so it is currently
1807  *      difficult to retry a send to this method.  (You can bump the ref count
1808  *      before sending to hold a reference for retry if you are careful.)
1809  *
1810  *      When calling this method, interrupts MUST be enabled.  This is because
1811  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1812  *          --BLG
1813  */
1814 int dev_queue_xmit(struct sk_buff *skb)
1815 {
1816 	struct net_device *dev = skb->dev;
1817 	struct netdev_queue *txq;
1818 	struct Qdisc *q;
1819 	int rc = -ENOMEM;
1820 
1821 	/* GSO will handle the following emulations directly. */
1822 	if (netif_needs_gso(dev, skb))
1823 		goto gso;
1824 
1825 	if (skb_has_frags(skb) &&
1826 	    !(dev->features & NETIF_F_FRAGLIST) &&
1827 	    __skb_linearize(skb))
1828 		goto out_kfree_skb;
1829 
1830 	/* Fragmented skb is linearized if device does not support SG,
1831 	 * or if at least one of fragments is in highmem and device
1832 	 * does not support DMA from it.
1833 	 */
1834 	if (skb_shinfo(skb)->nr_frags &&
1835 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1836 	    __skb_linearize(skb))
1837 		goto out_kfree_skb;
1838 
1839 	/* If packet is not checksummed and device does not support
1840 	 * checksumming for this protocol, complete checksumming here.
1841 	 */
1842 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1843 		skb_set_transport_header(skb, skb->csum_start -
1844 					      skb_headroom(skb));
1845 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1846 			goto out_kfree_skb;
1847 	}
1848 
1849 gso:
1850 	/* Disable soft irqs for various locks below. Also
1851 	 * stops preemption for RCU.
1852 	 */
1853 	rcu_read_lock_bh();
1854 
1855 	txq = dev_pick_tx(dev, skb);
1856 	q = rcu_dereference(txq->qdisc);
1857 
1858 #ifdef CONFIG_NET_CLS_ACT
1859 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1860 #endif
1861 	if (q->enqueue) {
1862 		spinlock_t *root_lock = qdisc_lock(q);
1863 
1864 		spin_lock(root_lock);
1865 
1866 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1867 			kfree_skb(skb);
1868 			rc = NET_XMIT_DROP;
1869 		} else {
1870 			rc = qdisc_enqueue_root(skb, q);
1871 			qdisc_run(q);
1872 		}
1873 		spin_unlock(root_lock);
1874 
1875 		goto out;
1876 	}
1877 
1878 	/* The device has no queue. Common case for software devices:
1879 	   loopback, all sorts of tunnels...
1880 
1881 	   Really, it is unlikely that netif_tx_lock protection is necessary
1882 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1883 	   counters.)
1884 	   However, it is possible that they rely on the protection
1885 	   we provide here.
1886 
1887 	   Check this and take the lock: it is not prone to deadlocks.
1888 	   Or shoot the noqueue qdisc, which is even simpler 8)
1889 	 */
1890 	if (dev->flags & IFF_UP) {
1891 		int cpu = smp_processor_id(); /* ok because BHs are off */
1892 
1893 		if (txq->xmit_lock_owner != cpu) {
1894 
1895 			HARD_TX_LOCK(dev, txq, cpu);
1896 
1897 			if (!netif_tx_queue_stopped(txq)) {
1898 				rc = 0;
1899 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1900 					HARD_TX_UNLOCK(dev, txq);
1901 					goto out;
1902 				}
1903 			}
1904 			HARD_TX_UNLOCK(dev, txq);
1905 			if (net_ratelimit())
1906 				printk(KERN_CRIT "Virtual device %s asks to "
1907 				       "queue packet!\n", dev->name);
1908 		} else {
1909 			/* Recursion detected! It is possible,
1910 			 * unfortunately. */
1911 			if (net_ratelimit())
1912 				printk(KERN_CRIT "Dead loop on virtual device "
1913 				       "%s, fix it urgently!\n", dev->name);
1914 		}
1915 	}
1916 
1917 	rc = -ENETDOWN;
1918 	rcu_read_unlock_bh();
1919 
1920 out_kfree_skb:
1921 	kfree_skb(skb);
1922 	return rc;
1923 out:
1924 	rcu_read_unlock_bh();
1925 	return rc;
1926 }
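
/*
 * Example (illustrative sketch): a typical caller of dev_queue_xmit()
 * builds the buffer, sets skb->dev and the priority, and then hands it
 * off.  The device lookup, interface name and priority below are
 * assumptions made only for the example.
 *
 *	static int example_xmit(struct net *net, struct sk_buff *skb)
 *	{
 *		struct net_device *dev = dev_get_by_name(net, "eth0");
 *		int rc;
 *
 *		if (!dev) {
 *			kfree_skb(skb);
 *			return -ENODEV;
 *		}
 *		skb->dev = dev;
 *		skb->priority = TC_PRIO_CONTROL;
 *		rc = dev_queue_xmit(skb);	// consumes skb, even on error
 *		dev_put(dev);
 *		return rc;
 *	}
 */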
1927 
1928 
1929 /*=======================================================================
1930 			Receiver routines
1931   =======================================================================*/
1932 
1933 int netdev_max_backlog __read_mostly = 1000;
1934 int netdev_budget __read_mostly = 300;
1935 int weight_p __read_mostly = 64;            /* old backlog weight */
1936 
1937 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1938 
1939 
1940 /**
1941  *	netif_rx	-	post buffer to the network code
1942  *	@skb: buffer to post
1943  *
1944  *	This function receives a packet from a device driver and queues it for
1945  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1946  *	may be dropped during processing for congestion control or by the
1947  *	protocol layers.
1948  *
1949  *	return values:
1950  *	NET_RX_SUCCESS	(no congestion)
1951  *	NET_RX_DROP     (packet was dropped)
1952  *
1953  */
1954 
1955 int netif_rx(struct sk_buff *skb)
1956 {
1957 	struct softnet_data *queue;
1958 	unsigned long flags;
1959 
1960 	/* if netpoll wants it, pretend we never saw it */
1961 	if (netpoll_rx(skb))
1962 		return NET_RX_DROP;
1963 
1964 	if (!skb->tstamp.tv64)
1965 		net_timestamp(skb);
1966 
1967 	/*
1968 	 * The code is rearranged so that the path is shortest
1969 	 * when the CPU is congested but still operating.
1970 	 */
1971 	local_irq_save(flags);
1972 	queue = &__get_cpu_var(softnet_data);
1973 
1974 	__get_cpu_var(netdev_rx_stat).total++;
1975 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1976 		if (queue->input_pkt_queue.qlen) {
1977 enqueue:
1978 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1979 			local_irq_restore(flags);
1980 			return NET_RX_SUCCESS;
1981 		}
1982 
1983 		napi_schedule(&queue->backlog);
1984 		goto enqueue;
1985 	}
1986 
1987 	__get_cpu_var(netdev_rx_stat).dropped++;
1988 	local_irq_restore(flags);
1989 
1990 	kfree_skb(skb);
1991 	return NET_RX_DROP;
1992 }
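
/*
 * Example (illustrative sketch): netif_rx() is the entry point for drivers
 * that receive packets in interrupt context rather than through NAPI.  A
 * minimal, hypothetical receive path might look like this; the device,
 * DMA buffer and packet length are assumptions for the example.
 *
 *	skb = netdev_alloc_skb(dev, pkt_len + NET_IP_ALIGN);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);			// queues to the per-CPU backlog
 */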
1993 
1994 int netif_rx_ni(struct sk_buff *skb)
1995 {
1996 	int err;
1997 
1998 	preempt_disable();
1999 	err = netif_rx(skb);
2000 	if (local_softirq_pending())
2001 		do_softirq();
2002 	preempt_enable();
2003 
2004 	return err;
2005 }
2006 
2007 EXPORT_SYMBOL(netif_rx_ni);
2008 
2009 static void net_tx_action(struct softirq_action *h)
2010 {
2011 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2012 
2013 	if (sd->completion_queue) {
2014 		struct sk_buff *clist;
2015 
2016 		local_irq_disable();
2017 		clist = sd->completion_queue;
2018 		sd->completion_queue = NULL;
2019 		local_irq_enable();
2020 
2021 		while (clist) {
2022 			struct sk_buff *skb = clist;
2023 			clist = clist->next;
2024 
2025 			WARN_ON(atomic_read(&skb->users));
2026 			__kfree_skb(skb);
2027 		}
2028 	}
2029 
2030 	if (sd->output_queue) {
2031 		struct Qdisc *head;
2032 
2033 		local_irq_disable();
2034 		head = sd->output_queue;
2035 		sd->output_queue = NULL;
2036 		local_irq_enable();
2037 
2038 		while (head) {
2039 			struct Qdisc *q = head;
2040 			spinlock_t *root_lock;
2041 
2042 			head = head->next_sched;
2043 
2044 			root_lock = qdisc_lock(q);
2045 			if (spin_trylock(root_lock)) {
2046 				smp_mb__before_clear_bit();
2047 				clear_bit(__QDISC_STATE_SCHED,
2048 					  &q->state);
2049 				qdisc_run(q);
2050 				spin_unlock(root_lock);
2051 			} else {
2052 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2053 					      &q->state)) {
2054 					__netif_reschedule(q);
2055 				} else {
2056 					smp_mb__before_clear_bit();
2057 					clear_bit(__QDISC_STATE_SCHED,
2058 						  &q->state);
2059 				}
2060 			}
2061 		}
2062 	}
2063 }
2064 
2065 static inline int deliver_skb(struct sk_buff *skb,
2066 			      struct packet_type *pt_prev,
2067 			      struct net_device *orig_dev)
2068 {
2069 	atomic_inc(&skb->users);
2070 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2071 }
2072 
2073 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2074 
2075 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2076 /* This hook is defined here for ATM LANE */
2077 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2078 			     unsigned char *addr) __read_mostly;
2079 EXPORT_SYMBOL(br_fdb_test_addr_hook);
2080 #endif
2081 
2082 /*
2083  * If the bridge module is loaded, call the bridging hook.
2084  * Returns NULL if the packet was consumed.
2085  */
2086 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2087 					struct sk_buff *skb) __read_mostly;
2088 EXPORT_SYMBOL(br_handle_frame_hook);
2089 
2090 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2091 					    struct packet_type **pt_prev, int *ret,
2092 					    struct net_device *orig_dev)
2093 {
2094 	struct net_bridge_port *port;
2095 
2096 	if (skb->pkt_type == PACKET_LOOPBACK ||
2097 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2098 		return skb;
2099 
2100 	if (*pt_prev) {
2101 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2102 		*pt_prev = NULL;
2103 	}
2104 
2105 	return br_handle_frame_hook(port, skb);
2106 }
2107 #else
2108 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2109 #endif
2110 
2111 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2112 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2113 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2114 
2115 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2116 					     struct packet_type **pt_prev,
2117 					     int *ret,
2118 					     struct net_device *orig_dev)
2119 {
2120 	if (skb->dev->macvlan_port == NULL)
2121 		return skb;
2122 
2123 	if (*pt_prev) {
2124 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2125 		*pt_prev = NULL;
2126 	}
2127 	return macvlan_handle_frame_hook(skb);
2128 }
2129 #else
2130 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2131 #endif
2132 
2133 #ifdef CONFIG_NET_CLS_ACT
2134 /* TODO: Maybe we should just force sch_ingress to be compiled in
2135  * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for a compare and two
2136  * extra stores right now if we don't have it enabled but do have
2137  * CONFIG_NET_CLS_ACT.
2138  * NOTE: This doesn't remove any functionality; if you don't have
2139  * the ingress scheduler, you just can't add policies on ingress.
2140  *
2141  */
2142 static int ing_filter(struct sk_buff *skb)
2143 {
2144 	struct net_device *dev = skb->dev;
2145 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2146 	struct netdev_queue *rxq;
2147 	int result = TC_ACT_OK;
2148 	struct Qdisc *q;
2149 
2150 	if (MAX_RED_LOOP < ttl++) {
2151 		printk(KERN_WARNING
2152 		       "Redir loop detected, dropping packet (%d->%d)\n",
2153 		       skb->iif, dev->ifindex);
2154 		return TC_ACT_SHOT;
2155 	}
2156 
2157 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2158 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2159 
2160 	rxq = &dev->rx_queue;
2161 
2162 	q = rxq->qdisc;
2163 	if (q != &noop_qdisc) {
2164 		spin_lock(qdisc_lock(q));
2165 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2166 			result = qdisc_enqueue_root(skb, q);
2167 		spin_unlock(qdisc_lock(q));
2168 	}
2169 
2170 	return result;
2171 }
2172 
2173 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2174 					 struct packet_type **pt_prev,
2175 					 int *ret, struct net_device *orig_dev)
2176 {
2177 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2178 		goto out;
2179 
2180 	if (*pt_prev) {
2181 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2182 		*pt_prev = NULL;
2183 	} else {
2184 		/* Huh? Why does turning on AF_PACKET affect this? */
2185 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2186 	}
2187 
2188 	switch (ing_filter(skb)) {
2189 	case TC_ACT_SHOT:
2190 	case TC_ACT_STOLEN:
2191 		kfree_skb(skb);
2192 		return NULL;
2193 	}
2194 
2195 out:
2196 	skb->tc_verd = 0;
2197 	return skb;
2198 }
2199 #endif
2200 
2201 /*
2202  * 	netif_nit_deliver - deliver received packets to network taps
2203  * 	@skb: buffer
2204  *
2205  * 	This function is used to deliver incoming packets to network
2206  * 	taps. It should be used when the normal netif_receive_skb path
2207  * 	is bypassed, for example because of VLAN acceleration.
2208  */
2209 void netif_nit_deliver(struct sk_buff *skb)
2210 {
2211 	struct packet_type *ptype;
2212 
2213 	if (list_empty(&ptype_all))
2214 		return;
2215 
2216 	skb_reset_network_header(skb);
2217 	skb_reset_transport_header(skb);
2218 	skb->mac_len = skb->network_header - skb->mac_header;
2219 
2220 	rcu_read_lock();
2221 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2222 		if (!ptype->dev || ptype->dev == skb->dev)
2223 			deliver_skb(skb, ptype, skb->dev);
2224 	}
2225 	rcu_read_unlock();
2226 }
2227 
2228 /**
2229  *	netif_receive_skb - process receive buffer from network
2230  *	@skb: buffer to process
2231  *
2232  *	netif_receive_skb() is the main receive data processing function.
2233  *	It always succeeds. The buffer may be dropped during processing
2234  *	for congestion control or by the protocol layers.
2235  *
2236  *	This function may only be called from softirq context and interrupts
2237  *	should be enabled.
2238  *
2239  *	Return values (usually ignored):
2240  *	NET_RX_SUCCESS: no congestion
2241  *	NET_RX_DROP: packet was dropped
2242  */
2243 int netif_receive_skb(struct sk_buff *skb)
2244 {
2245 	struct packet_type *ptype, *pt_prev;
2246 	struct net_device *orig_dev;
2247 	struct net_device *null_or_orig;
2248 	int ret = NET_RX_DROP;
2249 	__be16 type;
2250 
2251 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2252 		return NET_RX_SUCCESS;
2253 
2254 	/* if we've gotten here through NAPI, check netpoll */
2255 	if (netpoll_receive_skb(skb))
2256 		return NET_RX_DROP;
2257 
2258 	if (!skb->tstamp.tv64)
2259 		net_timestamp(skb);
2260 
2261 	if (!skb->iif)
2262 		skb->iif = skb->dev->ifindex;
2263 
2264 	null_or_orig = NULL;
2265 	orig_dev = skb->dev;
2266 	if (orig_dev->master) {
2267 		if (skb_bond_should_drop(skb))
2268 			null_or_orig = orig_dev; /* deliver only exact match */
2269 		else
2270 			skb->dev = orig_dev->master;
2271 	}
2272 
2273 	__get_cpu_var(netdev_rx_stat).total++;
2274 
2275 	skb_reset_network_header(skb);
2276 	skb_reset_transport_header(skb);
2277 	skb->mac_len = skb->network_header - skb->mac_header;
2278 
2279 	pt_prev = NULL;
2280 
2281 	rcu_read_lock();
2282 
2283 #ifdef CONFIG_NET_CLS_ACT
2284 	if (skb->tc_verd & TC_NCLS) {
2285 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2286 		goto ncls;
2287 	}
2288 #endif
2289 
2290 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2291 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2292 		    ptype->dev == orig_dev) {
2293 			if (pt_prev)
2294 				ret = deliver_skb(skb, pt_prev, orig_dev);
2295 			pt_prev = ptype;
2296 		}
2297 	}
2298 
2299 #ifdef CONFIG_NET_CLS_ACT
2300 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2301 	if (!skb)
2302 		goto out;
2303 ncls:
2304 #endif
2305 
2306 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2307 	if (!skb)
2308 		goto out;
2309 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2310 	if (!skb)
2311 		goto out;
2312 
2313 	skb_orphan(skb);
2314 
2315 	type = skb->protocol;
2316 	list_for_each_entry_rcu(ptype,
2317 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2318 		if (ptype->type == type &&
2319 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2320 		     ptype->dev == orig_dev)) {
2321 			if (pt_prev)
2322 				ret = deliver_skb(skb, pt_prev, orig_dev);
2323 			pt_prev = ptype;
2324 		}
2325 	}
2326 
2327 	if (pt_prev) {
2328 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2329 	} else {
2330 		kfree_skb(skb);
2331 		/* Jamal, now you will not be able to escape explaining
2332 		 * to me how you were going to use this. :-)
2333 		 */
2334 		ret = NET_RX_DROP;
2335 	}
2336 
2337 out:
2338 	rcu_read_unlock();
2339 	return ret;
2340 }
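
/*
 * Example (illustrative sketch): netif_receive_skb() is the NAPI
 * counterpart of netif_rx() and must be called from softirq context,
 * normally from a driver's ->poll() callback.  The ring-buffer helper
 * below is a placeholder assumption.
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = example_ring_next_skb())) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */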
2341 
2342 /* Network device is going away, flush any packets still pending  */
2343 static void flush_backlog(void *arg)
2344 {
2345 	struct net_device *dev = arg;
2346 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2347 	struct sk_buff *skb, *tmp;
2348 
2349 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2350 		if (skb->dev == dev) {
2351 			__skb_unlink(skb, &queue->input_pkt_queue);
2352 			kfree_skb(skb);
2353 		}
2354 }
2355 
2356 static int napi_gro_complete(struct sk_buff *skb)
2357 {
2358 	struct packet_type *ptype;
2359 	__be16 type = skb->protocol;
2360 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2361 	int err = -ENOENT;
2362 
2363 	if (NAPI_GRO_CB(skb)->count == 1) {
2364 		skb_shinfo(skb)->gso_size = 0;
2365 		goto out;
2366 	}
2367 
2368 	rcu_read_lock();
2369 	list_for_each_entry_rcu(ptype, head, list) {
2370 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2371 			continue;
2372 
2373 		err = ptype->gro_complete(skb);
2374 		break;
2375 	}
2376 	rcu_read_unlock();
2377 
2378 	if (err) {
2379 		WARN_ON(&ptype->list == head);
2380 		kfree_skb(skb);
2381 		return NET_RX_SUCCESS;
2382 	}
2383 
2384 out:
2385 	return netif_receive_skb(skb);
2386 }
2387 
2388 void napi_gro_flush(struct napi_struct *napi)
2389 {
2390 	struct sk_buff *skb, *next;
2391 
2392 	for (skb = napi->gro_list; skb; skb = next) {
2393 		next = skb->next;
2394 		skb->next = NULL;
2395 		napi_gro_complete(skb);
2396 	}
2397 
2398 	napi->gro_count = 0;
2399 	napi->gro_list = NULL;
2400 }
2401 EXPORT_SYMBOL(napi_gro_flush);
2402 
2403 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2404 {
2405 	struct sk_buff **pp = NULL;
2406 	struct packet_type *ptype;
2407 	__be16 type = skb->protocol;
2408 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2409 	int same_flow;
2410 	int mac_len;
2411 	int ret;
2412 
2413 	if (!(skb->dev->features & NETIF_F_GRO))
2414 		goto normal;
2415 
2416 	if (skb_is_gso(skb) || skb_has_frags(skb))
2417 		goto normal;
2418 
2419 	rcu_read_lock();
2420 	list_for_each_entry_rcu(ptype, head, list) {
2421 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2422 			continue;
2423 
2424 		skb_set_network_header(skb, skb_gro_offset(skb));
2425 		mac_len = skb->network_header - skb->mac_header;
2426 		skb->mac_len = mac_len;
2427 		NAPI_GRO_CB(skb)->same_flow = 0;
2428 		NAPI_GRO_CB(skb)->flush = 0;
2429 		NAPI_GRO_CB(skb)->free = 0;
2430 
2431 		pp = ptype->gro_receive(&napi->gro_list, skb);
2432 		break;
2433 	}
2434 	rcu_read_unlock();
2435 
2436 	if (&ptype->list == head)
2437 		goto normal;
2438 
2439 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2440 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2441 
2442 	if (pp) {
2443 		struct sk_buff *nskb = *pp;
2444 
2445 		*pp = nskb->next;
2446 		nskb->next = NULL;
2447 		napi_gro_complete(nskb);
2448 		napi->gro_count--;
2449 	}
2450 
2451 	if (same_flow)
2452 		goto ok;
2453 
2454 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2455 		goto normal;
2456 
2457 	napi->gro_count++;
2458 	NAPI_GRO_CB(skb)->count = 1;
2459 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2460 	skb->next = napi->gro_list;
2461 	napi->gro_list = skb;
2462 	ret = GRO_HELD;
2463 
2464 pull:
2465 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
2466 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
2467 
2468 		BUG_ON(skb->end - skb->tail < grow);
2469 
2470 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2471 
2472 		skb->tail += grow;
2473 		skb->data_len -= grow;
2474 
2475 		skb_shinfo(skb)->frags[0].page_offset += grow;
2476 		skb_shinfo(skb)->frags[0].size -= grow;
2477 
2478 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2479 			put_page(skb_shinfo(skb)->frags[0].page);
2480 			memmove(skb_shinfo(skb)->frags,
2481 				skb_shinfo(skb)->frags + 1,
2482 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
2483 		}
2484 	}
2485 
2486 ok:
2487 	return ret;
2488 
2489 normal:
2490 	ret = GRO_NORMAL;
2491 	goto pull;
2492 }
2493 EXPORT_SYMBOL(dev_gro_receive);
2494 
2495 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2496 {
2497 	struct sk_buff *p;
2498 
2499 	if (netpoll_rx_on(skb))
2500 		return GRO_NORMAL;
2501 
2502 	for (p = napi->gro_list; p; p = p->next) {
2503 		NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2504 			&& !compare_ether_header(skb_mac_header(p),
2505 						 skb_gro_mac_header(skb));
2506 		NAPI_GRO_CB(p)->flush = 0;
2507 	}
2508 
2509 	return dev_gro_receive(napi, skb);
2510 }
2511 
2512 int napi_skb_finish(int ret, struct sk_buff *skb)
2513 {
2514 	int err = NET_RX_SUCCESS;
2515 
2516 	switch (ret) {
2517 	case GRO_NORMAL:
2518 		return netif_receive_skb(skb);
2519 
2520 	case GRO_DROP:
2521 		err = NET_RX_DROP;
2522 		/* fall through */
2523 
2524 	case GRO_MERGED_FREE:
2525 		kfree_skb(skb);
2526 		break;
2527 	}
2528 
2529 	return err;
2530 }
2531 EXPORT_SYMBOL(napi_skb_finish);
2532 
2533 void skb_gro_reset_offset(struct sk_buff *skb)
2534 {
2535 	NAPI_GRO_CB(skb)->data_offset = 0;
2536 	NAPI_GRO_CB(skb)->frag0 = NULL;
2537 	NAPI_GRO_CB(skb)->frag0_len = 0;
2538 
2539 	if (skb->mac_header == skb->tail &&
2540 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2541 		NAPI_GRO_CB(skb)->frag0 =
2542 			page_address(skb_shinfo(skb)->frags[0].page) +
2543 			skb_shinfo(skb)->frags[0].page_offset;
2544 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2545 	}
2546 }
2547 EXPORT_SYMBOL(skb_gro_reset_offset);
2548 
2549 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2550 {
2551 	skb_gro_reset_offset(skb);
2552 
2553 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2554 }
2555 EXPORT_SYMBOL(napi_gro_receive);
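
/*
 * Illustrative note: a driver opts in to GRO by replacing the
 * netif_receive_skb() call in its ->poll() routine with
 * napi_gro_receive(), passing its napi_struct so that held packets can be
 * accumulated on napi->gro_list:
 *
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);	// may merge, hold or deliver the skb
 *
 * Packets the GRO layer cannot handle still reach the stack through the
 * GRO_NORMAL path in napi_skb_finish() above.
 */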
2556 
2557 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2558 {
2559 	__skb_pull(skb, skb_headlen(skb));
2560 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2561 
2562 	napi->skb = skb;
2563 }
2564 EXPORT_SYMBOL(napi_reuse_skb);
2565 
2566 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2567 {
2568 	struct net_device *dev = napi->dev;
2569 	struct sk_buff *skb = napi->skb;
2570 
2571 	if (!skb) {
2572 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2573 		if (!skb)
2574 			goto out;
2575 
2576 		skb_reserve(skb, NET_IP_ALIGN);
2577 
2578 		napi->skb = skb;
2579 	}
2580 
2581 out:
2582 	return skb;
2583 }
2584 EXPORT_SYMBOL(napi_get_frags);
2585 
2586 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2587 {
2588 	int err = NET_RX_SUCCESS;
2589 
2590 	switch (ret) {
2591 	case GRO_NORMAL:
2592 	case GRO_HELD:
2593 		skb->protocol = eth_type_trans(skb, napi->dev);
2594 
2595 		if (ret == GRO_NORMAL)
2596 			return netif_receive_skb(skb);
2597 
2598 		skb_gro_pull(skb, -ETH_HLEN);
2599 		break;
2600 
2601 	case GRO_DROP:
2602 		err = NET_RX_DROP;
2603 		/* fall through */
2604 
2605 	case GRO_MERGED_FREE:
2606 		napi_reuse_skb(napi, skb);
2607 		break;
2608 	}
2609 
2610 	return err;
2611 }
2612 EXPORT_SYMBOL(napi_frags_finish);
2613 
2614 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2615 {
2616 	struct sk_buff *skb = napi->skb;
2617 	struct ethhdr *eth;
2618 	unsigned int hlen;
2619 	unsigned int off;
2620 
2621 	napi->skb = NULL;
2622 
2623 	skb_reset_mac_header(skb);
2624 	skb_gro_reset_offset(skb);
2625 
2626 	off = skb_gro_offset(skb);
2627 	hlen = off + sizeof(*eth);
2628 	eth = skb_gro_header_fast(skb, off);
2629 	if (skb_gro_header_hard(skb, hlen)) {
2630 		eth = skb_gro_header_slow(skb, hlen, off);
2631 		if (unlikely(!eth)) {
2632 			napi_reuse_skb(napi, skb);
2633 			skb = NULL;
2634 			goto out;
2635 		}
2636 	}
2637 
2638 	skb_gro_pull(skb, sizeof(*eth));
2639 
2640 	/*
2641 	 * This works because the only protocols we care about don't require
2642 	 * special handling.  We'll fix it up properly at the end.
2643 	 */
2644 	skb->protocol = eth->h_proto;
2645 
2646 out:
2647 	return skb;
2648 }
2649 EXPORT_SYMBOL(napi_frags_skb);
2650 
2651 int napi_gro_frags(struct napi_struct *napi)
2652 {
2653 	struct sk_buff *skb = napi_frags_skb(napi);
2654 
2655 	if (!skb)
2656 		return NET_RX_DROP;
2657 
2658 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2659 }
2660 EXPORT_SYMBOL(napi_gro_frags);
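
/*
 * Example (illustrative sketch): a driver that receives directly into
 * pages can use the frag-based GRO interface instead of building a linear
 * skb itself.  The page, offset and length variables are assumptions.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;				// out of memory, drop
 *	skb_fill_page_desc(skb, 0, rx_page, rx_offset, rx_len);
 *	skb->len += rx_len;
 *	skb->data_len += rx_len;
 *	skb->truesize += rx_len;
 *	napi_gro_frags(napi);	// pulls the Ethernet header via napi_frags_skb()
 */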
2661 
2662 static int process_backlog(struct napi_struct *napi, int quota)
2663 {
2664 	int work = 0;
2665 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2666 	unsigned long start_time = jiffies;
2667 
2668 	napi->weight = weight_p;
2669 	do {
2670 		struct sk_buff *skb;
2671 
2672 		local_irq_disable();
2673 		skb = __skb_dequeue(&queue->input_pkt_queue);
2674 		if (!skb) {
2675 			__napi_complete(napi);
2676 			local_irq_enable();
2677 			break;
2678 		}
2679 		local_irq_enable();
2680 
2681 		netif_receive_skb(skb);
2682 	} while (++work < quota && jiffies == start_time);
2683 
2684 	return work;
2685 }
2686 
2687 /**
2688  * __napi_schedule - schedule for receive
2689  * @n: entry to schedule
2690  *
2691  * The entry's receive function will be scheduled to run
2692  */
2693 void __napi_schedule(struct napi_struct *n)
2694 {
2695 	unsigned long flags;
2696 
2697 	local_irq_save(flags);
2698 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2699 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2700 	local_irq_restore(flags);
2701 }
2702 EXPORT_SYMBOL(__napi_schedule);
2703 
2704 void __napi_complete(struct napi_struct *n)
2705 {
2706 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2707 	BUG_ON(n->gro_list);
2708 
2709 	list_del(&n->poll_list);
2710 	smp_mb__before_clear_bit();
2711 	clear_bit(NAPI_STATE_SCHED, &n->state);
2712 }
2713 EXPORT_SYMBOL(__napi_complete);
2714 
2715 void napi_complete(struct napi_struct *n)
2716 {
2717 	unsigned long flags;
2718 
2719 	/*
2720 	 * don't let napi dequeue from the cpu poll list
2721 	 * just in case it's running on a different cpu
2722 	 */
2723 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2724 		return;
2725 
2726 	napi_gro_flush(n);
2727 	local_irq_save(flags);
2728 	__napi_complete(n);
2729 	local_irq_restore(flags);
2730 }
2731 EXPORT_SYMBOL(napi_complete);
2732 
2733 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2734 		    int (*poll)(struct napi_struct *, int), int weight)
2735 {
2736 	INIT_LIST_HEAD(&napi->poll_list);
2737 	napi->gro_count = 0;
2738 	napi->gro_list = NULL;
2739 	napi->skb = NULL;
2740 	napi->poll = poll;
2741 	napi->weight = weight;
2742 	list_add(&napi->dev_list, &dev->napi_list);
2743 	napi->dev = dev;
2744 #ifdef CONFIG_NETPOLL
2745 	spin_lock_init(&napi->poll_lock);
2746 	napi->poll_owner = -1;
2747 #endif
2748 	set_bit(NAPI_STATE_SCHED, &napi->state);
2749 }
2750 EXPORT_SYMBOL(netif_napi_add);
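
/*
 * Example (illustrative sketch): a driver registers its poll routine once
 * at probe time and afterwards only schedules the napi_struct from its RX
 * interrupt handler.  The priv structure and IRQ helpers are assumptions
 * made for the example.
 *
 *	netif_napi_add(dev, &priv->napi, example_poll, 64);	// at probe
 *
 *	// in the RX interrupt handler:
 *	if (napi_schedule_prep(&priv->napi)) {
 *		example_disable_rx_irq(priv);
 *		__napi_schedule(&priv->napi);
 *	}
 *
 * example_poll() later calls napi_complete() and re-enables the RX
 * interrupt once it has drained its ring.
 */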
2751 
2752 void netif_napi_del(struct napi_struct *napi)
2753 {
2754 	struct sk_buff *skb, *next;
2755 
2756 	list_del_init(&napi->dev_list);
2757 	napi_free_frags(napi);
2758 
2759 	for (skb = napi->gro_list; skb; skb = next) {
2760 		next = skb->next;
2761 		skb->next = NULL;
2762 		kfree_skb(skb);
2763 	}
2764 
2765 	napi->gro_list = NULL;
2766 	napi->gro_count = 0;
2767 }
2768 EXPORT_SYMBOL(netif_napi_del);
2769 
2770 
2771 static void net_rx_action(struct softirq_action *h)
2772 {
2773 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2774 	unsigned long time_limit = jiffies + 2;
2775 	int budget = netdev_budget;
2776 	void *have;
2777 
2778 	local_irq_disable();
2779 
2780 	while (!list_empty(list)) {
2781 		struct napi_struct *n;
2782 		int work, weight;
2783 
2784 		/* If the softirq window is exhausted then punt.
2785 		 * Allow this to run for 2 jiffies, which allows
2786 		 * an average latency of 1.5/HZ.
2787 		 */
2788 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2789 			goto softnet_break;
2790 
2791 		local_irq_enable();
2792 
2793 		/* Even though interrupts have been re-enabled, this
2794 		 * access is safe because interrupts can only add new
2795 		 * entries to the tail of this list, and only ->poll()
2796 		 * calls can remove this head entry from the list.
2797 		 */
2798 		n = list_entry(list->next, struct napi_struct, poll_list);
2799 
2800 		have = netpoll_poll_lock(n);
2801 
2802 		weight = n->weight;
2803 
2804 		/* This NAPI_STATE_SCHED test is for avoiding a race
2805 		 * with netpoll's poll_napi().  Only the entity which
2806 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2807 		 * actually make the ->poll() call.  Therefore we avoid
2808 		 * accidentally calling ->poll() when NAPI is not scheduled.
2809 		 */
2810 		work = 0;
2811 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2812 			work = n->poll(n, weight);
2813 			trace_napi_poll(n);
2814 		}
2815 
2816 		WARN_ON_ONCE(work > weight);
2817 
2818 		budget -= work;
2819 
2820 		local_irq_disable();
2821 
2822 		/* Drivers must not modify the NAPI state if they
2823 		 * consume the entire weight.  In such cases this code
2824 		 * still "owns" the NAPI instance and therefore can
2825 		 * move the instance around on the list at-will.
2826 		 */
2827 		if (unlikely(work == weight)) {
2828 			if (unlikely(napi_disable_pending(n)))
2829 				__napi_complete(n);
2830 			else
2831 				list_move_tail(&n->poll_list, list);
2832 		}
2833 
2834 		netpoll_poll_unlock(have);
2835 	}
2836 out:
2837 	local_irq_enable();
2838 
2839 #ifdef CONFIG_NET_DMA
2840 	/*
2841 	 * There may not be any more sk_buffs coming right now, so push
2842 	 * any pending DMA copies to hardware
2843 	 */
2844 	dma_issue_pending_all();
2845 #endif
2846 
2847 	return;
2848 
2849 softnet_break:
2850 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2851 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2852 	goto out;
2853 }
2854 
2855 static gifconf_func_t *gifconf_list[NPROTO];
2856 
2857 /**
2858  *	register_gifconf	-	register a SIOCGIF handler
2859  *	@family: Address family
2860  *	@gifconf: Function handler
2861  *
2862  *	Register protocol dependent address dumping routines. The handler
2863  *	that is passed must not be freed or reused until it has been replaced
2864  *	by another handler.
2865  */
2866 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2867 {
2868 	if (family >= NPROTO)
2869 		return -EINVAL;
2870 	gifconf_list[family] = gifconf;
2871 	return 0;
2872 }
2873 
2874 
2875 /*
2876  *	Map an interface index to its name (SIOCGIFNAME)
2877  */
2878 
2879 /*
2880  *	We need this ioctl for efficient implementation of the
2881  *	if_indextoname() function required by the IPv6 API.  Without
2882  *	it, we would have to search all the interfaces to find a
2883  *	match.  --pb
2884  */
2885 
2886 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2887 {
2888 	struct net_device *dev;
2889 	struct ifreq ifr;
2890 
2891 	/*
2892 	 *	Fetch the caller's info block.
2893 	 */
2894 
2895 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2896 		return -EFAULT;
2897 
2898 	read_lock(&dev_base_lock);
2899 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2900 	if (!dev) {
2901 		read_unlock(&dev_base_lock);
2902 		return -ENODEV;
2903 	}
2904 
2905 	strcpy(ifr.ifr_name, dev->name);
2906 	read_unlock(&dev_base_lock);
2907 
2908 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2909 		return -EFAULT;
2910 	return 0;
2911 }
2912 
2913 /*
2914  *	Perform a SIOCGIFCONF call. This structure will change
2915  *	size eventually, and there is nothing I can do about it.
2916  *	Thus we will need a 'compatibility mode'.
2917  */
2918 
2919 static int dev_ifconf(struct net *net, char __user *arg)
2920 {
2921 	struct ifconf ifc;
2922 	struct net_device *dev;
2923 	char __user *pos;
2924 	int len;
2925 	int total;
2926 	int i;
2927 
2928 	/*
2929 	 *	Fetch the caller's info block.
2930 	 */
2931 
2932 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2933 		return -EFAULT;
2934 
2935 	pos = ifc.ifc_buf;
2936 	len = ifc.ifc_len;
2937 
2938 	/*
2939 	 *	Loop over the interfaces, and write an info block for each.
2940 	 */
2941 
2942 	total = 0;
2943 	for_each_netdev(net, dev) {
2944 		for (i = 0; i < NPROTO; i++) {
2945 			if (gifconf_list[i]) {
2946 				int done;
2947 				if (!pos)
2948 					done = gifconf_list[i](dev, NULL, 0);
2949 				else
2950 					done = gifconf_list[i](dev, pos + total,
2951 							       len - total);
2952 				if (done < 0)
2953 					return -EFAULT;
2954 				total += done;
2955 			}
2956 		}
2957 	}
2958 
2959 	/*
2960 	 *	All done.  Write the updated control block back to the caller.
2961 	 */
2962 	ifc.ifc_len = total;
2963 
2964 	/*
2965 	 * 	Both BSD and Solaris return 0 here, so we do too.
2966 	 */
2967 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2968 }
2969 
2970 #ifdef CONFIG_PROC_FS
2971 /*
2972  *	This is invoked by the /proc filesystem handler to display a device
2973  *	in detail.
2974  */
2975 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2976 	__acquires(dev_base_lock)
2977 {
2978 	struct net *net = seq_file_net(seq);
2979 	loff_t off;
2980 	struct net_device *dev;
2981 
2982 	read_lock(&dev_base_lock);
2983 	if (!*pos)
2984 		return SEQ_START_TOKEN;
2985 
2986 	off = 1;
2987 	for_each_netdev(net, dev)
2988 		if (off++ == *pos)
2989 			return dev;
2990 
2991 	return NULL;
2992 }
2993 
2994 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2995 {
2996 	struct net *net = seq_file_net(seq);
2997 	++*pos;
2998 	return v == SEQ_START_TOKEN ?
2999 		first_net_device(net) : next_net_device((struct net_device *)v);
3000 }
3001 
3002 void dev_seq_stop(struct seq_file *seq, void *v)
3003 	__releases(dev_base_lock)
3004 {
3005 	read_unlock(&dev_base_lock);
3006 }
3007 
3008 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3009 {
3010 	const struct net_device_stats *stats = dev_get_stats(dev);
3011 
3012 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3013 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3014 		   dev->name, stats->rx_bytes, stats->rx_packets,
3015 		   stats->rx_errors,
3016 		   stats->rx_dropped + stats->rx_missed_errors,
3017 		   stats->rx_fifo_errors,
3018 		   stats->rx_length_errors + stats->rx_over_errors +
3019 		    stats->rx_crc_errors + stats->rx_frame_errors,
3020 		   stats->rx_compressed, stats->multicast,
3021 		   stats->tx_bytes, stats->tx_packets,
3022 		   stats->tx_errors, stats->tx_dropped,
3023 		   stats->tx_fifo_errors, stats->collisions,
3024 		   stats->tx_carrier_errors +
3025 		    stats->tx_aborted_errors +
3026 		    stats->tx_window_errors +
3027 		    stats->tx_heartbeat_errors,
3028 		   stats->tx_compressed);
3029 }
3030 
3031 /*
3032  *	Called from the PROCfs module. This now uses the new arbitrary sized
3033  *	/proc/net interface to create /proc/net/dev
3034  */
3035 static int dev_seq_show(struct seq_file *seq, void *v)
3036 {
3037 	if (v == SEQ_START_TOKEN)
3038 		seq_puts(seq, "Inter-|   Receive                            "
3039 			      "                    |  Transmit\n"
3040 			      " face |bytes    packets errs drop fifo frame "
3041 			      "compressed multicast|bytes    packets errs "
3042 			      "drop fifo colls carrier compressed\n");
3043 	else
3044 		dev_seq_printf_stats(seq, v);
3045 	return 0;
3046 }
3047 
3048 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3049 {
3050 	struct netif_rx_stats *rc = NULL;
3051 
3052 	while (*pos < nr_cpu_ids)
3053 		if (cpu_online(*pos)) {
3054 			rc = &per_cpu(netdev_rx_stat, *pos);
3055 			break;
3056 		} else
3057 			++*pos;
3058 	return rc;
3059 }
3060 
3061 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3062 {
3063 	return softnet_get_online(pos);
3064 }
3065 
3066 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3067 {
3068 	++*pos;
3069 	return softnet_get_online(pos);
3070 }
3071 
3072 static void softnet_seq_stop(struct seq_file *seq, void *v)
3073 {
3074 }
3075 
3076 static int softnet_seq_show(struct seq_file *seq, void *v)
3077 {
3078 	struct netif_rx_stats *s = v;
3079 
3080 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3081 		   s->total, s->dropped, s->time_squeeze, 0,
3082 		   0, 0, 0, 0, /* was fastroute */
3083 		   s->cpu_collision );
3084 	return 0;
3085 }
3086 
3087 static const struct seq_operations dev_seq_ops = {
3088 	.start = dev_seq_start,
3089 	.next  = dev_seq_next,
3090 	.stop  = dev_seq_stop,
3091 	.show  = dev_seq_show,
3092 };
3093 
3094 static int dev_seq_open(struct inode *inode, struct file *file)
3095 {
3096 	return seq_open_net(inode, file, &dev_seq_ops,
3097 			    sizeof(struct seq_net_private));
3098 }
3099 
3100 static const struct file_operations dev_seq_fops = {
3101 	.owner	 = THIS_MODULE,
3102 	.open    = dev_seq_open,
3103 	.read    = seq_read,
3104 	.llseek  = seq_lseek,
3105 	.release = seq_release_net,
3106 };
3107 
3108 static const struct seq_operations softnet_seq_ops = {
3109 	.start = softnet_seq_start,
3110 	.next  = softnet_seq_next,
3111 	.stop  = softnet_seq_stop,
3112 	.show  = softnet_seq_show,
3113 };
3114 
3115 static int softnet_seq_open(struct inode *inode, struct file *file)
3116 {
3117 	return seq_open(file, &softnet_seq_ops);
3118 }
3119 
3120 static const struct file_operations softnet_seq_fops = {
3121 	.owner	 = THIS_MODULE,
3122 	.open    = softnet_seq_open,
3123 	.read    = seq_read,
3124 	.llseek  = seq_lseek,
3125 	.release = seq_release,
3126 };
3127 
3128 static void *ptype_get_idx(loff_t pos)
3129 {
3130 	struct packet_type *pt = NULL;
3131 	loff_t i = 0;
3132 	int t;
3133 
3134 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3135 		if (i == pos)
3136 			return pt;
3137 		++i;
3138 	}
3139 
3140 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3141 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3142 			if (i == pos)
3143 				return pt;
3144 			++i;
3145 		}
3146 	}
3147 	return NULL;
3148 }
3149 
3150 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3151 	__acquires(RCU)
3152 {
3153 	rcu_read_lock();
3154 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3155 }
3156 
3157 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3158 {
3159 	struct packet_type *pt;
3160 	struct list_head *nxt;
3161 	int hash;
3162 
3163 	++*pos;
3164 	if (v == SEQ_START_TOKEN)
3165 		return ptype_get_idx(0);
3166 
3167 	pt = v;
3168 	nxt = pt->list.next;
3169 	if (pt->type == htons(ETH_P_ALL)) {
3170 		if (nxt != &ptype_all)
3171 			goto found;
3172 		hash = 0;
3173 		nxt = ptype_base[0].next;
3174 	} else
3175 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3176 
3177 	while (nxt == &ptype_base[hash]) {
3178 		if (++hash >= PTYPE_HASH_SIZE)
3179 			return NULL;
3180 		nxt = ptype_base[hash].next;
3181 	}
3182 found:
3183 	return list_entry(nxt, struct packet_type, list);
3184 }
3185 
3186 static void ptype_seq_stop(struct seq_file *seq, void *v)
3187 	__releases(RCU)
3188 {
3189 	rcu_read_unlock();
3190 }
3191 
3192 static int ptype_seq_show(struct seq_file *seq, void *v)
3193 {
3194 	struct packet_type *pt = v;
3195 
3196 	if (v == SEQ_START_TOKEN)
3197 		seq_puts(seq, "Type Device      Function\n");
3198 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3199 		if (pt->type == htons(ETH_P_ALL))
3200 			seq_puts(seq, "ALL ");
3201 		else
3202 			seq_printf(seq, "%04x", ntohs(pt->type));
3203 
3204 		seq_printf(seq, " %-8s %pF\n",
3205 			   pt->dev ? pt->dev->name : "", pt->func);
3206 	}
3207 
3208 	return 0;
3209 }
3210 
3211 static const struct seq_operations ptype_seq_ops = {
3212 	.start = ptype_seq_start,
3213 	.next  = ptype_seq_next,
3214 	.stop  = ptype_seq_stop,
3215 	.show  = ptype_seq_show,
3216 };
3217 
3218 static int ptype_seq_open(struct inode *inode, struct file *file)
3219 {
3220 	return seq_open_net(inode, file, &ptype_seq_ops,
3221 			sizeof(struct seq_net_private));
3222 }
3223 
3224 static const struct file_operations ptype_seq_fops = {
3225 	.owner	 = THIS_MODULE,
3226 	.open    = ptype_seq_open,
3227 	.read    = seq_read,
3228 	.llseek  = seq_lseek,
3229 	.release = seq_release_net,
3230 };
3231 
3232 
3233 static int __net_init dev_proc_net_init(struct net *net)
3234 {
3235 	int rc = -ENOMEM;
3236 
3237 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3238 		goto out;
3239 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3240 		goto out_dev;
3241 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3242 		goto out_softnet;
3243 
3244 	if (wext_proc_init(net))
3245 		goto out_ptype;
3246 	rc = 0;
3247 out:
3248 	return rc;
3249 out_ptype:
3250 	proc_net_remove(net, "ptype");
3251 out_softnet:
3252 	proc_net_remove(net, "softnet_stat");
3253 out_dev:
3254 	proc_net_remove(net, "dev");
3255 	goto out;
3256 }
3257 
3258 static void __net_exit dev_proc_net_exit(struct net *net)
3259 {
3260 	wext_proc_exit(net);
3261 
3262 	proc_net_remove(net, "ptype");
3263 	proc_net_remove(net, "softnet_stat");
3264 	proc_net_remove(net, "dev");
3265 }
3266 
3267 static struct pernet_operations __net_initdata dev_proc_ops = {
3268 	.init = dev_proc_net_init,
3269 	.exit = dev_proc_net_exit,
3270 };
3271 
3272 static int __init dev_proc_init(void)
3273 {
3274 	return register_pernet_subsys(&dev_proc_ops);
3275 }
3276 #else
3277 #define dev_proc_init() 0
3278 #endif	/* CONFIG_PROC_FS */
3279 
3280 
3281 /**
3282  *	netdev_set_master	-	set up master/slave pair
3283  *	@slave: slave device
3284  *	@master: new master device
3285  *
3286  *	Changes the master device of the slave. Pass %NULL to break the
3287  *	bonding. The caller must hold the RTNL semaphore. On a failure
3288  *	a negative errno code is returned. On success the reference counts
3289  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3290  *	function returns zero.
3291  */
3292 int netdev_set_master(struct net_device *slave, struct net_device *master)
3293 {
3294 	struct net_device *old = slave->master;
3295 
3296 	ASSERT_RTNL();
3297 
3298 	if (master) {
3299 		if (old)
3300 			return -EBUSY;
3301 		dev_hold(master);
3302 	}
3303 
3304 	slave->master = master;
3305 
3306 	synchronize_net();
3307 
3308 	if (old)
3309 		dev_put(old);
3310 
3311 	if (master)
3312 		slave->flags |= IFF_SLAVE;
3313 	else
3314 		slave->flags &= ~IFF_SLAVE;
3315 
3316 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3317 	return 0;
3318 }
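
/*
 * Example (illustrative sketch): a bonding-style driver enslaves and
 * releases lower devices with netdev_set_master() while holding the RTNL
 * lock; the device variables are assumptions for the example.
 *
 *	rtnl_lock();
 *	err = netdev_set_master(slave_dev, bond_dev);	// enslave
 *	rtnl_unlock();
 *
 *	// later, again under rtnl_lock(), to break the bond:
 *	netdev_set_master(slave_dev, NULL);
 */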
3319 
3320 static void dev_change_rx_flags(struct net_device *dev, int flags)
3321 {
3322 	const struct net_device_ops *ops = dev->netdev_ops;
3323 
3324 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3325 		ops->ndo_change_rx_flags(dev, flags);
3326 }
3327 
3328 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3329 {
3330 	unsigned short old_flags = dev->flags;
3331 	uid_t uid;
3332 	gid_t gid;
3333 
3334 	ASSERT_RTNL();
3335 
3336 	dev->flags |= IFF_PROMISC;
3337 	dev->promiscuity += inc;
3338 	if (dev->promiscuity == 0) {
3339 		/*
3340 		 * Avoid overflow.
3341 		 * If inc causes overflow, leave promisc untouched and return an error.
3342 		 */
3343 		if (inc < 0)
3344 			dev->flags &= ~IFF_PROMISC;
3345 		else {
3346 			dev->promiscuity -= inc;
3347 			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
3348 				"set promiscuity failed; promiscuity feature "
3349 				"of device might be broken.\n", dev->name);
3350 			return -EOVERFLOW;
3351 		}
3352 	}
3353 	if (dev->flags != old_flags) {
3354 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3355 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3356 							       "left");
3357 		if (audit_enabled) {
3358 			current_uid_gid(&uid, &gid);
3359 			audit_log(current->audit_context, GFP_ATOMIC,
3360 				AUDIT_ANOM_PROMISCUOUS,
3361 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3362 				dev->name, (dev->flags & IFF_PROMISC),
3363 				(old_flags & IFF_PROMISC),
3364 				audit_get_loginuid(current),
3365 				uid, gid,
3366 				audit_get_sessionid(current));
3367 		}
3368 
3369 		dev_change_rx_flags(dev, IFF_PROMISC);
3370 	}
3371 	return 0;
3372 }
3373 
3374 /**
3375  *	dev_set_promiscuity	- update promiscuity count on a device
3376  *	@dev: device
3377  *	@inc: modifier
3378  *
3379  *	Add or remove promiscuity from a device. While the count in the device
3380  *	remains above zero the interface remains promiscuous. Once it hits zero
3381  *	the device reverts back to normal filtering operation. A negative inc
3382  *	value is used to drop promiscuity on the device.
3383  *	Return 0 if successful or a negative errno code on error.
3384  */
3385 int dev_set_promiscuity(struct net_device *dev, int inc)
3386 {
3387 	unsigned short old_flags = dev->flags;
3388 	int err;
3389 
3390 	err = __dev_set_promiscuity(dev, inc);
3391 	if (err < 0)
3392 		return err;
3393 	if (dev->flags != old_flags)
3394 		dev_set_rx_mode(dev);
3395 	return err;
3396 }
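
/*
 * Example (illustrative sketch): packet-capture style code typically bumps
 * the promiscuity count while a capture is active and drops it again when
 * done, both under the RTNL lock.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *
 *	// ... capture traffic ...
 *
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */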
3397 
3398 /**
3399  *	dev_set_allmulti	- update allmulti count on a device
3400  *	@dev: device
3401  *	@inc: modifier
3402  *
3403  *	Add or remove reception of all multicast frames to a device. While the
3404  *	count in the device remains above zero the interface remains listening
3405  *	to all multicast frames. Once it hits zero the device reverts back to normal
3406  *	filtering operation. A negative @inc value is used to drop the counter
3407  *	when releasing a resource needing all multicasts.
3408  *	Return 0 if successful or a negative errno code on error.
3409  */
3410 
3411 int dev_set_allmulti(struct net_device *dev, int inc)
3412 {
3413 	unsigned short old_flags = dev->flags;
3414 
3415 	ASSERT_RTNL();
3416 
3417 	dev->flags |= IFF_ALLMULTI;
3418 	dev->allmulti += inc;
3419 	if (dev->allmulti == 0) {
3420 		/*
3421 		 * Avoid overflow.
3422 		 * If inc causes overflow, untouch allmulti and return error.
3423 		 * If inc causes overflow, leave allmulti untouched and return an error.
3424 		if (inc < 0)
3425 			dev->flags &= ~IFF_ALLMULTI;
3426 		else {
3427 			dev->allmulti -= inc;
3428 			printk(KERN_WARNING "%s: allmulti counter overflowed, "
3429 				"set allmulti failed; allmulti feature of "
3430 				"device might be broken.\n", dev->name);
3431 			return -EOVERFLOW;
3432 		}
3433 	}
3434 	if (dev->flags ^ old_flags) {
3435 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3436 		dev_set_rx_mode(dev);
3437 	}
3438 	return 0;
3439 }
3440 
3441 /*
3442  *	Upload unicast and multicast address lists to device and
3443  *	configure RX filtering. When the device doesn't support unicast
3444  *	filtering it is put in promiscuous mode while unicast addresses
3445  *	are present.
3446  */
3447 void __dev_set_rx_mode(struct net_device *dev)
3448 {
3449 	const struct net_device_ops *ops = dev->netdev_ops;
3450 
3451 	/* dev_open will call this function so the list will stay sane. */
3452 	if (!(dev->flags&IFF_UP))
3453 		return;
3454 
3455 	if (!netif_device_present(dev))
3456 		return;
3457 
3458 	if (ops->ndo_set_rx_mode)
3459 		ops->ndo_set_rx_mode(dev);
3460 	else {
3461 		/* Unicast address changes may only happen under the rtnl,
3462 		 * therefore calling __dev_set_promiscuity here is safe.
3463 		 */
3464 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3465 			__dev_set_promiscuity(dev, 1);
3466 			dev->uc_promisc = 1;
3467 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3468 			__dev_set_promiscuity(dev, -1);
3469 			dev->uc_promisc = 0;
3470 		}
3471 
3472 		if (ops->ndo_set_multicast_list)
3473 			ops->ndo_set_multicast_list(dev);
3474 	}
3475 }
3476 
3477 void dev_set_rx_mode(struct net_device *dev)
3478 {
3479 	netif_addr_lock_bh(dev);
3480 	__dev_set_rx_mode(dev);
3481 	netif_addr_unlock_bh(dev);
3482 }
3483 
3484 /* hw addresses list handling functions */
3485 
3486 static int __hw_addr_add(struct list_head *list, int *delta,
3487 			 unsigned char *addr, int addr_len,
3488 			 unsigned char addr_type)
3489 {
3490 	struct netdev_hw_addr *ha;
3491 	int alloc_size;
3492 
3493 	if (addr_len > MAX_ADDR_LEN)
3494 		return -EINVAL;
3495 
3496 	list_for_each_entry(ha, list, list) {
3497 		if (!memcmp(ha->addr, addr, addr_len) &&
3498 		    ha->type == addr_type) {
3499 			ha->refcount++;
3500 			return 0;
3501 		}
3502 	}
3503 
3504 
3505 	alloc_size = sizeof(*ha);
3506 	if (alloc_size < L1_CACHE_BYTES)
3507 		alloc_size = L1_CACHE_BYTES;
3508 	ha = kmalloc(alloc_size, GFP_ATOMIC);
3509 	if (!ha)
3510 		return -ENOMEM;
3511 	memcpy(ha->addr, addr, addr_len);
3512 	ha->type = addr_type;
3513 	ha->refcount = 1;
3514 	ha->synced = false;
3515 	list_add_tail_rcu(&ha->list, list);
3516 	if (delta)
3517 		(*delta)++;
3518 	return 0;
3519 }
3520 
3521 static void ha_rcu_free(struct rcu_head *head)
3522 {
3523 	struct netdev_hw_addr *ha;
3524 
3525 	ha = container_of(head, struct netdev_hw_addr, rcu_head);
3526 	kfree(ha);
3527 }
3528 
3529 static int __hw_addr_del(struct list_head *list, int *delta,
3530 			 unsigned char *addr, int addr_len,
3531 			 unsigned char addr_type)
3532 {
3533 	struct netdev_hw_addr *ha;
3534 
3535 	list_for_each_entry(ha, list, list) {
3536 		if (!memcmp(ha->addr, addr, addr_len) &&
3537 		    (ha->type == addr_type || !addr_type)) {
3538 			if (--ha->refcount)
3539 				return 0;
3540 			list_del_rcu(&ha->list);
3541 			call_rcu(&ha->rcu_head, ha_rcu_free);
3542 			if (delta)
3543 				(*delta)--;
3544 			return 0;
3545 		}
3546 	}
3547 	return -ENOENT;
3548 }
3549 
3550 static int __hw_addr_add_multiple(struct list_head *to_list, int *to_delta,
3551 				  struct list_head *from_list, int addr_len,
3552 				  unsigned char addr_type)
3553 {
3554 	int err;
3555 	struct netdev_hw_addr *ha, *ha2;
3556 	unsigned char type;
3557 
3558 	list_for_each_entry(ha, from_list, list) {
3559 		type = addr_type ? addr_type : ha->type;
3560 		err = __hw_addr_add(to_list, to_delta, ha->addr,
3561 				    addr_len, type);
3562 		if (err)
3563 			goto unroll;
3564 	}
3565 	return 0;
3566 
3567 unroll:
3568 	list_for_each_entry(ha2, from_list, list) {
3569 		if (ha2 == ha)
3570 			break;
3571 		type = addr_type ? addr_type : ha2->type;
3572 		__hw_addr_del(to_list, to_delta, ha2->addr,
3573 			      addr_len, type);
3574 	}
3575 	return err;
3576 }
3577 
3578 static void __hw_addr_del_multiple(struct list_head *to_list, int *to_delta,
3579 				   struct list_head *from_list, int addr_len,
3580 				   unsigned char addr_type)
3581 {
3582 	struct netdev_hw_addr *ha;
3583 	unsigned char type;
3584 
3585 	list_for_each_entry(ha, from_list, list) {
3586 		type = addr_type ? addr_type : ha->type;
3587 		__hw_addr_del(to_list, to_delta, ha->addr,
3588 			      addr_len, type);
3589 	}
3590 }
3591 
3592 static int __hw_addr_sync(struct list_head *to_list, int *to_delta,
3593 			  struct list_head *from_list, int *from_delta,
3594 			  int addr_len)
3595 {
3596 	int err = 0;
3597 	struct netdev_hw_addr *ha, *tmp;
3598 
3599 	list_for_each_entry_safe(ha, tmp, from_list, list) {
3600 		if (!ha->synced) {
3601 			err = __hw_addr_add(to_list, to_delta, ha->addr,
3602 					    addr_len, ha->type);
3603 			if (err)
3604 				break;
3605 			ha->synced = true;
3606 			ha->refcount++;
3607 		} else if (ha->refcount == 1) {
3608 			__hw_addr_del(to_list, to_delta, ha->addr,
3609 				      addr_len, ha->type);
3610 			__hw_addr_del(from_list, from_delta, ha->addr,
3611 				      addr_len, ha->type);
3612 		}
3613 	}
3614 	return err;
3615 }
3616 
3617 static void __hw_addr_unsync(struct list_head *to_list, int *to_delta,
3618 			     struct list_head *from_list, int *from_delta,
3619 			     int addr_len)
3620 {
3621 	struct netdev_hw_addr *ha, *tmp;
3622 
3623 	list_for_each_entry_safe(ha, tmp, from_list, list) {
3624 		if (ha->synced) {
3625 			__hw_addr_del(to_list, to_delta, ha->addr,
3626 				      addr_len, ha->type);
3627 			ha->synced = false;
3628 			__hw_addr_del(from_list, from_delta, ha->addr,
3629 				      addr_len, ha->type);
3630 		}
3631 	}
3632 }
3633 
3634 
3635 static void __hw_addr_flush(struct list_head *list)
3636 {
3637 	struct netdev_hw_addr *ha, *tmp;
3638 
3639 	list_for_each_entry_safe(ha, tmp, list, list) {
3640 		list_del_rcu(&ha->list);
3641 		call_rcu(&ha->rcu_head, ha_rcu_free);
3642 	}
3643 }
3644 
3645 /* Device addresses handling functions */
3646 
3647 static void dev_addr_flush(struct net_device *dev)
3648 {
3649 	/* rtnl_mutex must be held here */
3650 
3651 	__hw_addr_flush(&dev->dev_addr_list);
3652 	dev->dev_addr = NULL;
3653 }
3654 
3655 static int dev_addr_init(struct net_device *dev)
3656 {
3657 	unsigned char addr[MAX_ADDR_LEN];
3658 	struct netdev_hw_addr *ha;
3659 	int err;
3660 
3661 	/* rtnl_mutex must be held here */
3662 
3663 	INIT_LIST_HEAD(&dev->dev_addr_list);
3664 	memset(addr, 0, sizeof(addr));
3665 	err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(addr),
3666 			    NETDEV_HW_ADDR_T_LAN);
3667 	if (!err) {
3668 		/*
3669 		 * Get the first (previously created) address from the list
3670 		 * and set dev_addr pointer to this location.
3671 		 */
3672 		ha = list_first_entry(&dev->dev_addr_list,
3673 				      struct netdev_hw_addr, list);
3674 		dev->dev_addr = ha->addr;
3675 	}
3676 	return err;
3677 }
3678 
3679 /**
3680  *	dev_addr_add	- Add a device address
3681  *	@dev: device
3682  *	@addr: address to add
3683  *	@addr_type: address type
3684  *
3685  *	Add a device address to the device or increase the reference count if
3686  *	it already exists.
3687  *
3688  *	The caller must hold the rtnl_mutex.
3689  */
3690 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3691 		 unsigned char addr_type)
3692 {
3693 	int err;
3694 
3695 	ASSERT_RTNL();
3696 
3697 	err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, dev->addr_len,
3698 			    addr_type);
3699 	if (!err)
3700 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3701 	return err;
3702 }
3703 EXPORT_SYMBOL(dev_addr_add);
3704 
3705 /**
3706  *	dev_addr_del	- Release a device address.
3707  *	@dev: device
3708  *	@addr: address to delete
3709  *	@addr_type: address type
3710  *
3711  *	Release reference to a device address and remove it from the device
3712  *	if the reference count drops to zero.
3713  *
3714  *	The caller must hold the rtnl_mutex.
3715  */
3716 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3717 		 unsigned char addr_type)
3718 {
3719 	int err;
3720 	struct netdev_hw_addr *ha;
3721 
3722 	ASSERT_RTNL();
3723 
3724 	/*
3725 	 * We can not remove the first address from the list because
3726 	 * dev->dev_addr points to that.
3727 	 */
3728 	ha = list_first_entry(&dev->dev_addr_list, struct netdev_hw_addr, list);
3729 	if (ha->addr == dev->dev_addr && ha->refcount == 1)
3730 		return -ENOENT;
3731 
3732 	err = __hw_addr_del(&dev->dev_addr_list, NULL, addr, dev->addr_len,
3733 			    addr_type);
3734 	if (!err)
3735 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3736 	return err;
3737 }
3738 EXPORT_SYMBOL(dev_addr_del);
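
/*
 * Example (illustrative sketch): secondary device addresses are added and
 * released with matching dev_addr_add()/dev_addr_del() calls under the
 * RTNL lock.  The address value below is an arbitrary assumption.
 *
 *	unsigned char addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *	ASSERT_RTNL();
 *	err = dev_addr_add(dev, addr, NETDEV_HW_ADDR_T_LAN);
 *	if (!err)
 *		err = dev_addr_del(dev, addr, NETDEV_HW_ADDR_T_LAN);
 */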
3739 
3740 /**
3741  *	dev_addr_add_multiple	- Add device addresses from another device
3742  *	@to_dev: device to which addresses will be added
3743  *	@from_dev: device from which addresses will be added
3744  *	@addr_type: address type - 0 means type will be used from from_dev
3745  *
3746  *	Add the device addresses of one device to another.
3747  *
3748  *	The caller must hold the rtnl_mutex.
3749  */
3750 int dev_addr_add_multiple(struct net_device *to_dev,
3751 			  struct net_device *from_dev,
3752 			  unsigned char addr_type)
3753 {
3754 	int err;
3755 
3756 	ASSERT_RTNL();
3757 
3758 	if (from_dev->addr_len != to_dev->addr_len)
3759 		return -EINVAL;
3760 	err = __hw_addr_add_multiple(&to_dev->dev_addr_list, NULL,
3761 				     &from_dev->dev_addr_list,
3762 				     to_dev->addr_len, addr_type);
3763 	if (!err)
3764 		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3765 	return err;
3766 }
3767 EXPORT_SYMBOL(dev_addr_add_multiple);
3768 
3769 /**
3770  *	dev_addr_del_multiple	- Delete device addresses by another device
3771  *	@to_dev: device where the addresses will be deleted
3772  *	@from_dev: device supplying the addresses to be deleted
3773  *	@addr_type: address type - 0 means type will be used from from_dev
3774  *
3775  *	Deletes the addresses in the to device that are listed in the from device.
3776  *
3777  *	The caller must hold the rtnl_mutex.
3778  */
3779 int dev_addr_del_multiple(struct net_device *to_dev,
3780 			  struct net_device *from_dev,
3781 			  unsigned char addr_type)
3782 {
3783 	ASSERT_RTNL();
3784 
3785 	if (from_dev->addr_len != to_dev->addr_len)
3786 		return -EINVAL;
3787 	__hw_addr_del_multiple(&to_dev->dev_addr_list, NULL,
3788 			       &from_dev->dev_addr_list,
3789 			       to_dev->addr_len, addr_type);
3790 	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3791 	return 0;
3792 }
3793 EXPORT_SYMBOL(dev_addr_del_multiple);
3794 
3795 /* unicast and multicast addresses handling functions */
3796 
3797 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3798 		      void *addr, int alen, int glbl)
3799 {
3800 	struct dev_addr_list *da;
3801 
3802 	for (; (da = *list) != NULL; list = &da->next) {
3803 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3804 		    alen == da->da_addrlen) {
3805 			if (glbl) {
3806 				int old_glbl = da->da_gusers;
3807 				da->da_gusers = 0;
3808 				if (old_glbl == 0)
3809 					break;
3810 			}
3811 			if (--da->da_users)
3812 				return 0;
3813 
3814 			*list = da->next;
3815 			kfree(da);
3816 			(*count)--;
3817 			return 0;
3818 		}
3819 	}
3820 	return -ENOENT;
3821 }
3822 
3823 int __dev_addr_add(struct dev_addr_list **list, int *count,
3824 		   void *addr, int alen, int glbl)
3825 {
3826 	struct dev_addr_list *da;
3827 
3828 	for (da = *list; da != NULL; da = da->next) {
3829 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3830 		    da->da_addrlen == alen) {
3831 			if (glbl) {
3832 				int old_glbl = da->da_gusers;
3833 				da->da_gusers = 1;
3834 				if (old_glbl)
3835 					return 0;
3836 			}
3837 			da->da_users++;
3838 			return 0;
3839 		}
3840 	}
3841 
3842 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3843 	if (da == NULL)
3844 		return -ENOMEM;
3845 	memcpy(da->da_addr, addr, alen);
3846 	da->da_addrlen = alen;
3847 	da->da_users = 1;
3848 	da->da_gusers = glbl ? 1 : 0;
3849 	da->next = *list;
3850 	*list = da;
3851 	(*count)++;
3852 	return 0;
3853 }
3854 
3855 /**
3856  *	dev_unicast_delete	- Release secondary unicast address.
3857  *	@dev: device
3858  *	@addr: address to delete
3859  *
3860  *	Release reference to a secondary unicast address and remove it
3861  *	from the device if the reference count drops to zero.
3862  *
3863  * 	The caller must hold the rtnl_mutex.
3864  */
3865 int dev_unicast_delete(struct net_device *dev, void *addr)
3866 {
3867 	int err;
3868 
3869 	ASSERT_RTNL();
3870 
3871 	err = __hw_addr_del(&dev->uc_list, &dev->uc_count, addr,
3872 			    dev->addr_len, NETDEV_HW_ADDR_T_UNICAST);
3873 	if (!err)
3874 		__dev_set_rx_mode(dev);
3875 	return err;
3876 }
3877 EXPORT_SYMBOL(dev_unicast_delete);
3878 
3879 /**
3880  *	dev_unicast_add		- add a secondary unicast address
3881  *	@dev: device
3882  *	@addr: address to add
3883  *
3884  *	Add a secondary unicast address to the device or increase
3885  *	the reference count if it already exists.
3886  *
3887  *	The caller must hold the rtnl_mutex.
3888  */
3889 int dev_unicast_add(struct net_device *dev, void *addr)
3890 {
3891 	int err;
3892 
3893 	ASSERT_RTNL();
3894 
3895 	err = __hw_addr_add(&dev->uc_list, &dev->uc_count, addr,
3896 			    dev->addr_len, NETDEV_HW_ADDR_T_UNICAST);
3897 	if (!err)
3898 		__dev_set_rx_mode(dev);
3899 	return err;
3900 }
3901 EXPORT_SYMBOL(dev_unicast_add);
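
/*
 * Usage sketch (illustrative only): how a hypothetical driver or protocol
 * would pin an extra secondary unicast address on a device and drop it
 * again later.  Both calls must be made under the rtnl_mutex and are
 * reference-counted, so adds and deletes must be balanced.
 */
#if 0
static int example_add_secondary_unicast(struct net_device *dev,
					 const u8 *mac)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, (void *)mac);
	rtnl_unlock();
	return err;
}

static void example_del_secondary_unicast(struct net_device *dev,
					  const u8 *mac)
{
	rtnl_lock();
	dev_unicast_delete(dev, (void *)mac);
	rtnl_unlock();
}
#endif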
3902 
3903 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3904 		    struct dev_addr_list **from, int *from_count)
3905 {
3906 	struct dev_addr_list *da, *next;
3907 	int err = 0;
3908 
3909 	da = *from;
3910 	while (da != NULL) {
3911 		next = da->next;
3912 		if (!da->da_synced) {
3913 			err = __dev_addr_add(to, to_count,
3914 					     da->da_addr, da->da_addrlen, 0);
3915 			if (err < 0)
3916 				break;
3917 			da->da_synced = 1;
3918 			da->da_users++;
3919 		} else if (da->da_users == 1) {
3920 			__dev_addr_delete(to, to_count,
3921 					  da->da_addr, da->da_addrlen, 0);
3922 			__dev_addr_delete(from, from_count,
3923 					  da->da_addr, da->da_addrlen, 0);
3924 		}
3925 		da = next;
3926 	}
3927 	return err;
3928 }
3929 
3930 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3931 		       struct dev_addr_list **from, int *from_count)
3932 {
3933 	struct dev_addr_list *da, *next;
3934 
3935 	da = *from;
3936 	while (da != NULL) {
3937 		next = da->next;
3938 		if (da->da_synced) {
3939 			__dev_addr_delete(to, to_count,
3940 					  da->da_addr, da->da_addrlen, 0);
3941 			da->da_synced = 0;
3942 			__dev_addr_delete(from, from_count,
3943 					  da->da_addr, da->da_addrlen, 0);
3944 		}
3945 		da = next;
3946 	}
3947 }
3948 
3949 /**
3950  *	dev_unicast_sync - Synchronize device's unicast list to another device
3951  *	@to: destination device
3952  *	@from: source device
3953  *
3954  *	Add newly added addresses to the destination device and release
3955  *	addresses that have no users left.
3956  *
3957  *	This function is intended to be called from the dev->set_rx_mode
3958  *	function of layered software devices.
3959  */
3960 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3961 {
3962 	int err = 0;
3963 
3964 	ASSERT_RTNL();
3965 
3966 	if (to->addr_len != from->addr_len)
3967 		return -EINVAL;
3968 
3969 	err = __hw_addr_sync(&to->uc_list, &to->uc_count,
3970 			     &from->uc_list, &from->uc_count, to->addr_len);
3971 	if (!err)
3972 		__dev_set_rx_mode(to);
3973 	return err;
3974 }
3975 EXPORT_SYMBOL(dev_unicast_sync);
3976 
3977 /**
3978  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3979  *	@to: destination device
3980  *	@from: source device
3981  *
3982  *	Remove all addresses that were added to the destination device by
3983  *	dev_unicast_sync(). This function is intended to be called from the
3984  *	dev->stop function of layered software devices.
3985  */
3986 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3987 {
3988 	ASSERT_RTNL();
3989 
3990 	if (to->addr_len != from->addr_len)
3991 		return;
3992 
3993 	__hw_addr_unsync(&to->uc_list, &to->uc_count,
3994 			 &from->uc_list, &from->uc_count, to->addr_len);
3995 	__dev_set_rx_mode(to);
3996 }
3997 EXPORT_SYMBOL(dev_unicast_unsync);
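
/*
 * Usage sketch (illustrative only): the intended calling pattern for a
 * hypothetical layered (VLAN-like) device "upper" stacked on a real device
 * "lower", as described in the kerneldoc above.  The sync happens from the
 * upper device's ndo_set_rx_mode, the unsync from its ndo_stop.
 */
#if 0
struct example_priv {
	struct net_device *lower;	/* hypothetical lower device */
};

static void example_upper_set_rx_mode(struct net_device *upper)
{
	struct example_priv *p = netdev_priv(upper);

	dev_unicast_sync(p->lower, upper);
}

static int example_upper_stop(struct net_device *upper)
{
	struct example_priv *p = netdev_priv(upper);

	dev_unicast_unsync(p->lower, upper);
	return 0;
}
#endif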
3998 
3999 static void dev_unicast_flush(struct net_device *dev)
4000 {
4001 	/* rtnl_mutex must be held here */
4002 
4003 	__hw_addr_flush(&dev->uc_list);
4004 	dev->uc_count = 0;
4005 }
4006 
4007 static void dev_unicast_init(struct net_device *dev)
4008 {
4009 	/* rtnl_mutex must be held here */
4010 
4011 	INIT_LIST_HEAD(&dev->uc_list);
4012 }
4013 
4014 
4015 static void __dev_addr_discard(struct dev_addr_list **list)
4016 {
4017 	struct dev_addr_list *tmp;
4018 
4019 	while (*list != NULL) {
4020 		tmp = *list;
4021 		*list = tmp->next;
4022 		if (tmp->da_users > tmp->da_gusers)
4023 			printk("__dev_addr_discard: address leakage! "
4024 			       "da_users=%d\n", tmp->da_users);
4025 		kfree(tmp);
4026 	}
4027 }
4028 
4029 static void dev_addr_discard(struct net_device *dev)
4030 {
4031 	netif_addr_lock_bh(dev);
4032 
4033 	__dev_addr_discard(&dev->mc_list);
4034 	dev->mc_count = 0;
4035 
4036 	netif_addr_unlock_bh(dev);
4037 }
4038 
4039 /**
4040  *	dev_get_flags - get flags reported to userspace
4041  *	@dev: device
4042  *
4043  *	Get the combination of flag bits exported through APIs to userspace.
4044  */
4045 unsigned dev_get_flags(const struct net_device *dev)
4046 {
4047 	unsigned flags;
4048 
4049 	flags = (dev->flags & ~(IFF_PROMISC |
4050 				IFF_ALLMULTI |
4051 				IFF_RUNNING |
4052 				IFF_LOWER_UP |
4053 				IFF_DORMANT)) |
4054 		(dev->gflags & (IFF_PROMISC |
4055 				IFF_ALLMULTI));
4056 
4057 	if (netif_running(dev)) {
4058 		if (netif_oper_up(dev))
4059 			flags |= IFF_RUNNING;
4060 		if (netif_carrier_ok(dev))
4061 			flags |= IFF_LOWER_UP;
4062 		if (netif_dormant(dev))
4063 			flags |= IFF_DORMANT;
4064 	}
4065 
4066 	return flags;
4067 }
4068 
4069 /**
4070  *	dev_change_flags - change device settings
4071  *	@dev: device
4072  *	@flags: device state flags
4073  *
4074  *	Change settings on device based state flags. The flags are
4075  *	in the userspace exported format.
4076  */
4077 int dev_change_flags(struct net_device *dev, unsigned flags)
4078 {
4079 	int ret, changes;
4080 	int old_flags = dev->flags;
4081 
4082 	ASSERT_RTNL();
4083 
4084 	/*
4085 	 *	Set the flags on our device.
4086 	 */
4087 
4088 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4089 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4090 			       IFF_AUTOMEDIA)) |
4091 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4092 				    IFF_ALLMULTI));
4093 
4094 	/*
4095 	 *	Load in the correct multicast list now the flags have changed.
4096 	 */
4097 
4098 	if ((old_flags ^ flags) & IFF_MULTICAST)
4099 		dev_change_rx_flags(dev, IFF_MULTICAST);
4100 
4101 	dev_set_rx_mode(dev);
4102 
4103 	/*
4104 	 *	Have we downed the interface? We handle IFF_UP ourselves
4105 	 *	according to user attempts to set it, rather than blindly
4106 	 *	setting it.
4107 	 */
4108 
4109 	ret = 0;
4110 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different? */
4111 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4112 
4113 		if (!ret)
4114 			dev_set_rx_mode(dev);
4115 	}
4116 
4117 	if (dev->flags & IFF_UP &&
4118 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4119 					  IFF_VOLATILE)))
4120 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4121 
4122 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4123 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
4124 		dev->gflags ^= IFF_PROMISC;
4125 		dev_set_promiscuity(dev, inc);
4126 	}
4127 
4128 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4129 	   is important. Some (broken) drivers set IFF_PROMISC when
4130 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4131 	 */
4132 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4133 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
4134 		dev->gflags ^= IFF_ALLMULTI;
4135 		dev_set_allmulti(dev, inc);
4136 	}
4137 
4138 	/* Exclude state transition flags, already notified */
4139 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4140 	if (changes)
4141 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4142 
4143 	return ret;
4144 }
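
/*
 * Usage sketch (illustrative only): bringing an interface administratively
 * up the same way the SIOCSIFFLAGS path below does, i.e. by feeding the
 * userspace-format flags back through dev_change_flags() under RTNL.
 */
#if 0
static int example_set_iff_up(struct net_device *dev)
{
	unsigned flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);	/* userspace-visible flag format */
	err = dev_change_flags(dev, flags | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif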
4145 
4146 /**
4147  *	dev_set_mtu - Change maximum transfer unit
4148  *	@dev: device
4149  *	@new_mtu: new transfer unit
4150  *
4151  *	Change the maximum transfer size of the network device.
4152  */
4153 int dev_set_mtu(struct net_device *dev, int new_mtu)
4154 {
4155 	const struct net_device_ops *ops = dev->netdev_ops;
4156 	int err;
4157 
4158 	if (new_mtu == dev->mtu)
4159 		return 0;
4160 
4161 	/*	MTU must not be negative.	 */
4162 	if (new_mtu < 0)
4163 		return -EINVAL;
4164 
4165 	if (!netif_device_present(dev))
4166 		return -ENODEV;
4167 
4168 	err = 0;
4169 	if (ops->ndo_change_mtu)
4170 		err = ops->ndo_change_mtu(dev, new_mtu);
4171 	else
4172 		dev->mtu = new_mtu;
4173 
4174 	if (!err && dev->flags & IFF_UP)
4175 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4176 	return err;
4177 }
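
/*
 * Usage sketch (illustrative only): a hypothetical in-kernel caller raising
 * a device's MTU, mirroring what the SIOCSIFMTU handler below does.  The
 * helper validates the value and sends the NETDEV_CHANGEMTU notification.
 */
#if 0
static int example_raise_mtu(struct net_device *dev, int mtu)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, mtu);
	rtnl_unlock();
	return err;
}
#endif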
4178 
4179 /**
4180  *	dev_set_mac_address - Change Media Access Control Address
4181  *	@dev: device
4182  *	@sa: new address
4183  *
4184  *	Change the hardware (MAC) address of the device
4185  */
4186 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4187 {
4188 	const struct net_device_ops *ops = dev->netdev_ops;
4189 	int err;
4190 
4191 	if (!ops->ndo_set_mac_address)
4192 		return -EOPNOTSUPP;
4193 	if (sa->sa_family != dev->type)
4194 		return -EINVAL;
4195 	if (!netif_device_present(dev))
4196 		return -ENODEV;
4197 	err = ops->ndo_set_mac_address(dev, sa);
4198 	if (!err)
4199 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4200 	return err;
4201 }
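
/*
 * Usage sketch (illustrative only): setting a new MAC on an Ethernet-type
 * device from RTNL context.  sa_family must match dev->type and sa_data
 * carries the address bytes; the helper names are hypothetical.
 */
#if 0
static int example_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif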
4202 
4203 /*
4204  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4205  */
4206 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4207 {
4208 	int err;
4209 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4210 
4211 	if (!dev)
4212 		return -ENODEV;
4213 
4214 	switch (cmd) {
4215 		case SIOCGIFFLAGS:	/* Get interface flags */
4216 			ifr->ifr_flags = (short) dev_get_flags(dev);
4217 			return 0;
4218 
4219 		case SIOCGIFMETRIC:	/* Get the metric on the interface
4220 					   (currently unused) */
4221 			ifr->ifr_metric = 0;
4222 			return 0;
4223 
4224 		case SIOCGIFMTU:	/* Get the MTU of a device */
4225 			ifr->ifr_mtu = dev->mtu;
4226 			return 0;
4227 
4228 		case SIOCGIFHWADDR:
4229 			if (!dev->addr_len)
4230 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4231 			else
4232 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4233 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4234 			ifr->ifr_hwaddr.sa_family = dev->type;
4235 			return 0;
4236 
4237 		case SIOCGIFSLAVE:
4238 			err = -EINVAL;
4239 			break;
4240 
4241 		case SIOCGIFMAP:
4242 			ifr->ifr_map.mem_start = dev->mem_start;
4243 			ifr->ifr_map.mem_end   = dev->mem_end;
4244 			ifr->ifr_map.base_addr = dev->base_addr;
4245 			ifr->ifr_map.irq       = dev->irq;
4246 			ifr->ifr_map.dma       = dev->dma;
4247 			ifr->ifr_map.port      = dev->if_port;
4248 			return 0;
4249 
4250 		case SIOCGIFINDEX:
4251 			ifr->ifr_ifindex = dev->ifindex;
4252 			return 0;
4253 
4254 		case SIOCGIFTXQLEN:
4255 			ifr->ifr_qlen = dev->tx_queue_len;
4256 			return 0;
4257 
4258 		default:
4259 			/* dev_ioctl() should ensure this case
4260 			 * is never reached
4261 			 */
4262 			WARN_ON(1);
4263 			err = -EINVAL;
4264 			break;
4265 
4266 	}
4267 	return err;
4268 }
4269 
4270 /*
4271  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4272  */
4273 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4274 {
4275 	int err;
4276 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4277 	const struct net_device_ops *ops;
4278 
4279 	if (!dev)
4280 		return -ENODEV;
4281 
4282 	ops = dev->netdev_ops;
4283 
4284 	switch (cmd) {
4285 		case SIOCSIFFLAGS:	/* Set interface flags */
4286 			return dev_change_flags(dev, ifr->ifr_flags);
4287 
4288 		case SIOCSIFMETRIC:	/* Set the metric on the interface
4289 					   (currently unused) */
4290 			return -EOPNOTSUPP;
4291 
4292 		case SIOCSIFMTU:	/* Set the MTU of a device */
4293 			return dev_set_mtu(dev, ifr->ifr_mtu);
4294 
4295 		case SIOCSIFHWADDR:
4296 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4297 
4298 		case SIOCSIFHWBROADCAST:
4299 			if (ifr->ifr_hwaddr.sa_family != dev->type)
4300 				return -EINVAL;
4301 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4302 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4303 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4304 			return 0;
4305 
4306 		case SIOCSIFMAP:
4307 			if (ops->ndo_set_config) {
4308 				if (!netif_device_present(dev))
4309 					return -ENODEV;
4310 				return ops->ndo_set_config(dev, &ifr->ifr_map);
4311 			}
4312 			return -EOPNOTSUPP;
4313 
4314 		case SIOCADDMULTI:
4315 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4316 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4317 				return -EINVAL;
4318 			if (!netif_device_present(dev))
4319 				return -ENODEV;
4320 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4321 					  dev->addr_len, 1);
4322 
4323 		case SIOCDELMULTI:
4324 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4325 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4326 				return -EINVAL;
4327 			if (!netif_device_present(dev))
4328 				return -ENODEV;
4329 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4330 					     dev->addr_len, 1);
4331 
4332 		case SIOCSIFTXQLEN:
4333 			if (ifr->ifr_qlen < 0)
4334 				return -EINVAL;
4335 			dev->tx_queue_len = ifr->ifr_qlen;
4336 			return 0;
4337 
4338 		case SIOCSIFNAME:
4339 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4340 			return dev_change_name(dev, ifr->ifr_newname);
4341 
4342 		/*
4343 		 *	Unknown or private ioctl
4344 		 */
4345 
4346 		default:
4347 			if ((cmd >= SIOCDEVPRIVATE &&
4348 			    cmd <= SIOCDEVPRIVATE + 15) ||
4349 			    cmd == SIOCBONDENSLAVE ||
4350 			    cmd == SIOCBONDRELEASE ||
4351 			    cmd == SIOCBONDSETHWADDR ||
4352 			    cmd == SIOCBONDSLAVEINFOQUERY ||
4353 			    cmd == SIOCBONDINFOQUERY ||
4354 			    cmd == SIOCBONDCHANGEACTIVE ||
4355 			    cmd == SIOCGMIIPHY ||
4356 			    cmd == SIOCGMIIREG ||
4357 			    cmd == SIOCSMIIREG ||
4358 			    cmd == SIOCBRADDIF ||
4359 			    cmd == SIOCBRDELIF ||
4360 			    cmd == SIOCSHWTSTAMP ||
4361 			    cmd == SIOCWANDEV) {
4362 				err = -EOPNOTSUPP;
4363 				if (ops->ndo_do_ioctl) {
4364 					if (netif_device_present(dev))
4365 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
4366 					else
4367 						err = -ENODEV;
4368 				}
4369 			} else
4370 				err = -EINVAL;
4371 
4372 	}
4373 	return err;
4374 }
4375 
4376 /*
4377  *	This function handles all "interface"-type I/O control requests. The actual
4378  *	'doing' part of this is dev_ifsioc above.
4379  */
4380 
4381 /**
4382  *	dev_ioctl	-	network device ioctl
4383  *	@net: the applicable net namespace
4384  *	@cmd: command to issue
4385  *	@arg: pointer to a struct ifreq in user space
4386  *
4387  *	Issue ioctl functions to devices. This is normally called by the
4388  *	user space syscall interfaces but can sometimes be useful for
4389  *	other purposes. The return value is the return from the syscall if
4390  *	positive or a negative errno code on error.
4391  */
4392 
4393 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4394 {
4395 	struct ifreq ifr;
4396 	int ret;
4397 	char *colon;
4398 
4399 	/* One special case: SIOCGIFCONF takes ifconf argument
4400 	   and requires shared lock, because it sleeps writing
4401 	   to user space.
4402 	 */
4403 
4404 	if (cmd == SIOCGIFCONF) {
4405 		rtnl_lock();
4406 		ret = dev_ifconf(net, (char __user *) arg);
4407 		rtnl_unlock();
4408 		return ret;
4409 	}
4410 	if (cmd == SIOCGIFNAME)
4411 		return dev_ifname(net, (struct ifreq __user *)arg);
4412 
4413 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4414 		return -EFAULT;
4415 
4416 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4417 
4418 	colon = strchr(ifr.ifr_name, ':');
4419 	if (colon)
4420 		*colon = 0;
4421 
4422 	/*
4423 	 *	See which interface the caller is talking about.
4424 	 */
4425 
4426 	switch (cmd) {
4427 		/*
4428 		 *	These ioctl calls:
4429 		 *	- can be done by all.
4430 		 *	- atomic and do not require locking.
4431 		 *	- return a value
4432 		 */
4433 		case SIOCGIFFLAGS:
4434 		case SIOCGIFMETRIC:
4435 		case SIOCGIFMTU:
4436 		case SIOCGIFHWADDR:
4437 		case SIOCGIFSLAVE:
4438 		case SIOCGIFMAP:
4439 		case SIOCGIFINDEX:
4440 		case SIOCGIFTXQLEN:
4441 			dev_load(net, ifr.ifr_name);
4442 			read_lock(&dev_base_lock);
4443 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4444 			read_unlock(&dev_base_lock);
4445 			if (!ret) {
4446 				if (colon)
4447 					*colon = ':';
4448 				if (copy_to_user(arg, &ifr,
4449 						 sizeof(struct ifreq)))
4450 					ret = -EFAULT;
4451 			}
4452 			return ret;
4453 
4454 		case SIOCETHTOOL:
4455 			dev_load(net, ifr.ifr_name);
4456 			rtnl_lock();
4457 			ret = dev_ethtool(net, &ifr);
4458 			rtnl_unlock();
4459 			if (!ret) {
4460 				if (colon)
4461 					*colon = ':';
4462 				if (copy_to_user(arg, &ifr,
4463 						 sizeof(struct ifreq)))
4464 					ret = -EFAULT;
4465 			}
4466 			return ret;
4467 
4468 		/*
4469 		 *	These ioctl calls:
4470 		 *	- require superuser power.
4471 		 *	- require strict serialization.
4472 		 *	- return a value
4473 		 */
4474 		case SIOCGMIIPHY:
4475 		case SIOCGMIIREG:
4476 		case SIOCSIFNAME:
4477 			if (!capable(CAP_NET_ADMIN))
4478 				return -EPERM;
4479 			dev_load(net, ifr.ifr_name);
4480 			rtnl_lock();
4481 			ret = dev_ifsioc(net, &ifr, cmd);
4482 			rtnl_unlock();
4483 			if (!ret) {
4484 				if (colon)
4485 					*colon = ':';
4486 				if (copy_to_user(arg, &ifr,
4487 						 sizeof(struct ifreq)))
4488 					ret = -EFAULT;
4489 			}
4490 			return ret;
4491 
4492 		/*
4493 		 *	These ioctl calls:
4494 		 *	- require superuser power.
4495 		 *	- require strict serialization.
4496 		 *	- do not return a value
4497 		 */
4498 		case SIOCSIFFLAGS:
4499 		case SIOCSIFMETRIC:
4500 		case SIOCSIFMTU:
4501 		case SIOCSIFMAP:
4502 		case SIOCSIFHWADDR:
4503 		case SIOCSIFSLAVE:
4504 		case SIOCADDMULTI:
4505 		case SIOCDELMULTI:
4506 		case SIOCSIFHWBROADCAST:
4507 		case SIOCSIFTXQLEN:
4508 		case SIOCSMIIREG:
4509 		case SIOCBONDENSLAVE:
4510 		case SIOCBONDRELEASE:
4511 		case SIOCBONDSETHWADDR:
4512 		case SIOCBONDCHANGEACTIVE:
4513 		case SIOCBRADDIF:
4514 		case SIOCBRDELIF:
4515 		case SIOCSHWTSTAMP:
4516 			if (!capable(CAP_NET_ADMIN))
4517 				return -EPERM;
4518 			/* fall through */
4519 		case SIOCBONDSLAVEINFOQUERY:
4520 		case SIOCBONDINFOQUERY:
4521 			dev_load(net, ifr.ifr_name);
4522 			rtnl_lock();
4523 			ret = dev_ifsioc(net, &ifr, cmd);
4524 			rtnl_unlock();
4525 			return ret;
4526 
4527 		case SIOCGIFMEM:
4528 			/* Get the per device memory space. We can add this but
4529 			 * currently do not support it */
4530 		case SIOCSIFMEM:
4531 			/* Set the per device memory buffer space.
4532 			 * Not applicable in our case */
4533 		case SIOCSIFLINK:
4534 			return -EINVAL;
4535 
4536 		/*
4537 		 *	Unknown or private ioctl.
4538 		 */
4539 		default:
4540 			if (cmd == SIOCWANDEV ||
4541 			    (cmd >= SIOCDEVPRIVATE &&
4542 			     cmd <= SIOCDEVPRIVATE + 15)) {
4543 				dev_load(net, ifr.ifr_name);
4544 				rtnl_lock();
4545 				ret = dev_ifsioc(net, &ifr, cmd);
4546 				rtnl_unlock();
4547 				if (!ret && copy_to_user(arg, &ifr,
4548 							 sizeof(struct ifreq)))
4549 					ret = -EFAULT;
4550 				return ret;
4551 			}
4552 			/* Take care of Wireless Extensions */
4553 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4554 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4555 			return -EINVAL;
4556 	}
4557 }
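
/*
 * Usage sketch (illustrative only): how the handlers above are normally
 * reached from user space - an AF_INET datagram socket plus an ifreq-based
 * ioctl.  This is user-space code, shown here purely to illustrate the
 * entry point into dev_ioctl(); the function name is hypothetical.
 */
#if 0
/* user space, not kernel code */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

int get_mtu(const char *ifname)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return ifr.ifr_mtu;
}
#endif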
4558 
4559 
4560 /**
4561  *	dev_new_index	-	allocate an ifindex
4562  *	@net: the applicable net namespace
4563  *
4564  *	Returns a suitable unique value for a new device interface
4565  *	number.  The caller must hold the rtnl semaphore or the
4566  *	dev_base_lock to be sure it remains unique.
4567  */
4568 static int dev_new_index(struct net *net)
4569 {
4570 	static int ifindex;
4571 	for (;;) {
4572 		if (++ifindex <= 0)
4573 			ifindex = 1;
4574 		if (!__dev_get_by_index(net, ifindex))
4575 			return ifindex;
4576 	}
4577 }
4578 
4579 /* Delayed registration/unregistration */
4580 static LIST_HEAD(net_todo_list);
4581 
4582 static void net_set_todo(struct net_device *dev)
4583 {
4584 	list_add_tail(&dev->todo_list, &net_todo_list);
4585 }
4586 
4587 static void rollback_registered(struct net_device *dev)
4588 {
4589 	BUG_ON(dev_boot_phase);
4590 	ASSERT_RTNL();
4591 
4592 	/* Some devices call this without ever registering, to unwind a failed init. */
4593 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4594 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4595 				  "was registered\n", dev->name, dev);
4596 
4597 		WARN_ON(1);
4598 		return;
4599 	}
4600 
4601 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4602 
4603 	/* If device is running, close it first. */
4604 	dev_close(dev);
4605 
4606 	/* And unlink it from device chain. */
4607 	unlist_netdevice(dev);
4608 
4609 	dev->reg_state = NETREG_UNREGISTERING;
4610 
4611 	synchronize_net();
4612 
4613 	/* Shutdown queueing discipline. */
4614 	dev_shutdown(dev);
4615 
4616 
4617 	/* Notify protocols that we are about to destroy
4618 	   this device. They should clean all the things.
4619 	*/
4620 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4621 
4622 	/*
4623 	 *	Flush the unicast and multicast chains
4624 	 */
4625 	dev_unicast_flush(dev);
4626 	dev_addr_discard(dev);
4627 
4628 	if (dev->netdev_ops->ndo_uninit)
4629 		dev->netdev_ops->ndo_uninit(dev);
4630 
4631 	/* Notifier chain MUST detach us from master device. */
4632 	WARN_ON(dev->master);
4633 
4634 	/* Remove entries from kobject tree */
4635 	netdev_unregister_kobject(dev);
4636 
4637 	synchronize_net();
4638 
4639 	dev_put(dev);
4640 }
4641 
4642 static void __netdev_init_queue_locks_one(struct net_device *dev,
4643 					  struct netdev_queue *dev_queue,
4644 					  void *_unused)
4645 {
4646 	spin_lock_init(&dev_queue->_xmit_lock);
4647 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4648 	dev_queue->xmit_lock_owner = -1;
4649 }
4650 
4651 static void netdev_init_queue_locks(struct net_device *dev)
4652 {
4653 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4654 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4655 }
4656 
4657 unsigned long netdev_fix_features(unsigned long features, const char *name)
4658 {
4659 	/* Fix illegal SG+CSUM combinations. */
4660 	if ((features & NETIF_F_SG) &&
4661 	    !(features & NETIF_F_ALL_CSUM)) {
4662 		if (name)
4663 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4664 			       "checksum feature.\n", name);
4665 		features &= ~NETIF_F_SG;
4666 	}
4667 
4668 	/* TSO requires that SG is present as well. */
4669 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4670 		if (name)
4671 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4672 			       "SG feature.\n", name);
4673 		features &= ~NETIF_F_TSO;
4674 	}
4675 
4676 	if (features & NETIF_F_UFO) {
4677 		if (!(features & NETIF_F_GEN_CSUM)) {
4678 			if (name)
4679 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4680 				       "since no NETIF_F_HW_CSUM feature.\n",
4681 				       name);
4682 			features &= ~NETIF_F_UFO;
4683 		}
4684 
4685 		if (!(features & NETIF_F_SG)) {
4686 			if (name)
4687 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4688 				       "since no NETIF_F_SG feature.\n", name);
4689 			features &= ~NETIF_F_UFO;
4690 		}
4691 	}
4692 
4693 	return features;
4694 }
4695 EXPORT_SYMBOL(netdev_fix_features);
4696 
4697 /**
4698  *	register_netdevice	- register a network device
4699  *	@dev: device to register
4700  *
4701  *	Take a completed network device structure and add it to the kernel
4702  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4703  *	chain. 0 is returned on success. A negative errno code is returned
4704  *	on a failure to set up the device, or if the name is a duplicate.
4705  *
4706  *	Callers must hold the rtnl semaphore. You may want
4707  *	register_netdev() instead of this.
4708  *
4709  *	BUGS:
4710  *	The locking appears insufficient to guarantee two parallel registers
4711  *	will not get the same name.
4712  */
4713 
4714 int register_netdevice(struct net_device *dev)
4715 {
4716 	struct hlist_head *head;
4717 	struct hlist_node *p;
4718 	int ret;
4719 	struct net *net = dev_net(dev);
4720 
4721 	BUG_ON(dev_boot_phase);
4722 	ASSERT_RTNL();
4723 
4724 	might_sleep();
4725 
4726 	/* When net_device's are persistent, this will be fatal. */
4727 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4728 	BUG_ON(!net);
4729 
4730 	spin_lock_init(&dev->addr_list_lock);
4731 	netdev_set_addr_lockdep_class(dev);
4732 	netdev_init_queue_locks(dev);
4733 
4734 	dev->iflink = -1;
4735 
4736 	/* Init, if this function is available */
4737 	if (dev->netdev_ops->ndo_init) {
4738 		ret = dev->netdev_ops->ndo_init(dev);
4739 		if (ret) {
4740 			if (ret > 0)
4741 				ret = -EIO;
4742 			goto out;
4743 		}
4744 	}
4745 
4746 	if (!dev_valid_name(dev->name)) {
4747 		ret = -EINVAL;
4748 		goto err_uninit;
4749 	}
4750 
4751 	dev->ifindex = dev_new_index(net);
4752 	if (dev->iflink == -1)
4753 		dev->iflink = dev->ifindex;
4754 
4755 	/* Check for existence of name */
4756 	head = dev_name_hash(net, dev->name);
4757 	hlist_for_each(p, head) {
4758 		struct net_device *d
4759 			= hlist_entry(p, struct net_device, name_hlist);
4760 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4761 			ret = -EEXIST;
4762 			goto err_uninit;
4763 		}
4764 	}
4765 
4766 	/* Fix illegal checksum combinations */
4767 	if ((dev->features & NETIF_F_HW_CSUM) &&
4768 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4769 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4770 		       dev->name);
4771 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4772 	}
4773 
4774 	if ((dev->features & NETIF_F_NO_CSUM) &&
4775 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4776 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4777 		       dev->name);
4778 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4779 	}
4780 
4781 	dev->features = netdev_fix_features(dev->features, dev->name);
4782 
4783 	/* Enable software GSO if SG is supported. */
4784 	if (dev->features & NETIF_F_SG)
4785 		dev->features |= NETIF_F_GSO;
4786 
4787 	netdev_initialize_kobject(dev);
4788 	ret = netdev_register_kobject(dev);
4789 	if (ret)
4790 		goto err_uninit;
4791 	dev->reg_state = NETREG_REGISTERED;
4792 
4793 	/*
4794 	 *	Default initial state at registry is that the
4795 	 *	device is present.
4796 	 */
4797 
4798 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4799 
4800 	dev_init_scheduler(dev);
4801 	dev_hold(dev);
4802 	list_netdevice(dev);
4803 
4804 	/* Notify protocols, that a new device appeared. */
4805 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4806 	ret = notifier_to_errno(ret);
4807 	if (ret) {
4808 		rollback_registered(dev);
4809 		dev->reg_state = NETREG_UNREGISTERED;
4810 	}
4811 
4812 out:
4813 	return ret;
4814 
4815 err_uninit:
4816 	if (dev->netdev_ops->ndo_uninit)
4817 		dev->netdev_ops->ndo_uninit(dev);
4818 	goto out;
4819 }
4820 
4821 /**
4822  *	init_dummy_netdev	- init a dummy network device for NAPI
4823  *	@dev: device to init
4824  *
4825  *	This takes a network device structure and initializes the minimum
4826  *	number of fields so it can be used to schedule NAPI polls without
4827  *	registering a full blown interface. This is to be used by drivers
4828  *	that need to tie several hardware interfaces to a single NAPI
4829  *	poll scheduler due to HW limitations.
4830  */
4831 int init_dummy_netdev(struct net_device *dev)
4832 {
4833 	/* Clear everything. Note we don't initialize spinlocks
4834 	 * as they aren't supposed to be taken by any of the
4835 	 * NAPI code and this dummy netdev is supposed to be
4836 	 * only ever used for NAPI polls
4837 	 */
4838 	memset(dev, 0, sizeof(struct net_device));
4839 
4840 	/* make sure we BUG if trying to hit standard
4841 	 * register/unregister code path
4842 	 */
4843 	dev->reg_state = NETREG_DUMMY;
4844 
4845 	/* initialize the ref count */
4846 	atomic_set(&dev->refcnt, 1);
4847 
4848 	/* NAPI wants this */
4849 	INIT_LIST_HEAD(&dev->napi_list);
4850 
4851 	/* a dummy interface is started by default */
4852 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4853 	set_bit(__LINK_STATE_START, &dev->state);
4854 
4855 	return 0;
4856 }
4857 EXPORT_SYMBOL_GPL(init_dummy_netdev);
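
/*
 * Usage sketch (illustrative only): a hypothetical multi-port driver that
 * uses one dummy netdev purely as a NAPI anchor, as the kerneldoc above
 * describes.  example_poll is a hypothetical NAPI poll callback; the dummy
 * device is never registered and must not be used for real traffic.
 */
#if 0
struct example_adapter {
	struct net_device napi_dev;	/* dummy device, NAPI anchor only */
	struct napi_struct napi;
};

static void example_adapter_init(struct example_adapter *ad)
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, example_poll, 64);
	napi_enable(&ad->napi);
}
#endif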
4858 
4859 
4860 /**
4861  *	register_netdev	- register a network device
4862  *	@dev: device to register
4863  *
4864  *	Take a completed network device structure and add it to the kernel
4865  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4866  *	chain. 0 is returned on success. A negative errno code is returned
4867  *	on a failure to set up the device, or if the name is a duplicate.
4868  *
4869  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4870  *	and expands the device name if you passed a format string to
4871  *	alloc_netdev.
4872  */
4873 int register_netdev(struct net_device *dev)
4874 {
4875 	int err;
4876 
4877 	rtnl_lock();
4878 
4879 	/*
4880 	 * If the name is a format string the caller wants us to do a
4881 	 * name allocation.
4882 	 */
4883 	if (strchr(dev->name, '%')) {
4884 		err = dev_alloc_name(dev, dev->name);
4885 		if (err < 0)
4886 			goto out;
4887 	}
4888 
4889 	err = register_netdevice(dev);
4890 out:
4891 	rtnl_unlock();
4892 	return err;
4893 }
4894 EXPORT_SYMBOL(register_netdev);
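
/*
 * Usage sketch (illustrative only): the usual driver-side pairing of
 * alloc_etherdev()/register_netdev() on probe and
 * unregister_netdev()/free_netdev() on removal.  The private struct and
 * ops table are hypothetical; error handling is trimmed to the essentials.
 */
#if 0
static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &example_netdev_ops;	/* hypothetical ops table */

	err = register_netdev(dev);		/* takes RTNL internally */
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}

static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);			/* takes RTNL internally */
	free_netdev(dev);
}
#endif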
4895 
4896 /*
4897  * netdev_wait_allrefs - wait until all references are gone.
4898  *
4899  * This is called when unregistering network devices.
4900  *
4901  * Any protocol or device that holds a reference should register
4902  * for netdevice notification, and cleanup and put back the
4903  * reference if they receive an UNREGISTER event.
4904  * We can get stuck here if buggy protocols don't correctly
4905  * call dev_put.
4906  */
4907 static void netdev_wait_allrefs(struct net_device *dev)
4908 {
4909 	unsigned long rebroadcast_time, warning_time;
4910 
4911 	rebroadcast_time = warning_time = jiffies;
4912 	while (atomic_read(&dev->refcnt) != 0) {
4913 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4914 			rtnl_lock();
4915 
4916 			/* Rebroadcast unregister notification */
4917 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4918 
4919 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4920 				     &dev->state)) {
4921 				/* We must not have linkwatch events
4922 				 * pending on unregister. If this
4923 				 * happens, we simply run the queue
4924 				 * unscheduled, resulting in a noop
4925 				 * for this device.
4926 				 */
4927 				linkwatch_run_queue();
4928 			}
4929 
4930 			__rtnl_unlock();
4931 
4932 			rebroadcast_time = jiffies;
4933 		}
4934 
4935 		msleep(250);
4936 
4937 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4938 			printk(KERN_EMERG "unregister_netdevice: "
4939 			       "waiting for %s to become free. Usage "
4940 			       "count = %d\n",
4941 			       dev->name, atomic_read(&dev->refcnt));
4942 			warning_time = jiffies;
4943 		}
4944 	}
4945 }
4946 
4947 /* The sequence is:
4948  *
4949  *	rtnl_lock();
4950  *	...
4951  *	register_netdevice(x1);
4952  *	register_netdevice(x2);
4953  *	...
4954  *	unregister_netdevice(y1);
4955  *	unregister_netdevice(y2);
4956  *      ...
4957  *	rtnl_unlock();
4958  *	free_netdev(y1);
4959  *	free_netdev(y2);
4960  *
4961  * We are invoked by rtnl_unlock().
4962  * This allows us to deal with problems:
4963  * 1) We can delete sysfs objects which invoke hotplug
4964  *    without deadlocking with linkwatch via keventd.
4965  * 2) Since we run with the RTNL semaphore not held, we can sleep
4966  *    safely in order to wait for the netdev refcnt to drop to zero.
4967  *
4968  * We must not return until all unregister events added during
4969  * the interval the lock was held have been completed.
4970  */
4971 void netdev_run_todo(void)
4972 {
4973 	struct list_head list;
4974 
4975 	/* Snapshot list, allow later requests */
4976 	list_replace_init(&net_todo_list, &list);
4977 
4978 	__rtnl_unlock();
4979 
4980 	while (!list_empty(&list)) {
4981 		struct net_device *dev
4982 			= list_entry(list.next, struct net_device, todo_list);
4983 		list_del(&dev->todo_list);
4984 
4985 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4986 			printk(KERN_ERR "network todo '%s' but state %d\n",
4987 			       dev->name, dev->reg_state);
4988 			dump_stack();
4989 			continue;
4990 		}
4991 
4992 		dev->reg_state = NETREG_UNREGISTERED;
4993 
4994 		on_each_cpu(flush_backlog, dev, 1);
4995 
4996 		netdev_wait_allrefs(dev);
4997 
4998 		/* paranoia */
4999 		BUG_ON(atomic_read(&dev->refcnt));
5000 		WARN_ON(dev->ip_ptr);
5001 		WARN_ON(dev->ip6_ptr);
5002 		WARN_ON(dev->dn_ptr);
5003 
5004 		if (dev->destructor)
5005 			dev->destructor(dev);
5006 
5007 		/* Free network device */
5008 		kobject_put(&dev->dev.kobj);
5009 	}
5010 }
5011 
5012 /**
5013  *	dev_get_stats	- get network device statistics
5014  *	@dev: device to get statistics from
5015  *
5016  *	Get network statistics from device. The device driver may provide
5017  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
5018  *	the internal statistics structure is used.
5019  */
5020 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5021 {
5022 	const struct net_device_ops *ops = dev->netdev_ops;
5023 
5024 	if (ops->ndo_get_stats)
5025 		return ops->ndo_get_stats(dev);
5026 	else {
5027 		unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5028 		struct net_device_stats *stats = &dev->stats;
5029 		unsigned int i;
5030 		struct netdev_queue *txq;
5031 
5032 		for (i = 0; i < dev->num_tx_queues; i++) {
5033 			txq = netdev_get_tx_queue(dev, i);
5034 			tx_bytes   += txq->tx_bytes;
5035 			tx_packets += txq->tx_packets;
5036 			tx_dropped += txq->tx_dropped;
5037 		}
5038 		if (tx_bytes || tx_packets || tx_dropped) {
5039 			stats->tx_bytes   = tx_bytes;
5040 			stats->tx_packets = tx_packets;
5041 			stats->tx_dropped = tx_dropped;
5042 		}
5043 		return stats;
5044 	}
5045 }
5046 EXPORT_SYMBOL(dev_get_stats);
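
/*
 * Usage sketch (illustrative only): a reader such as a /proc or sysfs
 * handler pulling aggregated counters through dev_get_stats() rather than
 * reading dev->stats directly, so drivers providing ndo_get_stats are
 * honoured.
 */
#if 0
static void example_print_stats(struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	printk(KERN_INFO "%s: rx %lu tx %lu packets\n",
	       dev->name, stats->rx_packets, stats->tx_packets);
}
#endif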
5047 
5048 static void netdev_init_one_queue(struct net_device *dev,
5049 				  struct netdev_queue *queue,
5050 				  void *_unused)
5051 {
5052 	queue->dev = dev;
5053 }
5054 
5055 static void netdev_init_queues(struct net_device *dev)
5056 {
5057 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5058 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5059 	spin_lock_init(&dev->tx_global_lock);
5060 }
5061 
5062 /**
5063  *	alloc_netdev_mq - allocate network device
5064  *	@sizeof_priv:	size of private data to allocate space for
5065  *	@name:		device name format string
5066  *	@setup:		callback to initialize device
5067  *	@queue_count:	the number of subqueues to allocate
5068  *
5069  *	Allocates a struct net_device with private data area for driver use
5070  *	and performs basic initialization.  Also allocates subqueue structs
5071  *	for each queue on the device at the end of the netdevice.
5072  */
5073 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5074 		void (*setup)(struct net_device *), unsigned int queue_count)
5075 {
5076 	struct netdev_queue *tx;
5077 	struct net_device *dev;
5078 	size_t alloc_size;
5079 	struct net_device *p;
5080 
5081 	BUG_ON(strlen(name) >= sizeof(dev->name));
5082 
5083 	alloc_size = sizeof(struct net_device);
5084 	if (sizeof_priv) {
5085 		/* ensure 32-byte alignment of private area */
5086 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5087 		alloc_size += sizeof_priv;
5088 	}
5089 	/* ensure 32-byte alignment of whole construct */
5090 	alloc_size += NETDEV_ALIGN - 1;
5091 
5092 	p = kzalloc(alloc_size, GFP_KERNEL);
5093 	if (!p) {
5094 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5095 		return NULL;
5096 	}
5097 
5098 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5099 	if (!tx) {
5100 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5101 		       "tx qdiscs.\n");
5102 		goto free_p;
5103 	}
5104 
5105 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5106 	dev->padded = (char *)dev - (char *)p;
5107 
5108 	if (dev_addr_init(dev))
5109 		goto free_tx;
5110 
5111 	dev_unicast_init(dev);
5112 
5113 	dev_net_set(dev, &init_net);
5114 
5115 	dev->_tx = tx;
5116 	dev->num_tx_queues = queue_count;
5117 	dev->real_num_tx_queues = queue_count;
5118 
5119 	dev->gso_max_size = GSO_MAX_SIZE;
5120 
5121 	netdev_init_queues(dev);
5122 
5123 	INIT_LIST_HEAD(&dev->napi_list);
5124 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5125 	setup(dev);
5126 	strcpy(dev->name, name);
5127 	return dev;
5128 
5129 free_tx:
5130 	kfree(tx);
5131 
5132 free_p:
5133 	kfree(p);
5134 	return NULL;
5135 }
5136 EXPORT_SYMBOL(alloc_netdev_mq);
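
/*
 * Usage sketch (illustrative only): allocating a hypothetical four-queue
 * Ethernet-style device with alloc_netdev_mq().  Ethernet drivers normally
 * use the alloc_etherdev_mq() wrapper, but the raw call shows the
 * parameters the kerneldoc above describes; example_priv is hypothetical.
 */
#if 0
static struct net_device *example_alloc(void)
{
	return alloc_netdev_mq(sizeof(struct example_priv), "example%d",
			       ether_setup, 4);
}
#endif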
5137 
5138 /**
5139  *	free_netdev - free network device
5140  *	@dev: device
5141  *
5142  *	This function does the last stage of destroying an allocated device
5143  * 	interface. The reference to the device object is released.
5144  *	If this is the last reference then it will be freed.
5145  */
5146 void free_netdev(struct net_device *dev)
5147 {
5148 	struct napi_struct *p, *n;
5149 
5150 	release_net(dev_net(dev));
5151 
5152 	kfree(dev->_tx);
5153 
5154 	/* Flush device addresses */
5155 	dev_addr_flush(dev);
5156 
5157 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5158 		netif_napi_del(p);
5159 
5160 	/*  Compatibility with error handling in drivers */
5161 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5162 		kfree((char *)dev - dev->padded);
5163 		return;
5164 	}
5165 
5166 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5167 	dev->reg_state = NETREG_RELEASED;
5168 
5169 	/* will free via device release */
5170 	put_device(&dev->dev);
5171 }
5172 
5173 /**
5174  *	synchronize_net -  Synchronize with packet receive processing
5175  *
5176  *	Wait for packets currently being received to be done.
5177  *	Does not block later packets from starting.
5178  */
5179 void synchronize_net(void)
5180 {
5181 	might_sleep();
5182 	synchronize_rcu();
5183 }
5184 
5185 /**
5186  *	unregister_netdevice - remove device from the kernel
5187  *	@dev: device
5188  *
5189  *	This function shuts down a device interface and removes it
5190  *	from the kernel tables.
5191  *
5192  *	Callers must hold the rtnl semaphore.  You may want
5193  *	unregister_netdev() instead of this.
5194  */
5195 
5196 void unregister_netdevice(struct net_device *dev)
5197 {
5198 	ASSERT_RTNL();
5199 
5200 	rollback_registered(dev);
5201 	/* Finish processing unregister after unlock */
5202 	net_set_todo(dev);
5203 }
5204 
5205 /**
5206  *	unregister_netdev - remove device from the kernel
5207  *	@dev: device
5208  *
5209  *	This function shuts down a device interface and removes it
5210  *	from the kernel tables.
5211  *
5212  *	This is just a wrapper for unregister_netdevice that takes
5213  *	the rtnl semaphore.  In general you want to use this and not
5214  *	unregister_netdevice.
5215  */
5216 void unregister_netdev(struct net_device *dev)
5217 {
5218 	rtnl_lock();
5219 	unregister_netdevice(dev);
5220 	rtnl_unlock();
5221 }
5222 
5223 EXPORT_SYMBOL(unregister_netdev);
5224 
5225 /**
5226  *	dev_change_net_namespace - move device to a different network namespace
5227  *	@dev: device
5228  *	@net: network namespace
5229  *	@pat: If not NULL name pattern to try if the current device name
5230  *	      is already taken in the destination network namespace.
5231  *
5232  *	This function shuts down a device interface and moves it
5233  *	to a new network namespace. On success 0 is returned, on
5234  *	a failure a negative errno code is returned.
5235  *
5236  *	Callers must hold the rtnl semaphore.
5237  */
5238 
5239 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5240 {
5241 	char buf[IFNAMSIZ];
5242 	const char *destname;
5243 	int err;
5244 
5245 	ASSERT_RTNL();
5246 
5247 	/* Don't allow namespace local devices to be moved. */
5248 	err = -EINVAL;
5249 	if (dev->features & NETIF_F_NETNS_LOCAL)
5250 		goto out;
5251 
5252 #ifdef CONFIG_SYSFS
5253 	/* Don't allow real devices to be moved when sysfs
5254 	 * is enabled.
5255 	 */
5256 	err = -EINVAL;
5257 	if (dev->dev.parent)
5258 		goto out;
5259 #endif
5260 
5261 	/* Ensure the device has been registered */
5262 	err = -EINVAL;
5263 	if (dev->reg_state != NETREG_REGISTERED)
5264 		goto out;
5265 
5266 	/* Get out if there is nothing to do */
5267 	err = 0;
5268 	if (net_eq(dev_net(dev), net))
5269 		goto out;
5270 
5271 	/* Pick the destination device name, and ensure
5272 	 * we can use it in the destination network namespace.
5273 	 */
5274 	err = -EEXIST;
5275 	destname = dev->name;
5276 	if (__dev_get_by_name(net, destname)) {
5277 		/* We get here if we can't use the current device name */
5278 		if (!pat)
5279 			goto out;
5280 		if (!dev_valid_name(pat))
5281 			goto out;
5282 		if (strchr(pat, '%')) {
5283 			if (__dev_alloc_name(net, pat, buf) < 0)
5284 				goto out;
5285 			destname = buf;
5286 		} else
5287 			destname = pat;
5288 		if (__dev_get_by_name(net, destname))
5289 			goto out;
5290 	}
5291 
5292 	/*
5293 	 * And now a mini version of register_netdevice() and unregister_netdevice().
5294 	 */
5295 
5296 	/* If device is running close it first. */
5297 	dev_close(dev);
5298 
5299 	/* And unlink it from device chain */
5300 	err = -ENODEV;
5301 	unlist_netdevice(dev);
5302 
5303 	synchronize_net();
5304 
5305 	/* Shutdown queueing discipline. */
5306 	dev_shutdown(dev);
5307 
5308 	/* Notify protocols that we are about to destroy
5309 	   this device. They should clean all the things.
5310 	*/
5311 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5312 
5313 	/*
5314 	 *	Flush the unicast and multicast chains
5315 	 */
5316 	dev_unicast_flush(dev);
5317 	dev_addr_discard(dev);
5318 
5319 	netdev_unregister_kobject(dev);
5320 
5321 	/* Actually switch the network namespace */
5322 	dev_net_set(dev, net);
5323 
5324 	/* Assign the new device name */
5325 	if (destname != dev->name)
5326 		strcpy(dev->name, destname);
5327 
5328 	/* If there is an ifindex conflict assign a new one */
5329 	if (__dev_get_by_index(net, dev->ifindex)) {
5330 		int iflink = (dev->iflink == dev->ifindex);
5331 		dev->ifindex = dev_new_index(net);
5332 		if (iflink)
5333 			dev->iflink = dev->ifindex;
5334 	}
5335 
5336 	/* Fixup kobjects */
5337 	err = netdev_register_kobject(dev);
5338 	WARN_ON(err);
5339 
5340 	/* Add the device back in the hashes */
5341 	list_netdevice(dev);
5342 
5343 	/* Notify protocols, that a new device appeared. */
5344 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5345 
5346 	synchronize_net();
5347 	err = 0;
5348 out:
5349 	return err;
5350 }
5351 
5352 static int dev_cpu_callback(struct notifier_block *nfb,
5353 			    unsigned long action,
5354 			    void *ocpu)
5355 {
5356 	struct sk_buff **list_skb;
5357 	struct Qdisc **list_net;
5358 	struct sk_buff *skb;
5359 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5360 	struct softnet_data *sd, *oldsd;
5361 
5362 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5363 		return NOTIFY_OK;
5364 
5365 	local_irq_disable();
5366 	cpu = smp_processor_id();
5367 	sd = &per_cpu(softnet_data, cpu);
5368 	oldsd = &per_cpu(softnet_data, oldcpu);
5369 
5370 	/* Find end of our completion_queue. */
5371 	list_skb = &sd->completion_queue;
5372 	while (*list_skb)
5373 		list_skb = &(*list_skb)->next;
5374 	/* Append completion queue from offline CPU. */
5375 	*list_skb = oldsd->completion_queue;
5376 	oldsd->completion_queue = NULL;
5377 
5378 	/* Find end of our output_queue. */
5379 	list_net = &sd->output_queue;
5380 	while (*list_net)
5381 		list_net = &(*list_net)->next_sched;
5382 	/* Append output queue from offline CPU. */
5383 	*list_net = oldsd->output_queue;
5384 	oldsd->output_queue = NULL;
5385 
5386 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5387 	local_irq_enable();
5388 
5389 	/* Process offline CPU's input_pkt_queue */
5390 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5391 		netif_rx(skb);
5392 
5393 	return NOTIFY_OK;
5394 }
5395 
5396 
5397 /**
5398  *	netdev_increment_features - increment feature set by one
5399  *	@all: current feature set
5400  *	@one: new feature set
5401  *	@mask: mask feature set
5402  *
5403  *	Computes a new feature set after adding a device with feature set
5404  *	@one to the master device with current feature set @all.  Will not
5405  *	enable anything that is off in @mask. Returns the new feature set.
5406  */
5407 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5408 					unsigned long mask)
5409 {
5410 	/* If device needs checksumming, downgrade to it. */
5411 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5412 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5413 	else if (mask & NETIF_F_ALL_CSUM) {
5414 		/* If one device supports v4/v6 checksumming, set for all. */
5415 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5416 		    !(all & NETIF_F_GEN_CSUM)) {
5417 			all &= ~NETIF_F_ALL_CSUM;
5418 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5419 		}
5420 
5421 		/* If one device supports hw checksumming, set for all. */
5422 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5423 			all &= ~NETIF_F_ALL_CSUM;
5424 			all |= NETIF_F_HW_CSUM;
5425 		}
5426 	}
5427 
5428 	one |= NETIF_F_ALL_CSUM;
5429 
5430 	one |= all & NETIF_F_ONE_FOR_ALL;
5431 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5432 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5433 
5434 	return all;
5435 }
5436 EXPORT_SYMBOL(netdev_increment_features);
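
/*
 * Usage sketch (illustrative only): how a hypothetical master device (in
 * the style of bonding) might fold its slaves' feature sets together with
 * netdev_increment_features(), starting from an "all enabled" set and
 * using NETIF_F_ONE_FOR_ALL as the mask.  The slave structure is
 * hypothetical.
 */
#if 0
static unsigned long example_compute_features(struct list_head *slaves)
{
	struct example_slave *s;	/* hypothetical per-slave struct */
	unsigned long features = NETIF_F_ALL_CSUM | NETIF_F_SG |
				 NETIF_F_FRAGLIST | NETIF_F_GSO_MASK;

	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}
#endif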
5437 
5438 static struct hlist_head *netdev_create_hash(void)
5439 {
5440 	int i;
5441 	struct hlist_head *hash;
5442 
5443 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5444 	if (hash != NULL)
5445 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5446 			INIT_HLIST_HEAD(&hash[i]);
5447 
5448 	return hash;
5449 }
5450 
5451 /* Initialize per network namespace state */
5452 static int __net_init netdev_init(struct net *net)
5453 {
5454 	INIT_LIST_HEAD(&net->dev_base_head);
5455 
5456 	net->dev_name_head = netdev_create_hash();
5457 	if (net->dev_name_head == NULL)
5458 		goto err_name;
5459 
5460 	net->dev_index_head = netdev_create_hash();
5461 	if (net->dev_index_head == NULL)
5462 		goto err_idx;
5463 
5464 	return 0;
5465 
5466 err_idx:
5467 	kfree(net->dev_name_head);
5468 err_name:
5469 	return -ENOMEM;
5470 }
5471 
5472 /**
5473  *	netdev_drivername - network driver for the device
5474  *	@dev: network device
5475  *	@buffer: buffer for resulting name
5476  *	@len: size of buffer
5477  *
5478  *	Determine network driver for device.
5479  */
5480 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5481 {
5482 	const struct device_driver *driver;
5483 	const struct device *parent;
5484 
5485 	if (len <= 0 || !buffer)
5486 		return buffer;
5487 	buffer[0] = 0;
5488 
5489 	parent = dev->dev.parent;
5490 
5491 	if (!parent)
5492 		return buffer;
5493 
5494 	driver = parent->driver;
5495 	if (driver && driver->name)
5496 		strlcpy(buffer, driver->name, len);
5497 	return buffer;
5498 }
5499 
5500 static void __net_exit netdev_exit(struct net *net)
5501 {
5502 	kfree(net->dev_name_head);
5503 	kfree(net->dev_index_head);
5504 }
5505 
5506 static struct pernet_operations __net_initdata netdev_net_ops = {
5507 	.init = netdev_init,
5508 	.exit = netdev_exit,
5509 };
5510 
5511 static void __net_exit default_device_exit(struct net *net)
5512 {
5513 	struct net_device *dev;
5514 	/*
5515 	 * Push all migratable network devices back to the
5516 	 * initial network namespace
5517 	 */
5518 	rtnl_lock();
5519 restart:
5520 	for_each_netdev(net, dev) {
5521 		int err;
5522 		char fb_name[IFNAMSIZ];
5523 
5524 		/* Ignore unmovable devices (e.g. loopback) */
5525 		if (dev->features & NETIF_F_NETNS_LOCAL)
5526 			continue;
5527 
5528 		/* Delete virtual devices */
5529 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5530 			dev->rtnl_link_ops->dellink(dev);
5531 			goto restart;
5532 		}
5533 
5534 		/* Push remaining network devices to init_net */
5535 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5536 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5537 		if (err) {
5538 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5539 				__func__, dev->name, err);
5540 			BUG();
5541 		}
5542 		goto restart;
5543 	}
5544 	rtnl_unlock();
5545 }
5546 
5547 static struct pernet_operations __net_initdata default_device_ops = {
5548 	.exit = default_device_exit,
5549 };
5550 
5551 /*
5552  *	Initialize the DEV module. At boot time this walks the device list and
5553  *	unhooks any devices that fail to initialise (normally hardware not
5554  *	present) and leaves us with a valid list of present and active devices.
5555  *
5556  */
5557 
5558 /*
5559  *       This is called single threaded during boot, so no need
5560  *       to take the rtnl semaphore.
5561  */
5562 static int __init net_dev_init(void)
5563 {
5564 	int i, rc = -ENOMEM;
5565 
5566 	BUG_ON(!dev_boot_phase);
5567 
5568 	if (dev_proc_init())
5569 		goto out;
5570 
5571 	if (netdev_kobject_init())
5572 		goto out;
5573 
5574 	INIT_LIST_HEAD(&ptype_all);
5575 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5576 		INIT_LIST_HEAD(&ptype_base[i]);
5577 
5578 	if (register_pernet_subsys(&netdev_net_ops))
5579 		goto out;
5580 
5581 	/*
5582 	 *	Initialise the packet receive queues.
5583 	 */
5584 
5585 	for_each_possible_cpu(i) {
5586 		struct softnet_data *queue;
5587 
5588 		queue = &per_cpu(softnet_data, i);
5589 		skb_queue_head_init(&queue->input_pkt_queue);
5590 		queue->completion_queue = NULL;
5591 		INIT_LIST_HEAD(&queue->poll_list);
5592 
5593 		queue->backlog.poll = process_backlog;
5594 		queue->backlog.weight = weight_p;
5595 		queue->backlog.gro_list = NULL;
5596 		queue->backlog.gro_count = 0;
5597 	}
5598 
5599 	dev_boot_phase = 0;
5600 
5601 	/* The loopback device is special: if any other network device
5602 	 * is present in a network namespace, the loopback device must
5603 	 * be present too. Since we now dynamically allocate and free the
5604 	 * loopback device, ensure this invariant is maintained by
5605 	 * keeping the loopback device as the first device on the
5606 	 * list of network devices.  This ensures the loopback device
5607 	 * is the first device that appears and the last network device
5608 	 * that disappears.
5609 	 */
5610 	if (register_pernet_device(&loopback_net_ops))
5611 		goto out;
5612 
5613 	if (register_pernet_device(&default_device_ops))
5614 		goto out;
5615 
5616 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5617 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5618 
5619 	hotcpu_notifier(dev_cpu_callback, 0);
5620 	dst_init();
5621 	dev_mcast_init();
5622 	rc = 0;
5623 out:
5624 	return rc;
5625 }
5626 
5627 subsys_initcall(net_dev_init);
5628 
5629 static int __init initialize_hashrnd(void)
5630 {
5631 	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5632 	return 0;
5633 }
5634 
5635 late_initcall_sync(initialize_hashrnd);
5636 
5637 EXPORT_SYMBOL(__dev_get_by_index);
5638 EXPORT_SYMBOL(__dev_get_by_name);
5639 EXPORT_SYMBOL(__dev_remove_pack);
5640 EXPORT_SYMBOL(dev_valid_name);
5641 EXPORT_SYMBOL(dev_add_pack);
5642 EXPORT_SYMBOL(dev_alloc_name);
5643 EXPORT_SYMBOL(dev_close);
5644 EXPORT_SYMBOL(dev_get_by_flags);
5645 EXPORT_SYMBOL(dev_get_by_index);
5646 EXPORT_SYMBOL(dev_get_by_name);
5647 EXPORT_SYMBOL(dev_open);
5648 EXPORT_SYMBOL(dev_queue_xmit);
5649 EXPORT_SYMBOL(dev_remove_pack);
5650 EXPORT_SYMBOL(dev_set_allmulti);
5651 EXPORT_SYMBOL(dev_set_promiscuity);
5652 EXPORT_SYMBOL(dev_change_flags);
5653 EXPORT_SYMBOL(dev_set_mtu);
5654 EXPORT_SYMBOL(dev_set_mac_address);
5655 EXPORT_SYMBOL(free_netdev);
5656 EXPORT_SYMBOL(netdev_boot_setup_check);
5657 EXPORT_SYMBOL(netdev_set_master);
5658 EXPORT_SYMBOL(netdev_state_change);
5659 EXPORT_SYMBOL(netif_receive_skb);
5660 EXPORT_SYMBOL(netif_rx);
5661 EXPORT_SYMBOL(register_gifconf);
5662 EXPORT_SYMBOL(register_netdevice);
5663 EXPORT_SYMBOL(register_netdevice_notifier);
5664 EXPORT_SYMBOL(skb_checksum_help);
5665 EXPORT_SYMBOL(synchronize_net);
5666 EXPORT_SYMBOL(unregister_netdevice);
5667 EXPORT_SYMBOL(unregister_netdevice_notifier);
5668 EXPORT_SYMBOL(net_enable_timestamp);
5669 EXPORT_SYMBOL(net_disable_timestamp);
5670 EXPORT_SYMBOL(dev_get_flags);
5671 
5672 EXPORT_SYMBOL(dev_load);
5673 
5674 EXPORT_PER_CPU_SYMBOL(softnet_data);
5675