/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <[email protected]>
 *				Mark Evans, <[email protected]>
 *
 *	Additional Authors:
 *		Florian la Roche <[email protected]>
 *		Alan Cox <[email protected]>
 *		David Hinds <[email protected]>
 *		Alexey Kuznetsov <[email protected]>
 *		Adam Sulmicki <[email protected]>
 *		Pekka Riikonen <[email protected]>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16? Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *		sure which should go first, but I bet it won't make much
 *		difference if we are running VLANs.  The good news is that
 *		this protocol won't be in the list unless compiled in, so
 *		the average user (w/out VLANs) will not be adversely affected.
 *		--BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

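/*
 * Worked example of the bucket math used by ptype_head() below, assuming the
 * protocol values listed above: the bucket is ntohs(type) & PTYPE_HASH_MASK,
 * i.e. the low nibble of the host-order protocol value.
 *
 *	ETH_P_IP    0x0800 -> bucket 0x0
 *	ETH_P_ARP   0x0806 -> bucket 0x6
 *	ETH_P_RARP  0x8035 -> bucket 0x5
 *	ETH_P_SNAP  0x0005 -> bucket 0x5
 *	ETH_P_X25   0x0805 -> bucket 0x5
 *
 * RARP/SNAP/X.25 therefore share bucket 5, which is the overlap mentioned in
 * the comment above.  ETH_P_ALL handlers bypass the hash and sit on ptype_all.
 */
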
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

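/*
 * Minimal reader-side sketch (illustrative, not part of the original file):
 * lockless readers pair the RCU list insertion/removal above with
 * rcu_read_lock(), for example
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_debug("%s\n", dev->name);
 *	rcu_read_unlock();
 *
 * Writers must instead hold the RTNL semaphore (plus dev_base_lock for the
 * update itself), exactly as list_netdevice()/unlist_netdevice() do.
 */
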
/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

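/*
 * Note (illustrative): the two tables above are indexed in lock-step, so an
 * ARPHRD_ETHER device gets the "_xmit_ETHER" lockdep class, while any
 * dev->type missing from netdev_lock_type[] falls back to the final
 * "_xmit_NONE" entry via netdev_lock_pos().
 */
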
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets
 *	were first on the list, it could not sense that the packet is
 *	cloned and should be copied-on-write, so it would change the
 *	packet and subsequent readers would see a broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

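/*
 * Minimal registration sketch (illustrative only; "my_rcv" and
 * "my_packet_type" are hypothetical names):
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// consume the skb or kfree_skb(skb)
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),  // or htons(ETH_P_ALL) for a tap
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);
 *
 * The matching teardown is dev_remove_pack() below, which also waits for a
 * grace period so that in-flight receivers are done with the handler.
 */
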
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

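/*
 * Illustrative usage (assumption: the parameter follows the old "ether="
 * convention of up to four integers followed by the interface name): a
 * command line of
 *
 *	netdev=5,0x340,eth0
 *
 * is stored as { .irq = 5, .base_addr = 0x340 } for "eth0", which a legacy
 * ISA driver can later pick up through netdev_boot_setup_check().
 */
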
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

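/*
 * Minimal usage sketch (illustrative): the refcounted lookup above is paired
 * with dev_put() once the caller is done with the device, e.g.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		// use dev; it cannot go away while we hold the reference
 *		dev_put(dev);
 *	}
 *
 * The __dev_get_by_name()/dev_get_by_name_rcu() variants skip the refcount
 * and rely on the caller holding RTNL, dev_base_lock or the RCU read lock
 * for the whole access instead.
 */
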
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);

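/*
 * Illustrative examples for the rules above: "eth0" and "wlan-guest" are
 * valid, while "", ".", "..", "my/if", "eth 0" and any name of IFNAMSIZ
 * characters or more are rejected, since the name must double as a sysfs
 * directory entry.
 */
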
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - e.g. "lt%d" - it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - e.g. "lt%d" - it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

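/*
 * Worked example (illustrative): with "eth0", "eth1" and "eth3" already
 * registered in the namespace, the format string "eth%d" marks bits 0, 1 and
 * 3 in the scratch bitmap, find_first_zero_bit() picks slot 2, and
 * dev_alloc_name() returns 2 with dev->name set to "eth2".  A name without
 * '%' is taken literally and only checked for collisions.
 */
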
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

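/*
 * Summary of the ordering implemented above and below (illustrative, derived
 * from the code):
 *
 *	dev_open():  NETDEV_PRE_UP -> ndo_validate_addr() -> ndo_open()
 *		     -> IFF_UP set -> dev_activate() -> NETDEV_UP
 *	dev_close(): NETDEV_GOING_DOWN -> dev_deactivate() -> ndo_stop()
 *		     -> IFF_UP cleared -> NETDEV_DOWN
 *
 * NETDEV_GOING_DOWN is thus the last notification sent while the device is
 * still operational.
 */
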
static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death, while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 * Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */

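/*
 * Minimal notifier sketch for the registration routines below (illustrative
 * only; "my_netdev_event" and "my_notifier" are hypothetical names):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_notifier = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_notifier);
 *
 * Existing devices are replayed as NETDEV_REGISTER/NETDEV_UP events at
 * registration time, as the kernel-doc below describes.
 */
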
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier, so that it gets a race-free view of the
 *	network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
		__net_timestamp(skb);
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);
	nf_reset(skb);

	if (!(dev->flags & IFF_UP) ||
	    (skb->len > (dev->mtu + dev->hard_header_len))) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

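/*
 * Minimal caller sketch (illustrative only; "struct my_priv" and its "peer"
 * field are hypothetical): a virtual device pair forwards from one side's
 * transmit path into the other side's receive path.
 *
 *	static netdev_tx_t my_start_xmit(struct sk_buff *skb,
 *					 struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		dev_forward_skb(priv->peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */
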
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp_set(skb);
#else
	net_timestamp_set(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS ([email protected])
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}

/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	unsigned int real_num = dev->real_num_tx_queues;

	if (unlikely(txq > dev->num_tx_queues))
		;
	else if (txq > real_num)
		dev->real_num_tx_queues = txq;
	else if (txq < real_num) {
		dev->real_num_tx_queues = txq;
		qdisc_reset_all_tx_gt(dev, txq);
	}
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

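/*
 * Illustrative suspend/resume pairing (hypothetical PCI driver callbacks):
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);	// stop all TX queues
 *		// quiesce and power down the hardware
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		// bring the hardware back up
 *		netif_device_attach(dev);	// wake queues, rearm watchdog
 *		return 0;
 *	}
 */
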
/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

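/*
 * Worked example (illustrative): for a CHECKSUM_PARTIAL TCP-over-IPv4 skb,
 * csum_start points at the start of the TCP header and csum_offset is
 * offsetof(struct tcphdr, check) == 16, so skb_checksum_help() above sums
 * everything from the TCP header to the end of the packet and folds the
 * result into the checksum field, allowing the skb to be handed to hardware
 * without checksum offload.
 */
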
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}

/*
 * Try to orphan skb early, right before transmission by the device.
 * We cannot orphan skb if tx timestamp is requested or the sk-reference
 * is needed on driver level for other reasons, e.g. see net/can/raw.c
 */
static inline void skb_orphan_try(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (sk && !skb_shinfo(skb)->tx_flags) {
		/* skb_tx_hash() won't be able to get sk.
		 * We copy sk_hash into skb->rxhash
		 */
		if (!skb->rxhash)
			skb->rxhash = sk->sk_hash;
		skb_orphan(skb);
	}
}

/*
 * Returns true if either:
 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
 *	2. skb is fragmented and the device does not support SG, or if
 *	   at least one of fragments is in highmem and device does not
 *	   support DMA from it.
 */
static inline int skb_needs_linearize(struct sk_buff *skb,
				      struct net_device *dev)
{
	return skb_is_nonlinear(skb) &&
	       ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
		(skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
					       illegal_highdma(dev, skb))));
}

int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		skb_orphan_try(skb);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, dev) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
1968 */ 1969 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1970 skb_set_transport_header(skb, skb->csum_start - 1971 skb_headroom(skb)); 1972 if (!dev_can_checksum(dev, skb) && 1973 skb_checksum_help(skb)) 1974 goto out_kfree_skb; 1975 } 1976 } 1977 1978 rc = ops->ndo_start_xmit(skb, dev); 1979 if (rc == NETDEV_TX_OK) 1980 txq_trans_update(txq); 1981 return rc; 1982 } 1983 1984 gso: 1985 do { 1986 struct sk_buff *nskb = skb->next; 1987 1988 skb->next = nskb->next; 1989 nskb->next = NULL; 1990 1991 /* 1992 * If device doesnt need nskb->dst, release it right now while 1993 * its hot in this cpu cache 1994 */ 1995 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 1996 skb_dst_drop(nskb); 1997 1998 rc = ops->ndo_start_xmit(nskb, dev); 1999 if (unlikely(rc != NETDEV_TX_OK)) { 2000 if (rc & ~NETDEV_TX_MASK) 2001 goto out_kfree_gso_skb; 2002 nskb->next = skb->next; 2003 skb->next = nskb; 2004 return rc; 2005 } 2006 txq_trans_update(txq); 2007 if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) 2008 return NETDEV_TX_BUSY; 2009 } while (skb->next); 2010 2011 out_kfree_gso_skb: 2012 if (likely(skb->next == NULL)) 2013 skb->destructor = DEV_GSO_CB(skb)->destructor; 2014 out_kfree_skb: 2015 kfree_skb(skb); 2016 return rc; 2017 } 2018 2019 static u32 hashrnd __read_mostly; 2020 2021 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 2022 { 2023 u32 hash; 2024 2025 if (skb_rx_queue_recorded(skb)) { 2026 hash = skb_get_rx_queue(skb); 2027 while (unlikely(hash >= dev->real_num_tx_queues)) 2028 hash -= dev->real_num_tx_queues; 2029 return hash; 2030 } 2031 2032 if (skb->sk && skb->sk->sk_hash) 2033 hash = skb->sk->sk_hash; 2034 else 2035 hash = (__force u16) skb->protocol ^ skb->rxhash; 2036 hash = jhash_1word(hash, hashrnd); 2037 2038 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 2039 } 2040 EXPORT_SYMBOL(skb_tx_hash); 2041 2042 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) 2043 { 2044 if (unlikely(queue_index >= dev->real_num_tx_queues)) { 2045 if (net_ratelimit()) { 2046 pr_warning("%s selects TX queue %d, but " 2047 "real number of TX queues is %d\n", 2048 dev->name, queue_index, dev->real_num_tx_queues); 2049 } 2050 return 0; 2051 } 2052 return queue_index; 2053 } 2054 2055 static struct netdev_queue *dev_pick_tx(struct net_device *dev, 2056 struct sk_buff *skb) 2057 { 2058 int queue_index; 2059 const struct net_device_ops *ops = dev->netdev_ops; 2060 2061 if (ops->ndo_select_queue) { 2062 queue_index = ops->ndo_select_queue(dev, skb); 2063 queue_index = dev_cap_txqueue(dev, queue_index); 2064 } else { 2065 struct sock *sk = skb->sk; 2066 queue_index = sk_tx_queue_get(sk); 2067 if (queue_index < 0) { 2068 2069 queue_index = 0; 2070 if (dev->real_num_tx_queues > 1) 2071 queue_index = skb_tx_hash(dev, skb); 2072 2073 if (sk) { 2074 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1); 2075 2076 if (dst && skb_dst(skb) == dst) 2077 sk_tx_queue_set(sk, queue_index); 2078 } 2079 } 2080 } 2081 2082 skb_set_queue_mapping(skb, queue_index); 2083 return netdev_get_tx_queue(dev, queue_index); 2084 } 2085 2086 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2087 struct net_device *dev, 2088 struct netdev_queue *txq) 2089 { 2090 spinlock_t *root_lock = qdisc_lock(q); 2091 bool contended = qdisc_is_running(q); 2092 int rc; 2093 2094 /* 2095 * Heuristic to force contended enqueues to serialize on a 2096 * separate lock before trying to get qdisc main lock. 
2097 * This permits __QDISC_STATE_RUNNING owner to get the lock more often 2098 * and dequeue packets faster. 2099 */ 2100 if (unlikely(contended)) 2101 spin_lock(&q->busylock); 2102 2103 spin_lock(root_lock); 2104 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2105 kfree_skb(skb); 2106 rc = NET_XMIT_DROP; 2107 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2108 qdisc_run_begin(q)) { 2109 /* 2110 * This is a work-conserving queue; there are no old skbs 2111 * waiting to be sent out; and the qdisc is not running - 2112 * xmit the skb directly. 2113 */ 2114 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) 2115 skb_dst_force(skb); 2116 __qdisc_update_bstats(q, skb->len); 2117 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { 2118 if (unlikely(contended)) { 2119 spin_unlock(&q->busylock); 2120 contended = false; 2121 } 2122 __qdisc_run(q); 2123 } else 2124 qdisc_run_end(q); 2125 2126 rc = NET_XMIT_SUCCESS; 2127 } else { 2128 skb_dst_force(skb); 2129 rc = qdisc_enqueue_root(skb, q); 2130 if (qdisc_run_begin(q)) { 2131 if (unlikely(contended)) { 2132 spin_unlock(&q->busylock); 2133 contended = false; 2134 } 2135 __qdisc_run(q); 2136 } 2137 } 2138 spin_unlock(root_lock); 2139 if (unlikely(contended)) 2140 spin_unlock(&q->busylock); 2141 return rc; 2142 } 2143 2144 /** 2145 * dev_queue_xmit - transmit a buffer 2146 * @skb: buffer to transmit 2147 * 2148 * Queue a buffer for transmission to a network device. The caller must 2149 * have set the device and priority and built the buffer before calling 2150 * this function. The function can be called from an interrupt. 2151 * 2152 * A negative errno code is returned on a failure. A success does not 2153 * guarantee the frame will be transmitted as it may be dropped due 2154 * to congestion or traffic shaping. 2155 * 2156 * ----------------------------------------------------------------------------------- 2157 * I notice this method can also return errors from the queue disciplines, 2158 * including NET_XMIT_DROP, which is a positive value. So, errors can also 2159 * be positive. 2160 * 2161 * Regardless of the return value, the skb is consumed, so it is currently 2162 * difficult to retry a send to this method. (You can bump the ref count 2163 * before sending to hold a reference for retry if you are careful.) 2164 * 2165 * When calling this method, interrupts MUST be enabled. This is because 2166 * the BH enable code must have IRQs enabled so that it will not deadlock. 2167 * --BLG 2168 */ 2169 int dev_queue_xmit(struct sk_buff *skb) 2170 { 2171 struct net_device *dev = skb->dev; 2172 struct netdev_queue *txq; 2173 struct Qdisc *q; 2174 int rc = -ENOMEM; 2175 2176 /* Disable soft irqs for various locks below. Also 2177 * stops preemption for RCU. 2178 */ 2179 rcu_read_lock_bh(); 2180 2181 txq = dev_pick_tx(dev, skb); 2182 q = rcu_dereference_bh(txq->qdisc); 2183 2184 #ifdef CONFIG_NET_CLS_ACT 2185 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2186 #endif 2187 if (q->enqueue) { 2188 rc = __dev_xmit_skb(skb, q, dev, txq); 2189 goto out; 2190 } 2191 2192 /* The device has no queue. Common case for software devices: 2193 loopback, all the sorts of tunnels... 2194 2195 Really, it is unlikely that netif_tx_lock protection is necessary 2196 here. (f.e. loopback and IP tunnels are clean ignoring statistics 2197 counters.) 2198 However, it is possible, that they rely on protection 2199 made by us here. 2200 2201 Check this and shot the lock. It is not prone from deadlocks. 
2202 Either shot noqueue qdisc, it is even simpler 8) 2203 */ 2204 if (dev->flags & IFF_UP) { 2205 int cpu = smp_processor_id(); /* ok because BHs are off */ 2206 2207 if (txq->xmit_lock_owner != cpu) { 2208 2209 HARD_TX_LOCK(dev, txq, cpu); 2210 2211 if (!netif_tx_queue_stopped(txq)) { 2212 rc = dev_hard_start_xmit(skb, dev, txq); 2213 if (dev_xmit_complete(rc)) { 2214 HARD_TX_UNLOCK(dev, txq); 2215 goto out; 2216 } 2217 } 2218 HARD_TX_UNLOCK(dev, txq); 2219 if (net_ratelimit()) 2220 printk(KERN_CRIT "Virtual device %s asks to " 2221 "queue packet!\n", dev->name); 2222 } else { 2223 /* Recursion is detected! It is possible, 2224 * unfortunately */ 2225 if (net_ratelimit()) 2226 printk(KERN_CRIT "Dead loop on virtual device " 2227 "%s, fix it urgently!\n", dev->name); 2228 } 2229 } 2230 2231 rc = -ENETDOWN; 2232 rcu_read_unlock_bh(); 2233 2234 kfree_skb(skb); 2235 return rc; 2236 out: 2237 rcu_read_unlock_bh(); 2238 return rc; 2239 } 2240 EXPORT_SYMBOL(dev_queue_xmit); 2241 2242 2243 /*======================================================================= 2244 Receiver routines 2245 =======================================================================*/ 2246 2247 int netdev_max_backlog __read_mostly = 1000; 2248 int netdev_tstamp_prequeue __read_mostly = 1; 2249 int netdev_budget __read_mostly = 300; 2250 int weight_p __read_mostly = 64; /* old backlog weight */ 2251 2252 /* Called with irq disabled */ 2253 static inline void ____napi_schedule(struct softnet_data *sd, 2254 struct napi_struct *napi) 2255 { 2256 list_add_tail(&napi->poll_list, &sd->poll_list); 2257 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2258 } 2259 2260 /* 2261 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses 2262 * and src/dst port numbers. Returns a non-zero hash number on success 2263 * and 0 on failure. 
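 *
 * (Editor's illustration, not in the original: for a TCP/IPv4 packet the
 *  result is essentially
 *      jhash_3words(min(saddr, daddr), max(saddr, daddr), ports, hashrnd)
 *  with the two 16-bit ports packed into one word and ordered the same way,
 *  so both directions of a flow hash to the same non-zero value.)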
2264 */ 2265 __u32 __skb_get_rxhash(struct sk_buff *skb) 2266 { 2267 int nhoff, hash = 0, poff; 2268 struct ipv6hdr *ip6; 2269 struct iphdr *ip; 2270 u8 ip_proto; 2271 u32 addr1, addr2, ihl; 2272 union { 2273 u32 v32; 2274 u16 v16[2]; 2275 } ports; 2276 2277 nhoff = skb_network_offset(skb); 2278 2279 switch (skb->protocol) { 2280 case __constant_htons(ETH_P_IP): 2281 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) 2282 goto done; 2283 2284 ip = (struct iphdr *) (skb->data + nhoff); 2285 if (ip->frag_off & htons(IP_MF | IP_OFFSET)) 2286 ip_proto = 0; 2287 else 2288 ip_proto = ip->protocol; 2289 addr1 = (__force u32) ip->saddr; 2290 addr2 = (__force u32) ip->daddr; 2291 ihl = ip->ihl; 2292 break; 2293 case __constant_htons(ETH_P_IPV6): 2294 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) 2295 goto done; 2296 2297 ip6 = (struct ipv6hdr *) (skb->data + nhoff); 2298 ip_proto = ip6->nexthdr; 2299 addr1 = (__force u32) ip6->saddr.s6_addr32[3]; 2300 addr2 = (__force u32) ip6->daddr.s6_addr32[3]; 2301 ihl = (40 >> 2); 2302 break; 2303 default: 2304 goto done; 2305 } 2306 2307 ports.v32 = 0; 2308 poff = proto_ports_offset(ip_proto); 2309 if (poff >= 0) { 2310 nhoff += ihl * 4 + poff; 2311 if (pskb_may_pull(skb, nhoff + 4)) { 2312 ports.v32 = * (__force u32 *) (skb->data + nhoff); 2313 if (ports.v16[1] < ports.v16[0]) 2314 swap(ports.v16[0], ports.v16[1]); 2315 } 2316 } 2317 2318 /* get a consistent hash (same value on both flow directions) */ 2319 if (addr2 < addr1) 2320 swap(addr1, addr2); 2321 2322 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); 2323 if (!hash) 2324 hash = 1; 2325 2326 done: 2327 return hash; 2328 } 2329 EXPORT_SYMBOL(__skb_get_rxhash); 2330 2331 #ifdef CONFIG_RPS 2332 2333 /* One global table that all flow-based protocols share. */ 2334 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; 2335 EXPORT_SYMBOL(rps_sock_flow_table); 2336 2337 /* 2338 * get_rps_cpu is called from netif_receive_skb and returns the target 2339 * CPU from the RPS map of the receiving queue for a given skb. 2340 * rcu_read_lock must be held on entry. 
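 *
 * (Editor's summary, not in the original: when no rps_dev_flow table is in
 *  use the choice degenerates to scaling the rxhash into the rps_map,
 *      cpu = map->cpus[((u64)skb->rxhash * map->len) >> 32];
 *  which mirrors the fallback at the end of this function.)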
2341 */ 2342 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2343 struct rps_dev_flow **rflowp) 2344 { 2345 struct netdev_rx_queue *rxqueue; 2346 struct rps_map *map = NULL; 2347 struct rps_dev_flow_table *flow_table; 2348 struct rps_sock_flow_table *sock_flow_table; 2349 int cpu = -1; 2350 u16 tcpu; 2351 2352 if (skb_rx_queue_recorded(skb)) { 2353 u16 index = skb_get_rx_queue(skb); 2354 if (unlikely(index >= dev->num_rx_queues)) { 2355 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " 2356 "on queue %u, but number of RX queues is %u\n", 2357 dev->name, index, dev->num_rx_queues); 2358 goto done; 2359 } 2360 rxqueue = dev->_rx + index; 2361 } else 2362 rxqueue = dev->_rx; 2363 2364 if (rxqueue->rps_map) { 2365 map = rcu_dereference(rxqueue->rps_map); 2366 if (map && map->len == 1) { 2367 tcpu = map->cpus[0]; 2368 if (cpu_online(tcpu)) 2369 cpu = tcpu; 2370 goto done; 2371 } 2372 } else if (!rxqueue->rps_flow_table) { 2373 goto done; 2374 } 2375 2376 skb_reset_network_header(skb); 2377 if (!skb_get_rxhash(skb)) 2378 goto done; 2379 2380 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2381 sock_flow_table = rcu_dereference(rps_sock_flow_table); 2382 if (flow_table && sock_flow_table) { 2383 u16 next_cpu; 2384 struct rps_dev_flow *rflow; 2385 2386 rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; 2387 tcpu = rflow->cpu; 2388 2389 next_cpu = sock_flow_table->ents[skb->rxhash & 2390 sock_flow_table->mask]; 2391 2392 /* 2393 * If the desired CPU (where last recvmsg was done) is 2394 * different from current CPU (one in the rx-queue flow 2395 * table entry), switch if one of the following holds: 2396 * - Current CPU is unset (equal to RPS_NO_CPU). 2397 * - Current CPU is offline. 2398 * - The current CPU's queue tail has advanced beyond the 2399 * last packet that was enqueued using this table entry. 2400 * This guarantees that all previous packets for the flow 2401 * have been dequeued, thus preserving in order delivery. 
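 *
 * (Editor's worked example, not in the original: if this entry last
 *  enqueued at last_qtail == 1000 and the old CPU's input_queue_head has
 *  since advanced to 1003, the signed difference is >= 0, so every packet
 *  the flow had queued there has been consumed and it is safe to switch.)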
2402 */ 2403 if (unlikely(tcpu != next_cpu) && 2404 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 2405 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 2406 rflow->last_qtail)) >= 0)) { 2407 tcpu = rflow->cpu = next_cpu; 2408 if (tcpu != RPS_NO_CPU) 2409 rflow->last_qtail = per_cpu(softnet_data, 2410 tcpu).input_queue_head; 2411 } 2412 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 2413 *rflowp = rflow; 2414 cpu = tcpu; 2415 goto done; 2416 } 2417 } 2418 2419 if (map) { 2420 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2421 2422 if (cpu_online(tcpu)) { 2423 cpu = tcpu; 2424 goto done; 2425 } 2426 } 2427 2428 done: 2429 return cpu; 2430 } 2431 2432 /* Called from hardirq (IPI) context */ 2433 static void rps_trigger_softirq(void *data) 2434 { 2435 struct softnet_data *sd = data; 2436 2437 ____napi_schedule(sd, &sd->backlog); 2438 sd->received_rps++; 2439 } 2440 2441 #endif /* CONFIG_RPS */ 2442 2443 /* 2444 * Check if this softnet_data structure is another cpu one 2445 * If yes, queue it to our IPI list and return 1 2446 * If no, return 0 2447 */ 2448 static int rps_ipi_queued(struct softnet_data *sd) 2449 { 2450 #ifdef CONFIG_RPS 2451 struct softnet_data *mysd = &__get_cpu_var(softnet_data); 2452 2453 if (sd != mysd) { 2454 sd->rps_ipi_next = mysd->rps_ipi_list; 2455 mysd->rps_ipi_list = sd; 2456 2457 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2458 return 1; 2459 } 2460 #endif /* CONFIG_RPS */ 2461 return 0; 2462 } 2463 2464 /* 2465 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 2466 * queue (may be a remote CPU queue). 2467 */ 2468 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 2469 unsigned int *qtail) 2470 { 2471 struct softnet_data *sd; 2472 unsigned long flags; 2473 2474 sd = &per_cpu(softnet_data, cpu); 2475 2476 local_irq_save(flags); 2477 2478 rps_lock(sd); 2479 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { 2480 if (skb_queue_len(&sd->input_pkt_queue)) { 2481 enqueue: 2482 __skb_queue_tail(&sd->input_pkt_queue, skb); 2483 input_queue_tail_incr_save(sd, qtail); 2484 rps_unlock(sd); 2485 local_irq_restore(flags); 2486 return NET_RX_SUCCESS; 2487 } 2488 2489 /* Schedule NAPI for backlog device 2490 * We can use non atomic operation since we own the queue lock 2491 */ 2492 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 2493 if (!rps_ipi_queued(sd)) 2494 ____napi_schedule(sd, &sd->backlog); 2495 } 2496 goto enqueue; 2497 } 2498 2499 sd->dropped++; 2500 rps_unlock(sd); 2501 2502 local_irq_restore(flags); 2503 2504 kfree_skb(skb); 2505 return NET_RX_DROP; 2506 } 2507 2508 /** 2509 * netif_rx - post buffer to the network code 2510 * @skb: buffer to post 2511 * 2512 * This function receives a packet from a device driver and queues it for 2513 * the upper (protocol) levels to process. It always succeeds. The buffer 2514 * may be dropped during processing for congestion control or by the 2515 * protocol layers. 
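 *
 * (Illustrative usage, editor's addition: a non-NAPI driver's interrupt
 *  handler typically ends with
 *      skb->protocol = eth_type_trans(skb, dev);
 *      netif_rx(skb);
 *  and must not touch the skb afterwards, since it is always consumed.)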
2516 * 2517 * return values: 2518 * NET_RX_SUCCESS (no congestion) 2519 * NET_RX_DROP (packet was dropped) 2520 * 2521 */ 2522 2523 int netif_rx(struct sk_buff *skb) 2524 { 2525 int ret; 2526 2527 /* if netpoll wants it, pretend we never saw it */ 2528 if (netpoll_rx(skb)) 2529 return NET_RX_DROP; 2530 2531 if (netdev_tstamp_prequeue) 2532 net_timestamp_check(skb); 2533 2534 #ifdef CONFIG_RPS 2535 { 2536 struct rps_dev_flow voidflow, *rflow = &voidflow; 2537 int cpu; 2538 2539 preempt_disable(); 2540 rcu_read_lock(); 2541 2542 cpu = get_rps_cpu(skb->dev, skb, &rflow); 2543 if (cpu < 0) 2544 cpu = smp_processor_id(); 2545 2546 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 2547 2548 rcu_read_unlock(); 2549 preempt_enable(); 2550 } 2551 #else 2552 { 2553 unsigned int qtail; 2554 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 2555 put_cpu(); 2556 } 2557 #endif 2558 return ret; 2559 } 2560 EXPORT_SYMBOL(netif_rx); 2561 2562 int netif_rx_ni(struct sk_buff *skb) 2563 { 2564 int err; 2565 2566 preempt_disable(); 2567 err = netif_rx(skb); 2568 if (local_softirq_pending()) 2569 do_softirq(); 2570 preempt_enable(); 2571 2572 return err; 2573 } 2574 EXPORT_SYMBOL(netif_rx_ni); 2575 2576 static void net_tx_action(struct softirq_action *h) 2577 { 2578 struct softnet_data *sd = &__get_cpu_var(softnet_data); 2579 2580 if (sd->completion_queue) { 2581 struct sk_buff *clist; 2582 2583 local_irq_disable(); 2584 clist = sd->completion_queue; 2585 sd->completion_queue = NULL; 2586 local_irq_enable(); 2587 2588 while (clist) { 2589 struct sk_buff *skb = clist; 2590 clist = clist->next; 2591 2592 WARN_ON(atomic_read(&skb->users)); 2593 __kfree_skb(skb); 2594 } 2595 } 2596 2597 if (sd->output_queue) { 2598 struct Qdisc *head; 2599 2600 local_irq_disable(); 2601 head = sd->output_queue; 2602 sd->output_queue = NULL; 2603 sd->output_queue_tailp = &sd->output_queue; 2604 local_irq_enable(); 2605 2606 while (head) { 2607 struct Qdisc *q = head; 2608 spinlock_t *root_lock; 2609 2610 head = head->next_sched; 2611 2612 root_lock = qdisc_lock(q); 2613 if (spin_trylock(root_lock)) { 2614 smp_mb__before_clear_bit(); 2615 clear_bit(__QDISC_STATE_SCHED, 2616 &q->state); 2617 qdisc_run(q); 2618 spin_unlock(root_lock); 2619 } else { 2620 if (!test_bit(__QDISC_STATE_DEACTIVATED, 2621 &q->state)) { 2622 __netif_reschedule(q); 2623 } else { 2624 smp_mb__before_clear_bit(); 2625 clear_bit(__QDISC_STATE_SCHED, 2626 &q->state); 2627 } 2628 } 2629 } 2630 } 2631 } 2632 2633 static inline int deliver_skb(struct sk_buff *skb, 2634 struct packet_type *pt_prev, 2635 struct net_device *orig_dev) 2636 { 2637 atomic_inc(&skb->users); 2638 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2639 } 2640 2641 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 2642 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 2643 /* This hook is defined here for ATM LANE */ 2644 int (*br_fdb_test_addr_hook)(struct net_device *dev, 2645 unsigned char *addr) __read_mostly; 2646 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 2647 #endif 2648 2649 #ifdef CONFIG_NET_CLS_ACT 2650 /* TODO: Maybe we should just force sch_ingress to be compiled in 2651 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2652 * a compare and 2 stores extra right now if we dont have it on 2653 * but have CONFIG_NET_CLS_ACT 2654 * NOTE: This doesnt stop any functionality; if you dont have 2655 * the ingress scheduler, you just cant add policies on ingress. 
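 * (Editor's illustration, assumption not in the original: such policies
 *  are typically attached from userspace with something like
 *      tc qdisc add dev eth0 ingress
 *  followed by tc filter rules on the ffff: ingress parent.)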
2656 * 2657 */ 2658 static int ing_filter(struct sk_buff *skb) 2659 { 2660 struct net_device *dev = skb->dev; 2661 u32 ttl = G_TC_RTTL(skb->tc_verd); 2662 struct netdev_queue *rxq; 2663 int result = TC_ACT_OK; 2664 struct Qdisc *q; 2665 2666 if (unlikely(MAX_RED_LOOP < ttl++)) { 2667 if (net_ratelimit()) 2668 pr_warning( "Redir loop detected Dropping packet (%d->%d)\n", 2669 skb->skb_iif, dev->ifindex); 2670 return TC_ACT_SHOT; 2671 } 2672 2673 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 2674 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 2675 2676 rxq = &dev->rx_queue; 2677 2678 q = rxq->qdisc; 2679 if (q != &noop_qdisc) { 2680 spin_lock(qdisc_lock(q)); 2681 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 2682 result = qdisc_enqueue_root(skb, q); 2683 spin_unlock(qdisc_lock(q)); 2684 } 2685 2686 return result; 2687 } 2688 2689 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 2690 struct packet_type **pt_prev, 2691 int *ret, struct net_device *orig_dev) 2692 { 2693 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 2694 goto out; 2695 2696 if (*pt_prev) { 2697 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2698 *pt_prev = NULL; 2699 } 2700 2701 switch (ing_filter(skb)) { 2702 case TC_ACT_SHOT: 2703 case TC_ACT_STOLEN: 2704 kfree_skb(skb); 2705 return NULL; 2706 } 2707 2708 out: 2709 skb->tc_verd = 0; 2710 return skb; 2711 } 2712 #endif 2713 2714 /* 2715 * netif_nit_deliver - deliver received packets to network taps 2716 * @skb: buffer 2717 * 2718 * This function is used to deliver incoming packets to network 2719 * taps. It should be used when the normal netif_receive_skb path 2720 * is bypassed, for example because of VLAN acceleration. 2721 */ 2722 void netif_nit_deliver(struct sk_buff *skb) 2723 { 2724 struct packet_type *ptype; 2725 2726 if (list_empty(&ptype_all)) 2727 return; 2728 2729 skb_reset_network_header(skb); 2730 skb_reset_transport_header(skb); 2731 skb->mac_len = skb->network_header - skb->mac_header; 2732 2733 rcu_read_lock(); 2734 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2735 if (!ptype->dev || ptype->dev == skb->dev) 2736 deliver_skb(skb, ptype, skb->dev); 2737 } 2738 rcu_read_unlock(); 2739 } 2740 2741 /** 2742 * netdev_rx_handler_register - register receive handler 2743 * @dev: device to register a handler for 2744 * @rx_handler: receive handler to register 2745 * @rx_handler_data: data pointer that is used by rx handler 2746 * 2747 * Register a receive hander for a device. This handler will then be 2748 * called from __netif_receive_skb. A negative errno code is returned 2749 * on a failure. 2750 * 2751 * The caller must hold the rtnl_mutex. 2752 */ 2753 int netdev_rx_handler_register(struct net_device *dev, 2754 rx_handler_func_t *rx_handler, 2755 void *rx_handler_data) 2756 { 2757 ASSERT_RTNL(); 2758 2759 if (dev->rx_handler) 2760 return -EBUSY; 2761 2762 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 2763 rcu_assign_pointer(dev->rx_handler, rx_handler); 2764 2765 return 0; 2766 } 2767 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 2768 2769 /** 2770 * netdev_rx_handler_unregister - unregister receive handler 2771 * @dev: device to unregister a handler from 2772 * 2773 * Unregister a receive hander from a device. 2774 * 2775 * The caller must hold the rtnl_mutex. 
2776 */ 2777 void netdev_rx_handler_unregister(struct net_device *dev) 2778 { 2779 2780 ASSERT_RTNL(); 2781 rcu_assign_pointer(dev->rx_handler, NULL); 2782 rcu_assign_pointer(dev->rx_handler_data, NULL); 2783 } 2784 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 2785 2786 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, 2787 struct net_device *master) 2788 { 2789 if (skb->pkt_type == PACKET_HOST) { 2790 u16 *dest = (u16 *) eth_hdr(skb)->h_dest; 2791 2792 memcpy(dest, master->dev_addr, ETH_ALEN); 2793 } 2794 } 2795 2796 /* On bonding slaves other than the currently active slave, suppress 2797 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and 2798 * ARP on active-backup slaves with arp_validate enabled. 2799 */ 2800 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) 2801 { 2802 struct net_device *dev = skb->dev; 2803 2804 if (master->priv_flags & IFF_MASTER_ARPMON) 2805 dev->last_rx = jiffies; 2806 2807 if ((master->priv_flags & IFF_MASTER_ALB) && 2808 (master->priv_flags & IFF_BRIDGE_PORT)) { 2809 /* Do address unmangle. The local destination address 2810 * will be always the one master has. Provides the right 2811 * functionality in a bridge. 2812 */ 2813 skb_bond_set_mac_by_master(skb, master); 2814 } 2815 2816 if (dev->priv_flags & IFF_SLAVE_INACTIVE) { 2817 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && 2818 skb->protocol == __cpu_to_be16(ETH_P_ARP)) 2819 return 0; 2820 2821 if (master->priv_flags & IFF_MASTER_ALB) { 2822 if (skb->pkt_type != PACKET_BROADCAST && 2823 skb->pkt_type != PACKET_MULTICAST) 2824 return 0; 2825 } 2826 if (master->priv_flags & IFF_MASTER_8023AD && 2827 skb->protocol == __cpu_to_be16(ETH_P_SLOW)) 2828 return 0; 2829 2830 return 1; 2831 } 2832 return 0; 2833 } 2834 EXPORT_SYMBOL(__skb_bond_should_drop); 2835 2836 static int __netif_receive_skb(struct sk_buff *skb) 2837 { 2838 struct packet_type *ptype, *pt_prev; 2839 rx_handler_func_t *rx_handler; 2840 struct net_device *orig_dev; 2841 struct net_device *master; 2842 struct net_device *null_or_orig; 2843 struct net_device *orig_or_bond; 2844 int ret = NET_RX_DROP; 2845 __be16 type; 2846 2847 if (!netdev_tstamp_prequeue) 2848 net_timestamp_check(skb); 2849 2850 if (vlan_tx_tag_present(skb)) 2851 vlan_hwaccel_do_receive(skb); 2852 2853 /* if we've gotten here through NAPI, check netpoll */ 2854 if (netpoll_receive_skb(skb)) 2855 return NET_RX_DROP; 2856 2857 if (!skb->skb_iif) 2858 skb->skb_iif = skb->dev->ifindex; 2859 2860 /* 2861 * bonding note: skbs received on inactive slaves should only 2862 * be delivered to pkt handlers that are exact matches. Also 2863 * the deliver_no_wcard flag will be set. If packet handlers 2864 * are sensitive to duplicate packets these skbs will need to 2865 * be dropped at the handler. The vlan accel path may have 2866 * already set the deliver_no_wcard flag. 
2867 */ 2868 null_or_orig = NULL; 2869 orig_dev = skb->dev; 2870 master = ACCESS_ONCE(orig_dev->master); 2871 if (skb->deliver_no_wcard) 2872 null_or_orig = orig_dev; 2873 else if (master) { 2874 if (skb_bond_should_drop(skb, master)) { 2875 skb->deliver_no_wcard = 1; 2876 null_or_orig = orig_dev; /* deliver only exact match */ 2877 } else 2878 skb->dev = master; 2879 } 2880 2881 __this_cpu_inc(softnet_data.processed); 2882 skb_reset_network_header(skb); 2883 skb_reset_transport_header(skb); 2884 skb->mac_len = skb->network_header - skb->mac_header; 2885 2886 pt_prev = NULL; 2887 2888 rcu_read_lock(); 2889 2890 #ifdef CONFIG_NET_CLS_ACT 2891 if (skb->tc_verd & TC_NCLS) { 2892 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 2893 goto ncls; 2894 } 2895 #endif 2896 2897 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2898 if (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2899 ptype->dev == orig_dev) { 2900 if (pt_prev) 2901 ret = deliver_skb(skb, pt_prev, orig_dev); 2902 pt_prev = ptype; 2903 } 2904 } 2905 2906 #ifdef CONFIG_NET_CLS_ACT 2907 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 2908 if (!skb) 2909 goto out; 2910 ncls: 2911 #endif 2912 2913 /* Handle special case of bridge or macvlan */ 2914 rx_handler = rcu_dereference(skb->dev->rx_handler); 2915 if (rx_handler) { 2916 if (pt_prev) { 2917 ret = deliver_skb(skb, pt_prev, orig_dev); 2918 pt_prev = NULL; 2919 } 2920 skb = rx_handler(skb); 2921 if (!skb) 2922 goto out; 2923 } 2924 2925 /* 2926 * Make sure frames received on VLAN interfaces stacked on 2927 * bonding interfaces still make their way to any base bonding 2928 * device that may have registered for a specific ptype. The 2929 * handler may have to adjust skb->dev and orig_dev. 2930 */ 2931 orig_or_bond = orig_dev; 2932 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && 2933 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { 2934 orig_or_bond = vlan_dev_real_dev(skb->dev); 2935 } 2936 2937 type = skb->protocol; 2938 list_for_each_entry_rcu(ptype, 2939 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 2940 if (ptype->type == type && (ptype->dev == null_or_orig || 2941 ptype->dev == skb->dev || ptype->dev == orig_dev || 2942 ptype->dev == orig_or_bond)) { 2943 if (pt_prev) 2944 ret = deliver_skb(skb, pt_prev, orig_dev); 2945 pt_prev = ptype; 2946 } 2947 } 2948 2949 if (pt_prev) { 2950 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2951 } else { 2952 kfree_skb(skb); 2953 /* Jamal, now you will not able to escape explaining 2954 * me how you were going to use this. :-) 2955 */ 2956 ret = NET_RX_DROP; 2957 } 2958 2959 out: 2960 rcu_read_unlock(); 2961 return ret; 2962 } 2963 2964 /** 2965 * netif_receive_skb - process receive buffer from network 2966 * @skb: buffer to process 2967 * 2968 * netif_receive_skb() is the main receive data processing function. 2969 * It always succeeds. The buffer may be dropped during processing 2970 * for congestion control or by the protocol layers. 2971 * 2972 * This function may only be called from softirq context and interrupts 2973 * should be enabled. 
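 *
 * (Editor's illustrative note: NAPI drivers call this, or napi_gro_receive(),
 *  from their ->poll() handler, e.g.
 *      while (budget-- && (skb = my_fetch_rx_skb(adapter)))
 *          netif_receive_skb(skb);
 *  where my_fetch_rx_skb is a hypothetical driver helper; non-NAPI drivers
 *  hand packets to netif_rx() from hard interrupt context instead.)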
2974 * 2975 * Return values (usually ignored): 2976 * NET_RX_SUCCESS: no congestion 2977 * NET_RX_DROP: packet was dropped 2978 */ 2979 int netif_receive_skb(struct sk_buff *skb) 2980 { 2981 if (netdev_tstamp_prequeue) 2982 net_timestamp_check(skb); 2983 2984 if (skb_defer_rx_timestamp(skb)) 2985 return NET_RX_SUCCESS; 2986 2987 #ifdef CONFIG_RPS 2988 { 2989 struct rps_dev_flow voidflow, *rflow = &voidflow; 2990 int cpu, ret; 2991 2992 rcu_read_lock(); 2993 2994 cpu = get_rps_cpu(skb->dev, skb, &rflow); 2995 2996 if (cpu >= 0) { 2997 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 2998 rcu_read_unlock(); 2999 } else { 3000 rcu_read_unlock(); 3001 ret = __netif_receive_skb(skb); 3002 } 3003 3004 return ret; 3005 } 3006 #else 3007 return __netif_receive_skb(skb); 3008 #endif 3009 } 3010 EXPORT_SYMBOL(netif_receive_skb); 3011 3012 /* Network device is going away, flush any packets still pending 3013 * Called with irqs disabled. 3014 */ 3015 static void flush_backlog(void *arg) 3016 { 3017 struct net_device *dev = arg; 3018 struct softnet_data *sd = &__get_cpu_var(softnet_data); 3019 struct sk_buff *skb, *tmp; 3020 3021 rps_lock(sd); 3022 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 3023 if (skb->dev == dev) { 3024 __skb_unlink(skb, &sd->input_pkt_queue); 3025 kfree_skb(skb); 3026 input_queue_head_incr(sd); 3027 } 3028 } 3029 rps_unlock(sd); 3030 3031 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 3032 if (skb->dev == dev) { 3033 __skb_unlink(skb, &sd->process_queue); 3034 kfree_skb(skb); 3035 input_queue_head_incr(sd); 3036 } 3037 } 3038 } 3039 3040 static int napi_gro_complete(struct sk_buff *skb) 3041 { 3042 struct packet_type *ptype; 3043 __be16 type = skb->protocol; 3044 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 3045 int err = -ENOENT; 3046 3047 if (NAPI_GRO_CB(skb)->count == 1) { 3048 skb_shinfo(skb)->gso_size = 0; 3049 goto out; 3050 } 3051 3052 rcu_read_lock(); 3053 list_for_each_entry_rcu(ptype, head, list) { 3054 if (ptype->type != type || ptype->dev || !ptype->gro_complete) 3055 continue; 3056 3057 err = ptype->gro_complete(skb); 3058 break; 3059 } 3060 rcu_read_unlock(); 3061 3062 if (err) { 3063 WARN_ON(&ptype->list == head); 3064 kfree_skb(skb); 3065 return NET_RX_SUCCESS; 3066 } 3067 3068 out: 3069 return netif_receive_skb(skb); 3070 } 3071 3072 inline void napi_gro_flush(struct napi_struct *napi) 3073 { 3074 struct sk_buff *skb, *next; 3075 3076 for (skb = napi->gro_list; skb; skb = next) { 3077 next = skb->next; 3078 skb->next = NULL; 3079 napi_gro_complete(skb); 3080 } 3081 3082 napi->gro_count = 0; 3083 napi->gro_list = NULL; 3084 } 3085 EXPORT_SYMBOL(napi_gro_flush); 3086 3087 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3088 { 3089 struct sk_buff **pp = NULL; 3090 struct packet_type *ptype; 3091 __be16 type = skb->protocol; 3092 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 3093 int same_flow; 3094 int mac_len; 3095 enum gro_result ret; 3096 3097 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3098 goto normal; 3099 3100 if (skb_is_gso(skb) || skb_has_frag_list(skb)) 3101 goto normal; 3102 3103 rcu_read_lock(); 3104 list_for_each_entry_rcu(ptype, head, list) { 3105 if (ptype->type != type || ptype->dev || !ptype->gro_receive) 3106 continue; 3107 3108 skb_set_network_header(skb, skb_gro_offset(skb)); 3109 mac_len = skb->network_header - skb->mac_header; 3110 skb->mac_len = mac_len; 3111 NAPI_GRO_CB(skb)->same_flow = 0; 3112 NAPI_GRO_CB(skb)->flush = 0; 
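/* Editor's note, not in the original source: the NAPI_GRO_CB() fields being
 * cleared around this point let the protocol's ->gro_receive() callback
 * report back whether this skb was merged into a held packet (same_flow),
 * must bypass GRO entirely (flush), or had its data merged so the skb
 * itself can be freed (free).
 */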
3113 NAPI_GRO_CB(skb)->free = 0; 3114 3115 pp = ptype->gro_receive(&napi->gro_list, skb); 3116 break; 3117 } 3118 rcu_read_unlock(); 3119 3120 if (&ptype->list == head) 3121 goto normal; 3122 3123 same_flow = NAPI_GRO_CB(skb)->same_flow; 3124 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 3125 3126 if (pp) { 3127 struct sk_buff *nskb = *pp; 3128 3129 *pp = nskb->next; 3130 nskb->next = NULL; 3131 napi_gro_complete(nskb); 3132 napi->gro_count--; 3133 } 3134 3135 if (same_flow) 3136 goto ok; 3137 3138 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) 3139 goto normal; 3140 3141 napi->gro_count++; 3142 NAPI_GRO_CB(skb)->count = 1; 3143 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 3144 skb->next = napi->gro_list; 3145 napi->gro_list = skb; 3146 ret = GRO_HELD; 3147 3148 pull: 3149 if (skb_headlen(skb) < skb_gro_offset(skb)) { 3150 int grow = skb_gro_offset(skb) - skb_headlen(skb); 3151 3152 BUG_ON(skb->end - skb->tail < grow); 3153 3154 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 3155 3156 skb->tail += grow; 3157 skb->data_len -= grow; 3158 3159 skb_shinfo(skb)->frags[0].page_offset += grow; 3160 skb_shinfo(skb)->frags[0].size -= grow; 3161 3162 if (unlikely(!skb_shinfo(skb)->frags[0].size)) { 3163 put_page(skb_shinfo(skb)->frags[0].page); 3164 memmove(skb_shinfo(skb)->frags, 3165 skb_shinfo(skb)->frags + 1, 3166 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); 3167 } 3168 } 3169 3170 ok: 3171 return ret; 3172 3173 normal: 3174 ret = GRO_NORMAL; 3175 goto pull; 3176 } 3177 EXPORT_SYMBOL(dev_gro_receive); 3178 3179 static inline gro_result_t 3180 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3181 { 3182 struct sk_buff *p; 3183 3184 for (p = napi->gro_list; p; p = p->next) { 3185 unsigned long diffs; 3186 3187 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 3188 diffs |= compare_ether_header(skb_mac_header(p), 3189 skb_gro_mac_header(skb)); 3190 NAPI_GRO_CB(p)->same_flow = !diffs; 3191 NAPI_GRO_CB(p)->flush = 0; 3192 } 3193 3194 return dev_gro_receive(napi, skb); 3195 } 3196 3197 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 3198 { 3199 switch (ret) { 3200 case GRO_NORMAL: 3201 if (netif_receive_skb(skb)) 3202 ret = GRO_DROP; 3203 break; 3204 3205 case GRO_DROP: 3206 case GRO_MERGED_FREE: 3207 kfree_skb(skb); 3208 break; 3209 3210 case GRO_HELD: 3211 case GRO_MERGED: 3212 break; 3213 } 3214 3215 return ret; 3216 } 3217 EXPORT_SYMBOL(napi_skb_finish); 3218 3219 void skb_gro_reset_offset(struct sk_buff *skb) 3220 { 3221 NAPI_GRO_CB(skb)->data_offset = 0; 3222 NAPI_GRO_CB(skb)->frag0 = NULL; 3223 NAPI_GRO_CB(skb)->frag0_len = 0; 3224 3225 if (skb->mac_header == skb->tail && 3226 !PageHighMem(skb_shinfo(skb)->frags[0].page)) { 3227 NAPI_GRO_CB(skb)->frag0 = 3228 page_address(skb_shinfo(skb)->frags[0].page) + 3229 skb_shinfo(skb)->frags[0].page_offset; 3230 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; 3231 } 3232 } 3233 EXPORT_SYMBOL(skb_gro_reset_offset); 3234 3235 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3236 { 3237 skb_gro_reset_offset(skb); 3238 3239 return napi_skb_finish(__napi_gro_receive(napi, skb), skb); 3240 } 3241 EXPORT_SYMBOL(napi_gro_receive); 3242 3243 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 3244 { 3245 __skb_pull(skb, skb_headlen(skb)); 3246 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 3247 3248 napi->skb = skb; 3249 } 3250 EXPORT_SYMBOL(napi_reuse_skb); 3251 3252 struct sk_buff *napi_get_frags(struct 
napi_struct *napi) 3253 { 3254 struct sk_buff *skb = napi->skb; 3255 3256 if (!skb) { 3257 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); 3258 if (skb) 3259 napi->skb = skb; 3260 } 3261 return skb; 3262 } 3263 EXPORT_SYMBOL(napi_get_frags); 3264 3265 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, 3266 gro_result_t ret) 3267 { 3268 switch (ret) { 3269 case GRO_NORMAL: 3270 case GRO_HELD: 3271 skb->protocol = eth_type_trans(skb, skb->dev); 3272 3273 if (ret == GRO_HELD) 3274 skb_gro_pull(skb, -ETH_HLEN); 3275 else if (netif_receive_skb(skb)) 3276 ret = GRO_DROP; 3277 break; 3278 3279 case GRO_DROP: 3280 case GRO_MERGED_FREE: 3281 napi_reuse_skb(napi, skb); 3282 break; 3283 3284 case GRO_MERGED: 3285 break; 3286 } 3287 3288 return ret; 3289 } 3290 EXPORT_SYMBOL(napi_frags_finish); 3291 3292 struct sk_buff *napi_frags_skb(struct napi_struct *napi) 3293 { 3294 struct sk_buff *skb = napi->skb; 3295 struct ethhdr *eth; 3296 unsigned int hlen; 3297 unsigned int off; 3298 3299 napi->skb = NULL; 3300 3301 skb_reset_mac_header(skb); 3302 skb_gro_reset_offset(skb); 3303 3304 off = skb_gro_offset(skb); 3305 hlen = off + sizeof(*eth); 3306 eth = skb_gro_header_fast(skb, off); 3307 if (skb_gro_header_hard(skb, hlen)) { 3308 eth = skb_gro_header_slow(skb, hlen, off); 3309 if (unlikely(!eth)) { 3310 napi_reuse_skb(napi, skb); 3311 skb = NULL; 3312 goto out; 3313 } 3314 } 3315 3316 skb_gro_pull(skb, sizeof(*eth)); 3317 3318 /* 3319 * This works because the only protocols we care about don't require 3320 * special handling. We'll fix it up properly at the end. 3321 */ 3322 skb->protocol = eth->h_proto; 3323 3324 out: 3325 return skb; 3326 } 3327 EXPORT_SYMBOL(napi_frags_skb); 3328 3329 gro_result_t napi_gro_frags(struct napi_struct *napi) 3330 { 3331 struct sk_buff *skb = napi_frags_skb(napi); 3332 3333 if (!skb) 3334 return GRO_DROP; 3335 3336 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); 3337 } 3338 EXPORT_SYMBOL(napi_gro_frags); 3339 3340 /* 3341 * net_rps_action sends any pending IPI's for rps. 3342 * Note: called with local irq disabled, but exits with local irq enabled. 3343 */ 3344 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 3345 { 3346 #ifdef CONFIG_RPS 3347 struct softnet_data *remsd = sd->rps_ipi_list; 3348 3349 if (remsd) { 3350 sd->rps_ipi_list = NULL; 3351 3352 local_irq_enable(); 3353 3354 /* Send pending IPI's to kick RPS processing on remote cpus. */ 3355 while (remsd) { 3356 struct softnet_data *next = remsd->rps_ipi_next; 3357 3358 if (cpu_online(remsd->cpu)) 3359 __smp_call_function_single(remsd->cpu, 3360 &remsd->csd, 0); 3361 remsd = next; 3362 } 3363 } else 3364 #endif 3365 local_irq_enable(); 3366 } 3367 3368 static int process_backlog(struct napi_struct *napi, int quota) 3369 { 3370 int work = 0; 3371 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 3372 3373 #ifdef CONFIG_RPS 3374 /* Check if we have pending ipi, its better to send them now, 3375 * not waiting net_rx_action() end. 
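 * (Editor's note, not in the original: rps_ipi_list holds remote CPUs'
 *  softnet_data queued by rps_ipi_queued(); net_rps_action_and_irq_enable()
 *  below sends them an IPI that runs rps_trigger_softirq() and schedules
 *  their backlog NAPI.)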
3376 */ 3377 if (sd->rps_ipi_list) { 3378 local_irq_disable(); 3379 net_rps_action_and_irq_enable(sd); 3380 } 3381 #endif 3382 napi->weight = weight_p; 3383 local_irq_disable(); 3384 while (work < quota) { 3385 struct sk_buff *skb; 3386 unsigned int qlen; 3387 3388 while ((skb = __skb_dequeue(&sd->process_queue))) { 3389 local_irq_enable(); 3390 __netif_receive_skb(skb); 3391 local_irq_disable(); 3392 input_queue_head_incr(sd); 3393 if (++work >= quota) { 3394 local_irq_enable(); 3395 return work; 3396 } 3397 } 3398 3399 rps_lock(sd); 3400 qlen = skb_queue_len(&sd->input_pkt_queue); 3401 if (qlen) 3402 skb_queue_splice_tail_init(&sd->input_pkt_queue, 3403 &sd->process_queue); 3404 3405 if (qlen < quota - work) { 3406 /* 3407 * Inline a custom version of __napi_complete(). 3408 * only current cpu owns and manipulates this napi, 3409 * and NAPI_STATE_SCHED is the only possible flag set on backlog. 3410 * we can use a plain write instead of clear_bit(), 3411 * and we dont need an smp_mb() memory barrier. 3412 */ 3413 list_del(&napi->poll_list); 3414 napi->state = 0; 3415 3416 quota = work + qlen; 3417 } 3418 rps_unlock(sd); 3419 } 3420 local_irq_enable(); 3421 3422 return work; 3423 } 3424 3425 /** 3426 * __napi_schedule - schedule for receive 3427 * @n: entry to schedule 3428 * 3429 * The entry's receive function will be scheduled to run 3430 */ 3431 void __napi_schedule(struct napi_struct *n) 3432 { 3433 unsigned long flags; 3434 3435 local_irq_save(flags); 3436 ____napi_schedule(&__get_cpu_var(softnet_data), n); 3437 local_irq_restore(flags); 3438 } 3439 EXPORT_SYMBOL(__napi_schedule); 3440 3441 void __napi_complete(struct napi_struct *n) 3442 { 3443 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 3444 BUG_ON(n->gro_list); 3445 3446 list_del(&n->poll_list); 3447 smp_mb__before_clear_bit(); 3448 clear_bit(NAPI_STATE_SCHED, &n->state); 3449 } 3450 EXPORT_SYMBOL(__napi_complete); 3451 3452 void napi_complete(struct napi_struct *n) 3453 { 3454 unsigned long flags; 3455 3456 /* 3457 * don't let napi dequeue from the cpu poll list 3458 * just in case its running on a different cpu 3459 */ 3460 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 3461 return; 3462 3463 napi_gro_flush(n); 3464 local_irq_save(flags); 3465 __napi_complete(n); 3466 local_irq_restore(flags); 3467 } 3468 EXPORT_SYMBOL(napi_complete); 3469 3470 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 3471 int (*poll)(struct napi_struct *, int), int weight) 3472 { 3473 INIT_LIST_HEAD(&napi->poll_list); 3474 napi->gro_count = 0; 3475 napi->gro_list = NULL; 3476 napi->skb = NULL; 3477 napi->poll = poll; 3478 napi->weight = weight; 3479 list_add(&napi->dev_list, &dev->napi_list); 3480 napi->dev = dev; 3481 #ifdef CONFIG_NETPOLL 3482 spin_lock_init(&napi->poll_lock); 3483 napi->poll_owner = -1; 3484 #endif 3485 set_bit(NAPI_STATE_SCHED, &napi->state); 3486 } 3487 EXPORT_SYMBOL(netif_napi_add); 3488 3489 void netif_napi_del(struct napi_struct *napi) 3490 { 3491 struct sk_buff *skb, *next; 3492 3493 list_del_init(&napi->dev_list); 3494 napi_free_frags(napi); 3495 3496 for (skb = napi->gro_list; skb; skb = next) { 3497 next = skb->next; 3498 skb->next = NULL; 3499 kfree_skb(skb); 3500 } 3501 3502 napi->gro_list = NULL; 3503 napi->gro_count = 0; 3504 } 3505 EXPORT_SYMBOL(netif_napi_del); 3506 3507 static void net_rx_action(struct softirq_action *h) 3508 { 3509 struct softnet_data *sd = &__get_cpu_var(softnet_data); 3510 unsigned long time_limit = jiffies + 2; 3511 int budget = netdev_budget; 3512 void *have; 3513 
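/* Editor's summary, not in the original source: each iteration below takes
 * one NAPI context off this CPU's poll_list and lets it consume up to its
 * weight; the loop bails out once the global budget is spent or about two
 * jiffies have elapsed, re-raising NET_RX_SOFTIRQ and counting the event
 * in time_squeeze.
 */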
local_irq_disable(); 3515 3516 while (!list_empty(&sd->poll_list)) { 3517 struct napi_struct *n; 3518 int work, weight; 3519 3520 /* If the softirq window is exhausted then punt. 3521 * Allow this to run for 2 jiffies, which allows 3522 * an average latency of 1.5/HZ. 3523 */ 3524 if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) 3525 goto softnet_break; 3526 3527 local_irq_enable(); 3528 3529 /* Even though interrupts have been re-enabled, this 3530 * access is safe because interrupts can only add new 3531 * entries to the tail of this list, and only ->poll() 3532 * calls can remove this head entry from the list. 3533 */ 3534 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); 3535 3536 have = netpoll_poll_lock(n); 3537 3538 weight = n->weight; 3539 3540 /* This NAPI_STATE_SCHED test is for avoiding a race 3541 * with netpoll's poll_napi(). Only the entity which 3542 * obtains the lock and sees NAPI_STATE_SCHED set will 3543 * actually make the ->poll() call. Therefore we avoid 3544 * accidentally calling ->poll() when NAPI is not scheduled. 3545 */ 3546 work = 0; 3547 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 3548 work = n->poll(n, weight); 3549 trace_napi_poll(n); 3550 } 3551 3552 WARN_ON_ONCE(work > weight); 3553 3554 budget -= work; 3555 3556 local_irq_disable(); 3557 3558 /* Drivers must not modify the NAPI state if they 3559 * consume the entire weight. In such cases this code 3560 * still "owns" the NAPI instance and therefore can 3561 * move the instance around on the list at-will. 3562 */ 3563 if (unlikely(work == weight)) { 3564 if (unlikely(napi_disable_pending(n))) { 3565 local_irq_enable(); 3566 napi_complete(n); 3567 local_irq_disable(); 3568 } else 3569 list_move_tail(&n->poll_list, &sd->poll_list); 3570 } 3571 3572 netpoll_poll_unlock(have); 3573 } 3574 out: 3575 net_rps_action_and_irq_enable(sd); 3576 3577 #ifdef CONFIG_NET_DMA 3578 /* 3579 * There may not be any more sk_buffs coming right now, so push 3580 * any pending DMA copies to hardware. 3581 */ 3582 dma_issue_pending_all(); 3583 #endif 3584 3585 return; 3586 3587 softnet_break: 3588 sd->time_squeeze++; 3589 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3590 goto out; 3591 } 3592 3593 static gifconf_func_t *gifconf_list[NPROTO]; 3594 3595 /** 3596 * register_gifconf - register a SIOCGIFCONF handler 3597 * @family: Address family 3598 * @gifconf: Function handler 3599 * 3600 * Register protocol-dependent address dumping routines. The handler 3601 * that is passed must not be freed or reused until it has been replaced 3602 * by another handler. 3603 */ 3604 int register_gifconf(unsigned int family, gifconf_func_t *gifconf) 3605 { 3606 if (family >= NPROTO) 3607 return -EINVAL; 3608 gifconf_list[family] = gifconf; 3609 return 0; 3610 } 3611 EXPORT_SYMBOL(register_gifconf); 3612 3613 3614 /* 3615 * Map an interface index to its name (SIOCGIFNAME) 3616 */ 3617 3618 /* 3619 * We need this ioctl for efficient implementation of the 3620 * if_indextoname() function required by the IPv6 API. Without 3621 * it, we would have to search all the interfaces to find a 3622 * match. --pb 3623 */ 3624 3625 static int dev_ifname(struct net *net, struct ifreq __user *arg) 3626 { 3627 struct net_device *dev; 3628 struct ifreq ifr; 3629 3630 /* 3631 * Fetch the caller's info block.
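 * (Editor's illustrative userspace counterpart, not in the original:
 *      struct ifreq ifr = { .ifr_ifindex = 2 };
 *      ioctl(sockfd, SIOCGIFNAME, &ifr);
 *  leaves the interface name, e.g. "eth0", in ifr.ifr_name, which is what
 *  if_indextoname() relies on.)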
3632 */ 3633 3634 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 3635 return -EFAULT; 3636 3637 rcu_read_lock(); 3638 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); 3639 if (!dev) { 3640 rcu_read_unlock(); 3641 return -ENODEV; 3642 } 3643 3644 strcpy(ifr.ifr_name, dev->name); 3645 rcu_read_unlock(); 3646 3647 if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) 3648 return -EFAULT; 3649 return 0; 3650 } 3651 3652 /* 3653 * Perform a SIOCGIFCONF call. This structure will change 3654 * size eventually, and there is nothing I can do about it. 3655 * Thus we will need a 'compatibility mode'. 3656 */ 3657 3658 static int dev_ifconf(struct net *net, char __user *arg) 3659 { 3660 struct ifconf ifc; 3661 struct net_device *dev; 3662 char __user *pos; 3663 int len; 3664 int total; 3665 int i; 3666 3667 /* 3668 * Fetch the caller's info block. 3669 */ 3670 3671 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 3672 return -EFAULT; 3673 3674 pos = ifc.ifc_buf; 3675 len = ifc.ifc_len; 3676 3677 /* 3678 * Loop over the interfaces, and write an info block for each. 3679 */ 3680 3681 total = 0; 3682 for_each_netdev(net, dev) { 3683 for (i = 0; i < NPROTO; i++) { 3684 if (gifconf_list[i]) { 3685 int done; 3686 if (!pos) 3687 done = gifconf_list[i](dev, NULL, 0); 3688 else 3689 done = gifconf_list[i](dev, pos + total, 3690 len - total); 3691 if (done < 0) 3692 return -EFAULT; 3693 total += done; 3694 } 3695 } 3696 } 3697 3698 /* 3699 * All done. Write the updated control block back to the caller. 3700 */ 3701 ifc.ifc_len = total; 3702 3703 /* 3704 * Both BSD and Solaris return 0 here, so we do too. 3705 */ 3706 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; 3707 } 3708 3709 #ifdef CONFIG_PROC_FS 3710 /* 3711 * This is invoked by the /proc filesystem handler to display a device 3712 * in detail. 3713 */ 3714 void *dev_seq_start(struct seq_file *seq, loff_t *pos) 3715 __acquires(RCU) 3716 { 3717 struct net *net = seq_file_net(seq); 3718 loff_t off; 3719 struct net_device *dev; 3720 3721 rcu_read_lock(); 3722 if (!*pos) 3723 return SEQ_START_TOKEN; 3724 3725 off = 1; 3726 for_each_netdev_rcu(net, dev) 3727 if (off++ == *pos) 3728 return dev; 3729 3730 return NULL; 3731 } 3732 3733 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3734 { 3735 struct net_device *dev = (v == SEQ_START_TOKEN) ? 
3736 first_net_device(seq_file_net(seq)) : 3737 next_net_device((struct net_device *)v); 3738 3739 ++*pos; 3740 return rcu_dereference(dev); 3741 } 3742 3743 void dev_seq_stop(struct seq_file *seq, void *v) 3744 __releases(RCU) 3745 { 3746 rcu_read_unlock(); 3747 } 3748 3749 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 3750 { 3751 struct rtnl_link_stats64 temp; 3752 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); 3753 3754 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " 3755 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", 3756 dev->name, stats->rx_bytes, stats->rx_packets, 3757 stats->rx_errors, 3758 stats->rx_dropped + stats->rx_missed_errors, 3759 stats->rx_fifo_errors, 3760 stats->rx_length_errors + stats->rx_over_errors + 3761 stats->rx_crc_errors + stats->rx_frame_errors, 3762 stats->rx_compressed, stats->multicast, 3763 stats->tx_bytes, stats->tx_packets, 3764 stats->tx_errors, stats->tx_dropped, 3765 stats->tx_fifo_errors, stats->collisions, 3766 stats->tx_carrier_errors + 3767 stats->tx_aborted_errors + 3768 stats->tx_window_errors + 3769 stats->tx_heartbeat_errors, 3770 stats->tx_compressed); 3771 } 3772 3773 /* 3774 * Called from the PROCfs module. This now uses the new arbitrary sized 3775 * /proc/net interface to create /proc/net/dev 3776 */ 3777 static int dev_seq_show(struct seq_file *seq, void *v) 3778 { 3779 if (v == SEQ_START_TOKEN) 3780 seq_puts(seq, "Inter-| Receive " 3781 " | Transmit\n" 3782 " face |bytes packets errs drop fifo frame " 3783 "compressed multicast|bytes packets errs " 3784 "drop fifo colls carrier compressed\n"); 3785 else 3786 dev_seq_printf_stats(seq, v); 3787 return 0; 3788 } 3789 3790 static struct softnet_data *softnet_get_online(loff_t *pos) 3791 { 3792 struct softnet_data *sd = NULL; 3793 3794 while (*pos < nr_cpu_ids) 3795 if (cpu_online(*pos)) { 3796 sd = &per_cpu(softnet_data, *pos); 3797 break; 3798 } else 3799 ++*pos; 3800 return sd; 3801 } 3802 3803 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 3804 { 3805 return softnet_get_online(pos); 3806 } 3807 3808 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3809 { 3810 ++*pos; 3811 return softnet_get_online(pos); 3812 } 3813 3814 static void softnet_seq_stop(struct seq_file *seq, void *v) 3815 { 3816 } 3817 3818 static int softnet_seq_show(struct seq_file *seq, void *v) 3819 { 3820 struct softnet_data *sd = v; 3821 3822 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 3823 sd->processed, sd->dropped, sd->time_squeeze, 0, 3824 0, 0, 0, 0, /* was fastroute */ 3825 sd->cpu_collision, sd->received_rps); 3826 return 0; 3827 } 3828 3829 static const struct seq_operations dev_seq_ops = { 3830 .start = dev_seq_start, 3831 .next = dev_seq_next, 3832 .stop = dev_seq_stop, 3833 .show = dev_seq_show, 3834 }; 3835 3836 static int dev_seq_open(struct inode *inode, struct file *file) 3837 { 3838 return seq_open_net(inode, file, &dev_seq_ops, 3839 sizeof(struct seq_net_private)); 3840 } 3841 3842 static const struct file_operations dev_seq_fops = { 3843 .owner = THIS_MODULE, 3844 .open = dev_seq_open, 3845 .read = seq_read, 3846 .llseek = seq_lseek, 3847 .release = seq_release_net, 3848 }; 3849 3850 static const struct seq_operations softnet_seq_ops = { 3851 .start = softnet_seq_start, 3852 .next = softnet_seq_next, 3853 .stop = softnet_seq_stop, 3854 .show = softnet_seq_show, 3855 }; 3856 3857 static int softnet_seq_open(struct inode *inode, struct file *file) 
3858 { 3859 return seq_open(file, &softnet_seq_ops); 3860 } 3861 3862 static const struct file_operations softnet_seq_fops = { 3863 .owner = THIS_MODULE, 3864 .open = softnet_seq_open, 3865 .read = seq_read, 3866 .llseek = seq_lseek, 3867 .release = seq_release, 3868 }; 3869 3870 static void *ptype_get_idx(loff_t pos) 3871 { 3872 struct packet_type *pt = NULL; 3873 loff_t i = 0; 3874 int t; 3875 3876 list_for_each_entry_rcu(pt, &ptype_all, list) { 3877 if (i == pos) 3878 return pt; 3879 ++i; 3880 } 3881 3882 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 3883 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 3884 if (i == pos) 3885 return pt; 3886 ++i; 3887 } 3888 } 3889 return NULL; 3890 } 3891 3892 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 3893 __acquires(RCU) 3894 { 3895 rcu_read_lock(); 3896 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; 3897 } 3898 3899 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3900 { 3901 struct packet_type *pt; 3902 struct list_head *nxt; 3903 int hash; 3904 3905 ++*pos; 3906 if (v == SEQ_START_TOKEN) 3907 return ptype_get_idx(0); 3908 3909 pt = v; 3910 nxt = pt->list.next; 3911 if (pt->type == htons(ETH_P_ALL)) { 3912 if (nxt != &ptype_all) 3913 goto found; 3914 hash = 0; 3915 nxt = ptype_base[0].next; 3916 } else 3917 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 3918 3919 while (nxt == &ptype_base[hash]) { 3920 if (++hash >= PTYPE_HASH_SIZE) 3921 return NULL; 3922 nxt = ptype_base[hash].next; 3923 } 3924 found: 3925 return list_entry(nxt, struct packet_type, list); 3926 } 3927 3928 static void ptype_seq_stop(struct seq_file *seq, void *v) 3929 __releases(RCU) 3930 { 3931 rcu_read_unlock(); 3932 } 3933 3934 static int ptype_seq_show(struct seq_file *seq, void *v) 3935 { 3936 struct packet_type *pt = v; 3937 3938 if (v == SEQ_START_TOKEN) 3939 seq_puts(seq, "Type Device Function\n"); 3940 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 3941 if (pt->type == htons(ETH_P_ALL)) 3942 seq_puts(seq, "ALL "); 3943 else 3944 seq_printf(seq, "%04x", ntohs(pt->type)); 3945 3946 seq_printf(seq, " %-8s %pF\n", 3947 pt->dev ? 
pt->dev->name : "", pt->func); 3948 } 3949 3950 return 0; 3951 } 3952 3953 static const struct seq_operations ptype_seq_ops = { 3954 .start = ptype_seq_start, 3955 .next = ptype_seq_next, 3956 .stop = ptype_seq_stop, 3957 .show = ptype_seq_show, 3958 }; 3959 3960 static int ptype_seq_open(struct inode *inode, struct file *file) 3961 { 3962 return seq_open_net(inode, file, &ptype_seq_ops, 3963 sizeof(struct seq_net_private)); 3964 } 3965 3966 static const struct file_operations ptype_seq_fops = { 3967 .owner = THIS_MODULE, 3968 .open = ptype_seq_open, 3969 .read = seq_read, 3970 .llseek = seq_lseek, 3971 .release = seq_release_net, 3972 }; 3973 3974 3975 static int __net_init dev_proc_net_init(struct net *net) 3976 { 3977 int rc = -ENOMEM; 3978 3979 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) 3980 goto out; 3981 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) 3982 goto out_dev; 3983 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) 3984 goto out_softnet; 3985 3986 if (wext_proc_init(net)) 3987 goto out_ptype; 3988 rc = 0; 3989 out: 3990 return rc; 3991 out_ptype: 3992 proc_net_remove(net, "ptype"); 3993 out_softnet: 3994 proc_net_remove(net, "softnet_stat"); 3995 out_dev: 3996 proc_net_remove(net, "dev"); 3997 goto out; 3998 } 3999 4000 static void __net_exit dev_proc_net_exit(struct net *net) 4001 { 4002 wext_proc_exit(net); 4003 4004 proc_net_remove(net, "ptype"); 4005 proc_net_remove(net, "softnet_stat"); 4006 proc_net_remove(net, "dev"); 4007 } 4008 4009 static struct pernet_operations __net_initdata dev_proc_ops = { 4010 .init = dev_proc_net_init, 4011 .exit = dev_proc_net_exit, 4012 }; 4013 4014 static int __init dev_proc_init(void) 4015 { 4016 return register_pernet_subsys(&dev_proc_ops); 4017 } 4018 #else 4019 #define dev_proc_init() 0 4020 #endif /* CONFIG_PROC_FS */ 4021 4022 4023 /** 4024 * netdev_set_master - set up master/slave pair 4025 * @slave: slave device 4026 * @master: new master device 4027 * 4028 * Changes the master device of the slave. Pass %NULL to break the 4029 * bonding. The caller must hold the RTNL semaphore. On a failure 4030 * a negative errno code is returned. On success the reference counts 4031 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 4032 * function returns zero. 4033 */ 4034 int netdev_set_master(struct net_device *slave, struct net_device *master) 4035 { 4036 struct net_device *old = slave->master; 4037 4038 ASSERT_RTNL(); 4039 4040 if (master) { 4041 if (old) 4042 return -EBUSY; 4043 dev_hold(master); 4044 } 4045 4046 slave->master = master; 4047 4048 if (old) { 4049 synchronize_net(); 4050 dev_put(old); 4051 } 4052 if (master) 4053 slave->flags |= IFF_SLAVE; 4054 else 4055 slave->flags &= ~IFF_SLAVE; 4056 4057 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 4058 return 0; 4059 } 4060 EXPORT_SYMBOL(netdev_set_master); 4061 4062 static void dev_change_rx_flags(struct net_device *dev, int flags) 4063 { 4064 const struct net_device_ops *ops = dev->netdev_ops; 4065 4066 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) 4067 ops->ndo_change_rx_flags(dev, flags); 4068 } 4069 4070 static int __dev_set_promiscuity(struct net_device *dev, int inc) 4071 { 4072 unsigned short old_flags = dev->flags; 4073 uid_t uid; 4074 gid_t gid; 4075 4076 ASSERT_RTNL(); 4077 4078 dev->flags |= IFF_PROMISC; 4079 dev->promiscuity += inc; 4080 if (dev->promiscuity == 0) { 4081 /* 4082 * Avoid overflow. 4083 * If inc causes overflow, untouch promisc and return error. 
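 *
 * (Editor's example, not in the original: if promiscuity is already at
 *  UINT_MAX, an inc of +1 wraps the counter to 0; the else branch below
 *  undoes the increment and returns -EOVERFLOW, whereas an inc of -1 that
 *  legitimately reaches 0 simply clears IFF_PROMISC.)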
4084 */ 4085 if (inc < 0) 4086 dev->flags &= ~IFF_PROMISC; 4087 else { 4088 dev->promiscuity -= inc; 4089 printk(KERN_WARNING "%s: promiscuity touches roof, " 4090 "set promiscuity failed, promiscuity feature " 4091 "of device might be broken.\n", dev->name); 4092 return -EOVERFLOW; 4093 } 4094 } 4095 if (dev->flags != old_flags) { 4096 printk(KERN_INFO "device %s %s promiscuous mode\n", 4097 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 4098 "left"); 4099 if (audit_enabled) { 4100 current_uid_gid(&uid, &gid); 4101 audit_log(current->audit_context, GFP_ATOMIC, 4102 AUDIT_ANOM_PROMISCUOUS, 4103 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 4104 dev->name, (dev->flags & IFF_PROMISC), 4105 (old_flags & IFF_PROMISC), 4106 audit_get_loginuid(current), 4107 uid, gid, 4108 audit_get_sessionid(current)); 4109 } 4110 4111 dev_change_rx_flags(dev, IFF_PROMISC); 4112 } 4113 return 0; 4114 } 4115 4116 /** 4117 * dev_set_promiscuity - update promiscuity count on a device 4118 * @dev: device 4119 * @inc: modifier 4120 * 4121 * Add or remove promiscuity from a device. While the count in the device 4122 * remains above zero the interface remains promiscuous. Once it hits zero 4123 * the device reverts back to normal filtering operation. A negative inc 4124 * value is used to drop promiscuity on the device. 4125 * Return 0 if successful or a negative errno code on error. 4126 */ 4127 int dev_set_promiscuity(struct net_device *dev, int inc) 4128 { 4129 unsigned short old_flags = dev->flags; 4130 int err; 4131 4132 err = __dev_set_promiscuity(dev, inc); 4133 if (err < 0) 4134 return err; 4135 if (dev->flags != old_flags) 4136 dev_set_rx_mode(dev); 4137 return err; 4138 } 4139 EXPORT_SYMBOL(dev_set_promiscuity); 4140 4141 /** 4142 * dev_set_allmulti - update allmulti count on a device 4143 * @dev: device 4144 * @inc: modifier 4145 * 4146 * Add or remove reception of all multicast frames to a device. While the 4147 * count in the device remains above zero the interface remains listening 4148 * to all interfaces. Once it hits zero the device reverts back to normal 4149 * filtering operation. A negative @inc value is used to drop the counter 4150 * when releasing a resource needing all multicasts. 4151 * Return 0 if successful or a negative errno code on error. 4152 */ 4153 4154 int dev_set_allmulti(struct net_device *dev, int inc) 4155 { 4156 unsigned short old_flags = dev->flags; 4157 4158 ASSERT_RTNL(); 4159 4160 dev->flags |= IFF_ALLMULTI; 4161 dev->allmulti += inc; 4162 if (dev->allmulti == 0) { 4163 /* 4164 * Avoid overflow. 4165 * If inc causes overflow, untouch allmulti and return error. 4166 */ 4167 if (inc < 0) 4168 dev->flags &= ~IFF_ALLMULTI; 4169 else { 4170 dev->allmulti -= inc; 4171 printk(KERN_WARNING "%s: allmulti touches roof, " 4172 "set allmulti failed, allmulti feature of " 4173 "device might be broken.\n", dev->name); 4174 return -EOVERFLOW; 4175 } 4176 } 4177 if (dev->flags ^ old_flags) { 4178 dev_change_rx_flags(dev, IFF_ALLMULTI); 4179 dev_set_rx_mode(dev); 4180 } 4181 return 0; 4182 } 4183 EXPORT_SYMBOL(dev_set_allmulti); 4184 4185 /* 4186 * Upload unicast and multicast address lists to device and 4187 * configure RX filtering. When the device doesn't support unicast 4188 * filtering it is put in promiscuous mode while unicast addresses 4189 * are present. 4190 */ 4191 void __dev_set_rx_mode(struct net_device *dev) 4192 { 4193 const struct net_device_ops *ops = dev->netdev_ops; 4194 4195 /* dev_open will call this function so the list will stay sane. 
*/ 4196 if (!(dev->flags&IFF_UP)) 4197 return; 4198 4199 if (!netif_device_present(dev)) 4200 return; 4201 4202 if (ops->ndo_set_rx_mode) 4203 ops->ndo_set_rx_mode(dev); 4204 else { 4205 /* Unicast addresses changes may only happen under the rtnl, 4206 * therefore calling __dev_set_promiscuity here is safe. 4207 */ 4208 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 4209 __dev_set_promiscuity(dev, 1); 4210 dev->uc_promisc = 1; 4211 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 4212 __dev_set_promiscuity(dev, -1); 4213 dev->uc_promisc = 0; 4214 } 4215 4216 if (ops->ndo_set_multicast_list) 4217 ops->ndo_set_multicast_list(dev); 4218 } 4219 } 4220 4221 void dev_set_rx_mode(struct net_device *dev) 4222 { 4223 netif_addr_lock_bh(dev); 4224 __dev_set_rx_mode(dev); 4225 netif_addr_unlock_bh(dev); 4226 } 4227 4228 /** 4229 * dev_get_flags - get flags reported to userspace 4230 * @dev: device 4231 * 4232 * Get the combination of flag bits exported through APIs to userspace. 4233 */ 4234 unsigned dev_get_flags(const struct net_device *dev) 4235 { 4236 unsigned flags; 4237 4238 flags = (dev->flags & ~(IFF_PROMISC | 4239 IFF_ALLMULTI | 4240 IFF_RUNNING | 4241 IFF_LOWER_UP | 4242 IFF_DORMANT)) | 4243 (dev->gflags & (IFF_PROMISC | 4244 IFF_ALLMULTI)); 4245 4246 if (netif_running(dev)) { 4247 if (netif_oper_up(dev)) 4248 flags |= IFF_RUNNING; 4249 if (netif_carrier_ok(dev)) 4250 flags |= IFF_LOWER_UP; 4251 if (netif_dormant(dev)) 4252 flags |= IFF_DORMANT; 4253 } 4254 4255 return flags; 4256 } 4257 EXPORT_SYMBOL(dev_get_flags); 4258 4259 int __dev_change_flags(struct net_device *dev, unsigned int flags) 4260 { 4261 int old_flags = dev->flags; 4262 int ret; 4263 4264 ASSERT_RTNL(); 4265 4266 /* 4267 * Set the flags on our device. 4268 */ 4269 4270 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 4271 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 4272 IFF_AUTOMEDIA)) | 4273 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 4274 IFF_ALLMULTI)); 4275 4276 /* 4277 * Load in the correct multicast list now the flags have changed. 4278 */ 4279 4280 if ((old_flags ^ flags) & IFF_MULTICAST) 4281 dev_change_rx_flags(dev, IFF_MULTICAST); 4282 4283 dev_set_rx_mode(dev); 4284 4285 /* 4286 * Have we downed the interface. We handle IFF_UP ourselves 4287 * according to user attempts to set it, rather than blindly 4288 * setting it. 4289 */ 4290 4291 ret = 0; 4292 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ 4293 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 4294 4295 if (!ret) 4296 dev_set_rx_mode(dev); 4297 } 4298 4299 if ((flags ^ dev->gflags) & IFF_PROMISC) { 4300 int inc = (flags & IFF_PROMISC) ? 1 : -1; 4301 4302 dev->gflags ^= IFF_PROMISC; 4303 dev_set_promiscuity(dev, inc); 4304 } 4305 4306 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 4307 is important. Some (broken) drivers set IFF_PROMISC, when 4308 IFF_ALLMULTI is requested not asking us and not reporting. 4309 */ 4310 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 4311 int inc = (flags & IFF_ALLMULTI) ? 
1 : -1; 4312 4313 dev->gflags ^= IFF_ALLMULTI; 4314 dev_set_allmulti(dev, inc); 4315 } 4316 4317 return ret; 4318 } 4319 4320 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) 4321 { 4322 unsigned int changes = dev->flags ^ old_flags; 4323 4324 if (changes & IFF_UP) { 4325 if (dev->flags & IFF_UP) 4326 call_netdevice_notifiers(NETDEV_UP, dev); 4327 else 4328 call_netdevice_notifiers(NETDEV_DOWN, dev); 4329 } 4330 4331 if (dev->flags & IFF_UP && 4332 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) 4333 call_netdevice_notifiers(NETDEV_CHANGE, dev); 4334 } 4335 4336 /** 4337 * dev_change_flags - change device settings 4338 * @dev: device 4339 * @flags: device state flags 4340 * 4341 * Change settings on device based state flags. The flags are 4342 * in the userspace exported format. 4343 */ 4344 int dev_change_flags(struct net_device *dev, unsigned flags) 4345 { 4346 int ret, changes; 4347 int old_flags = dev->flags; 4348 4349 ret = __dev_change_flags(dev, flags); 4350 if (ret < 0) 4351 return ret; 4352 4353 changes = old_flags ^ dev->flags; 4354 if (changes) 4355 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 4356 4357 __dev_notify_flags(dev, old_flags); 4358 return ret; 4359 } 4360 EXPORT_SYMBOL(dev_change_flags); 4361 4362 /** 4363 * dev_set_mtu - Change maximum transfer unit 4364 * @dev: device 4365 * @new_mtu: new transfer unit 4366 * 4367 * Change the maximum transfer size of the network device. 4368 */ 4369 int dev_set_mtu(struct net_device *dev, int new_mtu) 4370 { 4371 const struct net_device_ops *ops = dev->netdev_ops; 4372 int err; 4373 4374 if (new_mtu == dev->mtu) 4375 return 0; 4376 4377 /* MTU must be positive. */ 4378 if (new_mtu < 0) 4379 return -EINVAL; 4380 4381 if (!netif_device_present(dev)) 4382 return -ENODEV; 4383 4384 err = 0; 4385 if (ops->ndo_change_mtu) 4386 err = ops->ndo_change_mtu(dev, new_mtu); 4387 else 4388 dev->mtu = new_mtu; 4389 4390 if (!err && dev->flags & IFF_UP) 4391 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 4392 return err; 4393 } 4394 EXPORT_SYMBOL(dev_set_mtu); 4395 4396 /** 4397 * dev_set_mac_address - Change Media Access Control Address 4398 * @dev: device 4399 * @sa: new address 4400 * 4401 * Change the hardware (MAC) address of the device 4402 */ 4403 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 4404 { 4405 const struct net_device_ops *ops = dev->netdev_ops; 4406 int err; 4407 4408 if (!ops->ndo_set_mac_address) 4409 return -EOPNOTSUPP; 4410 if (sa->sa_family != dev->type) 4411 return -EINVAL; 4412 if (!netif_device_present(dev)) 4413 return -ENODEV; 4414 err = ops->ndo_set_mac_address(dev, sa); 4415 if (!err) 4416 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4417 return err; 4418 } 4419 EXPORT_SYMBOL(dev_set_mac_address); 4420 4421 /* 4422 * Perform the SIOCxIFxxx calls, inside rcu_read_lock() 4423 */ 4424 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) 4425 { 4426 int err; 4427 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); 4428 4429 if (!dev) 4430 return -ENODEV; 4431 4432 switch (cmd) { 4433 case SIOCGIFFLAGS: /* Get interface flags */ 4434 ifr->ifr_flags = (short) dev_get_flags(dev); 4435 return 0; 4436 4437 case SIOCGIFMETRIC: /* Get the metric on the interface 4438 (currently unused) */ 4439 ifr->ifr_metric = 0; 4440 return 0; 4441 4442 case SIOCGIFMTU: /* Get the MTU of a device */ 4443 ifr->ifr_mtu = dev->mtu; 4444 return 0; 4445 4446 case SIOCGIFHWADDR: 4447 if (!dev->addr_len) 4448 
memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); 4449 else 4450 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 4451 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4452 ifr->ifr_hwaddr.sa_family = dev->type; 4453 return 0; 4454 4455 case SIOCGIFSLAVE: 4456 err = -EINVAL; 4457 break; 4458 4459 case SIOCGIFMAP: 4460 ifr->ifr_map.mem_start = dev->mem_start; 4461 ifr->ifr_map.mem_end = dev->mem_end; 4462 ifr->ifr_map.base_addr = dev->base_addr; 4463 ifr->ifr_map.irq = dev->irq; 4464 ifr->ifr_map.dma = dev->dma; 4465 ifr->ifr_map.port = dev->if_port; 4466 return 0; 4467 4468 case SIOCGIFINDEX: 4469 ifr->ifr_ifindex = dev->ifindex; 4470 return 0; 4471 4472 case SIOCGIFTXQLEN: 4473 ifr->ifr_qlen = dev->tx_queue_len; 4474 return 0; 4475 4476 default: 4477 /* dev_ioctl() should ensure this case 4478 * is never reached 4479 */ 4480 WARN_ON(1); 4481 err = -EINVAL; 4482 break; 4483 4484 } 4485 return err; 4486 } 4487 4488 /* 4489 * Perform the SIOCxIFxxx calls, inside rtnl_lock() 4490 */ 4491 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 4492 { 4493 int err; 4494 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 4495 const struct net_device_ops *ops; 4496 4497 if (!dev) 4498 return -ENODEV; 4499 4500 ops = dev->netdev_ops; 4501 4502 switch (cmd) { 4503 case SIOCSIFFLAGS: /* Set interface flags */ 4504 return dev_change_flags(dev, ifr->ifr_flags); 4505 4506 case SIOCSIFMETRIC: /* Set the metric on the interface 4507 (currently unused) */ 4508 return -EOPNOTSUPP; 4509 4510 case SIOCSIFMTU: /* Set the MTU of a device */ 4511 return dev_set_mtu(dev, ifr->ifr_mtu); 4512 4513 case SIOCSIFHWADDR: 4514 return dev_set_mac_address(dev, &ifr->ifr_hwaddr); 4515 4516 case SIOCSIFHWBROADCAST: 4517 if (ifr->ifr_hwaddr.sa_family != dev->type) 4518 return -EINVAL; 4519 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 4520 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4521 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4522 return 0; 4523 4524 case SIOCSIFMAP: 4525 if (ops->ndo_set_config) { 4526 if (!netif_device_present(dev)) 4527 return -ENODEV; 4528 return ops->ndo_set_config(dev, &ifr->ifr_map); 4529 } 4530 return -EOPNOTSUPP; 4531 4532 case SIOCADDMULTI: 4533 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4534 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4535 return -EINVAL; 4536 if (!netif_device_present(dev)) 4537 return -ENODEV; 4538 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); 4539 4540 case SIOCDELMULTI: 4541 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4542 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4543 return -EINVAL; 4544 if (!netif_device_present(dev)) 4545 return -ENODEV; 4546 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); 4547 4548 case SIOCSIFTXQLEN: 4549 if (ifr->ifr_qlen < 0) 4550 return -EINVAL; 4551 dev->tx_queue_len = ifr->ifr_qlen; 4552 return 0; 4553 4554 case SIOCSIFNAME: 4555 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 4556 return dev_change_name(dev, ifr->ifr_newname); 4557 4558 /* 4559 * Unknown or private ioctl 4560 */ 4561 default: 4562 if ((cmd >= SIOCDEVPRIVATE && 4563 cmd <= SIOCDEVPRIVATE + 15) || 4564 cmd == SIOCBONDENSLAVE || 4565 cmd == SIOCBONDRELEASE || 4566 cmd == SIOCBONDSETHWADDR || 4567 cmd == SIOCBONDSLAVEINFOQUERY || 4568 cmd == SIOCBONDINFOQUERY || 4569 cmd == SIOCBONDCHANGEACTIVE || 4570 cmd == SIOCGMIIPHY || 4571 cmd == SIOCGMIIREG || 4572 cmd == SIOCSMIIREG || 4573 cmd == SIOCBRADDIF || 4574 cmd == SIOCBRDELIF || 4575 cmd == 
SIOCSHWTSTAMP || 4576 cmd == SIOCWANDEV) { 4577 err = -EOPNOTSUPP; 4578 if (ops->ndo_do_ioctl) { 4579 if (netif_device_present(dev)) 4580 err = ops->ndo_do_ioctl(dev, ifr, cmd); 4581 else 4582 err = -ENODEV; 4583 } 4584 } else 4585 err = -EINVAL; 4586 4587 } 4588 return err; 4589 } 4590 4591 /* 4592 * This function handles all "interface"-type I/O control requests. The actual 4593 * 'doing' part of this is dev_ifsioc above. 4594 */ 4595 4596 /** 4597 * dev_ioctl - network device ioctl 4598 * @net: the applicable net namespace 4599 * @cmd: command to issue 4600 * @arg: pointer to a struct ifreq in user space 4601 * 4602 * Issue ioctl functions to devices. This is normally called by the 4603 * user space syscall interfaces but can sometimes be useful for 4604 * other purposes. The return value is the return from the syscall if 4605 * positive or a negative errno code on error. 4606 */ 4607 4608 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) 4609 { 4610 struct ifreq ifr; 4611 int ret; 4612 char *colon; 4613 4614 /* One special case: SIOCGIFCONF takes ifconf argument 4615 and requires shared lock, because it sleeps writing 4616 to user space. 4617 */ 4618 4619 if (cmd == SIOCGIFCONF) { 4620 rtnl_lock(); 4621 ret = dev_ifconf(net, (char __user *) arg); 4622 rtnl_unlock(); 4623 return ret; 4624 } 4625 if (cmd == SIOCGIFNAME) 4626 return dev_ifname(net, (struct ifreq __user *)arg); 4627 4628 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 4629 return -EFAULT; 4630 4631 ifr.ifr_name[IFNAMSIZ-1] = 0; 4632 4633 colon = strchr(ifr.ifr_name, ':'); 4634 if (colon) 4635 *colon = 0; 4636 4637 /* 4638 * See which interface the caller is talking about. 4639 */ 4640 4641 switch (cmd) { 4642 /* 4643 * These ioctl calls: 4644 * - can be done by all. 4645 * - atomic and do not require locking. 4646 * - return a value 4647 */ 4648 case SIOCGIFFLAGS: 4649 case SIOCGIFMETRIC: 4650 case SIOCGIFMTU: 4651 case SIOCGIFHWADDR: 4652 case SIOCGIFSLAVE: 4653 case SIOCGIFMAP: 4654 case SIOCGIFINDEX: 4655 case SIOCGIFTXQLEN: 4656 dev_load(net, ifr.ifr_name); 4657 rcu_read_lock(); 4658 ret = dev_ifsioc_locked(net, &ifr, cmd); 4659 rcu_read_unlock(); 4660 if (!ret) { 4661 if (colon) 4662 *colon = ':'; 4663 if (copy_to_user(arg, &ifr, 4664 sizeof(struct ifreq))) 4665 ret = -EFAULT; 4666 } 4667 return ret; 4668 4669 case SIOCETHTOOL: 4670 dev_load(net, ifr.ifr_name); 4671 rtnl_lock(); 4672 ret = dev_ethtool(net, &ifr); 4673 rtnl_unlock(); 4674 if (!ret) { 4675 if (colon) 4676 *colon = ':'; 4677 if (copy_to_user(arg, &ifr, 4678 sizeof(struct ifreq))) 4679 ret = -EFAULT; 4680 } 4681 return ret; 4682 4683 /* 4684 * These ioctl calls: 4685 * - require superuser power. 4686 * - require strict serialization. 4687 * - return a value 4688 */ 4689 case SIOCGMIIPHY: 4690 case SIOCGMIIREG: 4691 case SIOCSIFNAME: 4692 if (!capable(CAP_NET_ADMIN)) 4693 return -EPERM; 4694 dev_load(net, ifr.ifr_name); 4695 rtnl_lock(); 4696 ret = dev_ifsioc(net, &ifr, cmd); 4697 rtnl_unlock(); 4698 if (!ret) { 4699 if (colon) 4700 *colon = ':'; 4701 if (copy_to_user(arg, &ifr, 4702 sizeof(struct ifreq))) 4703 ret = -EFAULT; 4704 } 4705 return ret; 4706 4707 /* 4708 * These ioctl calls: 4709 * - require superuser power. 4710 * - require strict serialization. 
4711 * - do not return a value 4712 */ 4713 case SIOCSIFFLAGS: 4714 case SIOCSIFMETRIC: 4715 case SIOCSIFMTU: 4716 case SIOCSIFMAP: 4717 case SIOCSIFHWADDR: 4718 case SIOCSIFSLAVE: 4719 case SIOCADDMULTI: 4720 case SIOCDELMULTI: 4721 case SIOCSIFHWBROADCAST: 4722 case SIOCSIFTXQLEN: 4723 case SIOCSMIIREG: 4724 case SIOCBONDENSLAVE: 4725 case SIOCBONDRELEASE: 4726 case SIOCBONDSETHWADDR: 4727 case SIOCBONDCHANGEACTIVE: 4728 case SIOCBRADDIF: 4729 case SIOCBRDELIF: 4730 case SIOCSHWTSTAMP: 4731 if (!capable(CAP_NET_ADMIN)) 4732 return -EPERM; 4733 /* fall through */ 4734 case SIOCBONDSLAVEINFOQUERY: 4735 case SIOCBONDINFOQUERY: 4736 dev_load(net, ifr.ifr_name); 4737 rtnl_lock(); 4738 ret = dev_ifsioc(net, &ifr, cmd); 4739 rtnl_unlock(); 4740 return ret; 4741 4742 case SIOCGIFMEM: 4743 /* Get the per device memory space. We can add this but 4744 * currently do not support it */ 4745 case SIOCSIFMEM: 4746 /* Set the per device memory buffer space. 4747 * Not applicable in our case */ 4748 case SIOCSIFLINK: 4749 return -EINVAL; 4750 4751 /* 4752 * Unknown or private ioctl. 4753 */ 4754 default: 4755 if (cmd == SIOCWANDEV || 4756 (cmd >= SIOCDEVPRIVATE && 4757 cmd <= SIOCDEVPRIVATE + 15)) { 4758 dev_load(net, ifr.ifr_name); 4759 rtnl_lock(); 4760 ret = dev_ifsioc(net, &ifr, cmd); 4761 rtnl_unlock(); 4762 if (!ret && copy_to_user(arg, &ifr, 4763 sizeof(struct ifreq))) 4764 ret = -EFAULT; 4765 return ret; 4766 } 4767 /* Take care of Wireless Extensions */ 4768 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 4769 return wext_handle_ioctl(net, &ifr, cmd, arg); 4770 return -EINVAL; 4771 } 4772 } 4773 4774 4775 /** 4776 * dev_new_index - allocate an ifindex 4777 * @net: the applicable net namespace 4778 * 4779 * Returns a suitable unique value for a new device interface 4780 * number. The caller must hold the rtnl semaphore or the 4781 * dev_base_lock to be sure it remains unique. 4782 */ 4783 static int dev_new_index(struct net *net) 4784 { 4785 static int ifindex; 4786 for (;;) { 4787 if (++ifindex <= 0) 4788 ifindex = 1; 4789 if (!__dev_get_by_index(net, ifindex)) 4790 return ifindex; 4791 } 4792 } 4793 4794 /* Delayed registration/unregisteration */ 4795 static LIST_HEAD(net_todo_list); 4796 4797 static void net_set_todo(struct net_device *dev) 4798 { 4799 list_add_tail(&dev->todo_list, &net_todo_list); 4800 } 4801 4802 static void rollback_registered_many(struct list_head *head) 4803 { 4804 struct net_device *dev, *tmp; 4805 4806 BUG_ON(dev_boot_phase); 4807 ASSERT_RTNL(); 4808 4809 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 4810 /* Some devices call without registering 4811 * for initialization unwind. Remove those 4812 * devices and proceed with the remaining. 4813 */ 4814 if (dev->reg_state == NETREG_UNINITIALIZED) { 4815 pr_debug("unregister_netdevice: device %s/%p never " 4816 "was registered\n", dev->name, dev); 4817 4818 WARN_ON(1); 4819 list_del(&dev->unreg_list); 4820 continue; 4821 } 4822 4823 BUG_ON(dev->reg_state != NETREG_REGISTERED); 4824 4825 /* If device is running, close it first. */ 4826 dev_close(dev); 4827 4828 /* And unlink it from device chain. */ 4829 unlist_netdevice(dev); 4830 4831 dev->reg_state = NETREG_UNREGISTERING; 4832 } 4833 4834 synchronize_net(); 4835 4836 list_for_each_entry(dev, head, unreg_list) { 4837 /* Shutdown queueing discipline. */ 4838 dev_shutdown(dev); 4839 4840 4841 /* Notify protocols, that we are about to destroy 4842 this device. They should clean all the things. 
4843 */ 4844 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4845 4846 if (!dev->rtnl_link_ops || 4847 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 4848 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); 4849 4850 /* 4851 * Flush the unicast and multicast chains 4852 */ 4853 dev_uc_flush(dev); 4854 dev_mc_flush(dev); 4855 4856 if (dev->netdev_ops->ndo_uninit) 4857 dev->netdev_ops->ndo_uninit(dev); 4858 4859 /* Notifier chain MUST detach us from master device. */ 4860 WARN_ON(dev->master); 4861 4862 /* Remove entries from kobject tree */ 4863 netdev_unregister_kobject(dev); 4864 } 4865 4866 /* Process any work delayed until the end of the batch */ 4867 dev = list_first_entry(head, struct net_device, unreg_list); 4868 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 4869 4870 synchronize_net(); 4871 4872 list_for_each_entry(dev, head, unreg_list) 4873 dev_put(dev); 4874 } 4875 4876 static void rollback_registered(struct net_device *dev) 4877 { 4878 LIST_HEAD(single); 4879 4880 list_add(&dev->unreg_list, &single); 4881 rollback_registered_many(&single); 4882 } 4883 4884 static void __netdev_init_queue_locks_one(struct net_device *dev, 4885 struct netdev_queue *dev_queue, 4886 void *_unused) 4887 { 4888 spin_lock_init(&dev_queue->_xmit_lock); 4889 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); 4890 dev_queue->xmit_lock_owner = -1; 4891 } 4892 4893 static void netdev_init_queue_locks(struct net_device *dev) 4894 { 4895 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 4896 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 4897 } 4898 4899 unsigned long netdev_fix_features(unsigned long features, const char *name) 4900 { 4901 /* Fix illegal SG+CSUM combinations. */ 4902 if ((features & NETIF_F_SG) && 4903 !(features & NETIF_F_ALL_CSUM)) { 4904 if (name) 4905 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " 4906 "checksum feature.\n", name); 4907 features &= ~NETIF_F_SG; 4908 } 4909 4910 /* TSO requires that SG is present as well. */ 4911 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { 4912 if (name) 4913 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " 4914 "SG feature.\n", name); 4915 features &= ~NETIF_F_TSO; 4916 } 4917 4918 if (features & NETIF_F_UFO) { 4919 if (!(features & NETIF_F_GEN_CSUM)) { 4920 if (name) 4921 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4922 "since no NETIF_F_HW_CSUM feature.\n", 4923 name); 4924 features &= ~NETIF_F_UFO; 4925 } 4926 4927 if (!(features & NETIF_F_SG)) { 4928 if (name) 4929 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4930 "since no NETIF_F_SG feature.\n", name); 4931 features &= ~NETIF_F_UFO; 4932 } 4933 } 4934 4935 return features; 4936 } 4937 EXPORT_SYMBOL(netdev_fix_features); 4938 4939 /** 4940 * netif_stacked_transfer_operstate - transfer operstate 4941 * @rootdev: the root or lower level device to transfer state from 4942 * @dev: the device to transfer operstate to 4943 * 4944 * Transfer operational state from root to device. This is normally 4945 * called when a stacking relationship exists between the root 4946 * device and the device(a leaf device). 
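 *
 * A minimal usage sketch (illustrative only, not taken from this file;
 * "lowerdev" and "vdev" are assumed names for the lower device and the
 * virtual device stacked on top of it). On a notifier event for the
 * lower device, a stacking driver might simply mirror its state:
 *
 *        netif_stacked_transfer_operstate(lowerdev, vdev);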
4947 */ 4948 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 4949 struct net_device *dev) 4950 { 4951 if (rootdev->operstate == IF_OPER_DORMANT) 4952 netif_dormant_on(dev); 4953 else 4954 netif_dormant_off(dev); 4955 4956 if (netif_carrier_ok(rootdev)) { 4957 if (!netif_carrier_ok(dev)) 4958 netif_carrier_on(dev); 4959 } else { 4960 if (netif_carrier_ok(dev)) 4961 netif_carrier_off(dev); 4962 } 4963 } 4964 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 4965 4966 /** 4967 * register_netdevice - register a network device 4968 * @dev: device to register 4969 * 4970 * Take a completed network device structure and add it to the kernel 4971 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4972 * chain. 0 is returned on success. A negative errno code is returned 4973 * on a failure to set up the device, or if the name is a duplicate. 4974 * 4975 * Callers must hold the rtnl semaphore. You may want 4976 * register_netdev() instead of this. 4977 * 4978 * BUGS: 4979 * The locking appears insufficient to guarantee two parallel registers 4980 * will not get the same name. 4981 */ 4982 4983 int register_netdevice(struct net_device *dev) 4984 { 4985 int ret; 4986 struct net *net = dev_net(dev); 4987 4988 BUG_ON(dev_boot_phase); 4989 ASSERT_RTNL(); 4990 4991 might_sleep(); 4992 4993 /* When net_device's are persistent, this will be fatal. */ 4994 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 4995 BUG_ON(!net); 4996 4997 spin_lock_init(&dev->addr_list_lock); 4998 netdev_set_addr_lockdep_class(dev); 4999 netdev_init_queue_locks(dev); 5000 5001 dev->iflink = -1; 5002 5003 #ifdef CONFIG_RPS 5004 if (!dev->num_rx_queues) { 5005 /* 5006 * Allocate a single RX queue if driver never called 5007 * alloc_netdev_mq 5008 */ 5009 5010 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); 5011 if (!dev->_rx) { 5012 ret = -ENOMEM; 5013 goto out; 5014 } 5015 5016 dev->_rx->first = dev->_rx; 5017 atomic_set(&dev->_rx->count, 1); 5018 dev->num_rx_queues = 1; 5019 } 5020 #endif 5021 /* Init, if this function is available */ 5022 if (dev->netdev_ops->ndo_init) { 5023 ret = dev->netdev_ops->ndo_init(dev); 5024 if (ret) { 5025 if (ret > 0) 5026 ret = -EIO; 5027 goto out; 5028 } 5029 } 5030 5031 ret = dev_get_valid_name(dev, dev->name, 0); 5032 if (ret) 5033 goto err_uninit; 5034 5035 dev->ifindex = dev_new_index(net); 5036 if (dev->iflink == -1) 5037 dev->iflink = dev->ifindex; 5038 5039 /* Fix illegal checksum combinations */ 5040 if ((dev->features & NETIF_F_HW_CSUM) && 5041 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5042 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 5043 dev->name); 5044 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 5045 } 5046 5047 if ((dev->features & NETIF_F_NO_CSUM) && 5048 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5049 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 5050 dev->name); 5051 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 5052 } 5053 5054 dev->features = netdev_fix_features(dev->features, dev->name); 5055 5056 /* Enable software GSO if SG is supported. 
*/ 5057 if (dev->features & NETIF_F_SG) 5058 dev->features |= NETIF_F_GSO; 5059 5060 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5061 ret = notifier_to_errno(ret); 5062 if (ret) 5063 goto err_uninit; 5064 5065 ret = netdev_register_kobject(dev); 5066 if (ret) 5067 goto err_uninit; 5068 dev->reg_state = NETREG_REGISTERED; 5069 5070 /* 5071 * Default initial state at registry is that the 5072 * device is present. 5073 */ 5074 5075 set_bit(__LINK_STATE_PRESENT, &dev->state); 5076 5077 dev_init_scheduler(dev); 5078 dev_hold(dev); 5079 list_netdevice(dev); 5080 5081 /* Notify protocols, that a new device appeared. */ 5082 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 5083 ret = notifier_to_errno(ret); 5084 if (ret) { 5085 rollback_registered(dev); 5086 dev->reg_state = NETREG_UNREGISTERED; 5087 } 5088 /* 5089 * Prevent userspace races by waiting until the network 5090 * device is fully setup before sending notifications. 5091 */ 5092 if (!dev->rtnl_link_ops || 5093 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 5094 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); 5095 5096 out: 5097 return ret; 5098 5099 err_uninit: 5100 if (dev->netdev_ops->ndo_uninit) 5101 dev->netdev_ops->ndo_uninit(dev); 5102 goto out; 5103 } 5104 EXPORT_SYMBOL(register_netdevice); 5105 5106 /** 5107 * init_dummy_netdev - init a dummy network device for NAPI 5108 * @dev: device to init 5109 * 5110 * This takes a network device structure and initialize the minimum 5111 * amount of fields so it can be used to schedule NAPI polls without 5112 * registering a full blown interface. This is to be used by drivers 5113 * that need to tie several hardware interfaces to a single NAPI 5114 * poll scheduler due to HW limitations. 5115 */ 5116 int init_dummy_netdev(struct net_device *dev) 5117 { 5118 /* Clear everything. Note we don't initialize spinlocks 5119 * are they aren't supposed to be taken by any of the 5120 * NAPI code and this dummy netdev is supposed to be 5121 * only ever used for NAPI polls 5122 */ 5123 memset(dev, 0, sizeof(struct net_device)); 5124 5125 /* make sure we BUG if trying to hit standard 5126 * register/unregister code path 5127 */ 5128 dev->reg_state = NETREG_DUMMY; 5129 5130 /* initialize the ref count */ 5131 atomic_set(&dev->refcnt, 1); 5132 5133 /* NAPI wants this */ 5134 INIT_LIST_HEAD(&dev->napi_list); 5135 5136 /* a dummy interface is started by default */ 5137 set_bit(__LINK_STATE_PRESENT, &dev->state); 5138 set_bit(__LINK_STATE_START, &dev->state); 5139 5140 return 0; 5141 } 5142 EXPORT_SYMBOL_GPL(init_dummy_netdev); 5143 5144 5145 /** 5146 * register_netdev - register a network device 5147 * @dev: device to register 5148 * 5149 * Take a completed network device structure and add it to the kernel 5150 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 5151 * chain. 0 is returned on success. A negative errno code is returned 5152 * on a failure to set up the device, or if the name is a duplicate. 5153 * 5154 * This is a wrapper around register_netdevice that takes the rtnl semaphore 5155 * and expands the device name if you passed a format string to 5156 * alloc_netdev. 5157 */ 5158 int register_netdev(struct net_device *dev) 5159 { 5160 int err; 5161 5162 rtnl_lock(); 5163 5164 /* 5165 * If the name is a format string the caller wants us to do a 5166 * name allocation. 
5167 */ 5168 if (strchr(dev->name, '%')) { 5169 err = dev_alloc_name(dev, dev->name); 5170 if (err < 0) 5171 goto out; 5172 } 5173 5174 err = register_netdevice(dev); 5175 out: 5176 rtnl_unlock(); 5177 return err; 5178 } 5179 EXPORT_SYMBOL(register_netdev); 5180 5181 /* 5182 * netdev_wait_allrefs - wait until all references are gone. 5183 * 5184 * This is called when unregistering network devices. 5185 * 5186 * Any protocol or device that holds a reference should register 5187 * for netdevice notification, and cleanup and put back the 5188 * reference if they receive an UNREGISTER event. 5189 * We can get stuck here if buggy protocols don't correctly 5190 * call dev_put. 5191 */ 5192 static void netdev_wait_allrefs(struct net_device *dev) 5193 { 5194 unsigned long rebroadcast_time, warning_time; 5195 5196 linkwatch_forget_dev(dev); 5197 5198 rebroadcast_time = warning_time = jiffies; 5199 while (atomic_read(&dev->refcnt) != 0) { 5200 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5201 rtnl_lock(); 5202 5203 /* Rebroadcast unregister notification */ 5204 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5205 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users 5206 * should have already handle it the first time */ 5207 5208 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 5209 &dev->state)) { 5210 /* We must not have linkwatch events 5211 * pending on unregister. If this 5212 * happens, we simply run the queue 5213 * unscheduled, resulting in a noop 5214 * for this device. 5215 */ 5216 linkwatch_run_queue(); 5217 } 5218 5219 __rtnl_unlock(); 5220 5221 rebroadcast_time = jiffies; 5222 } 5223 5224 msleep(250); 5225 5226 if (time_after(jiffies, warning_time + 10 * HZ)) { 5227 printk(KERN_EMERG "unregister_netdevice: " 5228 "waiting for %s to become free. Usage " 5229 "count = %d\n", 5230 dev->name, atomic_read(&dev->refcnt)); 5231 warning_time = jiffies; 5232 } 5233 } 5234 } 5235 5236 /* The sequence is: 5237 * 5238 * rtnl_lock(); 5239 * ... 5240 * register_netdevice(x1); 5241 * register_netdevice(x2); 5242 * ... 5243 * unregister_netdevice(y1); 5244 * unregister_netdevice(y2); 5245 * ... 5246 * rtnl_unlock(); 5247 * free_netdev(y1); 5248 * free_netdev(y2); 5249 * 5250 * We are invoked by rtnl_unlock(). 5251 * This allows us to deal with problems: 5252 * 1) We can delete sysfs objects which invoke hotplug 5253 * without deadlocking with linkwatch via keventd. 5254 * 2) Since we run with the RTNL semaphore not held, we can sleep 5255 * safely in order to wait for the netdev refcnt to drop to zero. 5256 * 5257 * We must not return until all unregister events added during 5258 * the interval the lock was held have been completed. 
5259 */ 5260 void netdev_run_todo(void) 5261 { 5262 struct list_head list; 5263 5264 /* Snapshot list, allow later requests */ 5265 list_replace_init(&net_todo_list, &list); 5266 5267 __rtnl_unlock(); 5268 5269 while (!list_empty(&list)) { 5270 struct net_device *dev 5271 = list_first_entry(&list, struct net_device, todo_list); 5272 list_del(&dev->todo_list); 5273 5274 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 5275 printk(KERN_ERR "network todo '%s' but state %d\n", 5276 dev->name, dev->reg_state); 5277 dump_stack(); 5278 continue; 5279 } 5280 5281 dev->reg_state = NETREG_UNREGISTERED; 5282 5283 on_each_cpu(flush_backlog, dev, 1); 5284 5285 netdev_wait_allrefs(dev); 5286 5287 /* paranoia */ 5288 BUG_ON(atomic_read(&dev->refcnt)); 5289 WARN_ON(dev->ip_ptr); 5290 WARN_ON(dev->ip6_ptr); 5291 WARN_ON(dev->dn_ptr); 5292 5293 if (dev->destructor) 5294 dev->destructor(dev); 5295 5296 /* Free network device */ 5297 kobject_put(&dev->dev.kobj); 5298 } 5299 } 5300 5301 /** 5302 * dev_txq_stats_fold - fold tx_queues stats 5303 * @dev: device to get statistics from 5304 * @stats: struct rtnl_link_stats64 to hold results 5305 */ 5306 void dev_txq_stats_fold(const struct net_device *dev, 5307 struct rtnl_link_stats64 *stats) 5308 { 5309 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; 5310 unsigned int i; 5311 struct netdev_queue *txq; 5312 5313 for (i = 0; i < dev->num_tx_queues; i++) { 5314 txq = netdev_get_tx_queue(dev, i); 5315 spin_lock_bh(&txq->_xmit_lock); 5316 tx_bytes += txq->tx_bytes; 5317 tx_packets += txq->tx_packets; 5318 tx_dropped += txq->tx_dropped; 5319 spin_unlock_bh(&txq->_xmit_lock); 5320 } 5321 if (tx_bytes || tx_packets || tx_dropped) { 5322 stats->tx_bytes = tx_bytes; 5323 stats->tx_packets = tx_packets; 5324 stats->tx_dropped = tx_dropped; 5325 } 5326 } 5327 EXPORT_SYMBOL(dev_txq_stats_fold); 5328 5329 /* Convert net_device_stats to rtnl_link_stats64. They have the same 5330 * fields in the same order, with only the type differing. 5331 */ 5332 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 5333 const struct net_device_stats *netdev_stats) 5334 { 5335 #if BITS_PER_LONG == 64 5336 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 5337 memcpy(stats64, netdev_stats, sizeof(*stats64)); 5338 #else 5339 size_t i, n = sizeof(*stats64) / sizeof(u64); 5340 const unsigned long *src = (const unsigned long *)netdev_stats; 5341 u64 *dst = (u64 *)stats64; 5342 5343 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 5344 sizeof(*stats64) / sizeof(u64)); 5345 for (i = 0; i < n; i++) 5346 dst[i] = src[i]; 5347 #endif 5348 } 5349 5350 /** 5351 * dev_get_stats - get network device statistics 5352 * @dev: device to get statistics from 5353 * @storage: place to store stats 5354 * 5355 * Get network statistics from device. Return @storage. 5356 * The device driver may provide its own method by setting 5357 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 5358 * otherwise the internal statistics structure is used. 
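 *
 * A minimal caller-side sketch (illustrative only; "stats" is just a
 * local buffer chosen for this example):
 *
 *        struct rtnl_link_stats64 stats;
 *
 *        dev_get_stats(dev, &stats);
 *        pr_info("%s: %llu packets received\n", dev->name,
 *                (unsigned long long)stats.rx_packets);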
5359 */ 5360 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 5361 struct rtnl_link_stats64 *storage) 5362 { 5363 const struct net_device_ops *ops = dev->netdev_ops; 5364 5365 if (ops->ndo_get_stats64) { 5366 memset(storage, 0, sizeof(*storage)); 5367 return ops->ndo_get_stats64(dev, storage); 5368 } 5369 if (ops->ndo_get_stats) { 5370 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 5371 return storage; 5372 } 5373 netdev_stats_to_stats64(storage, &dev->stats); 5374 dev_txq_stats_fold(dev, storage); 5375 return storage; 5376 } 5377 EXPORT_SYMBOL(dev_get_stats); 5378 5379 static void netdev_init_one_queue(struct net_device *dev, 5380 struct netdev_queue *queue, 5381 void *_unused) 5382 { 5383 queue->dev = dev; 5384 } 5385 5386 static void netdev_init_queues(struct net_device *dev) 5387 { 5388 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 5389 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5390 spin_lock_init(&dev->tx_global_lock); 5391 } 5392 5393 /** 5394 * alloc_netdev_mq - allocate network device 5395 * @sizeof_priv: size of private data to allocate space for 5396 * @name: device name format string 5397 * @setup: callback to initialize device 5398 * @queue_count: the number of subqueues to allocate 5399 * 5400 * Allocates a struct net_device with private data area for driver use 5401 * and performs basic initialization. Also allocates subquue structs 5402 * for each queue on the device at the end of the netdevice. 5403 */ 5404 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 5405 void (*setup)(struct net_device *), unsigned int queue_count) 5406 { 5407 struct netdev_queue *tx; 5408 struct net_device *dev; 5409 size_t alloc_size; 5410 struct net_device *p; 5411 #ifdef CONFIG_RPS 5412 struct netdev_rx_queue *rx; 5413 int i; 5414 #endif 5415 5416 BUG_ON(strlen(name) >= sizeof(dev->name)); 5417 5418 alloc_size = sizeof(struct net_device); 5419 if (sizeof_priv) { 5420 /* ensure 32-byte alignment of private area */ 5421 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 5422 alloc_size += sizeof_priv; 5423 } 5424 /* ensure 32-byte alignment of whole construct */ 5425 alloc_size += NETDEV_ALIGN - 1; 5426 5427 p = kzalloc(alloc_size, GFP_KERNEL); 5428 if (!p) { 5429 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); 5430 return NULL; 5431 } 5432 5433 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); 5434 if (!tx) { 5435 printk(KERN_ERR "alloc_netdev: Unable to allocate " 5436 "tx qdiscs.\n"); 5437 goto free_p; 5438 } 5439 5440 #ifdef CONFIG_RPS 5441 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); 5442 if (!rx) { 5443 printk(KERN_ERR "alloc_netdev: Unable to allocate " 5444 "rx queues.\n"); 5445 goto free_tx; 5446 } 5447 5448 atomic_set(&rx->count, queue_count); 5449 5450 /* 5451 * Set a pointer to first element in the array which holds the 5452 * reference count. 
5453 */ 5454 for (i = 0; i < queue_count; i++) 5455 rx[i].first = rx; 5456 #endif 5457 5458 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5459 dev->padded = (char *)dev - (char *)p; 5460 5461 if (dev_addr_init(dev)) 5462 goto free_rx; 5463 5464 dev_mc_init(dev); 5465 dev_uc_init(dev); 5466 5467 dev_net_set(dev, &init_net); 5468 5469 dev->_tx = tx; 5470 dev->num_tx_queues = queue_count; 5471 dev->real_num_tx_queues = queue_count; 5472 5473 #ifdef CONFIG_RPS 5474 dev->_rx = rx; 5475 dev->num_rx_queues = queue_count; 5476 #endif 5477 5478 dev->gso_max_size = GSO_MAX_SIZE; 5479 5480 netdev_init_queues(dev); 5481 5482 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); 5483 dev->ethtool_ntuple_list.count = 0; 5484 INIT_LIST_HEAD(&dev->napi_list); 5485 INIT_LIST_HEAD(&dev->unreg_list); 5486 INIT_LIST_HEAD(&dev->link_watch_list); 5487 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5488 setup(dev); 5489 strcpy(dev->name, name); 5490 return dev; 5491 5492 free_rx: 5493 #ifdef CONFIG_RPS 5494 kfree(rx); 5495 free_tx: 5496 #endif 5497 kfree(tx); 5498 free_p: 5499 kfree(p); 5500 return NULL; 5501 } 5502 EXPORT_SYMBOL(alloc_netdev_mq); 5503 5504 /** 5505 * free_netdev - free network device 5506 * @dev: device 5507 * 5508 * This function does the last stage of destroying an allocated device 5509 * interface. The reference to the device object is released. 5510 * If this is the last reference then it will be freed. 5511 */ 5512 void free_netdev(struct net_device *dev) 5513 { 5514 struct napi_struct *p, *n; 5515 5516 release_net(dev_net(dev)); 5517 5518 kfree(dev->_tx); 5519 5520 /* Flush device addresses */ 5521 dev_addr_flush(dev); 5522 5523 /* Clear ethtool n-tuple list */ 5524 ethtool_ntuple_flush(dev); 5525 5526 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5527 netif_napi_del(p); 5528 5529 /* Compatibility with error handling in drivers */ 5530 if (dev->reg_state == NETREG_UNINITIALIZED) { 5531 kfree((char *)dev - dev->padded); 5532 return; 5533 } 5534 5535 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 5536 dev->reg_state = NETREG_RELEASED; 5537 5538 /* will free via device release */ 5539 put_device(&dev->dev); 5540 } 5541 EXPORT_SYMBOL(free_netdev); 5542 5543 /** 5544 * synchronize_net - Synchronize with packet receive processing 5545 * 5546 * Wait for packets currently being received to be done. 5547 * Does not block later packets from starting. 5548 */ 5549 void synchronize_net(void) 5550 { 5551 might_sleep(); 5552 synchronize_rcu(); 5553 } 5554 EXPORT_SYMBOL(synchronize_net); 5555 5556 /** 5557 * unregister_netdevice_queue - remove device from the kernel 5558 * @dev: device 5559 * @head: list 5560 * 5561 * This function shuts down a device interface and removes it 5562 * from the kernel tables. 5563 * If head not NULL, device is queued to be unregistered later. 5564 * 5565 * Callers must hold the rtnl semaphore. You may want 5566 * unregister_netdev() instead of this. 
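 *
 * A minimal sketch of batching several unregistrations under a single
 * rtnl lock/unlock cycle, mirroring the pattern used further down in
 * this file (illustrative only; "dev1" and "dev2" are assumed names):
 *
 *        LIST_HEAD(kill_list);
 *
 *        unregister_netdevice_queue(dev1, &kill_list);
 *        unregister_netdevice_queue(dev2, &kill_list);
 *        unregister_netdevice_many(&kill_list);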
5567  */
5568 
5569 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5570 {
5571     ASSERT_RTNL();
5572 
5573     if (head) {
5574         list_move_tail(&dev->unreg_list, head);
5575     } else {
5576         rollback_registered(dev);
5577         /* Finish processing unregister after unlock */
5578         net_set_todo(dev);
5579     }
5580 }
5581 EXPORT_SYMBOL(unregister_netdevice_queue);
5582 
5583 /**
5584  * unregister_netdevice_many - unregister many devices
5585  * @head: list of devices
5586  */
5587 void unregister_netdevice_many(struct list_head *head)
5588 {
5589     struct net_device *dev;
5590 
5591     if (!list_empty(head)) {
5592         rollback_registered_many(head);
5593         list_for_each_entry(dev, head, unreg_list)
5594             net_set_todo(dev);
5595     }
5596 }
5597 EXPORT_SYMBOL(unregister_netdevice_many);
5598 
5599 /**
5600  * unregister_netdev - remove device from the kernel
5601  * @dev: device
5602  *
5603  * This function shuts down a device interface and removes it
5604  * from the kernel tables.
5605  *
5606  * This is just a wrapper for unregister_netdevice that takes
5607  * the rtnl semaphore. In general you want to use this and not
5608  * unregister_netdevice.
5609  */
5610 void unregister_netdev(struct net_device *dev)
5611 {
5612     rtnl_lock();
5613     unregister_netdevice(dev);
5614     rtnl_unlock();
5615 }
5616 EXPORT_SYMBOL(unregister_netdev);
5617 
5618 /**
5619  * dev_change_net_namespace - move device to a different network namespace
5620  * @dev: device
5621  * @net: network namespace
5622  * @pat: If not NULL name pattern to try if the current device name
5623  * is already taken in the destination network namespace.
5624  *
5625  * This function shuts down a device interface and moves it
5626  * to a new network namespace. On success 0 is returned, on
5627  * a failure a negative errno code is returned.
5628  *
5629  * Callers must hold the rtnl semaphore.
5630  */
5631 
5632 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5633 {
5634     int err;
5635 
5636     ASSERT_RTNL();
5637 
5638     /* Don't allow namespace local devices to be moved. */
5639     err = -EINVAL;
5640     if (dev->features & NETIF_F_NETNS_LOCAL)
5641         goto out;
5642 
5643     /* Ensure the device has been registered */
5644     err = -EINVAL;
5645     if (dev->reg_state != NETREG_REGISTERED)
5646         goto out;
5647 
5648     /* Get out if there is nothing to do */
5649     err = 0;
5650     if (net_eq(dev_net(dev), net))
5651         goto out;
5652 
5653     /* Pick the destination device name, and ensure
5654      * we can use it in the destination network namespace.
5655      */
5656     err = -EEXIST;
5657     if (__dev_get_by_name(net, dev->name)) {
5658         /* We get here if we can't use the current device name */
5659         if (!pat)
5660             goto out;
5661         if (dev_get_valid_name(dev, pat, 1))
5662             goto out;
5663     }
5664 
5665     /*
5666      * And now a mini version of register_netdevice/unregister_netdevice.
5667      */
5668 
5669     /* If device is running close it first. */
5670     dev_close(dev);
5671 
5672     /* And unlink it from device chain */
5673     err = -ENODEV;
5674     unlist_netdevice(dev);
5675 
5676     synchronize_net();
5677 
5678     /* Shutdown queueing discipline. */
5679     dev_shutdown(dev);
5680 
5681     /* Notify protocols, that we are about to destroy
5682        this device. They should clean all the things.
5683 */ 5684 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5685 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 5686 5687 /* 5688 * Flush the unicast and multicast chains 5689 */ 5690 dev_uc_flush(dev); 5691 dev_mc_flush(dev); 5692 5693 /* Actually switch the network namespace */ 5694 dev_net_set(dev, net); 5695 5696 /* If there is an ifindex conflict assign a new one */ 5697 if (__dev_get_by_index(net, dev->ifindex)) { 5698 int iflink = (dev->iflink == dev->ifindex); 5699 dev->ifindex = dev_new_index(net); 5700 if (iflink) 5701 dev->iflink = dev->ifindex; 5702 } 5703 5704 /* Fixup kobjects */ 5705 err = device_rename(&dev->dev, dev->name); 5706 WARN_ON(err); 5707 5708 /* Add the device back in the hashes */ 5709 list_netdevice(dev); 5710 5711 /* Notify protocols, that a new device appeared. */ 5712 call_netdevice_notifiers(NETDEV_REGISTER, dev); 5713 5714 /* 5715 * Prevent userspace races by waiting until the network 5716 * device is fully setup before sending notifications. 5717 */ 5718 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); 5719 5720 synchronize_net(); 5721 err = 0; 5722 out: 5723 return err; 5724 } 5725 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 5726 5727 static int dev_cpu_callback(struct notifier_block *nfb, 5728 unsigned long action, 5729 void *ocpu) 5730 { 5731 struct sk_buff **list_skb; 5732 struct sk_buff *skb; 5733 unsigned int cpu, oldcpu = (unsigned long)ocpu; 5734 struct softnet_data *sd, *oldsd; 5735 5736 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 5737 return NOTIFY_OK; 5738 5739 local_irq_disable(); 5740 cpu = smp_processor_id(); 5741 sd = &per_cpu(softnet_data, cpu); 5742 oldsd = &per_cpu(softnet_data, oldcpu); 5743 5744 /* Find end of our completion_queue. */ 5745 list_skb = &sd->completion_queue; 5746 while (*list_skb) 5747 list_skb = &(*list_skb)->next; 5748 /* Append completion queue from offline CPU. */ 5749 *list_skb = oldsd->completion_queue; 5750 oldsd->completion_queue = NULL; 5751 5752 /* Append output queue from offline CPU. */ 5753 if (oldsd->output_queue) { 5754 *sd->output_queue_tailp = oldsd->output_queue; 5755 sd->output_queue_tailp = oldsd->output_queue_tailp; 5756 oldsd->output_queue = NULL; 5757 oldsd->output_queue_tailp = &oldsd->output_queue; 5758 } 5759 5760 raise_softirq_irqoff(NET_TX_SOFTIRQ); 5761 local_irq_enable(); 5762 5763 /* Process offline CPU's input_pkt_queue */ 5764 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 5765 netif_rx(skb); 5766 input_queue_head_incr(oldsd); 5767 } 5768 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { 5769 netif_rx(skb); 5770 input_queue_head_incr(oldsd); 5771 } 5772 5773 return NOTIFY_OK; 5774 } 5775 5776 5777 /** 5778 * netdev_increment_features - increment feature set by one 5779 * @all: current feature set 5780 * @one: new feature set 5781 * @mask: mask feature set 5782 * 5783 * Computes a new feature set after adding a device with feature set 5784 * @one to the master device with current feature set @all. Will not 5785 * enable anything that is off in @mask. Returns the new feature set. 5786 */ 5787 unsigned long netdev_increment_features(unsigned long all, unsigned long one, 5788 unsigned long mask) 5789 { 5790 /* If device needs checksumming, downgrade to it. */ 5791 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 5792 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); 5793 else if (mask & NETIF_F_ALL_CSUM) { 5794 /* If one device supports v4/v6 checksumming, set for all. 
*/ 5795 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && 5796 !(all & NETIF_F_GEN_CSUM)) { 5797 all &= ~NETIF_F_ALL_CSUM; 5798 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); 5799 } 5800 5801 /* If one device supports hw checksumming, set for all. */ 5802 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { 5803 all &= ~NETIF_F_ALL_CSUM; 5804 all |= NETIF_F_HW_CSUM; 5805 } 5806 } 5807 5808 one |= NETIF_F_ALL_CSUM; 5809 5810 one |= all & NETIF_F_ONE_FOR_ALL; 5811 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; 5812 all |= one & mask & NETIF_F_ONE_FOR_ALL; 5813 5814 return all; 5815 } 5816 EXPORT_SYMBOL(netdev_increment_features); 5817 5818 static struct hlist_head *netdev_create_hash(void) 5819 { 5820 int i; 5821 struct hlist_head *hash; 5822 5823 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 5824 if (hash != NULL) 5825 for (i = 0; i < NETDEV_HASHENTRIES; i++) 5826 INIT_HLIST_HEAD(&hash[i]); 5827 5828 return hash; 5829 } 5830 5831 /* Initialize per network namespace state */ 5832 static int __net_init netdev_init(struct net *net) 5833 { 5834 INIT_LIST_HEAD(&net->dev_base_head); 5835 5836 net->dev_name_head = netdev_create_hash(); 5837 if (net->dev_name_head == NULL) 5838 goto err_name; 5839 5840 net->dev_index_head = netdev_create_hash(); 5841 if (net->dev_index_head == NULL) 5842 goto err_idx; 5843 5844 return 0; 5845 5846 err_idx: 5847 kfree(net->dev_name_head); 5848 err_name: 5849 return -ENOMEM; 5850 } 5851 5852 /** 5853 * netdev_drivername - network driver for the device 5854 * @dev: network device 5855 * @buffer: buffer for resulting name 5856 * @len: size of buffer 5857 * 5858 * Determine network driver for device. 5859 */ 5860 char *netdev_drivername(const struct net_device *dev, char *buffer, int len) 5861 { 5862 const struct device_driver *driver; 5863 const struct device *parent; 5864 5865 if (len <= 0 || !buffer) 5866 return buffer; 5867 buffer[0] = 0; 5868 5869 parent = dev->dev.parent; 5870 5871 if (!parent) 5872 return buffer; 5873 5874 driver = parent->driver; 5875 if (driver && driver->name) 5876 strlcpy(buffer, driver->name, len); 5877 return buffer; 5878 } 5879 5880 static int __netdev_printk(const char *level, const struct net_device *dev, 5881 struct va_format *vaf) 5882 { 5883 int r; 5884 5885 if (dev && dev->dev.parent) 5886 r = dev_printk(level, dev->dev.parent, "%s: %pV", 5887 netdev_name(dev), vaf); 5888 else if (dev) 5889 r = printk("%s%s: %pV", level, netdev_name(dev), vaf); 5890 else 5891 r = printk("%s(NULL net_device): %pV", level, vaf); 5892 5893 return r; 5894 } 5895 5896 int netdev_printk(const char *level, const struct net_device *dev, 5897 const char *format, ...) 5898 { 5899 struct va_format vaf; 5900 va_list args; 5901 int r; 5902 5903 va_start(args, format); 5904 5905 vaf.fmt = format; 5906 vaf.va = &args; 5907 5908 r = __netdev_printk(level, dev, &vaf); 5909 va_end(args); 5910 5911 return r; 5912 } 5913 EXPORT_SYMBOL(netdev_printk); 5914 5915 #define define_netdev_printk_level(func, level) \ 5916 int func(const struct net_device *dev, const char *fmt, ...) 
\
5917 { \
5918     int r; \
5919     struct va_format vaf; \
5920     va_list args; \
5921     \
5922     va_start(args, fmt); \
5923     \
5924     vaf.fmt = fmt; \
5925     vaf.va = &args; \
5926     \
5927     r = __netdev_printk(level, dev, &vaf); \
5928     va_end(args); \
5929     \
5930     return r; \
5931 } \
5932 EXPORT_SYMBOL(func);
5933 
5934 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5935 define_netdev_printk_level(netdev_alert, KERN_ALERT);
5936 define_netdev_printk_level(netdev_crit, KERN_CRIT);
5937 define_netdev_printk_level(netdev_err, KERN_ERR);
5938 define_netdev_printk_level(netdev_warn, KERN_WARNING);
5939 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
5940 define_netdev_printk_level(netdev_info, KERN_INFO);
5941 
5942 static void __net_exit netdev_exit(struct net *net)
5943 {
5944     kfree(net->dev_name_head);
5945     kfree(net->dev_index_head);
5946 }
5947 
5948 static struct pernet_operations __net_initdata netdev_net_ops = {
5949     .init = netdev_init,
5950     .exit = netdev_exit,
5951 };
5952 
5953 static void __net_exit default_device_exit(struct net *net)
5954 {
5955     struct net_device *dev, *aux;
5956     /*
5957      * Push all migratable network devices back to the
5958      * initial network namespace
5959      */
5960     rtnl_lock();
5961     for_each_netdev_safe(net, dev, aux) {
5962         int err;
5963         char fb_name[IFNAMSIZ];
5964 
5965         /* Ignore unmoveable devices (i.e. loopback) */
5966         if (dev->features & NETIF_F_NETNS_LOCAL)
5967             continue;
5968 
5969         /* Leave virtual devices for the generic cleanup */
5970         if (dev->rtnl_link_ops)
5971             continue;
5972 
5973         /* Push remaining network devices to init_net */
5974         snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5975         err = dev_change_net_namespace(dev, &init_net, fb_name);
5976         if (err) {
5977             printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5978                    __func__, dev->name, err);
5979             BUG();
5980         }
5981     }
5982     rtnl_unlock();
5983 }
5984 
5985 static void __net_exit default_device_exit_batch(struct list_head *net_list)
5986 {
5987     /* At exit all network devices must be removed from a network
5988      * namespace. Do this in the reverse order of registration.
5989      * Do this across as many network namespaces as possible to
5990      * improve batching efficiency.
5991      */
5992     struct net_device *dev;
5993     struct net *net;
5994     LIST_HEAD(dev_kill_list);
5995 
5996     rtnl_lock();
5997     list_for_each_entry(net, net_list, exit_list) {
5998         for_each_netdev_reverse(net, dev) {
5999             if (dev->rtnl_link_ops)
6000                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6001             else
6002                 unregister_netdevice_queue(dev, &dev_kill_list);
6003         }
6004     }
6005     unregister_netdevice_many(&dev_kill_list);
6006     rtnl_unlock();
6007 }
6008 
6009 static struct pernet_operations __net_initdata default_device_ops = {
6010     .exit = default_device_exit,
6011     .exit_batch = default_device_exit_batch,
6012 };
6013 
6014 /*
6015  * Initialize the DEV module. At boot time this walks the device list and
6016  * unhooks any devices that fail to initialise (normally hardware not
6017  * present) and leaves us with a valid list of present and active devices.
6018  *
6019  */
6020 
6021 /*
6022  * This is called single-threaded during boot, so no need
6023  * to take the rtnl semaphore.
6024  */
6025 static int __init net_dev_init(void)
6026 {
6027     int i, rc = -ENOMEM;
6028 
6029     BUG_ON(!dev_boot_phase);
6030 
6031     if (dev_proc_init())
6032         goto out;
6033 
6034     if (netdev_kobject_init())
6035         goto out;
6036 
6037     INIT_LIST_HEAD(&ptype_all);
6038     for (i = 0; i < PTYPE_HASH_SIZE; i++)
6039         INIT_LIST_HEAD(&ptype_base[i]);
6040 
6041     if (register_pernet_subsys(&netdev_net_ops))
6042         goto out;
6043 
6044     /*
6045      * Initialise the packet receive queues.
6046      */
6047 
6048     for_each_possible_cpu(i) {
6049         struct softnet_data *sd = &per_cpu(softnet_data, i);
6050 
6051         memset(sd, 0, sizeof(*sd));
6052         skb_queue_head_init(&sd->input_pkt_queue);
6053         skb_queue_head_init(&sd->process_queue);
6054         sd->completion_queue = NULL;
6055         INIT_LIST_HEAD(&sd->poll_list);
6056         sd->output_queue = NULL;
6057         sd->output_queue_tailp = &sd->output_queue;
6058 #ifdef CONFIG_RPS
6059         sd->csd.func = rps_trigger_softirq;
6060         sd->csd.info = sd;
6061         sd->csd.flags = 0;
6062         sd->cpu = i;
6063 #endif
6064 
6065         sd->backlog.poll = process_backlog;
6066         sd->backlog.weight = weight_p;
6067         sd->backlog.gro_list = NULL;
6068         sd->backlog.gro_count = 0;
6069     }
6070 
6071     dev_boot_phase = 0;
6072 
6073     /* The loopback device is special: if any other network device
6074      * is present in a network namespace, the loopback device must
6075      * be present too. Since we now dynamically allocate and free the
6076      * loopback device, ensure this invariant is maintained by
6077      * keeping the loopback device as the first device on the
6078      * list of network devices, so that it is the first device
6079      * that appears and the last network device
6080      * that disappears.
6081      */
6082     if (register_pernet_device(&loopback_net_ops))
6083         goto out;
6084 
6085     if (register_pernet_device(&default_device_ops))
6086         goto out;
6087 
6088     open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6089     open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6090 
6091     hotcpu_notifier(dev_cpu_callback, 0);
6092     dst_init();
6093     dev_mcast_init();
6094     rc = 0;
6095 out:
6096     return rc;
6097 }
6098 
6099 subsys_initcall(net_dev_init);
6100 
6101 static int __init initialize_hashrnd(void)
6102 {
6103     get_random_bytes(&hashrnd, sizeof(hashrnd));
6104     return 0;
6105 }
6106 
6107 late_initcall_sync(initialize_hashrnd);
6108 
6109 
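/*
 * Example: the usual driver-side lifecycle built on the helpers above.
 * This is an illustrative sketch only, not part of this file; "my_setup",
 * "struct my_priv" and the single-queue count are assumptions. Note that
 * register_netdev() takes the rtnl semaphore itself and expands the "%d"
 * in the requested name.
 *
 *        struct net_device *dev;
 *        int err;
 *
 *        dev = alloc_netdev_mq(sizeof(struct my_priv), "example%d",
 *                              my_setup, 1);
 *        if (!dev)
 *                return -ENOMEM;
 *
 *        err = register_netdev(dev);
 *        if (err) {
 *                free_netdev(dev);
 *                return err;
 *        }
 *
 *        ...
 *
 *        unregister_netdev(dev);
 *        free_netdev(dev);
 */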