/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <[email protected]>
 *				Mark Evans, <[email protected]>
 *
 *	Additional Authors:
 *		Florian la Roche <[email protected]>
 *		Alan Cox <[email protected]>
 *		David Hinds <[email protected]>
 *		Alexey Kuznetsov <[email protected]>
 *		Adam Sulmicki <[email protected]>
 *		Pekka Riikonen <[email protected]>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *	Pekka Riikonen		:	Netdev boot-time settings code
 *	Andrew Morton		:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *	J Hadi Salim		:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *			--BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See register_netdevice() and unregister_netdevice() for example usages;
 * both must be called with the rtnl semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);

	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}
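
/*
 * Example (illustrative sketch): a pure reader walking the device list
 * under RCU, per the locking rules described above. The helper name
 * count_running_devices() is hypothetical.
 *
 *	static int count_running_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int count = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			if (dev->flags & IFF_UP)
 *				count++;
 *		rcu_read_unlock();
 *		return count;
 *	}
 */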

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
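
/*
 * Worked example (illustrative): an Ethernet device has dev->type ==
 * ARPHRD_ETHER, which sits at index 1 in netdev_lock_type[] above, so
 * its _xmit_lock and addr_list_lock get the "_xmit_ETHER" lockdep
 * class. A type not in the table falls through to the final
 * "_xmit_NONE" slot.
 */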

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles the packet
 *	were first on the list, it could not sense that the packet is
 *	cloned and should be copied-on-write, so it would change it and
 *	subsequent readers would get a broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep and therefore cannot guarantee that every
 *	CPU currently in the middle of receiving a packet will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
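
/*
 * Example (illustrative sketch): registering a handler for all packets
 * (ETH_P_ALL) and later removing it. The names my_tap_rcv and my_tap
 * are hypothetical.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);
 *	...
 *	dev_remove_pack(&my_tap);	(may sleep; see below)
 */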

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds a new setup entry to the dev_boot_setup list. The function
 *	returns 0 on error and 1 on success. This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings were found, 1 if they were.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings were found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
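
/*
 * Example (illustrative): booting with
 *
 *	netdev=5,0x300,eth0
 *
 * makes the __setup handler below store irq 5 and I/O base 0x300 for
 * "eth0" via netdev_boot_setup_add(); a probing driver later retrieves
 * them with netdev_boot_setup_check().
 */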

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
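
/*
 * Example (illustrative sketch): the usual lookup-and-release pattern.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */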

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
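
/*
 * Examples (illustrative): dev_valid_name("eth0") returns 1, while "",
 * ".", "..", "my/dev", "my dev" and any name of IFNAMSIZ or more
 * characters are all rejected.
 */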

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string, e.g. "lt%d", it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (i.e. 32K on most
 *	platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string, e.g. "lt%d", it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (i.e. 32K on most
 *	platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_get_valid_name(struct net_device *dev, const char *name)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
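
/*
 * Worked example (illustrative): with eth0 and eth2 already registered,
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *
 * builds the in-use bitmap {0, 2}, picks the first free slot, sets
 * dev->name to "eth1" and returns 1.
 */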

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device; format strings such as "eth%d" can
 *	be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set the ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
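
/*
 * Example (illustrative): for dev_load() below to auto-load a driver by
 * interface name with only CAP_NET_ADMIN, the module declares an alias
 * of the form (the name "foo0" is hypothetical):
 *
 *	MODULE_ALIAS("netdev-foo0");
 *
 * which request_module("netdev-%s", name) then resolves.
 */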

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;
	int no_module;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	no_module = !dev;
	if (no_module && capable(CAP_NET_ADMIN))
		no_module = request_module("netdev-%s", name);
	if (no_module && capable(CAP_SYS_MODULE)) {
		if (!request_module("%s", name))
			pr_err("Loading kernel module for a network device "
			       "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
			       "instead\n", name);
	}
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		net_dmaengine_get();
		dev_set_rx_mode(dev);
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
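
/*
 * Example (illustrative sketch): bringing an interface up from process
 * context. dev_open() must be called with the RTNL semaphore held.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */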

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, unreg_list) {
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch the poll
		 * list; it may even be on a different cpu. So just clear
		 * netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		net_dmaengine_put();
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);
	return retval;
}

static int dev_close_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(tmp_list);

	list_for_each_entry_safe(dev, tmp, head, unreg_list)
		if (!(dev->flags & IFF_UP))
			list_move(&dev->unreg_list, &tmp_list);

	__dev_close_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	/* rollback_registered_many needs the complete original list */
	list_splice(&tmp_list, head);
	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->unreg_list, &single);
		dev_close_many(&single);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	u32 flags;

	/*
	 * If we're trying to disable lro on a vlan device
	 * use the underlying physical device instead
	 */
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);

	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
		flags = dev->ethtool_ops->get_flags(dev);
	else
		flags = ethtool_op_get_flags(dev);

	if (!(flags & ETH_FLAG_LRO))
		return;

	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to allow the caller to have a race-free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
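
/*
 * Example (illustrative sketch): a minimal notifier block. The names
 * my_netdev_event and my_notifier are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_notifier = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_notifier);
 *	...
 *	unregister_netdevice_notifier(&my_notifier);
 */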

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
		__net_timestamp(skb);
}

static inline bool is_skb_forwardable(struct net_device *dev,
				      struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);
	nf_reset(skb);

	if (unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
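
/*
 * Example (illustrative sketch): a veth-like driver handing a frame from
 * its start_xmit straight to the peer device's receive path with
 * dev_forward_skb() above. The names my_xmit and peer are hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = ...;
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */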

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS ([email protected])
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			if (pt_prev) {
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			 * set by sender, so that the second statement is
			 * just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}

/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this, verify that the tc mappings remain valid and,
 * if not, zero the affected mapping: with no priorities mapping to an
 * offset/count pair, that pair is no longer used. In the worst case,
 * when TC0 itself is invalid, nothing can be done, so priority mappings
 * are disabled entirely. It is expected that drivers will fix this
 * mapping, if they can, before calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warning("Number of in use tx queues changed "
			   "invalidating tc mappings. Priority "
			   "traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warning("Number of in use tx queues "
				   "changed. Priority %i to tc "
				   "mapping %i is no longer valid "
				   "setting map to 0\n",
				   i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
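
/*
 * Worked example (illustrative): with num_tc = 2 and TC1 covering queues
 * 4-7 (offset 4, count 4), shrinking real_num_tx_queues to 4 makes
 * tc->offset + tc->count (8) exceed txq (4), so netif_setup_tc() above
 * resets every priority still mapped to TC1 back to TC0.
 */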

/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	int rc;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		if (txq < dev->real_num_tx_queues)
			qdisc_reset_all_tx_gt(dev, txq);
	}

	dev->real_num_tx_queues = txq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);


/**
 *	netif_device_detach - mark device as removed
 *	@dev: network device
 *
 *	Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);
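
/*
 * Example (illustrative sketch): a PCI driver pairing detach and attach
 * in its legacy suspend/resume hooks. my_suspend and my_resume are
 * hypothetical.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		... stop hardware ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		... restart hardware ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */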

/**
 *	netif_device_attach - mark device as attached
 *	@dev: network device
 *
 *	Mark the device as attached to the system and restart it if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

/**
 *	skb_dev_set -- assign a new device to a buffer
 *	@skb: buffer for the new device
 *	@dev: network device
 *
 *	If an skb is owned by a device already, we have to reset
 *	all data private to the namespace a device belongs to
 *	before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
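
/*
 * Example (illustrative): resolving a CHECKSUM_PARTIAL skb on a path
 * without hardware checksum support:
 *
 *	if (skb_checksum_help(skb))
 *		goto drop;
 *
 * folds the full checksum into the location described by csum_start and
 * csum_offset, then downgrades skb->ip_summed to CHECKSUM_NONE.
 */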

/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int vlan_depth = ETH_HLEN;
	int err;

	while (type == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vh;

		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
			return ERR_PTR(-EINVAL);

		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
		type = vh->h_vlan_encapsulated_proto;
		vlan_depth += VLAN_HLEN;
	}

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
		       dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
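
/*
 * Illustrative use: core helpers such as skb_checksum_complete() call
 *
 *	netdev_rx_csum_fault(skb->dev);
 *
 * when a checksum the hardware vouched for fails software verification,
 * so the offending driver can be identified from the stack dump.
 */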
1945  */
1946 
1947 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1948 {
1949 #ifdef CONFIG_HIGHMEM
1950 	int i;
1951 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1952 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1953 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1954 			if (PageHighMem(skb_frag_page(frag)))
1955 				return 1;
1956 		}
1957 	}
1958 
1959 	if (PCI_DMA_BUS_IS_PHYS) {
1960 		struct device *pdev = dev->dev.parent;
1961 
1962 		if (!pdev)
1963 			return 0;
1964 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1965 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1966 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
1967 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1968 				return 1;
1969 		}
1970 	}
1971 #endif
1972 	return 0;
1973 }
1974 
1975 struct dev_gso_cb {
1976 	void (*destructor)(struct sk_buff *skb);
1977 };
1978 
1979 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1980 
1981 static void dev_gso_skb_destructor(struct sk_buff *skb)
1982 {
1983 	struct dev_gso_cb *cb;
1984 
1985 	do {
1986 		struct sk_buff *nskb = skb->next;
1987 
1988 		skb->next = nskb->next;
1989 		nskb->next = NULL;
1990 		kfree_skb(nskb);
1991 	} while (skb->next);
1992 
1993 	cb = DEV_GSO_CB(skb);
1994 	if (cb->destructor)
1995 		cb->destructor(skb);
1996 }
1997 
1998 /**
1999  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2000  *	@skb: buffer to segment
2001  *	@features: device features as applicable to this skb
2002  *
2003  *	This function segments the given skb and stores the list of segments
2004  *	in skb->next.
2005  */
2006 static int dev_gso_segment(struct sk_buff *skb, int features)
2007 {
2008 	struct sk_buff *segs;
2009 
2010 	segs = skb_gso_segment(skb, features);
2011 
2012 	/* Verifying header integrity only. */
2013 	if (!segs)
2014 		return 0;
2015 
2016 	if (IS_ERR(segs))
2017 		return PTR_ERR(segs);
2018 
2019 	skb->next = segs;
2020 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2021 	skb->destructor = dev_gso_skb_destructor;
2022 
2023 	return 0;
2024 }
2025 
2026 /*
2027  * Try to orphan the skb early, right before transmission by the device.
2028  * We cannot orphan the skb if a tx timestamp is requested, or if the sk
2029  * reference is needed at driver level for other reasons, e.g. see net/can/raw.c
2030  */
2031 static inline void skb_orphan_try(struct sk_buff *skb)
2032 {
2033 	struct sock *sk = skb->sk;
2034 
2035 	if (sk && !skb_shinfo(skb)->tx_flags) {
2036 		/* skb_tx_hash() won't be able to get sk.
2037 		 * We copy sk_hash into skb->rxhash
2038 		 */
2039 		if (!skb->rxhash)
2040 			skb->rxhash = sk->sk_hash;
2041 		skb_orphan(skb);
2042 	}
2043 }
2044 
2045 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2046 {
2047 	return ((features & NETIF_F_GEN_CSUM) ||
2048 		((features & NETIF_F_V4_CSUM) &&
2049 		 protocol == htons(ETH_P_IP)) ||
2050 		((features & NETIF_F_V6_CSUM) &&
2051 		 protocol == htons(ETH_P_IPV6)) ||
2052 		((features & NETIF_F_FCOE_CRC) &&
2053 		 protocol == htons(ETH_P_FCOE)));
2054 }
2055 
2056 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2057 {
2058 	if (!can_checksum_protocol(features, protocol)) {
2059 		features &= ~NETIF_F_ALL_CSUM;
2060 		features &= ~NETIF_F_SG;
2061 	} else if (illegal_highdma(skb->dev, skb)) {
2062 		features &= ~NETIF_F_SG;
2063 	}
2064 
2065 	return features;
2066 }
2067 
2068 u32 netif_skb_features(struct sk_buff *skb)
2069 {
2070 	__be16 protocol = skb->protocol;
2071 	u32 features = skb->dev->features;
2072 
2073 	if (protocol == htons(ETH_P_8021Q)) {
2074 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2075 		protocol = veh->h_vlan_encapsulated_proto;
2076 	} else if (!vlan_tx_tag_present(skb)) {
2077 		return harmonize_features(skb, protocol, features);
2078 	}
2079 
2080 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2081 
2082 	if (protocol != htons(ETH_P_8021Q)) {
2083 		return harmonize_features(skb, protocol, features);
2084 	} else {
2085 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2086 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2087 		return harmonize_features(skb, protocol, features);
2088 	}
2089 }
2090 EXPORT_SYMBOL(netif_skb_features);
2091 
2092 /*
2093  * Returns true if either:
2094  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2095  *	2. skb is fragmented and the device does not support SG, or if
2096  *	   at least one of the fragments is in highmem and the device
2097  *	   does not support DMA from it.
2098  */
2099 static inline int skb_needs_linearize(struct sk_buff *skb,
2100 				      int features)
2101 {
2102 	return skb_is_nonlinear(skb) &&
2103 			((skb_has_frag_list(skb) &&
2104 				!(features & NETIF_F_FRAGLIST)) ||
2105 			(skb_shinfo(skb)->nr_frags &&
2106 				!(features & NETIF_F_SG)));
2107 }
2108 
2109 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2110 			struct netdev_queue *txq)
2111 {
2112 	const struct net_device_ops *ops = dev->netdev_ops;
2113 	int rc = NETDEV_TX_OK;
2114 	unsigned int skb_len;
2115 
2116 	if (likely(!skb->next)) {
2117 		u32 features;
2118 
2119 		/*
2120 		 * If device doesn't need skb->dst, release it right now while
2121 		 * it's hot in this cpu cache.
2122 		 */
2123 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2124 			skb_dst_drop(skb);
2125 
2126 		if (!list_empty(&ptype_all))
2127 			dev_queue_xmit_nit(skb, dev);
2128 
2129 		skb_orphan_try(skb);
2130 
2131 		features = netif_skb_features(skb);
2132 
2133 		if (vlan_tx_tag_present(skb) &&
2134 		    !(features & NETIF_F_HW_VLAN_TX)) {
2135 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2136 			if (unlikely(!skb))
2137 				goto out;
2138 
2139 			skb->vlan_tci = 0;
2140 		}
2141 
2142 		if (netif_needs_gso(skb, features)) {
2143 			if (unlikely(dev_gso_segment(skb, features)))
2144 				goto out_kfree_skb;
2145 			if (skb->next)
2146 				goto gso;
2147 		} else {
2148 			if (skb_needs_linearize(skb, features) &&
2149 			    __skb_linearize(skb))
2150 				goto out_kfree_skb;
2151 
2152 			/* If the packet is not checksummed and the device does
2153 			 * not support checksumming for this protocol, complete
2154 			 * checksumming here.
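			 *
			 * (Illustrative sketch only: this is, in essence, what
			 * the skb_checksum_help() call below ends up doing,
			 * assuming offset = skb_checksum_start_offset(skb).)
			 *
			 *	csum = skb_checksum(skb, offset, skb->len - offset, 0);
			 *	*(__sum16 *)(skb->data + offset + skb->csum_offset) =
			 *		csum_fold(csum);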
2155 			 */
2156 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2157 				skb_set_transport_header(skb,
2158 					skb_checksum_start_offset(skb));
2159 				if (!(features & NETIF_F_ALL_CSUM) &&
2160 				     skb_checksum_help(skb))
2161 					goto out_kfree_skb;
2162 			}
2163 		}
2164 
2165 		skb_len = skb->len;
2166 		rc = ops->ndo_start_xmit(skb, dev);
2167 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2168 		if (rc == NETDEV_TX_OK)
2169 			txq_trans_update(txq);
2170 		return rc;
2171 	}
2172 
2173 gso:
2174 	do {
2175 		struct sk_buff *nskb = skb->next;
2176 
2177 		skb->next = nskb->next;
2178 		nskb->next = NULL;
2179 
2180 		/*
2181 		 * If device doesn't need nskb->dst, release it right now while
2182 		 * it's hot in this cpu cache.
2183 		 */
2184 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2185 			skb_dst_drop(nskb);
2186 
2187 		skb_len = nskb->len;
2188 		rc = ops->ndo_start_xmit(nskb, dev);
2189 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2190 		if (unlikely(rc != NETDEV_TX_OK)) {
2191 			if (rc & ~NETDEV_TX_MASK)
2192 				goto out_kfree_gso_skb;
2193 			nskb->next = skb->next;
2194 			skb->next = nskb;
2195 			return rc;
2196 		}
2197 		txq_trans_update(txq);
2198 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2199 			return NETDEV_TX_BUSY;
2200 	} while (skb->next);
2201 
2202 out_kfree_gso_skb:
2203 	if (likely(skb->next == NULL))
2204 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2205 out_kfree_skb:
2206 	kfree_skb(skb);
2207 out:
2208 	return rc;
2209 }
2210 
2211 static u32 hashrnd __read_mostly;
2212 
2213 /*
2214  * Returns a Tx hash based on the given packet descriptor and a Tx queue
2215  * count to be used as a distribution range.
2216  */
2217 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2218 		  unsigned int num_tx_queues)
2219 {
2220 	u32 hash;
2221 	u16 qoffset = 0;
2222 	u16 qcount = num_tx_queues;
2223 
2224 	if (skb_rx_queue_recorded(skb)) {
2225 		hash = skb_get_rx_queue(skb);
2226 		while (unlikely(hash >= num_tx_queues))
2227 			hash -= num_tx_queues;
2228 		return hash;
2229 	}
2230 
2231 	if (dev->num_tc) {
2232 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2233 		qoffset = dev->tc_to_txq[tc].offset;
2234 		qcount = dev->tc_to_txq[tc].count;
2235 	}
2236 
2237 	if (skb->sk && skb->sk->sk_hash)
2238 		hash = skb->sk->sk_hash;
2239 	else
2240 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2241 	hash = jhash_1word(hash, hashrnd);
2242 
2243 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2244 }
2245 EXPORT_SYMBOL(__skb_tx_hash);
2246 
2247 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2248 {
2249 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2250 		if (net_ratelimit()) {
2251 			pr_warning("%s selects TX queue %d, but "
2252 				   "real number of TX queues is %d\n",
2253 				   dev->name, queue_index, dev->real_num_tx_queues);
2254 		}
2255 		return 0;
2256 	}
2257 	return queue_index;
2258 }
2259 
2260 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2261 {
2262 #ifdef CONFIG_XPS
2263 	struct xps_dev_maps *dev_maps;
2264 	struct xps_map *map;
2265 	int queue_index = -1;
2266 
2267 	rcu_read_lock();
2268 	dev_maps = rcu_dereference(dev->xps_maps);
2269 	if (dev_maps) {
2270 		map = rcu_dereference(
2271 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2272 		if (map) {
2273 			if (map->len == 1)
2274 				queue_index = map->queues[0];
2275 			else {
2276 				u32 hash;
2277 				if (skb->sk && skb->sk->sk_hash)
2278 					hash = skb->sk->sk_hash;
2279 				else
2280 					hash = (__force u16) skb->protocol ^
2281 					    skb->rxhash;
2282 				hash = jhash_1word(hash, hashrnd);
2283 				queue_index = map->queues[
2284 				    ((u64)hash * map->len) >> 32];
2285 			}
2286 			if (unlikely(queue_index
>= dev->real_num_tx_queues)) 2287 queue_index = -1; 2288 } 2289 } 2290 rcu_read_unlock(); 2291 2292 return queue_index; 2293 #else 2294 return -1; 2295 #endif 2296 } 2297 2298 static struct netdev_queue *dev_pick_tx(struct net_device *dev, 2299 struct sk_buff *skb) 2300 { 2301 int queue_index; 2302 const struct net_device_ops *ops = dev->netdev_ops; 2303 2304 if (dev->real_num_tx_queues == 1) 2305 queue_index = 0; 2306 else if (ops->ndo_select_queue) { 2307 queue_index = ops->ndo_select_queue(dev, skb); 2308 queue_index = dev_cap_txqueue(dev, queue_index); 2309 } else { 2310 struct sock *sk = skb->sk; 2311 queue_index = sk_tx_queue_get(sk); 2312 2313 if (queue_index < 0 || skb->ooo_okay || 2314 queue_index >= dev->real_num_tx_queues) { 2315 int old_index = queue_index; 2316 2317 queue_index = get_xps_queue(dev, skb); 2318 if (queue_index < 0) 2319 queue_index = skb_tx_hash(dev, skb); 2320 2321 if (queue_index != old_index && sk) { 2322 struct dst_entry *dst = 2323 rcu_dereference_check(sk->sk_dst_cache, 1); 2324 2325 if (dst && skb_dst(skb) == dst) 2326 sk_tx_queue_set(sk, queue_index); 2327 } 2328 } 2329 } 2330 2331 skb_set_queue_mapping(skb, queue_index); 2332 return netdev_get_tx_queue(dev, queue_index); 2333 } 2334 2335 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2336 struct net_device *dev, 2337 struct netdev_queue *txq) 2338 { 2339 spinlock_t *root_lock = qdisc_lock(q); 2340 bool contended; 2341 int rc; 2342 2343 qdisc_skb_cb(skb)->pkt_len = skb->len; 2344 qdisc_calculate_pkt_len(skb, q); 2345 /* 2346 * Heuristic to force contended enqueues to serialize on a 2347 * separate lock before trying to get qdisc main lock. 2348 * This permits __QDISC_STATE_RUNNING owner to get the lock more often 2349 * and dequeue packets faster. 2350 */ 2351 contended = qdisc_is_running(q); 2352 if (unlikely(contended)) 2353 spin_lock(&q->busylock); 2354 2355 spin_lock(root_lock); 2356 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2357 kfree_skb(skb); 2358 rc = NET_XMIT_DROP; 2359 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2360 qdisc_run_begin(q)) { 2361 /* 2362 * This is a work-conserving queue; there are no old skbs 2363 * waiting to be sent out; and the qdisc is not running - 2364 * xmit the skb directly. 2365 */ 2366 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) 2367 skb_dst_force(skb); 2368 2369 qdisc_bstats_update(q, skb); 2370 2371 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { 2372 if (unlikely(contended)) { 2373 spin_unlock(&q->busylock); 2374 contended = false; 2375 } 2376 __qdisc_run(q); 2377 } else 2378 qdisc_run_end(q); 2379 2380 rc = NET_XMIT_SUCCESS; 2381 } else { 2382 skb_dst_force(skb); 2383 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 2384 if (qdisc_run_begin(q)) { 2385 if (unlikely(contended)) { 2386 spin_unlock(&q->busylock); 2387 contended = false; 2388 } 2389 __qdisc_run(q); 2390 } 2391 } 2392 spin_unlock(root_lock); 2393 if (unlikely(contended)) 2394 spin_unlock(&q->busylock); 2395 return rc; 2396 } 2397 2398 static DEFINE_PER_CPU(int, xmit_recursion); 2399 #define RECURSION_LIMIT 10 2400 2401 /** 2402 * dev_queue_xmit - transmit a buffer 2403 * @skb: buffer to transmit 2404 * 2405 * Queue a buffer for transmission to a network device. The caller must 2406 * have set the device and priority and built the buffer before calling 2407 * this function. The function can be called from an interrupt. 2408 * 2409 * A negative errno code is returned on a failure. 
A success does not
2410  *	guarantee the frame will be transmitted, as it may be dropped due
2411  *	to congestion or traffic shaping.
2412  *
2413  * -----------------------------------------------------------------------------------
2414  *	I notice this method can also return errors from the queue disciplines,
2415  *	including NET_XMIT_DROP, which is a positive value.  So, errors can also
2416  *	be positive.
2417  *
2418  *	Regardless of the return value, the skb is consumed, so it is currently
2419  *	difficult to retry a send to this method.  (You can bump the ref count
2420  *	before sending to hold a reference for retry if you are careful.)
2421  *
2422  *	When calling this method, interrupts MUST be enabled.  This is because
2423  *	the BH enable code must have IRQs enabled so that it will not deadlock.
2424  *			--BLG
2425  */
2426 int dev_queue_xmit(struct sk_buff *skb)
2427 {
2428 	struct net_device *dev = skb->dev;
2429 	struct netdev_queue *txq;
2430 	struct Qdisc *q;
2431 	int rc = -ENOMEM;
2432 
2433 	/* Disable soft irqs for various locks below. Also
2434 	 * stops preemption for RCU.
2435 	 */
2436 	rcu_read_lock_bh();
2437 
2438 	txq = dev_pick_tx(dev, skb);
2439 	q = rcu_dereference_bh(txq->qdisc);
2440 
2441 #ifdef CONFIG_NET_CLS_ACT
2442 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2443 #endif
2444 	trace_net_dev_queue(skb);
2445 	if (q->enqueue) {
2446 		rc = __dev_xmit_skb(skb, q, dev, txq);
2447 		goto out;
2448 	}
2449 
2450 	/* The device has no queue. Common case for software devices:
2451 	   loopback, all sorts of tunnels...
2452 
2453 	   Really, it is unlikely that netif_tx_lock protection is necessary
2454 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2455 	   counters.)
2456 	   However, it is possible that they rely on the protection
2457 	   we provide here.
2458 
2459 	   So we take the lock here anyway; it is not prone to deadlocks.
2460 	   The same goes for the noqueue qdisc, which is even simpler 8)
2461 	 */
2462 	if (dev->flags & IFF_UP) {
2463 		int cpu = smp_processor_id(); /* ok because BHs are off */
2464 
2465 		if (txq->xmit_lock_owner != cpu) {
2466 
2467 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2468 				goto recursion_alert;
2469 
2470 			HARD_TX_LOCK(dev, txq, cpu);
2471 
2472 			if (!netif_tx_queue_stopped(txq)) {
2473 				__this_cpu_inc(xmit_recursion);
2474 				rc = dev_hard_start_xmit(skb, dev, txq);
2475 				__this_cpu_dec(xmit_recursion);
2476 				if (dev_xmit_complete(rc)) {
2477 					HARD_TX_UNLOCK(dev, txq);
2478 					goto out;
2479 				}
2480 			}
2481 			HARD_TX_UNLOCK(dev, txq);
2482 			if (net_ratelimit())
2483 				printk(KERN_CRIT "Virtual device %s asks to "
2484 				       "queue packet!\n", dev->name);
2485 		} else {
2486 			/* Recursion is detected!
It is possible, 2487 * unfortunately 2488 */ 2489 recursion_alert: 2490 if (net_ratelimit()) 2491 printk(KERN_CRIT "Dead loop on virtual device " 2492 "%s, fix it urgently!\n", dev->name); 2493 } 2494 } 2495 2496 rc = -ENETDOWN; 2497 rcu_read_unlock_bh(); 2498 2499 kfree_skb(skb); 2500 return rc; 2501 out: 2502 rcu_read_unlock_bh(); 2503 return rc; 2504 } 2505 EXPORT_SYMBOL(dev_queue_xmit); 2506 2507 2508 /*======================================================================= 2509 Receiver routines 2510 =======================================================================*/ 2511 2512 int netdev_max_backlog __read_mostly = 1000; 2513 int netdev_tstamp_prequeue __read_mostly = 1; 2514 int netdev_budget __read_mostly = 300; 2515 int weight_p __read_mostly = 64; /* old backlog weight */ 2516 2517 /* Called with irq disabled */ 2518 static inline void ____napi_schedule(struct softnet_data *sd, 2519 struct napi_struct *napi) 2520 { 2521 list_add_tail(&napi->poll_list, &sd->poll_list); 2522 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2523 } 2524 2525 /* 2526 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses 2527 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value 2528 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb 2529 * if hash is a canonical 4-tuple hash over transport ports. 2530 */ 2531 void __skb_get_rxhash(struct sk_buff *skb) 2532 { 2533 int nhoff, hash = 0, poff; 2534 const struct ipv6hdr *ip6; 2535 const struct iphdr *ip; 2536 const struct vlan_hdr *vlan; 2537 u8 ip_proto; 2538 u32 addr1, addr2; 2539 u16 proto; 2540 union { 2541 u32 v32; 2542 u16 v16[2]; 2543 } ports; 2544 2545 nhoff = skb_network_offset(skb); 2546 proto = skb->protocol; 2547 2548 again: 2549 switch (proto) { 2550 case __constant_htons(ETH_P_IP): 2551 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) 2552 goto done; 2553 2554 ip = (const struct iphdr *) (skb->data + nhoff); 2555 if (ip_is_fragment(ip)) 2556 ip_proto = 0; 2557 else 2558 ip_proto = ip->protocol; 2559 addr1 = (__force u32) ip->saddr; 2560 addr2 = (__force u32) ip->daddr; 2561 nhoff += ip->ihl * 4; 2562 break; 2563 case __constant_htons(ETH_P_IPV6): 2564 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) 2565 goto done; 2566 2567 ip6 = (const struct ipv6hdr *) (skb->data + nhoff); 2568 ip_proto = ip6->nexthdr; 2569 addr1 = (__force u32) ip6->saddr.s6_addr32[3]; 2570 addr2 = (__force u32) ip6->daddr.s6_addr32[3]; 2571 nhoff += 40; 2572 break; 2573 case __constant_htons(ETH_P_8021Q): 2574 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff)) 2575 goto done; 2576 vlan = (const struct vlan_hdr *) (skb->data + nhoff); 2577 proto = vlan->h_vlan_encapsulated_proto; 2578 nhoff += sizeof(*vlan); 2579 goto again; 2580 case __constant_htons(ETH_P_PPP_SES): 2581 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff)) 2582 goto done; 2583 proto = *((__be16 *) (skb->data + nhoff + 2584 sizeof(struct pppoe_hdr))); 2585 nhoff += PPPOE_SES_HLEN; 2586 goto again; 2587 default: 2588 goto done; 2589 } 2590 2591 switch (ip_proto) { 2592 case IPPROTO_GRE: 2593 if (pskb_may_pull(skb, nhoff + 16)) { 2594 u8 *h = skb->data + nhoff; 2595 __be16 flags = *(__be16 *)h; 2596 2597 /* 2598 * Only look inside GRE if version zero and no 2599 * routing 2600 */ 2601 if (!(flags & (GRE_VERSION|GRE_ROUTING))) { 2602 proto = *(__be16 *)(h + 2); 2603 nhoff += 4; 2604 if (flags & GRE_CSUM) 2605 nhoff += 4; 2606 if (flags & GRE_KEY) 2607 nhoff += 4; 2608 if (flags & GRE_SEQ) 2609 nhoff += 4; 2610 goto again; 2611 } 2612 } 2613 break; 2614 case IPPROTO_IPIP: 
2615 goto again; 2616 default: 2617 break; 2618 } 2619 2620 ports.v32 = 0; 2621 poff = proto_ports_offset(ip_proto); 2622 if (poff >= 0) { 2623 nhoff += poff; 2624 if (pskb_may_pull(skb, nhoff + 4)) { 2625 ports.v32 = * (__force u32 *) (skb->data + nhoff); 2626 if (ports.v16[1] < ports.v16[0]) 2627 swap(ports.v16[0], ports.v16[1]); 2628 skb->l4_rxhash = 1; 2629 } 2630 } 2631 2632 /* get a consistent hash (same value on both flow directions) */ 2633 if (addr2 < addr1) 2634 swap(addr1, addr2); 2635 2636 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); 2637 if (!hash) 2638 hash = 1; 2639 2640 done: 2641 skb->rxhash = hash; 2642 } 2643 EXPORT_SYMBOL(__skb_get_rxhash); 2644 2645 #ifdef CONFIG_RPS 2646 2647 /* One global table that all flow-based protocols share. */ 2648 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 2649 EXPORT_SYMBOL(rps_sock_flow_table); 2650 2651 static struct rps_dev_flow * 2652 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2653 struct rps_dev_flow *rflow, u16 next_cpu) 2654 { 2655 u16 tcpu; 2656 2657 tcpu = rflow->cpu = next_cpu; 2658 if (tcpu != RPS_NO_CPU) { 2659 #ifdef CONFIG_RFS_ACCEL 2660 struct netdev_rx_queue *rxqueue; 2661 struct rps_dev_flow_table *flow_table; 2662 struct rps_dev_flow *old_rflow; 2663 u32 flow_id; 2664 u16 rxq_index; 2665 int rc; 2666 2667 /* Should we steer this flow to a different hardware queue? */ 2668 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 2669 !(dev->features & NETIF_F_NTUPLE)) 2670 goto out; 2671 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 2672 if (rxq_index == skb_get_rx_queue(skb)) 2673 goto out; 2674 2675 rxqueue = dev->_rx + rxq_index; 2676 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2677 if (!flow_table) 2678 goto out; 2679 flow_id = skb->rxhash & flow_table->mask; 2680 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 2681 rxq_index, flow_id); 2682 if (rc < 0) 2683 goto out; 2684 old_rflow = rflow; 2685 rflow = &flow_table->flows[flow_id]; 2686 rflow->cpu = next_cpu; 2687 rflow->filter = rc; 2688 if (old_rflow->filter == rflow->filter) 2689 old_rflow->filter = RPS_NO_FILTER; 2690 out: 2691 #endif 2692 rflow->last_qtail = 2693 per_cpu(softnet_data, tcpu).input_queue_head; 2694 } 2695 2696 return rflow; 2697 } 2698 2699 /* 2700 * get_rps_cpu is called from netif_receive_skb and returns the target 2701 * CPU from the RPS map of the receiving queue for a given skb. 2702 * rcu_read_lock must be held on entry. 
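 *
 * As an illustration of the map lookup performed below: a flow with
 * rxhash h over a map of len n is steered, without a modulo, by scaling
 * the 32-bit hash into the CPU array:
 *
 *	tcpu = map->cpus[((u64) h * n) >> 32];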
2703 */ 2704 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 2705 struct rps_dev_flow **rflowp) 2706 { 2707 struct netdev_rx_queue *rxqueue; 2708 struct rps_map *map; 2709 struct rps_dev_flow_table *flow_table; 2710 struct rps_sock_flow_table *sock_flow_table; 2711 int cpu = -1; 2712 u16 tcpu; 2713 2714 if (skb_rx_queue_recorded(skb)) { 2715 u16 index = skb_get_rx_queue(skb); 2716 if (unlikely(index >= dev->real_num_rx_queues)) { 2717 WARN_ONCE(dev->real_num_rx_queues > 1, 2718 "%s received packet on queue %u, but number " 2719 "of RX queues is %u\n", 2720 dev->name, index, dev->real_num_rx_queues); 2721 goto done; 2722 } 2723 rxqueue = dev->_rx + index; 2724 } else 2725 rxqueue = dev->_rx; 2726 2727 map = rcu_dereference(rxqueue->rps_map); 2728 if (map) { 2729 if (map->len == 1 && 2730 !rcu_access_pointer(rxqueue->rps_flow_table)) { 2731 tcpu = map->cpus[0]; 2732 if (cpu_online(tcpu)) 2733 cpu = tcpu; 2734 goto done; 2735 } 2736 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { 2737 goto done; 2738 } 2739 2740 skb_reset_network_header(skb); 2741 if (!skb_get_rxhash(skb)) 2742 goto done; 2743 2744 flow_table = rcu_dereference(rxqueue->rps_flow_table); 2745 sock_flow_table = rcu_dereference(rps_sock_flow_table); 2746 if (flow_table && sock_flow_table) { 2747 u16 next_cpu; 2748 struct rps_dev_flow *rflow; 2749 2750 rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; 2751 tcpu = rflow->cpu; 2752 2753 next_cpu = sock_flow_table->ents[skb->rxhash & 2754 sock_flow_table->mask]; 2755 2756 /* 2757 * If the desired CPU (where last recvmsg was done) is 2758 * different from current CPU (one in the rx-queue flow 2759 * table entry), switch if one of the following holds: 2760 * - Current CPU is unset (equal to RPS_NO_CPU). 2761 * - Current CPU is offline. 2762 * - The current CPU's queue tail has advanced beyond the 2763 * last packet that was enqueued using this table entry. 2764 * This guarantees that all previous packets for the flow 2765 * have been dequeued, thus preserving in order delivery. 2766 */ 2767 if (unlikely(tcpu != next_cpu) && 2768 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 2769 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 2770 rflow->last_qtail)) >= 0)) 2771 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 2772 2773 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 2774 *rflowp = rflow; 2775 cpu = tcpu; 2776 goto done; 2777 } 2778 } 2779 2780 if (map) { 2781 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; 2782 2783 if (cpu_online(tcpu)) { 2784 cpu = tcpu; 2785 goto done; 2786 } 2787 } 2788 2789 done: 2790 return cpu; 2791 } 2792 2793 #ifdef CONFIG_RFS_ACCEL 2794 2795 /** 2796 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 2797 * @dev: Device on which the filter was set 2798 * @rxq_index: RX queue index 2799 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 2800 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 2801 * 2802 * Drivers that implement ndo_rx_flow_steer() should periodically call 2803 * this function for each installed filter and remove the filters for 2804 * which it returns %true. 
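 *
 * A hedged sketch of such a periodic scan (my_filters, my_nfilters and
 * my_remove_filter are hypothetical driver-private names):
 *
 *	for (i = 0; i < my_nfilters; i++)
 *		if (rps_may_expire_flow(dev, my_filters[i].rxq_index,
 *					my_filters[i].flow_id, i))
 *			my_remove_filter(dev, i);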
2805  */
2806 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2807 			 u32 flow_id, u16 filter_id)
2808 {
2809 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2810 	struct rps_dev_flow_table *flow_table;
2811 	struct rps_dev_flow *rflow;
2812 	bool expire = true;
2813 	int cpu;
2814 
2815 	rcu_read_lock();
2816 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2817 	if (flow_table && flow_id <= flow_table->mask) {
2818 		rflow = &flow_table->flows[flow_id];
2819 		cpu = ACCESS_ONCE(rflow->cpu);
2820 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2821 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2822 			   rflow->last_qtail) <
2823 		     (int)(10 * flow_table->mask)))
2824 			expire = false;
2825 	}
2826 	rcu_read_unlock();
2827 	return expire;
2828 }
2829 EXPORT_SYMBOL(rps_may_expire_flow);
2830 
2831 #endif /* CONFIG_RFS_ACCEL */
2832 
2833 /* Called from hardirq (IPI) context */
2834 static void rps_trigger_softirq(void *data)
2835 {
2836 	struct softnet_data *sd = data;
2837 
2838 	____napi_schedule(sd, &sd->backlog);
2839 	sd->received_rps++;
2840 }
2841 
2842 #endif /* CONFIG_RPS */
2843 
2844 /*
2845  * Check whether this softnet_data structure belongs to another CPU.
2846  * If it does, queue it to our IPI list and return 1;
2847  * otherwise return 0.
2848  */
2849 static int rps_ipi_queued(struct softnet_data *sd)
2850 {
2851 #ifdef CONFIG_RPS
2852 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2853 
2854 	if (sd != mysd) {
2855 		sd->rps_ipi_next = mysd->rps_ipi_list;
2856 		mysd->rps_ipi_list = sd;
2857 
2858 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2859 		return 1;
2860 	}
2861 #endif /* CONFIG_RPS */
2862 	return 0;
2863 }
2864 
2865 /*
2866  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2867  * queue (may be a remote CPU queue).
2868  */
2869 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2870 			      unsigned int *qtail)
2871 {
2872 	struct softnet_data *sd;
2873 	unsigned long flags;
2874 
2875 	sd = &per_cpu(softnet_data, cpu);
2876 
2877 	local_irq_save(flags);
2878 
2879 	rps_lock(sd);
2880 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2881 		if (skb_queue_len(&sd->input_pkt_queue)) {
2882 enqueue:
2883 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2884 			input_queue_tail_incr_save(sd, qtail);
2885 			rps_unlock(sd);
2886 			local_irq_restore(flags);
2887 			return NET_RX_SUCCESS;
2888 		}
2889 
2890 		/* Schedule NAPI for the backlog device.
2891 		 * We can use a non-atomic operation since we own the queue lock.
2892 		 */
2893 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2894 			if (!rps_ipi_queued(sd))
2895 				____napi_schedule(sd, &sd->backlog);
2896 		}
2897 		goto enqueue;
2898 	}
2899 
2900 	sd->dropped++;
2901 	rps_unlock(sd);
2902 
2903 	local_irq_restore(flags);
2904 
2905 	atomic_long_inc(&skb->dev->rx_dropped);
2906 	kfree_skb(skb);
2907 	return NET_RX_DROP;
2908 }
2909 
2910 /**
2911  *	netif_rx - post buffer to the network code
2912  *	@skb: buffer to post
2913  *
2914  *	This function receives a packet from a device driver and queues it for
2915  *	the upper (protocol) levels to process.  It always succeeds.  The buffer
2916  *	may be dropped during processing for congestion control or by the
2917  *	protocol layers.
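 *
 *	For illustration, the canonical (sketched) driver receive path is
 *	simply:
 *
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);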
2918  *
2919  *	Return values:
2920  *	NET_RX_SUCCESS	(no congestion)
2921  *	NET_RX_DROP	(packet was dropped)
2922  *
2923  */
2924 
2925 int netif_rx(struct sk_buff *skb)
2926 {
2927 	int ret;
2928 
2929 	/* if netpoll wants it, pretend we never saw it */
2930 	if (netpoll_rx(skb))
2931 		return NET_RX_DROP;
2932 
2933 	if (netdev_tstamp_prequeue)
2934 		net_timestamp_check(skb);
2935 
2936 	trace_netif_rx(skb);
2937 #ifdef CONFIG_RPS
2938 	{
2939 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2940 		int cpu;
2941 
2942 		preempt_disable();
2943 		rcu_read_lock();
2944 
2945 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2946 		if (cpu < 0)
2947 			cpu = smp_processor_id();
2948 
2949 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2950 
2951 		rcu_read_unlock();
2952 		preempt_enable();
2953 	}
2954 #else
2955 	{
2956 		unsigned int qtail;
2957 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2958 		put_cpu();
2959 	}
2960 #endif
2961 	return ret;
2962 }
2963 EXPORT_SYMBOL(netif_rx);
2964 
2965 int netif_rx_ni(struct sk_buff *skb)
2966 {
2967 	int err;
2968 
2969 	preempt_disable();
2970 	err = netif_rx(skb);
2971 	if (local_softirq_pending())
2972 		do_softirq();
2973 	preempt_enable();
2974 
2975 	return err;
2976 }
2977 EXPORT_SYMBOL(netif_rx_ni);
2978 
2979 static void net_tx_action(struct softirq_action *h)
2980 {
2981 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2982 
2983 	if (sd->completion_queue) {
2984 		struct sk_buff *clist;
2985 
2986 		local_irq_disable();
2987 		clist = sd->completion_queue;
2988 		sd->completion_queue = NULL;
2989 		local_irq_enable();
2990 
2991 		while (clist) {
2992 			struct sk_buff *skb = clist;
2993 			clist = clist->next;
2994 
2995 			WARN_ON(atomic_read(&skb->users));
2996 			trace_kfree_skb(skb, net_tx_action);
2997 			__kfree_skb(skb);
2998 		}
2999 	}
3000 
3001 	if (sd->output_queue) {
3002 		struct Qdisc *head;
3003 
3004 		local_irq_disable();
3005 		head = sd->output_queue;
3006 		sd->output_queue = NULL;
3007 		sd->output_queue_tailp = &sd->output_queue;
3008 		local_irq_enable();
3009 
3010 		while (head) {
3011 			struct Qdisc *q = head;
3012 			spinlock_t *root_lock;
3013 
3014 			head = head->next_sched;
3015 
3016 			root_lock = qdisc_lock(q);
3017 			if (spin_trylock(root_lock)) {
3018 				smp_mb__before_clear_bit();
3019 				clear_bit(__QDISC_STATE_SCHED,
3020 					  &q->state);
3021 				qdisc_run(q);
3022 				spin_unlock(root_lock);
3023 			} else {
3024 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3025 					      &q->state)) {
3026 					__netif_reschedule(q);
3027 				} else {
3028 					smp_mb__before_clear_bit();
3029 					clear_bit(__QDISC_STATE_SCHED,
3030 						  &q->state);
3031 				}
3032 			}
3033 		}
3034 	}
3035 }
3036 
3037 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3038     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3039 /* This hook is defined here for ATM LANE */
3040 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3041 			     unsigned char *addr) __read_mostly;
3042 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3043 #endif
3044 
3045 #ifdef CONFIG_NET_CLS_ACT
3046 /* TODO: Maybe we should just force sch_ingress to be compiled in
3047  * whenever CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
3048  * instructions (a compare and 2 extra stores) right now if we don't
3049  * have it on but do have CONFIG_NET_CLS_ACT.
3050  * NOTE: This doesn't stop any functionality; if you don't have
3051  * the ingress scheduler, you just can't add policies on ingress.
3052  *
3053  */
3054 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3055 {
3056 	struct net_device *dev = skb->dev;
3057 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3058 	int result = TC_ACT_OK;
3059 	struct Qdisc *q;
3060 
3061 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3062 		if (net_ratelimit())
3063 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3064 				   skb->skb_iif, dev->ifindex);
3065 		return TC_ACT_SHOT;
3066 	}
3067 
3068 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3069 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3070 
3071 	q = rxq->qdisc;
3072 	if (q != &noop_qdisc) {
3073 		spin_lock(qdisc_lock(q));
3074 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3075 			result = qdisc_enqueue_root(skb, q);
3076 		spin_unlock(qdisc_lock(q));
3077 	}
3078 
3079 	return result;
3080 }
3081 
3082 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3083 					 struct packet_type **pt_prev,
3084 					 int *ret, struct net_device *orig_dev)
3085 {
3086 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3087 
3088 	if (!rxq || rxq->qdisc == &noop_qdisc)
3089 		goto out;
3090 
3091 	if (*pt_prev) {
3092 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3093 		*pt_prev = NULL;
3094 	}
3095 
3096 	switch (ing_filter(skb, rxq)) {
3097 	case TC_ACT_SHOT:
3098 	case TC_ACT_STOLEN:
3099 		kfree_skb(skb);
3100 		return NULL;
3101 	}
3102 
3103 out:
3104 	skb->tc_verd = 0;
3105 	return skb;
3106 }
3107 #endif
3108 
3109 /**
3110  *	netdev_rx_handler_register - register receive handler
3111  *	@dev: device to register a handler for
3112  *	@rx_handler: receive handler to register
3113  *	@rx_handler_data: data pointer that is used by rx handler
3114  *
3115  *	Register a receive handler for a device. This handler will then be
3116  *	called from __netif_receive_skb. A negative errno code is returned
3117  *	on a failure.
3118  *
3119  *	The caller must hold the rtnl_mutex.
3120  *
3121  *	For a general description of rx_handler, see enum rx_handler_result.
3122  */
3123 int netdev_rx_handler_register(struct net_device *dev,
3124 			       rx_handler_func_t *rx_handler,
3125 			       void *rx_handler_data)
3126 {
3127 	ASSERT_RTNL();
3128 
3129 	if (dev->rx_handler)
3130 		return -EBUSY;
3131 
3132 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3133 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3134 
3135 	return 0;
3136 }
3137 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3138 
3139 /**
3140  *	netdev_rx_handler_unregister - unregister receive handler
3141  *	@dev: device to unregister a handler from
3142  *
3143  *	Unregister a receive handler from a device.
3144  *
3145  *	The caller must hold the rtnl_mutex.
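 *
 *	For illustration, a minimal (hypothetical) handler lifecycle under
 *	RTNL; my_handler and my_priv are placeholder names:
 *
 *		static rx_handler_result_t my_handler(struct sk_buff **pskb)
 *		{
 *			return RX_HANDLER_PASS;	// let the stack process the skb
 *		}
 *
 *		rtnl_lock();
 *		err = netdev_rx_handler_register(dev, my_handler, my_priv);
 *		...
 *		netdev_rx_handler_unregister(dev);
 *		rtnl_unlock();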
3146  */
3147 void netdev_rx_handler_unregister(struct net_device *dev)
3148 {
3149 
3150 	ASSERT_RTNL();
3151 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3152 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3153 }
3154 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3155 
3156 static int __netif_receive_skb(struct sk_buff *skb)
3157 {
3158 	struct packet_type *ptype, *pt_prev;
3159 	rx_handler_func_t *rx_handler;
3160 	struct net_device *orig_dev;
3161 	struct net_device *null_or_dev;
3162 	bool deliver_exact = false;
3163 	int ret = NET_RX_DROP;
3164 	__be16 type;
3165 
3166 	if (!netdev_tstamp_prequeue)
3167 		net_timestamp_check(skb);
3168 
3169 	trace_netif_receive_skb(skb);
3170 
3171 	/* if we've gotten here through NAPI, check netpoll */
3172 	if (netpoll_receive_skb(skb))
3173 		return NET_RX_DROP;
3174 
3175 	if (!skb->skb_iif)
3176 		skb->skb_iif = skb->dev->ifindex;
3177 	orig_dev = skb->dev;
3178 
3179 	skb_reset_network_header(skb);
3180 	skb_reset_transport_header(skb);
3181 	skb_reset_mac_len(skb);
3182 
3183 	pt_prev = NULL;
3184 
3185 	rcu_read_lock();
3186 
3187 another_round:
3188 
3189 	__this_cpu_inc(softnet_data.processed);
3190 
3191 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3192 		skb = vlan_untag(skb);
3193 		if (unlikely(!skb))
3194 			goto out;
3195 	}
3196 
3197 #ifdef CONFIG_NET_CLS_ACT
3198 	if (skb->tc_verd & TC_NCLS) {
3199 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3200 		goto ncls;
3201 	}
3202 #endif
3203 
3204 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3205 		if (!ptype->dev || ptype->dev == skb->dev) {
3206 			if (pt_prev)
3207 				ret = deliver_skb(skb, pt_prev, orig_dev);
3208 			pt_prev = ptype;
3209 		}
3210 	}
3211 
3212 #ifdef CONFIG_NET_CLS_ACT
3213 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3214 	if (!skb)
3215 		goto out;
3216 ncls:
3217 #endif
3218 
3219 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3220 	if (rx_handler) {
3221 		if (pt_prev) {
3222 			ret = deliver_skb(skb, pt_prev, orig_dev);
3223 			pt_prev = NULL;
3224 		}
3225 		switch (rx_handler(&skb)) {
3226 		case RX_HANDLER_CONSUMED:
3227 			goto out;
3228 		case RX_HANDLER_ANOTHER:
3229 			goto another_round;
3230 		case RX_HANDLER_EXACT:
3231 			deliver_exact = true;	/* fall through */
3232 		case RX_HANDLER_PASS:
3233 			break;
3234 		default:
3235 			BUG();
3236 		}
3237 	}
3238 
3239 	if (vlan_tx_tag_present(skb)) {
3240 		if (pt_prev) {
3241 			ret = deliver_skb(skb, pt_prev, orig_dev);
3242 			pt_prev = NULL;
3243 		}
3244 		if (vlan_do_receive(&skb))
3245 			goto another_round;
3246 		else if (unlikely(!skb))
3247 			goto out;
3248 	}
3249 
3250 	/* deliver only exact match when indicated */
3251 	null_or_dev = deliver_exact ? skb->dev : NULL;
3252 
3253 	type = skb->protocol;
3254 	list_for_each_entry_rcu(ptype,
3255 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3256 		if (ptype->type == type &&
3257 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3258 		     ptype->dev == orig_dev)) {
3259 			if (pt_prev)
3260 				ret = deliver_skb(skb, pt_prev, orig_dev);
3261 			pt_prev = ptype;
3262 		}
3263 	}
3264 
3265 	if (pt_prev) {
3266 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3267 	} else {
3268 		atomic_long_inc(&skb->dev->rx_dropped);
3269 		kfree_skb(skb);
3270 		/* Jamal, now you will not be able to escape explaining
3271 		 * to me how you were going to use this. :-)
3272 		 */
3273 		ret = NET_RX_DROP;
3274 	}
3275 
3276 out:
3277 	rcu_read_unlock();
3278 	return ret;
3279 }
3280 
3281 /**
3282  *	netif_receive_skb - process receive buffer from network
3283  *	@skb: buffer to process
3284  *
3285  *	netif_receive_skb() is the main receive data processing function.
3286  *	It always succeeds.
The buffer may be dropped during processing 3287 * for congestion control or by the protocol layers. 3288 * 3289 * This function may only be called from softirq context and interrupts 3290 * should be enabled. 3291 * 3292 * Return values (usually ignored): 3293 * NET_RX_SUCCESS: no congestion 3294 * NET_RX_DROP: packet was dropped 3295 */ 3296 int netif_receive_skb(struct sk_buff *skb) 3297 { 3298 if (netdev_tstamp_prequeue) 3299 net_timestamp_check(skb); 3300 3301 if (skb_defer_rx_timestamp(skb)) 3302 return NET_RX_SUCCESS; 3303 3304 #ifdef CONFIG_RPS 3305 { 3306 struct rps_dev_flow voidflow, *rflow = &voidflow; 3307 int cpu, ret; 3308 3309 rcu_read_lock(); 3310 3311 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3312 3313 if (cpu >= 0) { 3314 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3315 rcu_read_unlock(); 3316 } else { 3317 rcu_read_unlock(); 3318 ret = __netif_receive_skb(skb); 3319 } 3320 3321 return ret; 3322 } 3323 #else 3324 return __netif_receive_skb(skb); 3325 #endif 3326 } 3327 EXPORT_SYMBOL(netif_receive_skb); 3328 3329 /* Network device is going away, flush any packets still pending 3330 * Called with irqs disabled. 3331 */ 3332 static void flush_backlog(void *arg) 3333 { 3334 struct net_device *dev = arg; 3335 struct softnet_data *sd = &__get_cpu_var(softnet_data); 3336 struct sk_buff *skb, *tmp; 3337 3338 rps_lock(sd); 3339 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 3340 if (skb->dev == dev) { 3341 __skb_unlink(skb, &sd->input_pkt_queue); 3342 kfree_skb(skb); 3343 input_queue_head_incr(sd); 3344 } 3345 } 3346 rps_unlock(sd); 3347 3348 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 3349 if (skb->dev == dev) { 3350 __skb_unlink(skb, &sd->process_queue); 3351 kfree_skb(skb); 3352 input_queue_head_incr(sd); 3353 } 3354 } 3355 } 3356 3357 static int napi_gro_complete(struct sk_buff *skb) 3358 { 3359 struct packet_type *ptype; 3360 __be16 type = skb->protocol; 3361 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 3362 int err = -ENOENT; 3363 3364 if (NAPI_GRO_CB(skb)->count == 1) { 3365 skb_shinfo(skb)->gso_size = 0; 3366 goto out; 3367 } 3368 3369 rcu_read_lock(); 3370 list_for_each_entry_rcu(ptype, head, list) { 3371 if (ptype->type != type || ptype->dev || !ptype->gro_complete) 3372 continue; 3373 3374 err = ptype->gro_complete(skb); 3375 break; 3376 } 3377 rcu_read_unlock(); 3378 3379 if (err) { 3380 WARN_ON(&ptype->list == head); 3381 kfree_skb(skb); 3382 return NET_RX_SUCCESS; 3383 } 3384 3385 out: 3386 return netif_receive_skb(skb); 3387 } 3388 3389 inline void napi_gro_flush(struct napi_struct *napi) 3390 { 3391 struct sk_buff *skb, *next; 3392 3393 for (skb = napi->gro_list; skb; skb = next) { 3394 next = skb->next; 3395 skb->next = NULL; 3396 napi_gro_complete(skb); 3397 } 3398 3399 napi->gro_count = 0; 3400 napi->gro_list = NULL; 3401 } 3402 EXPORT_SYMBOL(napi_gro_flush); 3403 3404 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3405 { 3406 struct sk_buff **pp = NULL; 3407 struct packet_type *ptype; 3408 __be16 type = skb->protocol; 3409 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 3410 int same_flow; 3411 int mac_len; 3412 enum gro_result ret; 3413 3414 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) 3415 goto normal; 3416 3417 if (skb_is_gso(skb) || skb_has_frag_list(skb)) 3418 goto normal; 3419 3420 rcu_read_lock(); 3421 list_for_each_entry_rcu(ptype, head, list) { 3422 if (ptype->type != type || ptype->dev || !ptype->gro_receive) 3423 continue; 3424 
3425 skb_set_network_header(skb, skb_gro_offset(skb)); 3426 mac_len = skb->network_header - skb->mac_header; 3427 skb->mac_len = mac_len; 3428 NAPI_GRO_CB(skb)->same_flow = 0; 3429 NAPI_GRO_CB(skb)->flush = 0; 3430 NAPI_GRO_CB(skb)->free = 0; 3431 3432 pp = ptype->gro_receive(&napi->gro_list, skb); 3433 break; 3434 } 3435 rcu_read_unlock(); 3436 3437 if (&ptype->list == head) 3438 goto normal; 3439 3440 same_flow = NAPI_GRO_CB(skb)->same_flow; 3441 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 3442 3443 if (pp) { 3444 struct sk_buff *nskb = *pp; 3445 3446 *pp = nskb->next; 3447 nskb->next = NULL; 3448 napi_gro_complete(nskb); 3449 napi->gro_count--; 3450 } 3451 3452 if (same_flow) 3453 goto ok; 3454 3455 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) 3456 goto normal; 3457 3458 napi->gro_count++; 3459 NAPI_GRO_CB(skb)->count = 1; 3460 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 3461 skb->next = napi->gro_list; 3462 napi->gro_list = skb; 3463 ret = GRO_HELD; 3464 3465 pull: 3466 if (skb_headlen(skb) < skb_gro_offset(skb)) { 3467 int grow = skb_gro_offset(skb) - skb_headlen(skb); 3468 3469 BUG_ON(skb->end - skb->tail < grow); 3470 3471 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 3472 3473 skb->tail += grow; 3474 skb->data_len -= grow; 3475 3476 skb_shinfo(skb)->frags[0].page_offset += grow; 3477 skb_shinfo(skb)->frags[0].size -= grow; 3478 3479 if (unlikely(!skb_shinfo(skb)->frags[0].size)) { 3480 skb_frag_unref(skb, 0); 3481 memmove(skb_shinfo(skb)->frags, 3482 skb_shinfo(skb)->frags + 1, 3483 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); 3484 } 3485 } 3486 3487 ok: 3488 return ret; 3489 3490 normal: 3491 ret = GRO_NORMAL; 3492 goto pull; 3493 } 3494 EXPORT_SYMBOL(dev_gro_receive); 3495 3496 static inline gro_result_t 3497 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3498 { 3499 struct sk_buff *p; 3500 3501 for (p = napi->gro_list; p; p = p->next) { 3502 unsigned long diffs; 3503 3504 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 3505 diffs |= p->vlan_tci ^ skb->vlan_tci; 3506 diffs |= compare_ether_header(skb_mac_header(p), 3507 skb_gro_mac_header(skb)); 3508 NAPI_GRO_CB(p)->same_flow = !diffs; 3509 NAPI_GRO_CB(p)->flush = 0; 3510 } 3511 3512 return dev_gro_receive(napi, skb); 3513 } 3514 3515 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 3516 { 3517 switch (ret) { 3518 case GRO_NORMAL: 3519 if (netif_receive_skb(skb)) 3520 ret = GRO_DROP; 3521 break; 3522 3523 case GRO_DROP: 3524 case GRO_MERGED_FREE: 3525 kfree_skb(skb); 3526 break; 3527 3528 case GRO_HELD: 3529 case GRO_MERGED: 3530 break; 3531 } 3532 3533 return ret; 3534 } 3535 EXPORT_SYMBOL(napi_skb_finish); 3536 3537 void skb_gro_reset_offset(struct sk_buff *skb) 3538 { 3539 NAPI_GRO_CB(skb)->data_offset = 0; 3540 NAPI_GRO_CB(skb)->frag0 = NULL; 3541 NAPI_GRO_CB(skb)->frag0_len = 0; 3542 3543 if (skb->mac_header == skb->tail && 3544 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { 3545 NAPI_GRO_CB(skb)->frag0 = 3546 skb_frag_address(&skb_shinfo(skb)->frags[0]); 3547 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; 3548 } 3549 } 3550 EXPORT_SYMBOL(skb_gro_reset_offset); 3551 3552 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 3553 { 3554 skb_gro_reset_offset(skb); 3555 3556 return napi_skb_finish(__napi_gro_receive(napi, skb), skb); 3557 } 3558 EXPORT_SYMBOL(napi_gro_receive); 3559 3560 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 3561 { 3562 
	__skb_pull(skb, skb_headlen(skb));
3563 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3564 	skb->vlan_tci = 0;
3565 	skb->dev = napi->dev;
3566 	skb->skb_iif = 0;
3567 
3568 	napi->skb = skb;
3569 }
3570 
3571 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3572 {
3573 	struct sk_buff *skb = napi->skb;
3574 
3575 	if (!skb) {
3576 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3577 		if (skb)
3578 			napi->skb = skb;
3579 	}
3580 	return skb;
3581 }
3582 EXPORT_SYMBOL(napi_get_frags);
3583 
3584 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3585 			       gro_result_t ret)
3586 {
3587 	switch (ret) {
3588 	case GRO_NORMAL:
3589 	case GRO_HELD:
3590 		skb->protocol = eth_type_trans(skb, skb->dev);
3591 
3592 		if (ret == GRO_HELD)
3593 			skb_gro_pull(skb, -ETH_HLEN);
3594 		else if (netif_receive_skb(skb))
3595 			ret = GRO_DROP;
3596 		break;
3597 
3598 	case GRO_DROP:
3599 	case GRO_MERGED_FREE:
3600 		napi_reuse_skb(napi, skb);
3601 		break;
3602 
3603 	case GRO_MERGED:
3604 		break;
3605 	}
3606 
3607 	return ret;
3608 }
3609 EXPORT_SYMBOL(napi_frags_finish);
3610 
3611 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3612 {
3613 	struct sk_buff *skb = napi->skb;
3614 	struct ethhdr *eth;
3615 	unsigned int hlen;
3616 	unsigned int off;
3617 
3618 	napi->skb = NULL;
3619 
3620 	skb_reset_mac_header(skb);
3621 	skb_gro_reset_offset(skb);
3622 
3623 	off = skb_gro_offset(skb);
3624 	hlen = off + sizeof(*eth);
3625 	eth = skb_gro_header_fast(skb, off);
3626 	if (skb_gro_header_hard(skb, hlen)) {
3627 		eth = skb_gro_header_slow(skb, hlen, off);
3628 		if (unlikely(!eth)) {
3629 			napi_reuse_skb(napi, skb);
3630 			skb = NULL;
3631 			goto out;
3632 		}
3633 	}
3634 
3635 	skb_gro_pull(skb, sizeof(*eth));
3636 
3637 	/*
3638 	 * This works because the only protocols we care about don't require
3639 	 * special handling.  We'll fix it up properly at the end.
3640 	 */
3641 	skb->protocol = eth->h_proto;
3642 
3643 out:
3644 	return skb;
3645 }
3646 EXPORT_SYMBOL(napi_frags_skb);
3647 
3648 gro_result_t napi_gro_frags(struct napi_struct *napi)
3649 {
3650 	struct sk_buff *skb = napi_frags_skb(napi);
3651 
3652 	if (!skb)
3653 		return GRO_DROP;
3654 
3655 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3656 }
3657 EXPORT_SYMBOL(napi_gro_frags);
3658 
3659 /*
3660  * net_rps_action sends any pending IPIs for RPS.
3661  * Note: called with local irq disabled, but exits with local irq enabled.
3662  */
3663 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3664 {
3665 #ifdef CONFIG_RPS
3666 	struct softnet_data *remsd = sd->rps_ipi_list;
3667 
3668 	if (remsd) {
3669 		sd->rps_ipi_list = NULL;
3670 
3671 		local_irq_enable();
3672 
3673 		/* Send pending IPIs to kick RPS processing on remote CPUs. */
3674 		while (remsd) {
3675 			struct softnet_data *next = remsd->rps_ipi_next;
3676 
3677 			if (cpu_online(remsd->cpu))
3678 				__smp_call_function_single(remsd->cpu,
3679 							   &remsd->csd, 0);
3680 			remsd = next;
3681 		}
3682 	} else
3683 #endif
3684 		local_irq_enable();
3685 }
3686 
3687 static int process_backlog(struct napi_struct *napi, int quota)
3688 {
3689 	int work = 0;
3690 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3691 
3692 #ifdef CONFIG_RPS
3693 	/* Check if we have pending IPIs; it is better to send them now
3694 	 * rather than waiting for net_rx_action() to end.
3695 	 */
3696 	if (sd->rps_ipi_list) {
3697 		local_irq_disable();
3698 		net_rps_action_and_irq_enable(sd);
3699 	}
3700 #endif
3701 	napi->weight = weight_p;
3702 	local_irq_disable();
3703 	while (work < quota) {
3704 		struct sk_buff *skb;
3705 		unsigned int qlen;
3706 
3707 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3708 			local_irq_enable();
3709 			__netif_receive_skb(skb);
3710 			local_irq_disable();
3711 			input_queue_head_incr(sd);
3712 			if (++work >= quota) {
3713 				local_irq_enable();
3714 				return work;
3715 			}
3716 		}
3717 
3718 		rps_lock(sd);
3719 		qlen = skb_queue_len(&sd->input_pkt_queue);
3720 		if (qlen)
3721 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3722 						   &sd->process_queue);
3723 
3724 		if (qlen < quota - work) {
3725 			/*
3726 			 * Inline a custom version of __napi_complete().
3727 			 * Only the current CPU owns and manipulates this NAPI,
3728 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3729 			 * so we can use a plain write instead of clear_bit(),
3730 			 * and we don't need an smp_mb() memory barrier.
3731 			 */
3732 			list_del(&napi->poll_list);
3733 			napi->state = 0;
3734 
3735 			quota = work + qlen;
3736 		}
3737 		rps_unlock(sd);
3738 	}
3739 	local_irq_enable();
3740 
3741 	return work;
3742 }
3743 
3744 /**
3745  * __napi_schedule - schedule for receive
3746  * @n: entry to schedule
3747  *
3748  * The entry's receive function will be scheduled to run.
3749  */
3750 void __napi_schedule(struct napi_struct *n)
3751 {
3752 	unsigned long flags;
3753 
3754 	local_irq_save(flags);
3755 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3756 	local_irq_restore(flags);
3757 }
3758 EXPORT_SYMBOL(__napi_schedule);
3759 
3760 void __napi_complete(struct napi_struct *n)
3761 {
3762 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3763 	BUG_ON(n->gro_list);
3764 
3765 	list_del(&n->poll_list);
3766 	smp_mb__before_clear_bit();
3767 	clear_bit(NAPI_STATE_SCHED, &n->state);
3768 }
3769 EXPORT_SYMBOL(__napi_complete);
3770 
3771 void napi_complete(struct napi_struct *n)
3772 {
3773 	unsigned long flags;
3774 
3775 	/*
3776 	 * Don't let napi dequeue from the cpu poll list,
3777 	 * just in case it's running on a different cpu.
3778 	 */
3779 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3780 		return;
3781 
3782 	napi_gro_flush(n);
3783 	local_irq_save(flags);
3784 	__napi_complete(n);
3785 	local_irq_restore(flags);
3786 }
3787 EXPORT_SYMBOL(napi_complete);
3788 
3789 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3790 		    int (*poll)(struct napi_struct *, int), int weight)
3791 {
3792 	INIT_LIST_HEAD(&napi->poll_list);
3793 	napi->gro_count = 0;
3794 	napi->gro_list = NULL;
3795 	napi->skb = NULL;
3796 	napi->poll = poll;
3797 	napi->weight = weight;
3798 	list_add(&napi->dev_list, &dev->napi_list);
3799 	napi->dev = dev;
3800 #ifdef CONFIG_NETPOLL
3801 	spin_lock_init(&napi->poll_lock);
3802 	napi->poll_owner = -1;
3803 #endif
3804 	set_bit(NAPI_STATE_SCHED, &napi->state);
3805 }
3806 EXPORT_SYMBOL(netif_napi_add);
3807 
3808 void netif_napi_del(struct napi_struct *napi)
3809 {
3810 	struct sk_buff *skb, *next;
3811 
3812 	list_del_init(&napi->dev_list);
3813 	napi_free_frags(napi);
3814 
3815 	for (skb = napi->gro_list; skb; skb = next) {
3816 		next = skb->next;
3817 		skb->next = NULL;
3818 		kfree_skb(skb);
3819 	}
3820 
3821 	napi->gro_list = NULL;
3822 	napi->gro_count = 0;
3823 }
3824 EXPORT_SYMBOL(netif_napi_del);
3825 
3826 static void net_rx_action(struct softirq_action *h)
3827 {
3828 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3829 	unsigned long time_limit = jiffies + 2;
3830 	int budget = netdev_budget;
3831 	void *have;
3832 
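	/*
	 * For reference, each n->poll() invoked below is expected to honour
	 * the usual NAPI contract (a hedged sketch with hypothetical driver
	 * helpers my_clean_rx_ring() and my_enable_rx_irq()): process at
	 * most "budget" packets, and complete NAPI and re-arm the interrupt
	 * only when it finishes early:
	 *
	 *	static int my_poll(struct napi_struct *napi, int budget)
	 *	{
	 *		int work = my_clean_rx_ring(napi, budget);
	 *
	 *		if (work < budget) {
	 *			napi_complete(napi);
	 *			my_enable_rx_irq(napi);
	 *		}
	 *		return work;
	 *	}
	 */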
3833 	local_irq_disable();
3834 
3835 	while (!list_empty(&sd->poll_list)) {
3836 		struct napi_struct *n;
3837 		int work, weight;
3838 
3839 		/* If the softirq window is exhausted then punt.
3840 		 * Allow this to run for 2 jiffies, which allows
3841 		 * an average latency of 1.5/HZ.
3842 		 */
3843 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3844 			goto softnet_break;
3845 
3846 		local_irq_enable();
3847 
3848 		/* Even though interrupts have been re-enabled, this
3849 		 * access is safe because interrupts can only add new
3850 		 * entries to the tail of this list, and only ->poll()
3851 		 * calls can remove this head entry from the list.
3852 		 */
3853 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3854 
3855 		have = netpoll_poll_lock(n);
3856 
3857 		weight = n->weight;
3858 
3859 		/* This NAPI_STATE_SCHED test is for avoiding a race
3860 		 * with netpoll's poll_napi().  Only the entity which
3861 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3862 		 * actually make the ->poll() call.  Therefore we avoid
3863 		 * accidentally calling ->poll() when NAPI is not scheduled.
3864 		 */
3865 		work = 0;
3866 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3867 			work = n->poll(n, weight);
3868 			trace_napi_poll(n);
3869 		}
3870 
3871 		WARN_ON_ONCE(work > weight);
3872 
3873 		budget -= work;
3874 
3875 		local_irq_disable();
3876 
3877 		/* Drivers must not modify the NAPI state if they
3878 		 * consume the entire weight.  In such cases this code
3879 		 * still "owns" the NAPI instance and therefore can
3880 		 * move the instance around on the list at-will.
3881 		 */
3882 		if (unlikely(work == weight)) {
3883 			if (unlikely(napi_disable_pending(n))) {
3884 				local_irq_enable();
3885 				napi_complete(n);
3886 				local_irq_disable();
3887 			} else
3888 				list_move_tail(&n->poll_list, &sd->poll_list);
3889 		}
3890 
3891 		netpoll_poll_unlock(have);
3892 	}
3893 out:
3894 	net_rps_action_and_irq_enable(sd);
3895 
3896 #ifdef CONFIG_NET_DMA
3897 	/*
3898 	 * There may not be any more sk_buffs coming right now, so push
3899 	 * any pending DMA copies to hardware
3900 	 */
3901 	dma_issue_pending_all();
3902 #endif
3903 
3904 	return;
3905 
3906 softnet_break:
3907 	sd->time_squeeze++;
3908 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3909 	goto out;
3910 }
3911 
3912 static gifconf_func_t *gifconf_list[NPROTO];
3913 
3914 /**
3915  *	register_gifconf - register a SIOCGIF handler
3916  *	@family: Address family
3917  *	@gifconf: Function handler
3918  *
3919  *	Register protocol-dependent address dumping routines. The handler
3920  *	that is passed must not be freed or reused until it has been replaced
3921  *	by another handler.
3922  */
3923 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3924 {
3925 	if (family >= NPROTO)
3926 		return -EINVAL;
3927 	gifconf_list[family] = gifconf;
3928 	return 0;
3929 }
3930 EXPORT_SYMBOL(register_gifconf);
3931 
3932 
3933 /*
3934  *	Map an interface index to its name (SIOCGIFNAME)
3935  */
3936 
3937 /*
3938  *	We need this ioctl for efficient implementation of the
3939  *	if_indextoname() function required by the IPv6 API.  Without
3940  *	it, we would have to search all the interfaces to find a
3941  *	match.  --pb
3942  */
3943 
3944 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3945 {
3946 	struct net_device *dev;
3947 	struct ifreq ifr;
3948 
3949 	/*
3950 	 *	Fetch the caller's info block.
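	 *
	 *	(Userspace sketch of this ioctl, for illustration; sockfd is
	 *	any socket and error handling is elided:)
	 *
	 *		struct ifreq ifr;
	 *		ifr.ifr_ifindex = 2;			// hypothetical index
	 *		ioctl(sockfd, SIOCGIFNAME, &ifr);	// fills ifr.ifr_name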
3951 */ 3952 3953 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 3954 return -EFAULT; 3955 3956 rcu_read_lock(); 3957 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); 3958 if (!dev) { 3959 rcu_read_unlock(); 3960 return -ENODEV; 3961 } 3962 3963 strcpy(ifr.ifr_name, dev->name); 3964 rcu_read_unlock(); 3965 3966 if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) 3967 return -EFAULT; 3968 return 0; 3969 } 3970 3971 /* 3972 * Perform a SIOCGIFCONF call. This structure will change 3973 * size eventually, and there is nothing I can do about it. 3974 * Thus we will need a 'compatibility mode'. 3975 */ 3976 3977 static int dev_ifconf(struct net *net, char __user *arg) 3978 { 3979 struct ifconf ifc; 3980 struct net_device *dev; 3981 char __user *pos; 3982 int len; 3983 int total; 3984 int i; 3985 3986 /* 3987 * Fetch the caller's info block. 3988 */ 3989 3990 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 3991 return -EFAULT; 3992 3993 pos = ifc.ifc_buf; 3994 len = ifc.ifc_len; 3995 3996 /* 3997 * Loop over the interfaces, and write an info block for each. 3998 */ 3999 4000 total = 0; 4001 for_each_netdev(net, dev) { 4002 for (i = 0; i < NPROTO; i++) { 4003 if (gifconf_list[i]) { 4004 int done; 4005 if (!pos) 4006 done = gifconf_list[i](dev, NULL, 0); 4007 else 4008 done = gifconf_list[i](dev, pos + total, 4009 len - total); 4010 if (done < 0) 4011 return -EFAULT; 4012 total += done; 4013 } 4014 } 4015 } 4016 4017 /* 4018 * All done. Write the updated control block back to the caller. 4019 */ 4020 ifc.ifc_len = total; 4021 4022 /* 4023 * Both BSD and Solaris return 0 here, so we do too. 4024 */ 4025 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; 4026 } 4027 4028 #ifdef CONFIG_PROC_FS 4029 /* 4030 * This is invoked by the /proc filesystem handler to display a device 4031 * in detail. 
4032 */ 4033 void *dev_seq_start(struct seq_file *seq, loff_t *pos) 4034 __acquires(RCU) 4035 { 4036 struct net *net = seq_file_net(seq); 4037 loff_t off; 4038 struct net_device *dev; 4039 4040 rcu_read_lock(); 4041 if (!*pos) 4042 return SEQ_START_TOKEN; 4043 4044 off = 1; 4045 for_each_netdev_rcu(net, dev) 4046 if (off++ == *pos) 4047 return dev; 4048 4049 return NULL; 4050 } 4051 4052 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4053 { 4054 struct net_device *dev = v; 4055 4056 if (v == SEQ_START_TOKEN) 4057 dev = first_net_device_rcu(seq_file_net(seq)); 4058 else 4059 dev = next_net_device_rcu(dev); 4060 4061 ++*pos; 4062 return dev; 4063 } 4064 4065 void dev_seq_stop(struct seq_file *seq, void *v) 4066 __releases(RCU) 4067 { 4068 rcu_read_unlock(); 4069 } 4070 4071 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 4072 { 4073 struct rtnl_link_stats64 temp; 4074 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); 4075 4076 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " 4077 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", 4078 dev->name, stats->rx_bytes, stats->rx_packets, 4079 stats->rx_errors, 4080 stats->rx_dropped + stats->rx_missed_errors, 4081 stats->rx_fifo_errors, 4082 stats->rx_length_errors + stats->rx_over_errors + 4083 stats->rx_crc_errors + stats->rx_frame_errors, 4084 stats->rx_compressed, stats->multicast, 4085 stats->tx_bytes, stats->tx_packets, 4086 stats->tx_errors, stats->tx_dropped, 4087 stats->tx_fifo_errors, stats->collisions, 4088 stats->tx_carrier_errors + 4089 stats->tx_aborted_errors + 4090 stats->tx_window_errors + 4091 stats->tx_heartbeat_errors, 4092 stats->tx_compressed); 4093 } 4094 4095 /* 4096 * Called from the PROCfs module. 
This now uses the new arbitrary sized 4097 * /proc/net interface to create /proc/net/dev 4098 */ 4099 static int dev_seq_show(struct seq_file *seq, void *v) 4100 { 4101 if (v == SEQ_START_TOKEN) 4102 seq_puts(seq, "Inter-| Receive " 4103 " | Transmit\n" 4104 " face |bytes packets errs drop fifo frame " 4105 "compressed multicast|bytes packets errs " 4106 "drop fifo colls carrier compressed\n"); 4107 else 4108 dev_seq_printf_stats(seq, v); 4109 return 0; 4110 } 4111 4112 static struct softnet_data *softnet_get_online(loff_t *pos) 4113 { 4114 struct softnet_data *sd = NULL; 4115 4116 while (*pos < nr_cpu_ids) 4117 if (cpu_online(*pos)) { 4118 sd = &per_cpu(softnet_data, *pos); 4119 break; 4120 } else 4121 ++*pos; 4122 return sd; 4123 } 4124 4125 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 4126 { 4127 return softnet_get_online(pos); 4128 } 4129 4130 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4131 { 4132 ++*pos; 4133 return softnet_get_online(pos); 4134 } 4135 4136 static void softnet_seq_stop(struct seq_file *seq, void *v) 4137 { 4138 } 4139 4140 static int softnet_seq_show(struct seq_file *seq, void *v) 4141 { 4142 struct softnet_data *sd = v; 4143 4144 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 4145 sd->processed, sd->dropped, sd->time_squeeze, 0, 4146 0, 0, 0, 0, /* was fastroute */ 4147 sd->cpu_collision, sd->received_rps); 4148 return 0; 4149 } 4150 4151 static const struct seq_operations dev_seq_ops = { 4152 .start = dev_seq_start, 4153 .next = dev_seq_next, 4154 .stop = dev_seq_stop, 4155 .show = dev_seq_show, 4156 }; 4157 4158 static int dev_seq_open(struct inode *inode, struct file *file) 4159 { 4160 return seq_open_net(inode, file, &dev_seq_ops, 4161 sizeof(struct seq_net_private)); 4162 } 4163 4164 static const struct file_operations dev_seq_fops = { 4165 .owner = THIS_MODULE, 4166 .open = dev_seq_open, 4167 .read = seq_read, 4168 .llseek = seq_lseek, 4169 .release = seq_release_net, 4170 }; 4171 4172 static const struct seq_operations softnet_seq_ops = { 4173 .start = softnet_seq_start, 4174 .next = softnet_seq_next, 4175 .stop = softnet_seq_stop, 4176 .show = softnet_seq_show, 4177 }; 4178 4179 static int softnet_seq_open(struct inode *inode, struct file *file) 4180 { 4181 return seq_open(file, &softnet_seq_ops); 4182 } 4183 4184 static const struct file_operations softnet_seq_fops = { 4185 .owner = THIS_MODULE, 4186 .open = softnet_seq_open, 4187 .read = seq_read, 4188 .llseek = seq_lseek, 4189 .release = seq_release, 4190 }; 4191 4192 static void *ptype_get_idx(loff_t pos) 4193 { 4194 struct packet_type *pt = NULL; 4195 loff_t i = 0; 4196 int t; 4197 4198 list_for_each_entry_rcu(pt, &ptype_all, list) { 4199 if (i == pos) 4200 return pt; 4201 ++i; 4202 } 4203 4204 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 4205 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 4206 if (i == pos) 4207 return pt; 4208 ++i; 4209 } 4210 } 4211 return NULL; 4212 } 4213 4214 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 4215 __acquires(RCU) 4216 { 4217 rcu_read_lock(); 4218 return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; 4219 } 4220 4221 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4222 { 4223 struct packet_type *pt; 4224 struct list_head *nxt; 4225 int hash; 4226 4227 ++*pos; 4228 if (v == SEQ_START_TOKEN) 4229 return ptype_get_idx(0); 4230 4231 pt = v; 4232 nxt = pt->list.next; 4233 if (pt->type == htons(ETH_P_ALL)) { 4234 if (nxt != &ptype_all) 4235 goto found; 4236 hash = 0; 4237 nxt = ptype_base[0].next; 4238 } else 4239 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 4240 4241 while (nxt == &ptype_base[hash]) { 4242 if (++hash >= PTYPE_HASH_SIZE) 4243 return NULL; 4244 nxt = ptype_base[hash].next; 4245 } 4246 found: 4247 return list_entry(nxt, struct packet_type, list); 4248 } 4249 4250 static void ptype_seq_stop(struct seq_file *seq, void *v) 4251 __releases(RCU) 4252 { 4253 rcu_read_unlock(); 4254 } 4255 4256 static int ptype_seq_show(struct seq_file *seq, void *v) 4257 { 4258 struct packet_type *pt = v; 4259 4260 if (v == SEQ_START_TOKEN) 4261 seq_puts(seq, "Type Device Function\n"); 4262 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 4263 if (pt->type == htons(ETH_P_ALL)) 4264 seq_puts(seq, "ALL "); 4265 else 4266 seq_printf(seq, "%04x", ntohs(pt->type)); 4267 4268 seq_printf(seq, " %-8s %pF\n", 4269 pt->dev ? pt->dev->name : "", pt->func); 4270 } 4271 4272 return 0; 4273 } 4274 4275 static const struct seq_operations ptype_seq_ops = { 4276 .start = ptype_seq_start, 4277 .next = ptype_seq_next, 4278 .stop = ptype_seq_stop, 4279 .show = ptype_seq_show, 4280 }; 4281 4282 static int ptype_seq_open(struct inode *inode, struct file *file) 4283 { 4284 return seq_open_net(inode, file, &ptype_seq_ops, 4285 sizeof(struct seq_net_private)); 4286 } 4287 4288 static const struct file_operations ptype_seq_fops = { 4289 .owner = THIS_MODULE, 4290 .open = ptype_seq_open, 4291 .read = seq_read, 4292 .llseek = seq_lseek, 4293 .release = seq_release_net, 4294 }; 4295 4296 4297 static int __net_init dev_proc_net_init(struct net *net) 4298 { 4299 int rc = -ENOMEM; 4300 4301 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) 4302 goto out; 4303 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) 4304 goto out_dev; 4305 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) 4306 goto out_softnet; 4307 4308 if (wext_proc_init(net)) 4309 goto out_ptype; 4310 rc = 0; 4311 out: 4312 return rc; 4313 out_ptype: 4314 proc_net_remove(net, "ptype"); 4315 out_softnet: 4316 proc_net_remove(net, "softnet_stat"); 4317 out_dev: 4318 proc_net_remove(net, "dev"); 4319 goto out; 4320 } 4321 4322 static void __net_exit dev_proc_net_exit(struct net *net) 4323 { 4324 wext_proc_exit(net); 4325 4326 proc_net_remove(net, "ptype"); 4327 proc_net_remove(net, "softnet_stat"); 4328 proc_net_remove(net, "dev"); 4329 } 4330 4331 static struct pernet_operations __net_initdata dev_proc_ops = { 4332 .init = dev_proc_net_init, 4333 .exit = dev_proc_net_exit, 4334 }; 4335 4336 static int __init dev_proc_init(void) 4337 { 4338 return register_pernet_subsys(&dev_proc_ops); 4339 } 4340 #else 4341 #define dev_proc_init() 0 4342 #endif /* CONFIG_PROC_FS */ 4343 4344 4345 /** 4346 * netdev_set_master - set up master pointer 4347 * @slave: slave device 4348 * @master: new master device 4349 * 4350 * Changes the master device of the slave. Pass %NULL to break the 4351 * bonding. The caller must hold the RTNL semaphore. On a failure 4352 * a negative errno code is returned. 
On success the reference counts 4353 * are adjusted and the function returns zero. 4354 */ 4355 int netdev_set_master(struct net_device *slave, struct net_device *master) 4356 { 4357 struct net_device *old = slave->master; 4358 4359 ASSERT_RTNL(); 4360 4361 if (master) { 4362 if (old) 4363 return -EBUSY; 4364 dev_hold(master); 4365 } 4366 4367 slave->master = master; 4368 4369 if (old) 4370 dev_put(old); 4371 return 0; 4372 } 4373 EXPORT_SYMBOL(netdev_set_master); 4374 4375 /** 4376 * netdev_set_bond_master - set up bonding master/slave pair 4377 * @slave: slave device 4378 * @master: new master device 4379 * 4380 * Changes the master device of the slave. Pass %NULL to break the 4381 * bonding. The caller must hold the RTNL semaphore. On a failure 4382 * a negative errno code is returned. On success %RTM_NEWLINK is sent 4383 * to the routing socket and the function returns zero. 4384 */ 4385 int netdev_set_bond_master(struct net_device *slave, struct net_device *master) 4386 { 4387 int err; 4388 4389 ASSERT_RTNL(); 4390 4391 err = netdev_set_master(slave, master); 4392 if (err) 4393 return err; 4394 if (master) 4395 slave->flags |= IFF_SLAVE; 4396 else 4397 slave->flags &= ~IFF_SLAVE; 4398 4399 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 4400 return 0; 4401 } 4402 EXPORT_SYMBOL(netdev_set_bond_master); 4403 4404 static void dev_change_rx_flags(struct net_device *dev, int flags) 4405 { 4406 const struct net_device_ops *ops = dev->netdev_ops; 4407 4408 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) 4409 ops->ndo_change_rx_flags(dev, flags); 4410 } 4411 4412 static int __dev_set_promiscuity(struct net_device *dev, int inc) 4413 { 4414 unsigned short old_flags = dev->flags; 4415 uid_t uid; 4416 gid_t gid; 4417 4418 ASSERT_RTNL(); 4419 4420 dev->flags |= IFF_PROMISC; 4421 dev->promiscuity += inc; 4422 if (dev->promiscuity == 0) { 4423 /* 4424 * Avoid overflow. 4425 * If inc causes an overflow, leave promisc untouched and return an error. 4426 */ 4427 if (inc < 0) 4428 dev->flags &= ~IFF_PROMISC; 4429 else { 4430 dev->promiscuity -= inc; 4431 printk(KERN_WARNING "%s: promiscuity touches roof, " 4432 "set promiscuity failed, promiscuity feature " 4433 "of device might be broken.\n", dev->name); 4434 return -EOVERFLOW; 4435 } 4436 } 4437 if (dev->flags != old_flags) { 4438 printk(KERN_INFO "device %s %s promiscuous mode\n", 4439 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 4440 "left"); 4441 if (audit_enabled) { 4442 current_uid_gid(&uid, &gid); 4443 audit_log(current->audit_context, GFP_ATOMIC, 4444 AUDIT_ANOM_PROMISCUOUS, 4445 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 4446 dev->name, (dev->flags & IFF_PROMISC), 4447 (old_flags & IFF_PROMISC), 4448 audit_get_loginuid(current), 4449 uid, gid, 4450 audit_get_sessionid(current)); 4451 } 4452 4453 dev_change_rx_flags(dev, IFF_PROMISC); 4454 } 4455 return 0; 4456 } 4457 4458 /** 4459 * dev_set_promiscuity - update promiscuity count on a device 4460 * @dev: device 4461 * @inc: modifier 4462 * 4463 * Add or remove promiscuity from a device. While the count in the device 4464 * remains above zero the interface remains promiscuous. Once it hits zero 4465 * the device reverts back to normal filtering operation. A negative inc 4466 * value is used to drop promiscuity on the device. 4467 * Return 0 if successful or a negative errno code on error.
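 * A minimal usage sketch (hypothetical caller, not taken from this file); the RTNL lock must be held, as __dev_set_promiscuity() asserts: *	rtnl_lock(); *	err = dev_set_promiscuity(dev, 1);	(enter promiscuous mode) *	rtnl_unlock(); *	if (err) *		return err; *	... *	rtnl_lock(); *	dev_set_promiscuity(dev, -1);		(drop it again when done) *	rtnl_unlock();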
4468 */ 4469 int dev_set_promiscuity(struct net_device *dev, int inc) 4470 { 4471 unsigned short old_flags = dev->flags; 4472 int err; 4473 4474 err = __dev_set_promiscuity(dev, inc); 4475 if (err < 0) 4476 return err; 4477 if (dev->flags != old_flags) 4478 dev_set_rx_mode(dev); 4479 return err; 4480 } 4481 EXPORT_SYMBOL(dev_set_promiscuity); 4482 4483 /** 4484 * dev_set_allmulti - update allmulti count on a device 4485 * @dev: device 4486 * @inc: modifier 4487 * 4488 * Add or remove reception of all multicast frames to a device. While the 4489 * count in the device remains above zero the interface remains listening 4490 * to all multicast frames. Once it hits zero the device reverts back to normal 4491 * filtering operation. A negative @inc value is used to drop the counter 4492 * when releasing a resource needing all multicasts. 4493 * Return 0 if successful or a negative errno code on error. 4494 */ 4495 4496 int dev_set_allmulti(struct net_device *dev, int inc) 4497 { 4498 unsigned short old_flags = dev->flags; 4499 4500 ASSERT_RTNL(); 4501 4502 dev->flags |= IFF_ALLMULTI; 4503 dev->allmulti += inc; 4504 if (dev->allmulti == 0) { 4505 /* 4506 * Avoid overflow. 4507 * If inc causes an overflow, leave allmulti untouched and return an error. 4508 */ 4509 if (inc < 0) 4510 dev->flags &= ~IFF_ALLMULTI; 4511 else { 4512 dev->allmulti -= inc; 4513 printk(KERN_WARNING "%s: allmulti touches roof, " 4514 "set allmulti failed, allmulti feature of " 4515 "device might be broken.\n", dev->name); 4516 return -EOVERFLOW; 4517 } 4518 } 4519 if (dev->flags ^ old_flags) { 4520 dev_change_rx_flags(dev, IFF_ALLMULTI); 4521 dev_set_rx_mode(dev); 4522 } 4523 return 0; 4524 } 4525 EXPORT_SYMBOL(dev_set_allmulti); 4526 4527 /* 4528 * Upload unicast and multicast address lists to device and 4529 * configure RX filtering. When the device doesn't support unicast 4530 * filtering it is put in promiscuous mode while unicast addresses 4531 * are present. 4532 */ 4533 void __dev_set_rx_mode(struct net_device *dev) 4534 { 4535 const struct net_device_ops *ops = dev->netdev_ops; 4536 4537 /* dev_open will call this function so the list will stay sane. */ 4538 if (!(dev->flags&IFF_UP)) 4539 return; 4540 4541 if (!netif_device_present(dev)) 4542 return; 4543 4544 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 4545 /* Unicast address changes may only happen under the rtnl, 4546 * therefore calling __dev_set_promiscuity here is safe. 4547 */ 4548 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 4549 __dev_set_promiscuity(dev, 1); 4550 dev->uc_promisc = true; 4551 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 4552 __dev_set_promiscuity(dev, -1); 4553 dev->uc_promisc = false; 4554 } 4555 } 4556 4557 if (ops->ndo_set_rx_mode) 4558 ops->ndo_set_rx_mode(dev); 4559 } 4560 4561 void dev_set_rx_mode(struct net_device *dev) 4562 { 4563 netif_addr_lock_bh(dev); 4564 __dev_set_rx_mode(dev); 4565 netif_addr_unlock_bh(dev); 4566 } 4567 4568 /** 4569 * dev_get_flags - get flags reported to userspace 4570 * @dev: device 4571 * 4572 * Get the combination of flag bits exported through APIs to userspace.
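 * For example, this is exactly what the SIOCGIFFLAGS ioctl reports; see dev_ifsioc_locked() below: *	ifr->ifr_flags = (short) dev_get_flags(dev);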
4573 */ 4574 unsigned dev_get_flags(const struct net_device *dev) 4575 { 4576 unsigned flags; 4577 4578 flags = (dev->flags & ~(IFF_PROMISC | 4579 IFF_ALLMULTI | 4580 IFF_RUNNING | 4581 IFF_LOWER_UP | 4582 IFF_DORMANT)) | 4583 (dev->gflags & (IFF_PROMISC | 4584 IFF_ALLMULTI)); 4585 4586 if (netif_running(dev)) { 4587 if (netif_oper_up(dev)) 4588 flags |= IFF_RUNNING; 4589 if (netif_carrier_ok(dev)) 4590 flags |= IFF_LOWER_UP; 4591 if (netif_dormant(dev)) 4592 flags |= IFF_DORMANT; 4593 } 4594 4595 return flags; 4596 } 4597 EXPORT_SYMBOL(dev_get_flags); 4598 4599 int __dev_change_flags(struct net_device *dev, unsigned int flags) 4600 { 4601 int old_flags = dev->flags; 4602 int ret; 4603 4604 ASSERT_RTNL(); 4605 4606 /* 4607 * Set the flags on our device. 4608 */ 4609 4610 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 4611 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 4612 IFF_AUTOMEDIA)) | 4613 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 4614 IFF_ALLMULTI)); 4615 4616 /* 4617 * Load in the correct multicast list now that the flags have changed. 4618 */ 4619 4620 if ((old_flags ^ flags) & IFF_MULTICAST) 4621 dev_change_rx_flags(dev, IFF_MULTICAST); 4622 4623 dev_set_rx_mode(dev); 4624 4625 /* 4626 * Have we downed the interface? We handle IFF_UP ourselves 4627 * according to user attempts to set it, rather than blindly 4628 * setting it. 4629 */ 4630 4631 ret = 0; 4632 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different? */ 4633 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 4634 4635 if (!ret) 4636 dev_set_rx_mode(dev); 4637 } 4638 4639 if ((flags ^ dev->gflags) & IFF_PROMISC) { 4640 int inc = (flags & IFF_PROMISC) ? 1 : -1; 4641 4642 dev->gflags ^= IFF_PROMISC; 4643 dev_set_promiscuity(dev, inc); 4644 } 4645 4646 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 4647 is important. Some (broken) drivers set IFF_PROMISC when 4648 IFF_ALLMULTI is requested, without asking us and without reporting. 4649 */ 4650 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 4651 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 4652 4653 dev->gflags ^= IFF_ALLMULTI; 4654 dev_set_allmulti(dev, inc); 4655 } 4656 4657 return ret; 4658 } 4659 4660 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) 4661 { 4662 unsigned int changes = dev->flags ^ old_flags; 4663 4664 if (changes & IFF_UP) { 4665 if (dev->flags & IFF_UP) 4666 call_netdevice_notifiers(NETDEV_UP, dev); 4667 else 4668 call_netdevice_notifiers(NETDEV_DOWN, dev); 4669 } 4670 4671 if (dev->flags & IFF_UP && 4672 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) 4673 call_netdevice_notifiers(NETDEV_CHANGE, dev); 4674 } 4675 4676 /** 4677 * dev_change_flags - change device settings 4678 * @dev: device 4679 * @flags: device state flags 4680 * 4681 * Change settings on a device based on state flags. The flags are 4682 * in the userspace-exported format. 4683 */ 4684 int dev_change_flags(struct net_device *dev, unsigned flags) 4685 { 4686 int ret, changes; 4687 int old_flags = dev->flags; 4688 4689 ret = __dev_change_flags(dev, flags); 4690 if (ret < 0) 4691 return ret; 4692 4693 changes = old_flags ^ dev->flags; 4694 if (changes) 4695 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 4696 4697 __dev_notify_flags(dev, old_flags); 4698 return ret; 4699 } 4700 EXPORT_SYMBOL(dev_change_flags); 4701 4702 /** 4703 * dev_set_mtu - Change maximum transfer unit 4704 * @dev: device 4705 * @new_mtu: new transfer unit 4706 * 4707 * Change the maximum transfer size of the network device.
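 * A short sketch (hypothetical caller, not taken from this file); the request is routed through the driver's ndo_change_mtu method when one is provided: *	rtnl_lock(); *	err = dev_set_mtu(dev, 9000);	(jumbo frames, if the driver accepts them) *	rtnl_unlock();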
4708 */ 4709 int dev_set_mtu(struct net_device *dev, int new_mtu) 4710 { 4711 const struct net_device_ops *ops = dev->netdev_ops; 4712 int err; 4713 4714 if (new_mtu == dev->mtu) 4715 return 0; 4716 4717 /* MTU must not be negative. */ 4718 if (new_mtu < 0) 4719 return -EINVAL; 4720 4721 if (!netif_device_present(dev)) 4722 return -ENODEV; 4723 4724 err = 0; 4725 if (ops->ndo_change_mtu) 4726 err = ops->ndo_change_mtu(dev, new_mtu); 4727 else 4728 dev->mtu = new_mtu; 4729 4730 if (!err && dev->flags & IFF_UP) 4731 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 4732 return err; 4733 } 4734 EXPORT_SYMBOL(dev_set_mtu); 4735 4736 /** 4737 * dev_set_group - Change group this device belongs to 4738 * @dev: device 4739 * @new_group: group this device should belong to 4740 */ 4741 void dev_set_group(struct net_device *dev, int new_group) 4742 { 4743 dev->group = new_group; 4744 } 4745 EXPORT_SYMBOL(dev_set_group); 4746 4747 /** 4748 * dev_set_mac_address - Change Media Access Control Address 4749 * @dev: device 4750 * @sa: new address 4751 * 4752 * Change the hardware (MAC) address of the device. 4753 */ 4754 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 4755 { 4756 const struct net_device_ops *ops = dev->netdev_ops; 4757 int err; 4758 4759 if (!ops->ndo_set_mac_address) 4760 return -EOPNOTSUPP; 4761 if (sa->sa_family != dev->type) 4762 return -EINVAL; 4763 if (!netif_device_present(dev)) 4764 return -ENODEV; 4765 err = ops->ndo_set_mac_address(dev, sa); 4766 if (!err) 4767 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4768 return err; 4769 } 4770 EXPORT_SYMBOL(dev_set_mac_address); 4771 4772 /* 4773 * Perform the SIOCxIFxxx calls, inside rcu_read_lock() 4774 */ 4775 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) 4776 { 4777 int err; 4778 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); 4779 4780 if (!dev) 4781 return -ENODEV; 4782 4783 switch (cmd) { 4784 case SIOCGIFFLAGS: /* Get interface flags */ 4785 ifr->ifr_flags = (short) dev_get_flags(dev); 4786 return 0; 4787 4788 case SIOCGIFMETRIC: /* Get the metric on the interface 4789 (currently unused) */ 4790 ifr->ifr_metric = 0; 4791 return 0; 4792 4793 case SIOCGIFMTU: /* Get the MTU of a device */ 4794 ifr->ifr_mtu = dev->mtu; 4795 return 0; 4796 4797 case SIOCGIFHWADDR: 4798 if (!dev->addr_len) 4799 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); 4800 else 4801 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 4802 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4803 ifr->ifr_hwaddr.sa_family = dev->type; 4804 return 0; 4805 4806 case SIOCGIFSLAVE: 4807 err = -EINVAL; 4808 break; 4809 4810 case SIOCGIFMAP: 4811 ifr->ifr_map.mem_start = dev->mem_start; 4812 ifr->ifr_map.mem_end = dev->mem_end; 4813 ifr->ifr_map.base_addr = dev->base_addr; 4814 ifr->ifr_map.irq = dev->irq; 4815 ifr->ifr_map.dma = dev->dma; 4816 ifr->ifr_map.port = dev->if_port; 4817 return 0; 4818 4819 case SIOCGIFINDEX: 4820 ifr->ifr_ifindex = dev->ifindex; 4821 return 0; 4822 4823 case SIOCGIFTXQLEN: 4824 ifr->ifr_qlen = dev->tx_queue_len; 4825 return 0; 4826 4827 default: 4828 /* dev_ioctl() should ensure this case 4829 * is never reached 4830 */ 4831 WARN_ON(1); 4832 err = -ENOTTY; 4833 break; 4834 4835 } 4836 return err; 4837 } 4838 4839 /* 4840 * Perform the SIOCxIFxxx calls, inside rtnl_lock() 4841 */ 4842 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 4843 { 4844 int err; 4845 struct net_device *dev = __dev_get_by_name(net,
ifr->ifr_name); 4846 const struct net_device_ops *ops; 4847 4848 if (!dev) 4849 return -ENODEV; 4850 4851 ops = dev->netdev_ops; 4852 4853 switch (cmd) { 4854 case SIOCSIFFLAGS: /* Set interface flags */ 4855 return dev_change_flags(dev, ifr->ifr_flags); 4856 4857 case SIOCSIFMETRIC: /* Set the metric on the interface 4858 (currently unused) */ 4859 return -EOPNOTSUPP; 4860 4861 case SIOCSIFMTU: /* Set the MTU of a device */ 4862 return dev_set_mtu(dev, ifr->ifr_mtu); 4863 4864 case SIOCSIFHWADDR: 4865 return dev_set_mac_address(dev, &ifr->ifr_hwaddr); 4866 4867 case SIOCSIFHWBROADCAST: 4868 if (ifr->ifr_hwaddr.sa_family != dev->type) 4869 return -EINVAL; 4870 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 4871 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4872 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4873 return 0; 4874 4875 case SIOCSIFMAP: 4876 if (ops->ndo_set_config) { 4877 if (!netif_device_present(dev)) 4878 return -ENODEV; 4879 return ops->ndo_set_config(dev, &ifr->ifr_map); 4880 } 4881 return -EOPNOTSUPP; 4882 4883 case SIOCADDMULTI: 4884 if (!ops->ndo_set_rx_mode || 4885 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4886 return -EINVAL; 4887 if (!netif_device_present(dev)) 4888 return -ENODEV; 4889 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); 4890 4891 case SIOCDELMULTI: 4892 if (!ops->ndo_set_rx_mode || 4893 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4894 return -EINVAL; 4895 if (!netif_device_present(dev)) 4896 return -ENODEV; 4897 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); 4898 4899 case SIOCSIFTXQLEN: 4900 if (ifr->ifr_qlen < 0) 4901 return -EINVAL; 4902 dev->tx_queue_len = ifr->ifr_qlen; 4903 return 0; 4904 4905 case SIOCSIFNAME: 4906 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 4907 return dev_change_name(dev, ifr->ifr_newname); 4908 4909 /* 4910 * Unknown or private ioctl 4911 */ 4912 default: 4913 if ((cmd >= SIOCDEVPRIVATE && 4914 cmd <= SIOCDEVPRIVATE + 15) || 4915 cmd == SIOCBONDENSLAVE || 4916 cmd == SIOCBONDRELEASE || 4917 cmd == SIOCBONDSETHWADDR || 4918 cmd == SIOCBONDSLAVEINFOQUERY || 4919 cmd == SIOCBONDINFOQUERY || 4920 cmd == SIOCBONDCHANGEACTIVE || 4921 cmd == SIOCGMIIPHY || 4922 cmd == SIOCGMIIREG || 4923 cmd == SIOCSMIIREG || 4924 cmd == SIOCBRADDIF || 4925 cmd == SIOCBRDELIF || 4926 cmd == SIOCSHWTSTAMP || 4927 cmd == SIOCWANDEV) { 4928 err = -EOPNOTSUPP; 4929 if (ops->ndo_do_ioctl) { 4930 if (netif_device_present(dev)) 4931 err = ops->ndo_do_ioctl(dev, ifr, cmd); 4932 else 4933 err = -ENODEV; 4934 } 4935 } else 4936 err = -EINVAL; 4937 4938 } 4939 return err; 4940 } 4941 4942 /* 4943 * This function handles all "interface"-type I/O control requests. The actual 4944 * 'doing' part of this is dev_ifsioc above. 4945 */ 4946 4947 /** 4948 * dev_ioctl - network device ioctl 4949 * @net: the applicable net namespace 4950 * @cmd: command to issue 4951 * @arg: pointer to a struct ifreq in user space 4952 * 4953 * Issue ioctl functions to devices. This is normally called by the 4954 * user space syscall interfaces but can sometimes be useful for 4955 * other purposes. The return value is the return from the syscall if 4956 * positive or a negative errno code on error. 4957 */ 4958 4959 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) 4960 { 4961 struct ifreq ifr; 4962 int ret; 4963 char *colon; 4964 4965 /* One special case: SIOCGIFCONF takes ifconf argument 4966 and requires shared lock, because it sleeps writing 4967 to user space. 
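 A typical userspace caller supplies a buffer and lets the kernel fill it with one struct ifreq per interface (sketch only, error handling omitted; fd is any socket descriptor): 	char buf[8192]; 	struct ifconf ifc = { .ifc_len = sizeof(buf), .ifc_buf = buf }; 	ioctl(fd, SIOCGIFCONF, &ifc);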
4968 */ 4969 4970 if (cmd == SIOCGIFCONF) { 4971 rtnl_lock(); 4972 ret = dev_ifconf(net, (char __user *) arg); 4973 rtnl_unlock(); 4974 return ret; 4975 } 4976 if (cmd == SIOCGIFNAME) 4977 return dev_ifname(net, (struct ifreq __user *)arg); 4978 4979 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 4980 return -EFAULT; 4981 4982 ifr.ifr_name[IFNAMSIZ-1] = 0; 4983 4984 colon = strchr(ifr.ifr_name, ':'); 4985 if (colon) 4986 *colon = 0; 4987 4988 /* 4989 * See which interface the caller is talking about. 4990 */ 4991 4992 switch (cmd) { 4993 /* 4994 * These ioctl calls: 4995 * - can be done by all. 4996 * - atomic and do not require locking. 4997 * - return a value 4998 */ 4999 case SIOCGIFFLAGS: 5000 case SIOCGIFMETRIC: 5001 case SIOCGIFMTU: 5002 case SIOCGIFHWADDR: 5003 case SIOCGIFSLAVE: 5004 case SIOCGIFMAP: 5005 case SIOCGIFINDEX: 5006 case SIOCGIFTXQLEN: 5007 dev_load(net, ifr.ifr_name); 5008 rcu_read_lock(); 5009 ret = dev_ifsioc_locked(net, &ifr, cmd); 5010 rcu_read_unlock(); 5011 if (!ret) { 5012 if (colon) 5013 *colon = ':'; 5014 if (copy_to_user(arg, &ifr, 5015 sizeof(struct ifreq))) 5016 ret = -EFAULT; 5017 } 5018 return ret; 5019 5020 case SIOCETHTOOL: 5021 dev_load(net, ifr.ifr_name); 5022 rtnl_lock(); 5023 ret = dev_ethtool(net, &ifr); 5024 rtnl_unlock(); 5025 if (!ret) { 5026 if (colon) 5027 *colon = ':'; 5028 if (copy_to_user(arg, &ifr, 5029 sizeof(struct ifreq))) 5030 ret = -EFAULT; 5031 } 5032 return ret; 5033 5034 /* 5035 * These ioctl calls: 5036 * - require superuser power. 5037 * - require strict serialization. 5038 * - return a value 5039 */ 5040 case SIOCGMIIPHY: 5041 case SIOCGMIIREG: 5042 case SIOCSIFNAME: 5043 if (!capable(CAP_NET_ADMIN)) 5044 return -EPERM; 5045 dev_load(net, ifr.ifr_name); 5046 rtnl_lock(); 5047 ret = dev_ifsioc(net, &ifr, cmd); 5048 rtnl_unlock(); 5049 if (!ret) { 5050 if (colon) 5051 *colon = ':'; 5052 if (copy_to_user(arg, &ifr, 5053 sizeof(struct ifreq))) 5054 ret = -EFAULT; 5055 } 5056 return ret; 5057 5058 /* 5059 * These ioctl calls: 5060 * - require superuser power. 5061 * - require strict serialization. 5062 * - do not return a value 5063 */ 5064 case SIOCSIFFLAGS: 5065 case SIOCSIFMETRIC: 5066 case SIOCSIFMTU: 5067 case SIOCSIFMAP: 5068 case SIOCSIFHWADDR: 5069 case SIOCSIFSLAVE: 5070 case SIOCADDMULTI: 5071 case SIOCDELMULTI: 5072 case SIOCSIFHWBROADCAST: 5073 case SIOCSIFTXQLEN: 5074 case SIOCSMIIREG: 5075 case SIOCBONDENSLAVE: 5076 case SIOCBONDRELEASE: 5077 case SIOCBONDSETHWADDR: 5078 case SIOCBONDCHANGEACTIVE: 5079 case SIOCBRADDIF: 5080 case SIOCBRDELIF: 5081 case SIOCSHWTSTAMP: 5082 if (!capable(CAP_NET_ADMIN)) 5083 return -EPERM; 5084 /* fall through */ 5085 case SIOCBONDSLAVEINFOQUERY: 5086 case SIOCBONDINFOQUERY: 5087 dev_load(net, ifr.ifr_name); 5088 rtnl_lock(); 5089 ret = dev_ifsioc(net, &ifr, cmd); 5090 rtnl_unlock(); 5091 return ret; 5092 5093 case SIOCGIFMEM: 5094 /* Get the per device memory space. We can add this but 5095 * currently do not support it */ 5096 case SIOCSIFMEM: 5097 /* Set the per device memory buffer space. 5098 * Not applicable in our case */ 5099 case SIOCSIFLINK: 5100 return -ENOTTY; 5101 5102 /* 5103 * Unknown or private ioctl. 
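 * (For instance, a driver-private request issued from userspace as *	ioctl(fd, SIOCDEVPRIVATE + 3, &ifr); * ends up in the driver's ndo_do_ioctl via dev_ifsioc() above.)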
5104 */ 5105 default: 5106 if (cmd == SIOCWANDEV || 5107 (cmd >= SIOCDEVPRIVATE && 5108 cmd <= SIOCDEVPRIVATE + 15)) { 5109 dev_load(net, ifr.ifr_name); 5110 rtnl_lock(); 5111 ret = dev_ifsioc(net, &ifr, cmd); 5112 rtnl_unlock(); 5113 if (!ret && copy_to_user(arg, &ifr, 5114 sizeof(struct ifreq))) 5115 ret = -EFAULT; 5116 return ret; 5117 } 5118 /* Take care of Wireless Extensions */ 5119 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 5120 return wext_handle_ioctl(net, &ifr, cmd, arg); 5121 return -ENOTTY; 5122 } 5123 } 5124 5125 5126 /** 5127 * dev_new_index - allocate an ifindex 5128 * @net: the applicable net namespace 5129 * 5130 * Returns a suitable unique value for a new device interface 5131 * number. The caller must hold the rtnl semaphore or the 5132 * dev_base_lock to be sure it remains unique. 5133 */ 5134 static int dev_new_index(struct net *net) 5135 { 5136 static int ifindex; 5137 for (;;) { 5138 if (++ifindex <= 0) 5139 ifindex = 1; 5140 if (!__dev_get_by_index(net, ifindex)) 5141 return ifindex; 5142 } 5143 } 5144 5145 /* Delayed registration/unregisteration */ 5146 static LIST_HEAD(net_todo_list); 5147 5148 static void net_set_todo(struct net_device *dev) 5149 { 5150 list_add_tail(&dev->todo_list, &net_todo_list); 5151 } 5152 5153 static void rollback_registered_many(struct list_head *head) 5154 { 5155 struct net_device *dev, *tmp; 5156 5157 BUG_ON(dev_boot_phase); 5158 ASSERT_RTNL(); 5159 5160 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 5161 /* Some devices call without registering 5162 * for initialization unwind. Remove those 5163 * devices and proceed with the remaining. 5164 */ 5165 if (dev->reg_state == NETREG_UNINITIALIZED) { 5166 pr_debug("unregister_netdevice: device %s/%p never " 5167 "was registered\n", dev->name, dev); 5168 5169 WARN_ON(1); 5170 list_del(&dev->unreg_list); 5171 continue; 5172 } 5173 dev->dismantle = true; 5174 BUG_ON(dev->reg_state != NETREG_REGISTERED); 5175 } 5176 5177 /* If device is running, close it first. */ 5178 dev_close_many(head); 5179 5180 list_for_each_entry(dev, head, unreg_list) { 5181 /* And unlink it from device chain. */ 5182 unlist_netdevice(dev); 5183 5184 dev->reg_state = NETREG_UNREGISTERING; 5185 } 5186 5187 synchronize_net(); 5188 5189 list_for_each_entry(dev, head, unreg_list) { 5190 /* Shutdown queueing discipline. */ 5191 dev_shutdown(dev); 5192 5193 5194 /* Notify protocols, that we are about to destroy 5195 this device. They should clean all the things. 5196 */ 5197 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5198 5199 if (!dev->rtnl_link_ops || 5200 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 5201 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); 5202 5203 /* 5204 * Flush the unicast and multicast chains 5205 */ 5206 dev_uc_flush(dev); 5207 dev_mc_flush(dev); 5208 5209 if (dev->netdev_ops->ndo_uninit) 5210 dev->netdev_ops->ndo_uninit(dev); 5211 5212 /* Notifier chain MUST detach us from master device. 
*/ 5213 WARN_ON(dev->master); 5214 5215 /* Remove entries from kobject tree */ 5216 netdev_unregister_kobject(dev); 5217 } 5218 5219 /* Process any work delayed until the end of the batch */ 5220 dev = list_first_entry(head, struct net_device, unreg_list); 5221 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 5222 5223 rcu_barrier(); 5224 5225 list_for_each_entry(dev, head, unreg_list) 5226 dev_put(dev); 5227 } 5228 5229 static void rollback_registered(struct net_device *dev) 5230 { 5231 LIST_HEAD(single); 5232 5233 list_add(&dev->unreg_list, &single); 5234 rollback_registered_many(&single); 5235 list_del(&single); 5236 } 5237 5238 static u32 netdev_fix_features(struct net_device *dev, u32 features) 5239 { 5240 /* Fix illegal checksum combinations */ 5241 if ((features & NETIF_F_HW_CSUM) && 5242 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5243 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 5244 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 5245 } 5246 5247 if ((features & NETIF_F_NO_CSUM) && 5248 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5249 netdev_warn(dev, "mixed no checksumming and other settings.\n"); 5250 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 5251 } 5252 5253 /* Fix illegal SG+CSUM combinations. */ 5254 if ((features & NETIF_F_SG) && 5255 !(features & NETIF_F_ALL_CSUM)) { 5256 netdev_dbg(dev, 5257 "Dropping NETIF_F_SG since no checksum feature.\n"); 5258 features &= ~NETIF_F_SG; 5259 } 5260 5261 /* TSO requires that SG is present as well. */ 5262 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 5263 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 5264 features &= ~NETIF_F_ALL_TSO; 5265 } 5266 5267 /* TSO ECN requires that TSO is present as well. */ 5268 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 5269 features &= ~NETIF_F_TSO_ECN; 5270 5271 /* Software GSO depends on SG. */ 5272 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 5273 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 5274 features &= ~NETIF_F_GSO; 5275 } 5276 5277 /* UFO needs SG and checksumming */ 5278 if (features & NETIF_F_UFO) { 5279 /* maybe split UFO into V4 and V6? 
*/ 5280 if (!((features & NETIF_F_GEN_CSUM) || 5281 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) 5282 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 5283 netdev_dbg(dev, 5284 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 5285 features &= ~NETIF_F_UFO; 5286 } 5287 5288 if (!(features & NETIF_F_SG)) { 5289 netdev_dbg(dev, 5290 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 5291 features &= ~NETIF_F_UFO; 5292 } 5293 } 5294 5295 return features; 5296 } 5297 5298 int __netdev_update_features(struct net_device *dev) 5299 { 5300 u32 features; 5301 int err = 0; 5302 5303 ASSERT_RTNL(); 5304 5305 features = netdev_get_wanted_features(dev); 5306 5307 if (dev->netdev_ops->ndo_fix_features) 5308 features = dev->netdev_ops->ndo_fix_features(dev, features); 5309 5310 /* driver might be less strict about feature dependencies */ 5311 features = netdev_fix_features(dev, features); 5312 5313 if (dev->features == features) 5314 return 0; 5315 5316 netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", 5317 dev->features, features); 5318 5319 if (dev->netdev_ops->ndo_set_features) 5320 err = dev->netdev_ops->ndo_set_features(dev, features); 5321 5322 if (unlikely(err < 0)) { 5323 netdev_err(dev, 5324 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", 5325 err, features, dev->features); 5326 return -1; 5327 } 5328 5329 if (!err) 5330 dev->features = features; 5331 5332 return 1; 5333 } 5334 5335 /** 5336 * netdev_update_features - recalculate device features 5337 * @dev: the device to check 5338 * 5339 * Recalculate dev->features set and send notifications if it 5340 * has changed. Should be called when driver- or hardware-dependent 5341 * conditions that influence the features may have changed. 5342 */ 5343 void netdev_update_features(struct net_device *dev) 5344 { 5345 if (__netdev_update_features(dev)) 5346 netdev_features_change(dev); 5347 } 5348 EXPORT_SYMBOL(netdev_update_features); 5349 5350 /** 5351 * netdev_change_features - recalculate device features 5352 * @dev: the device to check 5353 * 5354 * Recalculate dev->features set and send notifications even 5355 * if they have not changed. Should be called instead of 5356 * netdev_update_features() if dev->vlan_features might also 5357 * have changed, so that the changes are propagated to stacked 5358 * VLAN devices. 5359 */ 5360 void netdev_change_features(struct net_device *dev) 5361 { 5362 __netdev_update_features(dev); 5363 netdev_features_change(dev); 5364 } 5365 EXPORT_SYMBOL(netdev_change_features); 5366 5367 /** 5368 * netif_stacked_transfer_operstate - transfer operstate 5369 * @rootdev: the root or lower level device to transfer state from 5370 * @dev: the device to transfer operstate to 5371 * 5372 * Transfer operational state from root to device. This is normally 5373 * called when a stacking relationship exists between the root 5374 * device and the device (a leaf device).
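 * For example (sketch), a stacking driver such as 802.1q would typically call this from its notifier when the lower device changes state: *	netif_stacked_transfer_operstate(real_dev, vlan_dev);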
5375 */ 5376 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 5377 struct net_device *dev) 5378 { 5379 if (rootdev->operstate == IF_OPER_DORMANT) 5380 netif_dormant_on(dev); 5381 else 5382 netif_dormant_off(dev); 5383 5384 if (netif_carrier_ok(rootdev)) { 5385 if (!netif_carrier_ok(dev)) 5386 netif_carrier_on(dev); 5387 } else { 5388 if (netif_carrier_ok(dev)) 5389 netif_carrier_off(dev); 5390 } 5391 } 5392 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 5393 5394 #ifdef CONFIG_RPS 5395 static int netif_alloc_rx_queues(struct net_device *dev) 5396 { 5397 unsigned int i, count = dev->num_rx_queues; 5398 struct netdev_rx_queue *rx; 5399 5400 BUG_ON(count < 1); 5401 5402 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); 5403 if (!rx) { 5404 pr_err("netdev: Unable to allocate %u rx queues.\n", count); 5405 return -ENOMEM; 5406 } 5407 dev->_rx = rx; 5408 5409 for (i = 0; i < count; i++) 5410 rx[i].dev = dev; 5411 return 0; 5412 } 5413 #endif 5414 5415 static void netdev_init_one_queue(struct net_device *dev, 5416 struct netdev_queue *queue, void *_unused) 5417 { 5418 /* Initialize queue lock */ 5419 spin_lock_init(&queue->_xmit_lock); 5420 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 5421 queue->xmit_lock_owner = -1; 5422 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 5423 queue->dev = dev; 5424 } 5425 5426 static int netif_alloc_netdev_queues(struct net_device *dev) 5427 { 5428 unsigned int count = dev->num_tx_queues; 5429 struct netdev_queue *tx; 5430 5431 BUG_ON(count < 1); 5432 5433 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); 5434 if (!tx) { 5435 pr_err("netdev: Unable to allocate %u tx queues.\n", 5436 count); 5437 return -ENOMEM; 5438 } 5439 dev->_tx = tx; 5440 5441 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5442 spin_lock_init(&dev->tx_global_lock); 5443 5444 return 0; 5445 } 5446 5447 /** 5448 * register_netdevice - register a network device 5449 * @dev: device to register 5450 * 5451 * Take a completed network device structure and add it to the kernel 5452 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 5453 * chain. 0 is returned on success. A negative errno code is returned 5454 * on a failure to set up the device, or if the name is a duplicate. 5455 * 5456 * Callers must hold the rtnl semaphore. You may want 5457 * register_netdev() instead of this. 5458 * 5459 * BUGS: 5460 * The locking appears insufficient to guarantee two parallel registers 5461 * will not get the same name. 5462 */ 5463 5464 int register_netdevice(struct net_device *dev) 5465 { 5466 int ret; 5467 struct net *net = dev_net(dev); 5468 5469 BUG_ON(dev_boot_phase); 5470 ASSERT_RTNL(); 5471 5472 might_sleep(); 5473 5474 /* When net_device's are persistent, this will be fatal. */ 5475 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 5476 BUG_ON(!net); 5477 5478 spin_lock_init(&dev->addr_list_lock); 5479 netdev_set_addr_lockdep_class(dev); 5480 5481 dev->iflink = -1; 5482 5483 ret = dev_get_valid_name(dev, dev->name); 5484 if (ret < 0) 5485 goto out; 5486 5487 /* Init, if this function is available */ 5488 if (dev->netdev_ops->ndo_init) { 5489 ret = dev->netdev_ops->ndo_init(dev); 5490 if (ret) { 5491 if (ret > 0) 5492 ret = -EIO; 5493 goto out; 5494 } 5495 } 5496 5497 dev->ifindex = dev_new_index(net); 5498 if (dev->iflink == -1) 5499 dev->iflink = dev->ifindex; 5500 5501 /* Transfer changeable features to wanted_features and enable 5502 * software offloads (GSO and GRO). 
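 * (NETIF_F_SOFT_FEATURES expands to NETIF_F_GSO | NETIF_F_GRO.)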
5503 */ 5504 dev->hw_features |= NETIF_F_SOFT_FEATURES; 5505 dev->features |= NETIF_F_SOFT_FEATURES; 5506 dev->wanted_features = dev->features & dev->hw_features; 5507 5508 /* Turn on no cache copy if HW is doing checksum */ 5509 dev->hw_features |= NETIF_F_NOCACHE_COPY; 5510 if ((dev->features & NETIF_F_ALL_CSUM) && 5511 !(dev->features & NETIF_F_NO_CSUM)) { 5512 dev->wanted_features |= NETIF_F_NOCACHE_COPY; 5513 dev->features |= NETIF_F_NOCACHE_COPY; 5514 } 5515 5516 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 5517 */ 5518 dev->vlan_features |= NETIF_F_HIGHDMA; 5519 5520 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 5521 ret = notifier_to_errno(ret); 5522 if (ret) 5523 goto err_uninit; 5524 5525 ret = netdev_register_kobject(dev); 5526 if (ret) 5527 goto err_uninit; 5528 dev->reg_state = NETREG_REGISTERED; 5529 5530 __netdev_update_features(dev); 5531 5532 /* 5533 * Default initial state at registration is that the 5534 * device is present. 5535 */ 5536 5537 set_bit(__LINK_STATE_PRESENT, &dev->state); 5538 5539 dev_init_scheduler(dev); 5540 dev_hold(dev); 5541 list_netdevice(dev); 5542 5543 /* Notify protocols, that a new device appeared. */ 5544 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 5545 ret = notifier_to_errno(ret); 5546 if (ret) { 5547 rollback_registered(dev); 5548 dev->reg_state = NETREG_UNREGISTERED; 5549 } 5550 /* 5551 * Prevent userspace races by waiting until the network 5552 * device is fully setup before sending notifications. 5553 */ 5554 if (!dev->rtnl_link_ops || 5555 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 5556 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); 5557 5558 out: 5559 return ret; 5560 5561 err_uninit: 5562 if (dev->netdev_ops->ndo_uninit) 5563 dev->netdev_ops->ndo_uninit(dev); 5564 goto out; 5565 } 5566 EXPORT_SYMBOL(register_netdevice); 5567 5568 /** 5569 * init_dummy_netdev - init a dummy network device for NAPI 5570 * @dev: device to init 5571 * 5572 * This takes a network device structure and initializes the minimum 5573 * number of fields so it can be used to schedule NAPI polls without 5574 * registering a full-blown interface. This is to be used by drivers 5575 * that need to tie several hardware interfaces to a single NAPI 5576 * poll scheduler due to HW limitations. 5577 */ 5578 int init_dummy_netdev(struct net_device *dev) 5579 { 5580 /* Clear everything. Note we don't initialize spinlocks 5581 * as they aren't supposed to be taken by any of the 5582 * NAPI code and this dummy netdev is supposed to be 5583 * only ever used for NAPI polls 5584 */ 5585 memset(dev, 0, sizeof(struct net_device)); 5586 5587 /* make sure we BUG if trying to hit standard 5588 * register/unregister code path 5589 */ 5590 dev->reg_state = NETREG_DUMMY; 5591 5592 /* NAPI wants this */ 5593 INIT_LIST_HEAD(&dev->napi_list); 5594 5595 /* a dummy interface is started by default */ 5596 set_bit(__LINK_STATE_PRESENT, &dev->state); 5597 set_bit(__LINK_STATE_START, &dev->state); 5598 5599 /* Note: We don't allocate pcpu_refcnt for dummy devices, 5600 * because users of this 'device' don't need to change 5601 * its refcount. 5602 */ 5603 5604 return 0; 5605 } 5606 EXPORT_SYMBOL_GPL(init_dummy_netdev); 5607 5608 5609 /** 5610 * register_netdev - register a network device 5611 * @dev: device to register 5612 * 5613 * Take a completed network device structure and add it to the kernel 5614 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 5615 * chain. 0 is returned on success.
A negative errno code is returned 5616 * on a failure to set up the device, or if the name is a duplicate. 5617 * 5618 * This is a wrapper around register_netdevice that takes the rtnl semaphore 5619 * and expands the device name if you passed a format string to 5620 * alloc_netdev. 5621 */ 5622 int register_netdev(struct net_device *dev) 5623 { 5624 int err; 5625 5626 rtnl_lock(); 5627 err = register_netdevice(dev); 5628 rtnl_unlock(); 5629 return err; 5630 } 5631 EXPORT_SYMBOL(register_netdev); 5632 5633 int netdev_refcnt_read(const struct net_device *dev) 5634 { 5635 int i, refcnt = 0; 5636 5637 for_each_possible_cpu(i) 5638 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 5639 return refcnt; 5640 } 5641 EXPORT_SYMBOL(netdev_refcnt_read); 5642 5643 /* 5644 * netdev_wait_allrefs - wait until all references are gone. 5645 * 5646 * This is called when unregistering network devices. 5647 * 5648 * Any protocol or device that holds a reference should register 5649 * for netdevice notification, and cleanup and put back the 5650 * reference if they receive an UNREGISTER event. 5651 * We can get stuck here if buggy protocols don't correctly 5652 * call dev_put. 5653 */ 5654 static void netdev_wait_allrefs(struct net_device *dev) 5655 { 5656 unsigned long rebroadcast_time, warning_time; 5657 int refcnt; 5658 5659 linkwatch_forget_dev(dev); 5660 5661 rebroadcast_time = warning_time = jiffies; 5662 refcnt = netdev_refcnt_read(dev); 5663 5664 while (refcnt != 0) { 5665 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5666 rtnl_lock(); 5667 5668 /* Rebroadcast unregister notification */ 5669 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5670 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users 5671 * should have already handled it the first time */ 5672 5673 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 5674 &dev->state)) { 5675 /* We must not have linkwatch events 5676 * pending on unregister. If this 5677 * happens, we simply run the queue 5678 * unscheduled, resulting in a noop 5679 * for this device. 5680 */ 5681 linkwatch_run_queue(); 5682 } 5683 5684 __rtnl_unlock(); 5685 5686 rebroadcast_time = jiffies; 5687 } 5688 5689 msleep(250); 5690 5691 refcnt = netdev_refcnt_read(dev); 5692 5693 if (time_after(jiffies, warning_time + 10 * HZ)) { 5694 printk(KERN_EMERG "unregister_netdevice: " 5695 "waiting for %s to become free. Usage " 5696 "count = %d\n", 5697 dev->name, refcnt); 5698 warning_time = jiffies; 5699 } 5700 } 5701 } 5702 5703 /* The sequence is: 5704 * 5705 * rtnl_lock(); 5706 * ... 5707 * register_netdevice(x1); 5708 * register_netdevice(x2); 5709 * ... 5710 * unregister_netdevice(y1); 5711 * unregister_netdevice(y2); 5712 * ... 5713 * rtnl_unlock(); 5714 * free_netdev(y1); 5715 * free_netdev(y2); 5716 * 5717 * We are invoked by rtnl_unlock(). 5718 * This allows us to deal with problems: 5719 * 1) We can delete sysfs objects which invoke hotplug 5720 * without deadlocking with linkwatch via keventd. 5721 * 2) Since we run with the RTNL semaphore not held, we can sleep 5722 * safely in order to wait for the netdev refcnt to drop to zero. 5723 * 5724 * We must not return until all unregister events added during 5725 * the interval the lock was held have been completed.
5726 */ 5727 void netdev_run_todo(void) 5728 { 5729 struct list_head list; 5730 5731 /* Snapshot list, allow later requests */ 5732 list_replace_init(&net_todo_list, &list); 5733 5734 __rtnl_unlock(); 5735 5736 while (!list_empty(&list)) { 5737 struct net_device *dev 5738 = list_first_entry(&list, struct net_device, todo_list); 5739 list_del(&dev->todo_list); 5740 5741 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 5742 printk(KERN_ERR "network todo '%s' but state %d\n", 5743 dev->name, dev->reg_state); 5744 dump_stack(); 5745 continue; 5746 } 5747 5748 dev->reg_state = NETREG_UNREGISTERED; 5749 5750 on_each_cpu(flush_backlog, dev, 1); 5751 5752 netdev_wait_allrefs(dev); 5753 5754 /* paranoia */ 5755 BUG_ON(netdev_refcnt_read(dev)); 5756 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 5757 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 5758 WARN_ON(dev->dn_ptr); 5759 5760 if (dev->destructor) 5761 dev->destructor(dev); 5762 5763 /* Free network device */ 5764 kobject_put(&dev->dev.kobj); 5765 } 5766 } 5767 5768 /* Convert net_device_stats to rtnl_link_stats64. They have the same 5769 * fields in the same order, with only the type differing. 5770 */ 5771 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 5772 const struct net_device_stats *netdev_stats) 5773 { 5774 #if BITS_PER_LONG == 64 5775 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 5776 memcpy(stats64, netdev_stats, sizeof(*stats64)); 5777 #else 5778 size_t i, n = sizeof(*stats64) / sizeof(u64); 5779 const unsigned long *src = (const unsigned long *)netdev_stats; 5780 u64 *dst = (u64 *)stats64; 5781 5782 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 5783 sizeof(*stats64) / sizeof(u64)); 5784 for (i = 0; i < n; i++) 5785 dst[i] = src[i]; 5786 #endif 5787 } 5788 5789 /** 5790 * dev_get_stats - get network device statistics 5791 * @dev: device to get statistics from 5792 * @storage: place to store stats 5793 * 5794 * Get network statistics from device. Return @storage. 5795 * The device driver may provide its own method by setting 5796 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats; 5797 * otherwise the internal statistics structure is used.
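 * A driver-provided method might look like this (minimal sketch; the "foo" names are hypothetical, not taken from this file): *	static struct rtnl_link_stats64 * *	foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage) *	{ *		struct foo_priv *priv = netdev_priv(dev); * *		storage->rx_packets = priv->rx_pkts; *		storage->tx_packets = priv->tx_pkts; *		return storage; *	}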
5798 */ 5799 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 5800 struct rtnl_link_stats64 *storage) 5801 { 5802 const struct net_device_ops *ops = dev->netdev_ops; 5803 5804 if (ops->ndo_get_stats64) { 5805 memset(storage, 0, sizeof(*storage)); 5806 ops->ndo_get_stats64(dev, storage); 5807 } else if (ops->ndo_get_stats) { 5808 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 5809 } else { 5810 netdev_stats_to_stats64(storage, &dev->stats); 5811 } 5812 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 5813 return storage; 5814 } 5815 EXPORT_SYMBOL(dev_get_stats); 5816 5817 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 5818 { 5819 struct netdev_queue *queue = dev_ingress_queue(dev); 5820 5821 #ifdef CONFIG_NET_CLS_ACT 5822 if (queue) 5823 return queue; 5824 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 5825 if (!queue) 5826 return NULL; 5827 netdev_init_one_queue(dev, queue, NULL); 5828 queue->qdisc = &noop_qdisc; 5829 queue->qdisc_sleeping = &noop_qdisc; 5830 rcu_assign_pointer(dev->ingress_queue, queue); 5831 #endif 5832 return queue; 5833 } 5834 5835 /** 5836 * alloc_netdev_mqs - allocate network device 5837 * @sizeof_priv: size of private data to allocate space for 5838 * @name: device name format string 5839 * @setup: callback to initialize device 5840 * @txqs: the number of TX subqueues to allocate 5841 * @rxqs: the number of RX subqueues to allocate 5842 * 5843 * Allocates a struct net_device with private data area for driver use 5844 * and performs basic initialization. Also allocates subqueue structs 5845 * for each queue on the device. 5846 */ 5847 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 5848 void (*setup)(struct net_device *), 5849 unsigned int txqs, unsigned int rxqs) 5850 { 5851 struct net_device *dev; 5852 size_t alloc_size; 5853 struct net_device *p; 5854 5855 BUG_ON(strlen(name) >= sizeof(dev->name)); 5856 5857 if (txqs < 1) { 5858 pr_err("alloc_netdev: Unable to allocate device " 5859 "with zero queues.\n"); 5860 return NULL; 5861 } 5862 5863 #ifdef CONFIG_RPS 5864 if (rxqs < 1) { 5865 pr_err("alloc_netdev: Unable to allocate device " 5866 "with zero RX queues.\n"); 5867 return NULL; 5868 } 5869 #endif 5870 5871 alloc_size = sizeof(struct net_device); 5872 if (sizeof_priv) { 5873 /* ensure 32-byte alignment of private area */ 5874 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 5875 alloc_size += sizeof_priv; 5876 } 5877 /* ensure 32-byte alignment of whole construct */ 5878 alloc_size += NETDEV_ALIGN - 1; 5879 5880 p = kzalloc(alloc_size, GFP_KERNEL); 5881 if (!p) { 5882 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); 5883 return NULL; 5884 } 5885 5886 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5887 dev->padded = (char *)dev - (char *)p; 5888 5889 dev->pcpu_refcnt = alloc_percpu(int); 5890 if (!dev->pcpu_refcnt) 5891 goto free_p; 5892 5893 if (dev_addr_init(dev)) 5894 goto free_pcpu; 5895 5896 dev_mc_init(dev); 5897 dev_uc_init(dev); 5898 5899 dev_net_set(dev, &init_net); 5900 5901 dev->gso_max_size = GSO_MAX_SIZE; 5902 5903 INIT_LIST_HEAD(&dev->napi_list); 5904 INIT_LIST_HEAD(&dev->unreg_list); 5905 INIT_LIST_HEAD(&dev->link_watch_list); 5906 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5907 setup(dev); 5908 5909 dev->num_tx_queues = txqs; 5910 dev->real_num_tx_queues = txqs; 5911 if (netif_alloc_netdev_queues(dev)) 5912 goto free_all; 5913 5914 #ifdef CONFIG_RPS 5915 dev->num_rx_queues = rxqs; 5916 dev->real_num_rx_queues = rxqs; 5917 if (netif_alloc_rx_queues(dev)) 5918 goto free_all;
#endif 5920 5921 strcpy(dev->name, name); 5922 dev->group = INIT_NETDEV_GROUP; 5923 return dev; 5924 5925 free_all: 5926 free_netdev(dev); 5927 return NULL; 5928 5929 free_pcpu: 5930 free_percpu(dev->pcpu_refcnt); 5931 kfree(dev->_tx); 5932 #ifdef CONFIG_RPS 5933 kfree(dev->_rx); 5934 #endif 5935 5936 free_p: 5937 kfree(p); 5938 return NULL; 5939 } 5940 EXPORT_SYMBOL(alloc_netdev_mqs); 5941 5942 /** 5943 * free_netdev - free network device 5944 * @dev: device 5945 * 5946 * This function does the last stage of destroying an allocated device 5947 * interface. The reference to the device object is released. 5948 * If this is the last reference then it will be freed. 5949 */ 5950 void free_netdev(struct net_device *dev) 5951 { 5952 struct napi_struct *p, *n; 5953 5954 release_net(dev_net(dev)); 5955 5956 kfree(dev->_tx); 5957 #ifdef CONFIG_RPS 5958 kfree(dev->_rx); 5959 #endif 5960 5961 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 5962 5963 /* Flush device addresses */ 5964 dev_addr_flush(dev); 5965 5966 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5967 netif_napi_del(p); 5968 5969 free_percpu(dev->pcpu_refcnt); 5970 dev->pcpu_refcnt = NULL; 5971 5972 /* Compatibility with error handling in drivers */ 5973 if (dev->reg_state == NETREG_UNINITIALIZED) { 5974 kfree((char *)dev - dev->padded); 5975 return; 5976 } 5977 5978 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 5979 dev->reg_state = NETREG_RELEASED; 5980 5981 /* will free via device release */ 5982 put_device(&dev->dev); 5983 } 5984 EXPORT_SYMBOL(free_netdev); 5985 5986 /** 5987 * synchronize_net - Synchronize with packet receive processing 5988 * 5989 * Wait for packets currently being received to be done. 5990 * Does not block later packets from starting. 5991 */ 5992 void synchronize_net(void) 5993 { 5994 might_sleep(); 5995 if (rtnl_is_locked()) 5996 synchronize_rcu_expedited(); 5997 else 5998 synchronize_rcu(); 5999 } 6000 EXPORT_SYMBOL(synchronize_net); 6001 6002 /** 6003 * unregister_netdevice_queue - remove device from the kernel 6004 * @dev: device 6005 * @head: list 6006 * 6007 * This function shuts down a device interface and removes it 6008 * from the kernel tables. 6009 * If @head is not NULL, the device is queued to be unregistered later. 6010 * 6011 * Callers must hold the rtnl semaphore. You may want 6012 * unregister_netdev() instead of this. 6013 */ 6014 6015 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 6016 { 6017 ASSERT_RTNL(); 6018 6019 if (head) { 6020 list_move_tail(&dev->unreg_list, head); 6021 } else { 6022 rollback_registered(dev); 6023 /* Finish processing unregister after unlock */ 6024 net_set_todo(dev); 6025 } 6026 } 6027 EXPORT_SYMBOL(unregister_netdevice_queue); 6028 6029 /** 6030 * unregister_netdevice_many - unregister many devices 6031 * @head: list of devices 6032 */ 6033 void unregister_netdevice_many(struct list_head *head) 6034 { 6035 struct net_device *dev; 6036 6037 if (!list_empty(head)) { 6038 rollback_registered_many(head); 6039 list_for_each_entry(dev, head, unreg_list) 6040 net_set_todo(dev); 6041 } 6042 } 6043 EXPORT_SYMBOL(unregister_netdevice_many); 6044 6045 /** 6046 * unregister_netdev - remove device from the kernel 6047 * @dev: device 6048 * 6049 * This function shuts down a device interface and removes it 6050 * from the kernel tables. 6051 * 6052 * This is just a wrapper for unregister_netdevice that takes 6053 * the rtnl semaphore. In general you want to use this and not 6054 * unregister_netdevice.
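 * The usual driver lifecycle, for reference (sketch; the "foo" names are hypothetical, not taken from this file): *	dev = alloc_netdev(sizeof(struct foo_priv), "foo%d", foo_setup); *	if (!dev) *		return -ENOMEM; *	err = register_netdev(dev); *	if (err) { *		free_netdev(dev); *		return err; *	} *	... *	unregister_netdev(dev); *	free_netdev(dev);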
6055 */ 6056 void unregister_netdev(struct net_device *dev) 6057 { 6058 rtnl_lock(); 6059 unregister_netdevice(dev); 6060 rtnl_unlock(); 6061 } 6062 EXPORT_SYMBOL(unregister_netdev); 6063 6064 /** 6065 * dev_change_net_namespace - move device to a different network namespace 6066 * @dev: device 6067 * @net: network namespace 6068 * @pat: If not NULL, name pattern to try if the current device name 6069 * is already taken in the destination network namespace. 6070 * 6071 * This function shuts down a device interface and moves it 6072 * to a new network namespace. On success 0 is returned, on 6073 * a failure a negative errno code is returned. 6074 * 6075 * Callers must hold the rtnl semaphore. 6076 */ 6077 6078 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 6079 { 6080 int err; 6081 6082 ASSERT_RTNL(); 6083 6084 /* Don't allow namespace local devices to be moved. */ 6085 err = -EINVAL; 6086 if (dev->features & NETIF_F_NETNS_LOCAL) 6087 goto out; 6088 6089 /* Ensure the device has been registered */ 6090 err = -EINVAL; 6091 if (dev->reg_state != NETREG_REGISTERED) 6092 goto out; 6093 6094 /* Get out if there is nothing to do */ 6095 err = 0; 6096 if (net_eq(dev_net(dev), net)) 6097 goto out; 6098 6099 /* Pick the destination device name, and ensure 6100 * we can use it in the destination network namespace. 6101 */ 6102 err = -EEXIST; 6103 if (__dev_get_by_name(net, dev->name)) { 6104 /* We get here if we can't use the current device name */ 6105 if (!pat) 6106 goto out; 6107 if (dev_get_valid_name(dev, pat) < 0) 6108 goto out; 6109 } 6110 6111 /* 6112 * And now a mini version of register_netdevice/unregister_netdevice. 6113 */ 6114 6115 /* If device is running close it first. */ 6116 dev_close(dev); 6117 6118 /* And unlink it from device chain */ 6119 err = -ENODEV; 6120 unlist_netdevice(dev); 6121 6122 synchronize_net(); 6123 6124 /* Shutdown queueing discipline. */ 6125 dev_shutdown(dev); 6126 6127 /* Notify protocols, that we are about to destroy 6128 this device. They should clean all the things. 6129 6130 Note that dev->reg_state stays at NETREG_REGISTERED. 6131 This is wanted because this way 8021q and macvlan know 6132 the device is just moving and can keep their slaves up. 6133 */ 6134 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6135 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 6136 6137 /* 6138 * Flush the unicast and multicast chains 6139 */ 6140 dev_uc_flush(dev); 6141 dev_mc_flush(dev); 6142 6143 /* Actually switch the network namespace */ 6144 dev_net_set(dev, net); 6145 6146 /* If there is an ifindex conflict assign a new one */ 6147 if (__dev_get_by_index(net, dev->ifindex)) { 6148 int iflink = (dev->iflink == dev->ifindex); 6149 dev->ifindex = dev_new_index(net); 6150 if (iflink) 6151 dev->iflink = dev->ifindex; 6152 } 6153 6154 /* Fixup kobjects */ 6155 err = device_rename(&dev->dev, dev->name); 6156 WARN_ON(err); 6157 6158 /* Add the device back in the hashes */ 6159 list_netdevice(dev); 6160 6161 /* Notify protocols, that a new device appeared. */ 6162 call_netdevice_notifiers(NETDEV_REGISTER, dev); 6163 6164 /* 6165 * Prevent userspace races by waiting until the network 6166 * device is fully setup before sending notifications.
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
        struct softnet_data *sd, *oldsd;

        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;

        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                oldsd->output_queue = NULL;
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }
        /* Append NAPI poll list from offline CPU. */
        if (!list_empty(&oldsd->poll_list)) {
                list_splice_init(&oldsd->poll_list, &sd->poll_list);
                raise_softirq_irqoff(NET_RX_SOFTIRQ);
        }

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx(skb);
                input_queue_head_incr(oldsd);
        }
        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                input_queue_head_incr(oldsd);
        }

        return NOTIFY_OK;
}
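/*
 * Illustrative sketch (not part of the original file): the pointer-to-pointer
 * walk used above to find the tail of the current CPU's completion_queue
 * before appending the offline CPU's packets.  The same idiom works on any
 * NULL-terminated singly linked list without a special case for an empty
 * list.  "struct example_node" is a hypothetical type for this sketch.
 */
#if 0
struct example_node {
        struct example_node *next;
};

static void example_append(struct example_node **head,
                           struct example_node *more)
{
        /* Walk to the terminating NULL pointer slot... */
        while (*head)
                head = &(*head)->next;
        /* ...and overwrite it; this works even when the list was empty. */
        *head = more;
}
#endif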
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
        if (mask & NETIF_F_GEN_CSUM)
                mask |= NETIF_F_ALL_CSUM;
        mask |= NETIF_F_VLAN_CHALLENGED;

        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
        all &= one | ~NETIF_F_ALL_FOR_ALL;

        /* If device needs checksumming, downgrade to it. */
        if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
                all &= ~NETIF_F_NO_CSUM;

        /* If one device supports hw checksumming, set for all. */
        if (all & NETIF_F_GEN_CSUM)
                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

        return all;
}
EXPORT_SYMBOL(netdev_increment_features);

static struct hlist_head *netdev_create_hash(void)
{
        int i;
        struct hlist_head *hash;

        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
        if (hash != NULL)
                for (i = 0; i < NETDEV_HASHENTRIES; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
        INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
                goto err_name;

        net->dev_index_head = netdev_create_hash();
        if (net->dev_index_head == NULL)
                goto err_idx;

        return 0;

err_idx:
        kfree(net->dev_name_head);
err_name:
        return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device_driver *driver;
        const struct device *parent;
        const char *empty = "";

        parent = dev->dev.parent;
        if (!parent)
                return empty;

        driver = parent->driver;
        if (driver && driver->name)
                return driver->name;
        return empty;
}

static int __netdev_printk(const char *level, const struct net_device *dev,
                           struct va_format *vaf)
{
        int r;

        if (dev && dev->dev.parent)
                r = dev_printk(level, dev->dev.parent, "%s: %pV",
                               netdev_name(dev), vaf);
        else if (dev)
                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
        else
                r = printk("%s(NULL net_device): %pV", level, vaf);

        return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
                  const char *format, ...)
{
        struct va_format vaf;
        va_list args;
        int r;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        r = __netdev_printk(level, dev, &vaf);
        va_end(args);

        return r;
}
EXPORT_SYMBOL(netdev_printk);
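/*
 * Illustrative sketch (not part of the original file): drivers normally call
 * netdev_printk() through the per-level wrappers generated below, which
 * prefix each message with the driver and device name.  "example_log_link"
 * and its arguments are hypothetical.
 */
#if 0
static void example_log_link(struct net_device *dev, int up)
{
        if (up)
                netdev_info(dev, "link up\n");
        else
                netdev_warn(dev, "link down\n");
}
#endif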
#define define_netdev_printk_level(func, level)                 \
int func(const struct net_device *dev, const char *fmt, ...)    \
{                                                               \
        int r;                                                  \
        struct va_format vaf;                                   \
        va_list args;                                           \
                                                                \
        va_start(args, fmt);                                    \
                                                                \
        vaf.fmt = fmt;                                          \
        vaf.va = &args;                                         \
                                                                \
        r = __netdev_printk(level, dev, &vaf);                  \
        va_end(args);                                           \
                                                                \
        return r;                                               \
}                                                               \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
        rtnl_lock();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];

                /* Ignore unmovable devices (e.g. loopback) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Leave virtual devices for the generic cleanup */
                if (dev->rtnl_link_ops)
                        continue;

                /* Push remaining network devices to init_net */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
                        printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
                                __func__, dev->name, err);
                        BUG();
                }
        }
        rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        /* At exit all network devices must be removed from their network
         * namespace.  Do this in the reverse order of registration.
         * Do this across as many network namespaces as possible to
         * improve batching efficiency.
         */
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(dev_kill_list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops)
                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
                        else
                                unregister_netdevice_queue(dev, &dev_kill_list);
                }
        }
        unregister_netdevice_many(&dev_kill_list);
        list_del(&dev_kill_list);
        rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};
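/*
 * Illustrative sketch (not part of the original file): the pernet_operations
 * pattern used by netdev_net_ops and default_device_ops above.  Each struct
 * registered via register_pernet_subsys() or register_pernet_device() has
 * its ->init run for every network namespace as it is created, and its
 * ->exit (or ->exit_batch) run as it is destroyed.  The "example_*" names
 * are hypothetical.
 */
#if 0
static int __net_init example_net_init(struct net *net)
{
        /* Allocate per-namespace state here; return -errno on failure. */
        return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
        /* Free whatever example_net_init() allocated. */
}

static struct pernet_operations example_net_ops = {
        .init = example_net_init,
        .exit = example_net_exit,
};

/* Typically called from a module or subsystem init function. */
static int __init example_register(void)
{
        return register_pernet_subsys(&example_net_ops);
}
#endif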
/*
 *	Initialize the DEV module.  This sets up the per-CPU packet receive
 *	queues, registers the per-namespace device state and the loopback
 *	device, and opens the network softirqs.
 *
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *	Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                memset(sd, 0, sizeof(*sd));
                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                sd->completion_queue = NULL;
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->csd.flags = 0;
                sd->cpu = i;
#endif

                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                sd->backlog.gro_list = NULL;
                sd->backlog.gro_count = 0;
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present too.  Since we now dynamically allocate and free
         * the loopback device, ensure this invariant is maintained by
         * keeping the loopback device the first device on the list of
         * network devices, so that it is the first device that appears
         * and the last network device that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        dev_mcast_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
        get_random_bytes(&hashrnd, sizeof(hashrnd));
        return 0;
}

late_initcall_sync(initialize_hashrnd);
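/*
 * Illustrative sketch (not part of the original file): the allocation and
 * registration lifecycle implemented by the routines in this file, as seen
 * from a minimal Ethernet driver.  "example_probe"/"example_remove" and the
 * zero-sized private area are hypothetical.
 */
#if 0
static struct net_device *example_dev;

static int __init example_probe(void)
{
        int err;

        example_dev = alloc_etherdev(0);        /* wraps alloc_netdev_mqs() */
        if (!example_dev)
                return -ENOMEM;

        err = register_netdev(example_dev);     /* rtnl_lock + register_netdevice */
        if (err) {
                free_netdev(example_dev);       /* NETREG_UNINITIALIZED path above */
                return err;
        }
        return 0;
}

static void __exit example_remove(void)
{
        unregister_netdev(example_dev);         /* rtnl_lock + unregister_netdevice */
        free_netdev(example_dev);               /* final release once unregistered */
}
#endif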