/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <[email protected]>
 *				Mark Evans, <[email protected]>
 *
 *	Additional Authors:
 *		Florian la Roche <[email protected]>
 *		Alan Cox <[email protected]>
 *		David Hinds <[email protected]>
 *		Alexey Kuznetsov <[email protected]>
 *		Adam Sulmicki <[email protected]>
 *		Pekka Riikonen <[email protected]>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain :	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
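/*
 * A minimal sketch of the reader side of this contract: a pure reader that
 * only needs a short-lived look at the device list can rely on RCU instead
 * of taking dev_base_lock (any @net namespace pointer will do here):
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
 *	rcu_read_unlock();
 *
 * A writer, by contrast, must hold the rtnl semaphore and additionally take
 * write_lock_bh(&dev_base_lock) around the actual list update, as
 * list_netdevice() below does.
 */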

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

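/*
 * A minimal sketch of how a protocol would use the handlers above; the
 * names my_proto_rcv, my_proto_type and ETH_P_MYPROTO are placeholders,
 * not an existing protocol:
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// ... process skb ...
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_proto_type __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_MYPROTO),
 *		.func	= my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_proto_type);		// e.g. at module init
 *	dev_remove_pack(&my_proto_type);	// e.g. at module exit
 */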

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

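/*
 * A sketch of the command line this parses, following the option order
 * handled above (irq, base_addr, mem_start, mem_end, then the device
 * name); the concrete values are only illustrative:
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * This would record irq 5 and I/O base 0x300 for "eth0", to be picked up
 * later by netdev_boot_setup_check() during device probing.
 */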
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink - get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

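/*
 * A minimal sketch of the refcounted lookup contract described above
 * (the name "eth0" and the use of init_net are only examples):
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		// ... use dev; a reference is held ...
 *		dev_put(dev);
 *	}
 *
 * Callers that are already inside rcu_read_lock() and only need the
 * pointer transiently can use dev_get_by_name_rcu() and skip the
 * hold/put pair.
 */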
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

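/*
 * A sketch of typical driver usage of the helpers above: a driver that
 * wants kernel-assigned numbering hands dev_alloc_name() a format such as
 * "eth%d" before registration ("myif%d" below is just a placeholder
 * pattern):
 *
 *	err = dev_alloc_name(dev, "myif%d");
 *	if (err < 0)
 *		goto fail;
 *	// dev->name is now e.g. "myif0"; err holds the unit number
 *
 * A literal name without '%' instead goes through dev_get_valid_name(),
 * which rejects duplicates with -EEXIST rather than picking a new unit.
 */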
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open - prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

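/*
 * A minimal sketch of bringing an interface administratively up and back
 * down from kernel code; both calls expect the caller to hold the rtnl
 * semaphore, and @dev stands for any net_device the caller already holds
 * a reference to:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	// ... device is IFF_UP on success ...
 *	dev_close(dev);
 *	rtnl_unlock();
 */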
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered.
 *	A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow the device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

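/*
 * A minimal sketch of a client of the notifier API above; the callback
 * and block names are placeholders:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call	= my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);	// e.g. module init
 *	unregister_netdevice_notifier(&my_netdev_nb);	// e.g. module exit
 */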
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
	    unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

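/*
 * A sketch of the pattern described above, roughly what a pair device such
 * as veth does in its transmit path ("peer" stands for the other end's
 * net_device, looked up by the driver itself):
 *
 *	static netdev_tx_t my_pair_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = ...;	// driver-private lookup
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		else
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */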
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS ([email protected])
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings.
If is 1921 * expected that drivers will fix this mapping if they can before 1922 * calling netif_set_real_num_tx_queues. 1923 */ 1924 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1925 { 1926 int i; 1927 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1928 1929 /* If TC0 is invalidated disable TC mapping */ 1930 if (tc->offset + tc->count > txq) { 1931 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1932 dev->num_tc = 0; 1933 return; 1934 } 1935 1936 /* Invalidated prio to tc mappings set to TC0 */ 1937 for (i = 1; i < TC_BITMASK + 1; i++) { 1938 int q = netdev_get_prio_tc_map(dev, i); 1939 1940 tc = &dev->tc_to_txq[q]; 1941 if (tc->offset + tc->count > txq) { 1942 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", 1943 i, q); 1944 netdev_set_prio_tc_map(dev, i, 0); 1945 } 1946 } 1947 } 1948 1949 #ifdef CONFIG_XPS 1950 static DEFINE_MUTEX(xps_map_mutex); 1951 #define xmap_dereference(P) \ 1952 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1953 1954 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, 1955 int cpu, u16 index) 1956 { 1957 struct xps_map *map = NULL; 1958 int pos; 1959 1960 if (dev_maps) 1961 map = xmap_dereference(dev_maps->cpu_map[cpu]); 1962 1963 for (pos = 0; map && pos < map->len; pos++) { 1964 if (map->queues[pos] == index) { 1965 if (map->len > 1) { 1966 map->queues[pos] = map->queues[--map->len]; 1967 } else { 1968 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); 1969 kfree_rcu(map, rcu); 1970 map = NULL; 1971 } 1972 break; 1973 } 1974 } 1975 1976 return map; 1977 } 1978 1979 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 1980 { 1981 struct xps_dev_maps *dev_maps; 1982 int cpu, i; 1983 bool active = false; 1984 1985 mutex_lock(&xps_map_mutex); 1986 dev_maps = xmap_dereference(dev->xps_maps); 1987 1988 if (!dev_maps) 1989 goto out_no_maps; 1990 1991 for_each_possible_cpu(cpu) { 1992 for (i = index; i < dev->num_tx_queues; i++) { 1993 if (!remove_xps_queue(dev_maps, cpu, i)) 1994 break; 1995 } 1996 if (i == dev->num_tx_queues) 1997 active = true; 1998 } 1999 2000 if (!active) { 2001 RCU_INIT_POINTER(dev->xps_maps, NULL); 2002 kfree_rcu(dev_maps, rcu); 2003 } 2004 2005 for (i = index; i < dev->num_tx_queues; i++) 2006 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 2007 NUMA_NO_NODE); 2008 2009 out_no_maps: 2010 mutex_unlock(&xps_map_mutex); 2011 } 2012 2013 static struct xps_map *expand_xps_map(struct xps_map *map, 2014 int cpu, u16 index) 2015 { 2016 struct xps_map *new_map; 2017 int alloc_len = XPS_MIN_MAP_ALLOC; 2018 int i, pos; 2019 2020 for (pos = 0; map && pos < map->len; pos++) { 2021 if (map->queues[pos] != index) 2022 continue; 2023 return map; 2024 } 2025 2026 /* Need to add queue to this CPU's existing map */ 2027 if (map) { 2028 if (pos < map->alloc_len) 2029 return map; 2030 2031 alloc_len = map->alloc_len * 2; 2032 } 2033 2034 /* Need to allocate new map to store queue on this CPU's map */ 2035 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 2036 cpu_to_node(cpu)); 2037 if (!new_map) 2038 return NULL; 2039 2040 for (i = 0; i < pos; i++) 2041 new_map->queues[i] = map->queues[i]; 2042 new_map->alloc_len = alloc_len; 2043 new_map->len = pos; 2044 2045 return new_map; 2046 } 2047 2048 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, 2049 u16 index) 2050 { 2051 struct xps_dev_maps *dev_maps, 
*new_dev_maps = NULL; 2052 struct xps_map *map, *new_map; 2053 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); 2054 int cpu, numa_node_id = -2; 2055 bool active = false; 2056 2057 mutex_lock(&xps_map_mutex); 2058 2059 dev_maps = xmap_dereference(dev->xps_maps); 2060 2061 /* allocate memory for queue storage */ 2062 for_each_online_cpu(cpu) { 2063 if (!cpumask_test_cpu(cpu, mask)) 2064 continue; 2065 2066 if (!new_dev_maps) 2067 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2068 if (!new_dev_maps) { 2069 mutex_unlock(&xps_map_mutex); 2070 return -ENOMEM; 2071 } 2072 2073 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2074 NULL; 2075 2076 map = expand_xps_map(map, cpu, index); 2077 if (!map) 2078 goto error; 2079 2080 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2081 } 2082 2083 if (!new_dev_maps) 2084 goto out_no_new_maps; 2085 2086 for_each_possible_cpu(cpu) { 2087 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2088 /* add queue to CPU maps */ 2089 int pos = 0; 2090 2091 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2092 while ((pos < map->len) && (map->queues[pos] != index)) 2093 pos++; 2094 2095 if (pos == map->len) 2096 map->queues[map->len++] = index; 2097 #ifdef CONFIG_NUMA 2098 if (numa_node_id == -2) 2099 numa_node_id = cpu_to_node(cpu); 2100 else if (numa_node_id != cpu_to_node(cpu)) 2101 numa_node_id = -1; 2102 #endif 2103 } else if (dev_maps) { 2104 /* fill in the new device map from the old device map */ 2105 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2106 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2107 } 2108 2109 } 2110 2111 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2112 2113 /* Cleanup old maps */ 2114 if (dev_maps) { 2115 for_each_possible_cpu(cpu) { 2116 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2117 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2118 if (map && map != new_map) 2119 kfree_rcu(map, rcu); 2120 } 2121 2122 kfree_rcu(dev_maps, rcu); 2123 } 2124 2125 dev_maps = new_dev_maps; 2126 active = true; 2127 2128 out_no_new_maps: 2129 /* update Tx queue numa node */ 2130 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2131 (numa_node_id >= 0) ? numa_node_id : 2132 NUMA_NO_NODE); 2133 2134 if (!dev_maps) 2135 goto out_no_maps; 2136 2137 /* removes queue from unused CPUs */ 2138 for_each_possible_cpu(cpu) { 2139 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2140 continue; 2141 2142 if (remove_xps_queue(dev_maps, cpu, index)) 2143 active = true; 2144 } 2145 2146 /* free map if not active */ 2147 if (!active) { 2148 RCU_INIT_POINTER(dev->xps_maps, NULL); 2149 kfree_rcu(dev_maps, rcu); 2150 } 2151 2152 out_no_maps: 2153 mutex_unlock(&xps_map_mutex); 2154 2155 return 0; 2156 error: 2157 /* remove any maps that we added */ 2158 for_each_possible_cpu(cpu) { 2159 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2160 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2161 NULL; 2162 if (new_map && new_map != map) 2163 kfree(new_map); 2164 } 2165 2166 mutex_unlock(&xps_map_mutex); 2167 2168 kfree(new_dev_maps); 2169 return -ENOMEM; 2170 } 2171 EXPORT_SYMBOL(netif_set_xps_queue); 2172 2173 #endif 2174 /* 2175 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2176 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
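 *
 * Example of a caller (editor's illustrative sketch, not from the original
 * source; the ethtool-style helper and its "count" argument are assumed):
 *
 *	static int my_set_channels(struct net_device *dev, unsigned int count)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netif_set_real_num_tx_queues(dev, count);
 *		if (err)
 *			return err;
 *		return netif_set_real_num_rx_queues(dev, count);
 *	}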
2177 */ 2178 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2179 { 2180 int rc; 2181 2182 if (txq < 1 || txq > dev->num_tx_queues) 2183 return -EINVAL; 2184 2185 if (dev->reg_state == NETREG_REGISTERED || 2186 dev->reg_state == NETREG_UNREGISTERING) { 2187 ASSERT_RTNL(); 2188 2189 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2190 txq); 2191 if (rc) 2192 return rc; 2193 2194 if (dev->num_tc) 2195 netif_setup_tc(dev, txq); 2196 2197 if (txq < dev->real_num_tx_queues) { 2198 qdisc_reset_all_tx_gt(dev, txq); 2199 #ifdef CONFIG_XPS 2200 netif_reset_xps_queues_gt(dev, txq); 2201 #endif 2202 } 2203 } 2204 2205 dev->real_num_tx_queues = txq; 2206 return 0; 2207 } 2208 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2209 2210 #ifdef CONFIG_SYSFS 2211 /** 2212 * netif_set_real_num_rx_queues - set actual number of RX queues used 2213 * @dev: Network device 2214 * @rxq: Actual number of RX queues 2215 * 2216 * This must be called either with the rtnl_lock held or before 2217 * registration of the net device. Returns 0 on success, or a 2218 * negative error code. If called before registration, it always 2219 * succeeds. 2220 */ 2221 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2222 { 2223 int rc; 2224 2225 if (rxq < 1 || rxq > dev->num_rx_queues) 2226 return -EINVAL; 2227 2228 if (dev->reg_state == NETREG_REGISTERED) { 2229 ASSERT_RTNL(); 2230 2231 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2232 rxq); 2233 if (rc) 2234 return rc; 2235 } 2236 2237 dev->real_num_rx_queues = rxq; 2238 return 0; 2239 } 2240 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2241 #endif 2242 2243 /** 2244 * netif_get_num_default_rss_queues - default number of RSS queues 2245 * 2246 * This routine should set an upper limit on the number of RSS queues 2247 * used by default by multiqueue devices. 2248 */ 2249 int netif_get_num_default_rss_queues(void) 2250 { 2251 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2252 } 2253 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2254 2255 static inline void __netif_reschedule(struct Qdisc *q) 2256 { 2257 struct softnet_data *sd; 2258 unsigned long flags; 2259 2260 local_irq_save(flags); 2261 sd = this_cpu_ptr(&softnet_data); 2262 q->next_sched = NULL; 2263 *sd->output_queue_tailp = q; 2264 sd->output_queue_tailp = &q->next_sched; 2265 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2266 local_irq_restore(flags); 2267 } 2268 2269 void __netif_schedule(struct Qdisc *q) 2270 { 2271 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2272 __netif_reschedule(q); 2273 } 2274 EXPORT_SYMBOL(__netif_schedule); 2275 2276 struct dev_kfree_skb_cb { 2277 enum skb_free_reason reason; 2278 }; 2279 2280 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2281 { 2282 return (struct dev_kfree_skb_cb *)skb->cb; 2283 } 2284 2285 void netif_schedule_queue(struct netdev_queue *txq) 2286 { 2287 rcu_read_lock(); 2288 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2289 struct Qdisc *q = rcu_dereference(txq->qdisc); 2290 2291 __netif_schedule(q); 2292 } 2293 rcu_read_unlock(); 2294 } 2295 EXPORT_SYMBOL(netif_schedule_queue); 2296 2297 /** 2298 * netif_wake_subqueue - allow sending packets on subqueue 2299 * @dev: network device 2300 * @queue_index: sub queue index 2301 * 2302 * Resume individual transmit queue of a device with multiple transmit queues. 
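 *
 * Example (editor's illustrative sketch; the TX ring structure, the
 * reclaim/free-slot helpers and MY_WAKE_THRESHOLD are hypothetical
 * driver details):
 *
 *	static void my_tx_complete(struct my_tx_ring *ring)
 *	{
 *		my_reclaim_descriptors(ring);
 *		if (my_ring_free_slots(ring) >= MY_WAKE_THRESHOLD)
 *			netif_wake_subqueue(ring->netdev, ring->index);
 *	}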
2303 */ 2304 void netif_wake_subqueue(struct net_device *dev, u16 queue_index) 2305 { 2306 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); 2307 2308 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { 2309 struct Qdisc *q; 2310 2311 rcu_read_lock(); 2312 q = rcu_dereference(txq->qdisc); 2313 __netif_schedule(q); 2314 rcu_read_unlock(); 2315 } 2316 } 2317 EXPORT_SYMBOL(netif_wake_subqueue); 2318 2319 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2320 { 2321 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2322 struct Qdisc *q; 2323 2324 rcu_read_lock(); 2325 q = rcu_dereference(dev_queue->qdisc); 2326 __netif_schedule(q); 2327 rcu_read_unlock(); 2328 } 2329 } 2330 EXPORT_SYMBOL(netif_tx_wake_queue); 2331 2332 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2333 { 2334 unsigned long flags; 2335 2336 if (likely(atomic_read(&skb->users) == 1)) { 2337 smp_rmb(); 2338 atomic_set(&skb->users, 0); 2339 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2340 return; 2341 } 2342 get_kfree_skb_cb(skb)->reason = reason; 2343 local_irq_save(flags); 2344 skb->next = __this_cpu_read(softnet_data.completion_queue); 2345 __this_cpu_write(softnet_data.completion_queue, skb); 2346 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2347 local_irq_restore(flags); 2348 } 2349 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2350 2351 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2352 { 2353 if (in_irq() || irqs_disabled()) 2354 __dev_kfree_skb_irq(skb, reason); 2355 else 2356 dev_kfree_skb(skb); 2357 } 2358 EXPORT_SYMBOL(__dev_kfree_skb_any); 2359 2360 2361 /** 2362 * netif_device_detach - mark device as removed 2363 * @dev: network device 2364 * 2365 * Mark the device as removed from the system and therefore no longer available. 2366 */ 2367 void netif_device_detach(struct net_device *dev) 2368 { 2369 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2370 netif_running(dev)) { 2371 netif_tx_stop_all_queues(dev); 2372 } 2373 } 2374 EXPORT_SYMBOL(netif_device_detach); 2375 2376 /** 2377 * netif_device_attach - mark device as attached 2378 * @dev: network device 2379 * 2380 * Mark the device as attached to the system and restart it if needed. 2381 */ 2382 void netif_device_attach(struct net_device *dev) 2383 { 2384 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2385 netif_running(dev)) { 2386 netif_tx_wake_all_queues(dev); 2387 __netdev_watchdog_up(dev); 2388 } 2389 } 2390 EXPORT_SYMBOL(netif_device_attach); 2391 2392 /* 2393 * Returns a Tx hash based on the given packet descriptor and the number of 2394 * Tx queues to be used as a distribution range.
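 *
 * Example (editor's illustrative sketch of a minimal .ndo_select_queue
 * implementation; real drivers typically apply device-specific policy):
 *
 *	static u16 my_select_queue(struct net_device *dev, struct sk_buff *skb,
 *				   void *accel_priv,
 *				   select_queue_fallback_t fallback)
 *	{
 *		return __skb_tx_hash(dev, skb, dev->real_num_tx_queues);
 *	}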
2395 */ 2396 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2397 unsigned int num_tx_queues) 2398 { 2399 u32 hash; 2400 u16 qoffset = 0; 2401 u16 qcount = num_tx_queues; 2402 2403 if (skb_rx_queue_recorded(skb)) { 2404 hash = skb_get_rx_queue(skb); 2405 while (unlikely(hash >= num_tx_queues)) 2406 hash -= num_tx_queues; 2407 return hash; 2408 } 2409 2410 if (dev->num_tc) { 2411 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2412 qoffset = dev->tc_to_txq[tc].offset; 2413 qcount = dev->tc_to_txq[tc].count; 2414 } 2415 2416 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2417 } 2418 EXPORT_SYMBOL(__skb_tx_hash); 2419 2420 static void skb_warn_bad_offload(const struct sk_buff *skb) 2421 { 2422 static const netdev_features_t null_features = 0; 2423 struct net_device *dev = skb->dev; 2424 const char *name = ""; 2425 2426 if (!net_ratelimit()) 2427 return; 2428 2429 if (dev) { 2430 if (dev->dev.parent) 2431 name = dev_driver_string(dev->dev.parent); 2432 else 2433 name = netdev_name(dev); 2434 } 2435 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2436 "gso_type=%d ip_summed=%d\n", 2437 name, dev ? &dev->features : &null_features, 2438 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2439 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2440 skb_shinfo(skb)->gso_type, skb->ip_summed); 2441 } 2442 2443 /* 2444 * Invalidate hardware checksum when packet is to be mangled, and 2445 * complete checksum manually on outgoing path. 2446 */ 2447 int skb_checksum_help(struct sk_buff *skb) 2448 { 2449 __wsum csum; 2450 int ret = 0, offset; 2451 2452 if (skb->ip_summed == CHECKSUM_COMPLETE) 2453 goto out_set_summed; 2454 2455 if (unlikely(skb_shinfo(skb)->gso_size)) { 2456 skb_warn_bad_offload(skb); 2457 return -EINVAL; 2458 } 2459 2460 /* Before computing a checksum, we should make sure no frag could 2461 * be modified by an external entity : checksum could be wrong. 2462 */ 2463 if (skb_has_shared_frag(skb)) { 2464 ret = __skb_linearize(skb); 2465 if (ret) 2466 goto out; 2467 } 2468 2469 offset = skb_checksum_start_offset(skb); 2470 BUG_ON(offset >= skb_headlen(skb)); 2471 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2472 2473 offset += skb->csum_offset; 2474 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2475 2476 if (skb_cloned(skb) && 2477 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2478 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2479 if (ret) 2480 goto out; 2481 } 2482 2483 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2484 out_set_summed: 2485 skb->ip_summed = CHECKSUM_NONE; 2486 out: 2487 return ret; 2488 } 2489 EXPORT_SYMBOL(skb_checksum_help); 2490 2491 /* skb_csum_offload_check - Driver helper function to determine if a device 2492 * with limited checksum offload capabilities is able to offload the checksum 2493 * for a given packet. 2494 * 2495 * Arguments: 2496 * skb - sk_buff for the packet in question 2497 * spec - contains the description of what device can offload 2498 * csum_encapped - returns true if the checksum being offloaded is 2499 * encpasulated. That is it is checksum for the transport header 2500 * in the inner headers. 
2501 * checksum_help - when set indicates that helper function should 2502 * call skb_checksum_help if offload checks fail 2503 * 2504 * Returns: 2505 * true: Packet has passed the checksum checks and should be offloadable to 2506 * the device (a driver may still need to check for additional 2507 * restrictions of its device) 2508 * false: Checksum is not offloadable. If checksum_help was set then 2509 * skb_checksum_help was called to resolve checksum for non-GSO 2510 * packets and when IP protocol is not SCTP 2511 */ 2512 bool __skb_csum_offload_chk(struct sk_buff *skb, 2513 const struct skb_csum_offl_spec *spec, 2514 bool *csum_encapped, 2515 bool csum_help) 2516 { 2517 struct iphdr *iph; 2518 struct ipv6hdr *ipv6; 2519 void *nhdr; 2520 int protocol; 2521 u8 ip_proto; 2522 2523 if (skb->protocol == htons(ETH_P_8021Q) || 2524 skb->protocol == htons(ETH_P_8021AD)) { 2525 if (!spec->vlan_okay) 2526 goto need_help; 2527 } 2528 2529 /* We check whether the checksum refers to a transport layer checksum in 2530 * the outermost header or an encapsulated transport layer checksum that 2531 * corresponds to the inner headers of the skb. If the checksum is for 2532 * something else in the packet we need help. 2533 */ 2534 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { 2535 /* Non-encapsulated checksum */ 2536 protocol = eproto_to_ipproto(vlan_get_protocol(skb)); 2537 nhdr = skb_network_header(skb); 2538 *csum_encapped = false; 2539 if (spec->no_not_encapped) 2540 goto need_help; 2541 } else if (skb->encapsulation && spec->encap_okay && 2542 skb_checksum_start_offset(skb) == 2543 skb_inner_transport_offset(skb)) { 2544 /* Encapsulated checksum */ 2545 *csum_encapped = true; 2546 switch (skb->inner_protocol_type) { 2547 case ENCAP_TYPE_ETHER: 2548 protocol = eproto_to_ipproto(skb->inner_protocol); 2549 break; 2550 case ENCAP_TYPE_IPPROTO: 2551 protocol = skb->inner_protocol; 2552 break; 2553 } 2554 nhdr = skb_inner_network_header(skb); 2555 } else { 2556 goto need_help; 2557 } 2558 2559 switch (protocol) { 2560 case IPPROTO_IP: 2561 if (!spec->ipv4_okay) 2562 goto need_help; 2563 iph = nhdr; 2564 ip_proto = iph->protocol; 2565 if (iph->ihl != 5 && !spec->ip_options_okay) 2566 goto need_help; 2567 break; 2568 case IPPROTO_IPV6: 2569 if (!spec->ipv6_okay) 2570 goto need_help; 2571 if (spec->no_encapped_ipv6 && *csum_encapped) 2572 goto need_help; 2573 ipv6 = nhdr; 2574 nhdr += sizeof(*ipv6); 2575 ip_proto = ipv6->nexthdr; 2576 break; 2577 default: 2578 goto need_help; 2579 } 2580 2581 ip_proto_again: 2582 switch (ip_proto) { 2583 case IPPROTO_TCP: 2584 if (!spec->tcp_okay || 2585 skb->csum_offset != offsetof(struct tcphdr, check)) 2586 goto need_help; 2587 break; 2588 case IPPROTO_UDP: 2589 if (!spec->udp_okay || 2590 skb->csum_offset != offsetof(struct udphdr, check)) 2591 goto need_help; 2592 break; 2593 case IPPROTO_SCTP: 2594 if (!spec->sctp_okay || 2595 skb->csum_offset != offsetof(struct sctphdr, checksum)) 2596 goto cant_help; 2597 break; 2598 case NEXTHDR_HOP: 2599 case NEXTHDR_ROUTING: 2600 case NEXTHDR_DEST: { 2601 u8 *opthdr = nhdr; 2602 2603 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) 2604 goto need_help; 2605 2606 ip_proto = opthdr[0]; 2607 nhdr += (opthdr[1] + 1) << 3; 2608 2609 goto ip_proto_again; 2610 } 2611 default: 2612 goto need_help; 2613 } 2614 2615 /* Passed the tests for offloading checksum */ 2616 return true; 2617 2618 need_help: 2619 if (csum_help && !skb_shinfo(skb)->gso_size) 2620 skb_checksum_help(skb); 2621 cant_help: 2622 return false; 2623 
} 2624 EXPORT_SYMBOL(__skb_csum_offload_chk); 2625 2626 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2627 { 2628 __be16 type = skb->protocol; 2629 2630 /* Tunnel gso handlers can set protocol to ethernet. */ 2631 if (type == htons(ETH_P_TEB)) { 2632 struct ethhdr *eth; 2633 2634 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2635 return 0; 2636 2637 eth = (struct ethhdr *)skb_mac_header(skb); 2638 type = eth->h_proto; 2639 } 2640 2641 return __vlan_get_protocol(skb, type, depth); 2642 } 2643 2644 /** 2645 * skb_mac_gso_segment - mac layer segmentation handler. 2646 * @skb: buffer to segment 2647 * @features: features for the output path (see dev->features) 2648 */ 2649 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2650 netdev_features_t features) 2651 { 2652 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2653 struct packet_offload *ptype; 2654 int vlan_depth = skb->mac_len; 2655 __be16 type = skb_network_protocol(skb, &vlan_depth); 2656 2657 if (unlikely(!type)) 2658 return ERR_PTR(-EINVAL); 2659 2660 __skb_pull(skb, vlan_depth); 2661 2662 rcu_read_lock(); 2663 list_for_each_entry_rcu(ptype, &offload_base, list) { 2664 if (ptype->type == type && ptype->callbacks.gso_segment) { 2665 segs = ptype->callbacks.gso_segment(skb, features); 2666 break; 2667 } 2668 } 2669 rcu_read_unlock(); 2670 2671 __skb_push(skb, skb->data - skb_mac_header(skb)); 2672 2673 return segs; 2674 } 2675 EXPORT_SYMBOL(skb_mac_gso_segment); 2676 2677 2678 /* openvswitch calls this on rx path, so we need a different check. 2679 */ 2680 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2681 { 2682 if (tx_path) 2683 return skb->ip_summed != CHECKSUM_PARTIAL; 2684 else 2685 return skb->ip_summed == CHECKSUM_NONE; 2686 } 2687 2688 /** 2689 * __skb_gso_segment - Perform segmentation on skb. 2690 * @skb: buffer to segment 2691 * @features: features for the output path (see dev->features) 2692 * @tx_path: whether it is called in TX path 2693 * 2694 * This function segments the given skb and returns a list of segments. 2695 * 2696 * It may return NULL if the skb requires no segmentation. This is 2697 * only possible when GSO is used for verifying header integrity. 2698 * 2699 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. 2700 */ 2701 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2702 netdev_features_t features, bool tx_path) 2703 { 2704 if (unlikely(skb_needs_check(skb, tx_path))) { 2705 int err; 2706 2707 skb_warn_bad_offload(skb); 2708 2709 err = skb_cow_head(skb, 0); 2710 if (err < 0) 2711 return ERR_PTR(err); 2712 } 2713 2714 /* Only report GSO partial support if it will enable us to 2715 * support segmentation on this frame without needing additional 2716 * work. 2717 */ 2718 if (features & NETIF_F_GSO_PARTIAL) { 2719 netdev_features_t partial_features = NETIF_F_GSO_ROBUST; 2720 struct net_device *dev = skb->dev; 2721 2722 partial_features |= dev->features & dev->gso_partial_features; 2723 if (!skb_gso_ok(skb, features | partial_features)) 2724 features &= ~NETIF_F_GSO_PARTIAL; 2725 } 2726 2727 BUILD_BUG_ON(SKB_SGO_CB_OFFSET + 2728 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); 2729 2730 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2731 SKB_GSO_CB(skb)->encap_level = 0; 2732 2733 skb_reset_mac_header(skb); 2734 skb_reset_mac_len(skb); 2735 2736 return skb_mac_gso_segment(skb, features); 2737 } 2738 EXPORT_SYMBOL(__skb_gso_segment); 2739 2740 /* Take action when hardware reception checksum errors are detected. 
*/ 2741 #ifdef CONFIG_BUG 2742 void netdev_rx_csum_fault(struct net_device *dev) 2743 { 2744 if (net_ratelimit()) { 2745 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2746 dump_stack(); 2747 } 2748 } 2749 EXPORT_SYMBOL(netdev_rx_csum_fault); 2750 #endif 2751 2752 /* Actually, we should eliminate this check as soon as we know, that: 2753 * 1. IOMMU is present and allows to map all the memory. 2754 * 2. No high memory really exists on this machine. 2755 */ 2756 2757 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2758 { 2759 #ifdef CONFIG_HIGHMEM 2760 int i; 2761 if (!(dev->features & NETIF_F_HIGHDMA)) { 2762 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2763 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2764 if (PageHighMem(skb_frag_page(frag))) 2765 return 1; 2766 } 2767 } 2768 2769 if (PCI_DMA_BUS_IS_PHYS) { 2770 struct device *pdev = dev->dev.parent; 2771 2772 if (!pdev) 2773 return 0; 2774 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2775 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2776 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2777 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2778 return 1; 2779 } 2780 } 2781 #endif 2782 return 0; 2783 } 2784 2785 /* If MPLS offload request, verify we are testing hardware MPLS features 2786 * instead of standard features for the netdev. 2787 */ 2788 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2789 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2790 netdev_features_t features, 2791 __be16 type) 2792 { 2793 if (eth_p_mpls(type)) 2794 features &= skb->dev->mpls_features; 2795 2796 return features; 2797 } 2798 #else 2799 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2800 netdev_features_t features, 2801 __be16 type) 2802 { 2803 return features; 2804 } 2805 #endif 2806 2807 static netdev_features_t harmonize_features(struct sk_buff *skb, 2808 netdev_features_t features) 2809 { 2810 int tmp; 2811 __be16 type; 2812 2813 type = skb_network_protocol(skb, &tmp); 2814 features = net_mpls_features(skb, features, type); 2815 2816 if (skb->ip_summed != CHECKSUM_NONE && 2817 !can_checksum_protocol(features, type)) { 2818 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); 2819 } else if (illegal_highdma(skb->dev, skb)) { 2820 features &= ~NETIF_F_SG; 2821 } 2822 2823 return features; 2824 } 2825 2826 netdev_features_t passthru_features_check(struct sk_buff *skb, 2827 struct net_device *dev, 2828 netdev_features_t features) 2829 { 2830 return features; 2831 } 2832 EXPORT_SYMBOL(passthru_features_check); 2833 2834 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2835 struct net_device *dev, 2836 netdev_features_t features) 2837 { 2838 return vlan_features_check(skb, features); 2839 } 2840 2841 static netdev_features_t gso_features_check(const struct sk_buff *skb, 2842 struct net_device *dev, 2843 netdev_features_t features) 2844 { 2845 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2846 2847 if (gso_segs > dev->gso_max_segs) 2848 return features & ~NETIF_F_GSO_MASK; 2849 2850 /* Support for GSO partial features requires software 2851 * intervention before we can actually process the packets 2852 * so we need to strip support for any partial features now 2853 * and we can pull them back in after we have partially 2854 * segmented the frame. 
2855 */ 2856 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) 2857 features &= ~dev->gso_partial_features; 2858 2859 /* Make sure to clear the IPv4 ID mangling feature if the 2860 * IPv4 header has the potential to be fragmented. 2861 */ 2862 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2863 struct iphdr *iph = skb->encapsulation ? 2864 inner_ip_hdr(skb) : ip_hdr(skb); 2865 2866 if (!(iph->frag_off & htons(IP_DF))) 2867 features &= ~NETIF_F_TSO_MANGLEID; 2868 } 2869 2870 return features; 2871 } 2872 2873 netdev_features_t netif_skb_features(struct sk_buff *skb) 2874 { 2875 struct net_device *dev = skb->dev; 2876 netdev_features_t features = dev->features; 2877 2878 if (skb_is_gso(skb)) 2879 features = gso_features_check(skb, dev, features); 2880 2881 /* If encapsulation offload request, verify we are testing 2882 * hardware encapsulation features instead of standard 2883 * features for the netdev 2884 */ 2885 if (skb->encapsulation) 2886 features &= dev->hw_enc_features; 2887 2888 if (skb_vlan_tagged(skb)) 2889 features = netdev_intersect_features(features, 2890 dev->vlan_features | 2891 NETIF_F_HW_VLAN_CTAG_TX | 2892 NETIF_F_HW_VLAN_STAG_TX); 2893 2894 if (dev->netdev_ops->ndo_features_check) 2895 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2896 features); 2897 else 2898 features &= dflt_features_check(skb, dev, features); 2899 2900 return harmonize_features(skb, features); 2901 } 2902 EXPORT_SYMBOL(netif_skb_features); 2903 2904 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2905 struct netdev_queue *txq, bool more) 2906 { 2907 unsigned int len; 2908 int rc; 2909 2910 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2911 dev_queue_xmit_nit(skb, dev); 2912 2913 len = skb->len; 2914 trace_net_dev_start_xmit(skb, dev); 2915 rc = netdev_start_xmit(skb, dev, txq, more); 2916 trace_net_dev_xmit(skb, rc, dev, len); 2917 2918 return rc; 2919 } 2920 2921 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2922 struct netdev_queue *txq, int *ret) 2923 { 2924 struct sk_buff *skb = first; 2925 int rc = NETDEV_TX_OK; 2926 2927 while (skb) { 2928 struct sk_buff *next = skb->next; 2929 2930 skb->next = NULL; 2931 rc = xmit_one(skb, dev, txq, next != NULL); 2932 if (unlikely(!dev_xmit_complete(rc))) { 2933 skb->next = next; 2934 goto out; 2935 } 2936 2937 skb = next; 2938 if (netif_xmit_stopped(txq) && skb) { 2939 rc = NETDEV_TX_BUSY; 2940 break; 2941 } 2942 } 2943 2944 out: 2945 *ret = rc; 2946 return skb; 2947 } 2948 2949 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2950 netdev_features_t features) 2951 { 2952 if (skb_vlan_tag_present(skb) && 2953 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2954 skb = __vlan_hwaccel_push_inside(skb); 2955 return skb; 2956 } 2957 2958 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2959 { 2960 netdev_features_t features; 2961 2962 features = netif_skb_features(skb); 2963 skb = validate_xmit_vlan(skb, features); 2964 if (unlikely(!skb)) 2965 goto out_null; 2966 2967 if (netif_needs_gso(skb, features)) { 2968 struct sk_buff *segs; 2969 2970 segs = skb_gso_segment(skb, features); 2971 if (IS_ERR(segs)) { 2972 goto out_kfree_skb; 2973 } else if (segs) { 2974 consume_skb(skb); 2975 skb = segs; 2976 } 2977 } else { 2978 if (skb_needs_linearize(skb, features) && 2979 __skb_linearize(skb)) 2980 goto out_kfree_skb; 2981 2982 /* If packet is not checksummed and device does not 2983 * support checksumming for this protocol, complete 
2984 * checksumming here. 2985 */ 2986 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2987 if (skb->encapsulation) 2988 skb_set_inner_transport_header(skb, 2989 skb_checksum_start_offset(skb)); 2990 else 2991 skb_set_transport_header(skb, 2992 skb_checksum_start_offset(skb)); 2993 if (!(features & NETIF_F_CSUM_MASK) && 2994 skb_checksum_help(skb)) 2995 goto out_kfree_skb; 2996 } 2997 } 2998 2999 return skb; 3000 3001 out_kfree_skb: 3002 kfree_skb(skb); 3003 out_null: 3004 atomic_long_inc(&dev->tx_dropped); 3005 return NULL; 3006 } 3007 3008 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 3009 { 3010 struct sk_buff *next, *head = NULL, *tail; 3011 3012 for (; skb != NULL; skb = next) { 3013 next = skb->next; 3014 skb->next = NULL; 3015 3016 /* in case skb wont be segmented, point to itself */ 3017 skb->prev = skb; 3018 3019 skb = validate_xmit_skb(skb, dev); 3020 if (!skb) 3021 continue; 3022 3023 if (!head) 3024 head = skb; 3025 else 3026 tail->next = skb; 3027 /* If skb was segmented, skb->prev points to 3028 * the last segment. If not, it still contains skb. 3029 */ 3030 tail = skb->prev; 3031 } 3032 return head; 3033 } 3034 3035 static void qdisc_pkt_len_init(struct sk_buff *skb) 3036 { 3037 const struct skb_shared_info *shinfo = skb_shinfo(skb); 3038 3039 qdisc_skb_cb(skb)->pkt_len = skb->len; 3040 3041 /* To get more precise estimation of bytes sent on wire, 3042 * we add to pkt_len the headers size of all segments 3043 */ 3044 if (shinfo->gso_size) { 3045 unsigned int hdr_len; 3046 u16 gso_segs = shinfo->gso_segs; 3047 3048 /* mac layer + network layer */ 3049 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 3050 3051 /* + transport layer */ 3052 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 3053 hdr_len += tcp_hdrlen(skb); 3054 else 3055 hdr_len += sizeof(struct udphdr); 3056 3057 if (shinfo->gso_type & SKB_GSO_DODGY) 3058 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 3059 shinfo->gso_size); 3060 3061 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 3062 } 3063 } 3064 3065 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 3066 struct net_device *dev, 3067 struct netdev_queue *txq) 3068 { 3069 spinlock_t *root_lock = qdisc_lock(q); 3070 bool contended; 3071 int rc; 3072 3073 qdisc_calculate_pkt_len(skb, q); 3074 /* 3075 * Heuristic to force contended enqueues to serialize on a 3076 * separate lock before trying to get qdisc main lock. 3077 * This permits __QDISC___STATE_RUNNING owner to get the lock more 3078 * often and dequeue packets faster. 3079 */ 3080 contended = qdisc_is_running(q); 3081 if (unlikely(contended)) 3082 spin_lock(&q->busylock); 3083 3084 spin_lock(root_lock); 3085 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 3086 kfree_skb(skb); 3087 rc = NET_XMIT_DROP; 3088 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 3089 qdisc_run_begin(q)) { 3090 /* 3091 * This is a work-conserving queue; there are no old skbs 3092 * waiting to be sent out; and the qdisc is not running - 3093 * xmit the skb directly. 
3094 */ 3095 3096 qdisc_bstats_update(q, skb); 3097 3098 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 3099 if (unlikely(contended)) { 3100 spin_unlock(&q->busylock); 3101 contended = false; 3102 } 3103 __qdisc_run(q); 3104 } else 3105 qdisc_run_end(q); 3106 3107 rc = NET_XMIT_SUCCESS; 3108 } else { 3109 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 3110 if (qdisc_run_begin(q)) { 3111 if (unlikely(contended)) { 3112 spin_unlock(&q->busylock); 3113 contended = false; 3114 } 3115 __qdisc_run(q); 3116 } 3117 } 3118 spin_unlock(root_lock); 3119 if (unlikely(contended)) 3120 spin_unlock(&q->busylock); 3121 return rc; 3122 } 3123 3124 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 3125 static void skb_update_prio(struct sk_buff *skb) 3126 { 3127 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 3128 3129 if (!skb->priority && skb->sk && map) { 3130 unsigned int prioidx = 3131 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); 3132 3133 if (prioidx < map->priomap_len) 3134 skb->priority = map->priomap[prioidx]; 3135 } 3136 } 3137 #else 3138 #define skb_update_prio(skb) 3139 #endif 3140 3141 DEFINE_PER_CPU(int, xmit_recursion); 3142 EXPORT_SYMBOL(xmit_recursion); 3143 3144 #define RECURSION_LIMIT 10 3145 3146 /** 3147 * dev_loopback_xmit - loop back @skb 3148 * @net: network namespace this loopback is happening in 3149 * @sk: sk needed to be a netfilter okfn 3150 * @skb: buffer to transmit 3151 */ 3152 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 3153 { 3154 skb_reset_mac_header(skb); 3155 __skb_pull(skb, skb_network_offset(skb)); 3156 skb->pkt_type = PACKET_LOOPBACK; 3157 skb->ip_summed = CHECKSUM_UNNECESSARY; 3158 WARN_ON(!skb_dst(skb)); 3159 skb_dst_force(skb); 3160 netif_rx_ni(skb); 3161 return 0; 3162 } 3163 EXPORT_SYMBOL(dev_loopback_xmit); 3164 3165 #ifdef CONFIG_NET_EGRESS 3166 static struct sk_buff * 3167 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) 3168 { 3169 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); 3170 struct tcf_result cl_res; 3171 3172 if (!cl) 3173 return skb; 3174 3175 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set 3176 * earlier by the caller. 3177 */ 3178 qdisc_bstats_cpu_update(cl->q, skb); 3179 3180 switch (tc_classify(skb, cl, &cl_res, false)) { 3181 case TC_ACT_OK: 3182 case TC_ACT_RECLASSIFY: 3183 skb->tc_index = TC_H_MIN(cl_res.classid); 3184 break; 3185 case TC_ACT_SHOT: 3186 qdisc_qstats_cpu_drop(cl->q); 3187 *ret = NET_XMIT_DROP; 3188 goto drop; 3189 case TC_ACT_STOLEN: 3190 case TC_ACT_QUEUED: 3191 *ret = NET_XMIT_SUCCESS; 3192 drop: 3193 kfree_skb(skb); 3194 return NULL; 3195 case TC_ACT_REDIRECT: 3196 /* No need to push/pop skb's mac_header here on egress! 
*/ 3197 skb_do_redirect(skb); 3198 *ret = NET_XMIT_SUCCESS; 3199 return NULL; 3200 default: 3201 break; 3202 } 3203 3204 return skb; 3205 } 3206 #endif /* CONFIG_NET_EGRESS */ 3207 3208 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 3209 { 3210 #ifdef CONFIG_XPS 3211 struct xps_dev_maps *dev_maps; 3212 struct xps_map *map; 3213 int queue_index = -1; 3214 3215 rcu_read_lock(); 3216 dev_maps = rcu_dereference(dev->xps_maps); 3217 if (dev_maps) { 3218 map = rcu_dereference( 3219 dev_maps->cpu_map[skb->sender_cpu - 1]); 3220 if (map) { 3221 if (map->len == 1) 3222 queue_index = map->queues[0]; 3223 else 3224 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 3225 map->len)]; 3226 if (unlikely(queue_index >= dev->real_num_tx_queues)) 3227 queue_index = -1; 3228 } 3229 } 3230 rcu_read_unlock(); 3231 3232 return queue_index; 3233 #else 3234 return -1; 3235 #endif 3236 } 3237 3238 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 3239 { 3240 struct sock *sk = skb->sk; 3241 int queue_index = sk_tx_queue_get(sk); 3242 3243 if (queue_index < 0 || skb->ooo_okay || 3244 queue_index >= dev->real_num_tx_queues) { 3245 int new_index = get_xps_queue(dev, skb); 3246 if (new_index < 0) 3247 new_index = skb_tx_hash(dev, skb); 3248 3249 if (queue_index != new_index && sk && 3250 sk_fullsock(sk) && 3251 rcu_access_pointer(sk->sk_dst_cache)) 3252 sk_tx_queue_set(sk, new_index); 3253 3254 queue_index = new_index; 3255 } 3256 3257 return queue_index; 3258 } 3259 3260 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 3261 struct sk_buff *skb, 3262 void *accel_priv) 3263 { 3264 int queue_index = 0; 3265 3266 #ifdef CONFIG_XPS 3267 u32 sender_cpu = skb->sender_cpu - 1; 3268 3269 if (sender_cpu >= (u32)NR_CPUS) 3270 skb->sender_cpu = raw_smp_processor_id() + 1; 3271 #endif 3272 3273 if (dev->real_num_tx_queues != 1) { 3274 const struct net_device_ops *ops = dev->netdev_ops; 3275 if (ops->ndo_select_queue) 3276 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3277 __netdev_pick_tx); 3278 else 3279 queue_index = __netdev_pick_tx(dev, skb); 3280 3281 if (!accel_priv) 3282 queue_index = netdev_cap_txqueue(dev, queue_index); 3283 } 3284 3285 skb_set_queue_mapping(skb, queue_index); 3286 return netdev_get_tx_queue(dev, queue_index); 3287 } 3288 3289 /** 3290 * __dev_queue_xmit - transmit a buffer 3291 * @skb: buffer to transmit 3292 * @accel_priv: private data used for L2 forwarding offload 3293 * 3294 * Queue a buffer for transmission to a network device. The caller must 3295 * have set the device and priority and built the buffer before calling 3296 * this function. The function can be called from an interrupt. 3297 * 3298 * A negative errno code is returned on a failure. A success does not 3299 * guarantee the frame will be transmitted as it may be dropped due 3300 * to congestion or traffic shaping. 3301 * 3302 * ----------------------------------------------------------------------------------- 3303 * I notice this method can also return errors from the queue disciplines, 3304 * including NET_XMIT_DROP, which is a positive value. So, errors can also 3305 * be positive. 3306 * 3307 * Regardless of the return value, the skb is consumed, so it is currently 3308 * difficult to retry a send to this method. (You can bump the ref count 3309 * before sending to hold a reference for retry if you are careful.) 3310 * 3311 * When calling this method, interrupts MUST be enabled. 
This is because 3312 * the BH enable code must have IRQs enabled so that it will not deadlock. 3313 * --BLG 3314 */ 3315 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 3316 { 3317 struct net_device *dev = skb->dev; 3318 struct netdev_queue *txq; 3319 struct Qdisc *q; 3320 int rc = -ENOMEM; 3321 3322 skb_reset_mac_header(skb); 3323 3324 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 3325 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 3326 3327 /* Disable soft irqs for various locks below. Also 3328 * stops preemption for RCU. 3329 */ 3330 rcu_read_lock_bh(); 3331 3332 skb_update_prio(skb); 3333 3334 qdisc_pkt_len_init(skb); 3335 #ifdef CONFIG_NET_CLS_ACT 3336 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 3337 # ifdef CONFIG_NET_EGRESS 3338 if (static_key_false(&egress_needed)) { 3339 skb = sch_handle_egress(skb, &rc, dev); 3340 if (!skb) 3341 goto out; 3342 } 3343 # endif 3344 #endif 3345 /* If device/qdisc don't need skb->dst, release it right now while 3346 * its hot in this cpu cache. 3347 */ 3348 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 3349 skb_dst_drop(skb); 3350 else 3351 skb_dst_force(skb); 3352 3353 #ifdef CONFIG_NET_SWITCHDEV 3354 /* Don't forward if offload device already forwarded */ 3355 if (skb->offload_fwd_mark && 3356 skb->offload_fwd_mark == dev->offload_fwd_mark) { 3357 consume_skb(skb); 3358 rc = NET_XMIT_SUCCESS; 3359 goto out; 3360 } 3361 #endif 3362 3363 txq = netdev_pick_tx(dev, skb, accel_priv); 3364 q = rcu_dereference_bh(txq->qdisc); 3365 3366 trace_net_dev_queue(skb); 3367 if (q->enqueue) { 3368 rc = __dev_xmit_skb(skb, q, dev, txq); 3369 goto out; 3370 } 3371 3372 /* The device has no queue. Common case for software devices: 3373 loopback, all the sorts of tunnels... 3374 3375 Really, it is unlikely that netif_tx_lock protection is necessary 3376 here. (f.e. loopback and IP tunnels are clean ignoring statistics 3377 counters.) 3378 However, it is possible, that they rely on protection 3379 made by us here. 3380 3381 Check this and shot the lock. It is not prone from deadlocks. 3382 Either shot noqueue qdisc, it is even simpler 8) 3383 */ 3384 if (dev->flags & IFF_UP) { 3385 int cpu = smp_processor_id(); /* ok because BHs are off */ 3386 3387 if (txq->xmit_lock_owner != cpu) { 3388 3389 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) 3390 goto recursion_alert; 3391 3392 skb = validate_xmit_skb(skb, dev); 3393 if (!skb) 3394 goto out; 3395 3396 HARD_TX_LOCK(dev, txq, cpu); 3397 3398 if (!netif_xmit_stopped(txq)) { 3399 __this_cpu_inc(xmit_recursion); 3400 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 3401 __this_cpu_dec(xmit_recursion); 3402 if (dev_xmit_complete(rc)) { 3403 HARD_TX_UNLOCK(dev, txq); 3404 goto out; 3405 } 3406 } 3407 HARD_TX_UNLOCK(dev, txq); 3408 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 3409 dev->name); 3410 } else { 3411 /* Recursion is detected! 
It is possible, 3412 * unfortunately 3413 */ 3414 recursion_alert: 3415 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 3416 dev->name); 3417 } 3418 } 3419 3420 rc = -ENETDOWN; 3421 rcu_read_unlock_bh(); 3422 3423 atomic_long_inc(&dev->tx_dropped); 3424 kfree_skb_list(skb); 3425 return rc; 3426 out: 3427 rcu_read_unlock_bh(); 3428 return rc; 3429 } 3430 3431 int dev_queue_xmit(struct sk_buff *skb) 3432 { 3433 return __dev_queue_xmit(skb, NULL); 3434 } 3435 EXPORT_SYMBOL(dev_queue_xmit); 3436 3437 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3438 { 3439 return __dev_queue_xmit(skb, accel_priv); 3440 } 3441 EXPORT_SYMBOL(dev_queue_xmit_accel); 3442 3443 3444 /*======================================================================= 3445 Receiver routines 3446 =======================================================================*/ 3447 3448 int netdev_max_backlog __read_mostly = 1000; 3449 EXPORT_SYMBOL(netdev_max_backlog); 3450 3451 int netdev_tstamp_prequeue __read_mostly = 1; 3452 int netdev_budget __read_mostly = 300; 3453 int weight_p __read_mostly = 64; /* old backlog weight */ 3454 3455 /* Called with irq disabled */ 3456 static inline void ____napi_schedule(struct softnet_data *sd, 3457 struct napi_struct *napi) 3458 { 3459 list_add_tail(&napi->poll_list, &sd->poll_list); 3460 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3461 } 3462 3463 #ifdef CONFIG_RPS 3464 3465 /* One global table that all flow-based protocols share. */ 3466 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3467 EXPORT_SYMBOL(rps_sock_flow_table); 3468 u32 rps_cpu_mask __read_mostly; 3469 EXPORT_SYMBOL(rps_cpu_mask); 3470 3471 struct static_key rps_needed __read_mostly; 3472 EXPORT_SYMBOL(rps_needed); 3473 3474 static struct rps_dev_flow * 3475 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3476 struct rps_dev_flow *rflow, u16 next_cpu) 3477 { 3478 if (next_cpu < nr_cpu_ids) { 3479 #ifdef CONFIG_RFS_ACCEL 3480 struct netdev_rx_queue *rxqueue; 3481 struct rps_dev_flow_table *flow_table; 3482 struct rps_dev_flow *old_rflow; 3483 u32 flow_id; 3484 u16 rxq_index; 3485 int rc; 3486 3487 /* Should we steer this flow to a different hardware queue? */ 3488 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3489 !(dev->features & NETIF_F_NTUPLE)) 3490 goto out; 3491 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3492 if (rxq_index == skb_get_rx_queue(skb)) 3493 goto out; 3494 3495 rxqueue = dev->_rx + rxq_index; 3496 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3497 if (!flow_table) 3498 goto out; 3499 flow_id = skb_get_hash(skb) & flow_table->mask; 3500 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3501 rxq_index, flow_id); 3502 if (rc < 0) 3503 goto out; 3504 old_rflow = rflow; 3505 rflow = &flow_table->flows[flow_id]; 3506 rflow->filter = rc; 3507 if (old_rflow->filter == rflow->filter) 3508 old_rflow->filter = RPS_NO_FILTER; 3509 out: 3510 #endif 3511 rflow->last_qtail = 3512 per_cpu(softnet_data, next_cpu).input_queue_head; 3513 } 3514 3515 rflow->cpu = next_cpu; 3516 return rflow; 3517 } 3518 3519 /* 3520 * get_rps_cpu is called from netif_receive_skb and returns the target 3521 * CPU from the RPS map of the receiving queue for a given skb. 3522 * rcu_read_lock must be held on entry. 
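 *
 * (Editor's note: the per-queue RPS CPU mask and flow table size are set
 * from user space via /sys/class/net/<dev>/queues/rx-<n>/rps_cpus and
 * .../rps_flow_cnt, and the global socket flow table is sized with the
 * net.core.rps_sock_flow_entries sysctl.)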
3523 */ 3524 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3525 struct rps_dev_flow **rflowp) 3526 { 3527 const struct rps_sock_flow_table *sock_flow_table; 3528 struct netdev_rx_queue *rxqueue = dev->_rx; 3529 struct rps_dev_flow_table *flow_table; 3530 struct rps_map *map; 3531 int cpu = -1; 3532 u32 tcpu; 3533 u32 hash; 3534 3535 if (skb_rx_queue_recorded(skb)) { 3536 u16 index = skb_get_rx_queue(skb); 3537 3538 if (unlikely(index >= dev->real_num_rx_queues)) { 3539 WARN_ONCE(dev->real_num_rx_queues > 1, 3540 "%s received packet on queue %u, but number " 3541 "of RX queues is %u\n", 3542 dev->name, index, dev->real_num_rx_queues); 3543 goto done; 3544 } 3545 rxqueue += index; 3546 } 3547 3548 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3549 3550 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3551 map = rcu_dereference(rxqueue->rps_map); 3552 if (!flow_table && !map) 3553 goto done; 3554 3555 skb_reset_network_header(skb); 3556 hash = skb_get_hash(skb); 3557 if (!hash) 3558 goto done; 3559 3560 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3561 if (flow_table && sock_flow_table) { 3562 struct rps_dev_flow *rflow; 3563 u32 next_cpu; 3564 u32 ident; 3565 3566 /* First check into global flow table if there is a match */ 3567 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3568 if ((ident ^ hash) & ~rps_cpu_mask) 3569 goto try_rps; 3570 3571 next_cpu = ident & rps_cpu_mask; 3572 3573 /* OK, now we know there is a match, 3574 * we can look at the local (per receive queue) flow table 3575 */ 3576 rflow = &flow_table->flows[hash & flow_table->mask]; 3577 tcpu = rflow->cpu; 3578 3579 /* 3580 * If the desired CPU (where last recvmsg was done) is 3581 * different from current CPU (one in the rx-queue flow 3582 * table entry), switch if one of the following holds: 3583 * - Current CPU is unset (>= nr_cpu_ids). 3584 * - Current CPU is offline. 3585 * - The current CPU's queue tail has advanced beyond the 3586 * last packet that was enqueued using this table entry. 3587 * This guarantees that all previous packets for the flow 3588 * have been dequeued, thus preserving in order delivery. 3589 */ 3590 if (unlikely(tcpu != next_cpu) && 3591 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || 3592 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3593 rflow->last_qtail)) >= 0)) { 3594 tcpu = next_cpu; 3595 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3596 } 3597 3598 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { 3599 *rflowp = rflow; 3600 cpu = tcpu; 3601 goto done; 3602 } 3603 } 3604 3605 try_rps: 3606 3607 if (map) { 3608 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3609 if (cpu_online(tcpu)) { 3610 cpu = tcpu; 3611 goto done; 3612 } 3613 } 3614 3615 done: 3616 return cpu; 3617 } 3618 3619 #ifdef CONFIG_RFS_ACCEL 3620 3621 /** 3622 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3623 * @dev: Device on which the filter was set 3624 * @rxq_index: RX queue index 3625 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3626 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3627 * 3628 * Drivers that implement ndo_rx_flow_steer() should periodically call 3629 * this function for each installed filter and remove the filters for 3630 * which it returns %true. 
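 *
 * Example of such a periodic scan (editor's illustrative sketch; the
 * filter table, its bookkeeping and my_remove_filter() are hypothetical
 * driver details):
 *
 *	static void my_expire_filters(struct my_adapter *adap)
 *	{
 *		u16 i;
 *
 *		for (i = 0; i < adap->n_filters; i++) {
 *			struct my_filter *f = &adap->filters[i];
 *
 *			if (!f->installed)
 *				continue;
 *			if (rps_may_expire_flow(adap->netdev, f->rxq_index,
 *						f->flow_id, i))
 *				my_remove_filter(adap, f);
 *		}
 *	}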
3631 */ 3632 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3633 u32 flow_id, u16 filter_id) 3634 { 3635 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3636 struct rps_dev_flow_table *flow_table; 3637 struct rps_dev_flow *rflow; 3638 bool expire = true; 3639 unsigned int cpu; 3640 3641 rcu_read_lock(); 3642 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3643 if (flow_table && flow_id <= flow_table->mask) { 3644 rflow = &flow_table->flows[flow_id]; 3645 cpu = ACCESS_ONCE(rflow->cpu); 3646 if (rflow->filter == filter_id && cpu < nr_cpu_ids && 3647 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3648 rflow->last_qtail) < 3649 (int)(10 * flow_table->mask))) 3650 expire = false; 3651 } 3652 rcu_read_unlock(); 3653 return expire; 3654 } 3655 EXPORT_SYMBOL(rps_may_expire_flow); 3656 3657 #endif /* CONFIG_RFS_ACCEL */ 3658 3659 /* Called from hardirq (IPI) context */ 3660 static void rps_trigger_softirq(void *data) 3661 { 3662 struct softnet_data *sd = data; 3663 3664 ____napi_schedule(sd, &sd->backlog); 3665 sd->received_rps++; 3666 } 3667 3668 #endif /* CONFIG_RPS */ 3669 3670 /* 3671 * Check if this softnet_data structure is another cpu one 3672 * If yes, queue it to our IPI list and return 1 3673 * If no, return 0 3674 */ 3675 static int rps_ipi_queued(struct softnet_data *sd) 3676 { 3677 #ifdef CONFIG_RPS 3678 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3679 3680 if (sd != mysd) { 3681 sd->rps_ipi_next = mysd->rps_ipi_list; 3682 mysd->rps_ipi_list = sd; 3683 3684 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3685 return 1; 3686 } 3687 #endif /* CONFIG_RPS */ 3688 return 0; 3689 } 3690 3691 #ifdef CONFIG_NET_FLOW_LIMIT 3692 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3693 #endif 3694 3695 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3696 { 3697 #ifdef CONFIG_NET_FLOW_LIMIT 3698 struct sd_flow_limit *fl; 3699 struct softnet_data *sd; 3700 unsigned int old_flow, new_flow; 3701 3702 if (qlen < (netdev_max_backlog >> 1)) 3703 return false; 3704 3705 sd = this_cpu_ptr(&softnet_data); 3706 3707 rcu_read_lock(); 3708 fl = rcu_dereference(sd->flow_limit); 3709 if (fl) { 3710 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3711 old_flow = fl->history[fl->history_head]; 3712 fl->history[fl->history_head] = new_flow; 3713 3714 fl->history_head++; 3715 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3716 3717 if (likely(fl->buckets[old_flow])) 3718 fl->buckets[old_flow]--; 3719 3720 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3721 fl->count++; 3722 rcu_read_unlock(); 3723 return true; 3724 } 3725 } 3726 rcu_read_unlock(); 3727 #endif 3728 return false; 3729 } 3730 3731 /* 3732 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3733 * queue (may be a remote CPU queue). 
3734 */ 3735 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3736 unsigned int *qtail) 3737 { 3738 struct softnet_data *sd; 3739 unsigned long flags; 3740 unsigned int qlen; 3741 3742 sd = &per_cpu(softnet_data, cpu); 3743 3744 local_irq_save(flags); 3745 3746 rps_lock(sd); 3747 if (!netif_running(skb->dev)) 3748 goto drop; 3749 qlen = skb_queue_len(&sd->input_pkt_queue); 3750 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3751 if (qlen) { 3752 enqueue: 3753 __skb_queue_tail(&sd->input_pkt_queue, skb); 3754 input_queue_tail_incr_save(sd, qtail); 3755 rps_unlock(sd); 3756 local_irq_restore(flags); 3757 return NET_RX_SUCCESS; 3758 } 3759 3760 /* Schedule NAPI for backlog device 3761 * We can use non atomic operation since we own the queue lock 3762 */ 3763 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3764 if (!rps_ipi_queued(sd)) 3765 ____napi_schedule(sd, &sd->backlog); 3766 } 3767 goto enqueue; 3768 } 3769 3770 drop: 3771 sd->dropped++; 3772 rps_unlock(sd); 3773 3774 local_irq_restore(flags); 3775 3776 atomic_long_inc(&skb->dev->rx_dropped); 3777 kfree_skb(skb); 3778 return NET_RX_DROP; 3779 } 3780 3781 static int netif_rx_internal(struct sk_buff *skb) 3782 { 3783 int ret; 3784 3785 net_timestamp_check(netdev_tstamp_prequeue, skb); 3786 3787 trace_netif_rx(skb); 3788 #ifdef CONFIG_RPS 3789 if (static_key_false(&rps_needed)) { 3790 struct rps_dev_flow voidflow, *rflow = &voidflow; 3791 int cpu; 3792 3793 preempt_disable(); 3794 rcu_read_lock(); 3795 3796 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3797 if (cpu < 0) 3798 cpu = smp_processor_id(); 3799 3800 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3801 3802 rcu_read_unlock(); 3803 preempt_enable(); 3804 } else 3805 #endif 3806 { 3807 unsigned int qtail; 3808 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3809 put_cpu(); 3810 } 3811 return ret; 3812 } 3813 3814 /** 3815 * netif_rx - post buffer to the network code 3816 * @skb: buffer to post 3817 * 3818 * This function receives a packet from a device driver and queues it for 3819 * the upper (protocol) levels to process. It always succeeds. The buffer 3820 * may be dropped during processing for congestion control or by the 3821 * protocol layers. 
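 *
 * Example (editor's illustrative sketch of a driver receive path running
 * in interrupt context; my_priv, my_copy_frame() and the length handling
 * are hypothetical driver details):
 *
 *	static void my_receive(struct net_device *dev, struct my_priv *priv,
 *			       unsigned int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb(dev, len);
 *
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		my_copy_frame(priv, skb_put(skb, len));
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *	}
 *
 * (Callers in process context normally use netif_rx_ni(), defined below,
 * so that any pending softirqs are run.)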
3822 * 3823 * return values: 3824 * NET_RX_SUCCESS (no congestion) 3825 * NET_RX_DROP (packet was dropped) 3826 * 3827 */ 3828 3829 int netif_rx(struct sk_buff *skb) 3830 { 3831 trace_netif_rx_entry(skb); 3832 3833 return netif_rx_internal(skb); 3834 } 3835 EXPORT_SYMBOL(netif_rx); 3836 3837 int netif_rx_ni(struct sk_buff *skb) 3838 { 3839 int err; 3840 3841 trace_netif_rx_ni_entry(skb); 3842 3843 preempt_disable(); 3844 err = netif_rx_internal(skb); 3845 if (local_softirq_pending()) 3846 do_softirq(); 3847 preempt_enable(); 3848 3849 return err; 3850 } 3851 EXPORT_SYMBOL(netif_rx_ni); 3852 3853 static void net_tx_action(struct softirq_action *h) 3854 { 3855 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3856 3857 if (sd->completion_queue) { 3858 struct sk_buff *clist; 3859 3860 local_irq_disable(); 3861 clist = sd->completion_queue; 3862 sd->completion_queue = NULL; 3863 local_irq_enable(); 3864 3865 while (clist) { 3866 struct sk_buff *skb = clist; 3867 clist = clist->next; 3868 3869 WARN_ON(atomic_read(&skb->users)); 3870 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3871 trace_consume_skb(skb); 3872 else 3873 trace_kfree_skb(skb, net_tx_action); 3874 3875 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) 3876 __kfree_skb(skb); 3877 else 3878 __kfree_skb_defer(skb); 3879 } 3880 3881 __kfree_skb_flush(); 3882 } 3883 3884 if (sd->output_queue) { 3885 struct Qdisc *head; 3886 3887 local_irq_disable(); 3888 head = sd->output_queue; 3889 sd->output_queue = NULL; 3890 sd->output_queue_tailp = &sd->output_queue; 3891 local_irq_enable(); 3892 3893 while (head) { 3894 struct Qdisc *q = head; 3895 spinlock_t *root_lock; 3896 3897 head = head->next_sched; 3898 3899 root_lock = qdisc_lock(q); 3900 if (spin_trylock(root_lock)) { 3901 smp_mb__before_atomic(); 3902 clear_bit(__QDISC_STATE_SCHED, 3903 &q->state); 3904 qdisc_run(q); 3905 spin_unlock(root_lock); 3906 } else { 3907 if (!test_bit(__QDISC_STATE_DEACTIVATED, 3908 &q->state)) { 3909 __netif_reschedule(q); 3910 } else { 3911 smp_mb__before_atomic(); 3912 clear_bit(__QDISC_STATE_SCHED, 3913 &q->state); 3914 } 3915 } 3916 } 3917 } 3918 } 3919 3920 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 3921 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 3922 /* This hook is defined here for ATM LANE */ 3923 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3924 unsigned char *addr) __read_mostly; 3925 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3926 #endif 3927 3928 static inline struct sk_buff * 3929 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, 3930 struct net_device *orig_dev) 3931 { 3932 #ifdef CONFIG_NET_CLS_ACT 3933 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); 3934 struct tcf_result cl_res; 3935 3936 /* If there's at least one ingress present somewhere (so 3937 * we get here via enabled static key), remaining devices 3938 * that are not configured with an ingress qdisc will bail 3939 * out here. 
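 *
 * (Editor's note: such a classifier list is attached from user space,
 * e.g. "tc qdisc add dev eth0 clsact" followed by
 * "tc filter add dev eth0 ingress ...".)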
3940 */ 3941 if (!cl) 3942 return skb; 3943 if (*pt_prev) { 3944 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3945 *pt_prev = NULL; 3946 } 3947 3948 qdisc_skb_cb(skb)->pkt_len = skb->len; 3949 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3950 qdisc_bstats_cpu_update(cl->q, skb); 3951 3952 switch (tc_classify(skb, cl, &cl_res, false)) { 3953 case TC_ACT_OK: 3954 case TC_ACT_RECLASSIFY: 3955 skb->tc_index = TC_H_MIN(cl_res.classid); 3956 break; 3957 case TC_ACT_SHOT: 3958 qdisc_qstats_cpu_drop(cl->q); 3959 kfree_skb(skb); 3960 return NULL; 3961 case TC_ACT_STOLEN: 3962 case TC_ACT_QUEUED: 3963 consume_skb(skb); 3964 return NULL; 3965 case TC_ACT_REDIRECT: 3966 /* skb_mac_header check was done by cls/act_bpf, so 3967 * we can safely push the L2 header back before 3968 * redirecting to another netdev 3969 */ 3970 __skb_push(skb, skb->mac_len); 3971 skb_do_redirect(skb); 3972 return NULL; 3973 default: 3974 break; 3975 } 3976 #endif /* CONFIG_NET_CLS_ACT */ 3977 return skb; 3978 } 3979 3980 /** 3981 * netdev_rx_handler_register - register receive handler 3982 * @dev: device to register a handler for 3983 * @rx_handler: receive handler to register 3984 * @rx_handler_data: data pointer that is used by rx handler 3985 * 3986 * Register a receive handler for a device. This handler will then be 3987 * called from __netif_receive_skb. A negative errno code is returned 3988 * on a failure. 3989 * 3990 * The caller must hold the rtnl_mutex. 3991 * 3992 * For a general description of rx_handler, see enum rx_handler_result. 3993 */ 3994 int netdev_rx_handler_register(struct net_device *dev, 3995 rx_handler_func_t *rx_handler, 3996 void *rx_handler_data) 3997 { 3998 ASSERT_RTNL(); 3999 4000 if (dev->rx_handler) 4001 return -EBUSY; 4002 4003 /* Note: rx_handler_data must be set before rx_handler */ 4004 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 4005 rcu_assign_pointer(dev->rx_handler, rx_handler); 4006 4007 return 0; 4008 } 4009 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 4010 4011 /** 4012 * netdev_rx_handler_unregister - unregister receive handler 4013 * @dev: device to unregister a handler from 4014 * 4015 * Unregister a receive handler from a device. 4016 * 4017 * The caller must hold the rtnl_mutex. 4018 */ 4019 void netdev_rx_handler_unregister(struct net_device *dev) 4020 { 4021 4022 ASSERT_RTNL(); 4023 RCU_INIT_POINTER(dev->rx_handler, NULL); 4024 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 4025 * section has a guarantee to see a non NULL rx_handler_data 4026 * as well. 4027 */ 4028 synchronize_net(); 4029 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 4030 } 4031 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 4032 4033 /* 4034 * Limit the use of PFMEMALLOC reserves to those protocols that implement 4035 * the special handling of PFMEMALLOC skbs. 
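 * In practice that means ARP, IPv4, IPv6 and the 802.1Q/802.1AD VLAN
 * protocols checked below; anything else arriving on an skb backed by
 * emergency reserves is dropped in __netif_receive_skb_core().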
4036 */ 4037 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 4038 { 4039 switch (skb->protocol) { 4040 case htons(ETH_P_ARP): 4041 case htons(ETH_P_IP): 4042 case htons(ETH_P_IPV6): 4043 case htons(ETH_P_8021Q): 4044 case htons(ETH_P_8021AD): 4045 return true; 4046 default: 4047 return false; 4048 } 4049 } 4050 4051 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, 4052 int *ret, struct net_device *orig_dev) 4053 { 4054 #ifdef CONFIG_NETFILTER_INGRESS 4055 if (nf_hook_ingress_active(skb)) { 4056 if (*pt_prev) { 4057 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4058 *pt_prev = NULL; 4059 } 4060 4061 return nf_hook_ingress(skb); 4062 } 4063 #endif /* CONFIG_NETFILTER_INGRESS */ 4064 return 0; 4065 } 4066 4067 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 4068 { 4069 struct packet_type *ptype, *pt_prev; 4070 rx_handler_func_t *rx_handler; 4071 struct net_device *orig_dev; 4072 bool deliver_exact = false; 4073 int ret = NET_RX_DROP; 4074 __be16 type; 4075 4076 net_timestamp_check(!netdev_tstamp_prequeue, skb); 4077 4078 trace_netif_receive_skb(skb); 4079 4080 orig_dev = skb->dev; 4081 4082 skb_reset_network_header(skb); 4083 if (!skb_transport_header_was_set(skb)) 4084 skb_reset_transport_header(skb); 4085 skb_reset_mac_len(skb); 4086 4087 pt_prev = NULL; 4088 4089 another_round: 4090 skb->skb_iif = skb->dev->ifindex; 4091 4092 __this_cpu_inc(softnet_data.processed); 4093 4094 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 4095 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 4096 skb = skb_vlan_untag(skb); 4097 if (unlikely(!skb)) 4098 goto out; 4099 } 4100 4101 #ifdef CONFIG_NET_CLS_ACT 4102 if (skb->tc_verd & TC_NCLS) { 4103 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 4104 goto ncls; 4105 } 4106 #endif 4107 4108 if (pfmemalloc) 4109 goto skip_taps; 4110 4111 list_for_each_entry_rcu(ptype, &ptype_all, list) { 4112 if (pt_prev) 4113 ret = deliver_skb(skb, pt_prev, orig_dev); 4114 pt_prev = ptype; 4115 } 4116 4117 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 4118 if (pt_prev) 4119 ret = deliver_skb(skb, pt_prev, orig_dev); 4120 pt_prev = ptype; 4121 } 4122 4123 skip_taps: 4124 #ifdef CONFIG_NET_INGRESS 4125 if (static_key_false(&ingress_needed)) { 4126 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 4127 if (!skb) 4128 goto out; 4129 4130 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) 4131 goto out; 4132 } 4133 #endif 4134 #ifdef CONFIG_NET_CLS_ACT 4135 skb->tc_verd = 0; 4136 ncls: 4137 #endif 4138 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4139 goto drop; 4140 4141 if (skb_vlan_tag_present(skb)) { 4142 if (pt_prev) { 4143 ret = deliver_skb(skb, pt_prev, orig_dev); 4144 pt_prev = NULL; 4145 } 4146 if (vlan_do_receive(&skb)) 4147 goto another_round; 4148 else if (unlikely(!skb)) 4149 goto out; 4150 } 4151 4152 rx_handler = rcu_dereference(skb->dev->rx_handler); 4153 if (rx_handler) { 4154 if (pt_prev) { 4155 ret = deliver_skb(skb, pt_prev, orig_dev); 4156 pt_prev = NULL; 4157 } 4158 switch (rx_handler(&skb)) { 4159 case RX_HANDLER_CONSUMED: 4160 ret = NET_RX_SUCCESS; 4161 goto out; 4162 case RX_HANDLER_ANOTHER: 4163 goto another_round; 4164 case RX_HANDLER_EXACT: 4165 deliver_exact = true; 4166 case RX_HANDLER_PASS: 4167 break; 4168 default: 4169 BUG(); 4170 } 4171 } 4172 4173 if (unlikely(skb_vlan_tag_present(skb))) { 4174 if (skb_vlan_tag_get_id(skb)) 4175 skb->pkt_type = PACKET_OTHERHOST; 4176 /* Note: we might in the future use prio bits 4177 * and set skb->priority like in vlan_do_receive() 
4178 * For the time being, just ignore Priority Code Point 4179 */ 4180 skb->vlan_tci = 0; 4181 } 4182 4183 type = skb->protocol; 4184 4185 /* deliver only exact match when indicated */ 4186 if (likely(!deliver_exact)) { 4187 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4188 &ptype_base[ntohs(type) & 4189 PTYPE_HASH_MASK]); 4190 } 4191 4192 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4193 &orig_dev->ptype_specific); 4194 4195 if (unlikely(skb->dev != orig_dev)) { 4196 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4197 &skb->dev->ptype_specific); 4198 } 4199 4200 if (pt_prev) { 4201 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 4202 goto drop; 4203 else 4204 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 4205 } else { 4206 drop: 4207 if (!deliver_exact) 4208 atomic_long_inc(&skb->dev->rx_dropped); 4209 else 4210 atomic_long_inc(&skb->dev->rx_nohandler); 4211 kfree_skb(skb); 4212 /* Jamal, now you will not able to escape explaining 4213 * me how you were going to use this. :-) 4214 */ 4215 ret = NET_RX_DROP; 4216 } 4217 4218 out: 4219 return ret; 4220 } 4221 4222 static int __netif_receive_skb(struct sk_buff *skb) 4223 { 4224 int ret; 4225 4226 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 4227 unsigned long pflags = current->flags; 4228 4229 /* 4230 * PFMEMALLOC skbs are special, they should 4231 * - be delivered to SOCK_MEMALLOC sockets only 4232 * - stay away from userspace 4233 * - have bounded memory usage 4234 * 4235 * Use PF_MEMALLOC as this saves us from propagating the allocation 4236 * context down to all allocation sites. 4237 */ 4238 current->flags |= PF_MEMALLOC; 4239 ret = __netif_receive_skb_core(skb, true); 4240 tsk_restore_flags(current, pflags, PF_MEMALLOC); 4241 } else 4242 ret = __netif_receive_skb_core(skb, false); 4243 4244 return ret; 4245 } 4246 4247 static int netif_receive_skb_internal(struct sk_buff *skb) 4248 { 4249 int ret; 4250 4251 net_timestamp_check(netdev_tstamp_prequeue, skb); 4252 4253 if (skb_defer_rx_timestamp(skb)) 4254 return NET_RX_SUCCESS; 4255 4256 rcu_read_lock(); 4257 4258 #ifdef CONFIG_RPS 4259 if (static_key_false(&rps_needed)) { 4260 struct rps_dev_flow voidflow, *rflow = &voidflow; 4261 int cpu = get_rps_cpu(skb->dev, skb, &rflow); 4262 4263 if (cpu >= 0) { 4264 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 4265 rcu_read_unlock(); 4266 return ret; 4267 } 4268 } 4269 #endif 4270 ret = __netif_receive_skb(skb); 4271 rcu_read_unlock(); 4272 return ret; 4273 } 4274 4275 /** 4276 * netif_receive_skb - process receive buffer from network 4277 * @skb: buffer to process 4278 * 4279 * netif_receive_skb() is the main receive data processing function. 4280 * It always succeeds. The buffer may be dropped during processing 4281 * for congestion control or by the protocol layers. 4282 * 4283 * This function may only be called from softirq context and interrupts 4284 * should be enabled. 4285 * 4286 * Return values (usually ignored): 4287 * NET_RX_SUCCESS: no congestion 4288 * NET_RX_DROP: packet was dropped 4289 */ 4290 int netif_receive_skb(struct sk_buff *skb) 4291 { 4292 trace_netif_receive_skb_entry(skb); 4293 4294 return netif_receive_skb_internal(skb); 4295 } 4296 EXPORT_SYMBOL(netif_receive_skb); 4297 4298 /* Network device is going away, flush any packets still pending 4299 * Called with irqs disabled. 
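 * Each invocation purges the calling CPU's own input_pkt_queue and
 * process_queue of packets that belong to the departing device.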
4300 */ 4301 static void flush_backlog(void *arg) 4302 { 4303 struct net_device *dev = arg; 4304 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4305 struct sk_buff *skb, *tmp; 4306 4307 rps_lock(sd); 4308 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4309 if (skb->dev == dev) { 4310 __skb_unlink(skb, &sd->input_pkt_queue); 4311 kfree_skb(skb); 4312 input_queue_head_incr(sd); 4313 } 4314 } 4315 rps_unlock(sd); 4316 4317 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4318 if (skb->dev == dev) { 4319 __skb_unlink(skb, &sd->process_queue); 4320 kfree_skb(skb); 4321 input_queue_head_incr(sd); 4322 } 4323 } 4324 } 4325 4326 static int napi_gro_complete(struct sk_buff *skb) 4327 { 4328 struct packet_offload *ptype; 4329 __be16 type = skb->protocol; 4330 struct list_head *head = &offload_base; 4331 int err = -ENOENT; 4332 4333 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 4334 4335 if (NAPI_GRO_CB(skb)->count == 1) { 4336 skb_shinfo(skb)->gso_size = 0; 4337 goto out; 4338 } 4339 4340 rcu_read_lock(); 4341 list_for_each_entry_rcu(ptype, head, list) { 4342 if (ptype->type != type || !ptype->callbacks.gro_complete) 4343 continue; 4344 4345 err = ptype->callbacks.gro_complete(skb, 0); 4346 break; 4347 } 4348 rcu_read_unlock(); 4349 4350 if (err) { 4351 WARN_ON(&ptype->list == head); 4352 kfree_skb(skb); 4353 return NET_RX_SUCCESS; 4354 } 4355 4356 out: 4357 return netif_receive_skb_internal(skb); 4358 } 4359 4360 /* napi->gro_list contains packets ordered by age. 4361 * youngest packets at the head of it. 4362 * Complete skbs in reverse order to reduce latencies. 4363 */ 4364 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 4365 { 4366 struct sk_buff *skb, *prev = NULL; 4367 4368 /* scan list and build reverse chain */ 4369 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 4370 skb->prev = prev; 4371 prev = skb; 4372 } 4373 4374 for (skb = prev; skb; skb = prev) { 4375 skb->next = NULL; 4376 4377 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 4378 return; 4379 4380 prev = skb->prev; 4381 napi_gro_complete(skb); 4382 napi->gro_count--; 4383 } 4384 4385 napi->gro_list = NULL; 4386 } 4387 EXPORT_SYMBOL(napi_gro_flush); 4388 4389 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 4390 { 4391 struct sk_buff *p; 4392 unsigned int maclen = skb->dev->hard_header_len; 4393 u32 hash = skb_get_hash_raw(skb); 4394 4395 for (p = napi->gro_list; p; p = p->next) { 4396 unsigned long diffs; 4397 4398 NAPI_GRO_CB(p)->flush = 0; 4399 4400 if (hash != skb_get_hash_raw(p)) { 4401 NAPI_GRO_CB(p)->same_flow = 0; 4402 continue; 4403 } 4404 4405 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 4406 diffs |= p->vlan_tci ^ skb->vlan_tci; 4407 diffs |= skb_metadata_dst_cmp(p, skb); 4408 if (maclen == ETH_HLEN) 4409 diffs |= compare_ether_header(skb_mac_header(p), 4410 skb_mac_header(skb)); 4411 else if (!diffs) 4412 diffs = memcmp(skb_mac_header(p), 4413 skb_mac_header(skb), 4414 maclen); 4415 NAPI_GRO_CB(p)->same_flow = !diffs; 4416 } 4417 } 4418 4419 static void skb_gro_reset_offset(struct sk_buff *skb) 4420 { 4421 const struct skb_shared_info *pinfo = skb_shinfo(skb); 4422 const skb_frag_t *frag0 = &pinfo->frags[0]; 4423 4424 NAPI_GRO_CB(skb)->data_offset = 0; 4425 NAPI_GRO_CB(skb)->frag0 = NULL; 4426 NAPI_GRO_CB(skb)->frag0_len = 0; 4427 4428 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 4429 pinfo->nr_frags && 4430 !PageHighMem(skb_frag_page(frag0))) { 4431 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4432 
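		/* The headers live entirely in the first page fragment, so
		 * record it and let GRO parse them in place via frag0
		 * instead of pulling them into the linear area first.
		 */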
NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 4433 } 4434 } 4435 4436 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 4437 { 4438 struct skb_shared_info *pinfo = skb_shinfo(skb); 4439 4440 BUG_ON(skb->end - skb->tail < grow); 4441 4442 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4443 4444 skb->data_len -= grow; 4445 skb->tail += grow; 4446 4447 pinfo->frags[0].page_offset += grow; 4448 skb_frag_size_sub(&pinfo->frags[0], grow); 4449 4450 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4451 skb_frag_unref(skb, 0); 4452 memmove(pinfo->frags, pinfo->frags + 1, 4453 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4454 } 4455 } 4456 4457 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4458 { 4459 struct sk_buff **pp = NULL; 4460 struct packet_offload *ptype; 4461 __be16 type = skb->protocol; 4462 struct list_head *head = &offload_base; 4463 int same_flow; 4464 enum gro_result ret; 4465 int grow; 4466 4467 if (!(skb->dev->features & NETIF_F_GRO)) 4468 goto normal; 4469 4470 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4471 goto normal; 4472 4473 gro_list_prepare(napi, skb); 4474 4475 rcu_read_lock(); 4476 list_for_each_entry_rcu(ptype, head, list) { 4477 if (ptype->type != type || !ptype->callbacks.gro_receive) 4478 continue; 4479 4480 skb_set_network_header(skb, skb_gro_offset(skb)); 4481 skb_reset_mac_len(skb); 4482 NAPI_GRO_CB(skb)->same_flow = 0; 4483 NAPI_GRO_CB(skb)->flush = 0; 4484 NAPI_GRO_CB(skb)->free = 0; 4485 NAPI_GRO_CB(skb)->encap_mark = 0; 4486 NAPI_GRO_CB(skb)->is_fou = 0; 4487 NAPI_GRO_CB(skb)->is_atomic = 1; 4488 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4489 4490 /* Setup for GRO checksum validation */ 4491 switch (skb->ip_summed) { 4492 case CHECKSUM_COMPLETE: 4493 NAPI_GRO_CB(skb)->csum = skb->csum; 4494 NAPI_GRO_CB(skb)->csum_valid = 1; 4495 NAPI_GRO_CB(skb)->csum_cnt = 0; 4496 break; 4497 case CHECKSUM_UNNECESSARY: 4498 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4499 NAPI_GRO_CB(skb)->csum_valid = 0; 4500 break; 4501 default: 4502 NAPI_GRO_CB(skb)->csum_cnt = 0; 4503 NAPI_GRO_CB(skb)->csum_valid = 0; 4504 } 4505 4506 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4507 break; 4508 } 4509 rcu_read_unlock(); 4510 4511 if (&ptype->list == head) 4512 goto normal; 4513 4514 same_flow = NAPI_GRO_CB(skb)->same_flow; 4515 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; 4516 4517 if (pp) { 4518 struct sk_buff *nskb = *pp; 4519 4520 *pp = nskb->next; 4521 nskb->next = NULL; 4522 napi_gro_complete(nskb); 4523 napi->gro_count--; 4524 } 4525 4526 if (same_flow) 4527 goto ok; 4528 4529 if (NAPI_GRO_CB(skb)->flush) 4530 goto normal; 4531 4532 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4533 struct sk_buff *nskb = napi->gro_list; 4534 4535 /* locate the end of the list to select the 'oldest' flow */ 4536 while (nskb->next) { 4537 pp = &nskb->next; 4538 nskb = *pp; 4539 } 4540 *pp = NULL; 4541 nskb->next = NULL; 4542 napi_gro_complete(nskb); 4543 } else { 4544 napi->gro_count++; 4545 } 4546 NAPI_GRO_CB(skb)->count = 1; 4547 NAPI_GRO_CB(skb)->age = jiffies; 4548 NAPI_GRO_CB(skb)->last = skb; 4549 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4550 skb->next = napi->gro_list; 4551 napi->gro_list = skb; 4552 ret = GRO_HELD; 4553 4554 pull: 4555 grow = skb_gro_offset(skb) - skb_headlen(skb); 4556 if (grow > 0) 4557 gro_pull_from_frag0(skb, grow); 4558 ok: 4559 return ret; 4560 4561 normal: 4562 ret = GRO_NORMAL; 4563 goto pull; 4564 } 4565 4566 struct packet_offload *gro_find_receive_by_type(__be16 type) 4567 { 4568 struct list_head *offload_head = &offload_base; 4569 struct packet_offload *ptype; 4570 4571 list_for_each_entry_rcu(ptype, offload_head, list) { 4572 if (ptype->type != type || !ptype->callbacks.gro_receive) 4573 continue; 4574 return ptype; 4575 } 4576 return NULL; 4577 } 4578 EXPORT_SYMBOL(gro_find_receive_by_type); 4579 4580 struct packet_offload *gro_find_complete_by_type(__be16 type) 4581 { 4582 struct list_head *offload_head = &offload_base; 4583 struct packet_offload *ptype; 4584 4585 list_for_each_entry_rcu(ptype, offload_head, list) { 4586 if (ptype->type != type || !ptype->callbacks.gro_complete) 4587 continue; 4588 return ptype; 4589 } 4590 return NULL; 4591 } 4592 EXPORT_SYMBOL(gro_find_complete_by_type); 4593 4594 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4595 { 4596 switch (ret) { 4597 case GRO_NORMAL: 4598 if (netif_receive_skb_internal(skb)) 4599 ret = GRO_DROP; 4600 break; 4601 4602 case GRO_DROP: 4603 kfree_skb(skb); 4604 break; 4605 4606 case GRO_MERGED_FREE: 4607 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { 4608 skb_dst_drop(skb); 4609 kmem_cache_free(skbuff_head_cache, skb); 4610 } else { 4611 __kfree_skb(skb); 4612 } 4613 break; 4614 4615 case GRO_HELD: 4616 case GRO_MERGED: 4617 break; 4618 } 4619 4620 return ret; 4621 } 4622 4623 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4624 { 4625 skb_mark_napi_id(skb, napi); 4626 trace_napi_gro_receive_entry(skb); 4627 4628 skb_gro_reset_offset(skb); 4629 4630 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4631 } 4632 EXPORT_SYMBOL(napi_gro_receive); 4633 4634 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4635 { 4636 if (unlikely(skb->pfmemalloc)) { 4637 consume_skb(skb); 4638 return; 4639 } 4640 __skb_pull(skb, skb_headlen(skb)); 4641 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4642 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4643 skb->vlan_tci = 0; 4644 skb->dev = napi->dev; 4645 skb->skb_iif = 0; 4646 skb->encapsulation = 0; 4647 skb_shinfo(skb)->gso_type = 0; 4648 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4649 4650 napi->skb = skb; 4651 } 4652 4653 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4654 { 4655 struct sk_buff *skb = napi->skb; 4656 4657 if (!skb) { 4658 skb = 
napi_alloc_skb(napi, GRO_MAX_HEAD); 4659 if (skb) { 4660 napi->skb = skb; 4661 skb_mark_napi_id(skb, napi); 4662 } 4663 } 4664 return skb; 4665 } 4666 EXPORT_SYMBOL(napi_get_frags); 4667 4668 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4669 struct sk_buff *skb, 4670 gro_result_t ret) 4671 { 4672 switch (ret) { 4673 case GRO_NORMAL: 4674 case GRO_HELD: 4675 __skb_push(skb, ETH_HLEN); 4676 skb->protocol = eth_type_trans(skb, skb->dev); 4677 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4678 ret = GRO_DROP; 4679 break; 4680 4681 case GRO_DROP: 4682 case GRO_MERGED_FREE: 4683 napi_reuse_skb(napi, skb); 4684 break; 4685 4686 case GRO_MERGED: 4687 break; 4688 } 4689 4690 return ret; 4691 } 4692 4693 /* Upper GRO stack assumes network header starts at gro_offset=0 4694 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4695 * We copy ethernet header into skb->data to have a common layout. 4696 */ 4697 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4698 { 4699 struct sk_buff *skb = napi->skb; 4700 const struct ethhdr *eth; 4701 unsigned int hlen = sizeof(*eth); 4702 4703 napi->skb = NULL; 4704 4705 skb_reset_mac_header(skb); 4706 skb_gro_reset_offset(skb); 4707 4708 eth = skb_gro_header_fast(skb, 0); 4709 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4710 eth = skb_gro_header_slow(skb, hlen, 0); 4711 if (unlikely(!eth)) { 4712 net_warn_ratelimited("%s: dropping impossible skb from %s\n", 4713 __func__, napi->dev->name); 4714 napi_reuse_skb(napi, skb); 4715 return NULL; 4716 } 4717 } else { 4718 gro_pull_from_frag0(skb, hlen); 4719 NAPI_GRO_CB(skb)->frag0 += hlen; 4720 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4721 } 4722 __skb_pull(skb, hlen); 4723 4724 /* 4725 * This works because the only protocols we care about don't require 4726 * special handling. 4727 * We'll fix it up properly in napi_frags_finish() 4728 */ 4729 skb->protocol = eth->h_proto; 4730 4731 return skb; 4732 } 4733 4734 gro_result_t napi_gro_frags(struct napi_struct *napi) 4735 { 4736 struct sk_buff *skb = napi_frags_skb(napi); 4737 4738 if (!skb) 4739 return GRO_DROP; 4740 4741 trace_napi_gro_frags_entry(skb); 4742 4743 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4744 } 4745 EXPORT_SYMBOL(napi_gro_frags); 4746 4747 /* Compute the checksum from gro_offset and return the folded value 4748 * after adding in any pseudo checksum. 4749 */ 4750 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4751 { 4752 __wsum wsum; 4753 __sum16 sum; 4754 4755 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4756 4757 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4758 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4759 if (likely(!sum)) { 4760 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4761 !skb->csum_complete_sw) 4762 netdev_rx_csum_fault(skb->dev); 4763 } 4764 4765 NAPI_GRO_CB(skb)->csum = wsum; 4766 NAPI_GRO_CB(skb)->csum_valid = 1; 4767 4768 return sum; 4769 } 4770 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4771 4772 /* 4773 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4774 * Note: called with local irq disabled, but exits with local irq enabled. 4775 */ 4776 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4777 { 4778 #ifdef CONFIG_RPS 4779 struct softnet_data *remsd = sd->rps_ipi_list; 4780 4781 if (remsd) { 4782 sd->rps_ipi_list = NULL; 4783 4784 local_irq_enable(); 4785 4786 /* Send pending IPI's to kick RPS processing on remote cpus. 
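 * Each entry on rps_ipi_list is the softnet_data of a remote CPU that
 * we queued packets for; the IPI makes it process its backlog.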
*/ 4787 while (remsd) { 4788 struct softnet_data *next = remsd->rps_ipi_next; 4789 4790 if (cpu_online(remsd->cpu)) 4791 smp_call_function_single_async(remsd->cpu, 4792 &remsd->csd); 4793 remsd = next; 4794 } 4795 } else 4796 #endif 4797 local_irq_enable(); 4798 } 4799 4800 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4801 { 4802 #ifdef CONFIG_RPS 4803 return sd->rps_ipi_list != NULL; 4804 #else 4805 return false; 4806 #endif 4807 } 4808 4809 static int process_backlog(struct napi_struct *napi, int quota) 4810 { 4811 int work = 0; 4812 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4813 4814 /* Check if we have pending IPIs; it is better to send them now 4815 * rather than waiting for net_rx_action() to end. 4816 */ 4817 if (sd_has_rps_ipi_waiting(sd)) { 4818 local_irq_disable(); 4819 net_rps_action_and_irq_enable(sd); 4820 } 4821 4822 napi->weight = weight_p; 4823 local_irq_disable(); 4824 while (1) { 4825 struct sk_buff *skb; 4826 4827 while ((skb = __skb_dequeue(&sd->process_queue))) { 4828 rcu_read_lock(); 4829 local_irq_enable(); 4830 __netif_receive_skb(skb); 4831 rcu_read_unlock(); 4832 local_irq_disable(); 4833 input_queue_head_incr(sd); 4834 if (++work >= quota) { 4835 local_irq_enable(); 4836 return work; 4837 } 4838 } 4839 4840 rps_lock(sd); 4841 if (skb_queue_empty(&sd->input_pkt_queue)) { 4842 /* 4843 * Inline a custom version of __napi_complete(). 4844 * Only the current cpu owns and manipulates this napi, 4845 * and NAPI_STATE_SCHED is the only possible flag set 4846 * on the backlog. 4847 * We can use a plain write instead of clear_bit(), 4848 * and we don't need an smp_mb() memory barrier. 4849 */ 4850 napi->state = 0; 4851 rps_unlock(sd); 4852 4853 break; 4854 } 4855 4856 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4857 &sd->process_queue); 4858 rps_unlock(sd); 4859 } 4860 local_irq_enable(); 4861 4862 return work; 4863 } 4864 4865 /** 4866 * __napi_schedule - schedule for receive 4867 * @n: entry to schedule 4868 * 4869 * The entry's receive function will be scheduled to run. 4870 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
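 *
 *	Illustrative sketch (priv and mydev_mask_rx_irqs() are hypothetical,
 *	not from this file); most drivers reach this through the
 *	napi_schedule() wrapper from their interrupt handler:
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydev_mask_rx_irqs(priv);
 *			__napi_schedule(&priv->napi);
 *		}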
4871 */ 4872 void __napi_schedule(struct napi_struct *n) 4873 { 4874 unsigned long flags; 4875 4876 local_irq_save(flags); 4877 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4878 local_irq_restore(flags); 4879 } 4880 EXPORT_SYMBOL(__napi_schedule); 4881 4882 /** 4883 * __napi_schedule_irqoff - schedule for receive 4884 * @n: entry to schedule 4885 * 4886 * Variant of __napi_schedule() assuming hard irqs are masked 4887 */ 4888 void __napi_schedule_irqoff(struct napi_struct *n) 4889 { 4890 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4891 } 4892 EXPORT_SYMBOL(__napi_schedule_irqoff); 4893 4894 void __napi_complete(struct napi_struct *n) 4895 { 4896 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4897 4898 list_del_init(&n->poll_list); 4899 smp_mb__before_atomic(); 4900 clear_bit(NAPI_STATE_SCHED, &n->state); 4901 } 4902 EXPORT_SYMBOL(__napi_complete); 4903 4904 void napi_complete_done(struct napi_struct *n, int work_done) 4905 { 4906 unsigned long flags; 4907 4908 /* 4909 * don't let napi dequeue from the cpu poll list 4910 * just in case its running on a different cpu 4911 */ 4912 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4913 return; 4914 4915 if (n->gro_list) { 4916 unsigned long timeout = 0; 4917 4918 if (work_done) 4919 timeout = n->dev->gro_flush_timeout; 4920 4921 if (timeout) 4922 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4923 HRTIMER_MODE_REL_PINNED); 4924 else 4925 napi_gro_flush(n, false); 4926 } 4927 if (likely(list_empty(&n->poll_list))) { 4928 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4929 } else { 4930 /* If n->poll_list is not empty, we need to mask irqs */ 4931 local_irq_save(flags); 4932 __napi_complete(n); 4933 local_irq_restore(flags); 4934 } 4935 } 4936 EXPORT_SYMBOL(napi_complete_done); 4937 4938 /* must be called under rcu_read_lock(), as we dont take a reference */ 4939 static struct napi_struct *napi_by_id(unsigned int napi_id) 4940 { 4941 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4942 struct napi_struct *napi; 4943 4944 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4945 if (napi->napi_id == napi_id) 4946 return napi; 4947 4948 return NULL; 4949 } 4950 4951 #if defined(CONFIG_NET_RX_BUSY_POLL) 4952 #define BUSY_POLL_BUDGET 8 4953 bool sk_busy_loop(struct sock *sk, int nonblock) 4954 { 4955 unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; 4956 int (*busy_poll)(struct napi_struct *dev); 4957 struct napi_struct *napi; 4958 int rc = false; 4959 4960 rcu_read_lock(); 4961 4962 napi = napi_by_id(sk->sk_napi_id); 4963 if (!napi) 4964 goto out; 4965 4966 /* Note: ndo_busy_poll method is optional in linux-4.5 */ 4967 busy_poll = napi->dev->netdev_ops->ndo_busy_poll; 4968 4969 do { 4970 rc = 0; 4971 local_bh_disable(); 4972 if (busy_poll) { 4973 rc = busy_poll(napi); 4974 } else if (napi_schedule_prep(napi)) { 4975 void *have = netpoll_poll_lock(napi); 4976 4977 if (test_bit(NAPI_STATE_SCHED, &napi->state)) { 4978 rc = napi->poll(napi, BUSY_POLL_BUDGET); 4979 trace_napi_poll(napi); 4980 if (rc == BUSY_POLL_BUDGET) { 4981 napi_complete_done(napi, rc); 4982 napi_schedule(napi); 4983 } 4984 } 4985 netpoll_poll_unlock(have); 4986 } 4987 if (rc > 0) 4988 __NET_ADD_STATS(sock_net(sk), 4989 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 4990 local_bh_enable(); 4991 4992 if (rc == LL_FLUSH_FAILED) 4993 break; /* permanent failure */ 4994 4995 cpu_relax(); 4996 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && 4997 !need_resched() && !busy_loop_timeout(end_time)); 4998 4999 rc = !skb_queue_empty(&sk->sk_receive_queue); 5000 out: 5001 rcu_read_unlock(); 5002 return rc; 5003 } 5004 EXPORT_SYMBOL(sk_busy_loop); 5005 5006 #endif /* CONFIG_NET_RX_BUSY_POLL */ 5007 5008 void napi_hash_add(struct napi_struct *napi) 5009 { 5010 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 5011 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 5012 return; 5013 5014 spin_lock(&napi_hash_lock); 5015 5016 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ 5017 do { 5018 if (unlikely(++napi_gen_id < NR_CPUS + 1)) 5019 napi_gen_id = NR_CPUS + 1; 5020 } while (napi_by_id(napi_gen_id)); 5021 napi->napi_id = napi_gen_id; 5022 5023 hlist_add_head_rcu(&napi->napi_hash_node, 5024 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 5025 5026 spin_unlock(&napi_hash_lock); 5027 } 5028 EXPORT_SYMBOL_GPL(napi_hash_add); 5029 5030 /* Warning : caller is responsible to make sure rcu grace period 5031 * is respected before freeing memory containing @napi 5032 */ 5033 bool napi_hash_del(struct napi_struct *napi) 5034 { 5035 bool rcu_sync_needed = false; 5036 5037 spin_lock(&napi_hash_lock); 5038 5039 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { 5040 rcu_sync_needed = true; 5041 hlist_del_rcu(&napi->napi_hash_node); 5042 } 5043 spin_unlock(&napi_hash_lock); 5044 return rcu_sync_needed; 5045 } 5046 EXPORT_SYMBOL_GPL(napi_hash_del); 5047 5048 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 5049 { 5050 struct napi_struct *napi; 5051 5052 napi = container_of(timer, struct napi_struct, timer); 5053 if (napi->gro_list) 5054 napi_schedule(napi); 5055 5056 return HRTIMER_NORESTART; 5057 } 5058 5059 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 5060 int (*poll)(struct napi_struct *, int), int weight) 5061 { 5062 INIT_LIST_HEAD(&napi->poll_list); 5063 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 5064 napi->timer.function = napi_watchdog; 5065 napi->gro_count = 0; 5066 napi->gro_list = NULL; 5067 napi->skb = NULL; 5068 napi->poll = poll; 5069 if (weight > NAPI_POLL_WEIGHT) 5070 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 5071 weight, dev->name); 5072 napi->weight = weight; 5073 list_add(&napi->dev_list, &dev->napi_list); 5074 napi->dev = dev; 5075 #ifdef CONFIG_NETPOLL 5076 spin_lock_init(&napi->poll_lock); 5077 napi->poll_owner = -1; 5078 #endif 
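	/* New instances start with NAPI_STATE_SCHED set so they cannot be
	 * scheduled or polled until the driver calls napi_enable().
	 */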
5079 set_bit(NAPI_STATE_SCHED, &napi->state); 5080 napi_hash_add(napi); 5081 } 5082 EXPORT_SYMBOL(netif_napi_add); 5083 5084 void napi_disable(struct napi_struct *n) 5085 { 5086 might_sleep(); 5087 set_bit(NAPI_STATE_DISABLE, &n->state); 5088 5089 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 5090 msleep(1); 5091 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) 5092 msleep(1); 5093 5094 hrtimer_cancel(&n->timer); 5095 5096 clear_bit(NAPI_STATE_DISABLE, &n->state); 5097 } 5098 EXPORT_SYMBOL(napi_disable); 5099 5100 /* Must be called in process context */ 5101 void netif_napi_del(struct napi_struct *napi) 5102 { 5103 might_sleep(); 5104 if (napi_hash_del(napi)) 5105 synchronize_net(); 5106 list_del_init(&napi->dev_list); 5107 napi_free_frags(napi); 5108 5109 kfree_skb_list(napi->gro_list); 5110 napi->gro_list = NULL; 5111 napi->gro_count = 0; 5112 } 5113 EXPORT_SYMBOL(netif_napi_del); 5114 5115 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 5116 { 5117 void *have; 5118 int work, weight; 5119 5120 list_del_init(&n->poll_list); 5121 5122 have = netpoll_poll_lock(n); 5123 5124 weight = n->weight; 5125 5126 /* This NAPI_STATE_SCHED test is for avoiding a race 5127 * with netpoll's poll_napi(). Only the entity which 5128 * obtains the lock and sees NAPI_STATE_SCHED set will 5129 * actually make the ->poll() call. Therefore we avoid 5130 * accidentally calling ->poll() when NAPI is not scheduled. 5131 */ 5132 work = 0; 5133 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 5134 work = n->poll(n, weight); 5135 trace_napi_poll(n); 5136 } 5137 5138 WARN_ON_ONCE(work > weight); 5139 5140 if (likely(work < weight)) 5141 goto out_unlock; 5142 5143 /* Drivers must not modify the NAPI state if they 5144 * consume the entire weight. In such cases this code 5145 * still "owns" the NAPI instance and therefore can 5146 * move the instance around on the list at-will. 5147 */ 5148 if (unlikely(napi_disable_pending(n))) { 5149 napi_complete(n); 5150 goto out_unlock; 5151 } 5152 5153 if (n->gro_list) { 5154 /* flush too old packets 5155 * If HZ < 1000, flush all packets. 5156 */ 5157 napi_gro_flush(n, HZ >= 1000); 5158 } 5159 5160 /* Some drivers may have called napi_schedule 5161 * prior to exhausting their budget. 5162 */ 5163 if (unlikely(!list_empty(&n->poll_list))) { 5164 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 5165 n->dev ? n->dev->name : "backlog"); 5166 goto out_unlock; 5167 } 5168 5169 list_add_tail(&n->poll_list, repoll); 5170 5171 out_unlock: 5172 netpoll_poll_unlock(have); 5173 5174 return work; 5175 } 5176 5177 static void net_rx_action(struct softirq_action *h) 5178 { 5179 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5180 unsigned long time_limit = jiffies + 2; 5181 int budget = netdev_budget; 5182 LIST_HEAD(list); 5183 LIST_HEAD(repoll); 5184 5185 local_irq_disable(); 5186 list_splice_init(&sd->poll_list, &list); 5187 local_irq_enable(); 5188 5189 for (;;) { 5190 struct napi_struct *n; 5191 5192 if (list_empty(&list)) { 5193 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5194 return; 5195 break; 5196 } 5197 5198 n = list_first_entry(&list, struct napi_struct, poll_list); 5199 budget -= napi_poll(n, &repoll); 5200 5201 /* If softirq window is exhausted then punt. 5202 * Allow this to run for 2 jiffies since which will allow 5203 * an average latency of 1.5/HZ. 
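 * The packet budget (netdev_budget) bounds the total work done per
 * invocation in the same way.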
5204 */ 5205 if (unlikely(budget <= 0 || 5206 time_after_eq(jiffies, time_limit))) { 5207 sd->time_squeeze++; 5208 break; 5209 } 5210 } 5211 5212 __kfree_skb_flush(); 5213 local_irq_disable(); 5214 5215 list_splice_tail_init(&sd->poll_list, &list); 5216 list_splice_tail(&repoll, &list); 5217 list_splice(&list, &sd->poll_list); 5218 if (!list_empty(&sd->poll_list)) 5219 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5220 5221 net_rps_action_and_irq_enable(sd); 5222 } 5223 5224 struct netdev_adjacent { 5225 struct net_device *dev; 5226 5227 /* upper master flag, there can only be one master device per list */ 5228 bool master; 5229 5230 /* counter for the number of times this device was added to us */ 5231 u16 ref_nr; 5232 5233 /* private field for the users */ 5234 void *private; 5235 5236 struct list_head list; 5237 struct rcu_head rcu; 5238 }; 5239 5240 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, 5241 struct list_head *adj_list) 5242 { 5243 struct netdev_adjacent *adj; 5244 5245 list_for_each_entry(adj, adj_list, list) { 5246 if (adj->dev == adj_dev) 5247 return adj; 5248 } 5249 return NULL; 5250 } 5251 5252 /** 5253 * netdev_has_upper_dev - Check if device is linked to an upper device 5254 * @dev: device 5255 * @upper_dev: upper device to check 5256 * 5257 * Find out if a device is linked to specified upper device and return true 5258 * in case it is. Note that this checks only immediate upper device, 5259 * not through a complete stack of devices. The caller must hold the RTNL lock. 5260 */ 5261 bool netdev_has_upper_dev(struct net_device *dev, 5262 struct net_device *upper_dev) 5263 { 5264 ASSERT_RTNL(); 5265 5266 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); 5267 } 5268 EXPORT_SYMBOL(netdev_has_upper_dev); 5269 5270 /** 5271 * netdev_has_any_upper_dev - Check if device is linked to some device 5272 * @dev: device 5273 * 5274 * Find out if a device is linked to an upper device and return true in case 5275 * it is. The caller must hold the RTNL lock. 5276 */ 5277 static bool netdev_has_any_upper_dev(struct net_device *dev) 5278 { 5279 ASSERT_RTNL(); 5280 5281 return !list_empty(&dev->all_adj_list.upper); 5282 } 5283 5284 /** 5285 * netdev_master_upper_dev_get - Get master upper device 5286 * @dev: device 5287 * 5288 * Find a master upper device and return pointer to it or NULL in case 5289 * it's not there. The caller must hold the RTNL lock. 5290 */ 5291 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 5292 { 5293 struct netdev_adjacent *upper; 5294 5295 ASSERT_RTNL(); 5296 5297 if (list_empty(&dev->adj_list.upper)) 5298 return NULL; 5299 5300 upper = list_first_entry(&dev->adj_list.upper, 5301 struct netdev_adjacent, list); 5302 if (likely(upper->master)) 5303 return upper->dev; 5304 return NULL; 5305 } 5306 EXPORT_SYMBOL(netdev_master_upper_dev_get); 5307 5308 void *netdev_adjacent_get_private(struct list_head *adj_list) 5309 { 5310 struct netdev_adjacent *adj; 5311 5312 adj = list_entry(adj_list, struct netdev_adjacent, list); 5313 5314 return adj->private; 5315 } 5316 EXPORT_SYMBOL(netdev_adjacent_get_private); 5317 5318 /** 5319 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 5320 * @dev: device 5321 * @iter: list_head ** of the current position 5322 * 5323 * Gets the next device from the dev's upper list, starting from iter 5324 * position. The caller must hold RCU read lock. 
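 *
 *	Illustrative sketch (handle_upper() is hypothetical); callers
 *	normally go through the netdev_for_each_upper_dev_rcu() helper
 *	rather than calling this directly:
 *
 *		struct list_head *iter;
 *		struct net_device *upper;
 *
 *		rcu_read_lock();
 *		netdev_for_each_upper_dev_rcu(dev, upper, iter)
 *			handle_upper(upper);
 *		rcu_read_unlock();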
5325 */ 5326 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 5327 struct list_head **iter) 5328 { 5329 struct netdev_adjacent *upper; 5330 5331 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5332 5333 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5334 5335 if (&upper->list == &dev->adj_list.upper) 5336 return NULL; 5337 5338 *iter = &upper->list; 5339 5340 return upper->dev; 5341 } 5342 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5343 5344 /** 5345 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 5346 * @dev: device 5347 * @iter: list_head ** of the current position 5348 * 5349 * Gets the next device from the dev's upper list, starting from iter 5350 * position. The caller must hold RCU read lock. 5351 */ 5352 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 5353 struct list_head **iter) 5354 { 5355 struct netdev_adjacent *upper; 5356 5357 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5358 5359 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5360 5361 if (&upper->list == &dev->all_adj_list.upper) 5362 return NULL; 5363 5364 *iter = &upper->list; 5365 5366 return upper->dev; 5367 } 5368 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 5369 5370 /** 5371 * netdev_lower_get_next_private - Get the next ->private from the 5372 * lower neighbour list 5373 * @dev: device 5374 * @iter: list_head ** of the current position 5375 * 5376 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5377 * list, starting from iter position. The caller must either hold the 5378 * RTNL lock or its own locking that guarantees that the neighbour lower 5379 * list will remain unchanged. 5380 */ 5381 void *netdev_lower_get_next_private(struct net_device *dev, 5382 struct list_head **iter) 5383 { 5384 struct netdev_adjacent *lower; 5385 5386 lower = list_entry(*iter, struct netdev_adjacent, list); 5387 5388 if (&lower->list == &dev->adj_list.lower) 5389 return NULL; 5390 5391 *iter = lower->list.next; 5392 5393 return lower->private; 5394 } 5395 EXPORT_SYMBOL(netdev_lower_get_next_private); 5396 5397 /** 5398 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5399 * lower neighbour list, RCU 5400 * variant 5401 * @dev: device 5402 * @iter: list_head ** of the current position 5403 * 5404 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5405 * list, starting from iter position. The caller must hold RCU read lock. 5406 */ 5407 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5408 struct list_head **iter) 5409 { 5410 struct netdev_adjacent *lower; 5411 5412 WARN_ON_ONCE(!rcu_read_lock_held()); 5413 5414 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5415 5416 if (&lower->list == &dev->adj_list.lower) 5417 return NULL; 5418 5419 *iter = &lower->list; 5420 5421 return lower->private; 5422 } 5423 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5424 5425 /** 5426 * netdev_lower_get_next - Get the next device from the lower neighbour 5427 * list 5428 * @dev: device 5429 * @iter: list_head ** of the current position 5430 * 5431 * Gets the next netdev_adjacent from the dev's lower neighbour 5432 * list, starting from iter position. The caller must hold the RTNL lock or 5433 * its own locking that guarantees that the neighbour lower 5434 * list will remain unchanged.
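 *
 *	Illustrative sketch (handle_lower() is hypothetical); the usual
 *	entry point is the netdev_for_each_lower_dev() helper, as used by
 *	dev_get_nest_level() later in this file:
 *
 *		struct list_head *iter;
 *		struct net_device *lower;
 *
 *		netdev_for_each_lower_dev(dev, lower, iter)
 *			handle_lower(lower);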
5435 */ 5436 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5437 { 5438 struct netdev_adjacent *lower; 5439 5440 lower = list_entry(*iter, struct netdev_adjacent, list); 5441 5442 if (&lower->list == &dev->adj_list.lower) 5443 return NULL; 5444 5445 *iter = lower->list.next; 5446 5447 return lower->dev; 5448 } 5449 EXPORT_SYMBOL(netdev_lower_get_next); 5450 5451 /** 5452 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5453 * lower neighbour list, RCU 5454 * variant 5455 * @dev: device 5456 * 5457 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5458 * list. The caller must hold RCU read lock. 5459 */ 5460 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5461 { 5462 struct netdev_adjacent *lower; 5463 5464 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5465 struct netdev_adjacent, list); 5466 if (lower) 5467 return lower->private; 5468 return NULL; 5469 } 5470 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5471 5472 /** 5473 * netdev_master_upper_dev_get_rcu - Get master upper device 5474 * @dev: device 5475 * 5476 * Find a master upper device and return pointer to it or NULL in case 5477 * it's not there. The caller must hold the RCU read lock. 5478 */ 5479 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5480 { 5481 struct netdev_adjacent *upper; 5482 5483 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5484 struct netdev_adjacent, list); 5485 if (upper && likely(upper->master)) 5486 return upper->dev; 5487 return NULL; 5488 } 5489 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5490 5491 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5492 struct net_device *adj_dev, 5493 struct list_head *dev_list) 5494 { 5495 char linkname[IFNAMSIZ+7]; 5496 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5497 "upper_%s" : "lower_%s", adj_dev->name); 5498 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5499 linkname); 5500 } 5501 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5502 char *name, 5503 struct list_head *dev_list) 5504 { 5505 char linkname[IFNAMSIZ+7]; 5506 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5507 "upper_%s" : "lower_%s", name); 5508 sysfs_remove_link(&(dev->dev.kobj), linkname); 5509 } 5510 5511 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5512 struct net_device *adj_dev, 5513 struct list_head *dev_list) 5514 { 5515 return (dev_list == &dev->adj_list.upper || 5516 dev_list == &dev->adj_list.lower) && 5517 net_eq(dev_net(dev), dev_net(adj_dev)); 5518 } 5519 5520 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5521 struct net_device *adj_dev, 5522 struct list_head *dev_list, 5523 void *private, bool master) 5524 { 5525 struct netdev_adjacent *adj; 5526 int ret; 5527 5528 adj = __netdev_find_adj(adj_dev, dev_list); 5529 5530 if (adj) { 5531 adj->ref_nr++; 5532 return 0; 5533 } 5534 5535 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5536 if (!adj) 5537 return -ENOMEM; 5538 5539 adj->dev = adj_dev; 5540 adj->master = master; 5541 adj->ref_nr = 1; 5542 adj->private = private; 5543 dev_hold(adj_dev); 5544 5545 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5546 adj_dev->name, dev->name, adj_dev->name); 5547 5548 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5549 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5550 if (ret) 5551 goto free_adj; 5552 } 5553 5554 /* Ensure that master link is always the first item in list. 
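 * netdev_master_upper_dev_get() and netdev_master_upper_dev_get_rcu()
 * rely on this ordering: they only inspect the first entry of
 * adj_list.upper.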
*/ 5555 if (master) { 5556 ret = sysfs_create_link(&(dev->dev.kobj), 5557 &(adj_dev->dev.kobj), "master"); 5558 if (ret) 5559 goto remove_symlinks; 5560 5561 list_add_rcu(&adj->list, dev_list); 5562 } else { 5563 list_add_tail_rcu(&adj->list, dev_list); 5564 } 5565 5566 return 0; 5567 5568 remove_symlinks: 5569 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5570 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5571 free_adj: 5572 kfree(adj); 5573 dev_put(adj_dev); 5574 5575 return ret; 5576 } 5577 5578 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5579 struct net_device *adj_dev, 5580 struct list_head *dev_list) 5581 { 5582 struct netdev_adjacent *adj; 5583 5584 adj = __netdev_find_adj(adj_dev, dev_list); 5585 5586 if (!adj) { 5587 pr_err("tried to remove device %s from %s\n", 5588 dev->name, adj_dev->name); 5589 BUG(); 5590 } 5591 5592 if (adj->ref_nr > 1) { 5593 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5594 adj->ref_nr-1); 5595 adj->ref_nr--; 5596 return; 5597 } 5598 5599 if (adj->master) 5600 sysfs_remove_link(&(dev->dev.kobj), "master"); 5601 5602 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5603 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5604 5605 list_del_rcu(&adj->list); 5606 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5607 adj_dev->name, dev->name, adj_dev->name); 5608 dev_put(adj_dev); 5609 kfree_rcu(adj, rcu); 5610 } 5611 5612 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5613 struct net_device *upper_dev, 5614 struct list_head *up_list, 5615 struct list_head *down_list, 5616 void *private, bool master) 5617 { 5618 int ret; 5619 5620 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5621 master); 5622 if (ret) 5623 return ret; 5624 5625 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5626 false); 5627 if (ret) { 5628 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5629 return ret; 5630 } 5631 5632 return 0; 5633 } 5634 5635 static int __netdev_adjacent_dev_link(struct net_device *dev, 5636 struct net_device *upper_dev) 5637 { 5638 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5639 &dev->all_adj_list.upper, 5640 &upper_dev->all_adj_list.lower, 5641 NULL, false); 5642 } 5643 5644 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5645 struct net_device *upper_dev, 5646 struct list_head *up_list, 5647 struct list_head *down_list) 5648 { 5649 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5650 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5651 } 5652 5653 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5654 struct net_device *upper_dev) 5655 { 5656 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5657 &dev->all_adj_list.upper, 5658 &upper_dev->all_adj_list.lower); 5659 } 5660 5661 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5662 struct net_device *upper_dev, 5663 void *private, bool master) 5664 { 5665 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5666 5667 if (ret) 5668 return ret; 5669 5670 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5671 &dev->adj_list.upper, 5672 &upper_dev->adj_list.lower, 5673 private, master); 5674 if (ret) { 5675 __netdev_adjacent_dev_unlink(dev, upper_dev); 5676 return ret; 5677 } 5678 5679 return 0; 5680 } 5681 5682 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5683 struct net_device *upper_dev) 5684 { 5685 
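	/* Drop the all_adj_list link first, then the direct (neighbour)
	 * adj_list link between dev and upper_dev.
	 */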
__netdev_adjacent_dev_unlink(dev, upper_dev); 5686 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5687 &dev->adj_list.upper, 5688 &upper_dev->adj_list.lower); 5689 } 5690 5691 static int __netdev_upper_dev_link(struct net_device *dev, 5692 struct net_device *upper_dev, bool master, 5693 void *upper_priv, void *upper_info) 5694 { 5695 struct netdev_notifier_changeupper_info changeupper_info; 5696 struct netdev_adjacent *i, *j, *to_i, *to_j; 5697 int ret = 0; 5698 5699 ASSERT_RTNL(); 5700 5701 if (dev == upper_dev) 5702 return -EBUSY; 5703 5704 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5705 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) 5706 return -EBUSY; 5707 5708 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) 5709 return -EEXIST; 5710 5711 if (master && netdev_master_upper_dev_get(dev)) 5712 return -EBUSY; 5713 5714 changeupper_info.upper_dev = upper_dev; 5715 changeupper_info.master = master; 5716 changeupper_info.linking = true; 5717 changeupper_info.upper_info = upper_info; 5718 5719 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5720 &changeupper_info.info); 5721 ret = notifier_to_errno(ret); 5722 if (ret) 5723 return ret; 5724 5725 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, 5726 master); 5727 if (ret) 5728 return ret; 5729 5730 /* Now that we linked these devs, make all the upper_dev's 5731 * all_adj_list.upper visible to every dev's all_adj_list.lower an 5732 * versa, and don't forget the devices itself. All of these 5733 * links are non-neighbours. 5734 */ 5735 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5736 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5737 pr_debug("Interlinking %s with %s, non-neighbour\n", 5738 i->dev->name, j->dev->name); 5739 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5740 if (ret) 5741 goto rollback_mesh; 5742 } 5743 } 5744 5745 /* add dev to every upper_dev's upper device */ 5746 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5747 pr_debug("linking %s's upper device %s with %s\n", 5748 upper_dev->name, i->dev->name, dev->name); 5749 ret = __netdev_adjacent_dev_link(dev, i->dev); 5750 if (ret) 5751 goto rollback_upper_mesh; 5752 } 5753 5754 /* add upper_dev to every dev's lower device */ 5755 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5756 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5757 i->dev->name, upper_dev->name); 5758 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5759 if (ret) 5760 goto rollback_lower_mesh; 5761 } 5762 5763 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5764 &changeupper_info.info); 5765 ret = notifier_to_errno(ret); 5766 if (ret) 5767 goto rollback_lower_mesh; 5768 5769 return 0; 5770 5771 rollback_lower_mesh: 5772 to_i = i; 5773 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5774 if (i == to_i) 5775 break; 5776 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5777 } 5778 5779 i = NULL; 5780 5781 rollback_upper_mesh: 5782 to_i = i; 5783 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5784 if (i == to_i) 5785 break; 5786 __netdev_adjacent_dev_unlink(dev, i->dev); 5787 } 5788 5789 i = j = NULL; 5790 5791 rollback_mesh: 5792 to_i = i; 5793 to_j = j; 5794 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5795 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5796 if (i == to_i && j == to_j) 5797 break; 5798 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5799 } 5800 if (i == to_i) 5801 break; 
5802 } 5803 5804 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5805 5806 return ret; 5807 } 5808 5809 /** 5810 * netdev_upper_dev_link - Add a link to the upper device 5811 * @dev: device 5812 * @upper_dev: new upper device 5813 * 5814 * Adds a link to device which is upper to this one. The caller must hold 5815 * the RTNL lock. On a failure a negative errno code is returned. 5816 * On success the reference counts are adjusted and the function 5817 * returns zero. 5818 */ 5819 int netdev_upper_dev_link(struct net_device *dev, 5820 struct net_device *upper_dev) 5821 { 5822 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); 5823 } 5824 EXPORT_SYMBOL(netdev_upper_dev_link); 5825 5826 /** 5827 * netdev_master_upper_dev_link - Add a master link to the upper device 5828 * @dev: device 5829 * @upper_dev: new upper device 5830 * @upper_priv: upper device private 5831 * @upper_info: upper info to be passed down via notifier 5832 * 5833 * Adds a link to device which is upper to this one. In this case, only 5834 * one master upper device can be linked, although other non-master devices 5835 * might be linked as well. The caller must hold the RTNL lock. 5836 * On a failure a negative errno code is returned. On success the reference 5837 * counts are adjusted and the function returns zero. 5838 */ 5839 int netdev_master_upper_dev_link(struct net_device *dev, 5840 struct net_device *upper_dev, 5841 void *upper_priv, void *upper_info) 5842 { 5843 return __netdev_upper_dev_link(dev, upper_dev, true, 5844 upper_priv, upper_info); 5845 } 5846 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5847 5848 /** 5849 * netdev_upper_dev_unlink - Removes a link to upper device 5850 * @dev: device 5851 * @upper_dev: new upper device 5852 * 5853 * Removes a link to device which is upper to this one. The caller must hold 5854 * the RTNL lock. 5855 */ 5856 void netdev_upper_dev_unlink(struct net_device *dev, 5857 struct net_device *upper_dev) 5858 { 5859 struct netdev_notifier_changeupper_info changeupper_info; 5860 struct netdev_adjacent *i, *j; 5861 ASSERT_RTNL(); 5862 5863 changeupper_info.upper_dev = upper_dev; 5864 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; 5865 changeupper_info.linking = false; 5866 5867 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5868 &changeupper_info.info); 5869 5870 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5871 5872 /* Here is the tricky part. We must remove all dev's lower 5873 * devices from all upper_dev's upper devices and vice 5874 * versa, to maintain the graph relationship. 5875 */ 5876 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5877 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) 5878 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5879 5880 /* remove also the devices itself from lower/upper device 5881 * list 5882 */ 5883 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5884 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5885 5886 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) 5887 __netdev_adjacent_dev_unlink(dev, i->dev); 5888 5889 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5890 &changeupper_info.info); 5891 } 5892 EXPORT_SYMBOL(netdev_upper_dev_unlink); 5893 5894 /** 5895 * netdev_bonding_info_change - Dispatch event about slave change 5896 * @dev: device 5897 * @bonding_info: info to dispatch 5898 * 5899 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 5900 * The caller must hold the RTNL lock. 
5901 */ 5902 void netdev_bonding_info_change(struct net_device *dev, 5903 struct netdev_bonding_info *bonding_info) 5904 { 5905 struct netdev_notifier_bonding_info info; 5906 5907 memcpy(&info.bonding_info, bonding_info, 5908 sizeof(struct netdev_bonding_info)); 5909 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 5910 &info.info); 5911 } 5912 EXPORT_SYMBOL(netdev_bonding_info_change); 5913 5914 static void netdev_adjacent_add_links(struct net_device *dev) 5915 { 5916 struct netdev_adjacent *iter; 5917 5918 struct net *net = dev_net(dev); 5919 5920 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5921 if (!net_eq(net,dev_net(iter->dev))) 5922 continue; 5923 netdev_adjacent_sysfs_add(iter->dev, dev, 5924 &iter->dev->adj_list.lower); 5925 netdev_adjacent_sysfs_add(dev, iter->dev, 5926 &dev->adj_list.upper); 5927 } 5928 5929 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5930 if (!net_eq(net,dev_net(iter->dev))) 5931 continue; 5932 netdev_adjacent_sysfs_add(iter->dev, dev, 5933 &iter->dev->adj_list.upper); 5934 netdev_adjacent_sysfs_add(dev, iter->dev, 5935 &dev->adj_list.lower); 5936 } 5937 } 5938 5939 static void netdev_adjacent_del_links(struct net_device *dev) 5940 { 5941 struct netdev_adjacent *iter; 5942 5943 struct net *net = dev_net(dev); 5944 5945 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5946 if (!net_eq(net,dev_net(iter->dev))) 5947 continue; 5948 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5949 &iter->dev->adj_list.lower); 5950 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5951 &dev->adj_list.upper); 5952 } 5953 5954 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5955 if (!net_eq(net,dev_net(iter->dev))) 5956 continue; 5957 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5958 &iter->dev->adj_list.upper); 5959 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5960 &dev->adj_list.lower); 5961 } 5962 } 5963 5964 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 5965 { 5966 struct netdev_adjacent *iter; 5967 5968 struct net *net = dev_net(dev); 5969 5970 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5971 if (!net_eq(net,dev_net(iter->dev))) 5972 continue; 5973 netdev_adjacent_sysfs_del(iter->dev, oldname, 5974 &iter->dev->adj_list.lower); 5975 netdev_adjacent_sysfs_add(iter->dev, dev, 5976 &iter->dev->adj_list.lower); 5977 } 5978 5979 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5980 if (!net_eq(net,dev_net(iter->dev))) 5981 continue; 5982 netdev_adjacent_sysfs_del(iter->dev, oldname, 5983 &iter->dev->adj_list.upper); 5984 netdev_adjacent_sysfs_add(iter->dev, dev, 5985 &iter->dev->adj_list.upper); 5986 } 5987 } 5988 5989 void *netdev_lower_dev_get_private(struct net_device *dev, 5990 struct net_device *lower_dev) 5991 { 5992 struct netdev_adjacent *lower; 5993 5994 if (!lower_dev) 5995 return NULL; 5996 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); 5997 if (!lower) 5998 return NULL; 5999 6000 return lower->private; 6001 } 6002 EXPORT_SYMBOL(netdev_lower_dev_get_private); 6003 6004 6005 int dev_get_nest_level(struct net_device *dev, 6006 bool (*type_check)(const struct net_device *dev)) 6007 { 6008 struct net_device *lower = NULL; 6009 struct list_head *iter; 6010 int max_nest = -1; 6011 int nest; 6012 6013 ASSERT_RTNL(); 6014 6015 netdev_for_each_lower_dev(dev, lower, iter) { 6016 nest = dev_get_nest_level(lower, type_check); 6017 if (max_nest < nest) 6018 max_nest = nest; 6019 } 6020 6021 if (type_check(dev)) 6022 max_nest++; 6023 6024 return max_nest; 6025 } 6026 
EXPORT_SYMBOL(dev_get_nest_level); 6027 6028 /** 6029 * netdev_lower_change - Dispatch event about lower device state change 6030 * @lower_dev: device 6031 * @lower_state_info: state to dispatch 6032 * 6033 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. 6034 * The caller must hold the RTNL lock. 6035 */ 6036 void netdev_lower_state_changed(struct net_device *lower_dev, 6037 void *lower_state_info) 6038 { 6039 struct netdev_notifier_changelowerstate_info changelowerstate_info; 6040 6041 ASSERT_RTNL(); 6042 changelowerstate_info.lower_state_info = lower_state_info; 6043 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, 6044 &changelowerstate_info.info); 6045 } 6046 EXPORT_SYMBOL(netdev_lower_state_changed); 6047 6048 static void dev_change_rx_flags(struct net_device *dev, int flags) 6049 { 6050 const struct net_device_ops *ops = dev->netdev_ops; 6051 6052 if (ops->ndo_change_rx_flags) 6053 ops->ndo_change_rx_flags(dev, flags); 6054 } 6055 6056 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 6057 { 6058 unsigned int old_flags = dev->flags; 6059 kuid_t uid; 6060 kgid_t gid; 6061 6062 ASSERT_RTNL(); 6063 6064 dev->flags |= IFF_PROMISC; 6065 dev->promiscuity += inc; 6066 if (dev->promiscuity == 0) { 6067 /* 6068 * Avoid overflow. 6069 * If inc causes overflow, untouch promisc and return error. 6070 */ 6071 if (inc < 0) 6072 dev->flags &= ~IFF_PROMISC; 6073 else { 6074 dev->promiscuity -= inc; 6075 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 6076 dev->name); 6077 return -EOVERFLOW; 6078 } 6079 } 6080 if (dev->flags != old_flags) { 6081 pr_info("device %s %s promiscuous mode\n", 6082 dev->name, 6083 dev->flags & IFF_PROMISC ? "entered" : "left"); 6084 if (audit_enabled) { 6085 current_uid_gid(&uid, &gid); 6086 audit_log(current->audit_context, GFP_ATOMIC, 6087 AUDIT_ANOM_PROMISCUOUS, 6088 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 6089 dev->name, (dev->flags & IFF_PROMISC), 6090 (old_flags & IFF_PROMISC), 6091 from_kuid(&init_user_ns, audit_get_loginuid(current)), 6092 from_kuid(&init_user_ns, uid), 6093 from_kgid(&init_user_ns, gid), 6094 audit_get_sessionid(current)); 6095 } 6096 6097 dev_change_rx_flags(dev, IFF_PROMISC); 6098 } 6099 if (notify) 6100 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 6101 return 0; 6102 } 6103 6104 /** 6105 * dev_set_promiscuity - update promiscuity count on a device 6106 * @dev: device 6107 * @inc: modifier 6108 * 6109 * Add or remove promiscuity from a device. While the count in the device 6110 * remains above zero the interface remains promiscuous. Once it hits zero 6111 * the device reverts back to normal filtering operation. A negative inc 6112 * value is used to drop promiscuity on the device. 6113 * Return 0 if successful or a negative errno code on error. 6114 */ 6115 int dev_set_promiscuity(struct net_device *dev, int inc) 6116 { 6117 unsigned int old_flags = dev->flags; 6118 int err; 6119 6120 err = __dev_set_promiscuity(dev, inc, true); 6121 if (err < 0) 6122 return err; 6123 if (dev->flags != old_flags) 6124 dev_set_rx_mode(dev); 6125 return err; 6126 } 6127 EXPORT_SYMBOL(dev_set_promiscuity); 6128 6129 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 6130 { 6131 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 6132 6133 ASSERT_RTNL(); 6134 6135 dev->flags |= IFF_ALLMULTI; 6136 dev->allmulti += inc; 6137 if (dev->allmulti == 0) { 6138 /* 6139 * Avoid overflow. 
6140 * If inc causes overflow, untouch allmulti and return error. 6141 */ 6142 if (inc < 0) 6143 dev->flags &= ~IFF_ALLMULTI; 6144 else { 6145 dev->allmulti -= inc; 6146 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", 6147 dev->name); 6148 return -EOVERFLOW; 6149 } 6150 } 6151 if (dev->flags ^ old_flags) { 6152 dev_change_rx_flags(dev, IFF_ALLMULTI); 6153 dev_set_rx_mode(dev); 6154 if (notify) 6155 __dev_notify_flags(dev, old_flags, 6156 dev->gflags ^ old_gflags); 6157 } 6158 return 0; 6159 } 6160 6161 /** 6162 * dev_set_allmulti - update allmulti count on a device 6163 * @dev: device 6164 * @inc: modifier 6165 * 6166 * Add or remove reception of all multicast frames to a device. While the 6167 * count in the device remains above zero the interface remains listening 6168 * to all interfaces. Once it hits zero the device reverts back to normal 6169 * filtering operation. A negative @inc value is used to drop the counter 6170 * when releasing a resource needing all multicasts. 6171 * Return 0 if successful or a negative errno code on error. 6172 */ 6173 6174 int dev_set_allmulti(struct net_device *dev, int inc) 6175 { 6176 return __dev_set_allmulti(dev, inc, true); 6177 } 6178 EXPORT_SYMBOL(dev_set_allmulti); 6179 6180 /* 6181 * Upload unicast and multicast address lists to device and 6182 * configure RX filtering. When the device doesn't support unicast 6183 * filtering it is put in promiscuous mode while unicast addresses 6184 * are present. 6185 */ 6186 void __dev_set_rx_mode(struct net_device *dev) 6187 { 6188 const struct net_device_ops *ops = dev->netdev_ops; 6189 6190 /* dev_open will call this function so the list will stay sane. */ 6191 if (!(dev->flags&IFF_UP)) 6192 return; 6193 6194 if (!netif_device_present(dev)) 6195 return; 6196 6197 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 6198 /* Unicast addresses changes may only happen under the rtnl, 6199 * therefore calling __dev_set_promiscuity here is safe. 6200 */ 6201 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 6202 __dev_set_promiscuity(dev, 1, false); 6203 dev->uc_promisc = true; 6204 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 6205 __dev_set_promiscuity(dev, -1, false); 6206 dev->uc_promisc = false; 6207 } 6208 } 6209 6210 if (ops->ndo_set_rx_mode) 6211 ops->ndo_set_rx_mode(dev); 6212 } 6213 6214 void dev_set_rx_mode(struct net_device *dev) 6215 { 6216 netif_addr_lock_bh(dev); 6217 __dev_set_rx_mode(dev); 6218 netif_addr_unlock_bh(dev); 6219 } 6220 6221 /** 6222 * dev_get_flags - get flags reported to userspace 6223 * @dev: device 6224 * 6225 * Get the combination of flag bits exported through APIs to userspace. 6226 */ 6227 unsigned int dev_get_flags(const struct net_device *dev) 6228 { 6229 unsigned int flags; 6230 6231 flags = (dev->flags & ~(IFF_PROMISC | 6232 IFF_ALLMULTI | 6233 IFF_RUNNING | 6234 IFF_LOWER_UP | 6235 IFF_DORMANT)) | 6236 (dev->gflags & (IFF_PROMISC | 6237 IFF_ALLMULTI)); 6238 6239 if (netif_running(dev)) { 6240 if (netif_oper_up(dev)) 6241 flags |= IFF_RUNNING; 6242 if (netif_carrier_ok(dev)) 6243 flags |= IFF_LOWER_UP; 6244 if (netif_dormant(dev)) 6245 flags |= IFF_DORMANT; 6246 } 6247 6248 return flags; 6249 } 6250 EXPORT_SYMBOL(dev_get_flags); 6251 6252 int __dev_change_flags(struct net_device *dev, unsigned int flags) 6253 { 6254 unsigned int old_flags = dev->flags; 6255 int ret; 6256 6257 ASSERT_RTNL(); 6258 6259 /* 6260 * Set the flags on our device. 
6261 */ 6262 6263 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 6264 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 6265 IFF_AUTOMEDIA)) | 6266 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 6267 IFF_ALLMULTI)); 6268 6269 /* 6270 * Load in the correct multicast list now the flags have changed. 6271 */ 6272 6273 if ((old_flags ^ flags) & IFF_MULTICAST) 6274 dev_change_rx_flags(dev, IFF_MULTICAST); 6275 6276 dev_set_rx_mode(dev); 6277 6278 /* 6279 * Have we downed the interface. We handle IFF_UP ourselves 6280 * according to user attempts to set it, rather than blindly 6281 * setting it. 6282 */ 6283 6284 ret = 0; 6285 if ((old_flags ^ flags) & IFF_UP) 6286 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 6287 6288 if ((flags ^ dev->gflags) & IFF_PROMISC) { 6289 int inc = (flags & IFF_PROMISC) ? 1 : -1; 6290 unsigned int old_flags = dev->flags; 6291 6292 dev->gflags ^= IFF_PROMISC; 6293 6294 if (__dev_set_promiscuity(dev, inc, false) >= 0) 6295 if (dev->flags != old_flags) 6296 dev_set_rx_mode(dev); 6297 } 6298 6299 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 6300 is important. Some (broken) drivers set IFF_PROMISC, when 6301 IFF_ALLMULTI is requested not asking us and not reporting. 6302 */ 6303 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 6304 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 6305 6306 dev->gflags ^= IFF_ALLMULTI; 6307 __dev_set_allmulti(dev, inc, false); 6308 } 6309 6310 return ret; 6311 } 6312 6313 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 6314 unsigned int gchanges) 6315 { 6316 unsigned int changes = dev->flags ^ old_flags; 6317 6318 if (gchanges) 6319 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 6320 6321 if (changes & IFF_UP) { 6322 if (dev->flags & IFF_UP) 6323 call_netdevice_notifiers(NETDEV_UP, dev); 6324 else 6325 call_netdevice_notifiers(NETDEV_DOWN, dev); 6326 } 6327 6328 if (dev->flags & IFF_UP && 6329 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 6330 struct netdev_notifier_change_info change_info; 6331 6332 change_info.flags_changed = changes; 6333 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 6334 &change_info.info); 6335 } 6336 } 6337 6338 /** 6339 * dev_change_flags - change device settings 6340 * @dev: device 6341 * @flags: device state flags 6342 * 6343 * Change settings on device based state flags. The flags are 6344 * in the userspace exported format. 6345 */ 6346 int dev_change_flags(struct net_device *dev, unsigned int flags) 6347 { 6348 int ret; 6349 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 6350 6351 ret = __dev_change_flags(dev, flags); 6352 if (ret < 0) 6353 return ret; 6354 6355 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 6356 __dev_notify_flags(dev, old_flags, changes); 6357 return ret; 6358 } 6359 EXPORT_SYMBOL(dev_change_flags); 6360 6361 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 6362 { 6363 const struct net_device_ops *ops = dev->netdev_ops; 6364 6365 if (ops->ndo_change_mtu) 6366 return ops->ndo_change_mtu(dev, new_mtu); 6367 6368 dev->mtu = new_mtu; 6369 return 0; 6370 } 6371 6372 /** 6373 * dev_set_mtu - Change maximum transfer unit 6374 * @dev: device 6375 * @new_mtu: new transfer unit 6376 * 6377 * Change the maximum transfer size of the network device. 6378 */ 6379 int dev_set_mtu(struct net_device *dev, int new_mtu) 6380 { 6381 int err, orig_mtu; 6382 6383 if (new_mtu == dev->mtu) 6384 return 0; 6385 6386 /* MTU must be positive. 
*/ 6387 if (new_mtu < 0) 6388 return -EINVAL; 6389 6390 if (!netif_device_present(dev)) 6391 return -ENODEV; 6392 6393 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 6394 err = notifier_to_errno(err); 6395 if (err) 6396 return err; 6397 6398 orig_mtu = dev->mtu; 6399 err = __dev_set_mtu(dev, new_mtu); 6400 6401 if (!err) { 6402 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6403 err = notifier_to_errno(err); 6404 if (err) { 6405 /* setting mtu back and notifying everyone again, 6406 * so that they have a chance to revert changes. 6407 */ 6408 __dev_set_mtu(dev, orig_mtu); 6409 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6410 } 6411 } 6412 return err; 6413 } 6414 EXPORT_SYMBOL(dev_set_mtu); 6415 6416 /** 6417 * dev_set_group - Change group this device belongs to 6418 * @dev: device 6419 * @new_group: group this device should belong to 6420 */ 6421 void dev_set_group(struct net_device *dev, int new_group) 6422 { 6423 dev->group = new_group; 6424 } 6425 EXPORT_SYMBOL(dev_set_group); 6426 6427 /** 6428 * dev_set_mac_address - Change Media Access Control Address 6429 * @dev: device 6430 * @sa: new address 6431 * 6432 * Change the hardware (MAC) address of the device 6433 */ 6434 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 6435 { 6436 const struct net_device_ops *ops = dev->netdev_ops; 6437 int err; 6438 6439 if (!ops->ndo_set_mac_address) 6440 return -EOPNOTSUPP; 6441 if (sa->sa_family != dev->type) 6442 return -EINVAL; 6443 if (!netif_device_present(dev)) 6444 return -ENODEV; 6445 err = ops->ndo_set_mac_address(dev, sa); 6446 if (err) 6447 return err; 6448 dev->addr_assign_type = NET_ADDR_SET; 6449 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 6450 add_device_randomness(dev->dev_addr, dev->addr_len); 6451 return 0; 6452 } 6453 EXPORT_SYMBOL(dev_set_mac_address); 6454 6455 /** 6456 * dev_change_carrier - Change device carrier 6457 * @dev: device 6458 * @new_carrier: new value 6459 * 6460 * Change device carrier 6461 */ 6462 int dev_change_carrier(struct net_device *dev, bool new_carrier) 6463 { 6464 const struct net_device_ops *ops = dev->netdev_ops; 6465 6466 if (!ops->ndo_change_carrier) 6467 return -EOPNOTSUPP; 6468 if (!netif_device_present(dev)) 6469 return -ENODEV; 6470 return ops->ndo_change_carrier(dev, new_carrier); 6471 } 6472 EXPORT_SYMBOL(dev_change_carrier); 6473 6474 /** 6475 * dev_get_phys_port_id - Get device physical port ID 6476 * @dev: device 6477 * @ppid: port ID 6478 * 6479 * Get device physical port ID 6480 */ 6481 int dev_get_phys_port_id(struct net_device *dev, 6482 struct netdev_phys_item_id *ppid) 6483 { 6484 const struct net_device_ops *ops = dev->netdev_ops; 6485 6486 if (!ops->ndo_get_phys_port_id) 6487 return -EOPNOTSUPP; 6488 return ops->ndo_get_phys_port_id(dev, ppid); 6489 } 6490 EXPORT_SYMBOL(dev_get_phys_port_id); 6491 6492 /** 6493 * dev_get_phys_port_name - Get device physical port name 6494 * @dev: device 6495 * @name: port name 6496 * @len: limit of bytes to copy to name 6497 * 6498 * Get device physical port name 6499 */ 6500 int dev_get_phys_port_name(struct net_device *dev, 6501 char *name, size_t len) 6502 { 6503 const struct net_device_ops *ops = dev->netdev_ops; 6504 6505 if (!ops->ndo_get_phys_port_name) 6506 return -EOPNOTSUPP; 6507 return ops->ndo_get_phys_port_name(dev, name, len); 6508 } 6509 EXPORT_SYMBOL(dev_get_phys_port_name); 6510 6511 /** 6512 * dev_change_proto_down - update protocol port state information 6513 * @dev: device 6514 * @proto_down: new value 6515 * 6516 * This info can be 
used by switch drivers to set the phys state of the 6517 * port. 6518 */ 6519 int dev_change_proto_down(struct net_device *dev, bool proto_down) 6520 { 6521 const struct net_device_ops *ops = dev->netdev_ops; 6522 6523 if (!ops->ndo_change_proto_down) 6524 return -EOPNOTSUPP; 6525 if (!netif_device_present(dev)) 6526 return -ENODEV; 6527 return ops->ndo_change_proto_down(dev, proto_down); 6528 } 6529 EXPORT_SYMBOL(dev_change_proto_down); 6530 6531 /** 6532 * dev_new_index - allocate an ifindex 6533 * @net: the applicable net namespace 6534 * 6535 * Returns a suitable unique value for a new device interface 6536 * number. The caller must hold the rtnl semaphore or the 6537 * dev_base_lock to be sure it remains unique. 6538 */ 6539 static int dev_new_index(struct net *net) 6540 { 6541 int ifindex = net->ifindex; 6542 for (;;) { 6543 if (++ifindex <= 0) 6544 ifindex = 1; 6545 if (!__dev_get_by_index(net, ifindex)) 6546 return net->ifindex = ifindex; 6547 } 6548 } 6549 6550 /* Delayed registration/unregisteration */ 6551 static LIST_HEAD(net_todo_list); 6552 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 6553 6554 static void net_set_todo(struct net_device *dev) 6555 { 6556 list_add_tail(&dev->todo_list, &net_todo_list); 6557 dev_net(dev)->dev_unreg_count++; 6558 } 6559 6560 static void rollback_registered_many(struct list_head *head) 6561 { 6562 struct net_device *dev, *tmp; 6563 LIST_HEAD(close_head); 6564 6565 BUG_ON(dev_boot_phase); 6566 ASSERT_RTNL(); 6567 6568 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 6569 /* Some devices call without registering 6570 * for initialization unwind. Remove those 6571 * devices and proceed with the remaining. 6572 */ 6573 if (dev->reg_state == NETREG_UNINITIALIZED) { 6574 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 6575 dev->name, dev); 6576 6577 WARN_ON(1); 6578 list_del(&dev->unreg_list); 6579 continue; 6580 } 6581 dev->dismantle = true; 6582 BUG_ON(dev->reg_state != NETREG_REGISTERED); 6583 } 6584 6585 /* If device is running, close it first. */ 6586 list_for_each_entry(dev, head, unreg_list) 6587 list_add_tail(&dev->close_list, &close_head); 6588 dev_close_many(&close_head, true); 6589 6590 list_for_each_entry(dev, head, unreg_list) { 6591 /* And unlink it from device chain. */ 6592 unlist_netdevice(dev); 6593 6594 dev->reg_state = NETREG_UNREGISTERING; 6595 on_each_cpu(flush_backlog, dev, 1); 6596 } 6597 6598 synchronize_net(); 6599 6600 list_for_each_entry(dev, head, unreg_list) { 6601 struct sk_buff *skb = NULL; 6602 6603 /* Shutdown queueing discipline. */ 6604 dev_shutdown(dev); 6605 6606 6607 /* Notify protocols, that we are about to destroy 6608 this device. They should clean all the things. 6609 */ 6610 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6611 6612 if (!dev->rtnl_link_ops || 6613 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6614 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6615 GFP_KERNEL); 6616 6617 /* 6618 * Flush the unicast and multicast chains 6619 */ 6620 dev_uc_flush(dev); 6621 dev_mc_flush(dev); 6622 6623 if (dev->netdev_ops->ndo_uninit) 6624 dev->netdev_ops->ndo_uninit(dev); 6625 6626 if (skb) 6627 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6628 6629 /* Notifier chain MUST detach us all upper devices. 
*/ 6630 WARN_ON(netdev_has_any_upper_dev(dev)); 6631 6632 /* Remove entries from kobject tree */ 6633 netdev_unregister_kobject(dev); 6634 #ifdef CONFIG_XPS 6635 /* Remove XPS queueing entries */ 6636 netif_reset_xps_queues_gt(dev, 0); 6637 #endif 6638 } 6639 6640 synchronize_net(); 6641 6642 list_for_each_entry(dev, head, unreg_list) 6643 dev_put(dev); 6644 } 6645 6646 static void rollback_registered(struct net_device *dev) 6647 { 6648 LIST_HEAD(single); 6649 6650 list_add(&dev->unreg_list, &single); 6651 rollback_registered_many(&single); 6652 list_del(&single); 6653 } 6654 6655 static netdev_features_t netdev_sync_upper_features(struct net_device *lower, 6656 struct net_device *upper, netdev_features_t features) 6657 { 6658 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 6659 netdev_features_t feature; 6660 int feature_bit; 6661 6662 for_each_netdev_feature(&upper_disables, feature_bit) { 6663 feature = __NETIF_F_BIT(feature_bit); 6664 if (!(upper->wanted_features & feature) 6665 && (features & feature)) { 6666 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", 6667 &feature, upper->name); 6668 features &= ~feature; 6669 } 6670 } 6671 6672 return features; 6673 } 6674 6675 static void netdev_sync_lower_features(struct net_device *upper, 6676 struct net_device *lower, netdev_features_t features) 6677 { 6678 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 6679 netdev_features_t feature; 6680 int feature_bit; 6681 6682 for_each_netdev_feature(&upper_disables, feature_bit) { 6683 feature = __NETIF_F_BIT(feature_bit); 6684 if (!(features & feature) && (lower->features & feature)) { 6685 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", 6686 &feature, lower->name); 6687 lower->wanted_features &= ~feature; 6688 netdev_update_features(lower); 6689 6690 if (unlikely(lower->features & feature)) 6691 netdev_WARN(upper, "failed to disable %pNF on %s!\n", 6692 &feature, lower->name); 6693 } 6694 } 6695 } 6696 6697 static netdev_features_t netdev_fix_features(struct net_device *dev, 6698 netdev_features_t features) 6699 { 6700 /* Fix illegal checksum combinations */ 6701 if ((features & NETIF_F_HW_CSUM) && 6702 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6703 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6704 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6705 } 6706 6707 /* TSO requires that SG is present as well. */ 6708 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6709 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6710 features &= ~NETIF_F_ALL_TSO; 6711 } 6712 6713 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6714 !(features & NETIF_F_IP_CSUM)) { 6715 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6716 features &= ~NETIF_F_TSO; 6717 features &= ~NETIF_F_TSO_ECN; 6718 } 6719 6720 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6721 !(features & NETIF_F_IPV6_CSUM)) { 6722 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6723 features &= ~NETIF_F_TSO6; 6724 } 6725 6726 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ 6727 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) 6728 features &= ~NETIF_F_TSO_MANGLEID; 6729 6730 /* TSO ECN requires that TSO is present as well. */ 6731 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6732 features &= ~NETIF_F_TSO_ECN; 6733 6734 /* Software GSO depends on SG. 
*/ 6735 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6736 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6737 features &= ~NETIF_F_GSO; 6738 } 6739 6740 /* UFO needs SG and checksumming */ 6741 if (features & NETIF_F_UFO) { 6742 /* maybe split UFO into V4 and V6? */ 6743 if (!(features & NETIF_F_HW_CSUM) && 6744 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != 6745 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { 6746 netdev_dbg(dev, 6747 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6748 features &= ~NETIF_F_UFO; 6749 } 6750 6751 if (!(features & NETIF_F_SG)) { 6752 netdev_dbg(dev, 6753 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6754 features &= ~NETIF_F_UFO; 6755 } 6756 } 6757 6758 /* GSO partial features require GSO partial be set */ 6759 if ((features & dev->gso_partial_features) && 6760 !(features & NETIF_F_GSO_PARTIAL)) { 6761 netdev_dbg(dev, 6762 "Dropping partially supported GSO features since no GSO partial.\n"); 6763 features &= ~dev->gso_partial_features; 6764 } 6765 6766 #ifdef CONFIG_NET_RX_BUSY_POLL 6767 if (dev->netdev_ops->ndo_busy_poll) 6768 features |= NETIF_F_BUSY_POLL; 6769 else 6770 #endif 6771 features &= ~NETIF_F_BUSY_POLL; 6772 6773 return features; 6774 } 6775 6776 int __netdev_update_features(struct net_device *dev) 6777 { 6778 struct net_device *upper, *lower; 6779 netdev_features_t features; 6780 struct list_head *iter; 6781 int err = -1; 6782 6783 ASSERT_RTNL(); 6784 6785 features = netdev_get_wanted_features(dev); 6786 6787 if (dev->netdev_ops->ndo_fix_features) 6788 features = dev->netdev_ops->ndo_fix_features(dev, features); 6789 6790 /* driver might be less strict about feature dependencies */ 6791 features = netdev_fix_features(dev, features); 6792 6793 /* some features can't be enabled if they're off an an upper device */ 6794 netdev_for_each_upper_dev_rcu(dev, upper, iter) 6795 features = netdev_sync_upper_features(dev, upper, features); 6796 6797 if (dev->features == features) 6798 goto sync_lower; 6799 6800 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6801 &dev->features, &features); 6802 6803 if (dev->netdev_ops->ndo_set_features) 6804 err = dev->netdev_ops->ndo_set_features(dev, features); 6805 else 6806 err = 0; 6807 6808 if (unlikely(err < 0)) { 6809 netdev_err(dev, 6810 "set_features() failed (%d); wanted %pNF, left %pNF\n", 6811 err, &features, &dev->features); 6812 /* return non-0 since some features might have changed and 6813 * it's better to fire a spurious notification than miss it 6814 */ 6815 return -1; 6816 } 6817 6818 sync_lower: 6819 /* some features must be disabled on lower devices when disabled 6820 * on an upper device (think: bonding master or bridge) 6821 */ 6822 netdev_for_each_lower_dev(dev, lower, iter) 6823 netdev_sync_lower_features(dev, lower, features); 6824 6825 if (!err) 6826 dev->features = features; 6827 6828 return err < 0 ? 0 : 1; 6829 } 6830 6831 /** 6832 * netdev_update_features - recalculate device features 6833 * @dev: the device to check 6834 * 6835 * Recalculate dev->features set and send notifications if it 6836 * has changed. Should be called after driver or hardware dependent 6837 * conditions might have changed that influence the features. 
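 *
 * Minimal sketch of the usual driver pattern (assumed names, not from this
 * file): after firmware or link changes alter what the hardware can offload,
 * re-evaluate the feature set under RTNL. hw_lost_tso is a placeholder
 * condition; ethtool handlers already run with RTNL held and would skip the
 * explicit locking.
 *
 *	rtnl_lock();
 *	if (hw_lost_tso)
 *		dev->hw_features &= ~NETIF_F_ALL_TSO;
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 *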
6838 */ 6839 void netdev_update_features(struct net_device *dev) 6840 { 6841 if (__netdev_update_features(dev)) 6842 netdev_features_change(dev); 6843 } 6844 EXPORT_SYMBOL(netdev_update_features); 6845 6846 /** 6847 * netdev_change_features - recalculate device features 6848 * @dev: the device to check 6849 * 6850 * Recalculate dev->features set and send notifications even 6851 * if they have not changed. Should be called instead of 6852 * netdev_update_features() if also dev->vlan_features might 6853 * have changed to allow the changes to be propagated to stacked 6854 * VLAN devices. 6855 */ 6856 void netdev_change_features(struct net_device *dev) 6857 { 6858 __netdev_update_features(dev); 6859 netdev_features_change(dev); 6860 } 6861 EXPORT_SYMBOL(netdev_change_features); 6862 6863 /** 6864 * netif_stacked_transfer_operstate - transfer operstate 6865 * @rootdev: the root or lower level device to transfer state from 6866 * @dev: the device to transfer operstate to 6867 * 6868 * Transfer operational state from root to device. This is normally 6869 * called when a stacking relationship exists between the root 6870 * device and the device(a leaf device). 6871 */ 6872 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 6873 struct net_device *dev) 6874 { 6875 if (rootdev->operstate == IF_OPER_DORMANT) 6876 netif_dormant_on(dev); 6877 else 6878 netif_dormant_off(dev); 6879 6880 if (netif_carrier_ok(rootdev)) { 6881 if (!netif_carrier_ok(dev)) 6882 netif_carrier_on(dev); 6883 } else { 6884 if (netif_carrier_ok(dev)) 6885 netif_carrier_off(dev); 6886 } 6887 } 6888 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 6889 6890 #ifdef CONFIG_SYSFS 6891 static int netif_alloc_rx_queues(struct net_device *dev) 6892 { 6893 unsigned int i, count = dev->num_rx_queues; 6894 struct netdev_rx_queue *rx; 6895 size_t sz = count * sizeof(*rx); 6896 6897 BUG_ON(count < 1); 6898 6899 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6900 if (!rx) { 6901 rx = vzalloc(sz); 6902 if (!rx) 6903 return -ENOMEM; 6904 } 6905 dev->_rx = rx; 6906 6907 for (i = 0; i < count; i++) 6908 rx[i].dev = dev; 6909 return 0; 6910 } 6911 #endif 6912 6913 static void netdev_init_one_queue(struct net_device *dev, 6914 struct netdev_queue *queue, void *_unused) 6915 { 6916 /* Initialize queue lock */ 6917 spin_lock_init(&queue->_xmit_lock); 6918 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 6919 queue->xmit_lock_owner = -1; 6920 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 6921 queue->dev = dev; 6922 #ifdef CONFIG_BQL 6923 dql_init(&queue->dql, HZ); 6924 #endif 6925 } 6926 6927 static void netif_free_tx_queues(struct net_device *dev) 6928 { 6929 kvfree(dev->_tx); 6930 } 6931 6932 static int netif_alloc_netdev_queues(struct net_device *dev) 6933 { 6934 unsigned int count = dev->num_tx_queues; 6935 struct netdev_queue *tx; 6936 size_t sz = count * sizeof(*tx); 6937 6938 if (count < 1 || count > 0xffff) 6939 return -EINVAL; 6940 6941 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6942 if (!tx) { 6943 tx = vzalloc(sz); 6944 if (!tx) 6945 return -ENOMEM; 6946 } 6947 dev->_tx = tx; 6948 6949 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 6950 spin_lock_init(&dev->tx_global_lock); 6951 6952 return 0; 6953 } 6954 6955 void netif_tx_stop_all_queues(struct net_device *dev) 6956 { 6957 unsigned int i; 6958 6959 for (i = 0; i < dev->num_tx_queues; i++) { 6960 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 6961 netif_tx_stop_queue(txq); 6962 } 6963 } 6964 
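/* Illustrative usage, not part of the original file: a driver about to
 * reset its hardware typically quiesces transmit first and wakes the queues
 * once the reset is done. my_hw_reset() and priv are hypothetical driver
 * names.
 *
 *	netif_tx_stop_all_queues(dev);
 *	netif_carrier_off(dev);
 *	my_hw_reset(priv);
 *	netif_carrier_on(dev);
 *	netif_tx_wake_all_queues(dev);
 */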
EXPORT_SYMBOL(netif_tx_stop_all_queues); 6965 6966 /** 6967 * register_netdevice - register a network device 6968 * @dev: device to register 6969 * 6970 * Take a completed network device structure and add it to the kernel 6971 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6972 * chain. 0 is returned on success. A negative errno code is returned 6973 * on a failure to set up the device, or if the name is a duplicate. 6974 * 6975 * Callers must hold the rtnl semaphore. You may want 6976 * register_netdev() instead of this. 6977 * 6978 * BUGS: 6979 * The locking appears insufficient to guarantee two parallel registers 6980 * will not get the same name. 6981 */ 6982 6983 int register_netdevice(struct net_device *dev) 6984 { 6985 int ret; 6986 struct net *net = dev_net(dev); 6987 6988 BUG_ON(dev_boot_phase); 6989 ASSERT_RTNL(); 6990 6991 might_sleep(); 6992 6993 /* When net_device's are persistent, this will be fatal. */ 6994 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 6995 BUG_ON(!net); 6996 6997 spin_lock_init(&dev->addr_list_lock); 6998 netdev_set_addr_lockdep_class(dev); 6999 7000 ret = dev_get_valid_name(net, dev, dev->name); 7001 if (ret < 0) 7002 goto out; 7003 7004 /* Init, if this function is available */ 7005 if (dev->netdev_ops->ndo_init) { 7006 ret = dev->netdev_ops->ndo_init(dev); 7007 if (ret) { 7008 if (ret > 0) 7009 ret = -EIO; 7010 goto out; 7011 } 7012 } 7013 7014 if (((dev->hw_features | dev->features) & 7015 NETIF_F_HW_VLAN_CTAG_FILTER) && 7016 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 7017 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 7018 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 7019 ret = -EINVAL; 7020 goto err_uninit; 7021 } 7022 7023 ret = -EBUSY; 7024 if (!dev->ifindex) 7025 dev->ifindex = dev_new_index(net); 7026 else if (__dev_get_by_index(net, dev->ifindex)) 7027 goto err_uninit; 7028 7029 /* Transfer changeable features to wanted_features and enable 7030 * software offloads (GSO and GRO). 7031 */ 7032 dev->hw_features |= NETIF_F_SOFT_FEATURES; 7033 dev->features |= NETIF_F_SOFT_FEATURES; 7034 dev->wanted_features = dev->features & dev->hw_features; 7035 7036 if (!(dev->flags & IFF_LOOPBACK)) 7037 dev->hw_features |= NETIF_F_NOCACHE_COPY; 7038 7039 /* If IPv4 TCP segmentation offload is supported we should also 7040 * allow the device to enable segmenting the frame with the option 7041 * of ignoring a static IP ID value. This doesn't enable the 7042 * feature itself but allows the user to enable it later. 7043 */ 7044 if (dev->hw_features & NETIF_F_TSO) 7045 dev->hw_features |= NETIF_F_TSO_MANGLEID; 7046 if (dev->vlan_features & NETIF_F_TSO) 7047 dev->vlan_features |= NETIF_F_TSO_MANGLEID; 7048 if (dev->mpls_features & NETIF_F_TSO) 7049 dev->mpls_features |= NETIF_F_TSO_MANGLEID; 7050 if (dev->hw_enc_features & NETIF_F_TSO) 7051 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; 7052 7053 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 7054 */ 7055 dev->vlan_features |= NETIF_F_HIGHDMA; 7056 7057 /* Make NETIF_F_SG inheritable to tunnel devices. 7058 */ 7059 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; 7060 7061 /* Make NETIF_F_SG inheritable to MPLS. 
7062 */ 7063 dev->mpls_features |= NETIF_F_SG; 7064 7065 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 7066 ret = notifier_to_errno(ret); 7067 if (ret) 7068 goto err_uninit; 7069 7070 ret = netdev_register_kobject(dev); 7071 if (ret) 7072 goto err_uninit; 7073 dev->reg_state = NETREG_REGISTERED; 7074 7075 __netdev_update_features(dev); 7076 7077 /* 7078 * Default initial state at registry is that the 7079 * device is present. 7080 */ 7081 7082 set_bit(__LINK_STATE_PRESENT, &dev->state); 7083 7084 linkwatch_init_dev(dev); 7085 7086 dev_init_scheduler(dev); 7087 dev_hold(dev); 7088 list_netdevice(dev); 7089 add_device_randomness(dev->dev_addr, dev->addr_len); 7090 7091 /* If the device has permanent device address, driver should 7092 * set dev_addr and also addr_assign_type should be set to 7093 * NET_ADDR_PERM (default value). 7094 */ 7095 if (dev->addr_assign_type == NET_ADDR_PERM) 7096 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 7097 7098 /* Notify protocols, that a new device appeared. */ 7099 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 7100 ret = notifier_to_errno(ret); 7101 if (ret) { 7102 rollback_registered(dev); 7103 dev->reg_state = NETREG_UNREGISTERED; 7104 } 7105 /* 7106 * Prevent userspace races by waiting until the network 7107 * device is fully setup before sending notifications. 7108 */ 7109 if (!dev->rtnl_link_ops || 7110 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 7111 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7112 7113 out: 7114 return ret; 7115 7116 err_uninit: 7117 if (dev->netdev_ops->ndo_uninit) 7118 dev->netdev_ops->ndo_uninit(dev); 7119 goto out; 7120 } 7121 EXPORT_SYMBOL(register_netdevice); 7122 7123 /** 7124 * init_dummy_netdev - init a dummy network device for NAPI 7125 * @dev: device to init 7126 * 7127 * This takes a network device structure and initialize the minimum 7128 * amount of fields so it can be used to schedule NAPI polls without 7129 * registering a full blown interface. This is to be used by drivers 7130 * that need to tie several hardware interfaces to a single NAPI 7131 * poll scheduler due to HW limitations. 7132 */ 7133 int init_dummy_netdev(struct net_device *dev) 7134 { 7135 /* Clear everything. Note we don't initialize spinlocks 7136 * are they aren't supposed to be taken by any of the 7137 * NAPI code and this dummy netdev is supposed to be 7138 * only ever used for NAPI polls 7139 */ 7140 memset(dev, 0, sizeof(struct net_device)); 7141 7142 /* make sure we BUG if trying to hit standard 7143 * register/unregister code path 7144 */ 7145 dev->reg_state = NETREG_DUMMY; 7146 7147 /* NAPI wants this */ 7148 INIT_LIST_HEAD(&dev->napi_list); 7149 7150 /* a dummy interface is started by default */ 7151 set_bit(__LINK_STATE_PRESENT, &dev->state); 7152 set_bit(__LINK_STATE_START, &dev->state); 7153 7154 /* Note : We dont allocate pcpu_refcnt for dummy devices, 7155 * because users of this 'device' dont need to change 7156 * its refcount. 7157 */ 7158 7159 return 0; 7160 } 7161 EXPORT_SYMBOL_GPL(init_dummy_netdev); 7162 7163 7164 /** 7165 * register_netdev - register a network device 7166 * @dev: device to register 7167 * 7168 * Take a completed network device structure and add it to the kernel 7169 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 7170 * chain. 0 is returned on success. A negative errno code is returned 7171 * on a failure to set up the device, or if the name is a duplicate. 
7172 * 7173 * This is a wrapper around register_netdevice that takes the rtnl semaphore 7174 * and expands the device name if you passed a format string to 7175 * alloc_netdev. 7176 */ 7177 int register_netdev(struct net_device *dev) 7178 { 7179 int err; 7180 7181 rtnl_lock(); 7182 err = register_netdevice(dev); 7183 rtnl_unlock(); 7184 return err; 7185 } 7186 EXPORT_SYMBOL(register_netdev); 7187 7188 int netdev_refcnt_read(const struct net_device *dev) 7189 { 7190 int i, refcnt = 0; 7191 7192 for_each_possible_cpu(i) 7193 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 7194 return refcnt; 7195 } 7196 EXPORT_SYMBOL(netdev_refcnt_read); 7197 7198 /** 7199 * netdev_wait_allrefs - wait until all references are gone. 7200 * @dev: target net_device 7201 * 7202 * This is called when unregistering network devices. 7203 * 7204 * Any protocol or device that holds a reference should register 7205 * for netdevice notification, and cleanup and put back the 7206 * reference if they receive an UNREGISTER event. 7207 * We can get stuck here if buggy protocols don't correctly 7208 * call dev_put. 7209 */ 7210 static void netdev_wait_allrefs(struct net_device *dev) 7211 { 7212 unsigned long rebroadcast_time, warning_time; 7213 int refcnt; 7214 7215 linkwatch_forget_dev(dev); 7216 7217 rebroadcast_time = warning_time = jiffies; 7218 refcnt = netdev_refcnt_read(dev); 7219 7220 while (refcnt != 0) { 7221 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 7222 rtnl_lock(); 7223 7224 /* Rebroadcast unregister notification */ 7225 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7226 7227 __rtnl_unlock(); 7228 rcu_barrier(); 7229 rtnl_lock(); 7230 7231 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7232 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 7233 &dev->state)) { 7234 /* We must not have linkwatch events 7235 * pending on unregister. If this 7236 * happens, we simply run the queue 7237 * unscheduled, resulting in a noop 7238 * for this device. 7239 */ 7240 linkwatch_run_queue(); 7241 } 7242 7243 __rtnl_unlock(); 7244 7245 rebroadcast_time = jiffies; 7246 } 7247 7248 msleep(250); 7249 7250 refcnt = netdev_refcnt_read(dev); 7251 7252 if (time_after(jiffies, warning_time + 10 * HZ)) { 7253 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 7254 dev->name, refcnt); 7255 warning_time = jiffies; 7256 } 7257 } 7258 } 7259 7260 /* The sequence is: 7261 * 7262 * rtnl_lock(); 7263 * ... 7264 * register_netdevice(x1); 7265 * register_netdevice(x2); 7266 * ... 7267 * unregister_netdevice(y1); 7268 * unregister_netdevice(y2); 7269 * ... 7270 * rtnl_unlock(); 7271 * free_netdev(y1); 7272 * free_netdev(y2); 7273 * 7274 * We are invoked by rtnl_unlock(). 7275 * This allows us to deal with problems: 7276 * 1) We can delete sysfs objects which invoke hotplug 7277 * without deadlocking with linkwatch via keventd. 7278 * 2) Since we run with the RTNL semaphore not held, we can sleep 7279 * safely in order to wait for the netdev refcnt to drop to zero. 7280 * 7281 * We must not return until all unregister events added during 7282 * the interval the lock was held have been completed. 
7283 */ 7284 void netdev_run_todo(void) 7285 { 7286 struct list_head list; 7287 7288 /* Snapshot list, allow later requests */ 7289 list_replace_init(&net_todo_list, &list); 7290 7291 __rtnl_unlock(); 7292 7293 7294 /* Wait for rcu callbacks to finish before next phase */ 7295 if (!list_empty(&list)) 7296 rcu_barrier(); 7297 7298 while (!list_empty(&list)) { 7299 struct net_device *dev 7300 = list_first_entry(&list, struct net_device, todo_list); 7301 list_del(&dev->todo_list); 7302 7303 rtnl_lock(); 7304 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7305 __rtnl_unlock(); 7306 7307 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 7308 pr_err("network todo '%s' but state %d\n", 7309 dev->name, dev->reg_state); 7310 dump_stack(); 7311 continue; 7312 } 7313 7314 dev->reg_state = NETREG_UNREGISTERED; 7315 7316 netdev_wait_allrefs(dev); 7317 7318 /* paranoia */ 7319 BUG_ON(netdev_refcnt_read(dev)); 7320 BUG_ON(!list_empty(&dev->ptype_all)); 7321 BUG_ON(!list_empty(&dev->ptype_specific)); 7322 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 7323 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 7324 WARN_ON(dev->dn_ptr); 7325 7326 if (dev->destructor) 7327 dev->destructor(dev); 7328 7329 /* Report a network device has been unregistered */ 7330 rtnl_lock(); 7331 dev_net(dev)->dev_unreg_count--; 7332 __rtnl_unlock(); 7333 wake_up(&netdev_unregistering_wq); 7334 7335 /* Free network device */ 7336 kobject_put(&dev->dev.kobj); 7337 } 7338 } 7339 7340 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has 7341 * all the same fields in the same order as net_device_stats, with only 7342 * the type differing, but rtnl_link_stats64 may have additional fields 7343 * at the end for newer counters. 7344 */ 7345 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 7346 const struct net_device_stats *netdev_stats) 7347 { 7348 #if BITS_PER_LONG == 64 7349 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); 7350 memcpy(stats64, netdev_stats, sizeof(*stats64)); 7351 /* zero out counters that only exist in rtnl_link_stats64 */ 7352 memset((char *)stats64 + sizeof(*netdev_stats), 0, 7353 sizeof(*stats64) - sizeof(*netdev_stats)); 7354 #else 7355 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); 7356 const unsigned long *src = (const unsigned long *)netdev_stats; 7357 u64 *dst = (u64 *)stats64; 7358 7359 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); 7360 for (i = 0; i < n; i++) 7361 dst[i] = src[i]; 7362 /* zero out counters that only exist in rtnl_link_stats64 */ 7363 memset((char *)stats64 + n * sizeof(u64), 0, 7364 sizeof(*stats64) - n * sizeof(u64)); 7365 #endif 7366 } 7367 EXPORT_SYMBOL(netdev_stats_to_stats64); 7368 7369 /** 7370 * dev_get_stats - get network device statistics 7371 * @dev: device to get statistics from 7372 * @storage: place to store stats 7373 * 7374 * Get network statistics from device. Return @storage. 7375 * The device driver may provide its own method by setting 7376 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 7377 * otherwise the internal statistics structure is used. 
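 *
 * Minimal caller sketch (illustrative only): take a 64-bit snapshot into a
 * stack buffer, typically under RTNL or while the device is otherwise
 * guaranteed not to disappear:
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	pr_info("%s: rx_packets=%llu\n", dev->name, stats->rx_packets);
 *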
7378 */ 7379 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 7380 struct rtnl_link_stats64 *storage) 7381 { 7382 const struct net_device_ops *ops = dev->netdev_ops; 7383 7384 if (ops->ndo_get_stats64) { 7385 memset(storage, 0, sizeof(*storage)); 7386 ops->ndo_get_stats64(dev, storage); 7387 } else if (ops->ndo_get_stats) { 7388 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 7389 } else { 7390 netdev_stats_to_stats64(storage, &dev->stats); 7391 } 7392 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 7393 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 7394 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); 7395 return storage; 7396 } 7397 EXPORT_SYMBOL(dev_get_stats); 7398 7399 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 7400 { 7401 struct netdev_queue *queue = dev_ingress_queue(dev); 7402 7403 #ifdef CONFIG_NET_CLS_ACT 7404 if (queue) 7405 return queue; 7406 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 7407 if (!queue) 7408 return NULL; 7409 netdev_init_one_queue(dev, queue, NULL); 7410 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 7411 queue->qdisc_sleeping = &noop_qdisc; 7412 rcu_assign_pointer(dev->ingress_queue, queue); 7413 #endif 7414 return queue; 7415 } 7416 7417 static const struct ethtool_ops default_ethtool_ops; 7418 7419 void netdev_set_default_ethtool_ops(struct net_device *dev, 7420 const struct ethtool_ops *ops) 7421 { 7422 if (dev->ethtool_ops == &default_ethtool_ops) 7423 dev->ethtool_ops = ops; 7424 } 7425 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 7426 7427 void netdev_freemem(struct net_device *dev) 7428 { 7429 char *addr = (char *)dev - dev->padded; 7430 7431 kvfree(addr); 7432 } 7433 7434 /** 7435 * alloc_netdev_mqs - allocate network device 7436 * @sizeof_priv: size of private data to allocate space for 7437 * @name: device name format string 7438 * @name_assign_type: origin of device name 7439 * @setup: callback to initialize device 7440 * @txqs: the number of TX subqueues to allocate 7441 * @rxqs: the number of RX subqueues to allocate 7442 * 7443 * Allocates a struct net_device with private data area for driver use 7444 * and performs basic initialization. Also allocates subqueue structs 7445 * for each queue on the device. 
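 *
 * Illustrative sketch (struct my_priv and my_netdev_ops are placeholders,
 * not part of this file): an Ethernet driver wanting four TX and four RX
 * queues would normally go through the alloc_etherdev_mqs() wrapper, which
 * lands here:
 *
 *	struct net_device *dev;
 *	int err;
 *
 *	dev = alloc_etherdev_mqs(sizeof(struct my_priv), 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);
 *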
7446 */ 7447 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 7448 unsigned char name_assign_type, 7449 void (*setup)(struct net_device *), 7450 unsigned int txqs, unsigned int rxqs) 7451 { 7452 struct net_device *dev; 7453 size_t alloc_size; 7454 struct net_device *p; 7455 7456 BUG_ON(strlen(name) >= sizeof(dev->name)); 7457 7458 if (txqs < 1) { 7459 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 7460 return NULL; 7461 } 7462 7463 #ifdef CONFIG_SYSFS 7464 if (rxqs < 1) { 7465 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 7466 return NULL; 7467 } 7468 #endif 7469 7470 alloc_size = sizeof(struct net_device); 7471 if (sizeof_priv) { 7472 /* ensure 32-byte alignment of private area */ 7473 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 7474 alloc_size += sizeof_priv; 7475 } 7476 /* ensure 32-byte alignment of whole construct */ 7477 alloc_size += NETDEV_ALIGN - 1; 7478 7479 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7480 if (!p) 7481 p = vzalloc(alloc_size); 7482 if (!p) 7483 return NULL; 7484 7485 dev = PTR_ALIGN(p, NETDEV_ALIGN); 7486 dev->padded = (char *)dev - (char *)p; 7487 7488 dev->pcpu_refcnt = alloc_percpu(int); 7489 if (!dev->pcpu_refcnt) 7490 goto free_dev; 7491 7492 if (dev_addr_init(dev)) 7493 goto free_pcpu; 7494 7495 dev_mc_init(dev); 7496 dev_uc_init(dev); 7497 7498 dev_net_set(dev, &init_net); 7499 7500 dev->gso_max_size = GSO_MAX_SIZE; 7501 dev->gso_max_segs = GSO_MAX_SEGS; 7502 7503 INIT_LIST_HEAD(&dev->napi_list); 7504 INIT_LIST_HEAD(&dev->unreg_list); 7505 INIT_LIST_HEAD(&dev->close_list); 7506 INIT_LIST_HEAD(&dev->link_watch_list); 7507 INIT_LIST_HEAD(&dev->adj_list.upper); 7508 INIT_LIST_HEAD(&dev->adj_list.lower); 7509 INIT_LIST_HEAD(&dev->all_adj_list.upper); 7510 INIT_LIST_HEAD(&dev->all_adj_list.lower); 7511 INIT_LIST_HEAD(&dev->ptype_all); 7512 INIT_LIST_HEAD(&dev->ptype_specific); 7513 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 7514 setup(dev); 7515 7516 if (!dev->tx_queue_len) { 7517 dev->priv_flags |= IFF_NO_QUEUE; 7518 dev->tx_queue_len = 1; 7519 } 7520 7521 dev->num_tx_queues = txqs; 7522 dev->real_num_tx_queues = txqs; 7523 if (netif_alloc_netdev_queues(dev)) 7524 goto free_all; 7525 7526 #ifdef CONFIG_SYSFS 7527 dev->num_rx_queues = rxqs; 7528 dev->real_num_rx_queues = rxqs; 7529 if (netif_alloc_rx_queues(dev)) 7530 goto free_all; 7531 #endif 7532 7533 strcpy(dev->name, name); 7534 dev->name_assign_type = name_assign_type; 7535 dev->group = INIT_NETDEV_GROUP; 7536 if (!dev->ethtool_ops) 7537 dev->ethtool_ops = &default_ethtool_ops; 7538 7539 nf_hook_ingress_init(dev); 7540 7541 return dev; 7542 7543 free_all: 7544 free_netdev(dev); 7545 return NULL; 7546 7547 free_pcpu: 7548 free_percpu(dev->pcpu_refcnt); 7549 free_dev: 7550 netdev_freemem(dev); 7551 return NULL; 7552 } 7553 EXPORT_SYMBOL(alloc_netdev_mqs); 7554 7555 /** 7556 * free_netdev - free network device 7557 * @dev: device 7558 * 7559 * This function does the last stage of destroying an allocated device 7560 * interface. The reference to the device object is released. 7561 * If this is the last reference then it will be freed. 7562 * Must be called in process context. 
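 *
 * Typical pairing, sketched with assumed driver variables: free_netdev()
 * undoes alloc_netdev()/alloc_etherdev(). On a probe error path the device
 * was never registered, so it can be freed directly:
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *
 * On normal teardown call unregister_netdev(dev) first, then
 * free_netdev(dev).
 *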
7563 */ 7564 void free_netdev(struct net_device *dev) 7565 { 7566 struct napi_struct *p, *n; 7567 7568 might_sleep(); 7569 netif_free_tx_queues(dev); 7570 #ifdef CONFIG_SYSFS 7571 kvfree(dev->_rx); 7572 #endif 7573 7574 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 7575 7576 /* Flush device addresses */ 7577 dev_addr_flush(dev); 7578 7579 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 7580 netif_napi_del(p); 7581 7582 free_percpu(dev->pcpu_refcnt); 7583 dev->pcpu_refcnt = NULL; 7584 7585 /* Compatibility with error handling in drivers */ 7586 if (dev->reg_state == NETREG_UNINITIALIZED) { 7587 netdev_freemem(dev); 7588 return; 7589 } 7590 7591 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 7592 dev->reg_state = NETREG_RELEASED; 7593 7594 /* will free via device release */ 7595 put_device(&dev->dev); 7596 } 7597 EXPORT_SYMBOL(free_netdev); 7598 7599 /** 7600 * synchronize_net - Synchronize with packet receive processing 7601 * 7602 * Wait for packets currently being received to be done. 7603 * Does not block later packets from starting. 7604 */ 7605 void synchronize_net(void) 7606 { 7607 might_sleep(); 7608 if (rtnl_is_locked()) 7609 synchronize_rcu_expedited(); 7610 else 7611 synchronize_rcu(); 7612 } 7613 EXPORT_SYMBOL(synchronize_net); 7614 7615 /** 7616 * unregister_netdevice_queue - remove device from the kernel 7617 * @dev: device 7618 * @head: list 7619 * 7620 * This function shuts down a device interface and removes it 7621 * from the kernel tables. 7622 * If head not NULL, device is queued to be unregistered later. 7623 * 7624 * Callers must hold the rtnl semaphore. You may want 7625 * unregister_netdev() instead of this. 7626 */ 7627 7628 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 7629 { 7630 ASSERT_RTNL(); 7631 7632 if (head) { 7633 list_move_tail(&dev->unreg_list, head); 7634 } else { 7635 rollback_registered(dev); 7636 /* Finish processing unregister after unlock */ 7637 net_set_todo(dev); 7638 } 7639 } 7640 EXPORT_SYMBOL(unregister_netdevice_queue); 7641 7642 /** 7643 * unregister_netdevice_many - unregister many devices 7644 * @head: list of devices 7645 * 7646 * Note: As most callers use a stack allocated list_head, 7647 * we force a list_del() to make sure stack wont be corrupted later. 7648 */ 7649 void unregister_netdevice_many(struct list_head *head) 7650 { 7651 struct net_device *dev; 7652 7653 if (!list_empty(head)) { 7654 rollback_registered_many(head); 7655 list_for_each_entry(dev, head, unreg_list) 7656 net_set_todo(dev); 7657 list_del(head); 7658 } 7659 } 7660 EXPORT_SYMBOL(unregister_netdevice_many); 7661 7662 /** 7663 * unregister_netdev - remove device from the kernel 7664 * @dev: device 7665 * 7666 * This function shuts down a device interface and removes it 7667 * from the kernel tables. 7668 * 7669 * This is just a wrapper for unregister_netdevice that takes 7670 * the rtnl semaphore. In general you want to use this and not 7671 * unregister_netdevice. 7672 */ 7673 void unregister_netdev(struct net_device *dev) 7674 { 7675 rtnl_lock(); 7676 unregister_netdevice(dev); 7677 rtnl_unlock(); 7678 } 7679 EXPORT_SYMBOL(unregister_netdev); 7680 7681 /** 7682 * dev_change_net_namespace - move device to different nethost namespace 7683 * @dev: device 7684 * @net: network namespace 7685 * @pat: If not NULL name pattern to try if the current device name 7686 * is already taken in the destination network namespace. 
7687 * 7688 * This function shuts down a device interface and moves it 7689 * to a new network namespace. On success 0 is returned, on 7690 * a failure a netagive errno code is returned. 7691 * 7692 * Callers must hold the rtnl semaphore. 7693 */ 7694 7695 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 7696 { 7697 int err; 7698 7699 ASSERT_RTNL(); 7700 7701 /* Don't allow namespace local devices to be moved. */ 7702 err = -EINVAL; 7703 if (dev->features & NETIF_F_NETNS_LOCAL) 7704 goto out; 7705 7706 /* Ensure the device has been registrered */ 7707 if (dev->reg_state != NETREG_REGISTERED) 7708 goto out; 7709 7710 /* Get out if there is nothing todo */ 7711 err = 0; 7712 if (net_eq(dev_net(dev), net)) 7713 goto out; 7714 7715 /* Pick the destination device name, and ensure 7716 * we can use it in the destination network namespace. 7717 */ 7718 err = -EEXIST; 7719 if (__dev_get_by_name(net, dev->name)) { 7720 /* We get here if we can't use the current device name */ 7721 if (!pat) 7722 goto out; 7723 if (dev_get_valid_name(net, dev, pat) < 0) 7724 goto out; 7725 } 7726 7727 /* 7728 * And now a mini version of register_netdevice unregister_netdevice. 7729 */ 7730 7731 /* If device is running close it first. */ 7732 dev_close(dev); 7733 7734 /* And unlink it from device chain */ 7735 err = -ENODEV; 7736 unlist_netdevice(dev); 7737 7738 synchronize_net(); 7739 7740 /* Shutdown queueing discipline. */ 7741 dev_shutdown(dev); 7742 7743 /* Notify protocols, that we are about to destroy 7744 this device. They should clean all the things. 7745 7746 Note that dev->reg_state stays at NETREG_REGISTERED. 7747 This is wanted because this way 8021q and macvlan know 7748 the device is just moving and can keep their slaves up. 7749 */ 7750 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7751 rcu_barrier(); 7752 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7753 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 7754 7755 /* 7756 * Flush the unicast and multicast chains 7757 */ 7758 dev_uc_flush(dev); 7759 dev_mc_flush(dev); 7760 7761 /* Send a netdev-removed uevent to the old namespace */ 7762 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 7763 netdev_adjacent_del_links(dev); 7764 7765 /* Actually switch the network namespace */ 7766 dev_net_set(dev, net); 7767 7768 /* If there is an ifindex conflict assign a new one */ 7769 if (__dev_get_by_index(net, dev->ifindex)) 7770 dev->ifindex = dev_new_index(net); 7771 7772 /* Send a netdev-add uevent to the new namespace */ 7773 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 7774 netdev_adjacent_add_links(dev); 7775 7776 /* Fixup kobjects */ 7777 err = device_rename(&dev->dev, dev->name); 7778 WARN_ON(err); 7779 7780 /* Add the device back in the hashes */ 7781 list_netdevice(dev); 7782 7783 /* Notify protocols, that a new device appeared. */ 7784 call_netdevice_notifiers(NETDEV_REGISTER, dev); 7785 7786 /* 7787 * Prevent userspace races by waiting until the network 7788 * device is fully setup before sending notifications. 
7789 */ 7790 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7791 7792 synchronize_net(); 7793 err = 0; 7794 out: 7795 return err; 7796 } 7797 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 7798 7799 static int dev_cpu_callback(struct notifier_block *nfb, 7800 unsigned long action, 7801 void *ocpu) 7802 { 7803 struct sk_buff **list_skb; 7804 struct sk_buff *skb; 7805 unsigned int cpu, oldcpu = (unsigned long)ocpu; 7806 struct softnet_data *sd, *oldsd; 7807 7808 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 7809 return NOTIFY_OK; 7810 7811 local_irq_disable(); 7812 cpu = smp_processor_id(); 7813 sd = &per_cpu(softnet_data, cpu); 7814 oldsd = &per_cpu(softnet_data, oldcpu); 7815 7816 /* Find end of our completion_queue. */ 7817 list_skb = &sd->completion_queue; 7818 while (*list_skb) 7819 list_skb = &(*list_skb)->next; 7820 /* Append completion queue from offline CPU. */ 7821 *list_skb = oldsd->completion_queue; 7822 oldsd->completion_queue = NULL; 7823 7824 /* Append output queue from offline CPU. */ 7825 if (oldsd->output_queue) { 7826 *sd->output_queue_tailp = oldsd->output_queue; 7827 sd->output_queue_tailp = oldsd->output_queue_tailp; 7828 oldsd->output_queue = NULL; 7829 oldsd->output_queue_tailp = &oldsd->output_queue; 7830 } 7831 /* Append NAPI poll list from offline CPU, with one exception : 7832 * process_backlog() must be called by cpu owning percpu backlog. 7833 * We properly handle process_queue & input_pkt_queue later. 7834 */ 7835 while (!list_empty(&oldsd->poll_list)) { 7836 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 7837 struct napi_struct, 7838 poll_list); 7839 7840 list_del_init(&napi->poll_list); 7841 if (napi->poll == process_backlog) 7842 napi->state = 0; 7843 else 7844 ____napi_schedule(sd, napi); 7845 } 7846 7847 raise_softirq_irqoff(NET_TX_SOFTIRQ); 7848 local_irq_enable(); 7849 7850 /* Process offline CPU's input_pkt_queue */ 7851 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 7852 netif_rx_ni(skb); 7853 input_queue_head_incr(oldsd); 7854 } 7855 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 7856 netif_rx_ni(skb); 7857 input_queue_head_incr(oldsd); 7858 } 7859 7860 return NOTIFY_OK; 7861 } 7862 7863 7864 /** 7865 * netdev_increment_features - increment feature set by one 7866 * @all: current feature set 7867 * @one: new feature set 7868 * @mask: mask feature set 7869 * 7870 * Computes a new feature set after adding a device with feature set 7871 * @one to the master device with current feature set @all. Will not 7872 * enable anything that is off in @mask. Returns the new feature set. 7873 */ 7874 netdev_features_t netdev_increment_features(netdev_features_t all, 7875 netdev_features_t one, netdev_features_t mask) 7876 { 7877 if (mask & NETIF_F_HW_CSUM) 7878 mask |= NETIF_F_CSUM_MASK; 7879 mask |= NETIF_F_VLAN_CHALLENGED; 7880 7881 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; 7882 all &= one | ~NETIF_F_ALL_FOR_ALL; 7883 7884 /* If one device supports hw checksumming, set for all. 
*/ 7885 if (all & NETIF_F_HW_CSUM) 7886 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); 7887 7888 return all; 7889 } 7890 EXPORT_SYMBOL(netdev_increment_features); 7891 7892 static struct hlist_head * __net_init netdev_create_hash(void) 7893 { 7894 int i; 7895 struct hlist_head *hash; 7896 7897 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 7898 if (hash != NULL) 7899 for (i = 0; i < NETDEV_HASHENTRIES; i++) 7900 INIT_HLIST_HEAD(&hash[i]); 7901 7902 return hash; 7903 } 7904 7905 /* Initialize per network namespace state */ 7906 static int __net_init netdev_init(struct net *net) 7907 { 7908 if (net != &init_net) 7909 INIT_LIST_HEAD(&net->dev_base_head); 7910 7911 net->dev_name_head = netdev_create_hash(); 7912 if (net->dev_name_head == NULL) 7913 goto err_name; 7914 7915 net->dev_index_head = netdev_create_hash(); 7916 if (net->dev_index_head == NULL) 7917 goto err_idx; 7918 7919 return 0; 7920 7921 err_idx: 7922 kfree(net->dev_name_head); 7923 err_name: 7924 return -ENOMEM; 7925 } 7926 7927 /** 7928 * netdev_drivername - network driver for the device 7929 * @dev: network device 7930 * 7931 * Determine network driver for device. 7932 */ 7933 const char *netdev_drivername(const struct net_device *dev) 7934 { 7935 const struct device_driver *driver; 7936 const struct device *parent; 7937 const char *empty = ""; 7938 7939 parent = dev->dev.parent; 7940 if (!parent) 7941 return empty; 7942 7943 driver = parent->driver; 7944 if (driver && driver->name) 7945 return driver->name; 7946 return empty; 7947 } 7948 7949 static void __netdev_printk(const char *level, const struct net_device *dev, 7950 struct va_format *vaf) 7951 { 7952 if (dev && dev->dev.parent) { 7953 dev_printk_emit(level[1] - '0', 7954 dev->dev.parent, 7955 "%s %s %s%s: %pV", 7956 dev_driver_string(dev->dev.parent), 7957 dev_name(dev->dev.parent), 7958 netdev_name(dev), netdev_reg_state(dev), 7959 vaf); 7960 } else if (dev) { 7961 printk("%s%s%s: %pV", 7962 level, netdev_name(dev), netdev_reg_state(dev), vaf); 7963 } else { 7964 printk("%s(NULL net_device): %pV", level, vaf); 7965 } 7966 } 7967 7968 void netdev_printk(const char *level, const struct net_device *dev, 7969 const char *format, ...) 7970 { 7971 struct va_format vaf; 7972 va_list args; 7973 7974 va_start(args, format); 7975 7976 vaf.fmt = format; 7977 vaf.va = &args; 7978 7979 __netdev_printk(level, dev, &vaf); 7980 7981 va_end(args); 7982 } 7983 EXPORT_SYMBOL(netdev_printk); 7984 7985 #define define_netdev_printk_level(func, level) \ 7986 void func(const struct net_device *dev, const char *fmt, ...) 
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	__netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (e.g. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
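
/* Editor-added sketch (not part of the original file) of the wait-queue
 * pattern used by rtnl_lock_unregistering() above: park on a wait queue
 * with DEFINE_WAIT_FUNC()/woken_wake_function, re-check the condition, and
 * sleep in wait_woken() only while work is still outstanding.  The names
 * "example_pending"/"example_wq" are hypothetical, and the sketch leaves
 * out the rtnl lock/unlock dance the real function performs around each
 * re-check.
 */
#if 0
static atomic_t example_pending = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(example_wq);

static void example_wait_for_drain(void)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&example_wq, &wait);
	while (atomic_read(&example_pending) > 0)
		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	remove_wait_queue(&example_wq, &wait);
}

static void example_complete_one(void)
{
	/* wait_woken() handles the wakeup racing with the condition check */
	if (atomic_dec_and_test(&example_pending))
		wake_up(&example_wq);
}
#endif
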
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit, all network devices must be removed from their network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list, which contains all the unregistrations
	 * queued by default_device_exit_batch, is run from the
	 * rtnl_unlock() at the end of default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single-threaded during boot, so there is no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too.  Since we now dynamically allocate and free
	 * the loopback device, ensure this invariant is maintained by
	 * keeping the loopback device the first device on the list of
	 * network devices, so that it is the first device to appear
	 * and the last network device to disappear.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
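
/* Editor-added sketch (not part of the original file) of the
 * pernet_operations pattern used by netdev_net_ops and default_device_ops
 * above: per-namespace init/exit callbacks registered once at boot.  All
 * "example_*" names are hypothetical.
 */
#if 0
static int __net_init example_pernet_init(struct net *net)
{
	/* Allocate per-namespace state here; called for the initial
	 * namespace and for every namespace created afterwards.
	 */
	return 0;
}

static void __net_exit example_pernet_exit(struct net *net)
{
	/* Release per-namespace state here; called when the namespace
	 * is torn down.
	 */
}

static struct pernet_operations example_pernet_ops = {
	.init = example_pernet_init,
	.exit = example_pernet_exit,
};

static int __init example_subsys_init(void)
{
	return register_pernet_subsys(&example_pernet_ops);
}
subsys_initcall(example_subsys_init);
#endif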