/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <[email protected]>
 *				Mark Evans, <[email protected]>
 *
 *	Additional Authors:
 *		Florian la Roche <[email protected]>
 *		Alan Cox <[email protected]>
 *		David Hinds <[email protected]>
 *		Alexey Kuznetsov <[email protected]>
 *		Adam Sulmicki <[email protected]>
 *		Pekka Riikonen <[email protected]>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep and therefore cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
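
/* Illustrative usage sketch (not part of the original file): how a protocol
 * module might register and later remove a packet handler with
 * dev_add_pack()/dev_remove_pack(). The my_demo_rcv()/my_demo_ptype names
 * are hypothetical examples, not existing kernel symbols; 0x88B5 is only
 * used here because it is a local-experimental EtherType. A handler owns
 * the skb reference it is handed and must free or hand it on.
 *
 *	static int my_demo_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_demo_ptype __read_mostly = {
 *		.type = cpu_to_be16(0x88B5),
 *		.func = my_demo_rcv,
 *	};
 *
 *	dev_add_pack(&my_demo_ptype);
 *	...
 *	dev_remove_pack(&my_demo_ptype);	waits for in-flight readers
 */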

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep and therefore cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
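
/* Illustrative example (not in the original file): given the parsing above,
 * a kernel command line entry has the form
 *
 *	netdev=irq,base_addr,mem_start,mem_end,name
 *
 * so something like "netdev=9,0x300,0,0,eth0" would record irq 9 and I/O
 * base 0x300 for the device that later registers as eth0; trailing numeric
 * fields may be omitted. The concrete values are hypothetical and only
 * meant to show the argument order consumed by get_options() and
 * netdev_boot_setup_add().
 */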

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink - get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the caller to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
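
/* Illustrative usage sketch (not in the original file) of the two lookup
 * styles above: dev_get_by_name() takes a reference that the caller must
 * drop with dev_put(), while dev_get_by_name_rcu() takes no reference and
 * its result is only valid inside the RCU read-side section. The "eth0"
 * name is just an example.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	if (dev)
 *		... use dev here only, no dev_put() needed ...
 *	rcu_read_unlock();
 */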

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open - prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
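
/* Illustrative usage sketch (not part of the original file): bringing an
 * interface administratively up and down again from code that already
 * holds a reference to the device. Both paths require the RTNL lock, as
 * the ASSERT_RTNL() checks in __dev_open()/__dev_close_many() enforce;
 * dev_open() is a no-op on an interface that is already IFF_UP.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	if (!err)
 *		dev_close(dev);
 *	rtnl_unlock();
 */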
1399 * 1400 * dev->stop() will invoke napi_disable() on all of it's 1401 * napi_struct instances on this device. 1402 */ 1403 smp_mb__after_atomic(); /* Commit netif_running(). */ 1404 } 1405 1406 dev_deactivate_many(head); 1407 1408 list_for_each_entry(dev, head, close_list) { 1409 const struct net_device_ops *ops = dev->netdev_ops; 1410 1411 /* 1412 * Call the device specific close. This cannot fail. 1413 * Only if device is UP 1414 * 1415 * We allow it to be called even after a DETACH hot-plug 1416 * event. 1417 */ 1418 if (ops->ndo_stop) 1419 ops->ndo_stop(dev); 1420 1421 dev->flags &= ~IFF_UP; 1422 netpoll_poll_enable(dev); 1423 } 1424 1425 return 0; 1426 } 1427 1428 static int __dev_close(struct net_device *dev) 1429 { 1430 int retval; 1431 LIST_HEAD(single); 1432 1433 list_add(&dev->close_list, &single); 1434 retval = __dev_close_many(&single); 1435 list_del(&single); 1436 1437 return retval; 1438 } 1439 1440 int dev_close_many(struct list_head *head, bool unlink) 1441 { 1442 struct net_device *dev, *tmp; 1443 1444 /* Remove the devices that don't need to be closed */ 1445 list_for_each_entry_safe(dev, tmp, head, close_list) 1446 if (!(dev->flags & IFF_UP)) 1447 list_del_init(&dev->close_list); 1448 1449 __dev_close_many(head); 1450 1451 list_for_each_entry_safe(dev, tmp, head, close_list) { 1452 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1453 call_netdevice_notifiers(NETDEV_DOWN, dev); 1454 if (unlink) 1455 list_del_init(&dev->close_list); 1456 } 1457 1458 return 0; 1459 } 1460 EXPORT_SYMBOL(dev_close_many); 1461 1462 /** 1463 * dev_close - shutdown an interface. 1464 * @dev: device to shutdown 1465 * 1466 * This function moves an active device into down state. A 1467 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1468 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1469 * chain. 1470 */ 1471 int dev_close(struct net_device *dev) 1472 { 1473 if (dev->flags & IFF_UP) { 1474 LIST_HEAD(single); 1475 1476 list_add(&dev->close_list, &single); 1477 dev_close_many(&single, true); 1478 list_del(&single); 1479 } 1480 return 0; 1481 } 1482 EXPORT_SYMBOL(dev_close); 1483 1484 1485 /** 1486 * dev_disable_lro - disable Large Receive Offload on a device 1487 * @dev: device 1488 * 1489 * Disable Large Receive Offload (LRO) on a net device. Must be 1490 * called under RTNL. This is needed if received packets may be 1491 * forwarded to another interface. 1492 */ 1493 void dev_disable_lro(struct net_device *dev) 1494 { 1495 struct net_device *lower_dev; 1496 struct list_head *iter; 1497 1498 dev->wanted_features &= ~NETIF_F_LRO; 1499 netdev_update_features(dev); 1500 1501 if (unlikely(dev->features & NETIF_F_LRO)) 1502 netdev_WARN(dev, "failed to disable LRO!\n"); 1503 1504 netdev_for_each_lower_dev(dev, lower_dev, iter) 1505 dev_disable_lro(lower_dev); 1506 } 1507 EXPORT_SYMBOL(dev_disable_lro); 1508 1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, 1510 struct net_device *dev) 1511 { 1512 struct netdev_notifier_info info; 1513 1514 netdev_notifier_info_init(&info, dev); 1515 return nb->notifier_call(nb, val, &info); 1516 } 1517 1518 static int dev_boot_phase = 1; 1519 1520 /** 1521 * register_netdevice_notifier - register a network notifier block 1522 * @nb: notifier 1523 * 1524 * Register a notifier to be called when network device events occur. 1525 * The notifier passed is linked into the kernel structures and must 1526 * not be reused until it has been unregistered. 
 *	A negative errno code is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow it to have a race-free view of the
 *	network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
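
/* Illustrative usage sketch (not in the original file): a subsystem that
 * wants to track interface arrivals could register a notifier block. The
 * my_netdev_event() and my_netdev_nb names are hypothetical;
 * netdev_notifier_info_to_dev() is the standard helper for extracting the
 * device from the info argument.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_REGISTER)
 *			pr_info("new device %s\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */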

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp)		\
			__net_timestamp(SKB);		\
	}						\

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS ([email protected])
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used.
In the worst case TC0 1917 * is invalid nothing can be done so disable priority mappings. It is 1918 * expected that drivers will fix this mapping if they can before 1919 * calling netif_set_real_num_tx_queues. 1920 */ 1921 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1922 { 1923 int i; 1924 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1925 1926 /* If TC0 is invalidated disable TC mapping */ 1927 if (tc->offset + tc->count > txq) { 1928 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1929 dev->num_tc = 0; 1930 return; 1931 } 1932 1933 /* Invalidated prio to tc mappings set to TC0 */ 1934 for (i = 1; i < TC_BITMASK + 1; i++) { 1935 int q = netdev_get_prio_tc_map(dev, i); 1936 1937 tc = &dev->tc_to_txq[q]; 1938 if (tc->offset + tc->count > txq) { 1939 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", 1940 i, q); 1941 netdev_set_prio_tc_map(dev, i, 0); 1942 } 1943 } 1944 } 1945 1946 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) 1947 { 1948 if (dev->num_tc) { 1949 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1950 int i; 1951 1952 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { 1953 if ((txq - tc->offset) < tc->count) 1954 return i; 1955 } 1956 1957 return -1; 1958 } 1959 1960 return 0; 1961 } 1962 1963 #ifdef CONFIG_XPS 1964 static DEFINE_MUTEX(xps_map_mutex); 1965 #define xmap_dereference(P) \ 1966 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1967 1968 static bool remove_xps_queue(struct xps_dev_maps *dev_maps, 1969 int tci, u16 index) 1970 { 1971 struct xps_map *map = NULL; 1972 int pos; 1973 1974 if (dev_maps) 1975 map = xmap_dereference(dev_maps->cpu_map[tci]); 1976 if (!map) 1977 return false; 1978 1979 for (pos = map->len; pos--;) { 1980 if (map->queues[pos] != index) 1981 continue; 1982 1983 if (map->len > 1) { 1984 map->queues[pos] = map->queues[--map->len]; 1985 break; 1986 } 1987 1988 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); 1989 kfree_rcu(map, rcu); 1990 return false; 1991 } 1992 1993 return true; 1994 } 1995 1996 static bool remove_xps_queue_cpu(struct net_device *dev, 1997 struct xps_dev_maps *dev_maps, 1998 int cpu, u16 offset, u16 count) 1999 { 2000 int num_tc = dev->num_tc ?
: 1; 2001 bool active = false; 2002 int tci; 2003 2004 for (tci = cpu * num_tc; num_tc--; tci++) { 2005 int i, j; 2006 2007 for (i = count, j = offset; i--; j++) { 2008 if (!remove_xps_queue(dev_maps, cpu, j)) 2009 break; 2010 } 2011 2012 active |= i < 0; 2013 } 2014 2015 return active; 2016 } 2017 2018 static void netif_reset_xps_queues(struct net_device *dev, u16 offset, 2019 u16 count) 2020 { 2021 struct xps_dev_maps *dev_maps; 2022 int cpu, i; 2023 bool active = false; 2024 2025 mutex_lock(&xps_map_mutex); 2026 dev_maps = xmap_dereference(dev->xps_maps); 2027 2028 if (!dev_maps) 2029 goto out_no_maps; 2030 2031 for_each_possible_cpu(cpu) 2032 active |= remove_xps_queue_cpu(dev, dev_maps, cpu, 2033 offset, count); 2034 2035 if (!active) { 2036 RCU_INIT_POINTER(dev->xps_maps, NULL); 2037 kfree_rcu(dev_maps, rcu); 2038 } 2039 2040 for (i = offset + (count - 1); count--; i--) 2041 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 2042 NUMA_NO_NODE); 2043 2044 out_no_maps: 2045 mutex_unlock(&xps_map_mutex); 2046 } 2047 2048 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 2049 { 2050 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); 2051 } 2052 2053 static struct xps_map *expand_xps_map(struct xps_map *map, 2054 int cpu, u16 index) 2055 { 2056 struct xps_map *new_map; 2057 int alloc_len = XPS_MIN_MAP_ALLOC; 2058 int i, pos; 2059 2060 for (pos = 0; map && pos < map->len; pos++) { 2061 if (map->queues[pos] != index) 2062 continue; 2063 return map; 2064 } 2065 2066 /* Need to add queue to this CPU's existing map */ 2067 if (map) { 2068 if (pos < map->alloc_len) 2069 return map; 2070 2071 alloc_len = map->alloc_len * 2; 2072 } 2073 2074 /* Need to allocate new map to store queue on this CPU's map */ 2075 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 2076 cpu_to_node(cpu)); 2077 if (!new_map) 2078 return NULL; 2079 2080 for (i = 0; i < pos; i++) 2081 new_map->queues[i] = map->queues[i]; 2082 new_map->alloc_len = alloc_len; 2083 new_map->len = pos; 2084 2085 return new_map; 2086 } 2087 2088 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, 2089 u16 index) 2090 { 2091 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 2092 int i, cpu, tci, numa_node_id = -2; 2093 int maps_sz, num_tc = 1, tc = 0; 2094 struct xps_map *map, *new_map; 2095 bool active = false; 2096 2097 if (dev->num_tc) { 2098 num_tc = dev->num_tc; 2099 tc = netdev_txq_to_tc(dev, index); 2100 if (tc < 0) 2101 return -EINVAL; 2102 } 2103 2104 maps_sz = XPS_DEV_MAPS_SIZE(num_tc); 2105 if (maps_sz < L1_CACHE_BYTES) 2106 maps_sz = L1_CACHE_BYTES; 2107 2108 mutex_lock(&xps_map_mutex); 2109 2110 dev_maps = xmap_dereference(dev->xps_maps); 2111 2112 /* allocate memory for queue storage */ 2113 for_each_cpu_and(cpu, cpu_online_mask, mask) { 2114 if (!new_dev_maps) 2115 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2116 if (!new_dev_maps) { 2117 mutex_unlock(&xps_map_mutex); 2118 return -ENOMEM; 2119 } 2120 2121 tci = cpu * num_tc + tc; 2122 map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[tci]) : 2123 NULL; 2124 2125 map = expand_xps_map(map, cpu, index); 2126 if (!map) 2127 goto error; 2128 2129 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2130 } 2131 2132 if (!new_dev_maps) 2133 goto out_no_new_maps; 2134 2135 for_each_possible_cpu(cpu) { 2136 /* copy maps belonging to foreign traffic classes */ 2137 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { 2138 /* fill in the new device map from the old device map */ 2139 map = xmap_dereference(dev_maps->cpu_map[tci]); 2140 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2141 } 2142 2143 /* We need to explicitly update tci as prevous loop 2144 * could break out early if dev_maps is NULL. 2145 */ 2146 tci = cpu * num_tc + tc; 2147 2148 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2149 /* add queue to CPU maps */ 2150 int pos = 0; 2151 2152 map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2153 while ((pos < map->len) && (map->queues[pos] != index)) 2154 pos++; 2155 2156 if (pos == map->len) 2157 map->queues[map->len++] = index; 2158 #ifdef CONFIG_NUMA 2159 if (numa_node_id == -2) 2160 numa_node_id = cpu_to_node(cpu); 2161 else if (numa_node_id != cpu_to_node(cpu)) 2162 numa_node_id = -1; 2163 #endif 2164 } else if (dev_maps) { 2165 /* fill in the new device map from the old device map */ 2166 map = xmap_dereference(dev_maps->cpu_map[tci]); 2167 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2168 } 2169 2170 /* copy maps belonging to foreign traffic classes */ 2171 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { 2172 /* fill in the new device map from the old device map */ 2173 map = xmap_dereference(dev_maps->cpu_map[tci]); 2174 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); 2175 } 2176 } 2177 2178 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2179 2180 /* Cleanup old maps */ 2181 if (!dev_maps) 2182 goto out_no_old_maps; 2183 2184 for_each_possible_cpu(cpu) { 2185 for (i = num_tc, tci = cpu * num_tc; i--; tci++) { 2186 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2187 map = xmap_dereference(dev_maps->cpu_map[tci]); 2188 if (map && map != new_map) 2189 kfree_rcu(map, rcu); 2190 } 2191 } 2192 2193 kfree_rcu(dev_maps, rcu); 2194 2195 out_no_old_maps: 2196 dev_maps = new_dev_maps; 2197 active = true; 2198 2199 out_no_new_maps: 2200 /* update Tx queue numa node */ 2201 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2202 (numa_node_id >= 0) ? numa_node_id : 2203 NUMA_NO_NODE); 2204 2205 if (!dev_maps) 2206 goto out_no_maps; 2207 2208 /* removes queue from unused CPUs */ 2209 for_each_possible_cpu(cpu) { 2210 for (i = tc, tci = cpu * num_tc; i--; tci++) 2211 active |= remove_xps_queue(dev_maps, tci, index); 2212 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) 2213 active |= remove_xps_queue(dev_maps, tci, index); 2214 for (i = num_tc - tc, tci++; --i; tci++) 2215 active |= remove_xps_queue(dev_maps, tci, index); 2216 } 2217 2218 /* free map if not active */ 2219 if (!active) { 2220 RCU_INIT_POINTER(dev->xps_maps, NULL); 2221 kfree_rcu(dev_maps, rcu); 2222 } 2223 2224 out_no_maps: 2225 mutex_unlock(&xps_map_mutex); 2226 2227 return 0; 2228 error: 2229 /* remove any maps that we added */ 2230 for_each_possible_cpu(cpu) { 2231 for (i = num_tc, tci = cpu * num_tc; i--; tci++) { 2232 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); 2233 map = dev_maps ? 
2234 xmap_dereference(dev_maps->cpu_map[tci]) : 2235 NULL; 2236 if (new_map && new_map != map) 2237 kfree(new_map); 2238 } 2239 } 2240 2241 mutex_unlock(&xps_map_mutex); 2242 2243 kfree(new_dev_maps); 2244 return -ENOMEM; 2245 } 2246 EXPORT_SYMBOL(netif_set_xps_queue); 2247 2248 #endif 2249 void netdev_reset_tc(struct net_device *dev) 2250 { 2251 #ifdef CONFIG_XPS 2252 netif_reset_xps_queues_gt(dev, 0); 2253 #endif 2254 dev->num_tc = 0; 2255 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); 2256 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); 2257 } 2258 EXPORT_SYMBOL(netdev_reset_tc); 2259 2260 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) 2261 { 2262 if (tc >= dev->num_tc) 2263 return -EINVAL; 2264 2265 #ifdef CONFIG_XPS 2266 netif_reset_xps_queues(dev, offset, count); 2267 #endif 2268 dev->tc_to_txq[tc].count = count; 2269 dev->tc_to_txq[tc].offset = offset; 2270 return 0; 2271 } 2272 EXPORT_SYMBOL(netdev_set_tc_queue); 2273 2274 int netdev_set_num_tc(struct net_device *dev, u8 num_tc) 2275 { 2276 if (num_tc > TC_MAX_QUEUE) 2277 return -EINVAL; 2278 2279 #ifdef CONFIG_XPS 2280 netif_reset_xps_queues_gt(dev, 0); 2281 #endif 2282 dev->num_tc = num_tc; 2283 return 0; 2284 } 2285 EXPORT_SYMBOL(netdev_set_num_tc); 2286 2287 /* 2288 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2289 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 2290 */ 2291 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2292 { 2293 int rc; 2294 2295 if (txq < 1 || txq > dev->num_tx_queues) 2296 return -EINVAL; 2297 2298 if (dev->reg_state == NETREG_REGISTERED || 2299 dev->reg_state == NETREG_UNREGISTERING) { 2300 ASSERT_RTNL(); 2301 2302 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2303 txq); 2304 if (rc) 2305 return rc; 2306 2307 if (dev->num_tc) 2308 netif_setup_tc(dev, txq); 2309 2310 if (txq < dev->real_num_tx_queues) { 2311 qdisc_reset_all_tx_gt(dev, txq); 2312 #ifdef CONFIG_XPS 2313 netif_reset_xps_queues_gt(dev, txq); 2314 #endif 2315 } 2316 } 2317 2318 dev->real_num_tx_queues = txq; 2319 return 0; 2320 } 2321 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2322 2323 #ifdef CONFIG_SYSFS 2324 /** 2325 * netif_set_real_num_rx_queues - set actual number of RX queues used 2326 * @dev: Network device 2327 * @rxq: Actual number of RX queues 2328 * 2329 * This must be called either with the rtnl_lock held or before 2330 * registration of the net device. Returns 0 on success, or a 2331 * negative error code. If called before registration, it always 2332 * succeeds. 2333 */ 2334 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2335 { 2336 int rc; 2337 2338 if (rxq < 1 || rxq > dev->num_rx_queues) 2339 return -EINVAL; 2340 2341 if (dev->reg_state == NETREG_REGISTERED) { 2342 ASSERT_RTNL(); 2343 2344 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2345 rxq); 2346 if (rc) 2347 return rc; 2348 } 2349 2350 dev->real_num_rx_queues = rxq; 2351 return 0; 2352 } 2353 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2354 #endif 2355 2356 /** 2357 * netif_get_num_default_rss_queues - default number of RSS queues 2358 * 2359 * This routine should set an upper limit on the number of RSS queues 2360 * used by default by multiqueue devices. 2361 */ 2362 int netif_get_num_default_rss_queues(void) 2363 { 2364 return is_kdump_kernel() ? 
2365 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2366 } 2367 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2368 2369 static void __netif_reschedule(struct Qdisc *q) 2370 { 2371 struct softnet_data *sd; 2372 unsigned long flags; 2373 2374 local_irq_save(flags); 2375 sd = this_cpu_ptr(&softnet_data); 2376 q->next_sched = NULL; 2377 *sd->output_queue_tailp = q; 2378 sd->output_queue_tailp = &q->next_sched; 2379 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2380 local_irq_restore(flags); 2381 } 2382 2383 void __netif_schedule(struct Qdisc *q) 2384 { 2385 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2386 __netif_reschedule(q); 2387 } 2388 EXPORT_SYMBOL(__netif_schedule); 2389 2390 struct dev_kfree_skb_cb { 2391 enum skb_free_reason reason; 2392 }; 2393 2394 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2395 { 2396 return (struct dev_kfree_skb_cb *)skb->cb; 2397 } 2398 2399 void netif_schedule_queue(struct netdev_queue *txq) 2400 { 2401 rcu_read_lock(); 2402 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2403 struct Qdisc *q = rcu_dereference(txq->qdisc); 2404 2405 __netif_schedule(q); 2406 } 2407 rcu_read_unlock(); 2408 } 2409 EXPORT_SYMBOL(netif_schedule_queue); 2410 2411 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2412 { 2413 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2414 struct Qdisc *q; 2415 2416 rcu_read_lock(); 2417 q = rcu_dereference(dev_queue->qdisc); 2418 __netif_schedule(q); 2419 rcu_read_unlock(); 2420 } 2421 } 2422 EXPORT_SYMBOL(netif_tx_wake_queue); 2423 2424 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2425 { 2426 unsigned long flags; 2427 2428 if (likely(atomic_read(&skb->users) == 1)) { 2429 smp_rmb(); 2430 atomic_set(&skb->users, 0); 2431 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2432 return; 2433 } 2434 get_kfree_skb_cb(skb)->reason = reason; 2435 local_irq_save(flags); 2436 skb->next = __this_cpu_read(softnet_data.completion_queue); 2437 __this_cpu_write(softnet_data.completion_queue, skb); 2438 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2439 local_irq_restore(flags); 2440 } 2441 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2442 2443 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2444 { 2445 if (in_irq() || irqs_disabled()) 2446 __dev_kfree_skb_irq(skb, reason); 2447 else 2448 dev_kfree_skb(skb); 2449 } 2450 EXPORT_SYMBOL(__dev_kfree_skb_any); 2451 2452 2453 /** 2454 * netif_device_detach - mark device as removed 2455 * @dev: network device 2456 * 2457 * Mark device as removed from system and therefore no longer available. 2458 */ 2459 void netif_device_detach(struct net_device *dev) 2460 { 2461 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2462 netif_running(dev)) { 2463 netif_tx_stop_all_queues(dev); 2464 } 2465 } 2466 EXPORT_SYMBOL(netif_device_detach); 2467 2468 /** 2469 * netif_device_attach - mark device as attached 2470 * @dev: network device 2471 * 2472 * Mark device as attached from system and restart if needed. 2473 */ 2474 void netif_device_attach(struct net_device *dev) 2475 { 2476 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2477 netif_running(dev)) { 2478 netif_tx_wake_all_queues(dev); 2479 __netdev_watchdog_up(dev); 2480 } 2481 } 2482 EXPORT_SYMBOL(netif_device_attach); 2483 2484 /* 2485 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 2486 * to be used as a distribution range. 
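 *
 * When the sender has recorded an rx queue on the skb, that queue index
 * is simply folded into the tx queue range.  Otherwise the flow hash from
 * skb_get_hash() is scaled into the range with reciprocal_scale(), i.e.
 * ((u64)hash * qcount) >> 32, and offset by the first queue of the skb's
 * traffic class.  For example (illustrative numbers), with qcount == 4
 * and a flow hash of 0x80000000 the result is qoffset + 2.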
2487 */ 2488 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2489 unsigned int num_tx_queues) 2490 { 2491 u32 hash; 2492 u16 qoffset = 0; 2493 u16 qcount = num_tx_queues; 2494 2495 if (skb_rx_queue_recorded(skb)) { 2496 hash = skb_get_rx_queue(skb); 2497 while (unlikely(hash >= num_tx_queues)) 2498 hash -= num_tx_queues; 2499 return hash; 2500 } 2501 2502 if (dev->num_tc) { 2503 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2504 qoffset = dev->tc_to_txq[tc].offset; 2505 qcount = dev->tc_to_txq[tc].count; 2506 } 2507 2508 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2509 } 2510 EXPORT_SYMBOL(__skb_tx_hash); 2511 2512 static void skb_warn_bad_offload(const struct sk_buff *skb) 2513 { 2514 static const netdev_features_t null_features; 2515 struct net_device *dev = skb->dev; 2516 const char *name = ""; 2517 2518 if (!net_ratelimit()) 2519 return; 2520 2521 if (dev) { 2522 if (dev->dev.parent) 2523 name = dev_driver_string(dev->dev.parent); 2524 else 2525 name = netdev_name(dev); 2526 } 2527 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2528 "gso_type=%d ip_summed=%d\n", 2529 name, dev ? &dev->features : &null_features, 2530 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2531 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2532 skb_shinfo(skb)->gso_type, skb->ip_summed); 2533 } 2534 2535 /* 2536 * Invalidate hardware checksum when packet is to be mangled, and 2537 * complete checksum manually on outgoing path. 2538 */ 2539 int skb_checksum_help(struct sk_buff *skb) 2540 { 2541 __wsum csum; 2542 int ret = 0, offset; 2543 2544 if (skb->ip_summed == CHECKSUM_COMPLETE) 2545 goto out_set_summed; 2546 2547 if (unlikely(skb_shinfo(skb)->gso_size)) { 2548 skb_warn_bad_offload(skb); 2549 return -EINVAL; 2550 } 2551 2552 /* Before computing a checksum, we should make sure no frag could 2553 * be modified by an external entity : checksum could be wrong. 2554 */ 2555 if (skb_has_shared_frag(skb)) { 2556 ret = __skb_linearize(skb); 2557 if (ret) 2558 goto out; 2559 } 2560 2561 offset = skb_checksum_start_offset(skb); 2562 BUG_ON(offset >= skb_headlen(skb)); 2563 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2564 2565 offset += skb->csum_offset; 2566 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2567 2568 if (skb_cloned(skb) && 2569 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2570 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2571 if (ret) 2572 goto out; 2573 } 2574 2575 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; 2576 out_set_summed: 2577 skb->ip_summed = CHECKSUM_NONE; 2578 out: 2579 return ret; 2580 } 2581 EXPORT_SYMBOL(skb_checksum_help); 2582 2583 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2584 { 2585 __be16 type = skb->protocol; 2586 2587 /* Tunnel gso handlers can set protocol to ethernet. */ 2588 if (type == htons(ETH_P_TEB)) { 2589 struct ethhdr *eth; 2590 2591 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2592 return 0; 2593 2594 eth = (struct ethhdr *)skb_mac_header(skb); 2595 type = eth->h_proto; 2596 } 2597 2598 return __vlan_get_protocol(skb, type, depth); 2599 } 2600 2601 /** 2602 * skb_mac_gso_segment - mac layer segmentation handler. 
2603 * @skb: buffer to segment 2604 * @features: features for the output path (see dev->features) 2605 */ 2606 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2607 netdev_features_t features) 2608 { 2609 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2610 struct packet_offload *ptype; 2611 int vlan_depth = skb->mac_len; 2612 __be16 type = skb_network_protocol(skb, &vlan_depth); 2613 2614 if (unlikely(!type)) 2615 return ERR_PTR(-EINVAL); 2616 2617 __skb_pull(skb, vlan_depth); 2618 2619 rcu_read_lock(); 2620 list_for_each_entry_rcu(ptype, &offload_base, list) { 2621 if (ptype->type == type && ptype->callbacks.gso_segment) { 2622 segs = ptype->callbacks.gso_segment(skb, features); 2623 break; 2624 } 2625 } 2626 rcu_read_unlock(); 2627 2628 __skb_push(skb, skb->data - skb_mac_header(skb)); 2629 2630 return segs; 2631 } 2632 EXPORT_SYMBOL(skb_mac_gso_segment); 2633 2634 2635 /* openvswitch calls this on rx path, so we need a different check. 2636 */ 2637 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2638 { 2639 if (tx_path) 2640 return skb->ip_summed != CHECKSUM_PARTIAL; 2641 else 2642 return skb->ip_summed == CHECKSUM_NONE; 2643 } 2644 2645 /** 2646 * __skb_gso_segment - Perform segmentation on skb. 2647 * @skb: buffer to segment 2648 * @features: features for the output path (see dev->features) 2649 * @tx_path: whether it is called in TX path 2650 * 2651 * This function segments the given skb and returns a list of segments. 2652 * 2653 * It may return NULL if the skb requires no segmentation. This is 2654 * only possible when GSO is used for verifying header integrity. 2655 * 2656 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. 2657 */ 2658 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2659 netdev_features_t features, bool tx_path) 2660 { 2661 struct sk_buff *segs; 2662 2663 if (unlikely(skb_needs_check(skb, tx_path))) { 2664 int err; 2665 2666 /* We're going to init ->check field in TCP or UDP header */ 2667 err = skb_cow_head(skb, 0); 2668 if (err < 0) 2669 return ERR_PTR(err); 2670 } 2671 2672 /* Only report GSO partial support if it will enable us to 2673 * support segmentation on this frame without needing additional 2674 * work. 2675 */ 2676 if (features & NETIF_F_GSO_PARTIAL) { 2677 netdev_features_t partial_features = NETIF_F_GSO_ROBUST; 2678 struct net_device *dev = skb->dev; 2679 2680 partial_features |= dev->features & dev->gso_partial_features; 2681 if (!skb_gso_ok(skb, features | partial_features)) 2682 features &= ~NETIF_F_GSO_PARTIAL; 2683 } 2684 2685 BUILD_BUG_ON(SKB_SGO_CB_OFFSET + 2686 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); 2687 2688 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2689 SKB_GSO_CB(skb)->encap_level = 0; 2690 2691 skb_reset_mac_header(skb); 2692 skb_reset_mac_len(skb); 2693 2694 segs = skb_mac_gso_segment(skb, features); 2695 2696 if (unlikely(skb_needs_check(skb, tx_path))) 2697 skb_warn_bad_offload(skb); 2698 2699 return segs; 2700 } 2701 EXPORT_SYMBOL(__skb_gso_segment); 2702 2703 /* Take action when hardware reception checksum errors are detected. */ 2704 #ifdef CONFIG_BUG 2705 void netdev_rx_csum_fault(struct net_device *dev) 2706 { 2707 if (net_ratelimit()) { 2708 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2709 dump_stack(); 2710 } 2711 } 2712 EXPORT_SYMBOL(netdev_rx_csum_fault); 2713 #endif 2714 2715 /* Actually, we should eliminate this check as soon as we know, that: 2716 * 1. IOMMU is present and allows to map all the memory. 2717 * 2. 
No high memory really exists on this machine. 2718 */ 2719 2720 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2721 { 2722 #ifdef CONFIG_HIGHMEM 2723 int i; 2724 if (!(dev->features & NETIF_F_HIGHDMA)) { 2725 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2726 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2727 if (PageHighMem(skb_frag_page(frag))) 2728 return 1; 2729 } 2730 } 2731 2732 if (PCI_DMA_BUS_IS_PHYS) { 2733 struct device *pdev = dev->dev.parent; 2734 2735 if (!pdev) 2736 return 0; 2737 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2738 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2739 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2740 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2741 return 1; 2742 } 2743 } 2744 #endif 2745 return 0; 2746 } 2747 2748 /* If MPLS offload request, verify we are testing hardware MPLS features 2749 * instead of standard features for the netdev. 2750 */ 2751 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2752 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2753 netdev_features_t features, 2754 __be16 type) 2755 { 2756 if (eth_p_mpls(type)) 2757 features &= skb->dev->mpls_features; 2758 2759 return features; 2760 } 2761 #else 2762 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2763 netdev_features_t features, 2764 __be16 type) 2765 { 2766 return features; 2767 } 2768 #endif 2769 2770 static netdev_features_t harmonize_features(struct sk_buff *skb, 2771 netdev_features_t features) 2772 { 2773 int tmp; 2774 __be16 type; 2775 2776 type = skb_network_protocol(skb, &tmp); 2777 features = net_mpls_features(skb, features, type); 2778 2779 if (skb->ip_summed != CHECKSUM_NONE && 2780 !can_checksum_protocol(features, type)) { 2781 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); 2782 } 2783 if (illegal_highdma(skb->dev, skb)) 2784 features &= ~NETIF_F_SG; 2785 2786 return features; 2787 } 2788 2789 netdev_features_t passthru_features_check(struct sk_buff *skb, 2790 struct net_device *dev, 2791 netdev_features_t features) 2792 { 2793 return features; 2794 } 2795 EXPORT_SYMBOL(passthru_features_check); 2796 2797 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2798 struct net_device *dev, 2799 netdev_features_t features) 2800 { 2801 return vlan_features_check(skb, features); 2802 } 2803 2804 static netdev_features_t gso_features_check(const struct sk_buff *skb, 2805 struct net_device *dev, 2806 netdev_features_t features) 2807 { 2808 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2809 2810 if (gso_segs > dev->gso_max_segs) 2811 return features & ~NETIF_F_GSO_MASK; 2812 2813 /* Support for GSO partial features requires software 2814 * intervention before we can actually process the packets 2815 * so we need to strip support for any partial features now 2816 * and we can pull them back in after we have partially 2817 * segmented the frame. 2818 */ 2819 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) 2820 features &= ~dev->gso_partial_features; 2821 2822 /* Make sure to clear the IPv4 ID mangling feature if the 2823 * IPv4 header has the potential to be fragmented. 2824 */ 2825 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2826 struct iphdr *iph = skb->encapsulation ? 
2827 inner_ip_hdr(skb) : ip_hdr(skb); 2828 2829 if (!(iph->frag_off & htons(IP_DF))) 2830 features &= ~NETIF_F_TSO_MANGLEID; 2831 } 2832 2833 return features; 2834 } 2835 2836 netdev_features_t netif_skb_features(struct sk_buff *skb) 2837 { 2838 struct net_device *dev = skb->dev; 2839 netdev_features_t features = dev->features; 2840 2841 if (skb_is_gso(skb)) 2842 features = gso_features_check(skb, dev, features); 2843 2844 /* If encapsulation offload request, verify we are testing 2845 * hardware encapsulation features instead of standard 2846 * features for the netdev 2847 */ 2848 if (skb->encapsulation) 2849 features &= dev->hw_enc_features; 2850 2851 if (skb_vlan_tagged(skb)) 2852 features = netdev_intersect_features(features, 2853 dev->vlan_features | 2854 NETIF_F_HW_VLAN_CTAG_TX | 2855 NETIF_F_HW_VLAN_STAG_TX); 2856 2857 if (dev->netdev_ops->ndo_features_check) 2858 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2859 features); 2860 else 2861 features &= dflt_features_check(skb, dev, features); 2862 2863 return harmonize_features(skb, features); 2864 } 2865 EXPORT_SYMBOL(netif_skb_features); 2866 2867 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2868 struct netdev_queue *txq, bool more) 2869 { 2870 unsigned int len; 2871 int rc; 2872 2873 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2874 dev_queue_xmit_nit(skb, dev); 2875 2876 len = skb->len; 2877 trace_net_dev_start_xmit(skb, dev); 2878 rc = netdev_start_xmit(skb, dev, txq, more); 2879 trace_net_dev_xmit(skb, rc, dev, len); 2880 2881 return rc; 2882 } 2883 2884 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2885 struct netdev_queue *txq, int *ret) 2886 { 2887 struct sk_buff *skb = first; 2888 int rc = NETDEV_TX_OK; 2889 2890 while (skb) { 2891 struct sk_buff *next = skb->next; 2892 2893 skb->next = NULL; 2894 rc = xmit_one(skb, dev, txq, next != NULL); 2895 if (unlikely(!dev_xmit_complete(rc))) { 2896 skb->next = next; 2897 goto out; 2898 } 2899 2900 skb = next; 2901 if (netif_xmit_stopped(txq) && skb) { 2902 rc = NETDEV_TX_BUSY; 2903 break; 2904 } 2905 } 2906 2907 out: 2908 *ret = rc; 2909 return skb; 2910 } 2911 2912 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2913 netdev_features_t features) 2914 { 2915 if (skb_vlan_tag_present(skb) && 2916 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2917 skb = __vlan_hwaccel_push_inside(skb); 2918 return skb; 2919 } 2920 2921 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2922 { 2923 netdev_features_t features; 2924 2925 features = netif_skb_features(skb); 2926 skb = validate_xmit_vlan(skb, features); 2927 if (unlikely(!skb)) 2928 goto out_null; 2929 2930 if (netif_needs_gso(skb, features)) { 2931 struct sk_buff *segs; 2932 2933 segs = skb_gso_segment(skb, features); 2934 if (IS_ERR(segs)) { 2935 goto out_kfree_skb; 2936 } else if (segs) { 2937 consume_skb(skb); 2938 skb = segs; 2939 } 2940 } else { 2941 if (skb_needs_linearize(skb, features) && 2942 __skb_linearize(skb)) 2943 goto out_kfree_skb; 2944 2945 /* If packet is not checksummed and device does not 2946 * support checksumming for this protocol, complete 2947 * checksumming here. 
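 *
 * For CHECKSUM_PARTIAL skbs, csum_start/csum_offset say where the
 * device was expected to write the checksum (for TCP, the 'check'
 * field of the TCP header).  When the device lacks the required
 * NETIF_F_*_CSUM feature, skb_checksum_help() folds the checksum in
 * software and downgrades the skb to CHECKSUM_NONE.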
2948 */ 2949 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2950 if (skb->encapsulation) 2951 skb_set_inner_transport_header(skb, 2952 skb_checksum_start_offset(skb)); 2953 else 2954 skb_set_transport_header(skb, 2955 skb_checksum_start_offset(skb)); 2956 if (!(features & NETIF_F_CSUM_MASK) && 2957 skb_checksum_help(skb)) 2958 goto out_kfree_skb; 2959 } 2960 } 2961 2962 return skb; 2963 2964 out_kfree_skb: 2965 kfree_skb(skb); 2966 out_null: 2967 atomic_long_inc(&dev->tx_dropped); 2968 return NULL; 2969 } 2970 2971 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 2972 { 2973 struct sk_buff *next, *head = NULL, *tail; 2974 2975 for (; skb != NULL; skb = next) { 2976 next = skb->next; 2977 skb->next = NULL; 2978 2979 /* in case skb wont be segmented, point to itself */ 2980 skb->prev = skb; 2981 2982 skb = validate_xmit_skb(skb, dev); 2983 if (!skb) 2984 continue; 2985 2986 if (!head) 2987 head = skb; 2988 else 2989 tail->next = skb; 2990 /* If skb was segmented, skb->prev points to 2991 * the last segment. If not, it still contains skb. 2992 */ 2993 tail = skb->prev; 2994 } 2995 return head; 2996 } 2997 EXPORT_SYMBOL_GPL(validate_xmit_skb_list); 2998 2999 static void qdisc_pkt_len_init(struct sk_buff *skb) 3000 { 3001 const struct skb_shared_info *shinfo = skb_shinfo(skb); 3002 3003 qdisc_skb_cb(skb)->pkt_len = skb->len; 3004 3005 /* To get more precise estimation of bytes sent on wire, 3006 * we add to pkt_len the headers size of all segments 3007 */ 3008 if (shinfo->gso_size) { 3009 unsigned int hdr_len; 3010 u16 gso_segs = shinfo->gso_segs; 3011 3012 /* mac layer + network layer */ 3013 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 3014 3015 /* + transport layer */ 3016 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 3017 hdr_len += tcp_hdrlen(skb); 3018 else 3019 hdr_len += sizeof(struct udphdr); 3020 3021 if (shinfo->gso_type & SKB_GSO_DODGY) 3022 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 3023 shinfo->gso_size); 3024 3025 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 3026 } 3027 } 3028 3029 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 3030 struct net_device *dev, 3031 struct netdev_queue *txq) 3032 { 3033 spinlock_t *root_lock = qdisc_lock(q); 3034 struct sk_buff *to_free = NULL; 3035 bool contended; 3036 int rc; 3037 3038 qdisc_calculate_pkt_len(skb, q); 3039 /* 3040 * Heuristic to force contended enqueues to serialize on a 3041 * separate lock before trying to get qdisc main lock. 3042 * This permits qdisc->running owner to get the lock more 3043 * often and dequeue packets faster. 3044 */ 3045 contended = qdisc_is_running(q); 3046 if (unlikely(contended)) 3047 spin_lock(&q->busylock); 3048 3049 spin_lock(root_lock); 3050 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 3051 __qdisc_drop(skb, &to_free); 3052 rc = NET_XMIT_DROP; 3053 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 3054 qdisc_run_begin(q)) { 3055 /* 3056 * This is a work-conserving queue; there are no old skbs 3057 * waiting to be sent out; and the qdisc is not running - 3058 * xmit the skb directly. 
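 *
 * (This is the TCQ_F_CAN_BYPASS fast path: with an empty, idle qdisc
 * the skb is handed straight to sch_direct_xmit() and the
 * enqueue/dequeue round trip is skipped; byte/packet stats are still
 * accounted via qdisc_bstats_update() so qdisc statistics and rate
 * estimation stay correct.)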
3059 */ 3060 3061 qdisc_bstats_update(q, skb); 3062 3063 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 3064 if (unlikely(contended)) { 3065 spin_unlock(&q->busylock); 3066 contended = false; 3067 } 3068 __qdisc_run(q); 3069 } else 3070 qdisc_run_end(q); 3071 3072 rc = NET_XMIT_SUCCESS; 3073 } else { 3074 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; 3075 if (qdisc_run_begin(q)) { 3076 if (unlikely(contended)) { 3077 spin_unlock(&q->busylock); 3078 contended = false; 3079 } 3080 __qdisc_run(q); 3081 } 3082 } 3083 spin_unlock(root_lock); 3084 if (unlikely(to_free)) 3085 kfree_skb_list(to_free); 3086 if (unlikely(contended)) 3087 spin_unlock(&q->busylock); 3088 return rc; 3089 } 3090 3091 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 3092 static void skb_update_prio(struct sk_buff *skb) 3093 { 3094 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 3095 3096 if (!skb->priority && skb->sk && map) { 3097 unsigned int prioidx = 3098 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); 3099 3100 if (prioidx < map->priomap_len) 3101 skb->priority = map->priomap[prioidx]; 3102 } 3103 } 3104 #else 3105 #define skb_update_prio(skb) 3106 #endif 3107 3108 DEFINE_PER_CPU(int, xmit_recursion); 3109 EXPORT_SYMBOL(xmit_recursion); 3110 3111 /** 3112 * dev_loopback_xmit - loop back @skb 3113 * @net: network namespace this loopback is happening in 3114 * @sk: sk needed to be a netfilter okfn 3115 * @skb: buffer to transmit 3116 */ 3117 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 3118 { 3119 skb_reset_mac_header(skb); 3120 __skb_pull(skb, skb_network_offset(skb)); 3121 skb->pkt_type = PACKET_LOOPBACK; 3122 skb->ip_summed = CHECKSUM_UNNECESSARY; 3123 WARN_ON(!skb_dst(skb)); 3124 skb_dst_force(skb); 3125 netif_rx_ni(skb); 3126 return 0; 3127 } 3128 EXPORT_SYMBOL(dev_loopback_xmit); 3129 3130 #ifdef CONFIG_NET_EGRESS 3131 static struct sk_buff * 3132 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) 3133 { 3134 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); 3135 struct tcf_result cl_res; 3136 3137 if (!cl) 3138 return skb; 3139 3140 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ 3141 qdisc_bstats_cpu_update(cl->q, skb); 3142 3143 switch (tc_classify(skb, cl, &cl_res, false)) { 3144 case TC_ACT_OK: 3145 case TC_ACT_RECLASSIFY: 3146 skb->tc_index = TC_H_MIN(cl_res.classid); 3147 break; 3148 case TC_ACT_SHOT: 3149 qdisc_qstats_cpu_drop(cl->q); 3150 *ret = NET_XMIT_DROP; 3151 kfree_skb(skb); 3152 return NULL; 3153 case TC_ACT_STOLEN: 3154 case TC_ACT_QUEUED: 3155 *ret = NET_XMIT_SUCCESS; 3156 consume_skb(skb); 3157 return NULL; 3158 case TC_ACT_REDIRECT: 3159 /* No need to push/pop skb's mac_header here on egress! 
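 * On the egress path skb->data still points at the link-layer header
 * built by the caller, so the skb can be handed to skb_do_redirect()
 * as is; contrast this with sch_handle_ingress(), where the MAC
 * header has already been pulled and must be pushed back first.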
*/ 3160 skb_do_redirect(skb); 3161 *ret = NET_XMIT_SUCCESS; 3162 return NULL; 3163 default: 3164 break; 3165 } 3166 3167 return skb; 3168 } 3169 #endif /* CONFIG_NET_EGRESS */ 3170 3171 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 3172 { 3173 #ifdef CONFIG_XPS 3174 struct xps_dev_maps *dev_maps; 3175 struct xps_map *map; 3176 int queue_index = -1; 3177 3178 rcu_read_lock(); 3179 dev_maps = rcu_dereference(dev->xps_maps); 3180 if (dev_maps) { 3181 unsigned int tci = skb->sender_cpu - 1; 3182 3183 if (dev->num_tc) { 3184 tci *= dev->num_tc; 3185 tci += netdev_get_prio_tc_map(dev, skb->priority); 3186 } 3187 3188 map = rcu_dereference(dev_maps->cpu_map[tci]); 3189 if (map) { 3190 if (map->len == 1) 3191 queue_index = map->queues[0]; 3192 else 3193 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 3194 map->len)]; 3195 if (unlikely(queue_index >= dev->real_num_tx_queues)) 3196 queue_index = -1; 3197 } 3198 } 3199 rcu_read_unlock(); 3200 3201 return queue_index; 3202 #else 3203 return -1; 3204 #endif 3205 } 3206 3207 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 3208 { 3209 struct sock *sk = skb->sk; 3210 int queue_index = sk_tx_queue_get(sk); 3211 3212 if (queue_index < 0 || skb->ooo_okay || 3213 queue_index >= dev->real_num_tx_queues) { 3214 int new_index = get_xps_queue(dev, skb); 3215 if (new_index < 0) 3216 new_index = skb_tx_hash(dev, skb); 3217 3218 if (queue_index != new_index && sk && 3219 sk_fullsock(sk) && 3220 rcu_access_pointer(sk->sk_dst_cache)) 3221 sk_tx_queue_set(sk, new_index); 3222 3223 queue_index = new_index; 3224 } 3225 3226 return queue_index; 3227 } 3228 3229 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 3230 struct sk_buff *skb, 3231 void *accel_priv) 3232 { 3233 int queue_index = 0; 3234 3235 #ifdef CONFIG_XPS 3236 u32 sender_cpu = skb->sender_cpu - 1; 3237 3238 if (sender_cpu >= (u32)NR_CPUS) 3239 skb->sender_cpu = raw_smp_processor_id() + 1; 3240 #endif 3241 3242 if (dev->real_num_tx_queues != 1) { 3243 const struct net_device_ops *ops = dev->netdev_ops; 3244 if (ops->ndo_select_queue) 3245 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3246 __netdev_pick_tx); 3247 else 3248 queue_index = __netdev_pick_tx(dev, skb); 3249 3250 if (!accel_priv) 3251 queue_index = netdev_cap_txqueue(dev, queue_index); 3252 } 3253 3254 skb_set_queue_mapping(skb, queue_index); 3255 return netdev_get_tx_queue(dev, queue_index); 3256 } 3257 3258 /** 3259 * __dev_queue_xmit - transmit a buffer 3260 * @skb: buffer to transmit 3261 * @accel_priv: private data used for L2 forwarding offload 3262 * 3263 * Queue a buffer for transmission to a network device. The caller must 3264 * have set the device and priority and built the buffer before calling 3265 * this function. The function can be called from an interrupt. 3266 * 3267 * A negative errno code is returned on a failure. A success does not 3268 * guarantee the frame will be transmitted as it may be dropped due 3269 * to congestion or traffic shaping. 3270 * 3271 * ----------------------------------------------------------------------------------- 3272 * I notice this method can also return errors from the queue disciplines, 3273 * including NET_XMIT_DROP, which is a positive value. So, errors can also 3274 * be positive. 3275 * 3276 * Regardless of the return value, the skb is consumed, so it is currently 3277 * difficult to retry a send to this method. 
(You can bump the ref count 3278 * before sending to hold a reference for retry if you are careful.) 3279 * 3280 * When calling this method, interrupts MUST be enabled. This is because 3281 * the BH enable code must have IRQs enabled so that it will not deadlock. 3282 * --BLG 3283 */ 3284 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 3285 { 3286 struct net_device *dev = skb->dev; 3287 struct netdev_queue *txq; 3288 struct Qdisc *q; 3289 int rc = -ENOMEM; 3290 3291 skb_reset_mac_header(skb); 3292 3293 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 3294 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 3295 3296 /* Disable soft irqs for various locks below. Also 3297 * stops preemption for RCU. 3298 */ 3299 rcu_read_lock_bh(); 3300 3301 skb_update_prio(skb); 3302 3303 qdisc_pkt_len_init(skb); 3304 #ifdef CONFIG_NET_CLS_ACT 3305 skb->tc_at_ingress = 0; 3306 # ifdef CONFIG_NET_EGRESS 3307 if (static_key_false(&egress_needed)) { 3308 skb = sch_handle_egress(skb, &rc, dev); 3309 if (!skb) 3310 goto out; 3311 } 3312 # endif 3313 #endif 3314 /* If device/qdisc don't need skb->dst, release it right now while 3315 * its hot in this cpu cache. 3316 */ 3317 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 3318 skb_dst_drop(skb); 3319 else 3320 skb_dst_force(skb); 3321 3322 txq = netdev_pick_tx(dev, skb, accel_priv); 3323 q = rcu_dereference_bh(txq->qdisc); 3324 3325 trace_net_dev_queue(skb); 3326 if (q->enqueue) { 3327 rc = __dev_xmit_skb(skb, q, dev, txq); 3328 goto out; 3329 } 3330 3331 /* The device has no queue. Common case for software devices: 3332 loopback, all the sorts of tunnels... 3333 3334 Really, it is unlikely that netif_tx_lock protection is necessary 3335 here. (f.e. loopback and IP tunnels are clean ignoring statistics 3336 counters.) 3337 However, it is possible, that they rely on protection 3338 made by us here. 3339 3340 Check this and shot the lock. It is not prone from deadlocks. 3341 Either shot noqueue qdisc, it is even simpler 8) 3342 */ 3343 if (dev->flags & IFF_UP) { 3344 int cpu = smp_processor_id(); /* ok because BHs are off */ 3345 3346 if (txq->xmit_lock_owner != cpu) { 3347 if (unlikely(__this_cpu_read(xmit_recursion) > 3348 XMIT_RECURSION_LIMIT)) 3349 goto recursion_alert; 3350 3351 skb = validate_xmit_skb(skb, dev); 3352 if (!skb) 3353 goto out; 3354 3355 HARD_TX_LOCK(dev, txq, cpu); 3356 3357 if (!netif_xmit_stopped(txq)) { 3358 __this_cpu_inc(xmit_recursion); 3359 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 3360 __this_cpu_dec(xmit_recursion); 3361 if (dev_xmit_complete(rc)) { 3362 HARD_TX_UNLOCK(dev, txq); 3363 goto out; 3364 } 3365 } 3366 HARD_TX_UNLOCK(dev, txq); 3367 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 3368 dev->name); 3369 } else { 3370 /* Recursion is detected! 
It is possible, 3371 * unfortunately 3372 */ 3373 recursion_alert: 3374 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 3375 dev->name); 3376 } 3377 } 3378 3379 rc = -ENETDOWN; 3380 rcu_read_unlock_bh(); 3381 3382 atomic_long_inc(&dev->tx_dropped); 3383 kfree_skb_list(skb); 3384 return rc; 3385 out: 3386 rcu_read_unlock_bh(); 3387 return rc; 3388 } 3389 3390 int dev_queue_xmit(struct sk_buff *skb) 3391 { 3392 return __dev_queue_xmit(skb, NULL); 3393 } 3394 EXPORT_SYMBOL(dev_queue_xmit); 3395 3396 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3397 { 3398 return __dev_queue_xmit(skb, accel_priv); 3399 } 3400 EXPORT_SYMBOL(dev_queue_xmit_accel); 3401 3402 3403 /*======================================================================= 3404 Receiver routines 3405 =======================================================================*/ 3406 3407 int netdev_max_backlog __read_mostly = 1000; 3408 EXPORT_SYMBOL(netdev_max_backlog); 3409 3410 int netdev_tstamp_prequeue __read_mostly = 1; 3411 int netdev_budget __read_mostly = 300; 3412 int weight_p __read_mostly = 64; /* old backlog weight */ 3413 int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ 3414 int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ 3415 int dev_rx_weight __read_mostly = 64; 3416 int dev_tx_weight __read_mostly = 64; 3417 3418 /* Called with irq disabled */ 3419 static inline void ____napi_schedule(struct softnet_data *sd, 3420 struct napi_struct *napi) 3421 { 3422 list_add_tail(&napi->poll_list, &sd->poll_list); 3423 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3424 } 3425 3426 #ifdef CONFIG_RPS 3427 3428 /* One global table that all flow-based protocols share. */ 3429 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3430 EXPORT_SYMBOL(rps_sock_flow_table); 3431 u32 rps_cpu_mask __read_mostly; 3432 EXPORT_SYMBOL(rps_cpu_mask); 3433 3434 struct static_key rps_needed __read_mostly; 3435 EXPORT_SYMBOL(rps_needed); 3436 struct static_key rfs_needed __read_mostly; 3437 EXPORT_SYMBOL(rfs_needed); 3438 3439 static struct rps_dev_flow * 3440 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3441 struct rps_dev_flow *rflow, u16 next_cpu) 3442 { 3443 if (next_cpu < nr_cpu_ids) { 3444 #ifdef CONFIG_RFS_ACCEL 3445 struct netdev_rx_queue *rxqueue; 3446 struct rps_dev_flow_table *flow_table; 3447 struct rps_dev_flow *old_rflow; 3448 u32 flow_id; 3449 u16 rxq_index; 3450 int rc; 3451 3452 /* Should we steer this flow to a different hardware queue? 
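 * This is accelerated RFS: when the driver exports an rx_cpu_rmap and
 * implements ndo_rx_flow_steer() (advertised via NETIF_F_NTUPLE), look
 * up the rx queue whose interrupt is closest to next_cpu and ask the
 * hardware to steer this flow there, remembering the returned filter
 * id so that stale filters can later be expired via
 * rps_may_expire_flow().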
*/ 3453 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3454 !(dev->features & NETIF_F_NTUPLE)) 3455 goto out; 3456 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3457 if (rxq_index == skb_get_rx_queue(skb)) 3458 goto out; 3459 3460 rxqueue = dev->_rx + rxq_index; 3461 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3462 if (!flow_table) 3463 goto out; 3464 flow_id = skb_get_hash(skb) & flow_table->mask; 3465 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3466 rxq_index, flow_id); 3467 if (rc < 0) 3468 goto out; 3469 old_rflow = rflow; 3470 rflow = &flow_table->flows[flow_id]; 3471 rflow->filter = rc; 3472 if (old_rflow->filter == rflow->filter) 3473 old_rflow->filter = RPS_NO_FILTER; 3474 out: 3475 #endif 3476 rflow->last_qtail = 3477 per_cpu(softnet_data, next_cpu).input_queue_head; 3478 } 3479 3480 rflow->cpu = next_cpu; 3481 return rflow; 3482 } 3483 3484 /* 3485 * get_rps_cpu is called from netif_receive_skb and returns the target 3486 * CPU from the RPS map of the receiving queue for a given skb. 3487 * rcu_read_lock must be held on entry. 3488 */ 3489 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3490 struct rps_dev_flow **rflowp) 3491 { 3492 const struct rps_sock_flow_table *sock_flow_table; 3493 struct netdev_rx_queue *rxqueue = dev->_rx; 3494 struct rps_dev_flow_table *flow_table; 3495 struct rps_map *map; 3496 int cpu = -1; 3497 u32 tcpu; 3498 u32 hash; 3499 3500 if (skb_rx_queue_recorded(skb)) { 3501 u16 index = skb_get_rx_queue(skb); 3502 3503 if (unlikely(index >= dev->real_num_rx_queues)) { 3504 WARN_ONCE(dev->real_num_rx_queues > 1, 3505 "%s received packet on queue %u, but number " 3506 "of RX queues is %u\n", 3507 dev->name, index, dev->real_num_rx_queues); 3508 goto done; 3509 } 3510 rxqueue += index; 3511 } 3512 3513 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3514 3515 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3516 map = rcu_dereference(rxqueue->rps_map); 3517 if (!flow_table && !map) 3518 goto done; 3519 3520 skb_reset_network_header(skb); 3521 hash = skb_get_hash(skb); 3522 if (!hash) 3523 goto done; 3524 3525 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3526 if (flow_table && sock_flow_table) { 3527 struct rps_dev_flow *rflow; 3528 u32 next_cpu; 3529 u32 ident; 3530 3531 /* First check into global flow table if there is a match */ 3532 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3533 if ((ident ^ hash) & ~rps_cpu_mask) 3534 goto try_rps; 3535 3536 next_cpu = ident & rps_cpu_mask; 3537 3538 /* OK, now we know there is a match, 3539 * we can look at the local (per receive queue) flow table 3540 */ 3541 rflow = &flow_table->flows[hash & flow_table->mask]; 3542 tcpu = rflow->cpu; 3543 3544 /* 3545 * If the desired CPU (where last recvmsg was done) is 3546 * different from current CPU (one in the rx-queue flow 3547 * table entry), switch if one of the following holds: 3548 * - Current CPU is unset (>= nr_cpu_ids). 3549 * - Current CPU is offline. 3550 * - The current CPU's queue tail has advanced beyond the 3551 * last packet that was enqueued using this table entry. 3552 * This guarantees that all previous packets for the flow 3553 * have been dequeued, thus preserving in order delivery. 
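 *
 * For example (illustrative numbers): if this flow last enqueued at
 * last_qtail == 1000 and the old CPU's input_queue_head has since
 * advanced to 1005, then (int)(1005 - 1000) >= 0, so every packet the
 * entry ever enqueued has been drained and the flow can follow the
 * socket to next_cpu without risking out-of-order delivery.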
3554 */ 3555 if (unlikely(tcpu != next_cpu) && 3556 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || 3557 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3558 rflow->last_qtail)) >= 0)) { 3559 tcpu = next_cpu; 3560 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3561 } 3562 3563 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { 3564 *rflowp = rflow; 3565 cpu = tcpu; 3566 goto done; 3567 } 3568 } 3569 3570 try_rps: 3571 3572 if (map) { 3573 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3574 if (cpu_online(tcpu)) { 3575 cpu = tcpu; 3576 goto done; 3577 } 3578 } 3579 3580 done: 3581 return cpu; 3582 } 3583 3584 #ifdef CONFIG_RFS_ACCEL 3585 3586 /** 3587 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3588 * @dev: Device on which the filter was set 3589 * @rxq_index: RX queue index 3590 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3591 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3592 * 3593 * Drivers that implement ndo_rx_flow_steer() should periodically call 3594 * this function for each installed filter and remove the filters for 3595 * which it returns %true. 3596 */ 3597 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3598 u32 flow_id, u16 filter_id) 3599 { 3600 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3601 struct rps_dev_flow_table *flow_table; 3602 struct rps_dev_flow *rflow; 3603 bool expire = true; 3604 unsigned int cpu; 3605 3606 rcu_read_lock(); 3607 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3608 if (flow_table && flow_id <= flow_table->mask) { 3609 rflow = &flow_table->flows[flow_id]; 3610 cpu = ACCESS_ONCE(rflow->cpu); 3611 if (rflow->filter == filter_id && cpu < nr_cpu_ids && 3612 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3613 rflow->last_qtail) < 3614 (int)(10 * flow_table->mask))) 3615 expire = false; 3616 } 3617 rcu_read_unlock(); 3618 return expire; 3619 } 3620 EXPORT_SYMBOL(rps_may_expire_flow); 3621 3622 #endif /* CONFIG_RFS_ACCEL */ 3623 3624 /* Called from hardirq (IPI) context */ 3625 static void rps_trigger_softirq(void *data) 3626 { 3627 struct softnet_data *sd = data; 3628 3629 ____napi_schedule(sd, &sd->backlog); 3630 sd->received_rps++; 3631 } 3632 3633 #endif /* CONFIG_RPS */ 3634 3635 /* 3636 * Check if this softnet_data structure is another cpu one 3637 * If yes, queue it to our IPI list and return 1 3638 * If no, return 0 3639 */ 3640 static int rps_ipi_queued(struct softnet_data *sd) 3641 { 3642 #ifdef CONFIG_RPS 3643 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3644 3645 if (sd != mysd) { 3646 sd->rps_ipi_next = mysd->rps_ipi_list; 3647 mysd->rps_ipi_list = sd; 3648 3649 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3650 return 1; 3651 } 3652 #endif /* CONFIG_RPS */ 3653 return 0; 3654 } 3655 3656 #ifdef CONFIG_NET_FLOW_LIMIT 3657 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3658 #endif 3659 3660 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3661 { 3662 #ifdef CONFIG_NET_FLOW_LIMIT 3663 struct sd_flow_limit *fl; 3664 struct softnet_data *sd; 3665 unsigned int old_flow, new_flow; 3666 3667 if (qlen < (netdev_max_backlog >> 1)) 3668 return false; 3669 3670 sd = this_cpu_ptr(&softnet_data); 3671 3672 rcu_read_lock(); 3673 fl = rcu_dereference(sd->flow_limit); 3674 if (fl) { 3675 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3676 old_flow = fl->history[fl->history_head]; 3677 fl->history[fl->history_head] = new_flow; 3678 3679 fl->history_head++; 3680 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3681 3682 if 
(likely(fl->buckets[old_flow])) 3683 fl->buckets[old_flow]--; 3684 3685 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3686 fl->count++; 3687 rcu_read_unlock(); 3688 return true; 3689 } 3690 } 3691 rcu_read_unlock(); 3692 #endif 3693 return false; 3694 } 3695 3696 /* 3697 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3698 * queue (may be a remote CPU queue). 3699 */ 3700 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3701 unsigned int *qtail) 3702 { 3703 struct softnet_data *sd; 3704 unsigned long flags; 3705 unsigned int qlen; 3706 3707 sd = &per_cpu(softnet_data, cpu); 3708 3709 local_irq_save(flags); 3710 3711 rps_lock(sd); 3712 if (!netif_running(skb->dev)) 3713 goto drop; 3714 qlen = skb_queue_len(&sd->input_pkt_queue); 3715 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3716 if (qlen) { 3717 enqueue: 3718 __skb_queue_tail(&sd->input_pkt_queue, skb); 3719 input_queue_tail_incr_save(sd, qtail); 3720 rps_unlock(sd); 3721 local_irq_restore(flags); 3722 return NET_RX_SUCCESS; 3723 } 3724 3725 /* Schedule NAPI for backlog device 3726 * We can use non atomic operation since we own the queue lock 3727 */ 3728 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3729 if (!rps_ipi_queued(sd)) 3730 ____napi_schedule(sd, &sd->backlog); 3731 } 3732 goto enqueue; 3733 } 3734 3735 drop: 3736 sd->dropped++; 3737 rps_unlock(sd); 3738 3739 local_irq_restore(flags); 3740 3741 atomic_long_inc(&skb->dev->rx_dropped); 3742 kfree_skb(skb); 3743 return NET_RX_DROP; 3744 } 3745 3746 static int netif_rx_internal(struct sk_buff *skb) 3747 { 3748 int ret; 3749 3750 net_timestamp_check(netdev_tstamp_prequeue, skb); 3751 3752 trace_netif_rx(skb); 3753 #ifdef CONFIG_RPS 3754 if (static_key_false(&rps_needed)) { 3755 struct rps_dev_flow voidflow, *rflow = &voidflow; 3756 int cpu; 3757 3758 preempt_disable(); 3759 rcu_read_lock(); 3760 3761 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3762 if (cpu < 0) 3763 cpu = smp_processor_id(); 3764 3765 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3766 3767 rcu_read_unlock(); 3768 preempt_enable(); 3769 } else 3770 #endif 3771 { 3772 unsigned int qtail; 3773 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3774 put_cpu(); 3775 } 3776 return ret; 3777 } 3778 3779 /** 3780 * netif_rx - post buffer to the network code 3781 * @skb: buffer to post 3782 * 3783 * This function receives a packet from a device driver and queues it for 3784 * the upper (protocol) levels to process. It always succeeds. The buffer 3785 * may be dropped during processing for congestion control or by the 3786 * protocol layers. 
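 *
 * netif_rx() merely queues the buffer on a per-CPU backlog (or, when
 * RPS is enabled, on the backlog of the CPU chosen by get_rps_cpu())
 * and raises NET_RX_SOFTIRQ; protocol processing happens later in
 * softirq context.  It may be called from hard interrupt context.
 * Callers running in process context should prefer netif_rx_ni(),
 * which also kicks any softirq it raised (see below).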
3787 * 3788 * return values: 3789 * NET_RX_SUCCESS (no congestion) 3790 * NET_RX_DROP (packet was dropped) 3791 * 3792 */ 3793 3794 int netif_rx(struct sk_buff *skb) 3795 { 3796 trace_netif_rx_entry(skb); 3797 3798 return netif_rx_internal(skb); 3799 } 3800 EXPORT_SYMBOL(netif_rx); 3801 3802 int netif_rx_ni(struct sk_buff *skb) 3803 { 3804 int err; 3805 3806 trace_netif_rx_ni_entry(skb); 3807 3808 preempt_disable(); 3809 err = netif_rx_internal(skb); 3810 if (local_softirq_pending()) 3811 do_softirq(); 3812 preempt_enable(); 3813 3814 return err; 3815 } 3816 EXPORT_SYMBOL(netif_rx_ni); 3817 3818 static __latent_entropy void net_tx_action(struct softirq_action *h) 3819 { 3820 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3821 3822 if (sd->completion_queue) { 3823 struct sk_buff *clist; 3824 3825 local_irq_disable(); 3826 clist = sd->completion_queue; 3827 sd->completion_queue = NULL; 3828 local_irq_enable(); 3829 3830 while (clist) { 3831 struct sk_buff *skb = clist; 3832 clist = clist->next; 3833 3834 WARN_ON(atomic_read(&skb->users)); 3835 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3836 trace_consume_skb(skb); 3837 else 3838 trace_kfree_skb(skb, net_tx_action); 3839 3840 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) 3841 __kfree_skb(skb); 3842 else 3843 __kfree_skb_defer(skb); 3844 } 3845 3846 __kfree_skb_flush(); 3847 } 3848 3849 if (sd->output_queue) { 3850 struct Qdisc *head; 3851 3852 local_irq_disable(); 3853 head = sd->output_queue; 3854 sd->output_queue = NULL; 3855 sd->output_queue_tailp = &sd->output_queue; 3856 local_irq_enable(); 3857 3858 while (head) { 3859 struct Qdisc *q = head; 3860 spinlock_t *root_lock; 3861 3862 head = head->next_sched; 3863 3864 root_lock = qdisc_lock(q); 3865 spin_lock(root_lock); 3866 /* We need to make sure head->next_sched is read 3867 * before clearing __QDISC_STATE_SCHED 3868 */ 3869 smp_mb__before_atomic(); 3870 clear_bit(__QDISC_STATE_SCHED, &q->state); 3871 qdisc_run(q); 3872 spin_unlock(root_lock); 3873 } 3874 } 3875 } 3876 3877 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) 3878 /* This hook is defined here for ATM LANE */ 3879 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3880 unsigned char *addr) __read_mostly; 3881 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3882 #endif 3883 3884 static inline struct sk_buff * 3885 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, 3886 struct net_device *orig_dev) 3887 { 3888 #ifdef CONFIG_NET_CLS_ACT 3889 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); 3890 struct tcf_result cl_res; 3891 3892 /* If there's at least one ingress present somewhere (so 3893 * we get here via enabled static key), remaining devices 3894 * that are not configured with an ingress qdisc will bail 3895 * out here. 
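 *
 * A device only has a non-NULL ingress_cl_list once a classifier has
 * been attached to its ingress/clsact qdisc, e.g. (interface name
 * illustrative, exact filter syntax depends on the tc version):
 *
 *	tc qdisc add dev eth0 clsact
 *	tc filter add dev eth0 ingress matchall action drop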
3896 */ 3897 if (!cl) 3898 return skb; 3899 if (*pt_prev) { 3900 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3901 *pt_prev = NULL; 3902 } 3903 3904 qdisc_skb_cb(skb)->pkt_len = skb->len; 3905 skb->tc_at_ingress = 1; 3906 qdisc_bstats_cpu_update(cl->q, skb); 3907 3908 switch (tc_classify(skb, cl, &cl_res, false)) { 3909 case TC_ACT_OK: 3910 case TC_ACT_RECLASSIFY: 3911 skb->tc_index = TC_H_MIN(cl_res.classid); 3912 break; 3913 case TC_ACT_SHOT: 3914 qdisc_qstats_cpu_drop(cl->q); 3915 kfree_skb(skb); 3916 return NULL; 3917 case TC_ACT_STOLEN: 3918 case TC_ACT_QUEUED: 3919 consume_skb(skb); 3920 return NULL; 3921 case TC_ACT_REDIRECT: 3922 /* skb_mac_header check was done by cls/act_bpf, so 3923 * we can safely push the L2 header back before 3924 * redirecting to another netdev 3925 */ 3926 __skb_push(skb, skb->mac_len); 3927 skb_do_redirect(skb); 3928 return NULL; 3929 default: 3930 break; 3931 } 3932 #endif /* CONFIG_NET_CLS_ACT */ 3933 return skb; 3934 } 3935 3936 /** 3937 * netdev_is_rx_handler_busy - check if receive handler is registered 3938 * @dev: device to check 3939 * 3940 * Check if a receive handler is already registered for a given device. 3941 * Return true if there one. 3942 * 3943 * The caller must hold the rtnl_mutex. 3944 */ 3945 bool netdev_is_rx_handler_busy(struct net_device *dev) 3946 { 3947 ASSERT_RTNL(); 3948 return dev && rtnl_dereference(dev->rx_handler); 3949 } 3950 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); 3951 3952 /** 3953 * netdev_rx_handler_register - register receive handler 3954 * @dev: device to register a handler for 3955 * @rx_handler: receive handler to register 3956 * @rx_handler_data: data pointer that is used by rx handler 3957 * 3958 * Register a receive handler for a device. This handler will then be 3959 * called from __netif_receive_skb. A negative errno code is returned 3960 * on a failure. 3961 * 3962 * The caller must hold the rtnl_mutex. 3963 * 3964 * For a general description of rx_handler, see enum rx_handler_result. 3965 */ 3966 int netdev_rx_handler_register(struct net_device *dev, 3967 rx_handler_func_t *rx_handler, 3968 void *rx_handler_data) 3969 { 3970 if (netdev_is_rx_handler_busy(dev)) 3971 return -EBUSY; 3972 3973 /* Note: rx_handler_data must be set before rx_handler */ 3974 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 3975 rcu_assign_pointer(dev->rx_handler, rx_handler); 3976 3977 return 0; 3978 } 3979 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 3980 3981 /** 3982 * netdev_rx_handler_unregister - unregister receive handler 3983 * @dev: device to unregister a handler from 3984 * 3985 * Unregister a receive handler from a device. 3986 * 3987 * The caller must hold the rtnl_mutex. 3988 */ 3989 void netdev_rx_handler_unregister(struct net_device *dev) 3990 { 3991 3992 ASSERT_RTNL(); 3993 RCU_INIT_POINTER(dev->rx_handler, NULL); 3994 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 3995 * section has a guarantee to see a non NULL rx_handler_data 3996 * as well. 3997 */ 3998 synchronize_net(); 3999 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 4000 } 4001 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 4002 4003 /* 4004 * Limit the use of PFMEMALLOC reserves to those protocols that implement 4005 * the special handling of PFMEMALLOC skbs. 
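 * Such skbs were allocated from the emergency reserves, typically to
 * keep swap over NFS or NBD making progress under memory pressure.
 * Only the protocols needed for that forward progress (ARP, IPv4,
 * IPv6 and their 802.1Q/802.1ad VLAN encapsulations) may consume
 * them; __netif_receive_skb_core() drops everything else.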
4006 */ 4007 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 4008 { 4009 switch (skb->protocol) { 4010 case htons(ETH_P_ARP): 4011 case htons(ETH_P_IP): 4012 case htons(ETH_P_IPV6): 4013 case htons(ETH_P_8021Q): 4014 case htons(ETH_P_8021AD): 4015 return true; 4016 default: 4017 return false; 4018 } 4019 } 4020 4021 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, 4022 int *ret, struct net_device *orig_dev) 4023 { 4024 #ifdef CONFIG_NETFILTER_INGRESS 4025 if (nf_hook_ingress_active(skb)) { 4026 int ingress_retval; 4027 4028 if (*pt_prev) { 4029 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4030 *pt_prev = NULL; 4031 } 4032 4033 rcu_read_lock(); 4034 ingress_retval = nf_hook_ingress(skb); 4035 rcu_read_unlock(); 4036 return ingress_retval; 4037 } 4038 #endif /* CONFIG_NETFILTER_INGRESS */ 4039 return 0; 4040 } 4041 4042 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 4043 { 4044 struct packet_type *ptype, *pt_prev; 4045 rx_handler_func_t *rx_handler; 4046 struct net_device *orig_dev; 4047 bool deliver_exact = false; 4048 int ret = NET_RX_DROP; 4049 __be16 type; 4050 4051 net_timestamp_check(!netdev_tstamp_prequeue, skb); 4052 4053 trace_netif_receive_skb(skb); 4054 4055 orig_dev = skb->dev; 4056 4057 skb_reset_network_header(skb); 4058 if (!skb_transport_header_was_set(skb)) 4059 skb_reset_transport_header(skb); 4060 skb_reset_mac_len(skb); 4061 4062 pt_prev = NULL; 4063 4064 another_round: 4065 skb->skb_iif = skb->dev->ifindex; 4066 4067 __this_cpu_inc(softnet_data.processed); 4068 4069 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 4070 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 4071 skb = skb_vlan_untag(skb); 4072 if (unlikely(!skb)) 4073 goto out; 4074 } 4075 4076 if (skb_skip_tc_classify(skb)) 4077 goto skip_classify; 4078 4079 if (pfmemalloc) 4080 goto skip_taps; 4081 4082 list_for_each_entry_rcu(ptype, &ptype_all, list) { 4083 if (pt_prev) 4084 ret = deliver_skb(skb, pt_prev, orig_dev); 4085 pt_prev = ptype; 4086 } 4087 4088 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 4089 if (pt_prev) 4090 ret = deliver_skb(skb, pt_prev, orig_dev); 4091 pt_prev = ptype; 4092 } 4093 4094 skip_taps: 4095 #ifdef CONFIG_NET_INGRESS 4096 if (static_key_false(&ingress_needed)) { 4097 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 4098 if (!skb) 4099 goto out; 4100 4101 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) 4102 goto out; 4103 } 4104 #endif 4105 skb_reset_tc(skb); 4106 skip_classify: 4107 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4108 goto drop; 4109 4110 if (skb_vlan_tag_present(skb)) { 4111 if (pt_prev) { 4112 ret = deliver_skb(skb, pt_prev, orig_dev); 4113 pt_prev = NULL; 4114 } 4115 if (vlan_do_receive(&skb)) 4116 goto another_round; 4117 else if (unlikely(!skb)) 4118 goto out; 4119 } 4120 4121 rx_handler = rcu_dereference(skb->dev->rx_handler); 4122 if (rx_handler) { 4123 if (pt_prev) { 4124 ret = deliver_skb(skb, pt_prev, orig_dev); 4125 pt_prev = NULL; 4126 } 4127 switch (rx_handler(&skb)) { 4128 case RX_HANDLER_CONSUMED: 4129 ret = NET_RX_SUCCESS; 4130 goto out; 4131 case RX_HANDLER_ANOTHER: 4132 goto another_round; 4133 case RX_HANDLER_EXACT: 4134 deliver_exact = true; 4135 case RX_HANDLER_PASS: 4136 break; 4137 default: 4138 BUG(); 4139 } 4140 } 4141 4142 if (unlikely(skb_vlan_tag_present(skb))) { 4143 if (skb_vlan_tag_get_id(skb)) 4144 skb->pkt_type = PACKET_OTHERHOST; 4145 /* Note: we might in the future use prio bits 4146 * and set skb->priority like in vlan_do_receive() 4147 * 
For the time being, just ignore Priority Code Point 4148 */ 4149 skb->vlan_tci = 0; 4150 } 4151 4152 type = skb->protocol; 4153 4154 /* deliver only exact match when indicated */ 4155 if (likely(!deliver_exact)) { 4156 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4157 &ptype_base[ntohs(type) & 4158 PTYPE_HASH_MASK]); 4159 } 4160 4161 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4162 &orig_dev->ptype_specific); 4163 4164 if (unlikely(skb->dev != orig_dev)) { 4165 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4166 &skb->dev->ptype_specific); 4167 } 4168 4169 if (pt_prev) { 4170 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 4171 goto drop; 4172 else 4173 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 4174 } else { 4175 drop: 4176 if (!deliver_exact) 4177 atomic_long_inc(&skb->dev->rx_dropped); 4178 else 4179 atomic_long_inc(&skb->dev->rx_nohandler); 4180 kfree_skb(skb); 4181 /* Jamal, now you will not able to escape explaining 4182 * me how you were going to use this. :-) 4183 */ 4184 ret = NET_RX_DROP; 4185 } 4186 4187 out: 4188 return ret; 4189 } 4190 4191 static int __netif_receive_skb(struct sk_buff *skb) 4192 { 4193 int ret; 4194 4195 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 4196 unsigned long pflags = current->flags; 4197 4198 /* 4199 * PFMEMALLOC skbs are special, they should 4200 * - be delivered to SOCK_MEMALLOC sockets only 4201 * - stay away from userspace 4202 * - have bounded memory usage 4203 * 4204 * Use PF_MEMALLOC as this saves us from propagating the allocation 4205 * context down to all allocation sites. 4206 */ 4207 current->flags |= PF_MEMALLOC; 4208 ret = __netif_receive_skb_core(skb, true); 4209 tsk_restore_flags(current, pflags, PF_MEMALLOC); 4210 } else 4211 ret = __netif_receive_skb_core(skb, false); 4212 4213 return ret; 4214 } 4215 4216 static int netif_receive_skb_internal(struct sk_buff *skb) 4217 { 4218 int ret; 4219 4220 net_timestamp_check(netdev_tstamp_prequeue, skb); 4221 4222 if (skb_defer_rx_timestamp(skb)) 4223 return NET_RX_SUCCESS; 4224 4225 rcu_read_lock(); 4226 4227 #ifdef CONFIG_RPS 4228 if (static_key_false(&rps_needed)) { 4229 struct rps_dev_flow voidflow, *rflow = &voidflow; 4230 int cpu = get_rps_cpu(skb->dev, skb, &rflow); 4231 4232 if (cpu >= 0) { 4233 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 4234 rcu_read_unlock(); 4235 return ret; 4236 } 4237 } 4238 #endif 4239 ret = __netif_receive_skb(skb); 4240 rcu_read_unlock(); 4241 return ret; 4242 } 4243 4244 /** 4245 * netif_receive_skb - process receive buffer from network 4246 * @skb: buffer to process 4247 * 4248 * netif_receive_skb() is the main receive data processing function. 4249 * It always succeeds. The buffer may be dropped during processing 4250 * for congestion control or by the protocol layers. 4251 * 4252 * This function may only be called from softirq context and interrupts 4253 * should be enabled. 
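 *
 *	As an illustration only (not taken from a real driver), a NAPI
 *	poll routine that does not use GRO might hand a frame up like:
 *
 *		skb->protocol = eth_type_trans(skb, netdev);
 *		netif_receive_skb(skb);
 *
 *	Drivers that want receive aggregation should prefer
 *	napi_gro_receive() instead.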
4254 * 4255 * Return values (usually ignored): 4256 * NET_RX_SUCCESS: no congestion 4257 * NET_RX_DROP: packet was dropped 4258 */ 4259 int netif_receive_skb(struct sk_buff *skb) 4260 { 4261 trace_netif_receive_skb_entry(skb); 4262 4263 return netif_receive_skb_internal(skb); 4264 } 4265 EXPORT_SYMBOL(netif_receive_skb); 4266 4267 DEFINE_PER_CPU(struct work_struct, flush_works); 4268 4269 /* Network device is going away, flush any packets still pending */ 4270 static void flush_backlog(struct work_struct *work) 4271 { 4272 struct sk_buff *skb, *tmp; 4273 struct softnet_data *sd; 4274 4275 local_bh_disable(); 4276 sd = this_cpu_ptr(&softnet_data); 4277 4278 local_irq_disable(); 4279 rps_lock(sd); 4280 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4281 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4282 __skb_unlink(skb, &sd->input_pkt_queue); 4283 kfree_skb(skb); 4284 input_queue_head_incr(sd); 4285 } 4286 } 4287 rps_unlock(sd); 4288 local_irq_enable(); 4289 4290 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4291 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4292 __skb_unlink(skb, &sd->process_queue); 4293 kfree_skb(skb); 4294 input_queue_head_incr(sd); 4295 } 4296 } 4297 local_bh_enable(); 4298 } 4299 4300 static void flush_all_backlogs(void) 4301 { 4302 unsigned int cpu; 4303 4304 get_online_cpus(); 4305 4306 for_each_online_cpu(cpu) 4307 queue_work_on(cpu, system_highpri_wq, 4308 per_cpu_ptr(&flush_works, cpu)); 4309 4310 for_each_online_cpu(cpu) 4311 flush_work(per_cpu_ptr(&flush_works, cpu)); 4312 4313 put_online_cpus(); 4314 } 4315 4316 static int napi_gro_complete(struct sk_buff *skb) 4317 { 4318 struct packet_offload *ptype; 4319 __be16 type = skb->protocol; 4320 struct list_head *head = &offload_base; 4321 int err = -ENOENT; 4322 4323 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 4324 4325 if (NAPI_GRO_CB(skb)->count == 1) { 4326 skb_shinfo(skb)->gso_size = 0; 4327 goto out; 4328 } 4329 4330 rcu_read_lock(); 4331 list_for_each_entry_rcu(ptype, head, list) { 4332 if (ptype->type != type || !ptype->callbacks.gro_complete) 4333 continue; 4334 4335 err = ptype->callbacks.gro_complete(skb, 0); 4336 break; 4337 } 4338 rcu_read_unlock(); 4339 4340 if (err) { 4341 WARN_ON(&ptype->list == head); 4342 kfree_skb(skb); 4343 return NET_RX_SUCCESS; 4344 } 4345 4346 out: 4347 return netif_receive_skb_internal(skb); 4348 } 4349 4350 /* napi->gro_list contains packets ordered by age. 4351 * youngest packets at the head of it. 4352 * Complete skbs in reverse order to reduce latencies. 
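 * When @flush_old is true, skbs whose ->age is still the current jiffy
 * are left on the list so that freshly started flows can keep merging.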
4353 */ 4354 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 4355 { 4356 struct sk_buff *skb, *prev = NULL; 4357 4358 /* scan list and build reverse chain */ 4359 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 4360 skb->prev = prev; 4361 prev = skb; 4362 } 4363 4364 for (skb = prev; skb; skb = prev) { 4365 skb->next = NULL; 4366 4367 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 4368 return; 4369 4370 prev = skb->prev; 4371 napi_gro_complete(skb); 4372 napi->gro_count--; 4373 } 4374 4375 napi->gro_list = NULL; 4376 } 4377 EXPORT_SYMBOL(napi_gro_flush); 4378 4379 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 4380 { 4381 struct sk_buff *p; 4382 unsigned int maclen = skb->dev->hard_header_len; 4383 u32 hash = skb_get_hash_raw(skb); 4384 4385 for (p = napi->gro_list; p; p = p->next) { 4386 unsigned long diffs; 4387 4388 NAPI_GRO_CB(p)->flush = 0; 4389 4390 if (hash != skb_get_hash_raw(p)) { 4391 NAPI_GRO_CB(p)->same_flow = 0; 4392 continue; 4393 } 4394 4395 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 4396 diffs |= p->vlan_tci ^ skb->vlan_tci; 4397 diffs |= skb_metadata_dst_cmp(p, skb); 4398 if (maclen == ETH_HLEN) 4399 diffs |= compare_ether_header(skb_mac_header(p), 4400 skb_mac_header(skb)); 4401 else if (!diffs) 4402 diffs = memcmp(skb_mac_header(p), 4403 skb_mac_header(skb), 4404 maclen); 4405 NAPI_GRO_CB(p)->same_flow = !diffs; 4406 } 4407 } 4408 4409 static void skb_gro_reset_offset(struct sk_buff *skb) 4410 { 4411 const struct skb_shared_info *pinfo = skb_shinfo(skb); 4412 const skb_frag_t *frag0 = &pinfo->frags[0]; 4413 4414 NAPI_GRO_CB(skb)->data_offset = 0; 4415 NAPI_GRO_CB(skb)->frag0 = NULL; 4416 NAPI_GRO_CB(skb)->frag0_len = 0; 4417 4418 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 4419 pinfo->nr_frags && 4420 !PageHighMem(skb_frag_page(frag0))) { 4421 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4422 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, 4423 skb_frag_size(frag0), 4424 skb->end - skb->tail); 4425 } 4426 } 4427 4428 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 4429 { 4430 struct skb_shared_info *pinfo = skb_shinfo(skb); 4431 4432 BUG_ON(skb->end - skb->tail < grow); 4433 4434 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4435 4436 skb->data_len -= grow; 4437 skb->tail += grow; 4438 4439 pinfo->frags[0].page_offset += grow; 4440 skb_frag_size_sub(&pinfo->frags[0], grow); 4441 4442 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4443 skb_frag_unref(skb, 0); 4444 memmove(pinfo->frags, pinfo->frags + 1, 4445 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4446 } 4447 } 4448 4449 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4450 { 4451 struct sk_buff **pp = NULL; 4452 struct packet_offload *ptype; 4453 __be16 type = skb->protocol; 4454 struct list_head *head = &offload_base; 4455 int same_flow; 4456 enum gro_result ret; 4457 int grow; 4458 4459 if (!(skb->dev->features & NETIF_F_GRO)) 4460 goto normal; 4461 4462 if (skb->csum_bad) 4463 goto normal; 4464 4465 gro_list_prepare(napi, skb); 4466 4467 rcu_read_lock(); 4468 list_for_each_entry_rcu(ptype, head, list) { 4469 if (ptype->type != type || !ptype->callbacks.gro_receive) 4470 continue; 4471 4472 skb_set_network_header(skb, skb_gro_offset(skb)); 4473 skb_reset_mac_len(skb); 4474 NAPI_GRO_CB(skb)->same_flow = 0; 4475 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); 4476 NAPI_GRO_CB(skb)->free = 0; 4477 NAPI_GRO_CB(skb)->encap_mark = 0; 4478 
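		/* Depth guard for nested gro_receive callbacks
		 * (tunnel inside tunnel); reset for every new skb.
		 */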
NAPI_GRO_CB(skb)->recursion_counter = 0; 4479 NAPI_GRO_CB(skb)->is_fou = 0; 4480 NAPI_GRO_CB(skb)->is_atomic = 1; 4481 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4482 4483 /* Setup for GRO checksum validation */ 4484 switch (skb->ip_summed) { 4485 case CHECKSUM_COMPLETE: 4486 NAPI_GRO_CB(skb)->csum = skb->csum; 4487 NAPI_GRO_CB(skb)->csum_valid = 1; 4488 NAPI_GRO_CB(skb)->csum_cnt = 0; 4489 break; 4490 case CHECKSUM_UNNECESSARY: 4491 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4492 NAPI_GRO_CB(skb)->csum_valid = 0; 4493 break; 4494 default: 4495 NAPI_GRO_CB(skb)->csum_cnt = 0; 4496 NAPI_GRO_CB(skb)->csum_valid = 0; 4497 } 4498 4499 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4500 break; 4501 } 4502 rcu_read_unlock(); 4503 4504 if (&ptype->list == head) 4505 goto normal; 4506 4507 same_flow = NAPI_GRO_CB(skb)->same_flow; 4508 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 4509 4510 if (pp) { 4511 struct sk_buff *nskb = *pp; 4512 4513 *pp = nskb->next; 4514 nskb->next = NULL; 4515 napi_gro_complete(nskb); 4516 napi->gro_count--; 4517 } 4518 4519 if (same_flow) 4520 goto ok; 4521 4522 if (NAPI_GRO_CB(skb)->flush) 4523 goto normal; 4524 4525 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4526 struct sk_buff *nskb = napi->gro_list; 4527 4528 /* locate the end of the list to select the 'oldest' flow */ 4529 while (nskb->next) { 4530 pp = &nskb->next; 4531 nskb = *pp; 4532 } 4533 *pp = NULL; 4534 nskb->next = NULL; 4535 napi_gro_complete(nskb); 4536 } else { 4537 napi->gro_count++; 4538 } 4539 NAPI_GRO_CB(skb)->count = 1; 4540 NAPI_GRO_CB(skb)->age = jiffies; 4541 NAPI_GRO_CB(skb)->last = skb; 4542 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4543 skb->next = napi->gro_list; 4544 napi->gro_list = skb; 4545 ret = GRO_HELD; 4546 4547 pull: 4548 grow = skb_gro_offset(skb) - skb_headlen(skb); 4549 if (grow > 0) 4550 gro_pull_from_frag0(skb, grow); 4551 ok: 4552 return ret; 4553 4554 normal: 4555 ret = GRO_NORMAL; 4556 goto pull; 4557 } 4558 4559 struct packet_offload *gro_find_receive_by_type(__be16 type) 4560 { 4561 struct list_head *offload_head = &offload_base; 4562 struct packet_offload *ptype; 4563 4564 list_for_each_entry_rcu(ptype, offload_head, list) { 4565 if (ptype->type != type || !ptype->callbacks.gro_receive) 4566 continue; 4567 return ptype; 4568 } 4569 return NULL; 4570 } 4571 EXPORT_SYMBOL(gro_find_receive_by_type); 4572 4573 struct packet_offload *gro_find_complete_by_type(__be16 type) 4574 { 4575 struct list_head *offload_head = &offload_base; 4576 struct packet_offload *ptype; 4577 4578 list_for_each_entry_rcu(ptype, offload_head, list) { 4579 if (ptype->type != type || !ptype->callbacks.gro_complete) 4580 continue; 4581 return ptype; 4582 } 4583 return NULL; 4584 } 4585 EXPORT_SYMBOL(gro_find_complete_by_type); 4586 4587 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4588 { 4589 switch (ret) { 4590 case GRO_NORMAL: 4591 if (netif_receive_skb_internal(skb)) 4592 ret = GRO_DROP; 4593 break; 4594 4595 case GRO_DROP: 4596 kfree_skb(skb); 4597 break; 4598 4599 case GRO_MERGED_FREE: 4600 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { 4601 skb_dst_drop(skb); 4602 secpath_reset(skb); 4603 kmem_cache_free(skbuff_head_cache, skb); 4604 } else { 4605 __kfree_skb(skb); 4606 } 4607 break; 4608 4609 case GRO_HELD: 4610 case GRO_MERGED: 4611 break; 4612 } 4613 4614 return ret; 4615 } 4616 4617 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4618 { 4619 skb_mark_napi_id(skb, napi); 4620 
trace_napi_gro_receive_entry(skb); 4621 4622 skb_gro_reset_offset(skb); 4623 4624 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4625 } 4626 EXPORT_SYMBOL(napi_gro_receive); 4627 4628 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4629 { 4630 if (unlikely(skb->pfmemalloc)) { 4631 consume_skb(skb); 4632 return; 4633 } 4634 __skb_pull(skb, skb_headlen(skb)); 4635 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4636 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4637 skb->vlan_tci = 0; 4638 skb->dev = napi->dev; 4639 skb->skb_iif = 0; 4640 skb->encapsulation = 0; 4641 skb_shinfo(skb)->gso_type = 0; 4642 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4643 secpath_reset(skb); 4644 4645 napi->skb = skb; 4646 } 4647 4648 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4649 { 4650 struct sk_buff *skb = napi->skb; 4651 4652 if (!skb) { 4653 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4654 if (skb) { 4655 napi->skb = skb; 4656 skb_mark_napi_id(skb, napi); 4657 } 4658 } 4659 return skb; 4660 } 4661 EXPORT_SYMBOL(napi_get_frags); 4662 4663 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4664 struct sk_buff *skb, 4665 gro_result_t ret) 4666 { 4667 switch (ret) { 4668 case GRO_NORMAL: 4669 case GRO_HELD: 4670 __skb_push(skb, ETH_HLEN); 4671 skb->protocol = eth_type_trans(skb, skb->dev); 4672 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4673 ret = GRO_DROP; 4674 break; 4675 4676 case GRO_DROP: 4677 case GRO_MERGED_FREE: 4678 napi_reuse_skb(napi, skb); 4679 break; 4680 4681 case GRO_MERGED: 4682 break; 4683 } 4684 4685 return ret; 4686 } 4687 4688 /* Upper GRO stack assumes network header starts at gro_offset=0 4689 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4690 * We copy ethernet header into skb->data to have a common layout. 4691 */ 4692 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4693 { 4694 struct sk_buff *skb = napi->skb; 4695 const struct ethhdr *eth; 4696 unsigned int hlen = sizeof(*eth); 4697 4698 napi->skb = NULL; 4699 4700 skb_reset_mac_header(skb); 4701 skb_gro_reset_offset(skb); 4702 4703 eth = skb_gro_header_fast(skb, 0); 4704 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4705 eth = skb_gro_header_slow(skb, hlen, 0); 4706 if (unlikely(!eth)) { 4707 net_warn_ratelimited("%s: dropping impossible skb from %s\n", 4708 __func__, napi->dev->name); 4709 napi_reuse_skb(napi, skb); 4710 return NULL; 4711 } 4712 } else { 4713 gro_pull_from_frag0(skb, hlen); 4714 NAPI_GRO_CB(skb)->frag0 += hlen; 4715 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4716 } 4717 __skb_pull(skb, hlen); 4718 4719 /* 4720 * This works because the only protocols we care about don't require 4721 * special handling. 4722 * We'll fix it up properly in napi_frags_finish() 4723 */ 4724 skb->protocol = eth->h_proto; 4725 4726 return skb; 4727 } 4728 4729 gro_result_t napi_gro_frags(struct napi_struct *napi) 4730 { 4731 struct sk_buff *skb = napi_frags_skb(napi); 4732 4733 if (!skb) 4734 return GRO_DROP; 4735 4736 trace_napi_gro_frags_entry(skb); 4737 4738 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4739 } 4740 EXPORT_SYMBOL(napi_gro_frags); 4741 4742 /* Compute the checksum from gro_offset and return the folded value 4743 * after adding in any pseudo checksum. 
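 * The computed wsum is also cached in NAPI_GRO_CB(skb)->csum (with
 * csum_valid set) so that inner layers can reuse it rather than walking
 * the data again.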
4744 */ 4745 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4746 { 4747 __wsum wsum; 4748 __sum16 sum; 4749 4750 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4751 4752 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4753 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4754 if (likely(!sum)) { 4755 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4756 !skb->csum_complete_sw) 4757 netdev_rx_csum_fault(skb->dev); 4758 } 4759 4760 NAPI_GRO_CB(skb)->csum = wsum; 4761 NAPI_GRO_CB(skb)->csum_valid = 1; 4762 4763 return sum; 4764 } 4765 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4766 4767 /* 4768 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4769 * Note: called with local irq disabled, but exits with local irq enabled. 4770 */ 4771 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4772 { 4773 #ifdef CONFIG_RPS 4774 struct softnet_data *remsd = sd->rps_ipi_list; 4775 4776 if (remsd) { 4777 sd->rps_ipi_list = NULL; 4778 4779 local_irq_enable(); 4780 4781 /* Send pending IPI's to kick RPS processing on remote cpus. */ 4782 while (remsd) { 4783 struct softnet_data *next = remsd->rps_ipi_next; 4784 4785 if (cpu_online(remsd->cpu)) 4786 smp_call_function_single_async(remsd->cpu, 4787 &remsd->csd); 4788 remsd = next; 4789 } 4790 } else 4791 #endif 4792 local_irq_enable(); 4793 } 4794 4795 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4796 { 4797 #ifdef CONFIG_RPS 4798 return sd->rps_ipi_list != NULL; 4799 #else 4800 return false; 4801 #endif 4802 } 4803 4804 static int process_backlog(struct napi_struct *napi, int quota) 4805 { 4806 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4807 bool again = true; 4808 int work = 0; 4809 4810 /* Check if we have pending ipi, its better to send them now, 4811 * not waiting net_rx_action() end. 4812 */ 4813 if (sd_has_rps_ipi_waiting(sd)) { 4814 local_irq_disable(); 4815 net_rps_action_and_irq_enable(sd); 4816 } 4817 4818 napi->weight = dev_rx_weight; 4819 while (again) { 4820 struct sk_buff *skb; 4821 4822 while ((skb = __skb_dequeue(&sd->process_queue))) { 4823 rcu_read_lock(); 4824 __netif_receive_skb(skb); 4825 rcu_read_unlock(); 4826 input_queue_head_incr(sd); 4827 if (++work >= quota) 4828 return work; 4829 4830 } 4831 4832 local_irq_disable(); 4833 rps_lock(sd); 4834 if (skb_queue_empty(&sd->input_pkt_queue)) { 4835 /* 4836 * Inline a custom version of __napi_complete(). 4837 * only current cpu owns and manipulates this napi, 4838 * and NAPI_STATE_SCHED is the only possible flag set 4839 * on backlog. 4840 * We can use a plain write instead of clear_bit(), 4841 * and we dont need an smp_mb() memory barrier. 4842 */ 4843 napi->state = 0; 4844 again = false; 4845 } else { 4846 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4847 &sd->process_queue); 4848 } 4849 rps_unlock(sd); 4850 local_irq_enable(); 4851 } 4852 4853 return work; 4854 } 4855 4856 /** 4857 * __napi_schedule - schedule for receive 4858 * @n: entry to schedule 4859 * 4860 * The entry's receive function will be scheduled to run. 4861 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
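 *
 * Illustrative interrupt-handler pattern (a sketch only; "priv" and the
 * irq-masking helper are hypothetical driver pieces):
 *
 *	if (napi_schedule_prep(&priv->napi)) {
 *		mydrv_mask_rx_irq(priv);
 *		__napi_schedule(&priv->napi);
 *	}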
4862 */ 4863 void __napi_schedule(struct napi_struct *n) 4864 { 4865 unsigned long flags; 4866 4867 local_irq_save(flags); 4868 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4869 local_irq_restore(flags); 4870 } 4871 EXPORT_SYMBOL(__napi_schedule); 4872 4873 /** 4874 * __napi_schedule_irqoff - schedule for receive 4875 * @n: entry to schedule 4876 * 4877 * Variant of __napi_schedule() assuming hard irqs are masked 4878 */ 4879 void __napi_schedule_irqoff(struct napi_struct *n) 4880 { 4881 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4882 } 4883 EXPORT_SYMBOL(__napi_schedule_irqoff); 4884 4885 bool __napi_complete(struct napi_struct *n) 4886 { 4887 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4888 4889 /* Some drivers call us directly, instead of calling 4890 * napi_complete_done(). 4891 */ 4892 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) 4893 return false; 4894 4895 list_del_init(&n->poll_list); 4896 smp_mb__before_atomic(); 4897 clear_bit(NAPI_STATE_SCHED, &n->state); 4898 return true; 4899 } 4900 EXPORT_SYMBOL(__napi_complete); 4901 4902 bool napi_complete_done(struct napi_struct *n, int work_done) 4903 { 4904 unsigned long flags; 4905 4906 /* 4907 * 1) Don't let napi dequeue from the cpu poll list 4908 * just in case its running on a different cpu. 4909 * 2) If we are busy polling, do nothing here, we have 4910 * the guarantee we will be called later. 4911 */ 4912 if (unlikely(n->state & (NAPIF_STATE_NPSVC | 4913 NAPIF_STATE_IN_BUSY_POLL))) 4914 return false; 4915 4916 if (n->gro_list) { 4917 unsigned long timeout = 0; 4918 4919 if (work_done) 4920 timeout = n->dev->gro_flush_timeout; 4921 4922 if (timeout) 4923 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4924 HRTIMER_MODE_REL_PINNED); 4925 else 4926 napi_gro_flush(n, false); 4927 } 4928 if (likely(list_empty(&n->poll_list))) { 4929 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4930 } else { 4931 /* If n->poll_list is not empty, we need to mask irqs */ 4932 local_irq_save(flags); 4933 __napi_complete(n); 4934 local_irq_restore(flags); 4935 } 4936 return true; 4937 } 4938 EXPORT_SYMBOL(napi_complete_done); 4939 4940 /* must be called under rcu_read_lock(), as we dont take a reference */ 4941 static struct napi_struct *napi_by_id(unsigned int napi_id) 4942 { 4943 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4944 struct napi_struct *napi; 4945 4946 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4947 if (napi->napi_id == napi_id) 4948 return napi; 4949 4950 return NULL; 4951 } 4952 4953 #if defined(CONFIG_NET_RX_BUSY_POLL) 4954 4955 #define BUSY_POLL_BUDGET 8 4956 4957 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) 4958 { 4959 int rc; 4960 4961 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); 4962 4963 local_bh_disable(); 4964 4965 /* All we really want here is to re-enable device interrupts. 4966 * Ideally, a new ndo_busy_poll_stop() could avoid another round. 4967 */ 4968 rc = napi->poll(napi, BUSY_POLL_BUDGET); 4969 netpoll_poll_unlock(have_poll_lock); 4970 if (rc == BUSY_POLL_BUDGET) 4971 __napi_schedule(napi); 4972 local_bh_enable(); 4973 if (local_softirq_pending()) 4974 do_softirq(); 4975 } 4976 4977 bool sk_busy_loop(struct sock *sk, int nonblock) 4978 { 4979 unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; 4980 int (*napi_poll)(struct napi_struct *napi, int budget); 4981 int (*busy_poll)(struct napi_struct *dev); 4982 void *have_poll_lock = NULL; 4983 struct napi_struct *napi; 4984 int rc; 4985 4986 restart: 4987 rc = false; 4988 napi_poll = NULL; 4989 4990 rcu_read_lock(); 4991 4992 napi = napi_by_id(sk->sk_napi_id); 4993 if (!napi) 4994 goto out; 4995 4996 /* Note: ndo_busy_poll method is optional in linux-4.5 */ 4997 busy_poll = napi->dev->netdev_ops->ndo_busy_poll; 4998 4999 preempt_disable(); 5000 for (;;) { 5001 rc = 0; 5002 local_bh_disable(); 5003 if (busy_poll) { 5004 rc = busy_poll(napi); 5005 goto count; 5006 } 5007 if (!napi_poll) { 5008 unsigned long val = READ_ONCE(napi->state); 5009 5010 /* If multiple threads are competing for this napi, 5011 * we avoid dirtying napi->state as much as we can. 5012 */ 5013 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | 5014 NAPIF_STATE_IN_BUSY_POLL)) 5015 goto count; 5016 if (cmpxchg(&napi->state, val, 5017 val | NAPIF_STATE_IN_BUSY_POLL | 5018 NAPIF_STATE_SCHED) != val) 5019 goto count; 5020 have_poll_lock = netpoll_poll_lock(napi); 5021 napi_poll = napi->poll; 5022 } 5023 rc = napi_poll(napi, BUSY_POLL_BUDGET); 5024 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); 5025 count: 5026 if (rc > 0) 5027 __NET_ADD_STATS(sock_net(sk), 5028 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 5029 local_bh_enable(); 5030 5031 if (rc == LL_FLUSH_FAILED) 5032 break; /* permanent failure */ 5033 5034 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || 5035 busy_loop_timeout(end_time)) 5036 break; 5037 5038 if (unlikely(need_resched())) { 5039 if (napi_poll) 5040 busy_poll_stop(napi, have_poll_lock); 5041 preempt_enable(); 5042 rcu_read_unlock(); 5043 cond_resched(); 5044 rc = !skb_queue_empty(&sk->sk_receive_queue); 5045 if (rc || busy_loop_timeout(end_time)) 5046 return rc; 5047 goto restart; 5048 } 5049 cpu_relax(); 5050 } 5051 if (napi_poll) 5052 busy_poll_stop(napi, have_poll_lock); 5053 preempt_enable(); 5054 rc = !skb_queue_empty(&sk->sk_receive_queue); 5055 out: 5056 rcu_read_unlock(); 5057 return rc; 5058 } 5059 EXPORT_SYMBOL(sk_busy_loop); 5060 5061 #endif /* CONFIG_NET_RX_BUSY_POLL */ 5062 5063 static void napi_hash_add(struct napi_struct *napi) 5064 { 5065 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 5066 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 5067 return; 5068 5069 spin_lock(&napi_hash_lock); 5070 5071 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ 5072 do { 5073 if (unlikely(++napi_gen_id < NR_CPUS + 1)) 5074 napi_gen_id = NR_CPUS + 1; 5075 } while (napi_by_id(napi_gen_id)); 5076 napi->napi_id = napi_gen_id; 5077 5078 hlist_add_head_rcu(&napi->napi_hash_node, 5079 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 5080 5081 spin_unlock(&napi_hash_lock); 5082 } 5083 5084 /* Warning : caller is responsible to make sure rcu grace period 5085 * is respected before freeing memory containing @napi 5086 */ 5087 bool napi_hash_del(struct napi_struct *napi) 5088 { 5089 bool rcu_sync_needed = false; 5090 5091 spin_lock(&napi_hash_lock); 5092 5093 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { 5094 rcu_sync_needed = true; 5095 hlist_del_rcu(&napi->napi_hash_node); 5096 } 5097 spin_unlock(&napi_hash_lock); 5098 return rcu_sync_needed; 5099 } 5100 EXPORT_SYMBOL_GPL(napi_hash_del); 5101 5102 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 5103 { 5104 struct napi_struct *napi; 5105 5106 napi = container_of(timer, struct napi_struct, timer); 5107 if (napi->gro_list) 5108 
napi_schedule(napi); 5109 5110 return HRTIMER_NORESTART; 5111 } 5112 5113 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 5114 int (*poll)(struct napi_struct *, int), int weight) 5115 { 5116 INIT_LIST_HEAD(&napi->poll_list); 5117 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 5118 napi->timer.function = napi_watchdog; 5119 napi->gro_count = 0; 5120 napi->gro_list = NULL; 5121 napi->skb = NULL; 5122 napi->poll = poll; 5123 if (weight > NAPI_POLL_WEIGHT) 5124 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 5125 weight, dev->name); 5126 napi->weight = weight; 5127 list_add(&napi->dev_list, &dev->napi_list); 5128 napi->dev = dev; 5129 #ifdef CONFIG_NETPOLL 5130 napi->poll_owner = -1; 5131 #endif 5132 set_bit(NAPI_STATE_SCHED, &napi->state); 5133 napi_hash_add(napi); 5134 } 5135 EXPORT_SYMBOL(netif_napi_add); 5136 5137 void napi_disable(struct napi_struct *n) 5138 { 5139 might_sleep(); 5140 set_bit(NAPI_STATE_DISABLE, &n->state); 5141 5142 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 5143 msleep(1); 5144 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) 5145 msleep(1); 5146 5147 hrtimer_cancel(&n->timer); 5148 5149 clear_bit(NAPI_STATE_DISABLE, &n->state); 5150 } 5151 EXPORT_SYMBOL(napi_disable); 5152 5153 /* Must be called in process context */ 5154 void netif_napi_del(struct napi_struct *napi) 5155 { 5156 might_sleep(); 5157 if (napi_hash_del(napi)) 5158 synchronize_net(); 5159 list_del_init(&napi->dev_list); 5160 napi_free_frags(napi); 5161 5162 kfree_skb_list(napi->gro_list); 5163 napi->gro_list = NULL; 5164 napi->gro_count = 0; 5165 } 5166 EXPORT_SYMBOL(netif_napi_del); 5167 5168 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 5169 { 5170 void *have; 5171 int work, weight; 5172 5173 list_del_init(&n->poll_list); 5174 5175 have = netpoll_poll_lock(n); 5176 5177 weight = n->weight; 5178 5179 /* This NAPI_STATE_SCHED test is for avoiding a race 5180 * with netpoll's poll_napi(). Only the entity which 5181 * obtains the lock and sees NAPI_STATE_SCHED set will 5182 * actually make the ->poll() call. Therefore we avoid 5183 * accidentally calling ->poll() when NAPI is not scheduled. 5184 */ 5185 work = 0; 5186 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 5187 work = n->poll(n, weight); 5188 trace_napi_poll(n, work, weight); 5189 } 5190 5191 WARN_ON_ONCE(work > weight); 5192 5193 if (likely(work < weight)) 5194 goto out_unlock; 5195 5196 /* Drivers must not modify the NAPI state if they 5197 * consume the entire weight. In such cases this code 5198 * still "owns" the NAPI instance and therefore can 5199 * move the instance around on the list at-will. 5200 */ 5201 if (unlikely(napi_disable_pending(n))) { 5202 napi_complete(n); 5203 goto out_unlock; 5204 } 5205 5206 if (n->gro_list) { 5207 /* flush too old packets 5208 * If HZ < 1000, flush all packets. 5209 */ 5210 napi_gro_flush(n, HZ >= 1000); 5211 } 5212 5213 /* Some drivers may have called napi_schedule 5214 * prior to exhausting their budget. 5215 */ 5216 if (unlikely(!list_empty(&n->poll_list))) { 5217 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 5218 n->dev ? 
n->dev->name : "backlog"); 5219 goto out_unlock; 5220 } 5221 5222 list_add_tail(&n->poll_list, repoll); 5223 5224 out_unlock: 5225 netpoll_poll_unlock(have); 5226 5227 return work; 5228 } 5229 5230 static __latent_entropy void net_rx_action(struct softirq_action *h) 5231 { 5232 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5233 unsigned long time_limit = jiffies + 2; 5234 int budget = netdev_budget; 5235 LIST_HEAD(list); 5236 LIST_HEAD(repoll); 5237 5238 local_irq_disable(); 5239 list_splice_init(&sd->poll_list, &list); 5240 local_irq_enable(); 5241 5242 for (;;) { 5243 struct napi_struct *n; 5244 5245 if (list_empty(&list)) { 5246 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5247 goto out; 5248 break; 5249 } 5250 5251 n = list_first_entry(&list, struct napi_struct, poll_list); 5252 budget -= napi_poll(n, &repoll); 5253 5254 /* If softirq window is exhausted then punt. 5255 * Allow this to run for 2 jiffies since which will allow 5256 * an average latency of 1.5/HZ. 5257 */ 5258 if (unlikely(budget <= 0 || 5259 time_after_eq(jiffies, time_limit))) { 5260 sd->time_squeeze++; 5261 break; 5262 } 5263 } 5264 5265 local_irq_disable(); 5266 5267 list_splice_tail_init(&sd->poll_list, &list); 5268 list_splice_tail(&repoll, &list); 5269 list_splice(&list, &sd->poll_list); 5270 if (!list_empty(&sd->poll_list)) 5271 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5272 5273 net_rps_action_and_irq_enable(sd); 5274 out: 5275 __kfree_skb_flush(); 5276 } 5277 5278 struct netdev_adjacent { 5279 struct net_device *dev; 5280 5281 /* upper master flag, there can only be one master device per list */ 5282 bool master; 5283 5284 /* counter for the number of times this device was added to us */ 5285 u16 ref_nr; 5286 5287 /* private field for the users */ 5288 void *private; 5289 5290 struct list_head list; 5291 struct rcu_head rcu; 5292 }; 5293 5294 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, 5295 struct list_head *adj_list) 5296 { 5297 struct netdev_adjacent *adj; 5298 5299 list_for_each_entry(adj, adj_list, list) { 5300 if (adj->dev == adj_dev) 5301 return adj; 5302 } 5303 return NULL; 5304 } 5305 5306 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) 5307 { 5308 struct net_device *dev = data; 5309 5310 return upper_dev == dev; 5311 } 5312 5313 /** 5314 * netdev_has_upper_dev - Check if device is linked to an upper device 5315 * @dev: device 5316 * @upper_dev: upper device to check 5317 * 5318 * Find out if a device is linked to specified upper device and return true 5319 * in case it is. Note that this checks only immediate upper device, 5320 * not through a complete stack of devices. The caller must hold the RTNL lock. 5321 */ 5322 bool netdev_has_upper_dev(struct net_device *dev, 5323 struct net_device *upper_dev) 5324 { 5325 ASSERT_RTNL(); 5326 5327 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, 5328 upper_dev); 5329 } 5330 EXPORT_SYMBOL(netdev_has_upper_dev); 5331 5332 /** 5333 * netdev_has_upper_dev_all - Check if device is linked to an upper device 5334 * @dev: device 5335 * @upper_dev: upper device to check 5336 * 5337 * Find out if a device is linked to specified upper device and return true 5338 * in case it is. Note that this checks the entire upper device chain. 5339 * The caller must hold rcu lock. 
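 * Unlike netdev_has_upper_dev(), the whole chain is walked, using
 * netdev_walk_all_upper_dev_rcu() underneath.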
5340 */ 5341 5342 bool netdev_has_upper_dev_all_rcu(struct net_device *dev, 5343 struct net_device *upper_dev) 5344 { 5345 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, 5346 upper_dev); 5347 } 5348 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); 5349 5350 /** 5351 * netdev_has_any_upper_dev - Check if device is linked to some device 5352 * @dev: device 5353 * 5354 * Find out if a device is linked to an upper device and return true in case 5355 * it is. The caller must hold the RTNL lock. 5356 */ 5357 static bool netdev_has_any_upper_dev(struct net_device *dev) 5358 { 5359 ASSERT_RTNL(); 5360 5361 return !list_empty(&dev->adj_list.upper); 5362 } 5363 5364 /** 5365 * netdev_master_upper_dev_get - Get master upper device 5366 * @dev: device 5367 * 5368 * Find a master upper device and return pointer to it or NULL in case 5369 * it's not there. The caller must hold the RTNL lock. 5370 */ 5371 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 5372 { 5373 struct netdev_adjacent *upper; 5374 5375 ASSERT_RTNL(); 5376 5377 if (list_empty(&dev->adj_list.upper)) 5378 return NULL; 5379 5380 upper = list_first_entry(&dev->adj_list.upper, 5381 struct netdev_adjacent, list); 5382 if (likely(upper->master)) 5383 return upper->dev; 5384 return NULL; 5385 } 5386 EXPORT_SYMBOL(netdev_master_upper_dev_get); 5387 5388 /** 5389 * netdev_has_any_lower_dev - Check if device is linked to some device 5390 * @dev: device 5391 * 5392 * Find out if a device is linked to a lower device and return true in case 5393 * it is. The caller must hold the RTNL lock. 5394 */ 5395 static bool netdev_has_any_lower_dev(struct net_device *dev) 5396 { 5397 ASSERT_RTNL(); 5398 5399 return !list_empty(&dev->adj_list.lower); 5400 } 5401 5402 void *netdev_adjacent_get_private(struct list_head *adj_list) 5403 { 5404 struct netdev_adjacent *adj; 5405 5406 adj = list_entry(adj_list, struct netdev_adjacent, list); 5407 5408 return adj->private; 5409 } 5410 EXPORT_SYMBOL(netdev_adjacent_get_private); 5411 5412 /** 5413 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 5414 * @dev: device 5415 * @iter: list_head ** of the current position 5416 * 5417 * Gets the next device from the dev's upper list, starting from iter 5418 * position. The caller must hold RCU read lock. 
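 * Callers normally go through the netdev_for_each_upper_dev_rcu()
 * iterator macro rather than using this directly.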
5419 */ 5420 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 5421 struct list_head **iter) 5422 { 5423 struct netdev_adjacent *upper; 5424 5425 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5426 5427 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5428 5429 if (&upper->list == &dev->adj_list.upper) 5430 return NULL; 5431 5432 *iter = &upper->list; 5433 5434 return upper->dev; 5435 } 5436 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5437 5438 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, 5439 struct list_head **iter) 5440 { 5441 struct netdev_adjacent *upper; 5442 5443 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5444 5445 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5446 5447 if (&upper->list == &dev->adj_list.upper) 5448 return NULL; 5449 5450 *iter = &upper->list; 5451 5452 return upper->dev; 5453 } 5454 5455 int netdev_walk_all_upper_dev_rcu(struct net_device *dev, 5456 int (*fn)(struct net_device *dev, 5457 void *data), 5458 void *data) 5459 { 5460 struct net_device *udev; 5461 struct list_head *iter; 5462 int ret; 5463 5464 for (iter = &dev->adj_list.upper, 5465 udev = netdev_next_upper_dev_rcu(dev, &iter); 5466 udev; 5467 udev = netdev_next_upper_dev_rcu(dev, &iter)) { 5468 /* first is the upper device itself */ 5469 ret = fn(udev, data); 5470 if (ret) 5471 return ret; 5472 5473 /* then look at all of its upper devices */ 5474 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); 5475 if (ret) 5476 return ret; 5477 } 5478 5479 return 0; 5480 } 5481 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); 5482 5483 /** 5484 * netdev_lower_get_next_private - Get the next ->private from the 5485 * lower neighbour list 5486 * @dev: device 5487 * @iter: list_head ** of the current position 5488 * 5489 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5490 * list, starting from iter position. The caller must hold either hold the 5491 * RTNL lock or its own locking that guarantees that the neighbour lower 5492 * list will remain unchanged. 5493 */ 5494 void *netdev_lower_get_next_private(struct net_device *dev, 5495 struct list_head **iter) 5496 { 5497 struct netdev_adjacent *lower; 5498 5499 lower = list_entry(*iter, struct netdev_adjacent, list); 5500 5501 if (&lower->list == &dev->adj_list.lower) 5502 return NULL; 5503 5504 *iter = lower->list.next; 5505 5506 return lower->private; 5507 } 5508 EXPORT_SYMBOL(netdev_lower_get_next_private); 5509 5510 /** 5511 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5512 * lower neighbour list, RCU 5513 * variant 5514 * @dev: device 5515 * @iter: list_head ** of the current position 5516 * 5517 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5518 * list, starting from iter position. The caller must hold RCU read lock. 
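 * Typically reached through the netdev_for_each_lower_private_rcu()
 * iterator macro rather than being called directly.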
5519 */ 5520 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5521 struct list_head **iter) 5522 { 5523 struct netdev_adjacent *lower; 5524 5525 WARN_ON_ONCE(!rcu_read_lock_held()); 5526 5527 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5528 5529 if (&lower->list == &dev->adj_list.lower) 5530 return NULL; 5531 5532 *iter = &lower->list; 5533 5534 return lower->private; 5535 } 5536 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5537 5538 /** 5539 * netdev_lower_get_next - Get the next device from the lower neighbour 5540 * list 5541 * @dev: device 5542 * @iter: list_head ** of the current position 5543 * 5544 * Gets the next netdev_adjacent from the dev's lower neighbour 5545 * list, starting from iter position. The caller must hold RTNL lock or 5546 * its own locking that guarantees that the neighbour lower 5547 * list will remain unchanged. 5548 */ 5549 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5550 { 5551 struct netdev_adjacent *lower; 5552 5553 lower = list_entry(*iter, struct netdev_adjacent, list); 5554 5555 if (&lower->list == &dev->adj_list.lower) 5556 return NULL; 5557 5558 *iter = lower->list.next; 5559 5560 return lower->dev; 5561 } 5562 EXPORT_SYMBOL(netdev_lower_get_next); 5563 5564 static struct net_device *netdev_next_lower_dev(struct net_device *dev, 5565 struct list_head **iter) 5566 { 5567 struct netdev_adjacent *lower; 5568 5569 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 5570 5571 if (&lower->list == &dev->adj_list.lower) 5572 return NULL; 5573 5574 *iter = &lower->list; 5575 5576 return lower->dev; 5577 } 5578 5579 int netdev_walk_all_lower_dev(struct net_device *dev, 5580 int (*fn)(struct net_device *dev, 5581 void *data), 5582 void *data) 5583 { 5584 struct net_device *ldev; 5585 struct list_head *iter; 5586 int ret; 5587 5588 for (iter = &dev->adj_list.lower, 5589 ldev = netdev_next_lower_dev(dev, &iter); 5590 ldev; 5591 ldev = netdev_next_lower_dev(dev, &iter)) { 5592 /* first is the lower device itself */ 5593 ret = fn(ldev, data); 5594 if (ret) 5595 return ret; 5596 5597 /* then look at all of its lower devices */ 5598 ret = netdev_walk_all_lower_dev(ldev, fn, data); 5599 if (ret) 5600 return ret; 5601 } 5602 5603 return 0; 5604 } 5605 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); 5606 5607 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, 5608 struct list_head **iter) 5609 { 5610 struct netdev_adjacent *lower; 5611 5612 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5613 if (&lower->list == &dev->adj_list.lower) 5614 return NULL; 5615 5616 *iter = &lower->list; 5617 5618 return lower->dev; 5619 } 5620 5621 int netdev_walk_all_lower_dev_rcu(struct net_device *dev, 5622 int (*fn)(struct net_device *dev, 5623 void *data), 5624 void *data) 5625 { 5626 struct net_device *ldev; 5627 struct list_head *iter; 5628 int ret; 5629 5630 for (iter = &dev->adj_list.lower, 5631 ldev = netdev_next_lower_dev_rcu(dev, &iter); 5632 ldev; 5633 ldev = netdev_next_lower_dev_rcu(dev, &iter)) { 5634 /* first is the lower device itself */ 5635 ret = fn(ldev, data); 5636 if (ret) 5637 return ret; 5638 5639 /* then look at all of its lower devices */ 5640 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data); 5641 if (ret) 5642 return ret; 5643 } 5644 5645 return 0; 5646 } 5647 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); 5648 5649 /** 5650 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5651 * lower neighbour list, RCU 
5652 * variant 5653 * @dev: device 5654 * 5655 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5656 * list. The caller must hold RCU read lock. 5657 */ 5658 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5659 { 5660 struct netdev_adjacent *lower; 5661 5662 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5663 struct netdev_adjacent, list); 5664 if (lower) 5665 return lower->private; 5666 return NULL; 5667 } 5668 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5669 5670 /** 5671 * netdev_master_upper_dev_get_rcu - Get master upper device 5672 * @dev: device 5673 * 5674 * Find a master upper device and return pointer to it or NULL in case 5675 * it's not there. The caller must hold the RCU read lock. 5676 */ 5677 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5678 { 5679 struct netdev_adjacent *upper; 5680 5681 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5682 struct netdev_adjacent, list); 5683 if (upper && likely(upper->master)) 5684 return upper->dev; 5685 return NULL; 5686 } 5687 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5688 5689 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5690 struct net_device *adj_dev, 5691 struct list_head *dev_list) 5692 { 5693 char linkname[IFNAMSIZ+7]; 5694 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5695 "upper_%s" : "lower_%s", adj_dev->name); 5696 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5697 linkname); 5698 } 5699 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5700 char *name, 5701 struct list_head *dev_list) 5702 { 5703 char linkname[IFNAMSIZ+7]; 5704 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5705 "upper_%s" : "lower_%s", name); 5706 sysfs_remove_link(&(dev->dev.kobj), linkname); 5707 } 5708 5709 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5710 struct net_device *adj_dev, 5711 struct list_head *dev_list) 5712 { 5713 return (dev_list == &dev->adj_list.upper || 5714 dev_list == &dev->adj_list.lower) && 5715 net_eq(dev_net(dev), dev_net(adj_dev)); 5716 } 5717 5718 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5719 struct net_device *adj_dev, 5720 struct list_head *dev_list, 5721 void *private, bool master) 5722 { 5723 struct netdev_adjacent *adj; 5724 int ret; 5725 5726 adj = __netdev_find_adj(adj_dev, dev_list); 5727 5728 if (adj) { 5729 adj->ref_nr += 1; 5730 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", 5731 dev->name, adj_dev->name, adj->ref_nr); 5732 5733 return 0; 5734 } 5735 5736 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5737 if (!adj) 5738 return -ENOMEM; 5739 5740 adj->dev = adj_dev; 5741 adj->master = master; 5742 adj->ref_nr = 1; 5743 adj->private = private; 5744 dev_hold(adj_dev); 5745 5746 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", 5747 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); 5748 5749 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5750 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5751 if (ret) 5752 goto free_adj; 5753 } 5754 5755 /* Ensure that master link is always the first item in list. 
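	 * That way netdev_master_upper_dev_get() only needs to look at
	 * the first entry.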
*/ 5756 if (master) { 5757 ret = sysfs_create_link(&(dev->dev.kobj), 5758 &(adj_dev->dev.kobj), "master"); 5759 if (ret) 5760 goto remove_symlinks; 5761 5762 list_add_rcu(&adj->list, dev_list); 5763 } else { 5764 list_add_tail_rcu(&adj->list, dev_list); 5765 } 5766 5767 return 0; 5768 5769 remove_symlinks: 5770 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5771 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5772 free_adj: 5773 kfree(adj); 5774 dev_put(adj_dev); 5775 5776 return ret; 5777 } 5778 5779 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5780 struct net_device *adj_dev, 5781 u16 ref_nr, 5782 struct list_head *dev_list) 5783 { 5784 struct netdev_adjacent *adj; 5785 5786 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", 5787 dev->name, adj_dev->name, ref_nr); 5788 5789 adj = __netdev_find_adj(adj_dev, dev_list); 5790 5791 if (!adj) { 5792 pr_err("Adjacency does not exist for device %s from %s\n", 5793 dev->name, adj_dev->name); 5794 WARN_ON(1); 5795 return; 5796 } 5797 5798 if (adj->ref_nr > ref_nr) { 5799 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", 5800 dev->name, adj_dev->name, ref_nr, 5801 adj->ref_nr - ref_nr); 5802 adj->ref_nr -= ref_nr; 5803 return; 5804 } 5805 5806 if (adj->master) 5807 sysfs_remove_link(&(dev->dev.kobj), "master"); 5808 5809 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5810 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5811 5812 list_del_rcu(&adj->list); 5813 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", 5814 adj_dev->name, dev->name, adj_dev->name); 5815 dev_put(adj_dev); 5816 kfree_rcu(adj, rcu); 5817 } 5818 5819 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5820 struct net_device *upper_dev, 5821 struct list_head *up_list, 5822 struct list_head *down_list, 5823 void *private, bool master) 5824 { 5825 int ret; 5826 5827 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, 5828 private, master); 5829 if (ret) 5830 return ret; 5831 5832 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, 5833 private, false); 5834 if (ret) { 5835 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); 5836 return ret; 5837 } 5838 5839 return 0; 5840 } 5841 5842 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5843 struct net_device *upper_dev, 5844 u16 ref_nr, 5845 struct list_head *up_list, 5846 struct list_head *down_list) 5847 { 5848 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 5849 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); 5850 } 5851 5852 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5853 struct net_device *upper_dev, 5854 void *private, bool master) 5855 { 5856 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5857 &dev->adj_list.upper, 5858 &upper_dev->adj_list.lower, 5859 private, master); 5860 } 5861 5862 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5863 struct net_device *upper_dev) 5864 { 5865 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, 5866 &dev->adj_list.upper, 5867 &upper_dev->adj_list.lower); 5868 } 5869 5870 static int __netdev_upper_dev_link(struct net_device *dev, 5871 struct net_device *upper_dev, bool master, 5872 void *upper_priv, void *upper_info) 5873 { 5874 struct netdev_notifier_changeupper_info changeupper_info; 5875 int ret = 0; 5876 5877 ASSERT_RTNL(); 5878 5879 if (dev == upper_dev) 5880 return -EBUSY; 5881 5882 /* To prevent loops, check if dev is not upper 
device to upper_dev. */ 5883 if (netdev_has_upper_dev(upper_dev, dev)) 5884 return -EBUSY; 5885 5886 if (netdev_has_upper_dev(dev, upper_dev)) 5887 return -EEXIST; 5888 5889 if (master && netdev_master_upper_dev_get(dev)) 5890 return -EBUSY; 5891 5892 changeupper_info.upper_dev = upper_dev; 5893 changeupper_info.master = master; 5894 changeupper_info.linking = true; 5895 changeupper_info.upper_info = upper_info; 5896 5897 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5898 &changeupper_info.info); 5899 ret = notifier_to_errno(ret); 5900 if (ret) 5901 return ret; 5902 5903 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, 5904 master); 5905 if (ret) 5906 return ret; 5907 5908 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5909 &changeupper_info.info); 5910 ret = notifier_to_errno(ret); 5911 if (ret) 5912 goto rollback; 5913 5914 return 0; 5915 5916 rollback: 5917 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5918 5919 return ret; 5920 } 5921 5922 /** 5923 * netdev_upper_dev_link - Add a link to the upper device 5924 * @dev: device 5925 * @upper_dev: new upper device 5926 * 5927 * Adds a link to device which is upper to this one. The caller must hold 5928 * the RTNL lock. On a failure a negative errno code is returned. 5929 * On success the reference counts are adjusted and the function 5930 * returns zero. 5931 */ 5932 int netdev_upper_dev_link(struct net_device *dev, 5933 struct net_device *upper_dev) 5934 { 5935 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); 5936 } 5937 EXPORT_SYMBOL(netdev_upper_dev_link); 5938 5939 /** 5940 * netdev_master_upper_dev_link - Add a master link to the upper device 5941 * @dev: device 5942 * @upper_dev: new upper device 5943 * @upper_priv: upper device private 5944 * @upper_info: upper info to be passed down via notifier 5945 * 5946 * Adds a link to device which is upper to this one. In this case, only 5947 * one master upper device can be linked, although other non-master devices 5948 * might be linked as well. The caller must hold the RTNL lock. 5949 * On a failure a negative errno code is returned. On success the reference 5950 * counts are adjusted and the function returns zero. 5951 */ 5952 int netdev_master_upper_dev_link(struct net_device *dev, 5953 struct net_device *upper_dev, 5954 void *upper_priv, void *upper_info) 5955 { 5956 return __netdev_upper_dev_link(dev, upper_dev, true, 5957 upper_priv, upper_info); 5958 } 5959 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5960 5961 /** 5962 * netdev_upper_dev_unlink - Removes a link to upper device 5963 * @dev: device 5964 * @upper_dev: new upper device 5965 * 5966 * Removes a link to device which is upper to this one. The caller must hold 5967 * the RTNL lock. 
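 *
 * This is the teardown counterpart of netdev_upper_dev_link() and
 * netdev_master_upper_dev_link(); a master device driver would typically
 * call it for each slave it releases, again under rtnl_lock().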
5968 */ 5969 void netdev_upper_dev_unlink(struct net_device *dev, 5970 struct net_device *upper_dev) 5971 { 5972 struct netdev_notifier_changeupper_info changeupper_info; 5973 ASSERT_RTNL(); 5974 5975 changeupper_info.upper_dev = upper_dev; 5976 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; 5977 changeupper_info.linking = false; 5978 5979 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5980 &changeupper_info.info); 5981 5982 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5983 5984 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5985 &changeupper_info.info); 5986 } 5987 EXPORT_SYMBOL(netdev_upper_dev_unlink); 5988 5989 /** 5990 * netdev_bonding_info_change - Dispatch event about slave change 5991 * @dev: device 5992 * @bonding_info: info to dispatch 5993 * 5994 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 5995 * The caller must hold the RTNL lock. 5996 */ 5997 void netdev_bonding_info_change(struct net_device *dev, 5998 struct netdev_bonding_info *bonding_info) 5999 { 6000 struct netdev_notifier_bonding_info info; 6001 6002 memcpy(&info.bonding_info, bonding_info, 6003 sizeof(struct netdev_bonding_info)); 6004 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 6005 &info.info); 6006 } 6007 EXPORT_SYMBOL(netdev_bonding_info_change); 6008 6009 static void netdev_adjacent_add_links(struct net_device *dev) 6010 { 6011 struct netdev_adjacent *iter; 6012 6013 struct net *net = dev_net(dev); 6014 6015 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6016 if (!net_eq(net, dev_net(iter->dev))) 6017 continue; 6018 netdev_adjacent_sysfs_add(iter->dev, dev, 6019 &iter->dev->adj_list.lower); 6020 netdev_adjacent_sysfs_add(dev, iter->dev, 6021 &dev->adj_list.upper); 6022 } 6023 6024 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6025 if (!net_eq(net, dev_net(iter->dev))) 6026 continue; 6027 netdev_adjacent_sysfs_add(iter->dev, dev, 6028 &iter->dev->adj_list.upper); 6029 netdev_adjacent_sysfs_add(dev, iter->dev, 6030 &dev->adj_list.lower); 6031 } 6032 } 6033 6034 static void netdev_adjacent_del_links(struct net_device *dev) 6035 { 6036 struct netdev_adjacent *iter; 6037 6038 struct net *net = dev_net(dev); 6039 6040 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6041 if (!net_eq(net, dev_net(iter->dev))) 6042 continue; 6043 netdev_adjacent_sysfs_del(iter->dev, dev->name, 6044 &iter->dev->adj_list.lower); 6045 netdev_adjacent_sysfs_del(dev, iter->dev->name, 6046 &dev->adj_list.upper); 6047 } 6048 6049 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6050 if (!net_eq(net, dev_net(iter->dev))) 6051 continue; 6052 netdev_adjacent_sysfs_del(iter->dev, dev->name, 6053 &iter->dev->adj_list.upper); 6054 netdev_adjacent_sysfs_del(dev, iter->dev->name, 6055 &dev->adj_list.lower); 6056 } 6057 } 6058 6059 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 6060 { 6061 struct netdev_adjacent *iter; 6062 6063 struct net *net = dev_net(dev); 6064 6065 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6066 if (!net_eq(net, dev_net(iter->dev))) 6067 continue; 6068 netdev_adjacent_sysfs_del(iter->dev, oldname, 6069 &iter->dev->adj_list.lower); 6070 netdev_adjacent_sysfs_add(iter->dev, dev, 6071 &iter->dev->adj_list.lower); 6072 } 6073 6074 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6075 if (!net_eq(net, dev_net(iter->dev))) 6076 continue; 6077 netdev_adjacent_sysfs_del(iter->dev, oldname, 6078 &iter->dev->adj_list.upper); 6079 netdev_adjacent_sysfs_add(iter->dev, dev, 6080 
&iter->dev->adj_list.upper);
6081 }
6082 }
6083
6084 void *netdev_lower_dev_get_private(struct net_device *dev,
6085 struct net_device *lower_dev)
6086 {
6087 struct netdev_adjacent *lower;
6088
6089 if (!lower_dev)
6090 return NULL;
6091 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6092 if (!lower)
6093 return NULL;
6094
6095 return lower->private;
6096 }
6097 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6098
6099
6100 int dev_get_nest_level(struct net_device *dev)
6101 {
6102 struct net_device *lower = NULL;
6103 struct list_head *iter;
6104 int max_nest = -1;
6105 int nest;
6106
6107 ASSERT_RTNL();
6108
6109 netdev_for_each_lower_dev(dev, lower, iter) {
6110 nest = dev_get_nest_level(lower);
6111 if (max_nest < nest)
6112 max_nest = nest;
6113 }
6114
6115 return max_nest + 1;
6116 }
6117 EXPORT_SYMBOL(dev_get_nest_level);
6118
6119 /**
6120 * netdev_lower_state_changed - Dispatch event about lower device state change
6121 * @lower_dev: device
6122 * @lower_state_info: state to dispatch
6123 *
6124 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6125 * The caller must hold the RTNL lock.
6126 */
6127 void netdev_lower_state_changed(struct net_device *lower_dev,
6128 void *lower_state_info)
6129 {
6130 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6131
6132 ASSERT_RTNL();
6133 changelowerstate_info.lower_state_info = lower_state_info;
6134 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6135 &changelowerstate_info.info);
6136 }
6137 EXPORT_SYMBOL(netdev_lower_state_changed);
6138
6139 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6140 struct neighbour *n)
6141 {
6142 struct net_device *lower_dev, *stop_dev;
6143 struct list_head *iter;
6144 int err;
6145
6146 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6147 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6148 continue;
6149 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6150 if (err) {
6151 stop_dev = lower_dev;
6152 goto rollback;
6153 }
6154 }
6155 return 0;
6156
6157 rollback:
6158 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6159 if (lower_dev == stop_dev)
6160 break;
6161 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6162 continue;
6163 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6164 }
6165 return err;
6166 }
6167 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6168
6169 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6170 struct neighbour *n)
6171 {
6172 struct net_device *lower_dev;
6173 struct list_head *iter;
6174
6175 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6176 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6177 continue;
6178 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6179 }
6180 }
6181 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6182
6183 static void dev_change_rx_flags(struct net_device *dev, int flags)
6184 {
6185 const struct net_device_ops *ops = dev->netdev_ops;
6186
6187 if (ops->ndo_change_rx_flags)
6188 ops->ndo_change_rx_flags(dev, flags);
6189 }
6190
6191 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6192 {
6193 unsigned int old_flags = dev->flags;
6194 kuid_t uid;
6195 kgid_t gid;
6196
6197 ASSERT_RTNL();
6198
6199 dev->flags |= IFF_PROMISC;
6200 dev->promiscuity += inc;
6201 if (dev->promiscuity == 0) {
6202 /*
6203 * Avoid overflow.
6204 * If inc causes overflow, untouch promisc and return error.
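 * (Concrete case, for illustration: dev->promiscuity is an unsigned
 * counter, so if it is already at UINT_MAX and inc == 1 the addition
 * above wraps it to 0; the increment is undone and -EOVERFLOW returned
 * rather than silently losing track of the outstanding references.)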
6205 */ 6206 if (inc < 0) 6207 dev->flags &= ~IFF_PROMISC; 6208 else { 6209 dev->promiscuity -= inc; 6210 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 6211 dev->name); 6212 return -EOVERFLOW; 6213 } 6214 } 6215 if (dev->flags != old_flags) { 6216 pr_info("device %s %s promiscuous mode\n", 6217 dev->name, 6218 dev->flags & IFF_PROMISC ? "entered" : "left"); 6219 if (audit_enabled) { 6220 current_uid_gid(&uid, &gid); 6221 audit_log(current->audit_context, GFP_ATOMIC, 6222 AUDIT_ANOM_PROMISCUOUS, 6223 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 6224 dev->name, (dev->flags & IFF_PROMISC), 6225 (old_flags & IFF_PROMISC), 6226 from_kuid(&init_user_ns, audit_get_loginuid(current)), 6227 from_kuid(&init_user_ns, uid), 6228 from_kgid(&init_user_ns, gid), 6229 audit_get_sessionid(current)); 6230 } 6231 6232 dev_change_rx_flags(dev, IFF_PROMISC); 6233 } 6234 if (notify) 6235 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 6236 return 0; 6237 } 6238 6239 /** 6240 * dev_set_promiscuity - update promiscuity count on a device 6241 * @dev: device 6242 * @inc: modifier 6243 * 6244 * Add or remove promiscuity from a device. While the count in the device 6245 * remains above zero the interface remains promiscuous. Once it hits zero 6246 * the device reverts back to normal filtering operation. A negative inc 6247 * value is used to drop promiscuity on the device. 6248 * Return 0 if successful or a negative errno code on error. 6249 */ 6250 int dev_set_promiscuity(struct net_device *dev, int inc) 6251 { 6252 unsigned int old_flags = dev->flags; 6253 int err; 6254 6255 err = __dev_set_promiscuity(dev, inc, true); 6256 if (err < 0) 6257 return err; 6258 if (dev->flags != old_flags) 6259 dev_set_rx_mode(dev); 6260 return err; 6261 } 6262 EXPORT_SYMBOL(dev_set_promiscuity); 6263 6264 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 6265 { 6266 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 6267 6268 ASSERT_RTNL(); 6269 6270 dev->flags |= IFF_ALLMULTI; 6271 dev->allmulti += inc; 6272 if (dev->allmulti == 0) { 6273 /* 6274 * Avoid overflow. 6275 * If inc causes overflow, untouch allmulti and return error. 6276 */ 6277 if (inc < 0) 6278 dev->flags &= ~IFF_ALLMULTI; 6279 else { 6280 dev->allmulti -= inc; 6281 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", 6282 dev->name); 6283 return -EOVERFLOW; 6284 } 6285 } 6286 if (dev->flags ^ old_flags) { 6287 dev_change_rx_flags(dev, IFF_ALLMULTI); 6288 dev_set_rx_mode(dev); 6289 if (notify) 6290 __dev_notify_flags(dev, old_flags, 6291 dev->gflags ^ old_gflags); 6292 } 6293 return 0; 6294 } 6295 6296 /** 6297 * dev_set_allmulti - update allmulti count on a device 6298 * @dev: device 6299 * @inc: modifier 6300 * 6301 * Add or remove reception of all multicast frames to a device. While the 6302 * count in the device remains above zero the interface remains listening 6303 * to all interfaces. Once it hits zero the device reverts back to normal 6304 * filtering operation. A negative @inc value is used to drop the counter 6305 * when releasing a resource needing all multicasts. 6306 * Return 0 if successful or a negative errno code on error. 
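 *
 * Illustrative sketch only (not original source text): a hypothetical
 * driver that needs every multicast frame while a monitoring feature is
 * active could take and later drop one reference, under RTNL (lower_dev
 * is a placeholder name):
 *
 *    err = dev_set_allmulti(lower_dev, 1);
 *    if (err)
 *        return err;
 *    ...
 *    dev_set_allmulti(lower_dev, -1);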
6307 */ 6308 6309 int dev_set_allmulti(struct net_device *dev, int inc) 6310 { 6311 return __dev_set_allmulti(dev, inc, true); 6312 } 6313 EXPORT_SYMBOL(dev_set_allmulti); 6314 6315 /* 6316 * Upload unicast and multicast address lists to device and 6317 * configure RX filtering. When the device doesn't support unicast 6318 * filtering it is put in promiscuous mode while unicast addresses 6319 * are present. 6320 */ 6321 void __dev_set_rx_mode(struct net_device *dev) 6322 { 6323 const struct net_device_ops *ops = dev->netdev_ops; 6324 6325 /* dev_open will call this function so the list will stay sane. */ 6326 if (!(dev->flags&IFF_UP)) 6327 return; 6328 6329 if (!netif_device_present(dev)) 6330 return; 6331 6332 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 6333 /* Unicast addresses changes may only happen under the rtnl, 6334 * therefore calling __dev_set_promiscuity here is safe. 6335 */ 6336 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 6337 __dev_set_promiscuity(dev, 1, false); 6338 dev->uc_promisc = true; 6339 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 6340 __dev_set_promiscuity(dev, -1, false); 6341 dev->uc_promisc = false; 6342 } 6343 } 6344 6345 if (ops->ndo_set_rx_mode) 6346 ops->ndo_set_rx_mode(dev); 6347 } 6348 6349 void dev_set_rx_mode(struct net_device *dev) 6350 { 6351 netif_addr_lock_bh(dev); 6352 __dev_set_rx_mode(dev); 6353 netif_addr_unlock_bh(dev); 6354 } 6355 6356 /** 6357 * dev_get_flags - get flags reported to userspace 6358 * @dev: device 6359 * 6360 * Get the combination of flag bits exported through APIs to userspace. 6361 */ 6362 unsigned int dev_get_flags(const struct net_device *dev) 6363 { 6364 unsigned int flags; 6365 6366 flags = (dev->flags & ~(IFF_PROMISC | 6367 IFF_ALLMULTI | 6368 IFF_RUNNING | 6369 IFF_LOWER_UP | 6370 IFF_DORMANT)) | 6371 (dev->gflags & (IFF_PROMISC | 6372 IFF_ALLMULTI)); 6373 6374 if (netif_running(dev)) { 6375 if (netif_oper_up(dev)) 6376 flags |= IFF_RUNNING; 6377 if (netif_carrier_ok(dev)) 6378 flags |= IFF_LOWER_UP; 6379 if (netif_dormant(dev)) 6380 flags |= IFF_DORMANT; 6381 } 6382 6383 return flags; 6384 } 6385 EXPORT_SYMBOL(dev_get_flags); 6386 6387 int __dev_change_flags(struct net_device *dev, unsigned int flags) 6388 { 6389 unsigned int old_flags = dev->flags; 6390 int ret; 6391 6392 ASSERT_RTNL(); 6393 6394 /* 6395 * Set the flags on our device. 6396 */ 6397 6398 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 6399 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 6400 IFF_AUTOMEDIA)) | 6401 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 6402 IFF_ALLMULTI)); 6403 6404 /* 6405 * Load in the correct multicast list now the flags have changed. 6406 */ 6407 6408 if ((old_flags ^ flags) & IFF_MULTICAST) 6409 dev_change_rx_flags(dev, IFF_MULTICAST); 6410 6411 dev_set_rx_mode(dev); 6412 6413 /* 6414 * Have we downed the interface. We handle IFF_UP ourselves 6415 * according to user attempts to set it, rather than blindly 6416 * setting it. 6417 */ 6418 6419 ret = 0; 6420 if ((old_flags ^ flags) & IFF_UP) 6421 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 6422 6423 if ((flags ^ dev->gflags) & IFF_PROMISC) { 6424 int inc = (flags & IFF_PROMISC) ? 1 : -1; 6425 unsigned int old_flags = dev->flags; 6426 6427 dev->gflags ^= IFF_PROMISC; 6428 6429 if (__dev_set_promiscuity(dev, inc, false) >= 0) 6430 if (dev->flags != old_flags) 6431 dev_set_rx_mode(dev); 6432 } 6433 6434 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 6435 is important. 
Some (broken) drivers set IFF_PROMISC, when 6436 IFF_ALLMULTI is requested not asking us and not reporting. 6437 */ 6438 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 6439 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 6440 6441 dev->gflags ^= IFF_ALLMULTI; 6442 __dev_set_allmulti(dev, inc, false); 6443 } 6444 6445 return ret; 6446 } 6447 6448 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 6449 unsigned int gchanges) 6450 { 6451 unsigned int changes = dev->flags ^ old_flags; 6452 6453 if (gchanges) 6454 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 6455 6456 if (changes & IFF_UP) { 6457 if (dev->flags & IFF_UP) 6458 call_netdevice_notifiers(NETDEV_UP, dev); 6459 else 6460 call_netdevice_notifiers(NETDEV_DOWN, dev); 6461 } 6462 6463 if (dev->flags & IFF_UP && 6464 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 6465 struct netdev_notifier_change_info change_info; 6466 6467 change_info.flags_changed = changes; 6468 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 6469 &change_info.info); 6470 } 6471 } 6472 6473 /** 6474 * dev_change_flags - change device settings 6475 * @dev: device 6476 * @flags: device state flags 6477 * 6478 * Change settings on device based state flags. The flags are 6479 * in the userspace exported format. 6480 */ 6481 int dev_change_flags(struct net_device *dev, unsigned int flags) 6482 { 6483 int ret; 6484 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 6485 6486 ret = __dev_change_flags(dev, flags); 6487 if (ret < 0) 6488 return ret; 6489 6490 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 6491 __dev_notify_flags(dev, old_flags, changes); 6492 return ret; 6493 } 6494 EXPORT_SYMBOL(dev_change_flags); 6495 6496 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 6497 { 6498 const struct net_device_ops *ops = dev->netdev_ops; 6499 6500 if (ops->ndo_change_mtu) 6501 return ops->ndo_change_mtu(dev, new_mtu); 6502 6503 dev->mtu = new_mtu; 6504 return 0; 6505 } 6506 6507 /** 6508 * dev_set_mtu - Change maximum transfer unit 6509 * @dev: device 6510 * @new_mtu: new transfer unit 6511 * 6512 * Change the maximum transfer size of the network device. 6513 */ 6514 int dev_set_mtu(struct net_device *dev, int new_mtu) 6515 { 6516 int err, orig_mtu; 6517 6518 if (new_mtu == dev->mtu) 6519 return 0; 6520 6521 /* MTU must be positive, and in range */ 6522 if (new_mtu < 0 || new_mtu < dev->min_mtu) { 6523 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", 6524 dev->name, new_mtu, dev->min_mtu); 6525 return -EINVAL; 6526 } 6527 6528 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { 6529 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", 6530 dev->name, new_mtu, dev->max_mtu); 6531 return -EINVAL; 6532 } 6533 6534 if (!netif_device_present(dev)) 6535 return -ENODEV; 6536 6537 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 6538 err = notifier_to_errno(err); 6539 if (err) 6540 return err; 6541 6542 orig_mtu = dev->mtu; 6543 err = __dev_set_mtu(dev, new_mtu); 6544 6545 if (!err) { 6546 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6547 err = notifier_to_errno(err); 6548 if (err) { 6549 /* setting mtu back and notifying everyone again, 6550 * so that they have a chance to revert changes. 
6551 */ 6552 __dev_set_mtu(dev, orig_mtu); 6553 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6554 } 6555 } 6556 return err; 6557 } 6558 EXPORT_SYMBOL(dev_set_mtu); 6559 6560 /** 6561 * dev_set_group - Change group this device belongs to 6562 * @dev: device 6563 * @new_group: group this device should belong to 6564 */ 6565 void dev_set_group(struct net_device *dev, int new_group) 6566 { 6567 dev->group = new_group; 6568 } 6569 EXPORT_SYMBOL(dev_set_group); 6570 6571 /** 6572 * dev_set_mac_address - Change Media Access Control Address 6573 * @dev: device 6574 * @sa: new address 6575 * 6576 * Change the hardware (MAC) address of the device 6577 */ 6578 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 6579 { 6580 const struct net_device_ops *ops = dev->netdev_ops; 6581 int err; 6582 6583 if (!ops->ndo_set_mac_address) 6584 return -EOPNOTSUPP; 6585 if (sa->sa_family != dev->type) 6586 return -EINVAL; 6587 if (!netif_device_present(dev)) 6588 return -ENODEV; 6589 err = ops->ndo_set_mac_address(dev, sa); 6590 if (err) 6591 return err; 6592 dev->addr_assign_type = NET_ADDR_SET; 6593 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 6594 add_device_randomness(dev->dev_addr, dev->addr_len); 6595 return 0; 6596 } 6597 EXPORT_SYMBOL(dev_set_mac_address); 6598 6599 /** 6600 * dev_change_carrier - Change device carrier 6601 * @dev: device 6602 * @new_carrier: new value 6603 * 6604 * Change device carrier 6605 */ 6606 int dev_change_carrier(struct net_device *dev, bool new_carrier) 6607 { 6608 const struct net_device_ops *ops = dev->netdev_ops; 6609 6610 if (!ops->ndo_change_carrier) 6611 return -EOPNOTSUPP; 6612 if (!netif_device_present(dev)) 6613 return -ENODEV; 6614 return ops->ndo_change_carrier(dev, new_carrier); 6615 } 6616 EXPORT_SYMBOL(dev_change_carrier); 6617 6618 /** 6619 * dev_get_phys_port_id - Get device physical port ID 6620 * @dev: device 6621 * @ppid: port ID 6622 * 6623 * Get device physical port ID 6624 */ 6625 int dev_get_phys_port_id(struct net_device *dev, 6626 struct netdev_phys_item_id *ppid) 6627 { 6628 const struct net_device_ops *ops = dev->netdev_ops; 6629 6630 if (!ops->ndo_get_phys_port_id) 6631 return -EOPNOTSUPP; 6632 return ops->ndo_get_phys_port_id(dev, ppid); 6633 } 6634 EXPORT_SYMBOL(dev_get_phys_port_id); 6635 6636 /** 6637 * dev_get_phys_port_name - Get device physical port name 6638 * @dev: device 6639 * @name: port name 6640 * @len: limit of bytes to copy to name 6641 * 6642 * Get device physical port name 6643 */ 6644 int dev_get_phys_port_name(struct net_device *dev, 6645 char *name, size_t len) 6646 { 6647 const struct net_device_ops *ops = dev->netdev_ops; 6648 6649 if (!ops->ndo_get_phys_port_name) 6650 return -EOPNOTSUPP; 6651 return ops->ndo_get_phys_port_name(dev, name, len); 6652 } 6653 EXPORT_SYMBOL(dev_get_phys_port_name); 6654 6655 /** 6656 * dev_change_proto_down - update protocol port state information 6657 * @dev: device 6658 * @proto_down: new value 6659 * 6660 * This info can be used by switch drivers to set the phys state of the 6661 * port. 
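 *
 * Illustrative sketch only: proto_down is normally driven from rtnetlink
 * on behalf of userspace, but a hypothetical in-kernel caller holding
 * RTNL could mark a port down like this (port_dev is a placeholder):
 *
 *    err = dev_change_proto_down(port_dev, true);
 *    if (err == -EOPNOTSUPP)
 *        netdev_warn(port_dev, "proto_down not supported\n");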
6662 */ 6663 int dev_change_proto_down(struct net_device *dev, bool proto_down) 6664 { 6665 const struct net_device_ops *ops = dev->netdev_ops; 6666 6667 if (!ops->ndo_change_proto_down) 6668 return -EOPNOTSUPP; 6669 if (!netif_device_present(dev)) 6670 return -ENODEV; 6671 return ops->ndo_change_proto_down(dev, proto_down); 6672 } 6673 EXPORT_SYMBOL(dev_change_proto_down); 6674 6675 /** 6676 * dev_change_xdp_fd - set or clear a bpf program for a device rx path 6677 * @dev: device 6678 * @fd: new program fd or negative value to clear 6679 * @flags: xdp-related flags 6680 * 6681 * Set or clear a bpf program for a device 6682 */ 6683 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) 6684 { 6685 const struct net_device_ops *ops = dev->netdev_ops; 6686 struct bpf_prog *prog = NULL; 6687 struct netdev_xdp xdp; 6688 int err; 6689 6690 ASSERT_RTNL(); 6691 6692 if (!ops->ndo_xdp) 6693 return -EOPNOTSUPP; 6694 if (fd >= 0) { 6695 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { 6696 memset(&xdp, 0, sizeof(xdp)); 6697 xdp.command = XDP_QUERY_PROG; 6698 6699 err = ops->ndo_xdp(dev, &xdp); 6700 if (err < 0) 6701 return err; 6702 if (xdp.prog_attached) 6703 return -EBUSY; 6704 } 6705 6706 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); 6707 if (IS_ERR(prog)) 6708 return PTR_ERR(prog); 6709 } 6710 6711 memset(&xdp, 0, sizeof(xdp)); 6712 xdp.command = XDP_SETUP_PROG; 6713 xdp.prog = prog; 6714 6715 err = ops->ndo_xdp(dev, &xdp); 6716 if (err < 0 && prog) 6717 bpf_prog_put(prog); 6718 6719 return err; 6720 } 6721 EXPORT_SYMBOL(dev_change_xdp_fd); 6722 6723 /** 6724 * dev_new_index - allocate an ifindex 6725 * @net: the applicable net namespace 6726 * 6727 * Returns a suitable unique value for a new device interface 6728 * number. The caller must hold the rtnl semaphore or the 6729 * dev_base_lock to be sure it remains unique. 6730 */ 6731 static int dev_new_index(struct net *net) 6732 { 6733 int ifindex = net->ifindex; 6734 for (;;) { 6735 if (++ifindex <= 0) 6736 ifindex = 1; 6737 if (!__dev_get_by_index(net, ifindex)) 6738 return net->ifindex = ifindex; 6739 } 6740 } 6741 6742 /* Delayed registration/unregisteration */ 6743 static LIST_HEAD(net_todo_list); 6744 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 6745 6746 static void net_set_todo(struct net_device *dev) 6747 { 6748 list_add_tail(&dev->todo_list, &net_todo_list); 6749 dev_net(dev)->dev_unreg_count++; 6750 } 6751 6752 static void rollback_registered_many(struct list_head *head) 6753 { 6754 struct net_device *dev, *tmp; 6755 LIST_HEAD(close_head); 6756 6757 BUG_ON(dev_boot_phase); 6758 ASSERT_RTNL(); 6759 6760 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 6761 /* Some devices call without registering 6762 * for initialization unwind. Remove those 6763 * devices and proceed with the remaining. 6764 */ 6765 if (dev->reg_state == NETREG_UNINITIALIZED) { 6766 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 6767 dev->name, dev); 6768 6769 WARN_ON(1); 6770 list_del(&dev->unreg_list); 6771 continue; 6772 } 6773 dev->dismantle = true; 6774 BUG_ON(dev->reg_state != NETREG_REGISTERED); 6775 } 6776 6777 /* If device is running, close it first. */ 6778 list_for_each_entry(dev, head, unreg_list) 6779 list_add_tail(&dev->close_list, &close_head); 6780 dev_close_many(&close_head, true); 6781 6782 list_for_each_entry(dev, head, unreg_list) { 6783 /* And unlink it from device chain. 
*/ 6784 unlist_netdevice(dev); 6785 6786 dev->reg_state = NETREG_UNREGISTERING; 6787 } 6788 flush_all_backlogs(); 6789 6790 synchronize_net(); 6791 6792 list_for_each_entry(dev, head, unreg_list) { 6793 struct sk_buff *skb = NULL; 6794 6795 /* Shutdown queueing discipline. */ 6796 dev_shutdown(dev); 6797 6798 6799 /* Notify protocols, that we are about to destroy 6800 this device. They should clean all the things. 6801 */ 6802 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6803 6804 if (!dev->rtnl_link_ops || 6805 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6806 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6807 GFP_KERNEL); 6808 6809 /* 6810 * Flush the unicast and multicast chains 6811 */ 6812 dev_uc_flush(dev); 6813 dev_mc_flush(dev); 6814 6815 if (dev->netdev_ops->ndo_uninit) 6816 dev->netdev_ops->ndo_uninit(dev); 6817 6818 if (skb) 6819 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6820 6821 /* Notifier chain MUST detach us all upper devices. */ 6822 WARN_ON(netdev_has_any_upper_dev(dev)); 6823 WARN_ON(netdev_has_any_lower_dev(dev)); 6824 6825 /* Remove entries from kobject tree */ 6826 netdev_unregister_kobject(dev); 6827 #ifdef CONFIG_XPS 6828 /* Remove XPS queueing entries */ 6829 netif_reset_xps_queues_gt(dev, 0); 6830 #endif 6831 } 6832 6833 synchronize_net(); 6834 6835 list_for_each_entry(dev, head, unreg_list) 6836 dev_put(dev); 6837 } 6838 6839 static void rollback_registered(struct net_device *dev) 6840 { 6841 LIST_HEAD(single); 6842 6843 list_add(&dev->unreg_list, &single); 6844 rollback_registered_many(&single); 6845 list_del(&single); 6846 } 6847 6848 static netdev_features_t netdev_sync_upper_features(struct net_device *lower, 6849 struct net_device *upper, netdev_features_t features) 6850 { 6851 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 6852 netdev_features_t feature; 6853 int feature_bit; 6854 6855 for_each_netdev_feature(&upper_disables, feature_bit) { 6856 feature = __NETIF_F_BIT(feature_bit); 6857 if (!(upper->wanted_features & feature) 6858 && (features & feature)) { 6859 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", 6860 &feature, upper->name); 6861 features &= ~feature; 6862 } 6863 } 6864 6865 return features; 6866 } 6867 6868 static void netdev_sync_lower_features(struct net_device *upper, 6869 struct net_device *lower, netdev_features_t features) 6870 { 6871 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 6872 netdev_features_t feature; 6873 int feature_bit; 6874 6875 for_each_netdev_feature(&upper_disables, feature_bit) { 6876 feature = __NETIF_F_BIT(feature_bit); 6877 if (!(features & feature) && (lower->features & feature)) { 6878 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", 6879 &feature, lower->name); 6880 lower->wanted_features &= ~feature; 6881 netdev_update_features(lower); 6882 6883 if (unlikely(lower->features & feature)) 6884 netdev_WARN(upper, "failed to disable %pNF on %s!\n", 6885 &feature, lower->name); 6886 } 6887 } 6888 } 6889 6890 static netdev_features_t netdev_fix_features(struct net_device *dev, 6891 netdev_features_t features) 6892 { 6893 /* Fix illegal checksum combinations */ 6894 if ((features & NETIF_F_HW_CSUM) && 6895 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6896 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6897 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6898 } 6899 6900 /* TSO requires that SG is present as well. 
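 * (GSO/TSO skbs normally carry their payload in paged fragments, so a
 * device without scatter-gather support could not map and transmit them.)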
*/ 6901 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6902 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6903 features &= ~NETIF_F_ALL_TSO; 6904 } 6905 6906 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6907 !(features & NETIF_F_IP_CSUM)) { 6908 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6909 features &= ~NETIF_F_TSO; 6910 features &= ~NETIF_F_TSO_ECN; 6911 } 6912 6913 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6914 !(features & NETIF_F_IPV6_CSUM)) { 6915 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6916 features &= ~NETIF_F_TSO6; 6917 } 6918 6919 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ 6920 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) 6921 features &= ~NETIF_F_TSO_MANGLEID; 6922 6923 /* TSO ECN requires that TSO is present as well. */ 6924 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6925 features &= ~NETIF_F_TSO_ECN; 6926 6927 /* Software GSO depends on SG. */ 6928 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6929 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6930 features &= ~NETIF_F_GSO; 6931 } 6932 6933 /* UFO needs SG and checksumming */ 6934 if (features & NETIF_F_UFO) { 6935 /* maybe split UFO into V4 and V6? */ 6936 if (!(features & NETIF_F_HW_CSUM) && 6937 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != 6938 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { 6939 netdev_dbg(dev, 6940 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6941 features &= ~NETIF_F_UFO; 6942 } 6943 6944 if (!(features & NETIF_F_SG)) { 6945 netdev_dbg(dev, 6946 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6947 features &= ~NETIF_F_UFO; 6948 } 6949 } 6950 6951 /* GSO partial features require GSO partial be set */ 6952 if ((features & dev->gso_partial_features) && 6953 !(features & NETIF_F_GSO_PARTIAL)) { 6954 netdev_dbg(dev, 6955 "Dropping partially supported GSO features since no GSO partial.\n"); 6956 features &= ~dev->gso_partial_features; 6957 } 6958 6959 #ifdef CONFIG_NET_RX_BUSY_POLL 6960 if (dev->netdev_ops->ndo_busy_poll) 6961 features |= NETIF_F_BUSY_POLL; 6962 else 6963 #endif 6964 features &= ~NETIF_F_BUSY_POLL; 6965 6966 return features; 6967 } 6968 6969 int __netdev_update_features(struct net_device *dev) 6970 { 6971 struct net_device *upper, *lower; 6972 netdev_features_t features; 6973 struct list_head *iter; 6974 int err = -1; 6975 6976 ASSERT_RTNL(); 6977 6978 features = netdev_get_wanted_features(dev); 6979 6980 if (dev->netdev_ops->ndo_fix_features) 6981 features = dev->netdev_ops->ndo_fix_features(dev, features); 6982 6983 /* driver might be less strict about feature dependencies */ 6984 features = netdev_fix_features(dev, features); 6985 6986 /* some features can't be enabled if they're off an an upper device */ 6987 netdev_for_each_upper_dev_rcu(dev, upper, iter) 6988 features = netdev_sync_upper_features(dev, upper, features); 6989 6990 if (dev->features == features) 6991 goto sync_lower; 6992 6993 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6994 &dev->features, &features); 6995 6996 if (dev->netdev_ops->ndo_set_features) 6997 err = dev->netdev_ops->ndo_set_features(dev, features); 6998 else 6999 err = 0; 7000 7001 if (unlikely(err < 0)) { 7002 netdev_err(dev, 7003 "set_features() failed (%d); wanted %pNF, left %pNF\n", 7004 err, &features, &dev->features); 7005 /* return non-0 since some features might have changed and 7006 * it's better to fire 
a spurious notification than miss it 7007 */ 7008 return -1; 7009 } 7010 7011 sync_lower: 7012 /* some features must be disabled on lower devices when disabled 7013 * on an upper device (think: bonding master or bridge) 7014 */ 7015 netdev_for_each_lower_dev(dev, lower, iter) 7016 netdev_sync_lower_features(dev, lower, features); 7017 7018 if (!err) 7019 dev->features = features; 7020 7021 return err < 0 ? 0 : 1; 7022 } 7023 7024 /** 7025 * netdev_update_features - recalculate device features 7026 * @dev: the device to check 7027 * 7028 * Recalculate dev->features set and send notifications if it 7029 * has changed. Should be called after driver or hardware dependent 7030 * conditions might have changed that influence the features. 7031 */ 7032 void netdev_update_features(struct net_device *dev) 7033 { 7034 if (__netdev_update_features(dev)) 7035 netdev_features_change(dev); 7036 } 7037 EXPORT_SYMBOL(netdev_update_features); 7038 7039 /** 7040 * netdev_change_features - recalculate device features 7041 * @dev: the device to check 7042 * 7043 * Recalculate dev->features set and send notifications even 7044 * if they have not changed. Should be called instead of 7045 * netdev_update_features() if also dev->vlan_features might 7046 * have changed to allow the changes to be propagated to stacked 7047 * VLAN devices. 7048 */ 7049 void netdev_change_features(struct net_device *dev) 7050 { 7051 __netdev_update_features(dev); 7052 netdev_features_change(dev); 7053 } 7054 EXPORT_SYMBOL(netdev_change_features); 7055 7056 /** 7057 * netif_stacked_transfer_operstate - transfer operstate 7058 * @rootdev: the root or lower level device to transfer state from 7059 * @dev: the device to transfer operstate to 7060 * 7061 * Transfer operational state from root to device. This is normally 7062 * called when a stacking relationship exists between the root 7063 * device and the device(a leaf device). 
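 *
 * Illustrative sketch only: a hypothetical VLAN-like driver would call
 * this after its real device reports a carrier or dormancy change
 * (real_dev and vlan_dev are placeholder names):
 *
 *    netif_stacked_transfer_operstate(real_dev, vlan_dev);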
7064 */ 7065 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 7066 struct net_device *dev) 7067 { 7068 if (rootdev->operstate == IF_OPER_DORMANT) 7069 netif_dormant_on(dev); 7070 else 7071 netif_dormant_off(dev); 7072 7073 if (netif_carrier_ok(rootdev)) { 7074 if (!netif_carrier_ok(dev)) 7075 netif_carrier_on(dev); 7076 } else { 7077 if (netif_carrier_ok(dev)) 7078 netif_carrier_off(dev); 7079 } 7080 } 7081 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 7082 7083 #ifdef CONFIG_SYSFS 7084 static int netif_alloc_rx_queues(struct net_device *dev) 7085 { 7086 unsigned int i, count = dev->num_rx_queues; 7087 struct netdev_rx_queue *rx; 7088 size_t sz = count * sizeof(*rx); 7089 7090 BUG_ON(count < 1); 7091 7092 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7093 if (!rx) { 7094 rx = vzalloc(sz); 7095 if (!rx) 7096 return -ENOMEM; 7097 } 7098 dev->_rx = rx; 7099 7100 for (i = 0; i < count; i++) 7101 rx[i].dev = dev; 7102 return 0; 7103 } 7104 #endif 7105 7106 static void netdev_init_one_queue(struct net_device *dev, 7107 struct netdev_queue *queue, void *_unused) 7108 { 7109 /* Initialize queue lock */ 7110 spin_lock_init(&queue->_xmit_lock); 7111 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 7112 queue->xmit_lock_owner = -1; 7113 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 7114 queue->dev = dev; 7115 #ifdef CONFIG_BQL 7116 dql_init(&queue->dql, HZ); 7117 #endif 7118 } 7119 7120 static void netif_free_tx_queues(struct net_device *dev) 7121 { 7122 kvfree(dev->_tx); 7123 } 7124 7125 static int netif_alloc_netdev_queues(struct net_device *dev) 7126 { 7127 unsigned int count = dev->num_tx_queues; 7128 struct netdev_queue *tx; 7129 size_t sz = count * sizeof(*tx); 7130 7131 if (count < 1 || count > 0xffff) 7132 return -EINVAL; 7133 7134 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7135 if (!tx) { 7136 tx = vzalloc(sz); 7137 if (!tx) 7138 return -ENOMEM; 7139 } 7140 dev->_tx = tx; 7141 7142 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 7143 spin_lock_init(&dev->tx_global_lock); 7144 7145 return 0; 7146 } 7147 7148 void netif_tx_stop_all_queues(struct net_device *dev) 7149 { 7150 unsigned int i; 7151 7152 for (i = 0; i < dev->num_tx_queues; i++) { 7153 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 7154 netif_tx_stop_queue(txq); 7155 } 7156 } 7157 EXPORT_SYMBOL(netif_tx_stop_all_queues); 7158 7159 /** 7160 * register_netdevice - register a network device 7161 * @dev: device to register 7162 * 7163 * Take a completed network device structure and add it to the kernel 7164 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 7165 * chain. 0 is returned on success. A negative errno code is returned 7166 * on a failure to set up the device, or if the name is a duplicate. 7167 * 7168 * Callers must hold the rtnl semaphore. You may want 7169 * register_netdev() instead of this. 7170 * 7171 * BUGS: 7172 * The locking appears insufficient to guarantee two parallel registers 7173 * will not get the same name. 7174 */ 7175 7176 int register_netdevice(struct net_device *dev) 7177 { 7178 int ret; 7179 struct net *net = dev_net(dev); 7180 7181 BUG_ON(dev_boot_phase); 7182 ASSERT_RTNL(); 7183 7184 might_sleep(); 7185 7186 /* When net_device's are persistent, this will be fatal. 
*/ 7187 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 7188 BUG_ON(!net); 7189 7190 spin_lock_init(&dev->addr_list_lock); 7191 netdev_set_addr_lockdep_class(dev); 7192 7193 ret = dev_get_valid_name(net, dev, dev->name); 7194 if (ret < 0) 7195 goto out; 7196 7197 /* Init, if this function is available */ 7198 if (dev->netdev_ops->ndo_init) { 7199 ret = dev->netdev_ops->ndo_init(dev); 7200 if (ret) { 7201 if (ret > 0) 7202 ret = -EIO; 7203 goto out; 7204 } 7205 } 7206 7207 if (((dev->hw_features | dev->features) & 7208 NETIF_F_HW_VLAN_CTAG_FILTER) && 7209 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 7210 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 7211 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 7212 ret = -EINVAL; 7213 goto err_uninit; 7214 } 7215 7216 ret = -EBUSY; 7217 if (!dev->ifindex) 7218 dev->ifindex = dev_new_index(net); 7219 else if (__dev_get_by_index(net, dev->ifindex)) 7220 goto err_uninit; 7221 7222 /* Transfer changeable features to wanted_features and enable 7223 * software offloads (GSO and GRO). 7224 */ 7225 dev->hw_features |= NETIF_F_SOFT_FEATURES; 7226 dev->features |= NETIF_F_SOFT_FEATURES; 7227 dev->wanted_features = dev->features & dev->hw_features; 7228 7229 if (!(dev->flags & IFF_LOOPBACK)) 7230 dev->hw_features |= NETIF_F_NOCACHE_COPY; 7231 7232 /* If IPv4 TCP segmentation offload is supported we should also 7233 * allow the device to enable segmenting the frame with the option 7234 * of ignoring a static IP ID value. This doesn't enable the 7235 * feature itself but allows the user to enable it later. 7236 */ 7237 if (dev->hw_features & NETIF_F_TSO) 7238 dev->hw_features |= NETIF_F_TSO_MANGLEID; 7239 if (dev->vlan_features & NETIF_F_TSO) 7240 dev->vlan_features |= NETIF_F_TSO_MANGLEID; 7241 if (dev->mpls_features & NETIF_F_TSO) 7242 dev->mpls_features |= NETIF_F_TSO_MANGLEID; 7243 if (dev->hw_enc_features & NETIF_F_TSO) 7244 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; 7245 7246 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 7247 */ 7248 dev->vlan_features |= NETIF_F_HIGHDMA; 7249 7250 /* Make NETIF_F_SG inheritable to tunnel devices. 7251 */ 7252 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; 7253 7254 /* Make NETIF_F_SG inheritable to MPLS. 7255 */ 7256 dev->mpls_features |= NETIF_F_SG; 7257 7258 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 7259 ret = notifier_to_errno(ret); 7260 if (ret) 7261 goto err_uninit; 7262 7263 ret = netdev_register_kobject(dev); 7264 if (ret) 7265 goto err_uninit; 7266 dev->reg_state = NETREG_REGISTERED; 7267 7268 __netdev_update_features(dev); 7269 7270 /* 7271 * Default initial state at registry is that the 7272 * device is present. 7273 */ 7274 7275 set_bit(__LINK_STATE_PRESENT, &dev->state); 7276 7277 linkwatch_init_dev(dev); 7278 7279 dev_init_scheduler(dev); 7280 dev_hold(dev); 7281 list_netdevice(dev); 7282 add_device_randomness(dev->dev_addr, dev->addr_len); 7283 7284 /* If the device has permanent device address, driver should 7285 * set dev_addr and also addr_assign_type should be set to 7286 * NET_ADDR_PERM (default value). 7287 */ 7288 if (dev->addr_assign_type == NET_ADDR_PERM) 7289 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 7290 7291 /* Notify protocols, that a new device appeared. 
*/
7292 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7293 ret = notifier_to_errno(ret);
7294 if (ret) {
7295 rollback_registered(dev);
7296 dev->reg_state = NETREG_UNREGISTERED;
7297 }
7298 /*
7299 * Prevent userspace races by waiting until the network
7300 * device is fully setup before sending notifications.
7301 */
7302 if (!dev->rtnl_link_ops ||
7303 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7304 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7305
7306 out:
7307 return ret;
7308
7309 err_uninit:
7310 if (dev->netdev_ops->ndo_uninit)
7311 dev->netdev_ops->ndo_uninit(dev);
7312 goto out;
7313 }
7314 EXPORT_SYMBOL(register_netdevice);
7315
7316 /**
7317 * init_dummy_netdev - init a dummy network device for NAPI
7318 * @dev: device to init
7319 *
7320 * This takes a network device structure and initializes the minimum
7321 * set of fields so it can be used to schedule NAPI polls without
7322 * registering a full blown interface. This is to be used by drivers
7323 * that need to tie several hardware interfaces to a single NAPI
7324 * poll scheduler due to HW limitations.
7325 */
7326 int init_dummy_netdev(struct net_device *dev)
7327 {
7328 /* Clear everything. Note we don't initialize spinlocks
7329 * as they aren't supposed to be taken by any of the
7330 * NAPI code and this dummy netdev is supposed to be
7331 * only ever used for NAPI polls
7332 */
7333 memset(dev, 0, sizeof(struct net_device));
7334
7335 /* make sure we BUG if trying to hit standard
7336 * register/unregister code path
7337 */
7338 dev->reg_state = NETREG_DUMMY;
7339
7340 /* NAPI wants this */
7341 INIT_LIST_HEAD(&dev->napi_list);
7342
7343 /* a dummy interface is started by default */
7344 set_bit(__LINK_STATE_PRESENT, &dev->state);
7345 set_bit(__LINK_STATE_START, &dev->state);
7346
7347 /* Note : We don't allocate pcpu_refcnt for dummy devices,
7348 * because users of this 'device' don't need to change
7349 * its refcount.
7350 */
7351
7352 return 0;
7353 }
7354 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7355
7356
7357 /**
7358 * register_netdev - register a network device
7359 * @dev: device to register
7360 *
7361 * Take a completed network device structure and add it to the kernel
7362 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7363 * chain. 0 is returned on success. A negative errno code is returned
7364 * on a failure to set up the device, or if the name is a duplicate.
7365 *
7366 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7367 * and expands the device name if you passed a format string to
7368 * alloc_netdev.
7369 */
7370 int register_netdev(struct net_device *dev)
7371 {
7372 int err;
7373
7374 rtnl_lock();
7375 err = register_netdevice(dev);
7376 rtnl_unlock();
7377 return err;
7378 }
7379 EXPORT_SYMBOL(register_netdev);
7380
7381 int netdev_refcnt_read(const struct net_device *dev)
7382 {
7383 int i, refcnt = 0;
7384
7385 for_each_possible_cpu(i)
7386 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7387 return refcnt;
7388 }
7389 EXPORT_SYMBOL(netdev_refcnt_read);
7390
7391 /**
7392 * netdev_wait_allrefs - wait until all references are gone.
7393 * @dev: target net_device
7394 *
7395 * This is called when unregistering network devices.
7396 *
7397 * Any protocol or device that holds a reference should register
7398 * for netdevice notification, and cleanup and put back the
7399 * reference if they receive an UNREGISTER event.
7400 * We can get stuck here if buggy protocols don't correctly
7401 * call dev_put.
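 *
 * Illustrative sketch only (not original source text): the expected
 * pattern in a hypothetical protocol that caches a device pointer, with
 * my_cached_dev as a placeholder; the reference is dropped from its
 * netdevice notifier:
 *
 *    dev_hold(dev);
 *    my_cached_dev = dev;
 *    ...
 *    case NETDEV_UNREGISTER:
 *        my_cached_dev = NULL;
 *        dev_put(dev);
 *        break;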
7402 */ 7403 static void netdev_wait_allrefs(struct net_device *dev) 7404 { 7405 unsigned long rebroadcast_time, warning_time; 7406 int refcnt; 7407 7408 linkwatch_forget_dev(dev); 7409 7410 rebroadcast_time = warning_time = jiffies; 7411 refcnt = netdev_refcnt_read(dev); 7412 7413 while (refcnt != 0) { 7414 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 7415 rtnl_lock(); 7416 7417 /* Rebroadcast unregister notification */ 7418 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7419 7420 __rtnl_unlock(); 7421 rcu_barrier(); 7422 rtnl_lock(); 7423 7424 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7425 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 7426 &dev->state)) { 7427 /* We must not have linkwatch events 7428 * pending on unregister. If this 7429 * happens, we simply run the queue 7430 * unscheduled, resulting in a noop 7431 * for this device. 7432 */ 7433 linkwatch_run_queue(); 7434 } 7435 7436 __rtnl_unlock(); 7437 7438 rebroadcast_time = jiffies; 7439 } 7440 7441 msleep(250); 7442 7443 refcnt = netdev_refcnt_read(dev); 7444 7445 if (time_after(jiffies, warning_time + 10 * HZ)) { 7446 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 7447 dev->name, refcnt); 7448 warning_time = jiffies; 7449 } 7450 } 7451 } 7452 7453 /* The sequence is: 7454 * 7455 * rtnl_lock(); 7456 * ... 7457 * register_netdevice(x1); 7458 * register_netdevice(x2); 7459 * ... 7460 * unregister_netdevice(y1); 7461 * unregister_netdevice(y2); 7462 * ... 7463 * rtnl_unlock(); 7464 * free_netdev(y1); 7465 * free_netdev(y2); 7466 * 7467 * We are invoked by rtnl_unlock(). 7468 * This allows us to deal with problems: 7469 * 1) We can delete sysfs objects which invoke hotplug 7470 * without deadlocking with linkwatch via keventd. 7471 * 2) Since we run with the RTNL semaphore not held, we can sleep 7472 * safely in order to wait for the netdev refcnt to drop to zero. 7473 * 7474 * We must not return until all unregister events added during 7475 * the interval the lock was held have been completed. 
7476 */ 7477 void netdev_run_todo(void) 7478 { 7479 struct list_head list; 7480 7481 /* Snapshot list, allow later requests */ 7482 list_replace_init(&net_todo_list, &list); 7483 7484 __rtnl_unlock(); 7485 7486 7487 /* Wait for rcu callbacks to finish before next phase */ 7488 if (!list_empty(&list)) 7489 rcu_barrier(); 7490 7491 while (!list_empty(&list)) { 7492 struct net_device *dev 7493 = list_first_entry(&list, struct net_device, todo_list); 7494 list_del(&dev->todo_list); 7495 7496 rtnl_lock(); 7497 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7498 __rtnl_unlock(); 7499 7500 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 7501 pr_err("network todo '%s' but state %d\n", 7502 dev->name, dev->reg_state); 7503 dump_stack(); 7504 continue; 7505 } 7506 7507 dev->reg_state = NETREG_UNREGISTERED; 7508 7509 netdev_wait_allrefs(dev); 7510 7511 /* paranoia */ 7512 BUG_ON(netdev_refcnt_read(dev)); 7513 BUG_ON(!list_empty(&dev->ptype_all)); 7514 BUG_ON(!list_empty(&dev->ptype_specific)); 7515 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 7516 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 7517 WARN_ON(dev->dn_ptr); 7518 7519 if (dev->destructor) 7520 dev->destructor(dev); 7521 7522 /* Report a network device has been unregistered */ 7523 rtnl_lock(); 7524 dev_net(dev)->dev_unreg_count--; 7525 __rtnl_unlock(); 7526 wake_up(&netdev_unregistering_wq); 7527 7528 /* Free network device */ 7529 kobject_put(&dev->dev.kobj); 7530 } 7531 } 7532 7533 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has 7534 * all the same fields in the same order as net_device_stats, with only 7535 * the type differing, but rtnl_link_stats64 may have additional fields 7536 * at the end for newer counters. 7537 */ 7538 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 7539 const struct net_device_stats *netdev_stats) 7540 { 7541 #if BITS_PER_LONG == 64 7542 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); 7543 memcpy(stats64, netdev_stats, sizeof(*stats64)); 7544 /* zero out counters that only exist in rtnl_link_stats64 */ 7545 memset((char *)stats64 + sizeof(*netdev_stats), 0, 7546 sizeof(*stats64) - sizeof(*netdev_stats)); 7547 #else 7548 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); 7549 const unsigned long *src = (const unsigned long *)netdev_stats; 7550 u64 *dst = (u64 *)stats64; 7551 7552 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); 7553 for (i = 0; i < n; i++) 7554 dst[i] = src[i]; 7555 /* zero out counters that only exist in rtnl_link_stats64 */ 7556 memset((char *)stats64 + n * sizeof(u64), 0, 7557 sizeof(*stats64) - n * sizeof(u64)); 7558 #endif 7559 } 7560 EXPORT_SYMBOL(netdev_stats_to_stats64); 7561 7562 /** 7563 * dev_get_stats - get network device statistics 7564 * @dev: device to get statistics from 7565 * @storage: place to store stats 7566 * 7567 * Get network statistics from device. Return @storage. 7568 * The device driver may provide its own method by setting 7569 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 7570 * otherwise the internal statistics structure is used. 
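 *
 * Illustrative sketch only: callers typically pass on-stack storage and
 * read through the returned pointer (temp and do_something_with are
 * placeholder names):
 *
 *    struct rtnl_link_stats64 temp;
 *    const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *    do_something_with(stats->rx_packets, stats->tx_bytes);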
7571 */ 7572 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 7573 struct rtnl_link_stats64 *storage) 7574 { 7575 const struct net_device_ops *ops = dev->netdev_ops; 7576 7577 if (ops->ndo_get_stats64) { 7578 memset(storage, 0, sizeof(*storage)); 7579 ops->ndo_get_stats64(dev, storage); 7580 } else if (ops->ndo_get_stats) { 7581 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 7582 } else { 7583 netdev_stats_to_stats64(storage, &dev->stats); 7584 } 7585 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 7586 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 7587 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); 7588 return storage; 7589 } 7590 EXPORT_SYMBOL(dev_get_stats); 7591 7592 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 7593 { 7594 struct netdev_queue *queue = dev_ingress_queue(dev); 7595 7596 #ifdef CONFIG_NET_CLS_ACT 7597 if (queue) 7598 return queue; 7599 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 7600 if (!queue) 7601 return NULL; 7602 netdev_init_one_queue(dev, queue, NULL); 7603 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 7604 queue->qdisc_sleeping = &noop_qdisc; 7605 rcu_assign_pointer(dev->ingress_queue, queue); 7606 #endif 7607 return queue; 7608 } 7609 7610 static const struct ethtool_ops default_ethtool_ops; 7611 7612 void netdev_set_default_ethtool_ops(struct net_device *dev, 7613 const struct ethtool_ops *ops) 7614 { 7615 if (dev->ethtool_ops == &default_ethtool_ops) 7616 dev->ethtool_ops = ops; 7617 } 7618 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 7619 7620 void netdev_freemem(struct net_device *dev) 7621 { 7622 char *addr = (char *)dev - dev->padded; 7623 7624 kvfree(addr); 7625 } 7626 7627 /** 7628 * alloc_netdev_mqs - allocate network device 7629 * @sizeof_priv: size of private data to allocate space for 7630 * @name: device name format string 7631 * @name_assign_type: origin of device name 7632 * @setup: callback to initialize device 7633 * @txqs: the number of TX subqueues to allocate 7634 * @rxqs: the number of RX subqueues to allocate 7635 * 7636 * Allocates a struct net_device with private data area for driver use 7637 * and performs basic initialization. Also allocates subqueue structs 7638 * for each queue on the device. 
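 *
 * Illustrative sketch only: a hypothetical Ethernet driver allocating a
 * device with four TX and four RX queues and a private area of struct
 * my_priv (placeholder names), then registering it:
 *
 *    dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *                           NET_NAME_UNKNOWN, ether_setup, 4, 4);
 *    if (!dev)
 *        return -ENOMEM;
 *    err = register_netdev(dev);
 *    if (err)
 *        free_netdev(dev);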
7639 */ 7640 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 7641 unsigned char name_assign_type, 7642 void (*setup)(struct net_device *), 7643 unsigned int txqs, unsigned int rxqs) 7644 { 7645 struct net_device *dev; 7646 size_t alloc_size; 7647 struct net_device *p; 7648 7649 BUG_ON(strlen(name) >= sizeof(dev->name)); 7650 7651 if (txqs < 1) { 7652 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 7653 return NULL; 7654 } 7655 7656 #ifdef CONFIG_SYSFS 7657 if (rxqs < 1) { 7658 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 7659 return NULL; 7660 } 7661 #endif 7662 7663 alloc_size = sizeof(struct net_device); 7664 if (sizeof_priv) { 7665 /* ensure 32-byte alignment of private area */ 7666 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 7667 alloc_size += sizeof_priv; 7668 } 7669 /* ensure 32-byte alignment of whole construct */ 7670 alloc_size += NETDEV_ALIGN - 1; 7671 7672 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7673 if (!p) 7674 p = vzalloc(alloc_size); 7675 if (!p) 7676 return NULL; 7677 7678 dev = PTR_ALIGN(p, NETDEV_ALIGN); 7679 dev->padded = (char *)dev - (char *)p; 7680 7681 dev->pcpu_refcnt = alloc_percpu(int); 7682 if (!dev->pcpu_refcnt) 7683 goto free_dev; 7684 7685 if (dev_addr_init(dev)) 7686 goto free_pcpu; 7687 7688 dev_mc_init(dev); 7689 dev_uc_init(dev); 7690 7691 dev_net_set(dev, &init_net); 7692 7693 dev->gso_max_size = GSO_MAX_SIZE; 7694 dev->gso_max_segs = GSO_MAX_SEGS; 7695 7696 INIT_LIST_HEAD(&dev->napi_list); 7697 INIT_LIST_HEAD(&dev->unreg_list); 7698 INIT_LIST_HEAD(&dev->close_list); 7699 INIT_LIST_HEAD(&dev->link_watch_list); 7700 INIT_LIST_HEAD(&dev->adj_list.upper); 7701 INIT_LIST_HEAD(&dev->adj_list.lower); 7702 INIT_LIST_HEAD(&dev->ptype_all); 7703 INIT_LIST_HEAD(&dev->ptype_specific); 7704 #ifdef CONFIG_NET_SCHED 7705 hash_init(dev->qdisc_hash); 7706 #endif 7707 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 7708 setup(dev); 7709 7710 if (!dev->tx_queue_len) { 7711 dev->priv_flags |= IFF_NO_QUEUE; 7712 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; 7713 } 7714 7715 dev->num_tx_queues = txqs; 7716 dev->real_num_tx_queues = txqs; 7717 if (netif_alloc_netdev_queues(dev)) 7718 goto free_all; 7719 7720 #ifdef CONFIG_SYSFS 7721 dev->num_rx_queues = rxqs; 7722 dev->real_num_rx_queues = rxqs; 7723 if (netif_alloc_rx_queues(dev)) 7724 goto free_all; 7725 #endif 7726 7727 strcpy(dev->name, name); 7728 dev->name_assign_type = name_assign_type; 7729 dev->group = INIT_NETDEV_GROUP; 7730 if (!dev->ethtool_ops) 7731 dev->ethtool_ops = &default_ethtool_ops; 7732 7733 nf_hook_ingress_init(dev); 7734 7735 return dev; 7736 7737 free_all: 7738 free_netdev(dev); 7739 return NULL; 7740 7741 free_pcpu: 7742 free_percpu(dev->pcpu_refcnt); 7743 free_dev: 7744 netdev_freemem(dev); 7745 return NULL; 7746 } 7747 EXPORT_SYMBOL(alloc_netdev_mqs); 7748 7749 /** 7750 * free_netdev - free network device 7751 * @dev: device 7752 * 7753 * This function does the last stage of destroying an allocated device 7754 * interface. The reference to the device object is released. 7755 * If this is the last reference then it will be freed. 7756 * Must be called in process context. 
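 *
 * Illustrative sketch only: a registered device must be unregistered
 * before it is freed, e.g. in a hypothetical driver's remove path:
 *
 *    unregister_netdev(dev);
 *    free_netdev(dev);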
7757 */ 7758 void free_netdev(struct net_device *dev) 7759 { 7760 struct napi_struct *p, *n; 7761 7762 might_sleep(); 7763 netif_free_tx_queues(dev); 7764 #ifdef CONFIG_SYSFS 7765 kvfree(dev->_rx); 7766 #endif 7767 7768 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 7769 7770 /* Flush device addresses */ 7771 dev_addr_flush(dev); 7772 7773 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 7774 netif_napi_del(p); 7775 7776 free_percpu(dev->pcpu_refcnt); 7777 dev->pcpu_refcnt = NULL; 7778 7779 /* Compatibility with error handling in drivers */ 7780 if (dev->reg_state == NETREG_UNINITIALIZED) { 7781 netdev_freemem(dev); 7782 return; 7783 } 7784 7785 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 7786 dev->reg_state = NETREG_RELEASED; 7787 7788 /* will free via device release */ 7789 put_device(&dev->dev); 7790 } 7791 EXPORT_SYMBOL(free_netdev); 7792 7793 /** 7794 * synchronize_net - Synchronize with packet receive processing 7795 * 7796 * Wait for packets currently being received to be done. 7797 * Does not block later packets from starting. 7798 */ 7799 void synchronize_net(void) 7800 { 7801 might_sleep(); 7802 if (rtnl_is_locked()) 7803 synchronize_rcu_expedited(); 7804 else 7805 synchronize_rcu(); 7806 } 7807 EXPORT_SYMBOL(synchronize_net); 7808 7809 /** 7810 * unregister_netdevice_queue - remove device from the kernel 7811 * @dev: device 7812 * @head: list 7813 * 7814 * This function shuts down a device interface and removes it 7815 * from the kernel tables. 7816 * If head not NULL, device is queued to be unregistered later. 7817 * 7818 * Callers must hold the rtnl semaphore. You may want 7819 * unregister_netdev() instead of this. 7820 */ 7821 7822 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 7823 { 7824 ASSERT_RTNL(); 7825 7826 if (head) { 7827 list_move_tail(&dev->unreg_list, head); 7828 } else { 7829 rollback_registered(dev); 7830 /* Finish processing unregister after unlock */ 7831 net_set_todo(dev); 7832 } 7833 } 7834 EXPORT_SYMBOL(unregister_netdevice_queue); 7835 7836 /** 7837 * unregister_netdevice_many - unregister many devices 7838 * @head: list of devices 7839 * 7840 * Note: As most callers use a stack allocated list_head, 7841 * we force a list_del() to make sure stack wont be corrupted later. 7842 */ 7843 void unregister_netdevice_many(struct list_head *head) 7844 { 7845 struct net_device *dev; 7846 7847 if (!list_empty(head)) { 7848 rollback_registered_many(head); 7849 list_for_each_entry(dev, head, unreg_list) 7850 net_set_todo(dev); 7851 list_del(head); 7852 } 7853 } 7854 EXPORT_SYMBOL(unregister_netdevice_many); 7855 7856 /** 7857 * unregister_netdev - remove device from the kernel 7858 * @dev: device 7859 * 7860 * This function shuts down a device interface and removes it 7861 * from the kernel tables. 7862 * 7863 * This is just a wrapper for unregister_netdevice that takes 7864 * the rtnl semaphore. In general you want to use this and not 7865 * unregister_netdevice. 7866 */ 7867 void unregister_netdev(struct net_device *dev) 7868 { 7869 rtnl_lock(); 7870 unregister_netdevice(dev); 7871 rtnl_unlock(); 7872 } 7873 EXPORT_SYMBOL(unregister_netdev); 7874 7875 /** 7876 * dev_change_net_namespace - move device to different nethost namespace 7877 * @dev: device 7878 * @net: network namespace 7879 * @pat: If not NULL name pattern to try if the current device name 7880 * is already taken in the destination network namespace. 
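 *
 * Illustrative sketch only: a hypothetical caller moving a device into
 * another namespace, falling back to a "moved%d" pattern if the current
 * name is taken there (newns is a placeholder name):
 *
 *    rtnl_lock();
 *    err = dev_change_net_namespace(dev, newns, "moved%d");
 *    rtnl_unlock();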
7881 *
7882 * This function shuts down a device interface and moves it
7883 * to a new network namespace. On success 0 is returned, on
7884 * a failure a negative errno code is returned.
7885 *
7886 * Callers must hold the rtnl semaphore.
7887 */
7888
7889 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7890 {
7891 int err;
7892
7893 ASSERT_RTNL();
7894
7895 /* Don't allow namespace local devices to be moved. */
7896 err = -EINVAL;
7897 if (dev->features & NETIF_F_NETNS_LOCAL)
7898 goto out;
7899
7900 /* Ensure the device has been registered */
7901 if (dev->reg_state != NETREG_REGISTERED)
7902 goto out;
7903
7904 /* Get out if there is nothing to do */
7905 err = 0;
7906 if (net_eq(dev_net(dev), net))
7907 goto out;
7908
7909 /* Pick the destination device name, and ensure
7910 * we can use it in the destination network namespace.
7911 */
7912 err = -EEXIST;
7913 if (__dev_get_by_name(net, dev->name)) {
7914 /* We get here if we can't use the current device name */
7915 if (!pat)
7916 goto out;
7917 if (dev_get_valid_name(net, dev, pat) < 0)
7918 goto out;
7919 }
7920
7921 /*
7922 * And now a mini version of register_netdevice/unregister_netdevice.
7923 */
7924
7925 /* If device is running close it first. */
7926 dev_close(dev);
7927
7928 /* And unlink it from device chain */
7929 err = -ENODEV;
7930 unlist_netdevice(dev);
7931
7932 synchronize_net();
7933
7934 /* Shutdown queueing discipline. */
7935 dev_shutdown(dev);
7936
7937 /* Notify protocols, that we are about to destroy
7938 this device. They should clean all the things.
7939
7940 Note that dev->reg_state stays at NETREG_REGISTERED.
7941 This is wanted because this way 8021q and macvlan know
7942 the device is just moving and can keep their slaves up.
7943 */
7944 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7945 rcu_barrier();
7946 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7947 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7948
7949 /*
7950 * Flush the unicast and multicast chains
7951 */
7952 dev_uc_flush(dev);
7953 dev_mc_flush(dev);
7954
7955 /* Send a netdev-removed uevent to the old namespace */
7956 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7957 netdev_adjacent_del_links(dev);
7958
7959 /* Actually switch the network namespace */
7960 dev_net_set(dev, net);
7961
7962 /* If there is an ifindex conflict assign a new one */
7963 if (__dev_get_by_index(net, dev->ifindex))
7964 dev->ifindex = dev_new_index(net);
7965
7966 /* Send a netdev-add uevent to the new namespace */
7967 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7968 netdev_adjacent_add_links(dev);
7969
7970 /* Fixup kobjects */
7971 err = device_rename(&dev->dev, dev->name);
7972 WARN_ON(err);
7973
7974 /* Add the device back in the hashes */
7975 list_netdevice(dev);
7976
7977 /* Notify protocols, that a new device appeared. */
7978 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7979
7980 /*
7981 * Prevent userspace races by waiting until the network
7982 * device is fully setup before sending notifications.
7983 */ 7984 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7985 7986 synchronize_net(); 7987 err = 0; 7988 out: 7989 return err; 7990 } 7991 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 7992 7993 static int dev_cpu_dead(unsigned int oldcpu) 7994 { 7995 struct sk_buff **list_skb; 7996 struct sk_buff *skb; 7997 unsigned int cpu; 7998 struct softnet_data *sd, *oldsd; 7999 8000 local_irq_disable(); 8001 cpu = smp_processor_id(); 8002 sd = &per_cpu(softnet_data, cpu); 8003 oldsd = &per_cpu(softnet_data, oldcpu); 8004 8005 /* Find end of our completion_queue. */ 8006 list_skb = &sd->completion_queue; 8007 while (*list_skb) 8008 list_skb = &(*list_skb)->next; 8009 /* Append completion queue from offline CPU. */ 8010 *list_skb = oldsd->completion_queue; 8011 oldsd->completion_queue = NULL; 8012 8013 /* Append output queue from offline CPU. */ 8014 if (oldsd->output_queue) { 8015 *sd->output_queue_tailp = oldsd->output_queue; 8016 sd->output_queue_tailp = oldsd->output_queue_tailp; 8017 oldsd->output_queue = NULL; 8018 oldsd->output_queue_tailp = &oldsd->output_queue; 8019 } 8020 /* Append NAPI poll list from offline CPU, with one exception : 8021 * process_backlog() must be called by cpu owning percpu backlog. 8022 * We properly handle process_queue & input_pkt_queue later. 8023 */ 8024 while (!list_empty(&oldsd->poll_list)) { 8025 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 8026 struct napi_struct, 8027 poll_list); 8028 8029 list_del_init(&napi->poll_list); 8030 if (napi->poll == process_backlog) 8031 napi->state = 0; 8032 else 8033 ____napi_schedule(sd, napi); 8034 } 8035 8036 raise_softirq_irqoff(NET_TX_SOFTIRQ); 8037 local_irq_enable(); 8038 8039 /* Process offline CPU's input_pkt_queue */ 8040 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 8041 netif_rx_ni(skb); 8042 input_queue_head_incr(oldsd); 8043 } 8044 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 8045 netif_rx_ni(skb); 8046 input_queue_head_incr(oldsd); 8047 } 8048 8049 return 0; 8050 } 8051 8052 /** 8053 * netdev_increment_features - increment feature set by one 8054 * @all: current feature set 8055 * @one: new feature set 8056 * @mask: mask feature set 8057 * 8058 * Computes a new feature set after adding a device with feature set 8059 * @one to the master device with current feature set @all. Will not 8060 * enable anything that is off in @mask. Returns the new feature set. 8061 */ 8062 netdev_features_t netdev_increment_features(netdev_features_t all, 8063 netdev_features_t one, netdev_features_t mask) 8064 { 8065 if (mask & NETIF_F_HW_CSUM) 8066 mask |= NETIF_F_CSUM_MASK; 8067 mask |= NETIF_F_VLAN_CHALLENGED; 8068 8069 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; 8070 all &= one | ~NETIF_F_ALL_FOR_ALL; 8071 8072 /* If one device supports hw checksumming, set for all. 
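 * (For example, if a bonding-style master folds in one slave advertising
 * NETIF_F_HW_CSUM and another advertising only NETIF_F_IP_CSUM, the
 * accumulated set contains NETIF_F_HW_CSUM, so the narrower checksum
 * bits are stripped here and only NETIF_F_HW_CSUM is kept.)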
static struct hlist_head * __net_init netdev_create_hash(void)
{
        int i;
        struct hlist_head *hash;

        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
        if (hash != NULL)
                for (i = 0; i < NETDEV_HASHENTRIES; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
        if (net != &init_net)
                INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
                goto err_name;

        net->dev_index_head = netdev_create_hash();
        if (net->dev_index_head == NULL)
                goto err_idx;

        return 0;

err_idx:
        kfree(net->dev_name_head);
err_name:
        return -ENOMEM;
}

/**
 *      netdev_drivername - network driver for the device
 *      @dev: network device
 *
 *      Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device_driver *driver;
        const struct device *parent;
        const char *empty = "";

        parent = dev->dev.parent;
        if (!parent)
                return empty;

        driver = parent->driver;
        if (driver && driver->name)
                return driver->name;
        return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
                            struct va_format *vaf)
{
        if (dev && dev->dev.parent) {
                dev_printk_emit(level[1] - '0',
                                dev->dev.parent,
                                "%s %s %s%s: %pV",
                                dev_driver_string(dev->dev.parent),
                                dev_name(dev->dev.parent),
                                netdev_name(dev), netdev_reg_state(dev),
                                vaf);
        } else if (dev) {
                printk("%s%s%s: %pV",
                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
        } else {
                printk("%s(NULL net_device): %pV", level, vaf);
        }
}

void netdev_printk(const char *level, const struct net_device *dev,
                   const char *format, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        __netdev_printk(level, dev, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)                        \
void func(const struct net_device *dev, const char *fmt, ...)          \
{                                                                      \
        struct va_format vaf;                                          \
        va_list args;                                                  \
                                                                       \
        va_start(args, fmt);                                           \
                                                                       \
        vaf.fmt = fmt;                                                 \
        vaf.va = &args;                                                \
                                                                       \
        __netdev_printk(level, dev, &vaf);                             \
                                                                       \
        va_end(args);                                                  \
}                                                                      \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
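
/*
 * Usage sketch for the level helpers generated above: drivers call
 * netdev_err(), netdev_info() and friends instead of raw printk() so
 * that every message is prefixed with the driver, bus and interface
 * name. The surrounding error path and the variables (i, err, speed)
 * are hypothetical.
 *
 *      if (err) {
 *              netdev_err(dev, "failed to enable queue %d: %d\n", i, err);
 *              return err;
 *      }
 *      netdev_info(dev, "link up, %u Mbps\n", speed);
 */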
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
        rtnl_lock();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];

                /* Ignore unmoveable devices (i.e. loopback) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Leave virtual devices for the generic cleanup */
                if (dev->rtnl_link_ops)
                        continue;

                /* Push remaining network devices to init_net */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
                        pr_emerg("%s: failed to move %s to init_net: %d\n",
                                 __func__, dev->name, err);
                        BUG();
                }
        }
        rtnl_unlock();
}
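
/*
 * Sketch of how a driver opts out of the migration above: devices that
 * must never leave their namespace set NETIF_F_NETNS_LOCAL at setup
 * time (the loopback driver does this), which makes both
 * dev_change_net_namespace() and default_device_exit() skip them. The
 * function name example_setup() is hypothetical.
 *
 *      static void example_setup(struct net_device *dev)
 *      {
 *              dev->features |= NETIF_F_NETNS_LOCAL;
 *      }
 */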
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
        /* Return with the rtnl_lock held when there are no network
         * devices unregistering in any network namespace in net_list.
         */
        struct net *net;
        bool unregistering;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
                unregistering = false;
                rtnl_lock();
                list_for_each_entry(net, net_list, exit_list) {
                        if (net->dev_unreg_count > 0) {
                                unregistering = true;
                                break;
                        }
                }
                if (!unregistering)
                        break;
                __rtnl_unlock();

                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        /* At exit all network devices must be removed from a network
         * namespace. Do this in the reverse order of registration.
         * Do this across as many network namespaces as possible to
         * improve batching efficiency.
         */
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(dev_kill_list);

        /* To prevent network device cleanup code from dereferencing
         * loopback devices or network devices that have been freed,
         * wait here for all pending unregistrations to complete
         * before unregistering the loopback device and allowing the
         * network namespace to be freed.
         *
         * The netdev todo list containing all network device
         * unregistrations that happen in default_device_exit_batch
         * will run in the rtnl_unlock() at the end of
         * default_device_exit_batch.
         */
        rtnl_lock_unregistering(net_list);
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
                        else
                                unregister_netdevice_queue(dev, &dev_kill_list);
                }
        }
        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};

/*
 *      Initialize the DEV module. At boot time this walks the device list and
 *      unhooks any devices that fail to initialise (normally hardware not
 *      present) and leaves us with a valid list of present and active devices.
 */

/*
 *      This is called single threaded during boot, so no need
 *      to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        INIT_LIST_HEAD(&offload_base);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *      Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                INIT_WORK(flush, flush_backlog);

                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->cpu = i;
#endif

                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
        }

        dev_boot_phase = 0;

        /* The loopback device is special. If any other network device
         * is present in a network namespace, the loopback device must
         * be present too. Since we now dynamically allocate and free
         * the loopback device, maintain this invariant by keeping the
         * loopback device as the first device on the list of network
         * devices: the first device that appears and the last network
         * device that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
                                       NULL, dev_cpu_dead);
        WARN_ON(rc < 0);
        dst_subsys_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);
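
/*
 * Sketch of the pernet pattern used by netdev_net_ops and
 * default_device_ops above: a subsystem supplies per-namespace
 * init/exit callbacks and registers them once at boot or module load.
 * The names example_net_init(), example_net_exit() and example_net_ops
 * are hypothetical placeholders.
 *
 *      static int __net_init example_net_init(struct net *net)
 *      {
 *              // allocate per-namespace state here
 *              return 0;
 *      }
 *
 *      static void __net_exit example_net_exit(struct net *net)
 *      {
 *              // free per-namespace state here
 *      }
 *
 *      static struct pernet_operations example_net_ops = {
 *              .init = example_net_init,
 *              .exit = example_net_exit,
 *      };
 *
 *      // from the subsystem's init path:
 *      // err = register_pernet_subsys(&example_net_ops);
 */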