net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);
 148 static DEFINE_SPINLOCK(offload_lock);
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150 struct list_head ptype_all __read_mostly;       /* Taps */
 151 static struct list_head offload_base __read_mostly;
 152
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);
 185
 186 static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
 537 /* Boot time configuration table */
 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      __dev_get_by_name       - find a device by its name
 664  *      @net: the applicable net namespace
 665  *      @name: name to find
 666  *
 667  *      Find an interface by name. Must be called under RTNL semaphore
 668  *      or @dev_base_lock. If the name is found a pointer to the device
 669  *      is returned. If the name is not found then %NULL is returned. The
 670  *      reference counters are not incremented so the caller must be
 671  *      careful with locks.
 672  */
 673
 674 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 675 {
 676         struct net_device *dev;
 677         struct hlist_head *head = dev_name_hash(net, name);
 678
 679         hlist_for_each_entry(dev, head, name_hlist)
 680                 if (!strncmp(dev->name, name, IFNAMSIZ))
 681                         return dev;
 682
 683         return NULL;
 684 }
 685 EXPORT_SYMBOL(__dev_get_by_name);
 686
 687 /**
 688  *      dev_get_by_name_rcu     - find a device by its name
 689  *      @net: the applicable net namespace
 690  *      @name: name to find
 691  *
 692  *      Find an interface by name.
 693  *      If the name is found a pointer to the device is returned.
 694  *      If the name is not found then %NULL is returned.
 695  *      The reference counters are not incremented so the caller must be
 696  *      careful with locks. The caller must hold RCU lock.
 697  */
 698
 699 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 700 {
 701         struct net_device *dev;
 702         struct hlist_head *head = dev_name_hash(net, name);
 703
 704         hlist_for_each_entry_rcu(dev, head, name_hlist)
 705                 if (!strncmp(dev->name, name, IFNAMSIZ))
 706                         return dev;
 707
 708         return NULL;
 709 }
 710 EXPORT_SYMBOL(dev_get_by_name_rcu);
 711
 712 /**
 713  *      dev_get_by_name         - find a device by its name
 714  *      @net: the applicable net namespace
 715  *      @name: name to find
 716  *
 717  *      Find an interface by name. This can be called from any
 718  *      context and does its own locking. The returned handle has
 719  *      the usage count incremented and the caller must use dev_put() to
 720  *      release it when it is no longer needed. %NULL is returned if no
 721  *      matching device is found.
 722  */
 723
 724 struct net_device *dev_get_by_name(struct net *net, const char *name)
 725 {
 726         struct net_device *dev;
 727
 728         rcu_read_lock();
 729         dev = dev_get_by_name_rcu(net, name);
 730         if (dev)
 731                 dev_hold(dev);
 732         rcu_read_unlock();
 733         return dev;
 734 }
 735 EXPORT_SYMBOL(dev_get_by_name);
 736
 737 /**
 738  *      __dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns %NULL if the device
 743  *      is not found or a pointer to the device. The device has not
 744  *      had its reference counter increased so the caller must be careful
 745  *      about locking. The caller must hold either the RTNL semaphore
 746  *      or @dev_base_lock.
 747  */
 748
 749 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 750 {
 751         struct net_device *dev;
 752         struct hlist_head *head = dev_index_hash(net, ifindex);
 753
 754         hlist_for_each_entry(dev, head, index_hlist)
 755                 if (dev->ifindex == ifindex)
 756                         return dev;
 757
 758         return NULL;
 759 }
 760 EXPORT_SYMBOL(__dev_get_by_index);
 761
 762 /**
 763  *      dev_get_by_index_rcu - find a device by its ifindex
 764  *      @net: the applicable net namespace
 765  *      @ifindex: index of device
 766  *
 767  *      Search for an interface by index. Returns %NULL if the device
 768  *      is not found or a pointer to the device. The device has not
 769  *      had its reference counter increased so the caller must be careful
 770  *      about locking. The caller must hold RCU lock.
 771  */
 772
 773 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 774 {
 775         struct net_device *dev;
 776         struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778         hlist_for_each_entry_rcu(dev, head, index_hlist)
 779                 if (dev->ifindex == ifindex)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(dev_get_by_index_rcu);
 785
 786
 787 /**
 788  *      dev_get_by_index - find a device by its ifindex
 789  *      @net: the applicable net namespace
 790  *      @ifindex: index of device
 791  *
 792  *      Search for an interface by index. Returns NULL if the device
 793  *      is not found or a pointer to the device. The device returned has
 794  *      had a reference added and the pointer is safe until the user calls
 795  *      dev_put to indicate they have finished with it.
 796  */
 797
 798 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 799 {
 800         struct net_device *dev;
 801
 802         rcu_read_lock();
 803         dev = dev_get_by_index_rcu(net, ifindex);
 804         if (dev)
 805                 dev_hold(dev);
 806         rcu_read_unlock();
 807         return dev;
 808 }
 809 EXPORT_SYMBOL(dev_get_by_index);
 810
 811 /**
 812  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 813  *      @net: network namespace
 814  *      @name: a pointer to the buffer where the name will be stored.
 815  *      @ifindex: the ifindex of the interface to get the name from.
 816  *
 817  *      The use of raw_seqcount_begin() and cond_resched() before
 818  *      retrying is required as we want to give the writers a chance
 819  *      to complete when CONFIG_PREEMPT is not set.
 820  */
 821 int netdev_get_name(struct net *net, char *name, int ifindex)
 822 {
 823         struct net_device *dev;
 824         unsigned int seq;
 825
 826 retry:
 827         seq = raw_seqcount_begin(&devnet_rename_seq);
 828         rcu_read_lock();
 829         dev = dev_get_by_index_rcu(net, ifindex);
 830         if (!dev) {
 831                 rcu_read_unlock();
 832                 return -ENODEV;
 833         }
 834
 835         strcpy(name, dev->name);
 836         rcu_read_unlock();
 837         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 838                 cond_resched();
 839                 goto retry;
 840         }
 841
 842         return 0;
 843 }
 844
 845 /**
 846  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 847  *      @net: the applicable net namespace
 848  *      @type: media type of device
 849  *      @ha: hardware address
 850  *
 851  *      Search for an interface by MAC address. Returns NULL if the device
 852  *      is not found or a pointer to the device.
 853  *      The caller must hold RCU or RTNL.
 854  *      The returned device has not had its ref count increased
 855  *      and the caller must therefore be careful about locking
 856  *
 857  */
 858
 859 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 860                                        const char *ha)
 861 {
 862         struct net_device *dev;
 863
 864         for_each_netdev_rcu(net, dev)
 865                 if (dev->type == type &&
 866                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 867                         return dev;
 868
 869         return NULL;
 870 }
 871 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 872
 873 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 874 {
 875         struct net_device *dev;
 876
 877         ASSERT_RTNL();
 878         for_each_netdev(net, dev)
 879                 if (dev->type == type)
 880                         return dev;
 881
 882         return NULL;
 883 }
 884 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 885
 886 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 887 {
 888         struct net_device *dev, *ret = NULL;
 889
 890         rcu_read_lock();
 891         for_each_netdev_rcu(net, dev)
 892                 if (dev->type == type) {
 893                         dev_hold(dev);
 894                         ret = dev;
 895                         break;
 896                 }
 897         rcu_read_unlock();
 898         return ret;
 899 }
 900 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 901
 902 /**
 903  *      __dev_get_by_flags - find any device with given flags
 904  *      @net: the applicable net namespace
 905  *      @if_flags: IFF_* values
 906  *      @mask: bitmask of bits in if_flags to check
 907  *
 908  *      Search for any interface with the given flags. Returns NULL if a device
 909  *      is not found or a pointer to the device. Must be called inside
 910  *      rtnl_lock(), and result refcount is unchanged.
 911  */
 912
 913 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 914                                       unsigned short mask)
 915 {
 916         struct net_device *dev, *ret;
 917
 918         ASSERT_RTNL();
 919
 920         ret = NULL;
 921         for_each_netdev(net, dev) {
 922                 if (((dev->flags ^ if_flags) & mask) == 0) {
 923                         ret = dev;
 924                         break;
 925                 }
 926         }
 927         return ret;
 928 }
 929 EXPORT_SYMBOL(__dev_get_by_flags);
 930
 931 /**
 932  *      dev_valid_name - check if name is okay for network device
 933  *      @name: name string
 934  *
 935  *      Network device names need to be valid file names to
 936  *      to allow sysfs to work.  We also disallow any kind of
 937  *      whitespace.
 938  */
 939 bool dev_valid_name(const char *name)
 940 {
 941         if (*name == '\0')
 942                 return false;
 943         if (strlen(name) >= IFNAMSIZ)
 944                 return false;
 945         if (!strcmp(name, ".") || !strcmp(name, ".."))
 946                 return false;
 947
 948         while (*name) {
 949                 if (*name == '/' || isspace(*name))
 950                         return false;
 951                 name++;
 952         }
 953         return true;
 954 }
 955 EXPORT_SYMBOL(dev_valid_name);
 956
 957 /**
 958  *      __dev_alloc_name - allocate a name for a device
 959  *      @net: network namespace to allocate the device name in
 960  *      @name: name format string
 961  *      @buf:  scratch buffer and result name string
 962  *
 963  *      Passed a format string - eg "lt%d" it will try and find a suitable
 964  *      id. It scans list of devices to build up a free map, then chooses
 965  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 966  *      while allocating the name and adding the device in order to avoid
 967  *      duplicates.
 968  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 969  *      Returns the number of the unit assigned or a negative errno code.
 970  */
 971
 972 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 973 {
 974         int i = 0;
 975         const char *p;
 976         const int max_netdevices = 8*PAGE_SIZE;
 977         unsigned long *inuse;
 978         struct net_device *d;
 979
 980         p = strnchr(name, IFNAMSIZ-1, '%');
 981         if (p) {
 982                 /*
 983                  * Verify the string as this thing may have come from
 984                  * the user.  There must be either one "%d" and no other "%"
 985                  * characters.
 986                  */
 987                 if (p[1] != 'd' || strchr(p + 2, '%'))
 988                         return -EINVAL;
 989
 990                 /* Use one page as a bit array of possible slots */
 991                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 992                 if (!inuse)
 993                         return -ENOMEM;
 994
 995                 for_each_netdev(net, d) {
 996                         if (!sscanf(d->name, name, &i))
 997                                 continue;
 998                         if (i < 0 || i >= max_netdevices)
 999                                 continue;
1000
1001                         /*  avoid cases where sscanf is not exact inverse of printf */
1002                         snprintf(buf, IFNAMSIZ, name, i);
1003                         if (!strncmp(buf, d->name, IFNAMSIZ))
1004                                 set_bit(i, inuse);
1005                 }
1006
1007                 i = find_first_zero_bit(inuse, max_netdevices);
1008                 free_page((unsigned long) inuse);
1009         }
1010
1011         if (buf != name)
1012                 snprintf(buf, IFNAMSIZ, name, i);
1013         if (!__dev_get_by_name(net, buf))
1014                 return i;
1015
1016         /* It is possible to run out of possible slots
1017          * when the name is long and there isn't enough space left
1018          * for the digits, or if all bits are used.
1019          */
1020         return -ENFILE;
1021 }
1022
1023 /**
1024  *      dev_alloc_name - allocate a name for a device
1025  *      @dev: device
1026  *      @name: name format string
1027  *
1028  *      Passed a format string - eg "lt%d" it will try and find a suitable
1029  *      id. It scans list of devices to build up a free map, then chooses
1030  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1031  *      while allocating the name and adding the device in order to avoid
1032  *      duplicates.
1033  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1034  *      Returns the number of the unit assigned or a negative errno code.
1035  */
1036
1037 int dev_alloc_name(struct net_device *dev, const char *name)
1038 {
1039         char buf[IFNAMSIZ];
1040         struct net *net;
1041         int ret;
1042
1043         BUG_ON(!dev_net(dev));
1044         net = dev_net(dev);
1045         ret = __dev_alloc_name(net, name, buf);
1046         if (ret >= 0)
1047                 strlcpy(dev->name, buf, IFNAMSIZ);
1048         return ret;
1049 }
1050 EXPORT_SYMBOL(dev_alloc_name);
1051
1052 static int dev_alloc_name_ns(struct net *net,
1053                              struct net_device *dev,
1054                              const char *name)
1055 {
1056         char buf[IFNAMSIZ];
1057         int ret;
1058
1059         ret = __dev_alloc_name(net, name, buf);
1060         if (ret >= 0)
1061                 strlcpy(dev->name, buf, IFNAMSIZ);
1062         return ret;
1063 }
1064
1065 static int dev_get_valid_name(struct net *net,
1066                               struct net_device *dev,
1067                               const char *name)
1068 {
1069         BUG_ON(!net);
1070
1071         if (!dev_valid_name(name))
1072                 return -EINVAL;
1073
1074         if (strchr(name, '%'))
1075                 return dev_alloc_name_ns(net, dev, name);
1076         else if (__dev_get_by_name(net, name))
1077                 return -EEXIST;
1078         else if (dev->name != name)
1079                 strlcpy(dev->name, name, IFNAMSIZ);
1080
1081         return 0;
1082 }
1083
1084 /**
1085  *      dev_change_name - change name of a device
1086  *      @dev: device
1087  *      @newname: name (or format string) must be at least IFNAMSIZ
1088  *
1089  *      Change name of a device, can pass format strings "eth%d".
1090  *      for wildcarding.
1091  */
1092 int dev_change_name(struct net_device *dev, const char *newname)
1093 {
1094         unsigned char old_assign_type;
1095         char oldname[IFNAMSIZ];
1096         int err = 0;
1097         int ret;
1098         struct net *net;
1099
1100         ASSERT_RTNL();
1101         BUG_ON(!dev_net(dev));
1102
1103         net = dev_net(dev);
1104         if (dev->flags & IFF_UP)
1105                 return -EBUSY;
1106
1107         write_seqcount_begin(&devnet_rename_seq);
1108
1109         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1110                 write_seqcount_end(&devnet_rename_seq);
1111                 return 0;
1112         }
1113
1114         memcpy(oldname, dev->name, IFNAMSIZ);
1115
1116         err = dev_get_valid_name(net, dev, newname);
1117         if (err < 0) {
1118                 write_seqcount_end(&devnet_rename_seq);
1119                 return err;
1120         }
1121
1122         if (oldname[0] && !strchr(oldname, '%'))
1123                 netdev_info(dev, "renamed from %s\n", oldname);
1124
1125         old_assign_type = dev->name_assign_type;
1126         dev->name_assign_type = NET_NAME_RENAMED;
1127
1128 rollback:
1129         ret = device_rename(&dev->dev, dev->name);
1130         if (ret) {
1131                 memcpy(dev->name, oldname, IFNAMSIZ);
1132                 dev->name_assign_type = old_assign_type;
1133                 write_seqcount_end(&devnet_rename_seq);
1134                 return ret;
1135         }
1136
1137         write_seqcount_end(&devnet_rename_seq);
1138
1139         netdev_adjacent_rename_links(dev, oldname);
1140
1141         write_lock_bh(&dev_base_lock);
1142         hlist_del_rcu(&dev->name_hlist);
1143         write_unlock_bh(&dev_base_lock);
1144
1145         synchronize_rcu();
1146
1147         write_lock_bh(&dev_base_lock);
1148         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1149         write_unlock_bh(&dev_base_lock);
1150
1151         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1152         ret = notifier_to_errno(ret);
1153
1154         if (ret) {
1155                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1156                 if (err >= 0) {
1157                         err = ret;
1158                         write_seqcount_begin(&devnet_rename_seq);
1159                         memcpy(dev->name, oldname, IFNAMSIZ);
1160                         memcpy(oldname, newname, IFNAMSIZ);
1161                         dev->name_assign_type = old_assign_type;
1162                         old_assign_type = NET_NAME_RENAMED;
1163                         goto rollback;
1164                 } else {
1165                         pr_err("%s: name change rollback failed: %d\n",
1166                                dev->name, ret);
1167                 }
1168         }
1169
1170         return err;
1171 }
1172
1173 /**
1174  *      dev_set_alias - change ifalias of a device
1175  *      @dev: device
1176  *      @alias: name up to IFALIASZ
1177  *      @len: limit of bytes to copy from info
1178  *
1179  *      Set ifalias for a device,
1180  */
1181 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1182 {
1183         char *new_ifalias;
1184
1185         ASSERT_RTNL();
1186
1187         if (len >= IFALIASZ)
1188                 return -EINVAL;
1189
1190         if (!len) {
1191                 kfree(dev->ifalias);
1192                 dev->ifalias = NULL;
1193                 return 0;
1194         }
1195
1196         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1197         if (!new_ifalias)
1198                 return -ENOMEM;
1199         dev->ifalias = new_ifalias;
1200
1201         strlcpy(dev->ifalias, alias, len+1);
1202         return len;
1203 }
1204
1205
1206 /**
1207  *      netdev_features_change - device changes features
1208  *      @dev: device to cause notification
1209  *
1210  *      Called to indicate a device has changed features.
1211  */
1212 void netdev_features_change(struct net_device *dev)
1213 {
1214         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1215 }
1216 EXPORT_SYMBOL(netdev_features_change);
1217
1218 /**
1219  *      netdev_state_change - device changes state
1220  *      @dev: device to cause notification
1221  *
1222  *      Called to indicate a device has changed state. This function calls
1223  *      the notifier chains for netdev_chain and sends a NEWLINK message
1224  *      to the routing socket.
1225  */
1226 void netdev_state_change(struct net_device *dev)
1227 {
1228         if (dev->flags & IFF_UP) {
1229                 struct netdev_notifier_change_info change_info;
1230
1231                 change_info.flags_changed = 0;
1232                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1233                                               &change_info.info);
1234                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1235         }
1236 }
1237 EXPORT_SYMBOL(netdev_state_change);
1238
1239 /**
1240  *      netdev_notify_peers - notify network peers about existence of @dev
1241  *      @dev: network device
1242  *
1243  * Generate traffic such that interested network peers are aware of
1244  * @dev, such as by generating a gratuitous ARP. This may be used when
1245  * a device wants to inform the rest of the network about some sort of
1246  * reconfiguration such as a failover event or virtual machine
1247  * migration.
1248  */
1249 void netdev_notify_peers(struct net_device *dev)
1250 {
1251         rtnl_lock();
1252         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1253         rtnl_unlock();
1254 }
1255 EXPORT_SYMBOL(netdev_notify_peers);
1256
1257 static int __dev_open(struct net_device *dev)
1258 {
1259         const struct net_device_ops *ops = dev->netdev_ops;
1260         int ret;
1261
1262         ASSERT_RTNL();
1263
1264         if (!netif_device_present(dev))
1265                 return -ENODEV;
1266
1267         /* Block netpoll from trying to do any rx path servicing.
1268          * If we don't do this there is a chance ndo_poll_controller
1269          * or ndo_poll may be running while we open the device
1270          */
1271         netpoll_poll_disable(dev);
1272
1273         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1274         ret = notifier_to_errno(ret);
1275         if (ret)
1276                 return ret;
1277
1278         set_bit(__LINK_STATE_START, &dev->state);
1279
1280         if (ops->ndo_validate_addr)
1281                 ret = ops->ndo_validate_addr(dev);
1282
1283         if (!ret && ops->ndo_open)
1284                 ret = ops->ndo_open(dev);
1285
1286         netpoll_poll_enable(dev);
1287
1288         if (ret)
1289                 clear_bit(__LINK_STATE_START, &dev->state);
1290         else {
1291                 dev->flags |= IFF_UP;
1292                 dev_set_rx_mode(dev);
1293                 dev_activate(dev);
1294                 add_device_randomness(dev->dev_addr, dev->addr_len);
1295         }
1296
1297         return ret;
1298 }
1299
1300 /**
1301  *      dev_open        - prepare an interface for use.
1302  *      @dev:   device to open
1303  *
1304  *      Takes a device from down to up state. The device's private open
1305  *      function is invoked and then the multicast lists are loaded. Finally
1306  *      the device is moved into the up state and a %NETDEV_UP message is
1307  *      sent to the netdev notifier chain.
1308  *
1309  *      Calling this function on an active interface is a nop. On a failure
1310  *      a negative errno code is returned.
1311  */
1312 int dev_open(struct net_device *dev)
1313 {
1314         int ret;
1315
1316         if (dev->flags & IFF_UP)
1317                 return 0;
1318
1319         ret = __dev_open(dev);
1320         if (ret < 0)
1321                 return ret;
1322
1323         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1324         call_netdevice_notifiers(NETDEV_UP, dev);
1325
1326         return ret;
1327 }
1328 EXPORT_SYMBOL(dev_open);
1329
1330 static int __dev_close_many(struct list_head *head)
1331 {
1332         struct net_device *dev;
1333
1334         ASSERT_RTNL();
1335         might_sleep();
1336
1337         list_for_each_entry(dev, head, close_list) {
1338                 /* Temporarily disable netpoll until the interface is down */
1339                 netpoll_poll_disable(dev);
1340
1341                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1342
1343                 clear_bit(__LINK_STATE_START, &dev->state);
1344
1345                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1346                  * can be even on different cpu. So just clear netif_running().
1347                  *
1348                  * dev->stop() will invoke napi_disable() on all of it's
1349                  * napi_struct instances on this device.
1350                  */
1351                 smp_mb__after_atomic(); /* Commit netif_running(). */
1352         }
1353
1354         dev_deactivate_many(head);
1355
1356         list_for_each_entry(dev, head, close_list) {
1357                 const struct net_device_ops *ops = dev->netdev_ops;
1358
1359                 /*
1360                  *      Call the device specific close. This cannot fail.
1361                  *      Only if device is UP
1362                  *
1363                  *      We allow it to be called even after a DETACH hot-plug
1364                  *      event.
1365                  */
1366                 if (ops->ndo_stop)
1367                         ops->ndo_stop(dev);
1368
1369                 dev->flags &= ~IFF_UP;
1370                 netpoll_poll_enable(dev);
1371         }
1372
1373         return 0;
1374 }
1375
1376 static int __dev_close(struct net_device *dev)
1377 {
1378         int retval;
1379         LIST_HEAD(single);
1380
1381         list_add(&dev->close_list, &single);
1382         retval = __dev_close_many(&single);
1383         list_del(&single);
1384
1385         return retval;
1386 }
1387
1388 static int dev_close_many(struct list_head *head)
1389 {
1390         struct net_device *dev, *tmp;
1391
1392         /* Remove the devices that don't need to be closed */
1393         list_for_each_entry_safe(dev, tmp, head, close_list)
1394                 if (!(dev->flags & IFF_UP))
1395                         list_del_init(&dev->close_list);
1396
1397         __dev_close_many(head);
1398
1399         list_for_each_entry_safe(dev, tmp, head, close_list) {
1400                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1401                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1402                 list_del_init(&dev->close_list);
1403         }
1404
1405         return 0;
1406 }
1407
1408 /**
1409  *      dev_close - shutdown an interface.
1410  *      @dev: device to shutdown
1411  *
1412  *      This function moves an active device into down state. A
1413  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1414  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1415  *      chain.
1416  */
1417 int dev_close(struct net_device *dev)
1418 {
1419         if (dev->flags & IFF_UP) {
1420                 LIST_HEAD(single);
1421
1422                 list_add(&dev->close_list, &single);
1423                 dev_close_many(&single);
1424                 list_del(&single);
1425         }
1426         return 0;
1427 }
1428 EXPORT_SYMBOL(dev_close);
1429
1430
1431 /**
1432  *      dev_disable_lro - disable Large Receive Offload on a device
1433  *      @dev: device
1434  *
1435  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1436  *      called under RTNL.  This is needed if received packets may be
1437  *      forwarded to another interface.
1438  */
1439 void dev_disable_lro(struct net_device *dev)
1440 {
1441         struct net_device *lower_dev;
1442         struct list_head *iter;
1443
1444         dev->wanted_features &= ~NETIF_F_LRO;
1445         netdev_update_features(dev);
1446
1447         if (unlikely(dev->features & NETIF_F_LRO))
1448                 netdev_WARN(dev, "failed to disable LRO!\n");
1449
1450         netdev_for_each_lower_dev(dev, lower_dev, iter)
1451                 dev_disable_lro(lower_dev);
1452 }
1453 EXPORT_SYMBOL(dev_disable_lro);
1454
1455 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1456                                    struct net_device *dev)
1457 {
1458         struct netdev_notifier_info info;
1459
1460         netdev_notifier_info_init(&info, dev);
1461         return nb->notifier_call(nb, val, &info);
1462 }
1463
1464 static int dev_boot_phase = 1;
1465
1466 /**
1467  *      register_netdevice_notifier - register a network notifier block
1468  *      @nb: notifier
1469  *
1470  *      Register a notifier to be called when network device events occur.
1471  *      The notifier passed is linked into the kernel structures and must
1472  *      not be reused until it has been unregistered. A negative errno code
1473  *      is returned on a failure.
1474  *
1475  *      When registered all registration and up events are replayed
1476  *      to the new notifier to allow device to have a race free
1477  *      view of the network device list.
1478  */
1479
1480 int register_netdevice_notifier(struct notifier_block *nb)
1481 {
1482         struct net_device *dev;
1483         struct net_device *last;
1484         struct net *net;
1485         int err;
1486
1487         rtnl_lock();
1488         err = raw_notifier_chain_register(&netdev_chain, nb);
1489         if (err)
1490                 goto unlock;
1491         if (dev_boot_phase)
1492                 goto unlock;
1493         for_each_net(net) {
1494                 for_each_netdev(net, dev) {
1495                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1496                         err = notifier_to_errno(err);
1497                         if (err)
1498                                 goto rollback;
1499
1500                         if (!(dev->flags & IFF_UP))
1501                                 continue;
1502
1503                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1504                 }
1505         }
1506
1507 unlock:
1508         rtnl_unlock();
1509         return err;
1510
1511 rollback:
1512         last = dev;
1513         for_each_net(net) {
1514                 for_each_netdev(net, dev) {
1515                         if (dev == last)
1516                                 goto outroll;
1517
1518                         if (dev->flags & IFF_UP) {
1519                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1520                                                         dev);
1521                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1522                         }
1523                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1524                 }
1525         }
1526
1527 outroll:
1528         raw_notifier_chain_unregister(&netdev_chain, nb);
1529         goto unlock;
1530 }
1531 EXPORT_SYMBOL(register_netdevice_notifier);
1532
1533 /**
1534  *      unregister_netdevice_notifier - unregister a network notifier block
1535  *      @nb: notifier
1536  *
1537  *      Unregister a notifier previously registered by
1538  *      register_netdevice_notifier(). The notifier is unlinked into the
1539  *      kernel structures and may then be reused. A negative errno code
1540  *      is returned on a failure.
1541  *
1542  *      After unregistering unregister and down device events are synthesized
1543  *      for all devices on the device list to the removed notifier to remove
1544  *      the need for special case cleanup code.
1545  */
1546
1547 int unregister_netdevice_notifier(struct notifier_block *nb)
1548 {
1549         struct net_device *dev;
1550         struct net *net;
1551         int err;
1552
1553         rtnl_lock();
1554         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1555         if (err)
1556                 goto unlock;
1557
1558         for_each_net(net) {
1559                 for_each_netdev(net, dev) {
1560                         if (dev->flags & IFF_UP) {
1561                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1562                                                         dev);
1563                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1564                         }
1565                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1566                 }
1567         }
1568 unlock:
1569         rtnl_unlock();
1570         return err;
1571 }
1572 EXPORT_SYMBOL(unregister_netdevice_notifier);
1573
1574 /**
1575  *      call_netdevice_notifiers_info - call all network notifier blocks
1576  *      @val: value passed unmodified to notifier function
1577  *      @dev: net_device pointer passed unmodified to notifier function
1578  *      @info: notifier information data
1579  *
1580  *      Call all network notifier blocks.  Parameters and return value
1581  *      are as for raw_notifier_call_chain().
1582  */
1583
1584 static int call_netdevice_notifiers_info(unsigned long val,
1585                                          struct net_device *dev,
1586                                          struct netdev_notifier_info *info)
1587 {
1588         ASSERT_RTNL();
1589         netdev_notifier_info_init(info, dev);
1590         return raw_notifier_call_chain(&netdev_chain, val, info);
1591 }
1592
1593 /**
1594  *      call_netdevice_notifiers - call all network notifier blocks
1595  *      @val: value passed unmodified to notifier function
1596  *      @dev: net_device pointer passed unmodified to notifier function
1597  *
1598  *      Call all network notifier blocks.  Parameters and return value
1599  *      are as for raw_notifier_call_chain().
1600  */
1601
1602 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1603 {
1604         struct netdev_notifier_info info;
1605
1606         return call_netdevice_notifiers_info(val, dev, &info);
1607 }
1608 EXPORT_SYMBOL(call_netdevice_notifiers);
1609
1610 static struct static_key netstamp_needed __read_mostly;
1611 #ifdef HAVE_JUMP_LABEL
1612 /* We are not allowed to call static_key_slow_dec() from irq context
1613  * If net_disable_timestamp() is called from irq context, defer the
1614  * static_key_slow_dec() calls.
1615  */
1616 static atomic_t netstamp_needed_deferred;
1617 #endif
1618
1619 void net_enable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1623
1624         if (deferred) {
1625                 while (--deferred)
1626                         static_key_slow_dec(&netstamp_needed);
1627                 return;
1628         }
1629 #endif
1630         static_key_slow_inc(&netstamp_needed);
1631 }
1632 EXPORT_SYMBOL(net_enable_timestamp);
1633
1634 void net_disable_timestamp(void)
1635 {
1636 #ifdef HAVE_JUMP_LABEL
1637         if (in_interrupt()) {
1638                 atomic_inc(&netstamp_needed_deferred);
1639                 return;
1640         }
1641 #endif
1642         static_key_slow_dec(&netstamp_needed);
1643 }
1644 EXPORT_SYMBOL(net_disable_timestamp);
1645
1646 static inline void net_timestamp_set(struct sk_buff *skb)
1647 {
1648         skb->tstamp.tv64 = 0;
1649         if (static_key_false(&netstamp_needed))
1650                 __net_timestamp(skb);
1651 }
1652
1653 #define net_timestamp_check(COND, SKB)                  \
1654         if (static_key_false(&netstamp_needed)) {               \
1655                 if ((COND) && !(SKB)->tstamp.tv64)      \
1656                         __net_timestamp(SKB);           \
1657         }                                               \
1658
1659 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1660 {
1661         unsigned int len;
1662
1663         if (!(dev->flags & IFF_UP))
1664                 return false;
1665
1666         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1667         if (skb->len <= len)
1668                 return true;
1669
1670         /* if TSO is enabled, we don't care about the length as the packet
1671          * could be forwarded without being segmented before
1672          */
1673         if (skb_is_gso(skb))
1674                 return true;
1675
1676         return false;
1677 }
1678 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1679
1680 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1681 {
1682         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1683                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1684                         atomic_long_inc(&dev->rx_dropped);
1685                         kfree_skb(skb);
1686                         return NET_RX_DROP;
1687                 }
1688         }
1689
1690         if (unlikely(!is_skb_forwardable(dev, skb))) {
1691                 atomic_long_inc(&dev->rx_dropped);
1692                 kfree_skb(skb);
1693                 return NET_RX_DROP;
1694         }
1695
1696         skb_scrub_packet(skb, true);
1697         skb->protocol = eth_type_trans(skb, dev);
1698         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1699
1700         return 0;
1701 }
1702 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1703
1704 /**
1705  * dev_forward_skb - loopback an skb to another netif
1706  *
1707  * @dev: destination network device
1708  * @skb: buffer to forward
1709  *
1710  * return values:
1711  *      NET_RX_SUCCESS  (no congestion)
1712  *      NET_RX_DROP     (packet was dropped, but freed)
1713  *
1714  * dev_forward_skb can be used for injecting an skb from the
1715  * start_xmit function of one device into the receive queue
1716  * of another device.
1717  *
1718  * The receiving device may be in another namespace, so
1719  * we have to clear all information in the skb that could
1720  * impact namespace isolation.
1721  */
1722 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1723 {
1724         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1725 }
1726 EXPORT_SYMBOL_GPL(dev_forward_skb);
1727
1728 static inline int deliver_skb(struct sk_buff *skb,
1729                               struct packet_type *pt_prev,
1730                               struct net_device *orig_dev)
1731 {
1732         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1733                 return -ENOMEM;
1734         atomic_inc(&skb->users);
1735         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1736 }
1737
1738 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1739                                           struct packet_type **pt,
1740                                           struct net_device *dev, __be16 type,
1741                                           struct list_head *ptype_list)
1742 {
1743         struct packet_type *ptype, *pt_prev = *pt;
1744
1745         list_for_each_entry_rcu(ptype, ptype_list, list) {
1746                 if (ptype->type != type)
1747                         continue;
1748                 if (pt_prev)
1749                         deliver_skb(skb, pt_prev, dev);
1750                 pt_prev = ptype;
1751         }
1752         *pt = pt_prev;
1753 }
1754
1755 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1756 {
1757         if (!ptype->af_packet_priv || !skb->sk)
1758                 return false;
1759
1760         if (ptype->id_match)
1761                 return ptype->id_match(ptype, skb->sk);
1762         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1763                 return true;
1764
1765         return false;
1766 }
1767
1768 /*
1769  *      Support routine. Sends outgoing frames to any network
1770  *      taps currently in use.
1771  */
1772
1773 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1774 {
1775         struct packet_type *ptype;
1776         struct sk_buff *skb2 = NULL;
1777         struct packet_type *pt_prev = NULL;
1778         struct list_head *ptype_list = &ptype_all;
1779
1780         rcu_read_lock();
1781 again:
1782         list_for_each_entry_rcu(ptype, ptype_list, list) {
1783                 /* Never send packets back to the socket
1784                  * they originated from - MvS (miquels@drinkel.ow.org)
1785                  */
1786                 if (skb_loop_sk(ptype, skb))
1787                         continue;
1788
1789                 if (pt_prev) {
1790                         deliver_skb(skb2, pt_prev, skb->dev);
1791                         pt_prev = ptype;
1792                         continue;
1793                 }
1794
1795                 /* need to clone skb, done only once */
1796                 skb2 = skb_clone(skb, GFP_ATOMIC);
1797                 if (!skb2)
1798                         goto out_unlock;
1799
1800                 net_timestamp_set(skb2);
1801
1802                 /* skb->nh should be correctly
1803                  * set by sender, so that the second statement is
1804                  * just protection against buggy protocols.
1805                  */
1806                 skb_reset_mac_header(skb2);
1807
1808                 if (skb_network_header(skb2) < skb2->data ||
1809                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1810                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1811                                              ntohs(skb2->protocol),
1812                                              dev->name);
1813                         skb_reset_network_header(skb2);
1814                 }
1815
1816                 skb2->transport_header = skb2->network_header;
1817                 skb2->pkt_type = PACKET_OUTGOING;
1818                 pt_prev = ptype;
1819         }
1820
1821         if (ptype_list == &ptype_all) {
1822                 ptype_list = &dev->ptype_all;
1823                 goto again;
1824         }
1825 out_unlock:
1826         if (pt_prev)
1827                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1828         rcu_read_unlock();
1829 }
1830
1831 /**
1832  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1833  * @dev: Network device
1834  * @txq: number of queues available
1835  *
1836  * If real_num_tx_queues is changed the tc mappings may no longer be
1837  * valid. To resolve this verify the tc mapping remains valid and if
1838  * not NULL the mapping. With no priorities mapping to this
1839  * offset/count pair it will no longer be used. In the worst case TC0
1840  * is invalid nothing can be done so disable priority mappings. If is
1841  * expected that drivers will fix this mapping if they can before
1842  * calling netif_set_real_num_tx_queues.
1843  */
1844 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1845 {
1846         int i;
1847         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1848
1849         /* If TC0 is invalidated disable TC mapping */
1850         if (tc->offset + tc->count > txq) {
1851                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1852                 dev->num_tc = 0;
1853                 return;
1854         }
1855
1856         /* Invalidated prio to tc mappings set to TC0 */
1857         for (i = 1; i < TC_BITMASK + 1; i++) {
1858                 int q = netdev_get_prio_tc_map(dev, i);
1859
1860                 tc = &dev->tc_to_txq[q];
1861                 if (tc->offset + tc->count > txq) {
1862                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1863                                 i, q);
1864                         netdev_set_prio_tc_map(dev, i, 0);
1865                 }
1866         }
1867 }
1868
1869 #ifdef CONFIG_XPS
1870 static DEFINE_MUTEX(xps_map_mutex);
1871 #define xmap_dereference(P)             \
1872         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1873
1874 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1875                                         int cpu, u16 index)
1876 {
1877         struct xps_map *map = NULL;
1878         int pos;
1879
1880         if (dev_maps)
1881                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1882
1883         for (pos = 0; map && pos < map->len; pos++) {
1884                 if (map->queues[pos] == index) {
1885                         if (map->len > 1) {
1886                                 map->queues[pos] = map->queues[--map->len];
1887                         } else {
1888                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1889                                 kfree_rcu(map, rcu);
1890                                 map = NULL;
1891                         }
1892                         break;
1893                 }
1894         }
1895
1896         return map;
1897 }
1898
1899 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1900 {
1901         struct xps_dev_maps *dev_maps;
1902         int cpu, i;
1903         bool active = false;
1904
1905         mutex_lock(&xps_map_mutex);
1906         dev_maps = xmap_dereference(dev->xps_maps);
1907
1908         if (!dev_maps)
1909                 goto out_no_maps;
1910
1911         for_each_possible_cpu(cpu) {
1912                 for (i = index; i < dev->num_tx_queues; i++) {
1913                         if (!remove_xps_queue(dev_maps, cpu, i))
1914                                 break;
1915                 }
1916                 if (i == dev->num_tx_queues)
1917                         active = true;
1918         }
1919
1920         if (!active) {
1921                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1922                 kfree_rcu(dev_maps, rcu);
1923         }
1924
1925         for (i = index; i < dev->num_tx_queues; i++)
1926                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1927                                              NUMA_NO_NODE);
1928
1929 out_no_maps:
1930         mutex_unlock(&xps_map_mutex);
1931 }
1932
1933 static struct xps_map *expand_xps_map(struct xps_map *map,
1934                                       int cpu, u16 index)
1935 {
1936         struct xps_map *new_map;
1937         int alloc_len = XPS_MIN_MAP_ALLOC;
1938         int i, pos;
1939
1940         for (pos = 0; map && pos < map->len; pos++) {
1941                 if (map->queues[pos] != index)
1942                         continue;
1943                 return map;
1944         }
1945
1946         /* Need to add queue to this CPU's existing map */
1947         if (map) {
1948                 if (pos < map->alloc_len)
1949                         return map;
1950
1951                 alloc_len = map->alloc_len * 2;
1952         }
1953
1954         /* Need to allocate new map to store queue on this CPU's map */
1955         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1956                                cpu_to_node(cpu));
1957         if (!new_map)
1958                 return NULL;
1959
1960         for (i = 0; i < pos; i++)
1961                 new_map->queues[i] = map->queues[i];
1962         new_map->alloc_len = alloc_len;
1963         new_map->len = pos;
1964
1965         return new_map;
1966 }
1967
1968 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1969                         u16 index)
1970 {
1971         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1972         struct xps_map *map, *new_map;
1973         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1974         int cpu, numa_node_id = -2;
1975         bool active = false;
1976
1977         mutex_lock(&xps_map_mutex);
1978
1979         dev_maps = xmap_dereference(dev->xps_maps);
1980
1981         /* allocate memory for queue storage */
1982         for_each_online_cpu(cpu) {
1983                 if (!cpumask_test_cpu(cpu, mask))
1984                         continue;
1985
1986                 if (!new_dev_maps)
1987                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1988                 if (!new_dev_maps) {
1989                         mutex_unlock(&xps_map_mutex);
1990                         return -ENOMEM;
1991                 }
1992
1993                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1994                                  NULL;
1995
1996                 map = expand_xps_map(map, cpu, index);
1997                 if (!map)
1998                         goto error;
1999
2000                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2001         }
2002
2003         if (!new_dev_maps)
2004                 goto out_no_new_maps;
2005
2006         for_each_possible_cpu(cpu) {
2007                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2008                         /* add queue to CPU maps */
2009                         int pos = 0;
2010
2011                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2012                         while ((pos < map->len) && (map->queues[pos] != index))
2013                                 pos++;
2014
2015                         if (pos == map->len)
2016                                 map->queues[map->len++] = index;
2017 #ifdef CONFIG_NUMA
2018                         if (numa_node_id == -2)
2019                                 numa_node_id = cpu_to_node(cpu);
2020                         else if (numa_node_id != cpu_to_node(cpu))
2021                                 numa_node_id = -1;
2022 #endif
2023                 } else if (dev_maps) {
2024                         /* fill in the new device map from the old device map */
2025                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2026                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2027                 }
2028
2029         }
2030
2031         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2032
2033         /* Cleanup old maps */
2034         if (dev_maps) {
2035                 for_each_possible_cpu(cpu) {
2036                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2037                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2038                         if (map && map != new_map)
2039                                 kfree_rcu(map, rcu);
2040                 }
2041
2042                 kfree_rcu(dev_maps, rcu);
2043         }
2044
2045         dev_maps = new_dev_maps;
2046         active = true;
2047
2048 out_no_new_maps:
2049         /* update Tx queue numa node */
2050         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2051                                      (numa_node_id >= 0) ? numa_node_id :
2052                                      NUMA_NO_NODE);
2053
2054         if (!dev_maps)
2055                 goto out_no_maps;
2056
2057         /* removes queue from unused CPUs */
2058         for_each_possible_cpu(cpu) {
2059                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2060                         continue;
2061
2062                 if (remove_xps_queue(dev_maps, cpu, index))
2063                         active = true;
2064         }
2065
2066         /* free map if not active */
2067         if (!active) {
2068                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2069                 kfree_rcu(dev_maps, rcu);
2070         }
2071
2072 out_no_maps:
2073         mutex_unlock(&xps_map_mutex);
2074
2075         return 0;
2076 error:
2077         /* remove any maps that we added */
2078         for_each_possible_cpu(cpu) {
2079                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2080                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2081                                  NULL;
2082                 if (new_map && new_map != map)
2083                         kfree(new_map);
2084         }
2085
2086         mutex_unlock(&xps_map_mutex);
2087
2088         kfree(new_dev_maps);
2089         return -ENOMEM;
2090 }
2091 EXPORT_SYMBOL(netif_set_xps_queue);
2092
2093 #endif
2094 /*
2095  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2096  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2097  */
2098 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2099 {
2100         int rc;
2101
2102         if (txq < 1 || txq > dev->num_tx_queues)
2103                 return -EINVAL;
2104
2105         if (dev->reg_state == NETREG_REGISTERED ||
2106             dev->reg_state == NETREG_UNREGISTERING) {
2107                 ASSERT_RTNL();
2108
2109                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2110                                                   txq);
2111                 if (rc)
2112                         return rc;
2113
2114                 if (dev->num_tc)
2115                         netif_setup_tc(dev, txq);
2116
2117                 if (txq < dev->real_num_tx_queues) {
2118                         qdisc_reset_all_tx_gt(dev, txq);
2119 #ifdef CONFIG_XPS
2120                         netif_reset_xps_queues_gt(dev, txq);
2121 #endif
2122                 }
2123         }
2124
2125         dev->real_num_tx_queues = txq;
2126         return 0;
2127 }
2128 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2129
2130 #ifdef CONFIG_SYSFS
2131 /**
2132  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2133  *      @dev: Network device
2134  *      @rxq: Actual number of RX queues
2135  *
2136  *      This must be called either with the rtnl_lock held or before
2137  *      registration of the net device.  Returns 0 on success, or a
2138  *      negative error code.  If called before registration, it always
2139  *      succeeds.
2140  */
2141 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2142 {
2143         int rc;
2144
2145         if (rxq < 1 || rxq > dev->num_rx_queues)
2146                 return -EINVAL;
2147
2148         if (dev->reg_state == NETREG_REGISTERED) {
2149                 ASSERT_RTNL();
2150
2151                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2152                                                   rxq);
2153                 if (rc)
2154                         return rc;
2155         }
2156
2157         dev->real_num_rx_queues = rxq;
2158         return 0;
2159 }
2160 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2161 #endif
2162
2163 /**
2164  * netif_get_num_default_rss_queues - default number of RSS queues
2165  *
2166  * This routine should set an upper limit on the number of RSS queues
2167  * used by default by multiqueue devices.
2168  */
2169 int netif_get_num_default_rss_queues(void)
2170 {
2171         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2172 }
2173 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2174
2175 static inline void __netif_reschedule(struct Qdisc *q)
2176 {
2177         struct softnet_data *sd;
2178         unsigned long flags;
2179
2180         local_irq_save(flags);
2181         sd = this_cpu_ptr(&softnet_data);
2182         q->next_sched = NULL;
2183         *sd->output_queue_tailp = q;
2184         sd->output_queue_tailp = &q->next_sched;
2185         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2186         local_irq_restore(flags);
2187 }
2188
2189 void __netif_schedule(struct Qdisc *q)
2190 {
2191         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2192                 __netif_reschedule(q);
2193 }
2194 EXPORT_SYMBOL(__netif_schedule);
2195
2196 struct dev_kfree_skb_cb {
2197         enum skb_free_reason reason;
2198 };
2199
2200 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2201 {
2202         return (struct dev_kfree_skb_cb *)skb->cb;
2203 }
2204
2205 void netif_schedule_queue(struct netdev_queue *txq)
2206 {
2207         rcu_read_lock();
2208         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2209                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2210
2211                 __netif_schedule(q);
2212         }
2213         rcu_read_unlock();
2214 }
2215 EXPORT_SYMBOL(netif_schedule_queue);
2216
2217 /**
2218  *      netif_wake_subqueue - allow sending packets on subqueue
2219  *      @dev: network device
2220  *      @queue_index: sub queue index
2221  *
2222  * Resume individual transmit queue of a device with multiple transmit queues.
2223  */
2224 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2225 {
2226         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2227
2228         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2229                 struct Qdisc *q;
2230
2231                 rcu_read_lock();
2232                 q = rcu_dereference(txq->qdisc);
2233                 __netif_schedule(q);
2234                 rcu_read_unlock();
2235         }
2236 }
2237 EXPORT_SYMBOL(netif_wake_subqueue);
2238
2239 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2240 {
2241         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2242                 struct Qdisc *q;
2243
2244                 rcu_read_lock();
2245                 q = rcu_dereference(dev_queue->qdisc);
2246                 __netif_schedule(q);
2247                 rcu_read_unlock();
2248         }
2249 }
2250 EXPORT_SYMBOL(netif_tx_wake_queue);
2251
2252 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2253 {
2254         unsigned long flags;
2255
2256         if (likely(atomic_read(&skb->users) == 1)) {
2257                 smp_rmb();
2258                 atomic_set(&skb->users, 0);
2259         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2260                 return;
2261         }
2262         get_kfree_skb_cb(skb)->reason = reason;
2263         local_irq_save(flags);
2264         skb->next = __this_cpu_read(softnet_data.completion_queue);
2265         __this_cpu_write(softnet_data.completion_queue, skb);
2266         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2267         local_irq_restore(flags);
2268 }
2269 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2270
2271 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2272 {
2273         if (in_irq() || irqs_disabled())
2274                 __dev_kfree_skb_irq(skb, reason);
2275         else
2276                 dev_kfree_skb(skb);
2277 }
2278 EXPORT_SYMBOL(__dev_kfree_skb_any);
2279
2280
2281 /**
2282  * netif_device_detach - mark device as removed
2283  * @dev: network device
2284  *
2285  * Mark device as removed from system and therefore no longer available.
2286  */
2287 void netif_device_detach(struct net_device *dev)
2288 {
2289         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2290             netif_running(dev)) {
2291                 netif_tx_stop_all_queues(dev);
2292         }
2293 }
2294 EXPORT_SYMBOL(netif_device_detach);
2295
2296 /**
2297  * netif_device_attach - mark device as attached
2298  * @dev: network device
2299  *
2300  * Mark device as attached from system and restart if needed.
2301  */
2302 void netif_device_attach(struct net_device *dev)
2303 {
2304         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2305             netif_running(dev)) {
2306                 netif_tx_wake_all_queues(dev);
2307                 __netdev_watchdog_up(dev);
2308         }
2309 }
2310 EXPORT_SYMBOL(netif_device_attach);
2311
2312 static void skb_warn_bad_offload(const struct sk_buff *skb)
2313 {
2314         static const netdev_features_t null_features = 0;
2315         struct net_device *dev = skb->dev;
2316         const char *driver = "";
2317
2318         if (!net_ratelimit())
2319                 return;
2320
2321         if (dev && dev->dev.parent)
2322                 driver = dev_driver_string(dev->dev.parent);
2323
2324         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2325              "gso_type=%d ip_summed=%d\n",
2326              driver, dev ? &dev->features : &null_features,
2327              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2328              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2329              skb_shinfo(skb)->gso_type, skb->ip_summed);
2330 }
2331
2332 /*
2333  * Invalidate hardware checksum when packet is to be mangled, and
2334  * complete checksum manually on outgoing path.
2335  */
2336 int skb_checksum_help(struct sk_buff *skb)
2337 {
2338         __wsum csum;
2339         int ret = 0, offset;
2340
2341         if (skb->ip_summed == CHECKSUM_COMPLETE)
2342                 goto out_set_summed;
2343
2344         if (unlikely(skb_shinfo(skb)->gso_size)) {
2345                 skb_warn_bad_offload(skb);
2346                 return -EINVAL;
2347         }
2348
2349         /* Before computing a checksum, we should make sure no frag could
2350          * be modified by an external entity : checksum could be wrong.
2351          */
2352         if (skb_has_shared_frag(skb)) {
2353                 ret = __skb_linearize(skb);
2354                 if (ret)
2355                         goto out;
2356         }
2357
2358         offset = skb_checksum_start_offset(skb);
2359         BUG_ON(offset >= skb_headlen(skb));
2360         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2361
2362         offset += skb->csum_offset;
2363         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2364
2365         if (skb_cloned(skb) &&
2366             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2367                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2368                 if (ret)
2369                         goto out;
2370         }
2371
2372         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2373 out_set_summed:
2374         skb->ip_summed = CHECKSUM_NONE;
2375 out:
2376         return ret;
2377 }
2378 EXPORT_SYMBOL(skb_checksum_help);
2379
2380 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2381 {
2382         __be16 type = skb->protocol;
2383
2384         /* Tunnel gso handlers can set protocol to ethernet. */
2385         if (type == htons(ETH_P_TEB)) {
2386                 struct ethhdr *eth;
2387
2388                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2389                         return 0;
2390
2391                 eth = (struct ethhdr *)skb_mac_header(skb);
2392                 type = eth->h_proto;
2393         }
2394
2395         return __vlan_get_protocol(skb, type, depth);
2396 }
2397
2398 /**
2399  *      skb_mac_gso_segment - mac layer segmentation handler.
2400  *      @skb: buffer to segment
2401  *      @features: features for the output path (see dev->features)
2402  */
2403 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2404                                     netdev_features_t features)
2405 {
2406         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2407         struct packet_offload *ptype;
2408         int vlan_depth = skb->mac_len;
2409         __be16 type = skb_network_protocol(skb, &vlan_depth);
2410
2411         if (unlikely(!type))
2412                 return ERR_PTR(-EINVAL);
2413
2414         __skb_pull(skb, vlan_depth);
2415
2416         rcu_read_lock();
2417         list_for_each_entry_rcu(ptype, &offload_base, list) {
2418                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2419                         segs = ptype->callbacks.gso_segment(skb, features);
2420                         break;
2421                 }
2422         }
2423         rcu_read_unlock();
2424
2425         __skb_push(skb, skb->data - skb_mac_header(skb));
2426
2427         return segs;
2428 }
2429 EXPORT_SYMBOL(skb_mac_gso_segment);
2430
2431
2432 /* openvswitch calls this on rx path, so we need a different check.
2433  */
2434 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2435 {
2436         if (tx_path)
2437                 return skb->ip_summed != CHECKSUM_PARTIAL;
2438         else
2439                 return skb->ip_summed == CHECKSUM_NONE;
2440 }
2441
2442 /**
2443  *      __skb_gso_segment - Perform segmentation on skb.
2444  *      @skb: buffer to segment
2445  *      @features: features for the output path (see dev->features)
2446  *      @tx_path: whether it is called in TX path
2447  *
2448  *      This function segments the given skb and returns a list of segments.
2449  *
2450  *      It may return NULL if the skb requires no segmentation.  This is
2451  *      only possible when GSO is used for verifying header integrity.
2452  */
2453 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2454                                   netdev_features_t features, bool tx_path)
2455 {
2456         if (unlikely(skb_needs_check(skb, tx_path))) {
2457                 int err;
2458
2459                 skb_warn_bad_offload(skb);
2460
2461                 err = skb_cow_head(skb, 0);
2462                 if (err < 0)
2463                         return ERR_PTR(err);
2464         }
2465
2466         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2467         SKB_GSO_CB(skb)->encap_level = 0;
2468
2469         skb_reset_mac_header(skb);
2470         skb_reset_mac_len(skb);
2471
2472         return skb_mac_gso_segment(skb, features);
2473 }
2474 EXPORT_SYMBOL(__skb_gso_segment);
2475
2476 /* Take action when hardware reception checksum errors are detected. */
2477 #ifdef CONFIG_BUG
2478 void netdev_rx_csum_fault(struct net_device *dev)
2479 {
2480         if (net_ratelimit()) {
2481                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2482                 dump_stack();
2483         }
2484 }
2485 EXPORT_SYMBOL(netdev_rx_csum_fault);
2486 #endif
2487
2488 /* Actually, we should eliminate this check as soon as we know, that:
2489  * 1. IOMMU is present and allows to map all the memory.
2490  * 2. No high memory really exists on this machine.
2491  */
2492
2493 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2494 {
2495 #ifdef CONFIG_HIGHMEM
2496         int i;
2497         if (!(dev->features & NETIF_F_HIGHDMA)) {
2498                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2499                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2500                         if (PageHighMem(skb_frag_page(frag)))
2501                                 return 1;
2502                 }
2503         }
2504
2505         if (PCI_DMA_BUS_IS_PHYS) {
2506                 struct device *pdev = dev->dev.parent;
2507
2508                 if (!pdev)
2509                         return 0;
2510                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2511                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2512                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2513                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2514                                 return 1;
2515                 }
2516         }
2517 #endif
2518         return 0;
2519 }
2520
2521 /* If MPLS offload request, verify we are testing hardware MPLS features
2522  * instead of standard features for the netdev.
2523  */
2524 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2525 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2526                                            netdev_features_t features,
2527                                            __be16 type)
2528 {
2529         if (eth_p_mpls(type))
2530                 features &= skb->dev->mpls_features;
2531
2532         return features;
2533 }
2534 #else
2535 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2536                                            netdev_features_t features,
2537                                            __be16 type)
2538 {
2539         return features;
2540 }
2541 #endif
2542
2543 static netdev_features_t harmonize_features(struct sk_buff *skb,
2544         netdev_features_t features)
2545 {
2546         int tmp;
2547         __be16 type;
2548
2549         type = skb_network_protocol(skb, &tmp);
2550         features = net_mpls_features(skb, features, type);
2551
2552         if (skb->ip_summed != CHECKSUM_NONE &&
2553             !can_checksum_protocol(features, type)) {
2554                 features &= ~NETIF_F_ALL_CSUM;
2555         } else if (illegal_highdma(skb->dev, skb)) {
2556                 features &= ~NETIF_F_SG;
2557         }
2558
2559         return features;
2560 }
2561
2562 netdev_features_t netif_skb_features(struct sk_buff *skb)
2563 {
2564         struct net_device *dev = skb->dev;
2565         netdev_features_t features = dev->features;
2566         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2567         __be16 protocol = skb->protocol;
2568
2569         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2570                 features &= ~NETIF_F_GSO_MASK;
2571
2572         /* If encapsulation offload request, verify we are testing
2573          * hardware encapsulation features instead of standard
2574          * features for the netdev
2575          */
2576         if (skb->encapsulation)
2577                 features &= dev->hw_enc_features;
2578
2579         if (!skb_vlan_tag_present(skb)) {
2580                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2581                              protocol == htons(ETH_P_8021AD))) {
2582                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2583                         protocol = veh->h_vlan_encapsulated_proto;
2584                 } else {
2585                         goto finalize;
2586                 }
2587         }
2588
2589         features = netdev_intersect_features(features,
2590                                              dev->vlan_features |
2591                                              NETIF_F_HW_VLAN_CTAG_TX |
2592                                              NETIF_F_HW_VLAN_STAG_TX);
2593
2594         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2595                 features = netdev_intersect_features(features,
2596                                                      NETIF_F_SG |
2597                                                      NETIF_F_HIGHDMA |
2598                                                      NETIF_F_FRAGLIST |
2599                                                      NETIF_F_GEN_CSUM |
2600                                                      NETIF_F_HW_VLAN_CTAG_TX |
2601                                                      NETIF_F_HW_VLAN_STAG_TX);
2602
2603 finalize:
2604         if (dev->netdev_ops->ndo_features_check)
2605                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2606                                                                 features);
2607
2608         return harmonize_features(skb, features);
2609 }
2610 EXPORT_SYMBOL(netif_skb_features);
2611
2612 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2613                     struct netdev_queue *txq, bool more)
2614 {
2615         unsigned int len;
2616         int rc;
2617
2618         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2619                 dev_queue_xmit_nit(skb, dev);
2620
2621         len = skb->len;
2622         trace_net_dev_start_xmit(skb, dev);
2623         rc = netdev_start_xmit(skb, dev, txq, more);
2624         trace_net_dev_xmit(skb, rc, dev, len);
2625
2626         return rc;
2627 }
2628
2629 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2630                                     struct netdev_queue *txq, int *ret)
2631 {
2632         struct sk_buff *skb = first;
2633         int rc = NETDEV_TX_OK;
2634
2635         while (skb) {
2636                 struct sk_buff *next = skb->next;
2637
2638                 skb->next = NULL;
2639                 rc = xmit_one(skb, dev, txq, next != NULL);
2640                 if (unlikely(!dev_xmit_complete(rc))) {
2641                         skb->next = next;
2642                         goto out;
2643                 }
2644
2645                 skb = next;
2646                 if (netif_xmit_stopped(txq) && skb) {
2647                         rc = NETDEV_TX_BUSY;
2648                         break;
2649                 }
2650         }
2651
2652 out:
2653         *ret = rc;
2654         return skb;
2655 }
2656
2657 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2658                                           netdev_features_t features)
2659 {
2660         if (skb_vlan_tag_present(skb) &&
2661             !vlan_hw_offload_capable(features, skb->vlan_proto))
2662                 skb = __vlan_hwaccel_push_inside(skb);
2663         return skb;
2664 }
2665
2666 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2667 {
2668         netdev_features_t features;
2669
2670         if (skb->next)
2671                 return skb;
2672
2673         features = netif_skb_features(skb);
2674         skb = validate_xmit_vlan(skb, features);
2675         if (unlikely(!skb))
2676                 goto out_null;
2677
2678         if (netif_needs_gso(dev, skb, features)) {
2679                 struct sk_buff *segs;
2680
2681                 segs = skb_gso_segment(skb, features);
2682                 if (IS_ERR(segs)) {
2683                         goto out_kfree_skb;
2684                 } else if (segs) {
2685                         consume_skb(skb);
2686                         skb = segs;
2687                 }
2688         } else {
2689                 if (skb_needs_linearize(skb, features) &&
2690                     __skb_linearize(skb))
2691                         goto out_kfree_skb;
2692
2693                 /* If packet is not checksummed and device does not
2694                  * support checksumming for this protocol, complete
2695                  * checksumming here.
2696                  */
2697                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2698                         if (skb->encapsulation)
2699                                 skb_set_inner_transport_header(skb,
2700                                                                skb_checksum_start_offset(skb));
2701                         else
2702                                 skb_set_transport_header(skb,
2703                                                          skb_checksum_start_offset(skb));
2704                         if (!(features & NETIF_F_ALL_CSUM) &&
2705                             skb_checksum_help(skb))
2706                                 goto out_kfree_skb;
2707                 }
2708         }
2709
2710         return skb;
2711
2712 out_kfree_skb:
2713         kfree_skb(skb);
2714 out_null:
2715         return NULL;
2716 }
2717
2718 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2719 {
2720         struct sk_buff *next, *head = NULL, *tail;
2721
2722         for (; skb != NULL; skb = next) {
2723                 next = skb->next;
2724                 skb->next = NULL;
2725
2726                 /* in case skb wont be segmented, point to itself */
2727                 skb->prev = skb;
2728
2729                 skb = validate_xmit_skb(skb, dev);
2730                 if (!skb)
2731                         continue;
2732
2733                 if (!head)
2734                         head = skb;
2735                 else
2736                         tail->next = skb;
2737                 /* If skb was segmented, skb->prev points to
2738                  * the last segment. If not, it still contains skb.
2739                  */
2740                 tail = skb->prev;
2741         }
2742         return head;
2743 }
2744
2745 static void qdisc_pkt_len_init(struct sk_buff *skb)
2746 {
2747         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2748
2749         qdisc_skb_cb(skb)->pkt_len = skb->len;
2750
2751         /* To get more precise estimation of bytes sent on wire,
2752          * we add to pkt_len the headers size of all segments
2753          */
2754         if (shinfo->gso_size)  {
2755                 unsigned int hdr_len;
2756                 u16 gso_segs = shinfo->gso_segs;
2757
2758                 /* mac layer + network layer */
2759                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2760
2761                 /* + transport layer */
2762                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2763                         hdr_len += tcp_hdrlen(skb);
2764                 else
2765                         hdr_len += sizeof(struct udphdr);
2766
2767                 if (shinfo->gso_type & SKB_GSO_DODGY)
2768                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2769                                                 shinfo->gso_size);
2770
2771                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2772         }
2773 }
2774
2775 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2776                                  struct net_device *dev,
2777                                  struct netdev_queue *txq)
2778 {
2779         spinlock_t *root_lock = qdisc_lock(q);
2780         bool contended;
2781         int rc;
2782
2783         qdisc_pkt_len_init(skb);
2784         qdisc_calculate_pkt_len(skb, q);
2785         /*
2786          * Heuristic to force contended enqueues to serialize on a
2787          * separate lock before trying to get qdisc main lock.
2788          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2789          * often and dequeue packets faster.
2790          */
2791         contended = qdisc_is_running(q);
2792         if (unlikely(contended))
2793                 spin_lock(&q->busylock);
2794
2795         spin_lock(root_lock);
2796         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2797                 kfree_skb(skb);
2798                 rc = NET_XMIT_DROP;
2799         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2800                    qdisc_run_begin(q)) {
2801                 /*
2802                  * This is a work-conserving queue; there are no old skbs
2803                  * waiting to be sent out; and the qdisc is not running -
2804                  * xmit the skb directly.
2805                  */
2806
2807                 qdisc_bstats_update(q, skb);
2808
2809                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2810                         if (unlikely(contended)) {
2811                                 spin_unlock(&q->busylock);
2812                                 contended = false;
2813                         }
2814                         __qdisc_run(q);
2815                 } else
2816                         qdisc_run_end(q);
2817
2818                 rc = NET_XMIT_SUCCESS;
2819         } else {
2820                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2821                 if (qdisc_run_begin(q)) {
2822                         if (unlikely(contended)) {
2823                                 spin_unlock(&q->busylock);
2824                                 contended = false;
2825                         }
2826                         __qdisc_run(q);
2827                 }
2828         }
2829         spin_unlock(root_lock);
2830         if (unlikely(contended))
2831                 spin_unlock(&q->busylock);
2832         return rc;
2833 }
2834
2835 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2836 static void skb_update_prio(struct sk_buff *skb)
2837 {
2838         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2839
2840         if (!skb->priority && skb->sk && map) {
2841                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2842
2843                 if (prioidx < map->priomap_len)
2844                         skb->priority = map->priomap[prioidx];
2845         }
2846 }
2847 #else
2848 #define skb_update_prio(skb)
2849 #endif
2850
2851 static DEFINE_PER_CPU(int, xmit_recursion);
2852 #define RECURSION_LIMIT 10
2853
2854 /**
2855  *      dev_loopback_xmit - loop back @skb
2856  *      @skb: buffer to transmit
2857  */
2858 int dev_loopback_xmit(struct sk_buff *skb)
2859 {
2860         skb_reset_mac_header(skb);
2861         __skb_pull(skb, skb_network_offset(skb));
2862         skb->pkt_type = PACKET_LOOPBACK;
2863         skb->ip_summed = CHECKSUM_UNNECESSARY;
2864         WARN_ON(!skb_dst(skb));
2865         skb_dst_force(skb);
2866         netif_rx_ni(skb);
2867         return 0;
2868 }
2869 EXPORT_SYMBOL(dev_loopback_xmit);
2870
2871 /**
2872  *      __dev_queue_xmit - transmit a buffer
2873  *      @skb: buffer to transmit
2874  *      @accel_priv: private data used for L2 forwarding offload
2875  *
2876  *      Queue a buffer for transmission to a network device. The caller must
2877  *      have set the device and priority and built the buffer before calling
2878  *      this function. The function can be called from an interrupt.
2879  *
2880  *      A negative errno code is returned on a failure. A success does not
2881  *      guarantee the frame will be transmitted as it may be dropped due
2882  *      to congestion or traffic shaping.
2883  *
2884  * -----------------------------------------------------------------------------------
2885  *      I notice this method can also return errors from the queue disciplines,
2886  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2887  *      be positive.
2888  *
2889  *      Regardless of the return value, the skb is consumed, so it is currently
2890  *      difficult to retry a send to this method.  (You can bump the ref count
2891  *      before sending to hold a reference for retry if you are careful.)
2892  *
2893  *      When calling this method, interrupts MUST be enabled.  This is because
2894  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2895  *          --BLG
2896  */
2897 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2898 {
2899         struct net_device *dev = skb->dev;
2900         struct netdev_queue *txq;
2901         struct Qdisc *q;
2902         int rc = -ENOMEM;
2903
2904         skb_reset_mac_header(skb);
2905
2906         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2907                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2908
2909         /* Disable soft irqs for various locks below. Also
2910          * stops preemption for RCU.
2911          */
2912         rcu_read_lock_bh();
2913
2914         skb_update_prio(skb);
2915
2916         /* If device/qdisc don't need skb->dst, release it right now while
2917          * its hot in this cpu cache.
2918          */
2919         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2920                 skb_dst_drop(skb);
2921         else
2922                 skb_dst_force(skb);
2923
2924         txq = netdev_pick_tx(dev, skb, accel_priv);
2925         q = rcu_dereference_bh(txq->qdisc);
2926
2927 #ifdef CONFIG_NET_CLS_ACT
2928         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2929 #endif
2930         trace_net_dev_queue(skb);
2931         if (q->enqueue) {
2932                 rc = __dev_xmit_skb(skb, q, dev, txq);
2933                 goto out;
2934         }
2935
2936         /* The device has no queue. Common case for software devices:
2937            loopback, all the sorts of tunnels...
2938
2939            Really, it is unlikely that netif_tx_lock protection is necessary
2940            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2941            counters.)
2942            However, it is possible, that they rely on protection
2943            made by us here.
2944
2945            Check this and shot the lock. It is not prone from deadlocks.
2946            Either shot noqueue qdisc, it is even simpler 8)
2947          */
2948         if (dev->flags & IFF_UP) {
2949                 int cpu = smp_processor_id(); /* ok because BHs are off */
2950
2951                 if (txq->xmit_lock_owner != cpu) {
2952
2953                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2954                                 goto recursion_alert;
2955
2956                         skb = validate_xmit_skb(skb, dev);
2957                         if (!skb)
2958                                 goto drop;
2959
2960                         HARD_TX_LOCK(dev, txq, cpu);
2961
2962                         if (!netif_xmit_stopped(txq)) {
2963                                 __this_cpu_inc(xmit_recursion);
2964                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2965                                 __this_cpu_dec(xmit_recursion);
2966                                 if (dev_xmit_complete(rc)) {
2967                                         HARD_TX_UNLOCK(dev, txq);
2968                                         goto out;
2969                                 }
2970                         }
2971                         HARD_TX_UNLOCK(dev, txq);
2972                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2973                                              dev->name);
2974                 } else {
2975                         /* Recursion is detected! It is possible,
2976                          * unfortunately
2977                          */
2978 recursion_alert:
2979                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2980                                              dev->name);
2981                 }
2982         }
2983
2984         rc = -ENETDOWN;
2985 drop:
2986         rcu_read_unlock_bh();
2987
2988         atomic_long_inc(&dev->tx_dropped);
2989         kfree_skb_list(skb);
2990         return rc;
2991 out:
2992         rcu_read_unlock_bh();
2993         return rc;
2994 }
2995
2996 int dev_queue_xmit(struct sk_buff *skb)
2997 {
2998         return __dev_queue_xmit(skb, NULL);
2999 }
3000 EXPORT_SYMBOL(dev_queue_xmit);
3001
3002 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3003 {
3004         return __dev_queue_xmit(skb, accel_priv);
3005 }
3006 EXPORT_SYMBOL(dev_queue_xmit_accel);
3007
3008
3009 /*=======================================================================
3010                         Receiver routines
3011   =======================================================================*/
3012
3013 int netdev_max_backlog __read_mostly = 1000;
3014 EXPORT_SYMBOL(netdev_max_backlog);
3015
3016 int netdev_tstamp_prequeue __read_mostly = 1;
3017 int netdev_budget __read_mostly = 300;
3018 int weight_p __read_mostly = 64;            /* old backlog weight */
3019
3020 /* Called with irq disabled */
3021 static inline void ____napi_schedule(struct softnet_data *sd,
3022                                      struct napi_struct *napi)
3023 {
3024         list_add_tail(&napi->poll_list, &sd->poll_list);
3025         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3026 }
3027
3028 #ifdef CONFIG_RPS
3029
3030 /* One global table that all flow-based protocols share. */
3031 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3032 EXPORT_SYMBOL(rps_sock_flow_table);
3033
3034 struct static_key rps_needed __read_mostly;
3035
3036 static struct rps_dev_flow *
3037 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3038             struct rps_dev_flow *rflow, u16 next_cpu)
3039 {
3040         if (next_cpu != RPS_NO_CPU) {
3041 #ifdef CONFIG_RFS_ACCEL
3042                 struct netdev_rx_queue *rxqueue;
3043                 struct rps_dev_flow_table *flow_table;
3044                 struct rps_dev_flow *old_rflow;
3045                 u32 flow_id;
3046                 u16 rxq_index;
3047                 int rc;
3048
3049                 /* Should we steer this flow to a different hardware queue? */
3050                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3051                     !(dev->features & NETIF_F_NTUPLE))
3052                         goto out;
3053                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3054                 if (rxq_index == skb_get_rx_queue(skb))
3055                         goto out;
3056
3057                 rxqueue = dev->_rx + rxq_index;
3058                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3059                 if (!flow_table)
3060                         goto out;
3061                 flow_id = skb_get_hash(skb) & flow_table->mask;
3062                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3063                                                         rxq_index, flow_id);
3064                 if (rc < 0)
3065                         goto out;
3066                 old_rflow = rflow;
3067                 rflow = &flow_table->flows[flow_id];
3068                 rflow->filter = rc;
3069                 if (old_rflow->filter == rflow->filter)
3070                         old_rflow->filter = RPS_NO_FILTER;
3071         out:
3072 #endif
3073                 rflow->last_qtail =
3074                         per_cpu(softnet_data, next_cpu).input_queue_head;
3075         }
3076
3077         rflow->cpu = next_cpu;
3078         return rflow;
3079 }
3080
3081 /*
3082  * get_rps_cpu is called from netif_receive_skb and returns the target
3083  * CPU from the RPS map of the receiving queue for a given skb.
3084  * rcu_read_lock must be held on entry.
3085  */
3086 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3087                        struct rps_dev_flow **rflowp)
3088 {
3089         struct netdev_rx_queue *rxqueue;
3090         struct rps_map *map;
3091         struct rps_dev_flow_table *flow_table;
3092         struct rps_sock_flow_table *sock_flow_table;
3093         int cpu = -1;
3094         u16 tcpu;
3095         u32 hash;
3096
3097         if (skb_rx_queue_recorded(skb)) {
3098                 u16 index = skb_get_rx_queue(skb);
3099                 if (unlikely(index >= dev->real_num_rx_queues)) {
3100                         WARN_ONCE(dev->real_num_rx_queues > 1,
3101                                   "%s received packet on queue %u, but number "
3102                                   "of RX queues is %u\n",
3103                                   dev->name, index, dev->real_num_rx_queues);
3104                         goto done;
3105                 }
3106                 rxqueue = dev->_rx + index;
3107         } else
3108                 rxqueue = dev->_rx;
3109
3110         map = rcu_dereference(rxqueue->rps_map);
3111         if (map) {
3112                 if (map->len == 1 &&
3113                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3114                         tcpu = map->cpus[0];
3115                         if (cpu_online(tcpu))
3116                                 cpu = tcpu;
3117                         goto done;
3118                 }
3119         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3120                 goto done;
3121         }
3122
3123         skb_reset_network_header(skb);
3124         hash = skb_get_hash(skb);
3125         if (!hash)
3126                 goto done;
3127
3128         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3129         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3130         if (flow_table && sock_flow_table) {
3131                 u16 next_cpu;
3132                 struct rps_dev_flow *rflow;
3133
3134                 rflow = &flow_table->flows[hash & flow_table->mask];
3135                 tcpu = rflow->cpu;
3136
3137                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3138
3139                 /*
3140                  * If the desired CPU (where last recvmsg was done) is
3141                  * different from current CPU (one in the rx-queue flow
3142                  * table entry), switch if one of the following holds:
3143                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3144                  *   - Current CPU is offline.
3145                  *   - The current CPU's queue tail has advanced beyond the
3146                  *     last packet that was enqueued using this table entry.
3147                  *     This guarantees that all previous packets for the flow
3148                  *     have been dequeued, thus preserving in order delivery.
3149                  */
3150                 if (unlikely(tcpu != next_cpu) &&
3151                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3152                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3153                       rflow->last_qtail)) >= 0)) {
3154                         tcpu = next_cpu;
3155                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3156                 }
3157
3158                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3159                         *rflowp = rflow;
3160                         cpu = tcpu;
3161                         goto done;
3162                 }
3163         }
3164
3165         if (map) {
3166                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3167                 if (cpu_online(tcpu)) {
3168                         cpu = tcpu;
3169                         goto done;
3170                 }
3171         }
3172
3173 done:
3174         return cpu;
3175 }
3176
3177 #ifdef CONFIG_RFS_ACCEL
3178
3179 /**
3180  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3181  * @dev: Device on which the filter was set
3182  * @rxq_index: RX queue index
3183  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3184  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3185  *
3186  * Drivers that implement ndo_rx_flow_steer() should periodically call
3187  * this function for each installed filter and remove the filters for
3188  * which it returns %true.
3189  */
3190 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3191                          u32 flow_id, u16 filter_id)
3192 {
3193         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3194         struct rps_dev_flow_table *flow_table;
3195         struct rps_dev_flow *rflow;
3196         bool expire = true;
3197         int cpu;
3198
3199         rcu_read_lock();
3200         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3201         if (flow_table && flow_id <= flow_table->mask) {
3202                 rflow = &flow_table->flows[flow_id];
3203                 cpu = ACCESS_ONCE(rflow->cpu);
3204                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3205                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3206                            rflow->last_qtail) <
3207                      (int)(10 * flow_table->mask)))
3208                         expire = false;
3209         }
3210         rcu_read_unlock();
3211         return expire;
3212 }
3213 EXPORT_SYMBOL(rps_may_expire_flow);
3214
3215 #endif /* CONFIG_RFS_ACCEL */
3216
3217 /* Called from hardirq (IPI) context */
3218 static void rps_trigger_softirq(void *data)
3219 {
3220         struct softnet_data *sd = data;
3221
3222         ____napi_schedule(sd, &sd->backlog);
3223         sd->received_rps++;
3224 }
3225
3226 #endif /* CONFIG_RPS */
3227
3228 /*
3229  * Check if this softnet_data structure is another cpu one
3230  * If yes, queue it to our IPI list and return 1
3231  * If no, return 0
3232  */
3233 static int rps_ipi_queued(struct softnet_data *sd)
3234 {
3235 #ifdef CONFIG_RPS
3236         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3237
3238         if (sd != mysd) {
3239                 sd->rps_ipi_next = mysd->rps_ipi_list;
3240                 mysd->rps_ipi_list = sd;
3241
3242                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3243                 return 1;
3244         }
3245 #endif /* CONFIG_RPS */
3246         return 0;
3247 }
3248
3249 #ifdef CONFIG_NET_FLOW_LIMIT
3250 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3251 #endif
3252
3253 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3254 {
3255 #ifdef CONFIG_NET_FLOW_LIMIT
3256         struct sd_flow_limit *fl;
3257         struct softnet_data *sd;
3258         unsigned int old_flow, new_flow;
3259
3260         if (qlen < (netdev_max_backlog >> 1))
3261                 return false;
3262
3263         sd = this_cpu_ptr(&softnet_data);
3264
3265         rcu_read_lock();
3266         fl = rcu_dereference(sd->flow_limit);
3267         if (fl) {
3268                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3269                 old_flow = fl->history[fl->history_head];
3270                 fl->history[fl->history_head] = new_flow;
3271
3272                 fl->history_head++;
3273                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3274
3275                 if (likely(fl->buckets[old_flow]))
3276                         fl->buckets[old_flow]--;
3277
3278                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3279                         fl->count++;
3280                         rcu_read_unlock();
3281                         return true;
3282                 }
3283         }
3284         rcu_read_unlock();
3285 #endif
3286         return false;
3287 }
3288
3289 /*
3290  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3291  * queue (may be a remote CPU queue).
3292  */
3293 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3294                               unsigned int *qtail)
3295 {
3296         struct softnet_data *sd;
3297         unsigned long flags;
3298         unsigned int qlen;
3299
3300         sd = &per_cpu(softnet_data, cpu);
3301
3302         local_irq_save(flags);
3303
3304         rps_lock(sd);
3305         qlen = skb_queue_len(&sd->input_pkt_queue);
3306         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3307                 if (qlen) {
3308 enqueue:
3309                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3310                         input_queue_tail_incr_save(sd, qtail);
3311                         rps_unlock(sd);
3312                         local_irq_restore(flags);
3313                         return NET_RX_SUCCESS;
3314                 }
3315
3316                 /* Schedule NAPI for backlog device
3317                  * We can use non atomic operation since we own the queue lock
3318                  */
3319                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3320                         if (!rps_ipi_queued(sd))
3321                                 ____napi_schedule(sd, &sd->backlog);
3322                 }
3323                 goto enqueue;
3324         }
3325
3326         sd->dropped++;
3327         rps_unlock(sd);
3328
3329         local_irq_restore(flags);
3330
3331         atomic_long_inc(&skb->dev->rx_dropped);
3332         kfree_skb(skb);
3333         return NET_RX_DROP;
3334 }
3335
3336 static int netif_rx_internal(struct sk_buff *skb)
3337 {
3338         int ret;
3339
3340         net_timestamp_check(netdev_tstamp_prequeue, skb);
3341
3342         trace_netif_rx(skb);
3343 #ifdef CONFIG_RPS
3344         if (static_key_false(&rps_needed)) {
3345                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3346                 int cpu;
3347
3348                 preempt_disable();
3349                 rcu_read_lock();
3350
3351                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3352                 if (cpu < 0)
3353                         cpu = smp_processor_id();
3354
3355                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3356
3357                 rcu_read_unlock();
3358                 preempt_enable();
3359         } else
3360 #endif
3361         {
3362                 unsigned int qtail;
3363                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3364                 put_cpu();
3365         }
3366         return ret;
3367 }
3368
3369 /**
3370  *      netif_rx        -       post buffer to the network code
3371  *      @skb: buffer to post
3372  *
3373  *      This function receives a packet from a device driver and queues it for
3374  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3375  *      may be dropped during processing for congestion control or by the
3376  *      protocol layers.
3377  *
3378  *      return values:
3379  *      NET_RX_SUCCESS  (no congestion)
3380  *      NET_RX_DROP     (packet was dropped)
3381  *
3382  */
3383
3384 int netif_rx(struct sk_buff *skb)
3385 {
3386         trace_netif_rx_entry(skb);
3387
3388         return netif_rx_internal(skb);
3389 }
3390 EXPORT_SYMBOL(netif_rx);
3391
3392 int netif_rx_ni(struct sk_buff *skb)
3393 {
3394         int err;
3395
3396         trace_netif_rx_ni_entry(skb);
3397
3398         preempt_disable();
3399         err = netif_rx_internal(skb);
3400         if (local_softirq_pending())
3401                 do_softirq();
3402         preempt_enable();
3403
3404         return err;
3405 }
3406 EXPORT_SYMBOL(netif_rx_ni);
3407
3408 static void net_tx_action(struct softirq_action *h)
3409 {
3410         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3411
3412         if (sd->completion_queue) {
3413                 struct sk_buff *clist;
3414
3415                 local_irq_disable();
3416                 clist = sd->completion_queue;
3417                 sd->completion_queue = NULL;
3418                 local_irq_enable();
3419
3420                 while (clist) {
3421                         struct sk_buff *skb = clist;
3422                         clist = clist->next;
3423
3424                         WARN_ON(atomic_read(&skb->users));
3425                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3426                                 trace_consume_skb(skb);
3427                         else
3428                                 trace_kfree_skb(skb, net_tx_action);
3429                         __kfree_skb(skb);
3430                 }
3431         }
3432
3433         if (sd->output_queue) {
3434                 struct Qdisc *head;
3435
3436                 local_irq_disable();
3437                 head = sd->output_queue;
3438                 sd->output_queue = NULL;
3439                 sd->output_queue_tailp = &sd->output_queue;
3440                 local_irq_enable();
3441
3442                 while (head) {
3443                         struct Qdisc *q = head;
3444                         spinlock_t *root_lock;
3445
3446                         head = head->next_sched;
3447
3448                         root_lock = qdisc_lock(q);
3449                         if (spin_trylock(root_lock)) {
3450                                 smp_mb__before_atomic();
3451                                 clear_bit(__QDISC_STATE_SCHED,
3452                                           &q->state);
3453                                 qdisc_run(q);
3454                                 spin_unlock(root_lock);
3455                         } else {
3456                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3457                                               &q->state)) {
3458                                         __netif_reschedule(q);
3459                                 } else {
3460                                         smp_mb__before_atomic();
3461                                         clear_bit(__QDISC_STATE_SCHED,
3462                                                   &q->state);
3463                                 }
3464                         }
3465                 }
3466         }
3467 }
3468
3469 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3470     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3471 /* This hook is defined here for ATM LANE */
3472 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3473                              unsigned char *addr) __read_mostly;
3474 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3475 #endif
3476
3477 #ifdef CONFIG_NET_CLS_ACT
3478 /* TODO: Maybe we should just force sch_ingress to be compiled in
3479  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3480  * a compare and 2 stores extra right now if we dont have it on
3481  * but have CONFIG_NET_CLS_ACT
3482  * NOTE: This doesn't stop any functionality; if you dont have
3483  * the ingress scheduler, you just can't add policies on ingress.
3484  *
3485  */
3486 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3487 {
3488         struct net_device *dev = skb->dev;
3489         u32 ttl = G_TC_RTTL(skb->tc_verd);
3490         int result = TC_ACT_OK;
3491         struct Qdisc *q;
3492
3493         if (unlikely(MAX_RED_LOOP < ttl++)) {
3494                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3495                                      skb->skb_iif, dev->ifindex);
3496                 return TC_ACT_SHOT;
3497         }
3498
3499         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3500         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3501
3502         q = rcu_dereference(rxq->qdisc);
3503         if (q != &noop_qdisc) {
3504                 spin_lock(qdisc_lock(q));
3505                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3506                         result = qdisc_enqueue_root(skb, q);
3507                 spin_unlock(qdisc_lock(q));
3508         }
3509
3510         return result;
3511 }
3512
3513 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3514                                          struct packet_type **pt_prev,
3515                                          int *ret, struct net_device *orig_dev)
3516 {
3517         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3518
3519         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3520                 goto out;
3521
3522         if (*pt_prev) {
3523                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3524                 *pt_prev = NULL;
3525         }
3526
3527         switch (ing_filter(skb, rxq)) {
3528         case TC_ACT_SHOT:
3529         case TC_ACT_STOLEN:
3530                 kfree_skb(skb);
3531                 return NULL;
3532         }
3533
3534 out:
3535         skb->tc_verd = 0;
3536         return skb;
3537 }
3538 #endif
3539
3540 /**
3541  *      netdev_rx_handler_register - register receive handler
3542  *      @dev: device to register a handler for
3543  *      @rx_handler: receive handler to register
3544  *      @rx_handler_data: data pointer that is used by rx handler
3545  *
3546  *      Register a receive handler for a device. This handler will then be
3547  *      called from __netif_receive_skb. A negative errno code is returned
3548  *      on a failure.
3549  *
3550  *      The caller must hold the rtnl_mutex.
3551  *
3552  *      For a general description of rx_handler, see enum rx_handler_result.
3553  */
3554 int netdev_rx_handler_register(struct net_device *dev,
3555                                rx_handler_func_t *rx_handler,
3556                                void *rx_handler_data)
3557 {
3558         ASSERT_RTNL();
3559
3560         if (dev->rx_handler)
3561                 return -EBUSY;
3562
3563         /* Note: rx_handler_data must be set before rx_handler */
3564         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3565         rcu_assign_pointer(dev->rx_handler, rx_handler);
3566
3567         return 0;
3568 }
3569 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3570
3571 /**
3572  *      netdev_rx_handler_unregister - unregister receive handler
3573  *      @dev: device to unregister a handler from
3574  *
3575  *      Unregister a receive handler from a device.
3576  *
3577  *      The caller must hold the rtnl_mutex.
3578  */
3579 void netdev_rx_handler_unregister(struct net_device *dev)
3580 {
3581
3582         ASSERT_RTNL();
3583         RCU_INIT_POINTER(dev->rx_handler, NULL);
3584         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3585          * section has a guarantee to see a non NULL rx_handler_data
3586          * as well.
3587          */
3588         synchronize_net();
3589         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3590 }
3591 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3592
3593 /*
3594  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3595  * the special handling of PFMEMALLOC skbs.
3596  */
3597 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3598 {
3599         switch (skb->protocol) {
3600         case htons(ETH_P_ARP):
3601         case htons(ETH_P_IP):
3602         case htons(ETH_P_IPV6):
3603         case htons(ETH_P_8021Q):
3604         case htons(ETH_P_8021AD):
3605                 return true;
3606         default:
3607                 return false;
3608         }
3609 }
3610
3611 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3612 {
3613         struct packet_type *ptype, *pt_prev;
3614         rx_handler_func_t *rx_handler;
3615         struct net_device *orig_dev;
3616         bool deliver_exact = false;
3617         int ret = NET_RX_DROP;
3618         __be16 type;
3619
3620         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3621
3622         trace_netif_receive_skb(skb);
3623
3624         orig_dev = skb->dev;
3625
3626         skb_reset_network_header(skb);
3627         if (!skb_transport_header_was_set(skb))
3628                 skb_reset_transport_header(skb);
3629         skb_reset_mac_len(skb);
3630
3631         pt_prev = NULL;
3632
3633         rcu_read_lock();
3634
3635 another_round:
3636         skb->skb_iif = skb->dev->ifindex;
3637
3638         __this_cpu_inc(softnet_data.processed);
3639
3640         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3641             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3642                 skb = skb_vlan_untag(skb);
3643                 if (unlikely(!skb))
3644                         goto unlock;
3645         }
3646
3647 #ifdef CONFIG_NET_CLS_ACT
3648         if (skb->tc_verd & TC_NCLS) {
3649                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3650                 goto ncls;
3651         }
3652 #endif
3653
3654         if (pfmemalloc)
3655                 goto skip_taps;
3656
3657         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3658                 if (pt_prev)
3659                         ret = deliver_skb(skb, pt_prev, orig_dev);
3660                 pt_prev = ptype;
3661         }
3662
3663         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3664                 if (pt_prev)
3665                         ret = deliver_skb(skb, pt_prev, orig_dev);
3666                 pt_prev = ptype;
3667         }
3668
3669 skip_taps:
3670 #ifdef CONFIG_NET_CLS_ACT
3671         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3672         if (!skb)
3673                 goto unlock;
3674 ncls:
3675 #endif
3676
3677         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3678                 goto drop;
3679
3680         if (skb_vlan_tag_present(skb)) {
3681                 if (pt_prev) {
3682                         ret = deliver_skb(skb, pt_prev, orig_dev);
3683                         pt_prev = NULL;
3684                 }
3685                 if (vlan_do_receive(&skb))
3686                         goto another_round;
3687                 else if (unlikely(!skb))
3688                         goto unlock;
3689         }
3690
3691         rx_handler = rcu_dereference(skb->dev->rx_handler);
3692         if (rx_handler) {
3693                 if (pt_prev) {
3694                         ret = deliver_skb(skb, pt_prev, orig_dev);
3695                         pt_prev = NULL;
3696                 }
3697                 switch (rx_handler(&skb)) {
3698                 case RX_HANDLER_CONSUMED:
3699                         ret = NET_RX_SUCCESS;
3700                         goto unlock;
3701                 case RX_HANDLER_ANOTHER:
3702                         goto another_round;
3703                 case RX_HANDLER_EXACT:
3704                         deliver_exact = true;
3705                 case RX_HANDLER_PASS:
3706                         break;
3707                 default:
3708                         BUG();
3709                 }
3710         }
3711
3712         if (unlikely(skb_vlan_tag_present(skb))) {
3713                 if (skb_vlan_tag_get_id(skb))
3714                         skb->pkt_type = PACKET_OTHERHOST;
3715                 /* Note: we might in the future use prio bits
3716                  * and set skb->priority like in vlan_do_receive()
3717                  * For the time being, just ignore Priority Code Point
3718                  */
3719                 skb->vlan_tci = 0;
3720         }
3721
3722         type = skb->protocol;
3723
3724         /* deliver only exact match when indicated */
3725         if (likely(!deliver_exact)) {
3726                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3727                                        &ptype_base[ntohs(type) &
3728                                                    PTYPE_HASH_MASK]);
3729         }
3730
3731         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3732                                &orig_dev->ptype_specific);
3733
3734         if (unlikely(skb->dev != orig_dev)) {
3735                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3736                                        &skb->dev->ptype_specific);
3737         }
3738
3739         if (pt_prev) {
3740                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3741                         goto drop;
3742                 else
3743                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3744         } else {
3745 drop:
3746                 atomic_long_inc(&skb->dev->rx_dropped);
3747                 kfree_skb(skb);
3748                 /* Jamal, now you will not able to escape explaining
3749                  * me how you were going to use this. :-)
3750                  */
3751                 ret = NET_RX_DROP;
3752         }
3753
3754 unlock:
3755         rcu_read_unlock();
3756         return ret;
3757 }
3758
3759 static int __netif_receive_skb(struct sk_buff *skb)
3760 {
3761         int ret;
3762
3763         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3764                 unsigned long pflags = current->flags;
3765
3766                 /*
3767                  * PFMEMALLOC skbs are special, they should
3768                  * - be delivered to SOCK_MEMALLOC sockets only
3769                  * - stay away from userspace
3770                  * - have bounded memory usage
3771                  *
3772                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3773                  * context down to all allocation sites.
3774                  */
3775                 current->flags |= PF_MEMALLOC;
3776                 ret = __netif_receive_skb_core(skb, true);
3777                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3778         } else
3779                 ret = __netif_receive_skb_core(skb, false);
3780
3781         return ret;
3782 }
3783
3784 static int netif_receive_skb_internal(struct sk_buff *skb)
3785 {
3786         net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788         if (skb_defer_rx_timestamp(skb))
3789                 return NET_RX_SUCCESS;
3790
3791 #ifdef CONFIG_RPS
3792         if (static_key_false(&rps_needed)) {
3793                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3794                 int cpu, ret;
3795
3796                 rcu_read_lock();
3797
3798                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3799
3800                 if (cpu >= 0) {
3801                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802                         rcu_read_unlock();
3803                         return ret;
3804                 }
3805                 rcu_read_unlock();
3806         }
3807 #endif
3808         return __netif_receive_skb(skb);
3809 }
3810
3811 /**
3812  *      netif_receive_skb - process receive buffer from network
3813  *      @skb: buffer to process
3814  *
3815  *      netif_receive_skb() is the main receive data processing function.
3816  *      It always succeeds. The buffer may be dropped during processing
3817  *      for congestion control or by the protocol layers.
3818  *
3819  *      This function may only be called from softirq context and interrupts
3820  *      should be enabled.
3821  *
3822  *      Return values (usually ignored):
3823  *      NET_RX_SUCCESS: no congestion
3824  *      NET_RX_DROP: packet was dropped
3825  */
3826 int netif_receive_skb(struct sk_buff *skb)
3827 {
3828         trace_netif_receive_skb_entry(skb);
3829
3830         return netif_receive_skb_internal(skb);
3831 }
3832 EXPORT_SYMBOL(netif_receive_skb);
3833
3834 /* Network device is going away, flush any packets still pending
3835  * Called with irqs disabled.
3836  */
3837 static void flush_backlog(void *arg)
3838 {
3839         struct net_device *dev = arg;
3840         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3841         struct sk_buff *skb, *tmp;
3842
3843         rps_lock(sd);
3844         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3845                 if (skb->dev == dev) {
3846                         __skb_unlink(skb, &sd->input_pkt_queue);
3847                         kfree_skb(skb);
3848                         input_queue_head_incr(sd);
3849                 }
3850         }
3851         rps_unlock(sd);
3852
3853         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3854                 if (skb->dev == dev) {
3855                         __skb_unlink(skb, &sd->process_queue);
3856                         kfree_skb(skb);
3857                         input_queue_head_incr(sd);
3858                 }
3859         }
3860 }
3861
3862 static int napi_gro_complete(struct sk_buff *skb)
3863 {
3864         struct packet_offload *ptype;
3865         __be16 type = skb->protocol;
3866         struct list_head *head = &offload_base;
3867         int err = -ENOENT;
3868
3869         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3870
3871         if (NAPI_GRO_CB(skb)->count == 1) {
3872                 skb_shinfo(skb)->gso_size = 0;
3873                 goto out;
3874         }
3875
3876         rcu_read_lock();
3877         list_for_each_entry_rcu(ptype, head, list) {
3878                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3879                         continue;
3880
3881                 err = ptype->callbacks.gro_complete(skb, 0);
3882                 break;
3883         }
3884         rcu_read_unlock();
3885
3886         if (err) {
3887                 WARN_ON(&ptype->list == head);
3888                 kfree_skb(skb);
3889                 return NET_RX_SUCCESS;
3890         }
3891
3892 out:
3893         return netif_receive_skb_internal(skb);
3894 }
3895
3896 /* napi->gro_list contains packets ordered by age.
3897  * youngest packets at the head of it.
3898  * Complete skbs in reverse order to reduce latencies.
3899  */
3900 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3901 {
3902         struct sk_buff *skb, *prev = NULL;
3903
3904         /* scan list and build reverse chain */
3905         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3906                 skb->prev = prev;
3907                 prev = skb;
3908         }
3909
3910         for (skb = prev; skb; skb = prev) {
3911                 skb->next = NULL;
3912
3913                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3914                         return;
3915
3916                 prev = skb->prev;
3917                 napi_gro_complete(skb);
3918                 napi->gro_count--;
3919         }
3920
3921         napi->gro_list = NULL;
3922 }
3923 EXPORT_SYMBOL(napi_gro_flush);
3924
3925 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3926 {
3927         struct sk_buff *p;
3928         unsigned int maclen = skb->dev->hard_header_len;
3929         u32 hash = skb_get_hash_raw(skb);
3930
3931         for (p = napi->gro_list; p; p = p->next) {
3932                 unsigned long diffs;
3933
3934                 NAPI_GRO_CB(p)->flush = 0;
3935
3936                 if (hash != skb_get_hash_raw(p)) {
3937                         NAPI_GRO_CB(p)->same_flow = 0;
3938                         continue;
3939                 }
3940
3941                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3942                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3943                 if (maclen == ETH_HLEN)
3944                         diffs |= compare_ether_header(skb_mac_header(p),
3945                                                       skb_mac_header(skb));
3946                 else if (!diffs)
3947                         diffs = memcmp(skb_mac_header(p),
3948                                        skb_mac_header(skb),
3949                                        maclen);
3950                 NAPI_GRO_CB(p)->same_flow = !diffs;
3951         }
3952 }
3953
3954 static void skb_gro_reset_offset(struct sk_buff *skb)
3955 {
3956         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3957         const skb_frag_t *frag0 = &pinfo->frags[0];
3958
3959         NAPI_GRO_CB(skb)->data_offset = 0;
3960         NAPI_GRO_CB(skb)->frag0 = NULL;
3961         NAPI_GRO_CB(skb)->frag0_len = 0;
3962
3963         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3964             pinfo->nr_frags &&
3965             !PageHighMem(skb_frag_page(frag0))) {
3966                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3967                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3968         }
3969 }
3970
3971 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3972 {
3973         struct skb_shared_info *pinfo = skb_shinfo(skb);
3974
3975         BUG_ON(skb->end - skb->tail < grow);
3976
3977         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3978
3979         skb->data_len -= grow;
3980         skb->tail += grow;
3981
3982         pinfo->frags[0].page_offset += grow;
3983         skb_frag_size_sub(&pinfo->frags[0], grow);
3984
3985         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3986                 skb_frag_unref(skb, 0);
3987                 memmove(pinfo->frags, pinfo->frags + 1,
3988                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3989         }
3990 }
3991
3992 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3993 {
3994         struct sk_buff **pp = NULL;
3995         struct packet_offload *ptype;
3996         __be16 type = skb->protocol;
3997         struct list_head *head = &offload_base;
3998         int same_flow;
3999         enum gro_result ret;
4000         int grow;
4001
4002         if (!(skb->dev->features & NETIF_F_GRO))
4003                 goto normal;
4004
4005         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4006                 goto normal;
4007
4008         gro_list_prepare(napi, skb);
4009
4010         rcu_read_lock();
4011         list_for_each_entry_rcu(ptype, head, list) {
4012                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4013                         continue;
4014
4015                 skb_set_network_header(skb, skb_gro_offset(skb));
4016                 skb_reset_mac_len(skb);
4017                 NAPI_GRO_CB(skb)->same_flow = 0;
4018                 NAPI_GRO_CB(skb)->flush = 0;
4019                 NAPI_GRO_CB(skb)->free = 0;
4020                 NAPI_GRO_CB(skb)->udp_mark = 0;
4021
4022                 /* Setup for GRO checksum validation */
4023                 switch (skb->ip_summed) {
4024                 case CHECKSUM_COMPLETE:
4025                         NAPI_GRO_CB(skb)->csum = skb->csum;
4026                         NAPI_GRO_CB(skb)->csum_valid = 1;
4027                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4028                         break;
4029                 case CHECKSUM_UNNECESSARY:
4030                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4031                         NAPI_GRO_CB(skb)->csum_valid = 0;
4032                         break;
4033                 default:
4034                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4035                         NAPI_GRO_CB(skb)->csum_valid = 0;
4036                 }
4037
4038                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4039                 break;
4040         }
4041         rcu_read_unlock();
4042
4043         if (&ptype->list == head)
4044                 goto normal;
4045
4046         same_flow = NAPI_GRO_CB(skb)->same_flow;
4047         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4048
4049         if (pp) {
4050                 struct sk_buff *nskb = *pp;
4051
4052                 *pp = nskb->next;
4053                 nskb->next = NULL;
4054                 napi_gro_complete(nskb);
4055                 napi->gro_count--;
4056         }
4057
4058         if (same_flow)
4059                 goto ok;
4060
4061         if (NAPI_GRO_CB(skb)->flush)
4062                 goto normal;
4063
4064         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4065                 struct sk_buff *nskb = napi->gro_list;
4066
4067                 /* locate the end of the list to select the 'oldest' flow */
4068                 while (nskb->next) {
4069                         pp = &nskb->next;
4070                         nskb = *pp;
4071                 }
4072                 *pp = NULL;
4073                 nskb->next = NULL;
4074                 napi_gro_complete(nskb);
4075         } else {
4076                 napi->gro_count++;
4077         }
4078         NAPI_GRO_CB(skb)->count = 1;
4079         NAPI_GRO_CB(skb)->age = jiffies;
4080         NAPI_GRO_CB(skb)->last = skb;
4081         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4082         skb->next = napi->gro_list;
4083         napi->gro_list = skb;
4084         ret = GRO_HELD;
4085
4086 pull:
4087         grow = skb_gro_offset(skb) - skb_headlen(skb);
4088         if (grow > 0)
4089                 gro_pull_from_frag0(skb, grow);
4090 ok:
4091         return ret;
4092
4093 normal:
4094         ret = GRO_NORMAL;
4095         goto pull;
4096 }
4097
4098 struct packet_offload *gro_find_receive_by_type(__be16 type)
4099 {
4100         struct list_head *offload_head = &offload_base;
4101         struct packet_offload *ptype;
4102
4103         list_for_each_entry_rcu(ptype, offload_head, list) {
4104                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4105                         continue;
4106                 return ptype;
4107         }
4108         return NULL;
4109 }
4110 EXPORT_SYMBOL(gro_find_receive_by_type);
4111
4112 struct packet_offload *gro_find_complete_by_type(__be16 type)
4113 {
4114         struct list_head *offload_head = &offload_base;
4115         struct packet_offload *ptype;
4116
4117         list_for_each_entry_rcu(ptype, offload_head, list) {
4118                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4119                         continue;
4120                 return ptype;
4121         }
4122         return NULL;
4123 }
4124 EXPORT_SYMBOL(gro_find_complete_by_type);
4125
4126 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4127 {
4128         switch (ret) {
4129         case GRO_NORMAL:
4130                 if (netif_receive_skb_internal(skb))
4131                         ret = GRO_DROP;
4132                 break;
4133
4134         case GRO_DROP:
4135                 kfree_skb(skb);
4136                 break;
4137
4138         case GRO_MERGED_FREE:
4139                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4140                         kmem_cache_free(skbuff_head_cache, skb);
4141                 else
4142                         __kfree_skb(skb);
4143                 break;
4144
4145         case GRO_HELD:
4146         case GRO_MERGED:
4147                 break;
4148         }
4149
4150         return ret;
4151 }
4152
4153 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4154 {
4155         trace_napi_gro_receive_entry(skb);
4156
4157         skb_gro_reset_offset(skb);
4158
4159         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4160 }
4161 EXPORT_SYMBOL(napi_gro_receive);
4162
4163 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4164 {
4165         if (unlikely(skb->pfmemalloc)) {
4166                 consume_skb(skb);
4167                 return;
4168         }
4169         __skb_pull(skb, skb_headlen(skb));
4170         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4171         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4172         skb->vlan_tci = 0;
4173         skb->dev = napi->dev;
4174         skb->skb_iif = 0;
4175         skb->encapsulation = 0;
4176         skb_shinfo(skb)->gso_type = 0;
4177         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4178
4179         napi->skb = skb;
4180 }
4181
4182 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4183 {
4184         struct sk_buff *skb = napi->skb;
4185
4186         if (!skb) {
4187                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4188                 napi->skb = skb;
4189         }
4190         return skb;
4191 }
4192 EXPORT_SYMBOL(napi_get_frags);
4193
4194 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4195                                       struct sk_buff *skb,
4196                                       gro_result_t ret)
4197 {
4198         switch (ret) {
4199         case GRO_NORMAL:
4200         case GRO_HELD:
4201                 __skb_push(skb, ETH_HLEN);
4202                 skb->protocol = eth_type_trans(skb, skb->dev);
4203                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4204                         ret = GRO_DROP;
4205                 break;
4206
4207         case GRO_DROP:
4208         case GRO_MERGED_FREE:
4209                 napi_reuse_skb(napi, skb);
4210                 break;
4211
4212         case GRO_MERGED:
4213                 break;
4214         }
4215
4216         return ret;
4217 }
4218
4219 /* Upper GRO stack assumes network header starts at gro_offset=0
4220  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4221  * We copy ethernet header into skb->data to have a common layout.
4222  */
4223 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4224 {
4225         struct sk_buff *skb = napi->skb;
4226         const struct ethhdr *eth;
4227         unsigned int hlen = sizeof(*eth);
4228
4229         napi->skb = NULL;
4230
4231         skb_reset_mac_header(skb);
4232         skb_gro_reset_offset(skb);
4233
4234         eth = skb_gro_header_fast(skb, 0);
4235         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4236                 eth = skb_gro_header_slow(skb, hlen, 0);
4237                 if (unlikely(!eth)) {
4238                         napi_reuse_skb(napi, skb);
4239                         return NULL;
4240                 }
4241         } else {
4242                 gro_pull_from_frag0(skb, hlen);
4243                 NAPI_GRO_CB(skb)->frag0 += hlen;
4244                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4245         }
4246         __skb_pull(skb, hlen);
4247
4248         /*
4249          * This works because the only protocols we care about don't require
4250          * special handling.
4251          * We'll fix it up properly in napi_frags_finish()
4252          */
4253         skb->protocol = eth->h_proto;
4254
4255         return skb;
4256 }
4257
4258 gro_result_t napi_gro_frags(struct napi_struct *napi)
4259 {
4260         struct sk_buff *skb = napi_frags_skb(napi);
4261
4262         if (!skb)
4263                 return GRO_DROP;
4264
4265         trace_napi_gro_frags_entry(skb);
4266
4267         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4268 }
4269 EXPORT_SYMBOL(napi_gro_frags);
4270
4271 /* Compute the checksum from gro_offset and return the folded value
4272  * after adding in any pseudo checksum.
4273  */
4274 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4275 {
4276         __wsum wsum;
4277         __sum16 sum;
4278
4279         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4280
4281         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4282         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4283         if (likely(!sum)) {
4284                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4285                     !skb->csum_complete_sw)
4286                         netdev_rx_csum_fault(skb->dev);
4287         }
4288
4289         NAPI_GRO_CB(skb)->csum = wsum;
4290         NAPI_GRO_CB(skb)->csum_valid = 1;
4291
4292         return sum;
4293 }
4294 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4295
4296 /*
4297  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4298  * Note: called with local irq disabled, but exits with local irq enabled.
4299  */
4300 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4301 {
4302 #ifdef CONFIG_RPS
4303         struct softnet_data *remsd = sd->rps_ipi_list;
4304
4305         if (remsd) {
4306                 sd->rps_ipi_list = NULL;
4307
4308                 local_irq_enable();
4309
4310                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4311                 while (remsd) {
4312                         struct softnet_data *next = remsd->rps_ipi_next;
4313
4314                         if (cpu_online(remsd->cpu))
4315                                 smp_call_function_single_async(remsd->cpu,
4316                                                            &remsd->csd);
4317                         remsd = next;
4318                 }
4319         } else
4320 #endif
4321                 local_irq_enable();
4322 }
4323
4324 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4325 {
4326 #ifdef CONFIG_RPS
4327         return sd->rps_ipi_list != NULL;
4328 #else
4329         return false;
4330 #endif
4331 }
4332
4333 static int process_backlog(struct napi_struct *napi, int quota)
4334 {
4335         int work = 0;
4336         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4337
4338         /* Check if we have pending ipi, its better to send them now,
4339          * not waiting net_rx_action() end.
4340          */
4341         if (sd_has_rps_ipi_waiting(sd)) {
4342                 local_irq_disable();
4343                 net_rps_action_and_irq_enable(sd);
4344         }
4345
4346         napi->weight = weight_p;
4347         local_irq_disable();
4348         while (1) {
4349                 struct sk_buff *skb;
4350
4351                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4352                         local_irq_enable();
4353                         __netif_receive_skb(skb);
4354                         local_irq_disable();
4355                         input_queue_head_incr(sd);
4356                         if (++work >= quota) {
4357                                 local_irq_enable();
4358                                 return work;
4359                         }
4360                 }
4361
4362                 rps_lock(sd);
4363                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4364                         /*
4365                          * Inline a custom version of __napi_complete().
4366                          * only current cpu owns and manipulates this napi,
4367                          * and NAPI_STATE_SCHED is the only possible flag set
4368                          * on backlog.
4369                          * We can use a plain write instead of clear_bit(),
4370                          * and we dont need an smp_mb() memory barrier.
4371                          */
4372                         napi->state = 0;
4373                         rps_unlock(sd);
4374
4375                         break;
4376                 }
4377
4378                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4379                                            &sd->process_queue);
4380                 rps_unlock(sd);
4381         }
4382         local_irq_enable();
4383
4384         return work;
4385 }
4386
4387 /**
4388  * __napi_schedule - schedule for receive
4389  * @n: entry to schedule
4390  *
4391  * The entry's receive function will be scheduled to run.
4392  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4393  */
4394 void __napi_schedule(struct napi_struct *n)
4395 {
4396         unsigned long flags;
4397
4398         local_irq_save(flags);
4399         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4400         local_irq_restore(flags);
4401 }
4402 EXPORT_SYMBOL(__napi_schedule);
4403
4404 /**
4405  * __napi_schedule_irqoff - schedule for receive
4406  * @n: entry to schedule
4407  *
4408  * Variant of __napi_schedule() assuming hard irqs are masked
4409  */
4410 void __napi_schedule_irqoff(struct napi_struct *n)
4411 {
4412         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4413 }
4414 EXPORT_SYMBOL(__napi_schedule_irqoff);
4415
4416 void __napi_complete(struct napi_struct *n)
4417 {
4418         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4419
4420         list_del_init(&n->poll_list);
4421         smp_mb__before_atomic();
4422         clear_bit(NAPI_STATE_SCHED, &n->state);
4423 }
4424 EXPORT_SYMBOL(__napi_complete);
4425
4426 void napi_complete_done(struct napi_struct *n, int work_done)
4427 {
4428         unsigned long flags;
4429
4430         /*
4431          * don't let napi dequeue from the cpu poll list
4432          * just in case its running on a different cpu
4433          */
4434         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4435                 return;
4436
4437         if (n->gro_list) {
4438                 unsigned long timeout = 0;
4439
4440                 if (work_done)
4441                         timeout = n->dev->gro_flush_timeout;
4442
4443                 if (timeout)
4444                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4445                                       HRTIMER_MODE_REL_PINNED);
4446                 else
4447                         napi_gro_flush(n, false);
4448         }
4449         if (likely(list_empty(&n->poll_list))) {
4450                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4451         } else {
4452                 /* If n->poll_list is not empty, we need to mask irqs */
4453                 local_irq_save(flags);
4454                 __napi_complete(n);
4455                 local_irq_restore(flags);
4456         }
4457 }
4458 EXPORT_SYMBOL(napi_complete_done);
4459
4460 /* must be called under rcu_read_lock(), as we dont take a reference */
4461 struct napi_struct *napi_by_id(unsigned int napi_id)
4462 {
4463         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4464         struct napi_struct *napi;
4465
4466         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4467                 if (napi->napi_id == napi_id)
4468                         return napi;
4469
4470         return NULL;
4471 }
4472 EXPORT_SYMBOL_GPL(napi_by_id);
4473
4474 void napi_hash_add(struct napi_struct *napi)
4475 {
4476         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4477
4478                 spin_lock(&napi_hash_lock);
4479
4480                 /* 0 is not a valid id, we also skip an id that is taken
4481                  * we expect both events to be extremely rare
4482                  */
4483                 napi->napi_id = 0;
4484                 while (!napi->napi_id) {
4485                         napi->napi_id = ++napi_gen_id;
4486                         if (napi_by_id(napi->napi_id))
4487                                 napi->napi_id = 0;
4488                 }
4489
4490                 hlist_add_head_rcu(&napi->napi_hash_node,
4491                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4492
4493                 spin_unlock(&napi_hash_lock);
4494         }
4495 }
4496 EXPORT_SYMBOL_GPL(napi_hash_add);
4497
4498 /* Warning : caller is responsible to make sure rcu grace period
4499  * is respected before freeing memory containing @napi
4500  */
4501 void napi_hash_del(struct napi_struct *napi)
4502 {
4503         spin_lock(&napi_hash_lock);
4504
4505         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4506                 hlist_del_rcu(&napi->napi_hash_node);
4507
4508         spin_unlock(&napi_hash_lock);
4509 }
4510 EXPORT_SYMBOL_GPL(napi_hash_del);
4511
4512 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4513 {
4514         struct napi_struct *napi;
4515
4516         napi = container_of(timer, struct napi_struct, timer);
4517         if (napi->gro_list)
4518                 napi_schedule(napi);
4519
4520         return HRTIMER_NORESTART;
4521 }
4522
4523 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4524                     int (*poll)(struct napi_struct *, int), int weight)
4525 {
4526         INIT_LIST_HEAD(&napi->poll_list);
4527         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4528         napi->timer.function = napi_watchdog;
4529         napi->gro_count = 0;
4530         napi->gro_list = NULL;
4531         napi->skb = NULL;
4532         napi->poll = poll;
4533         if (weight > NAPI_POLL_WEIGHT)
4534                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4535                             weight, dev->name);
4536         napi->weight = weight;
4537         list_add(&napi->dev_list, &dev->napi_list);
4538         napi->dev = dev;
4539 #ifdef CONFIG_NETPOLL
4540         spin_lock_init(&napi->poll_lock);
4541         napi->poll_owner = -1;
4542 #endif
4543         set_bit(NAPI_STATE_SCHED, &napi->state);
4544 }
4545 EXPORT_SYMBOL(netif_napi_add);
4546
4547 void napi_disable(struct napi_struct *n)
4548 {
4549         might_sleep();
4550         set_bit(NAPI_STATE_DISABLE, &n->state);
4551
4552         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4553                 msleep(1);
4554
4555         hrtimer_cancel(&n->timer);
4556
4557         clear_bit(NAPI_STATE_DISABLE, &n->state);
4558 }
4559 EXPORT_SYMBOL(napi_disable);
4560
4561 void netif_napi_del(struct napi_struct *napi)
4562 {
4563         list_del_init(&napi->dev_list);
4564         napi_free_frags(napi);
4565
4566         kfree_skb_list(napi->gro_list);
4567         napi->gro_list = NULL;
4568         napi->gro_count = 0;
4569 }
4570 EXPORT_SYMBOL(netif_napi_del);
4571
4572 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4573 {
4574         void *have;
4575         int work, weight;
4576
4577         list_del_init(&n->poll_list);
4578
4579         have = netpoll_poll_lock(n);
4580
4581         weight = n->weight;
4582
4583         /* This NAPI_STATE_SCHED test is for avoiding a race
4584          * with netpoll's poll_napi().  Only the entity which
4585          * obtains the lock and sees NAPI_STATE_SCHED set will
4586          * actually make the ->poll() call.  Therefore we avoid
4587          * accidentally calling ->poll() when NAPI is not scheduled.
4588          */
4589         work = 0;
4590         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4591                 work = n->poll(n, weight);
4592                 trace_napi_poll(n);
4593         }
4594
4595         WARN_ON_ONCE(work > weight);
4596
4597         if (likely(work < weight))
4598                 goto out_unlock;
4599
4600         /* Drivers must not modify the NAPI state if they
4601          * consume the entire weight.  In such cases this code
4602          * still "owns" the NAPI instance and therefore can
4603          * move the instance around on the list at-will.
4604          */
4605         if (unlikely(napi_disable_pending(n))) {
4606                 napi_complete(n);
4607                 goto out_unlock;
4608         }
4609
4610         if (n->gro_list) {
4611                 /* flush too old packets
4612                  * If HZ < 1000, flush all packets.
4613                  */
4614                 napi_gro_flush(n, HZ >= 1000);
4615         }
4616
4617         /* Some drivers may have called napi_schedule
4618          * prior to exhausting their budget.
4619          */
4620         if (unlikely(!list_empty(&n->poll_list))) {
4621                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4622                              n->dev ? n->dev->name : "backlog");
4623                 goto out_unlock;
4624         }
4625
4626         list_add_tail(&n->poll_list, repoll);
4627
4628 out_unlock:
4629         netpoll_poll_unlock(have);
4630
4631         return work;
4632 }
4633
4634 static void net_rx_action(struct softirq_action *h)
4635 {
4636         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4637         unsigned long time_limit = jiffies + 2;
4638         int budget = netdev_budget;
4639         LIST_HEAD(list);
4640         LIST_HEAD(repoll);
4641
4642         local_irq_disable();
4643         list_splice_init(&sd->poll_list, &list);
4644         local_irq_enable();
4645
4646         for (;;) {
4647                 struct napi_struct *n;
4648
4649                 if (list_empty(&list)) {
4650                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4651                                 return;
4652                         break;
4653                 }
4654
4655                 n = list_first_entry(&list, struct napi_struct, poll_list);
4656                 budget -= napi_poll(n, &repoll);
4657
4658                 /* If softirq window is exhausted then punt.
4659                  * Allow this to run for 2 jiffies since which will allow
4660                  * an average latency of 1.5/HZ.
4661                  */
4662                 if (unlikely(budget <= 0 ||
4663                              time_after_eq(jiffies, time_limit))) {
4664                         sd->time_squeeze++;
4665                         break;
4666                 }
4667         }
4668
4669         local_irq_disable();
4670
4671         list_splice_tail_init(&sd->poll_list, &list);
4672         list_splice_tail(&repoll, &list);
4673         list_splice(&list, &sd->poll_list);
4674         if (!list_empty(&sd->poll_list))
4675                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4676
4677         net_rps_action_and_irq_enable(sd);
4678 }
4679
4680 struct netdev_adjacent {
4681         struct net_device *dev;
4682
4683         /* upper master flag, there can only be one master device per list */
4684         bool master;
4685
4686         /* counter for the number of times this device was added to us */
4687         u16 ref_nr;
4688
4689         /* private field for the users */
4690         void *private;
4691
4692         struct list_head list;
4693         struct rcu_head rcu;
4694 };
4695
4696 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4697                                                  struct net_device *adj_dev,
4698                                                  struct list_head *adj_list)
4699 {
4700         struct netdev_adjacent *adj;
4701
4702         list_for_each_entry(adj, adj_list, list) {
4703                 if (adj->dev == adj_dev)
4704                         return adj;
4705         }
4706         return NULL;
4707 }
4708
4709 /**
4710  * netdev_has_upper_dev - Check if device is linked to an upper device
4711  * @dev: device
4712  * @upper_dev: upper device to check
4713  *
4714  * Find out if a device is linked to specified upper device and return true
4715  * in case it is. Note that this checks only immediate upper device,
4716  * not through a complete stack of devices. The caller must hold the RTNL lock.
4717  */
4718 bool netdev_has_upper_dev(struct net_device *dev,
4719                           struct net_device *upper_dev)
4720 {
4721         ASSERT_RTNL();
4722
4723         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4724 }
4725 EXPORT_SYMBOL(netdev_has_upper_dev);
4726
4727 /**
4728  * netdev_has_any_upper_dev - Check if device is linked to some device
4729  * @dev: device
4730  *
4731  * Find out if a device is linked to an upper device and return true in case
4732  * it is. The caller must hold the RTNL lock.
4733  */
4734 static bool netdev_has_any_upper_dev(struct net_device *dev)
4735 {
4736         ASSERT_RTNL();
4737
4738         return !list_empty(&dev->all_adj_list.upper);
4739 }
4740
4741 /**
4742  * netdev_master_upper_dev_get - Get master upper device
4743  * @dev: device
4744  *
4745  * Find a master upper device and return pointer to it or NULL in case
4746  * it's not there. The caller must hold the RTNL lock.
4747  */
4748 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4749 {
4750         struct netdev_adjacent *upper;
4751
4752         ASSERT_RTNL();
4753
4754         if (list_empty(&dev->adj_list.upper))
4755                 return NULL;
4756
4757         upper = list_first_entry(&dev->adj_list.upper,
4758                                  struct netdev_adjacent, list);
4759         if (likely(upper->master))
4760                 return upper->dev;
4761         return NULL;
4762 }
4763 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4764
4765 void *netdev_adjacent_get_private(struct list_head *adj_list)
4766 {
4767         struct netdev_adjacent *adj;
4768
4769         adj = list_entry(adj_list, struct netdev_adjacent, list);
4770
4771         return adj->private;
4772 }
4773 EXPORT_SYMBOL(netdev_adjacent_get_private);
4774
4775 /**
4776  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4777  * @dev: device
4778  * @iter: list_head ** of the current position
4779  *
4780  * Gets the next device from the dev's upper list, starting from iter
4781  * position. The caller must hold RCU read lock.
4782  */
4783 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4784                                                  struct list_head **iter)
4785 {
4786         struct netdev_adjacent *upper;
4787
4788         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4789
4790         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4791
4792         if (&upper->list == &dev->adj_list.upper)
4793                 return NULL;
4794
4795         *iter = &upper->list;
4796
4797         return upper->dev;
4798 }
4799 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4800
4801 /**
4802  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4803  * @dev: device
4804  * @iter: list_head ** of the current position
4805  *
4806  * Gets the next device from the dev's upper list, starting from iter
4807  * position. The caller must hold RCU read lock.
4808  */
4809 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4810                                                      struct list_head **iter)
4811 {
4812         struct netdev_adjacent *upper;
4813
4814         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4815
4816         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4817
4818         if (&upper->list == &dev->all_adj_list.upper)
4819                 return NULL;
4820
4821         *iter = &upper->list;
4822
4823         return upper->dev;
4824 }
4825 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4826
4827 /**
4828  * netdev_lower_get_next_private - Get the next ->private from the
4829  *                                 lower neighbour list
4830  * @dev: device
4831  * @iter: list_head ** of the current position
4832  *
4833  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4834  * list, starting from iter position. The caller must hold either hold the
4835  * RTNL lock or its own locking that guarantees that the neighbour lower
4836  * list will remain unchainged.
4837  */
4838 void *netdev_lower_get_next_private(struct net_device *dev,
4839                                     struct list_head **iter)
4840 {
4841         struct netdev_adjacent *lower;
4842
4843         lower = list_entry(*iter, struct netdev_adjacent, list);
4844
4845         if (&lower->list == &dev->adj_list.lower)
4846                 return NULL;
4847
4848         *iter = lower->list.next;
4849
4850         return lower->private;
4851 }
4852 EXPORT_SYMBOL(netdev_lower_get_next_private);
4853
4854 /**
4855  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4856  *                                     lower neighbour list, RCU
4857  *                                     variant
4858  * @dev: device
4859  * @iter: list_head ** of the current position
4860  *
4861  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4862  * list, starting from iter position. The caller must hold RCU read lock.
4863  */
4864 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4865                                         struct list_head **iter)
4866 {
4867         struct netdev_adjacent *lower;
4868
4869         WARN_ON_ONCE(!rcu_read_lock_held());
4870
4871         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4872
4873         if (&lower->list == &dev->adj_list.lower)
4874                 return NULL;
4875
4876         *iter = &lower->list;
4877
4878         return lower->private;
4879 }
4880 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4881
4882 /**
4883  * netdev_lower_get_next - Get the next device from the lower neighbour
4884  *                         list
4885  * @dev: device
4886  * @iter: list_head ** of the current position
4887  *
4888  * Gets the next netdev_adjacent from the dev's lower neighbour
4889  * list, starting from iter position. The caller must hold RTNL lock or
4890  * its own locking that guarantees that the neighbour lower
4891  * list will remain unchainged.
4892  */
4893 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4894 {
4895         struct netdev_adjacent *lower;
4896
4897         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4898
4899         if (&lower->list == &dev->adj_list.lower)
4900                 return NULL;
4901
4902         *iter = &lower->list;
4903
4904         return lower->dev;
4905 }
4906 EXPORT_SYMBOL(netdev_lower_get_next);
4907
4908 /**
4909  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4910  *                                     lower neighbour list, RCU
4911  *                                     variant
4912  * @dev: device
4913  *
4914  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4915  * list. The caller must hold RCU read lock.
4916  */
4917 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4918 {
4919         struct netdev_adjacent *lower;
4920
4921         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4922                         struct netdev_adjacent, list);
4923         if (lower)
4924                 return lower->private;
4925         return NULL;
4926 }
4927 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4928
4929 /**
4930  * netdev_master_upper_dev_get_rcu - Get master upper device
4931  * @dev: device
4932  *
4933  * Find a master upper device and return pointer to it or NULL in case
4934  * it's not there. The caller must hold the RCU read lock.
4935  */
4936 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4937 {
4938         struct netdev_adjacent *upper;
4939
4940         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4941                                        struct netdev_adjacent, list);
4942         if (upper && likely(upper->master))
4943                 return upper->dev;
4944         return NULL;
4945 }
4946 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4947
4948 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4949                               struct net_device *adj_dev,
4950                               struct list_head *dev_list)
4951 {
4952         char linkname[IFNAMSIZ+7];
4953         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4954                 "upper_%s" : "lower_%s", adj_dev->name);
4955         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4956                                  linkname);
4957 }
4958 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4959                                char *name,
4960                                struct list_head *dev_list)
4961 {
4962         char linkname[IFNAMSIZ+7];
4963         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4964                 "upper_%s" : "lower_%s", name);
4965         sysfs_remove_link(&(dev->dev.kobj), linkname);
4966 }
4967
4968 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4969                                                  struct net_device *adj_dev,
4970                                                  struct list_head *dev_list)
4971 {
4972         return (dev_list == &dev->adj_list.upper ||
4973                 dev_list == &dev->adj_list.lower) &&
4974                 net_eq(dev_net(dev), dev_net(adj_dev));
4975 }
4976
4977 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4978                                         struct net_device *adj_dev,
4979                                         struct list_head *dev_list,
4980                                         void *private, bool master)
4981 {
4982         struct netdev_adjacent *adj;
4983         int ret;
4984
4985         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4986
4987         if (adj) {
4988                 adj->ref_nr++;
4989                 return 0;
4990         }
4991
4992         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4993         if (!adj)
4994                 return -ENOMEM;
4995
4996         adj->dev = adj_dev;
4997         adj->master = master;
4998         adj->ref_nr = 1;
4999         adj->private = private;
5000         dev_hold(adj_dev);
5001
5002         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5003                  adj_dev->name, dev->name, adj_dev->name);
5004
5005         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5006                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5007                 if (ret)
5008                         goto free_adj;
5009         }
5010
5011         /* Ensure that master link is always the first item in list. */
5012         if (master) {
5013                 ret = sysfs_create_link(&(dev->dev.kobj),
5014                                         &(adj_dev->dev.kobj), "master");
5015                 if (ret)
5016                         goto remove_symlinks;
5017
5018                 list_add_rcu(&adj->list, dev_list);
5019         } else {
5020                 list_add_tail_rcu(&adj->list, dev_list);
5021         }
5022
5023         return 0;
5024
5025 remove_symlinks:
5026         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5027                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5028 free_adj:
5029         kfree(adj);
5030         dev_put(adj_dev);
5031
5032         return ret;
5033 }
5034
5035 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5036                                          struct net_device *adj_dev,
5037                                          struct list_head *dev_list)
5038 {
5039         struct netdev_adjacent *adj;
5040
5041         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5042
5043         if (!adj) {
5044                 pr_err("tried to remove device %s from %s\n",
5045                        dev->name, adj_dev->name);
5046                 BUG();
5047         }
5048
5049         if (adj->ref_nr > 1) {
5050                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5051                          adj->ref_nr-1);
5052                 adj->ref_nr--;
5053                 return;
5054         }
5055
5056         if (adj->master)
5057                 sysfs_remove_link(&(dev->dev.kobj), "master");
5058
5059         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5060                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5061
5062         list_del_rcu(&adj->list);
5063         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5064                  adj_dev->name, dev->name, adj_dev->name);
5065         dev_put(adj_dev);
5066         kfree_rcu(adj, rcu);
5067 }
5068
5069 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5070                                             struct net_device *upper_dev,
5071                                             struct list_head *up_list,
5072                                             struct list_head *down_list,
5073                                             void *private, bool master)
5074 {
5075         int ret;
5076
5077         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5078                                            master);
5079         if (ret)
5080                 return ret;
5081
5082         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5083                                            false);
5084         if (ret) {
5085                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5086                 return ret;
5087         }
5088
5089         return 0;
5090 }
5091
5092 static int __netdev_adjacent_dev_link(struct net_device *dev,
5093                                       struct net_device *upper_dev)
5094 {
5095         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5096                                                 &dev->all_adj_list.upper,
5097                                                 &upper_dev->all_adj_list.lower,
5098                                                 NULL, false);
5099 }
5100
5101 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5102                                                struct net_device *upper_dev,
5103                                                struct list_head *up_list,
5104                                                struct list_head *down_list)
5105 {
5106         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5107         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5108 }
5109
5110 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5111                                          struct net_device *upper_dev)
5112 {
5113         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5114                                            &dev->all_adj_list.upper,
5115                                            &upper_dev->all_adj_list.lower);
5116 }
5117
5118 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5119                                                 struct net_device *upper_dev,
5120                                                 void *private, bool master)
5121 {
5122         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5123
5124         if (ret)
5125                 return ret;
5126
5127         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5128                                                &dev->adj_list.upper,
5129                                                &upper_dev->adj_list.lower,
5130                                                private, master);
5131         if (ret) {
5132                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5133                 return ret;
5134         }
5135
5136         return 0;
5137 }
5138
5139 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5140                                                    struct net_device *upper_dev)
5141 {
5142         __netdev_adjacent_dev_unlink(dev, upper_dev);
5143         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5144                                            &dev->adj_list.upper,
5145                                            &upper_dev->adj_list.lower);
5146 }
5147
5148 static int __netdev_upper_dev_link(struct net_device *dev,
5149                                    struct net_device *upper_dev, bool master,
5150                                    void *private)
5151 {
5152         struct netdev_adjacent *i, *j, *to_i, *to_j;
5153         int ret = 0;
5154
5155         ASSERT_RTNL();
5156
5157         if (dev == upper_dev)
5158                 return -EBUSY;
5159
5160         /* To prevent loops, check if dev is not upper device to upper_dev. */
5161         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5162                 return -EBUSY;
5163
5164         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5165                 return -EEXIST;
5166
5167         if (master && netdev_master_upper_dev_get(dev))
5168                 return -EBUSY;
5169
5170         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5171                                                    master);
5172         if (ret)
5173                 return ret;
5174
5175         /* Now that we linked these devs, make all the upper_dev's
5176          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5177          * versa, and don't forget the devices itself. All of these
5178          * links are non-neighbours.
5179          */
5180         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5181                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5182                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5183                                  i->dev->name, j->dev->name);
5184                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5185                         if (ret)
5186                                 goto rollback_mesh;
5187                 }
5188         }
5189
5190         /* add dev to every upper_dev's upper device */
5191         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5192                 pr_debug("linking %s's upper device %s with %s\n",
5193                          upper_dev->name, i->dev->name, dev->name);
5194                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5195                 if (ret)
5196                         goto rollback_upper_mesh;
5197         }
5198
5199         /* add upper_dev to every dev's lower device */
5200         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5201                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5202                          i->dev->name, upper_dev->name);
5203                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5204                 if (ret)
5205                         goto rollback_lower_mesh;
5206         }
5207
5208         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5209         return 0;
5210
5211 rollback_lower_mesh:
5212         to_i = i;
5213         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5214                 if (i == to_i)
5215                         break;
5216                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5217         }
5218
5219         i = NULL;
5220
5221 rollback_upper_mesh:
5222         to_i = i;
5223         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5224                 if (i == to_i)
5225                         break;
5226                 __netdev_adjacent_dev_unlink(dev, i->dev);
5227         }
5228
5229         i = j = NULL;
5230
5231 rollback_mesh:
5232         to_i = i;
5233         to_j = j;
5234         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5235                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5236                         if (i == to_i && j == to_j)
5237                                 break;
5238                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5239                 }
5240                 if (i == to_i)
5241                         break;
5242         }
5243
5244         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5245
5246         return ret;
5247 }
5248
5249 /**
5250  * netdev_upper_dev_link - Add a link to the upper device
5251  * @dev: device
5252  * @upper_dev: new upper device
5253  *
5254  * Adds a link to device which is upper to this one. The caller must hold
5255  * the RTNL lock. On a failure a negative errno code is returned.
5256  * On success the reference counts are adjusted and the function
5257  * returns zero.
5258  */
5259 int netdev_upper_dev_link(struct net_device *dev,
5260                           struct net_device *upper_dev)
5261 {
5262         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5263 }
5264 EXPORT_SYMBOL(netdev_upper_dev_link);
5265
5266 /**
5267  * netdev_master_upper_dev_link - Add a master link to the upper device
5268  * @dev: device
5269  * @upper_dev: new upper device
5270  *
5271  * Adds a link to device which is upper to this one. In this case, only
5272  * one master upper device can be linked, although other non-master devices
5273  * might be linked as well. The caller must hold the RTNL lock.
5274  * On a failure a negative errno code is returned. On success the reference
5275  * counts are adjusted and the function returns zero.
5276  */
5277 int netdev_master_upper_dev_link(struct net_device *dev,
5278                                  struct net_device *upper_dev)
5279 {
5280         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5281 }
5282 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5283
5284 int netdev_master_upper_dev_link_private(struct net_device *dev,
5285                                          struct net_device *upper_dev,
5286                                          void *private)
5287 {
5288         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5289 }
5290 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5291
5292 /**
5293  * netdev_upper_dev_unlink - Removes a link to upper device
5294  * @dev: device
5295  * @upper_dev: new upper device
5296  *
5297  * Removes a link to device which is upper to this one. The caller must hold
5298  * the RTNL lock.
5299  */
5300 void netdev_upper_dev_unlink(struct net_device *dev,
5301                              struct net_device *upper_dev)
5302 {
5303         struct netdev_adjacent *i, *j;
5304         ASSERT_RTNL();
5305
5306         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5307
5308         /* Here is the tricky part. We must remove all dev's lower
5309          * devices from all upper_dev's upper devices and vice
5310          * versa, to maintain the graph relationship.
5311          */
5312         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5313                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5314                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5315
5316         /* remove also the devices itself from lower/upper device
5317          * list
5318          */
5319         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5320                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5321
5322         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5323                 __netdev_adjacent_dev_unlink(dev, i->dev);
5324
5325         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5326 }
5327 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5328
5329 /**
5330  * netdev_bonding_info_change - Dispatch event about slave change
5331  * @dev: device
5332  * @netdev_bonding_info: info to dispatch
5333  *
5334  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5335  * The caller must hold the RTNL lock.
5336  */
5337 void netdev_bonding_info_change(struct net_device *dev,
5338                                 struct netdev_bonding_info *bonding_info)
5339 {
5340         struct netdev_notifier_bonding_info     info;
5341
5342         memcpy(&info.bonding_info, bonding_info,
5343                sizeof(struct netdev_bonding_info));
5344         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5345                                       &info.info);
5346 }
5347 EXPORT_SYMBOL(netdev_bonding_info_change);
5348
5349 static void netdev_adjacent_add_links(struct net_device *dev)
5350 {
5351         struct netdev_adjacent *iter;
5352
5353         struct net *net = dev_net(dev);
5354
5355         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5356                 if (!net_eq(net,dev_net(iter->dev)))
5357                         continue;
5358                 netdev_adjacent_sysfs_add(iter->dev, dev,
5359                                           &iter->dev->adj_list.lower);
5360                 netdev_adjacent_sysfs_add(dev, iter->dev,
5361                                           &dev->adj_list.upper);
5362         }
5363
5364         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5365                 if (!net_eq(net,dev_net(iter->dev)))
5366                         continue;
5367                 netdev_adjacent_sysfs_add(iter->dev, dev,
5368                                           &iter->dev->adj_list.upper);
5369                 netdev_adjacent_sysfs_add(dev, iter->dev,
5370                                           &dev->adj_list.lower);
5371         }
5372 }
5373
5374 static void netdev_adjacent_del_links(struct net_device *dev)
5375 {
5376         struct netdev_adjacent *iter;
5377
5378         struct net *net = dev_net(dev);
5379
5380         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5381                 if (!net_eq(net,dev_net(iter->dev)))
5382                         continue;
5383                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5384                                           &iter->dev->adj_list.lower);
5385                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5386                                           &dev->adj_list.upper);
5387         }
5388
5389         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5390                 if (!net_eq(net,dev_net(iter->dev)))
5391                         continue;
5392                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5393                                           &iter->dev->adj_list.upper);
5394                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5395                                           &dev->adj_list.lower);
5396         }
5397 }
5398
5399 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5400 {
5401         struct netdev_adjacent *iter;
5402
5403         struct net *net = dev_net(dev);
5404
5405         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5406                 if (!net_eq(net,dev_net(iter->dev)))
5407                         continue;
5408                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5409                                           &iter->dev->adj_list.lower);
5410                 netdev_adjacent_sysfs_add(iter->dev, dev,
5411                                           &iter->dev->adj_list.lower);
5412         }
5413
5414         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5415                 if (!net_eq(net,dev_net(iter->dev)))
5416                         continue;
5417                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5418                                           &iter->dev->adj_list.upper);
5419                 netdev_adjacent_sysfs_add(iter->dev, dev,
5420                                           &iter->dev->adj_list.upper);
5421         }
5422 }
5423
5424 void *netdev_lower_dev_get_private(struct net_device *dev,
5425                                    struct net_device *lower_dev)
5426 {
5427         struct netdev_adjacent *lower;
5428
5429         if (!lower_dev)
5430                 return NULL;
5431         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5432         if (!lower)
5433                 return NULL;
5434
5435         return lower->private;
5436 }
5437 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5438
5439
5440 int dev_get_nest_level(struct net_device *dev,
5441                        bool (*type_check)(struct net_device *dev))
5442 {
5443         struct net_device *lower = NULL;
5444         struct list_head *iter;
5445         int max_nest = -1;
5446         int nest;
5447
5448         ASSERT_RTNL();
5449
5450         netdev_for_each_lower_dev(dev, lower, iter) {
5451                 nest = dev_get_nest_level(lower, type_check);
5452                 if (max_nest < nest)
5453                         max_nest = nest;
5454         }
5455
5456         if (type_check(dev))
5457                 max_nest++;
5458
5459         return max_nest;
5460 }
5461 EXPORT_SYMBOL(dev_get_nest_level);
5462
5463 static void dev_change_rx_flags(struct net_device *dev, int flags)
5464 {
5465         const struct net_device_ops *ops = dev->netdev_ops;
5466
5467         if (ops->ndo_change_rx_flags)
5468                 ops->ndo_change_rx_flags(dev, flags);
5469 }
5470
5471 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5472 {
5473         unsigned int old_flags = dev->flags;
5474         kuid_t uid;
5475         kgid_t gid;
5476
5477         ASSERT_RTNL();
5478
5479         dev->flags |= IFF_PROMISC;
5480         dev->promiscuity += inc;
5481         if (dev->promiscuity == 0) {
5482                 /*
5483                  * Avoid overflow.
5484                  * If inc causes overflow, untouch promisc and return error.
5485                  */
5486                 if (inc < 0)
5487                         dev->flags &= ~IFF_PROMISC;
5488                 else {
5489                         dev->promiscuity -= inc;
5490                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5491                                 dev->name);
5492                         return -EOVERFLOW;
5493                 }
5494         }
5495         if (dev->flags != old_flags) {
5496                 pr_info("device %s %s promiscuous mode\n",
5497                         dev->name,
5498                         dev->flags & IFF_PROMISC ? "entered" : "left");
5499                 if (audit_enabled) {
5500                         current_uid_gid(&uid, &gid);
5501                         audit_log(current->audit_context, GFP_ATOMIC,
5502                                 AUDIT_ANOM_PROMISCUOUS,
5503                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5504                                 dev->name, (dev->flags & IFF_PROMISC),
5505                                 (old_flags & IFF_PROMISC),
5506                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5507                                 from_kuid(&init_user_ns, uid),
5508                                 from_kgid(&init_user_ns, gid),
5509                                 audit_get_sessionid(current));
5510                 }
5511
5512                 dev_change_rx_flags(dev, IFF_PROMISC);
5513         }
5514         if (notify)
5515                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5516         return 0;
5517 }
5518
5519 /**
5520  *      dev_set_promiscuity     - update promiscuity count on a device
5521  *      @dev: device
5522  *      @inc: modifier
5523  *
5524  *      Add or remove promiscuity from a device. While the count in the device
5525  *      remains above zero the interface remains promiscuous. Once it hits zero
5526  *      the device reverts back to normal filtering operation. A negative inc
5527  *      value is used to drop promiscuity on the device.
5528  *      Return 0 if successful or a negative errno code on error.
5529  */
5530 int dev_set_promiscuity(struct net_device *dev, int inc)
5531 {
5532         unsigned int old_flags = dev->flags;
5533         int err;
5534
5535         err = __dev_set_promiscuity(dev, inc, true);
5536         if (err < 0)
5537                 return err;
5538         if (dev->flags != old_flags)
5539                 dev_set_rx_mode(dev);
5540         return err;
5541 }
5542 EXPORT_SYMBOL(dev_set_promiscuity);
5543
5544 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5545 {
5546         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5547
5548         ASSERT_RTNL();
5549
5550         dev->flags |= IFF_ALLMULTI;
5551         dev->allmulti += inc;
5552         if (dev->allmulti == 0) {
5553                 /*
5554                  * Avoid overflow.
5555                  * If inc causes overflow, untouch allmulti and return error.
5556                  */
5557                 if (inc < 0)
5558                         dev->flags &= ~IFF_ALLMULTI;
5559                 else {
5560                         dev->allmulti -= inc;
5561                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5562                                 dev->name);
5563                         return -EOVERFLOW;
5564                 }
5565         }
5566         if (dev->flags ^ old_flags) {
5567                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5568                 dev_set_rx_mode(dev);
5569                 if (notify)
5570                         __dev_notify_flags(dev, old_flags,
5571                                            dev->gflags ^ old_gflags);
5572         }
5573         return 0;
5574 }
5575
5576 /**
5577  *      dev_set_allmulti        - update allmulti count on a device
5578  *      @dev: device
5579  *      @inc: modifier
5580  *
5581  *      Add or remove reception of all multicast frames to a device. While the
5582  *      count in the device remains above zero the interface remains listening
5583  *      to all interfaces. Once it hits zero the device reverts back to normal
5584  *      filtering operation. A negative @inc value is used to drop the counter
5585  *      when releasing a resource needing all multicasts.
5586  *      Return 0 if successful or a negative errno code on error.
5587  */
5588
5589 int dev_set_allmulti(struct net_device *dev, int inc)
5590 {
5591         return __dev_set_allmulti(dev, inc, true);
5592 }
5593 EXPORT_SYMBOL(dev_set_allmulti);
5594
5595 /*
5596  *      Upload unicast and multicast address lists to device and
5597  *      configure RX filtering. When the device doesn't support unicast
5598  *      filtering it is put in promiscuous mode while unicast addresses
5599  *      are present.
5600  */
5601 void __dev_set_rx_mode(struct net_device *dev)
5602 {
5603         const struct net_device_ops *ops = dev->netdev_ops;
5604
5605         /* dev_open will call this function so the list will stay sane. */
5606         if (!(dev->flags&IFF_UP))
5607                 return;
5608
5609         if (!netif_device_present(dev))
5610                 return;
5611
5612         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5613                 /* Unicast addresses changes may only happen under the rtnl,
5614                  * therefore calling __dev_set_promiscuity here is safe.
5615                  */
5616                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5617                         __dev_set_promiscuity(dev, 1, false);
5618                         dev->uc_promisc = true;
5619                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5620                         __dev_set_promiscuity(dev, -1, false);
5621                         dev->uc_promisc = false;
5622                 }
5623         }
5624
5625         if (ops->ndo_set_rx_mode)
5626                 ops->ndo_set_rx_mode(dev);
5627 }
5628
5629 void dev_set_rx_mode(struct net_device *dev)
5630 {
5631         netif_addr_lock_bh(dev);
5632         __dev_set_rx_mode(dev);
5633         netif_addr_unlock_bh(dev);
5634 }
5635
5636 /**
5637  *      dev_get_flags - get flags reported to userspace
5638  *      @dev: device
5639  *
5640  *      Get the combination of flag bits exported through APIs to userspace.
5641  */
5642 unsigned int dev_get_flags(const struct net_device *dev)
5643 {
5644         unsigned int flags;
5645
5646         flags = (dev->flags & ~(IFF_PROMISC |
5647                                 IFF_ALLMULTI |
5648                                 IFF_RUNNING |
5649                                 IFF_LOWER_UP |
5650                                 IFF_DORMANT)) |
5651                 (dev->gflags & (IFF_PROMISC |
5652                                 IFF_ALLMULTI));
5653
5654         if (netif_running(dev)) {
5655                 if (netif_oper_up(dev))
5656                         flags |= IFF_RUNNING;
5657                 if (netif_carrier_ok(dev))
5658                         flags |= IFF_LOWER_UP;
5659                 if (netif_dormant(dev))
5660                         flags |= IFF_DORMANT;
5661         }
5662
5663         return flags;
5664 }
5665 EXPORT_SYMBOL(dev_get_flags);
5666
5667 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5668 {
5669         unsigned int old_flags = dev->flags;
5670         int ret;
5671
5672         ASSERT_RTNL();
5673
5674         /*
5675          *      Set the flags on our device.
5676          */
5677
5678         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5679                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5680                                IFF_AUTOMEDIA)) |
5681                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5682                                     IFF_ALLMULTI));
5683
5684         /*
5685          *      Load in the correct multicast list now the flags have changed.
5686          */
5687
5688         if ((old_flags ^ flags) & IFF_MULTICAST)
5689                 dev_change_rx_flags(dev, IFF_MULTICAST);
5690
5691         dev_set_rx_mode(dev);
5692
5693         /*
5694          *      Have we downed the interface. We handle IFF_UP ourselves
5695          *      according to user attempts to set it, rather than blindly
5696          *      setting it.
5697          */
5698
5699         ret = 0;
5700         if ((old_flags ^ flags) & IFF_UP)
5701                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5702
5703         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5704                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5705                 unsigned int old_flags = dev->flags;
5706
5707                 dev->gflags ^= IFF_PROMISC;
5708
5709                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5710                         if (dev->flags != old_flags)
5711                                 dev_set_rx_mode(dev);
5712         }
5713
5714         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5715            is important. Some (broken) drivers set IFF_PROMISC, when
5716            IFF_ALLMULTI is requested not asking us and not reporting.
5717          */
5718         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5719                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5720
5721                 dev->gflags ^= IFF_ALLMULTI;
5722                 __dev_set_allmulti(dev, inc, false);
5723         }
5724
5725         return ret;
5726 }
5727
5728 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5729                         unsigned int gchanges)
5730 {
5731         unsigned int changes = dev->flags ^ old_flags;
5732
5733         if (gchanges)
5734                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5735
5736         if (changes & IFF_UP) {
5737                 if (dev->flags & IFF_UP)
5738                         call_netdevice_notifiers(NETDEV_UP, dev);
5739                 else
5740                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5741         }
5742
5743         if (dev->flags & IFF_UP &&
5744             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5745                 struct netdev_notifier_change_info change_info;
5746
5747                 change_info.flags_changed = changes;
5748                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5749                                               &change_info.info);
5750         }
5751 }
5752
5753 /**
5754  *      dev_change_flags - change device settings
5755  *      @dev: device
5756  *      @flags: device state flags
5757  *
5758  *      Change settings on device based state flags. The flags are
5759  *      in the userspace exported format.
5760  */
5761 int dev_change_flags(struct net_device *dev, unsigned int flags)
5762 {
5763         int ret;
5764         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5765
5766         ret = __dev_change_flags(dev, flags);
5767         if (ret < 0)
5768                 return ret;
5769
5770         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5771         __dev_notify_flags(dev, old_flags, changes);
5772         return ret;
5773 }
5774 EXPORT_SYMBOL(dev_change_flags);
5775
5776 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5777 {
5778         const struct net_device_ops *ops = dev->netdev_ops;
5779
5780         if (ops->ndo_change_mtu)
5781                 return ops->ndo_change_mtu(dev, new_mtu);
5782
5783         dev->mtu = new_mtu;
5784         return 0;
5785 }
5786
5787 /**
5788  *      dev_set_mtu - Change maximum transfer unit
5789  *      @dev: device
5790  *      @new_mtu: new transfer unit
5791  *
5792  *      Change the maximum transfer size of the network device.
5793  */
5794 int dev_set_mtu(struct net_device *dev, int new_mtu)
5795 {
5796         int err, orig_mtu;
5797
5798         if (new_mtu == dev->mtu)
5799                 return 0;
5800
5801         /*      MTU must be positive.    */
5802         if (new_mtu < 0)
5803                 return -EINVAL;
5804
5805         if (!netif_device_present(dev))
5806                 return -ENODEV;
5807
5808         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5809         err = notifier_to_errno(err);
5810         if (err)
5811                 return err;
5812
5813         orig_mtu = dev->mtu;
5814         err = __dev_set_mtu(dev, new_mtu);
5815
5816         if (!err) {
5817                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5818                 err = notifier_to_errno(err);
5819                 if (err) {
5820                         /* setting mtu back and notifying everyone again,
5821                          * so that they have a chance to revert changes.
5822                          */
5823                         __dev_set_mtu(dev, orig_mtu);
5824                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5825                 }
5826         }
5827         return err;
5828 }
5829 EXPORT_SYMBOL(dev_set_mtu);
5830
5831 /**
5832  *      dev_set_group - Change group this device belongs to
5833  *      @dev: device
5834  *      @new_group: group this device should belong to
5835  */
5836 void dev_set_group(struct net_device *dev, int new_group)
5837 {
5838         dev->group = new_group;
5839 }
5840 EXPORT_SYMBOL(dev_set_group);
5841
5842 /**
5843  *      dev_set_mac_address - Change Media Access Control Address
5844  *      @dev: device
5845  *      @sa: new address
5846  *
5847  *      Change the hardware (MAC) address of the device
5848  */
5849 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5850 {
5851         const struct net_device_ops *ops = dev->netdev_ops;
5852         int err;
5853
5854         if (!ops->ndo_set_mac_address)
5855                 return -EOPNOTSUPP;
5856         if (sa->sa_family != dev->type)
5857                 return -EINVAL;
5858         if (!netif_device_present(dev))
5859                 return -ENODEV;
5860         err = ops->ndo_set_mac_address(dev, sa);
5861         if (err)
5862                 return err;
5863         dev->addr_assign_type = NET_ADDR_SET;
5864         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5865         add_device_randomness(dev->dev_addr, dev->addr_len);
5866         return 0;
5867 }
5868 EXPORT_SYMBOL(dev_set_mac_address);
5869
5870 /**
5871  *      dev_change_carrier - Change device carrier
5872  *      @dev: device
5873  *      @new_carrier: new value
5874  *
5875  *      Change device carrier
5876  */
5877 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5878 {
5879         const struct net_device_ops *ops = dev->netdev_ops;
5880
5881         if (!ops->ndo_change_carrier)
5882                 return -EOPNOTSUPP;
5883         if (!netif_device_present(dev))
5884                 return -ENODEV;
5885         return ops->ndo_change_carrier(dev, new_carrier);
5886 }
5887 EXPORT_SYMBOL(dev_change_carrier);
5888
5889 /**
5890  *      dev_get_phys_port_id - Get device physical port ID
5891  *      @dev: device
5892  *      @ppid: port ID
5893  *
5894  *      Get device physical port ID
5895  */
5896 int dev_get_phys_port_id(struct net_device *dev,
5897                          struct netdev_phys_item_id *ppid)
5898 {
5899         const struct net_device_ops *ops = dev->netdev_ops;
5900
5901         if (!ops->ndo_get_phys_port_id)
5902                 return -EOPNOTSUPP;
5903         return ops->ndo_get_phys_port_id(dev, ppid);
5904 }
5905 EXPORT_SYMBOL(dev_get_phys_port_id);
5906
5907 /**
5908  *      dev_new_index   -       allocate an ifindex
5909  *      @net: the applicable net namespace
5910  *
5911  *      Returns a suitable unique value for a new device interface
5912  *      number.  The caller must hold the rtnl semaphore or the
5913  *      dev_base_lock to be sure it remains unique.
5914  */
5915 static int dev_new_index(struct net *net)
5916 {
5917         int ifindex = net->ifindex;
5918         for (;;) {
5919                 if (++ifindex <= 0)
5920                         ifindex = 1;
5921                 if (!__dev_get_by_index(net, ifindex))
5922                         return net->ifindex = ifindex;
5923         }
5924 }
5925
5926 /* Delayed registration/unregisteration */
5927 static LIST_HEAD(net_todo_list);
5928 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5929
5930 static void net_set_todo(struct net_device *dev)
5931 {
5932         list_add_tail(&dev->todo_list, &net_todo_list);
5933         dev_net(dev)->dev_unreg_count++;
5934 }
5935
5936 static void rollback_registered_many(struct list_head *head)
5937 {
5938         struct net_device *dev, *tmp;
5939         LIST_HEAD(close_head);
5940
5941         BUG_ON(dev_boot_phase);
5942         ASSERT_RTNL();
5943
5944         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5945                 /* Some devices call without registering
5946                  * for initialization unwind. Remove those
5947                  * devices and proceed with the remaining.
5948                  */
5949                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5950                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5951                                  dev->name, dev);
5952
5953                         WARN_ON(1);
5954                         list_del(&dev->unreg_list);
5955                         continue;
5956                 }
5957                 dev->dismantle = true;
5958                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5959         }
5960
5961         /* If device is running, close it first. */
5962         list_for_each_entry(dev, head, unreg_list)
5963                 list_add_tail(&dev->close_list, &close_head);
5964         dev_close_many(&close_head);
5965
5966         list_for_each_entry(dev, head, unreg_list) {
5967                 /* And unlink it from device chain. */
5968                 unlist_netdevice(dev);
5969
5970                 dev->reg_state = NETREG_UNREGISTERING;
5971         }
5972
5973         synchronize_net();
5974
5975         list_for_each_entry(dev, head, unreg_list) {
5976                 struct sk_buff *skb = NULL;
5977
5978                 /* Shutdown queueing discipline. */
5979                 dev_shutdown(dev);
5980
5981
5982                 /* Notify protocols, that we are about to destroy
5983                    this device. They should clean all the things.
5984                 */
5985                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5986
5987                 if (!dev->rtnl_link_ops ||
5988                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5989                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5990                                                      GFP_KERNEL);
5991
5992                 /*
5993                  *      Flush the unicast and multicast chains
5994                  */
5995                 dev_uc_flush(dev);
5996                 dev_mc_flush(dev);
5997
5998                 if (dev->netdev_ops->ndo_uninit)
5999                         dev->netdev_ops->ndo_uninit(dev);
6000
6001                 if (skb)
6002                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6003
6004                 /* Notifier chain MUST detach us all upper devices. */
6005                 WARN_ON(netdev_has_any_upper_dev(dev));
6006
6007                 /* Remove entries from kobject tree */
6008                 netdev_unregister_kobject(dev);
6009 #ifdef CONFIG_XPS
6010                 /* Remove XPS queueing entries */
6011                 netif_reset_xps_queues_gt(dev, 0);
6012 #endif
6013         }
6014
6015         synchronize_net();
6016
6017         list_for_each_entry(dev, head, unreg_list)
6018                 dev_put(dev);
6019 }
6020
6021 static void rollback_registered(struct net_device *dev)
6022 {
6023         LIST_HEAD(single);
6024
6025         list_add(&dev->unreg_list, &single);
6026         rollback_registered_many(&single);
6027         list_del(&single);
6028 }
6029
6030 static netdev_features_t netdev_fix_features(struct net_device *dev,
6031         netdev_features_t features)
6032 {
6033         /* Fix illegal checksum combinations */
6034         if ((features & NETIF_F_HW_CSUM) &&
6035             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6036                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6037                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6038         }
6039
6040         /* TSO requires that SG is present as well. */
6041         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6042                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6043                 features &= ~NETIF_F_ALL_TSO;
6044         }
6045
6046         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6047                                         !(features & NETIF_F_IP_CSUM)) {
6048                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6049                 features &= ~NETIF_F_TSO;
6050                 features &= ~NETIF_F_TSO_ECN;
6051         }
6052
6053         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6054                                          !(features & NETIF_F_IPV6_CSUM)) {
6055                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6056                 features &= ~NETIF_F_TSO6;
6057         }
6058
6059         /* TSO ECN requires that TSO is present as well. */
6060         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6061                 features &= ~NETIF_F_TSO_ECN;
6062
6063         /* Software GSO depends on SG. */
6064         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6065                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6066                 features &= ~NETIF_F_GSO;
6067         }
6068
6069         /* UFO needs SG and checksumming */
6070         if (features & NETIF_F_UFO) {
6071                 /* maybe split UFO into V4 and V6? */
6072                 if (!((features & NETIF_F_GEN_CSUM) ||
6073                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6074                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6075                         netdev_dbg(dev,
6076                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6077                         features &= ~NETIF_F_UFO;
6078                 }
6079
6080                 if (!(features & NETIF_F_SG)) {
6081                         netdev_dbg(dev,
6082                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6083                         features &= ~NETIF_F_UFO;
6084                 }
6085         }
6086
6087 #ifdef CONFIG_NET_RX_BUSY_POLL
6088         if (dev->netdev_ops->ndo_busy_poll)
6089                 features |= NETIF_F_BUSY_POLL;
6090         else
6091 #endif
6092                 features &= ~NETIF_F_BUSY_POLL;
6093
6094         return features;
6095 }
6096
6097 int __netdev_update_features(struct net_device *dev)
6098 {
6099         netdev_features_t features;
6100         int err = 0;
6101
6102         ASSERT_RTNL();
6103
6104         features = netdev_get_wanted_features(dev);
6105
6106         if (dev->netdev_ops->ndo_fix_features)
6107                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6108
6109         /* driver might be less strict about feature dependencies */
6110         features = netdev_fix_features(dev, features);
6111
6112         if (dev->features == features)
6113                 return 0;
6114
6115         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6116                 &dev->features, &features);
6117
6118         if (dev->netdev_ops->ndo_set_features)
6119                 err = dev->netdev_ops->ndo_set_features(dev, features);
6120
6121         if (unlikely(err < 0)) {
6122                 netdev_err(dev,
6123                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6124                         err, &features, &dev->features);
6125                 return -1;
6126         }
6127
6128         if (!err)
6129                 dev->features = features;
6130
6131         return 1;
6132 }
6133
6134 /**
6135  *      netdev_update_features - recalculate device features
6136  *      @dev: the device to check
6137  *
6138  *      Recalculate dev->features set and send notifications if it
6139  *      has changed. Should be called after driver or hardware dependent
6140  *      conditions might have changed that influence the features.
6141  */
6142 void netdev_update_features(struct net_device *dev)
6143 {
6144         if (__netdev_update_features(dev))
6145                 netdev_features_change(dev);
6146 }
6147 EXPORT_SYMBOL(netdev_update_features);
6148
6149 /**
6150  *      netdev_change_features - recalculate device features
6151  *      @dev: the device to check
6152  *
6153  *      Recalculate dev->features set and send notifications even
6154  *      if they have not changed. Should be called instead of
6155  *      netdev_update_features() if also dev->vlan_features might
6156  *      have changed to allow the changes to be propagated to stacked
6157  *      VLAN devices.
6158  */
6159 void netdev_change_features(struct net_device *dev)
6160 {
6161         __netdev_update_features(dev);
6162         netdev_features_change(dev);
6163 }
6164 EXPORT_SYMBOL(netdev_change_features);
6165
6166 /**
6167  *      netif_stacked_transfer_operstate -      transfer operstate
6168  *      @rootdev: the root or lower level device to transfer state from
6169  *      @dev: the device to transfer operstate to
6170  *
6171  *      Transfer operational state from root to device. This is normally
6172  *      called when a stacking relationship exists between the root
6173  *      device and the device(a leaf device).
6174  */
6175 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6176                                         struct net_device *dev)
6177 {
6178         if (rootdev->operstate == IF_OPER_DORMANT)
6179                 netif_dormant_on(dev);
6180         else
6181                 netif_dormant_off(dev);
6182
6183         if (netif_carrier_ok(rootdev)) {
6184                 if (!netif_carrier_ok(dev))
6185                         netif_carrier_on(dev);
6186         } else {
6187                 if (netif_carrier_ok(dev))
6188                         netif_carrier_off(dev);
6189         }
6190 }
6191 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6192
6193 #ifdef CONFIG_SYSFS
6194 static int netif_alloc_rx_queues(struct net_device *dev)
6195 {
6196         unsigned int i, count = dev->num_rx_queues;
6197         struct netdev_rx_queue *rx;
6198         size_t sz = count * sizeof(*rx);
6199
6200         BUG_ON(count < 1);
6201
6202         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6203         if (!rx) {
6204                 rx = vzalloc(sz);
6205                 if (!rx)
6206                         return -ENOMEM;
6207         }
6208         dev->_rx = rx;
6209
6210         for (i = 0; i < count; i++)
6211                 rx[i].dev = dev;
6212         return 0;
6213 }
6214 #endif
6215
6216 static void netdev_init_one_queue(struct net_device *dev,
6217                                   struct netdev_queue *queue, void *_unused)
6218 {
6219         /* Initialize queue lock */
6220         spin_lock_init(&queue->_xmit_lock);
6221         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6222         queue->xmit_lock_owner = -1;
6223         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6224         queue->dev = dev;
6225 #ifdef CONFIG_BQL
6226         dql_init(&queue->dql, HZ);
6227 #endif
6228 }
6229
6230 static void netif_free_tx_queues(struct net_device *dev)
6231 {
6232         kvfree(dev->_tx);
6233 }
6234
6235 static int netif_alloc_netdev_queues(struct net_device *dev)
6236 {
6237         unsigned int count = dev->num_tx_queues;
6238         struct netdev_queue *tx;
6239         size_t sz = count * sizeof(*tx);
6240
6241         BUG_ON(count < 1 || count > 0xffff);
6242
6243         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6244         if (!tx) {
6245                 tx = vzalloc(sz);
6246                 if (!tx)
6247                         return -ENOMEM;
6248         }
6249         dev->_tx = tx;
6250
6251         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6252         spin_lock_init(&dev->tx_global_lock);
6253
6254         return 0;
6255 }
6256
6257 /**
6258  *      register_netdevice      - register a network device
6259  *      @dev: device to register
6260  *
6261  *      Take a completed network device structure and add it to the kernel
6262  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6263  *      chain. 0 is returned on success. A negative errno code is returned
6264  *      on a failure to set up the device, or if the name is a duplicate.
6265  *
6266  *      Callers must hold the rtnl semaphore. You may want
6267  *      register_netdev() instead of this.
6268  *
6269  *      BUGS:
6270  *      The locking appears insufficient to guarantee two parallel registers
6271  *      will not get the same name.
6272  */
6273
6274 int register_netdevice(struct net_device *dev)
6275 {
6276         int ret;
6277         struct net *net = dev_net(dev);
6278
6279         BUG_ON(dev_boot_phase);
6280         ASSERT_RTNL();
6281
6282         might_sleep();
6283
6284         /* When net_device's are persistent, this will be fatal. */
6285         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6286         BUG_ON(!net);
6287
6288         spin_lock_init(&dev->addr_list_lock);
6289         netdev_set_addr_lockdep_class(dev);
6290
6291         dev->iflink = -1;
6292
6293         ret = dev_get_valid_name(net, dev, dev->name);
6294         if (ret < 0)
6295                 goto out;
6296
6297         /* Init, if this function is available */
6298         if (dev->netdev_ops->ndo_init) {
6299                 ret = dev->netdev_ops->ndo_init(dev);
6300                 if (ret) {
6301                         if (ret > 0)
6302                                 ret = -EIO;
6303                         goto out;
6304                 }
6305         }
6306
6307         if (((dev->hw_features | dev->features) &
6308              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6309             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6310              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6311                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6312                 ret = -EINVAL;
6313                 goto err_uninit;
6314         }
6315
6316         ret = -EBUSY;
6317         if (!dev->ifindex)
6318                 dev->ifindex = dev_new_index(net);
6319         else if (__dev_get_by_index(net, dev->ifindex))
6320                 goto err_uninit;
6321
6322         if (dev->iflink == -1)
6323                 dev->iflink = dev->ifindex;
6324
6325         /* Transfer changeable features to wanted_features and enable
6326          * software offloads (GSO and GRO).
6327          */
6328         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6329         dev->features |= NETIF_F_SOFT_FEATURES;
6330         dev->wanted_features = dev->features & dev->hw_features;
6331
6332         if (!(dev->flags & IFF_LOOPBACK)) {
6333                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6334         }
6335
6336         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6337          */
6338         dev->vlan_features |= NETIF_F_HIGHDMA;
6339
6340         /* Make NETIF_F_SG inheritable to tunnel devices.
6341          */
6342         dev->hw_enc_features |= NETIF_F_SG;
6343
6344         /* Make NETIF_F_SG inheritable to MPLS.
6345          */
6346         dev->mpls_features |= NETIF_F_SG;
6347
6348         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6349         ret = notifier_to_errno(ret);
6350         if (ret)
6351                 goto err_uninit;
6352
6353         ret = netdev_register_kobject(dev);
6354         if (ret)
6355                 goto err_uninit;
6356         dev->reg_state = NETREG_REGISTERED;
6357
6358         __netdev_update_features(dev);
6359
6360         /*
6361          *      Default initial state at registry is that the
6362          *      device is present.
6363          */
6364
6365         set_bit(__LINK_STATE_PRESENT, &dev->state);
6366
6367         linkwatch_init_dev(dev);
6368
6369         dev_init_scheduler(dev);
6370         dev_hold(dev);
6371         list_netdevice(dev);
6372         add_device_randomness(dev->dev_addr, dev->addr_len);
6373
6374         /* If the device has permanent device address, driver should
6375          * set dev_addr and also addr_assign_type should be set to
6376          * NET_ADDR_PERM (default value).
6377          */
6378         if (dev->addr_assign_type == NET_ADDR_PERM)
6379                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6380
6381         /* Notify protocols, that a new device appeared. */
6382         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6383         ret = notifier_to_errno(ret);
6384         if (ret) {
6385                 rollback_registered(dev);
6386                 dev->reg_state = NETREG_UNREGISTERED;
6387         }
6388         /*
6389          *      Prevent userspace races by waiting until the network
6390          *      device is fully setup before sending notifications.
6391          */
6392         if (!dev->rtnl_link_ops ||
6393             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6394                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6395
6396 out:
6397         return ret;
6398
6399 err_uninit:
6400         if (dev->netdev_ops->ndo_uninit)
6401                 dev->netdev_ops->ndo_uninit(dev);
6402         goto out;
6403 }
6404 EXPORT_SYMBOL(register_netdevice);
6405
6406 /**
6407  *      init_dummy_netdev       - init a dummy network device for NAPI
6408  *      @dev: device to init
6409  *
6410  *      This takes a network device structure and initialize the minimum
6411  *      amount of fields so it can be used to schedule NAPI polls without
6412  *      registering a full blown interface. This is to be used by drivers
6413  *      that need to tie several hardware interfaces to a single NAPI
6414  *      poll scheduler due to HW limitations.
6415  */
6416 int init_dummy_netdev(struct net_device *dev)
6417 {
6418         /* Clear everything. Note we don't initialize spinlocks
6419          * are they aren't supposed to be taken by any of the
6420          * NAPI code and this dummy netdev is supposed to be
6421          * only ever used for NAPI polls
6422          */
6423         memset(dev, 0, sizeof(struct net_device));
6424
6425         /* make sure we BUG if trying to hit standard
6426          * register/unregister code path
6427          */
6428         dev->reg_state = NETREG_DUMMY;
6429
6430         /* NAPI wants this */
6431         INIT_LIST_HEAD(&dev->napi_list);
6432
6433         /* a dummy interface is started by default */
6434         set_bit(__LINK_STATE_PRESENT, &dev->state);
6435         set_bit(__LINK_STATE_START, &dev->state);
6436
6437         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6438          * because users of this 'device' dont need to change
6439          * its refcount.
6440          */
6441
6442         return 0;
6443 }
6444 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6445
6446
6447 /**
6448  *      register_netdev - register a network device
6449  *      @dev: device to register
6450  *
6451  *      Take a completed network device structure and add it to the kernel
6452  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6453  *      chain. 0 is returned on success. A negative errno code is returned
6454  *      on a failure to set up the device, or if the name is a duplicate.
6455  *
6456  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6457  *      and expands the device name if you passed a format string to
6458  *      alloc_netdev.
6459  */
6460 int register_netdev(struct net_device *dev)
6461 {
6462         int err;
6463
6464         rtnl_lock();
6465         err = register_netdevice(dev);
6466         rtnl_unlock();
6467         return err;
6468 }
6469 EXPORT_SYMBOL(register_netdev);
6470
6471 int netdev_refcnt_read(const struct net_device *dev)
6472 {
6473         int i, refcnt = 0;
6474
6475         for_each_possible_cpu(i)
6476                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6477         return refcnt;
6478 }
6479 EXPORT_SYMBOL(netdev_refcnt_read);
6480
6481 /**
6482  * netdev_wait_allrefs - wait until all references are gone.
6483  * @dev: target net_device
6484  *
6485  * This is called when unregistering network devices.
6486  *
6487  * Any protocol or device that holds a reference should register
6488  * for netdevice notification, and cleanup and put back the
6489  * reference if they receive an UNREGISTER event.
6490  * We can get stuck here if buggy protocols don't correctly
6491  * call dev_put.
6492  */
6493 static void netdev_wait_allrefs(struct net_device *dev)
6494 {
6495         unsigned long rebroadcast_time, warning_time;
6496         int refcnt;
6497
6498         linkwatch_forget_dev(dev);
6499
6500         rebroadcast_time = warning_time = jiffies;
6501         refcnt = netdev_refcnt_read(dev);
6502
6503         while (refcnt != 0) {
6504                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6505                         rtnl_lock();
6506
6507                         /* Rebroadcast unregister notification */
6508                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6509
6510                         __rtnl_unlock();
6511                         rcu_barrier();
6512                         rtnl_lock();
6513
6514                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6515                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6516                                      &dev->state)) {
6517                                 /* We must not have linkwatch events
6518                                  * pending on unregister. If this
6519                                  * happens, we simply run the queue
6520                                  * unscheduled, resulting in a noop
6521                                  * for this device.
6522                                  */
6523                                 linkwatch_run_queue();
6524                         }
6525
6526                         __rtnl_unlock();
6527
6528                         rebroadcast_time = jiffies;
6529                 }
6530
6531                 msleep(250);
6532
6533                 refcnt = netdev_refcnt_read(dev);
6534
6535                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6536                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6537                                  dev->name, refcnt);
6538                         warning_time = jiffies;
6539                 }
6540         }
6541 }
6542
6543 /* The sequence is:
6544  *
6545  *      rtnl_lock();
6546  *      ...
6547  *      register_netdevice(x1);
6548  *      register_netdevice(x2);
6549  *      ...
6550  *      unregister_netdevice(y1);
6551  *      unregister_netdevice(y2);
6552  *      ...
6553  *      rtnl_unlock();
6554  *      free_netdev(y1);
6555  *      free_netdev(y2);
6556  *
6557  * We are invoked by rtnl_unlock().
6558  * This allows us to deal with problems:
6559  * 1) We can delete sysfs objects which invoke hotplug
6560  *    without deadlocking with linkwatch via keventd.
6561  * 2) Since we run with the RTNL semaphore not held, we can sleep
6562  *    safely in order to wait for the netdev refcnt to drop to zero.
6563  *
6564  * We must not return until all unregister events added during
6565  * the interval the lock was held have been completed.
6566  */
6567 void netdev_run_todo(void)
6568 {
6569         struct list_head list;
6570
6571         /* Snapshot list, allow later requests */
6572         list_replace_init(&net_todo_list, &list);
6573
6574         __rtnl_unlock();
6575
6576
6577         /* Wait for rcu callbacks to finish before next phase */
6578         if (!list_empty(&list))
6579                 rcu_barrier();
6580
6581         while (!list_empty(&list)) {
6582                 struct net_device *dev
6583                         = list_first_entry(&list, struct net_device, todo_list);
6584                 list_del(&dev->todo_list);
6585
6586                 rtnl_lock();
6587                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6588                 __rtnl_unlock();
6589
6590                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6591                         pr_err("network todo '%s' but state %d\n",
6592                                dev->name, dev->reg_state);
6593                         dump_stack();
6594                         continue;
6595                 }
6596
6597                 dev->reg_state = NETREG_UNREGISTERED;
6598
6599                 on_each_cpu(flush_backlog, dev, 1);
6600
6601                 netdev_wait_allrefs(dev);
6602
6603                 /* paranoia */
6604                 BUG_ON(netdev_refcnt_read(dev));
6605                 BUG_ON(!list_empty(&dev->ptype_all));
6606                 BUG_ON(!list_empty(&dev->ptype_specific));
6607                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6608                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6609                 WARN_ON(dev->dn_ptr);
6610
6611                 if (dev->destructor)
6612                         dev->destructor(dev);
6613
6614                 /* Report a network device has been unregistered */
6615                 rtnl_lock();
6616                 dev_net(dev)->dev_unreg_count--;
6617                 __rtnl_unlock();
6618                 wake_up(&netdev_unregistering_wq);
6619
6620                 /* Free network device */
6621                 kobject_put(&dev->dev.kobj);
6622         }
6623 }
6624
6625 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6626  * fields in the same order, with only the type differing.
6627  */
6628 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6629                              const struct net_device_stats *netdev_stats)
6630 {
6631 #if BITS_PER_LONG == 64
6632         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6633         memcpy(stats64, netdev_stats, sizeof(*stats64));
6634 #else
6635         size_t i, n = sizeof(*stats64) / sizeof(u64);
6636         const unsigned long *src = (const unsigned long *)netdev_stats;
6637         u64 *dst = (u64 *)stats64;
6638
6639         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6640                      sizeof(*stats64) / sizeof(u64));
6641         for (i = 0; i < n; i++)
6642                 dst[i] = src[i];
6643 #endif
6644 }
6645 EXPORT_SYMBOL(netdev_stats_to_stats64);
6646
6647 /**
6648  *      dev_get_stats   - get network device statistics
6649  *      @dev: device to get statistics from
6650  *      @storage: place to store stats
6651  *
6652  *      Get network statistics from device. Return @storage.
6653  *      The device driver may provide its own method by setting
6654  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6655  *      otherwise the internal statistics structure is used.
6656  */
6657 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6658                                         struct rtnl_link_stats64 *storage)
6659 {
6660         const struct net_device_ops *ops = dev->netdev_ops;
6661
6662         if (ops->ndo_get_stats64) {
6663                 memset(storage, 0, sizeof(*storage));
6664                 ops->ndo_get_stats64(dev, storage);
6665         } else if (ops->ndo_get_stats) {
6666                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6667         } else {
6668                 netdev_stats_to_stats64(storage, &dev->stats);
6669         }
6670         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6671         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6672         return storage;
6673 }
6674 EXPORT_SYMBOL(dev_get_stats);
6675
6676 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6677 {
6678         struct netdev_queue *queue = dev_ingress_queue(dev);
6679
6680 #ifdef CONFIG_NET_CLS_ACT
6681         if (queue)
6682                 return queue;
6683         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6684         if (!queue)
6685                 return NULL;
6686         netdev_init_one_queue(dev, queue, NULL);
6687         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6688         queue->qdisc_sleeping = &noop_qdisc;
6689         rcu_assign_pointer(dev->ingress_queue, queue);
6690 #endif
6691         return queue;
6692 }
6693
6694 static const struct ethtool_ops default_ethtool_ops;
6695
6696 void netdev_set_default_ethtool_ops(struct net_device *dev,
6697                                     const struct ethtool_ops *ops)
6698 {
6699         if (dev->ethtool_ops == &default_ethtool_ops)
6700                 dev->ethtool_ops = ops;
6701 }
6702 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6703
6704 void netdev_freemem(struct net_device *dev)
6705 {
6706         char *addr = (char *)dev - dev->padded;
6707
6708         kvfree(addr);
6709 }
6710
6711 /**
6712  *      alloc_netdev_mqs - allocate network device
6713  *      @sizeof_priv:           size of private data to allocate space for
6714  *      @name:                  device name format string
6715  *      @name_assign_type:      origin of device name
6716  *      @setup:                 callback to initialize device
6717  *      @txqs:                  the number of TX subqueues to allocate
6718  *      @rxqs:                  the number of RX subqueues to allocate
6719  *
6720  *      Allocates a struct net_device with private data area for driver use
6721  *      and performs basic initialization.  Also allocates subqueue structs
6722  *      for each queue on the device.
6723  */
6724 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6725                 unsigned char name_assign_type,
6726                 void (*setup)(struct net_device *),
6727                 unsigned int txqs, unsigned int rxqs)
6728 {
6729         struct net_device *dev;
6730         size_t alloc_size;
6731         struct net_device *p;
6732
6733         BUG_ON(strlen(name) >= sizeof(dev->name));
6734
6735         if (txqs < 1) {
6736                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6737                 return NULL;
6738         }
6739
6740 #ifdef CONFIG_SYSFS
6741         if (rxqs < 1) {
6742                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6743                 return NULL;
6744         }
6745 #endif
6746
6747         alloc_size = sizeof(struct net_device);
6748         if (sizeof_priv) {
6749                 /* ensure 32-byte alignment of private area */
6750                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6751                 alloc_size += sizeof_priv;
6752         }
6753         /* ensure 32-byte alignment of whole construct */
6754         alloc_size += NETDEV_ALIGN - 1;
6755
6756         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6757         if (!p)
6758                 p = vzalloc(alloc_size);
6759         if (!p)
6760                 return NULL;
6761
6762         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6763         dev->padded = (char *)dev - (char *)p;
6764
6765         dev->pcpu_refcnt = alloc_percpu(int);
6766         if (!dev->pcpu_refcnt)
6767                 goto free_dev;
6768
6769         if (dev_addr_init(dev))
6770                 goto free_pcpu;
6771
6772         dev_mc_init(dev);
6773         dev_uc_init(dev);
6774
6775         dev_net_set(dev, &init_net);
6776
6777         dev->gso_max_size = GSO_MAX_SIZE;
6778         dev->gso_max_segs = GSO_MAX_SEGS;
6779         dev->gso_min_segs = 0;
6780
6781         INIT_LIST_HEAD(&dev->napi_list);
6782         INIT_LIST_HEAD(&dev->unreg_list);
6783         INIT_LIST_HEAD(&dev->close_list);
6784         INIT_LIST_HEAD(&dev->link_watch_list);
6785         INIT_LIST_HEAD(&dev->adj_list.upper);
6786         INIT_LIST_HEAD(&dev->adj_list.lower);
6787         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6788         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6789         INIT_LIST_HEAD(&dev->ptype_all);
6790         INIT_LIST_HEAD(&dev->ptype_specific);
6791         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6792         setup(dev);
6793
6794         dev->num_tx_queues = txqs;
6795         dev->real_num_tx_queues = txqs;
6796         if (netif_alloc_netdev_queues(dev))
6797                 goto free_all;
6798
6799 #ifdef CONFIG_SYSFS
6800         dev->num_rx_queues = rxqs;
6801         dev->real_num_rx_queues = rxqs;
6802         if (netif_alloc_rx_queues(dev))
6803                 goto free_all;
6804 #endif
6805
6806         strcpy(dev->name, name);
6807         dev->name_assign_type = name_assign_type;
6808         dev->group = INIT_NETDEV_GROUP;
6809         if (!dev->ethtool_ops)
6810                 dev->ethtool_ops = &default_ethtool_ops;
6811         return dev;
6812
6813 free_all:
6814         free_netdev(dev);
6815         return NULL;
6816
6817 free_pcpu:
6818         free_percpu(dev->pcpu_refcnt);
6819 free_dev:
6820         netdev_freemem(dev);
6821         return NULL;
6822 }
6823 EXPORT_SYMBOL(alloc_netdev_mqs);
6824
6825 /**
6826  *      free_netdev - free network device
6827  *      @dev: device
6828  *
6829  *      This function does the last stage of destroying an allocated device
6830  *      interface. The reference to the device object is released.
6831  *      If this is the last reference then it will be freed.
6832  */
6833 void free_netdev(struct net_device *dev)
6834 {
6835         struct napi_struct *p, *n;
6836
6837         release_net(dev_net(dev));
6838
6839         netif_free_tx_queues(dev);
6840 #ifdef CONFIG_SYSFS
6841         kvfree(dev->_rx);
6842 #endif
6843
6844         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6845
6846         /* Flush device addresses */
6847         dev_addr_flush(dev);
6848
6849         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6850                 netif_napi_del(p);
6851
6852         free_percpu(dev->pcpu_refcnt);
6853         dev->pcpu_refcnt = NULL;
6854
6855         /*  Compatibility with error handling in drivers */
6856         if (dev->reg_state == NETREG_UNINITIALIZED) {
6857                 netdev_freemem(dev);
6858                 return;
6859         }
6860
6861         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6862         dev->reg_state = NETREG_RELEASED;
6863
6864         /* will free via device release */
6865         put_device(&dev->dev);
6866 }
6867 EXPORT_SYMBOL(free_netdev);
6868
6869 /**
6870  *      synchronize_net -  Synchronize with packet receive processing
6871  *
6872  *      Wait for packets currently being received to be done.
6873  *      Does not block later packets from starting.
6874  */
6875 void synchronize_net(void)
6876 {
6877         might_sleep();
6878         if (rtnl_is_locked())
6879                 synchronize_rcu_expedited();
6880         else
6881                 synchronize_rcu();
6882 }
6883 EXPORT_SYMBOL(synchronize_net);
6884
6885 /**
6886  *      unregister_netdevice_queue - remove device from the kernel
6887  *      @dev: device
6888  *      @head: list
6889  *
6890  *      This function shuts down a device interface and removes it
6891  *      from the kernel tables.
6892  *      If head not NULL, device is queued to be unregistered later.
6893  *
6894  *      Callers must hold the rtnl semaphore.  You may want
6895  *      unregister_netdev() instead of this.
6896  */
6897
6898 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6899 {
6900         ASSERT_RTNL();
6901
6902         if (head) {
6903                 list_move_tail(&dev->unreg_list, head);
6904         } else {
6905                 rollback_registered(dev);
6906                 /* Finish processing unregister after unlock */
6907                 net_set_todo(dev);
6908         }
6909 }
6910 EXPORT_SYMBOL(unregister_netdevice_queue);
6911
6912 /**
6913  *      unregister_netdevice_many - unregister many devices
6914  *      @head: list of devices
6915  *
6916  *  Note: As most callers use a stack allocated list_head,
6917  *  we force a list_del() to make sure stack wont be corrupted later.
6918  */
6919 void unregister_netdevice_many(struct list_head *head)
6920 {
6921         struct net_device *dev;
6922
6923         if (!list_empty(head)) {
6924                 rollback_registered_many(head);
6925                 list_for_each_entry(dev, head, unreg_list)
6926                         net_set_todo(dev);
6927                 list_del(head);
6928         }
6929 }
6930 EXPORT_SYMBOL(unregister_netdevice_many);
6931
6932 /**
6933  *      unregister_netdev - remove device from the kernel
6934  *      @dev: device
6935  *
6936  *      This function shuts down a device interface and removes it
6937  *      from the kernel tables.
6938  *
6939  *      This is just a wrapper for unregister_netdevice that takes
6940  *      the rtnl semaphore.  In general you want to use this and not
6941  *      unregister_netdevice.
6942  */
6943 void unregister_netdev(struct net_device *dev)
6944 {
6945         rtnl_lock();
6946         unregister_netdevice(dev);
6947         rtnl_unlock();
6948 }
6949 EXPORT_SYMBOL(unregister_netdev);
6950
6951 /**
6952  *      dev_change_net_namespace - move device to different nethost namespace
6953  *      @dev: device
6954  *      @net: network namespace
6955  *      @pat: If not NULL name pattern to try if the current device name
6956  *            is already taken in the destination network namespace.
6957  *
6958  *      This function shuts down a device interface and moves it
6959  *      to a new network namespace. On success 0 is returned, on
6960  *      a failure a netagive errno code is returned.
6961  *
6962  *      Callers must hold the rtnl semaphore.
6963  */
6964
6965 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6966 {
6967         int err;
6968
6969         ASSERT_RTNL();
6970
6971         /* Don't allow namespace local devices to be moved. */
6972         err = -EINVAL;
6973         if (dev->features & NETIF_F_NETNS_LOCAL)
6974                 goto out;
6975
6976         /* Ensure the device has been registrered */
6977         if (dev->reg_state != NETREG_REGISTERED)
6978                 goto out;
6979
6980         /* Get out if there is nothing todo */
6981         err = 0;
6982         if (net_eq(dev_net(dev), net))
6983                 goto out;
6984
6985         /* Pick the destination device name, and ensure
6986          * we can use it in the destination network namespace.
6987          */
6988         err = -EEXIST;
6989         if (__dev_get_by_name(net, dev->name)) {
6990                 /* We get here if we can't use the current device name */
6991                 if (!pat)
6992                         goto out;
6993                 if (dev_get_valid_name(net, dev, pat) < 0)
6994                         goto out;
6995         }
6996
6997         /*
6998          * And now a mini version of register_netdevice unregister_netdevice.
6999          */
7000
7001         /* If device is running close it first. */
7002         dev_close(dev);
7003
7004         /* And unlink it from device chain */
7005         err = -ENODEV;
7006         unlist_netdevice(dev);
7007
7008         synchronize_net();
7009
7010         /* Shutdown queueing discipline. */
7011         dev_shutdown(dev);
7012
7013         /* Notify protocols, that we are about to destroy
7014            this device. They should clean all the things.
7015
7016            Note that dev->reg_state stays at NETREG_REGISTERED.
7017            This is wanted because this way 8021q and macvlan know
7018            the device is just moving and can keep their slaves up.
7019         */
7020         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7021         rcu_barrier();
7022         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7023         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7024
7025         /*
7026          *      Flush the unicast and multicast chains
7027          */
7028         dev_uc_flush(dev);
7029         dev_mc_flush(dev);
7030
7031         /* Send a netdev-removed uevent to the old namespace */
7032         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7033         netdev_adjacent_del_links(dev);
7034
7035         /* Actually switch the network namespace */
7036         dev_net_set(dev, net);
7037
7038         /* If there is an ifindex conflict assign a new one */
7039         if (__dev_get_by_index(net, dev->ifindex)) {
7040                 int iflink = (dev->iflink == dev->ifindex);
7041                 dev->ifindex = dev_new_index(net);
7042                 if (iflink)
7043                         dev->iflink = dev->ifindex;
7044         }
7045
7046         /* Send a netdev-add uevent to the new namespace */
7047         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7048         netdev_adjacent_add_links(dev);
7049
7050         /* Fixup kobjects */
7051         err = device_rename(&dev->dev, dev->name);
7052         WARN_ON(err);
7053
7054         /* Add the device back in the hashes */
7055         list_netdevice(dev);
7056
7057         /* Notify protocols, that a new device appeared. */
7058         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7059
7060         /*
7061          *      Prevent userspace races by waiting until the network
7062          *      device is fully setup before sending notifications.
7063          */
7064         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7065
7066         synchronize_net();
7067         err = 0;
7068 out:
7069         return err;
7070 }
7071 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7072
7073 static int dev_cpu_callback(struct notifier_block *nfb,
7074                             unsigned long action,
7075                             void *ocpu)
7076 {
7077         struct sk_buff **list_skb;
7078         struct sk_buff *skb;
7079         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7080         struct softnet_data *sd, *oldsd;
7081
7082         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7083                 return NOTIFY_OK;
7084
7085         local_irq_disable();
7086         cpu = smp_processor_id();
7087         sd = &per_cpu(softnet_data, cpu);
7088         oldsd = &per_cpu(softnet_data, oldcpu);
7089
7090         /* Find end of our completion_queue. */
7091         list_skb = &sd->completion_queue;
7092         while (*list_skb)
7093                 list_skb = &(*list_skb)->next;
7094         /* Append completion queue from offline CPU. */
7095         *list_skb = oldsd->completion_queue;
7096         oldsd->completion_queue = NULL;
7097
7098         /* Append output queue from offline CPU. */
7099         if (oldsd->output_queue) {
7100                 *sd->output_queue_tailp = oldsd->output_queue;
7101                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7102                 oldsd->output_queue = NULL;
7103                 oldsd->output_queue_tailp = &oldsd->output_queue;
7104         }
7105         /* Append NAPI poll list from offline CPU, with one exception :
7106          * process_backlog() must be called by cpu owning percpu backlog.
7107          * We properly handle process_queue & input_pkt_queue later.
7108          */
7109         while (!list_empty(&oldsd->poll_list)) {
7110                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7111                                                             struct napi_struct,
7112                                                             poll_list);
7113
7114                 list_del_init(&napi->poll_list);
7115                 if (napi->poll == process_backlog)
7116                         napi->state = 0;
7117                 else
7118                         ____napi_schedule(sd, napi);
7119         }
7120
7121         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7122         local_irq_enable();
7123
7124         /* Process offline CPU's input_pkt_queue */
7125         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7126                 netif_rx_internal(skb);
7127                 input_queue_head_incr(oldsd);
7128         }
7129         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7130                 netif_rx_internal(skb);
7131                 input_queue_head_incr(oldsd);
7132         }
7133
7134         return NOTIFY_OK;
7135 }
7136
7137
7138 /**
7139  *      netdev_increment_features - increment feature set by one
7140  *      @all: current feature set
7141  *      @one: new feature set
7142  *      @mask: mask feature set
7143  *
7144  *      Computes a new feature set after adding a device with feature set
7145  *      @one to the master device with current feature set @all.  Will not
7146  *      enable anything that is off in @mask. Returns the new feature set.
7147  */
7148 netdev_features_t netdev_increment_features(netdev_features_t all,
7149         netdev_features_t one, netdev_features_t mask)
7150 {
7151         if (mask & NETIF_F_GEN_CSUM)
7152                 mask |= NETIF_F_ALL_CSUM;
7153         mask |= NETIF_F_VLAN_CHALLENGED;
7154
7155         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7156         all &= one | ~NETIF_F_ALL_FOR_ALL;
7157
7158         /* If one device supports hw checksumming, set for all. */
7159         if (all & NETIF_F_GEN_CSUM)
7160                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7161
7162         return all;
7163 }
7164 EXPORT_SYMBOL(netdev_increment_features);
7165
7166 static struct hlist_head * __net_init netdev_create_hash(void)
7167 {
7168         int i;
7169         struct hlist_head *hash;
7170
7171         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7172         if (hash != NULL)
7173                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7174                         INIT_HLIST_HEAD(&hash[i]);
7175
7176         return hash;
7177 }
7178
7179 /* Initialize per network namespace state */
7180 static int __net_init netdev_init(struct net *net)
7181 {
7182         if (net != &init_net)
7183                 INIT_LIST_HEAD(&net->dev_base_head);
7184
7185         net->dev_name_head = netdev_create_hash();
7186         if (net->dev_name_head == NULL)
7187                 goto err_name;
7188
7189         net->dev_index_head = netdev_create_hash();
7190         if (net->dev_index_head == NULL)
7191                 goto err_idx;
7192
7193         return 0;
7194
7195 err_idx:
7196         kfree(net->dev_name_head);
7197 err_name:
7198         return -ENOMEM;
7199 }
7200
7201 /**
7202  *      netdev_drivername - network driver for the device
7203  *      @dev: network device
7204  *
7205  *      Determine network driver for device.
7206  */
7207 const char *netdev_drivername(const struct net_device *dev)
7208 {
7209         const struct device_driver *driver;
7210         const struct device *parent;
7211         const char *empty = "";
7212
7213         parent = dev->dev.parent;
7214         if (!parent)
7215                 return empty;
7216
7217         driver = parent->driver;
7218         if (driver && driver->name)
7219                 return driver->name;
7220         return empty;
7221 }
7222
7223 static void __netdev_printk(const char *level, const struct net_device *dev,
7224                             struct va_format *vaf)
7225 {
7226         if (dev && dev->dev.parent) {
7227                 dev_printk_emit(level[1] - '0',
7228                                 dev->dev.parent,
7229                                 "%s %s %s%s: %pV",
7230                                 dev_driver_string(dev->dev.parent),
7231                                 dev_name(dev->dev.parent),
7232                                 netdev_name(dev), netdev_reg_state(dev),
7233                                 vaf);
7234         } else if (dev) {
7235                 printk("%s%s%s: %pV",
7236                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7237         } else {
7238                 printk("%s(NULL net_device): %pV", level, vaf);
7239         }
7240 }
7241
7242 void netdev_printk(const char *level, const struct net_device *dev,
7243                    const char *format, ...)
7244 {
7245         struct va_format vaf;
7246         va_list args;
7247
7248         va_start(args, format);
7249
7250         vaf.fmt = format;
7251         vaf.va = &args;
7252
7253         __netdev_printk(level, dev, &vaf);
7254
7255         va_end(args);
7256 }
7257 EXPORT_SYMBOL(netdev_printk);
7258
7259 #define define_netdev_printk_level(func, level)                 \
7260 void func(const struct net_device *dev, const char *fmt, ...)   \
7261 {                                                               \
7262         struct va_format vaf;                                   \
7263         va_list args;                                           \
7264                                                                 \
7265         va_start(args, fmt);                                    \
7266                                                                 \
7267         vaf.fmt = fmt;                                          \
7268         vaf.va = &args;                                         \
7269                                                                 \
7270         __netdev_printk(level, dev, &vaf);                      \
7271                                                                 \
7272         va_end(args);                                           \
7273 }                                                               \
7274 EXPORT_SYMBOL(func);
7275
7276 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7277 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7278 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7279 define_netdev_printk_level(netdev_err, KERN_ERR);
7280 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7281 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7282 define_netdev_printk_level(netdev_info, KERN_INFO);
7283
7284 static void __net_exit netdev_exit(struct net *net)
7285 {
7286         kfree(net->dev_name_head);
7287         kfree(net->dev_index_head);
7288 }
7289
7290 static struct pernet_operations __net_initdata netdev_net_ops = {
7291         .init = netdev_init,
7292         .exit = netdev_exit,
7293 };
7294
7295 static void __net_exit default_device_exit(struct net *net)
7296 {
7297         struct net_device *dev, *aux;
7298         /*
7299          * Push all migratable network devices back to the
7300          * initial network namespace
7301          */
7302         rtnl_lock();
7303         for_each_netdev_safe(net, dev, aux) {
7304                 int err;
7305                 char fb_name[IFNAMSIZ];
7306
7307                 /* Ignore unmoveable devices (i.e. loopback) */
7308                 if (dev->features & NETIF_F_NETNS_LOCAL)
7309                         continue;
7310
7311                 /* Leave virtual devices for the generic cleanup */
7312                 if (dev->rtnl_link_ops)
7313                         continue;
7314
7315                 /* Push remaining network devices to init_net */
7316                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7317                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7318                 if (err) {
7319                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7320                                  __func__, dev->name, err);
7321                         BUG();
7322                 }
7323         }
7324         rtnl_unlock();
7325 }
7326
7327 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7328 {
7329         /* Return with the rtnl_lock held when there are no network
7330          * devices unregistering in any network namespace in net_list.
7331          */
7332         struct net *net;
7333         bool unregistering;
7334         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7335
7336         add_wait_queue(&netdev_unregistering_wq, &wait);
7337         for (;;) {
7338                 unregistering = false;
7339                 rtnl_lock();
7340                 list_for_each_entry(net, net_list, exit_list) {
7341                         if (net->dev_unreg_count > 0) {
7342                                 unregistering = true;
7343                                 break;
7344                         }
7345                 }
7346                 if (!unregistering)
7347                         break;
7348                 __rtnl_unlock();
7349
7350                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7351         }
7352         remove_wait_queue(&netdev_unregistering_wq, &wait);
7353 }
7354
7355 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7356 {
7357         /* At exit all network devices most be removed from a network
7358          * namespace.  Do this in the reverse order of registration.
7359          * Do this across as many network namespaces as possible to
7360          * improve batching efficiency.
7361          */
7362         struct net_device *dev;
7363         struct net *net;
7364         LIST_HEAD(dev_kill_list);
7365
7366         /* To prevent network device cleanup code from dereferencing
7367          * loopback devices or network devices that have been freed
7368          * wait here for all pending unregistrations to complete,
7369          * before unregistring the loopback device and allowing the
7370          * network namespace be freed.
7371          *
7372          * The netdev todo list containing all network devices
7373          * unregistrations that happen in default_device_exit_batch
7374          * will run in the rtnl_unlock() at the end of
7375          * default_device_exit_batch.
7376          */
7377         rtnl_lock_unregistering(net_list);
7378         list_for_each_entry(net, net_list, exit_list) {
7379                 for_each_netdev_reverse(net, dev) {
7380                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7381                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7382                         else
7383                                 unregister_netdevice_queue(dev, &dev_kill_list);
7384                 }
7385         }
7386         unregister_netdevice_many(&dev_kill_list);
7387         rtnl_unlock();
7388 }
7389
7390 static struct pernet_operations __net_initdata default_device_ops = {
7391         .exit = default_device_exit,
7392         .exit_batch = default_device_exit_batch,
7393 };
7394
7395 /*
7396  *      Initialize the DEV module. At boot time this walks the device list and
7397  *      unhooks any devices that fail to initialise (normally hardware not
7398  *      present) and leaves us with a valid list of present and active devices.
7399  *
7400  */
7401
7402 /*
7403  *       This is called single threaded during boot, so no need
7404  *       to take the rtnl semaphore.
7405  */
7406 static int __init net_dev_init(void)
7407 {
7408         int i, rc = -ENOMEM;
7409
7410         BUG_ON(!dev_boot_phase);
7411
7412         if (dev_proc_init())
7413                 goto out;
7414
7415         if (netdev_kobject_init())
7416                 goto out;
7417
7418         INIT_LIST_HEAD(&ptype_all);
7419         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7420                 INIT_LIST_HEAD(&ptype_base[i]);
7421
7422         INIT_LIST_HEAD(&offload_base);
7423
7424         if (register_pernet_subsys(&netdev_net_ops))
7425                 goto out;
7426
7427         /*
7428          *      Initialise the packet receive queues.
7429          */
7430
7431         for_each_possible_cpu(i) {
7432                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7433
7434                 skb_queue_head_init(&sd->input_pkt_queue);
7435                 skb_queue_head_init(&sd->process_queue);
7436                 INIT_LIST_HEAD(&sd->poll_list);
7437                 sd->output_queue_tailp = &sd->output_queue;
7438 #ifdef CONFIG_RPS
7439                 sd->csd.func = rps_trigger_softirq;
7440                 sd->csd.info = sd;
7441                 sd->cpu = i;
7442 #endif
7443
7444                 sd->backlog.poll = process_backlog;
7445                 sd->backlog.weight = weight_p;
7446         }
7447
7448         dev_boot_phase = 0;
7449
7450         /* The loopback device is special if any other network devices
7451          * is present in a network namespace the loopback device must
7452          * be present. Since we now dynamically allocate and free the
7453          * loopback device ensure this invariant is maintained by
7454          * keeping the loopback device as the first device on the
7455          * list of network devices.  Ensuring the loopback devices
7456          * is the first device that appears and the last network device
7457          * that disappears.
7458          */
7459         if (register_pernet_device(&loopback_net_ops))
7460                 goto out;
7461
7462         if (register_pernet_device(&default_device_ops))
7463                 goto out;
7464
7465         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7466         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7467
7468         hotcpu_notifier(dev_cpu_callback, 0);
7469         dst_init();
7470         rc = 0;
7471 out:
7472         return rc;
7473 }
7474
7475 subsys_initcall(net_dev_init);