net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137
 138 #include "net-sysfs.h"
 139
 140 /* Instead of increasing this, you should create a hash table. */
 141 #define MAX_GRO_SKBS 8
 142
 143 /* This should be increased if a protocol with a bigger head is added. */
 144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 145
 146 /*
 147  *      The list of packet types we will receive (as opposed to discard)
 148  *      and the routines to invoke.
 149  *
 150  *      Why 16. Because with 16 the only overlap we get on a hash of the
 151  *      low nibble of the protocol value is RARP/SNAP/X.25.
 152  *
 153  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 154  *             sure which should go first, but I bet it won't make much
 155  *             difference if we are running VLANs.  The good news is that
 156  *             this protocol won't be in the list unless compiled in, so
 157  *             the average user (w/out VLANs) will not be adversely affected.
 158  *             --BLG
 159  *
 160  *              0800    IP
 161  *              8100    802.1Q VLAN
 162  *              0001    802.3
 163  *              0002    AX.25
 164  *              0004    802.2
 165  *              8035    RARP
 166  *              0005    SNAP
 167  *              0805    X.25
 168  *              0806    ARP
 169  *              8137    IPX
 170  *              0009    Localtalk
 171  *              86DD    IPv6
 172  */
 173
/*
 * PTYPE_HASH_MASK is applied with '&' in ptype_head() to select a bucket,
 * so PTYPE_HASH_SIZE needs to remain a power of two.
 */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/*
 * ptype_lock is taken by dev_add_pack()/__dev_remove_pack() around every
 * modification of ptype_base[] and ptype_all; offload_lock plays the same
 * role for offload_base in dev_add_offload()/__dev_remove_offload().
 * The lists themselves are updated with the RCU list primitives.
 */
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;        /* Taps */
static struct list_head offload_base __read_mostly;
 182
 183 /*
 184  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 185  * semaphore.
 186  *
 187  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 188  *
 189  * Writers must hold the rtnl semaphore while they loop through the
 190  * dev_base_head list, and hold dev_base_lock for writing when they do the
 191  * actual updates.  This allows pure readers to access the list even
 192  * while a writer is preparing to update it.
 193  *
 194  * To put it another way, dev_base_lock is held for writing only to
 195  * protect against pure readers; the rtnl semaphore provides the
 196  * protection against other writers.
 197  *
 198  * See, for example usages, register_netdevice() and
 199  * unregister_netdevice(), which must be called with the rtnl
 200  * semaphore held.
 201  */
 202 DEFINE_RWLOCK(dev_base_lock);
 203 EXPORT_SYMBOL(dev_base_lock);
 204
 205 seqcount_t devnet_rename_seq;
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215
 216         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 217 }
 218
 219 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 220 {
 221         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 222 }
 223
 224 static inline void rps_lock(struct softnet_data *sd)
 225 {
 226 #ifdef CONFIG_RPS
 227         spin_lock(&sd->input_pkt_queue.lock);
 228 #endif
 229 }
 230
 231 static inline void rps_unlock(struct softnet_data *sd)
 232 {
 233 #ifdef CONFIG_RPS
 234         spin_unlock(&sd->input_pkt_queue.lock);
 235 #endif
 236 }
 237
 238 /* Device list insertion */
 239 static int list_netdevice(struct net_device *dev)
 240 {
 241         struct net *net = dev_net(dev);
 242
 243         ASSERT_RTNL();
 244
 245         write_lock_bh(&dev_base_lock);
 246         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 247         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 248         hlist_add_head_rcu(&dev->index_hlist,
 249                            dev_index_hash(net, dev->ifindex));
 250         write_unlock_bh(&dev_base_lock);
 251
 252         dev_base_seq_inc(net);
 253
 254         return 0;
 255 }
 256
 257 /* Device list removal
 258  * caller must respect a RCU grace period before freeing/reusing dev
 259  */
 260 static void unlist_netdevice(struct net_device *dev)
 261 {
 262         ASSERT_RTNL();
 263
 264         /* Unlink dev from the device chain */
 265         write_lock_bh(&dev_base_lock);
 266         list_del_rcu(&dev->dev_list);
 267         hlist_del_rcu(&dev->name_hlist);
 268         hlist_del_rcu(&dev->index_hlist);
 269         write_unlock_bh(&dev_base_lock);
 270
 271         dev_base_seq_inc(dev_net(dev));
 272 }
 273
 274 /*
 275  *      Our notifier list
 276  */
 277
 278 static RAW_NOTIFIER_HEAD(netdev_chain);
 279
 280 /*
 281  *      Device drivers call our routines to queue packets here. We empty the
 282  *      queue in the local softnet handler.
 283  */
 284
 285 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 286 EXPORT_PER_CPU_SYMBOL(softnet_data);
 287
 288 #ifdef CONFIG_LOCKDEP
 289 /*
 290  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 291  * according to dev->type
 292  */
/*
 * Device types that get their own lockdep class.  netdev_lock_pos() turns
 * a device type into an index in this table, and that same index is used
 * for netdev_lock_name[] and for the two key arrays below, so this table
 * and netdev_lock_name[] must list their entries in the same order.
 * The final entry doubles as the default for types not listed here.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-for-index parallel to netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit lock and the address list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331 {
 332         int i;
 333
 334         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335                 if (netdev_lock_type[i] == dev_type)
 336                         return i;
 337         /* the last key is used by default */
 338         return ARRAY_SIZE(netdev_lock_type) - 1;
 339 }
 340
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344         int i;
 345
 346         i = netdev_lock_pos(dev_type);
 347         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348                                    netdev_lock_name[i]);
 349 }
 350
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353         int i;
 354
 355         i = netdev_lock_pos(dev->type);
 356         lockdep_set_class_and_name(&dev->addr_list_lock,
 357                                    &netdev_addr_lock_key[i],
 358                                    netdev_lock_name[i]);
 359 }
 360 #else
/* !CONFIG_LOCKDEP variant: intentionally empty, there is no class to set. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
/* !CONFIG_LOCKDEP variant: intentionally empty, there is no class to set. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 368 #endif
 369
 370 /*******************************************************************************
 371
 372                 Protocol management and registration routines
 373
 374 *******************************************************************************/
 375
 376 /*
 377  *      Add a protocol ID to the list. Now that the input handler is
 378  *      smarter we can dispense with all the messy stuff that used to be
 379  *      here.
 380  *
 381  *      BEWARE!!! Protocol handlers, mangling input packets,
 382  *      MUST BE last in hash buckets and checking protocol handlers
 383  *      MUST start from promiscuous ptype_all chain in net_bh.
 384  *      It is true now, do not change it.
 385  *      Explanation follows: if protocol handler, mangling packet, will
 386  *      be the first on list, it is not able to sense, that packet
 387  *      is cloned and should be copied-on-write, so that it will
 388  *      change it and subsequent readers will get broken packet.
 389  *                                                      --ANK (980803)
 390  */
 391
 392 static inline struct list_head *ptype_head(const struct packet_type *pt)
 393 {
 394         if (pt->type == htons(ETH_P_ALL))
 395                 return &ptype_all;
 396         else
 397                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398 }
 399
 400 /**
 401  *      dev_add_pack - add packet handler
 402  *      @pt: packet type declaration
 403  *
 404  *      Add a protocol handler to the networking stack. The passed &packet_type
 405  *      is linked into kernel lists and may not be freed until it has been
 406  *      removed from the kernel lists.
 407  *
 408  *      This call does not sleep therefore it can not
 409  *      guarantee all CPU's that are in middle of receiving packets
 410  *      will see the new packet type (until the next received packet).
 411  */
 412
 413 void dev_add_pack(struct packet_type *pt)
 414 {
 415         struct list_head *head = ptype_head(pt);
 416
 417         spin_lock(&ptype_lock);
 418         list_add_rcu(&pt->list, head);
 419         spin_unlock(&ptype_lock);
 420 }
 421 EXPORT_SYMBOL(dev_add_pack);
 422
 423 /**
 424  *      __dev_remove_pack        - remove packet handler
 425  *      @pt: packet type declaration
 426  *
 427  *      Remove a protocol handler that was previously added to the kernel
 428  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429  *      from the kernel lists and can be freed or reused once this function
 430  *      returns.
 431  *
 432  *      The packet type might still be in use by receivers
 433  *      and must not be freed until after all the CPU's have gone
 434  *      through a quiescent state.
 435  */
 436 void __dev_remove_pack(struct packet_type *pt)
 437 {
 438         struct list_head *head = ptype_head(pt);
 439         struct packet_type *pt1;
 440
 441         spin_lock(&ptype_lock);
 442
 443         list_for_each_entry(pt1, head, list) {
 444                 if (pt == pt1) {
 445                         list_del_rcu(&pt->list);
 446                         goto out;
 447                 }
 448         }
 449
 450         pr_warn("dev_remove_pack: %p not found\n", pt);
 451 out:
 452         spin_unlock(&ptype_lock);
 453 }
 454 EXPORT_SYMBOL(__dev_remove_pack);
 455
/**
 *      dev_remove_pack  - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists via __dev_remove_pack() and can be freed or
 *      reused once this function returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        /* Wait for in-flight receivers before letting the caller reuse pt. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 475
 476
 477 /**
 478  *      dev_add_offload - register offload handlers
 479  *      @po: protocol offload declaration
 480  *
 481  *      Add protocol offload handlers to the networking stack. The passed
 482  *      &proto_offload is linked into kernel lists and may not be freed until
 483  *      it has been removed from the kernel lists.
 484  *
 485  *      This call does not sleep therefore it can not
 486  *      guarantee all CPU's that are in middle of receiving packets
 487  *      will see the new offload handlers (until the next received packet).
 488  */
 489 void dev_add_offload(struct packet_offload *po)
 490 {
 491         struct list_head *head = &offload_base;
 492
 493         spin_lock(&offload_lock);
 494         list_add_rcu(&po->list, head);
 495         spin_unlock(&offload_lock);
 496 }
 497 EXPORT_SYMBOL(dev_add_offload);
 498
 499 /**
 500  *      __dev_remove_offload     - remove offload handler
 501  *      @po: packet offload declaration
 502  *
 503  *      Remove a protocol offload handler that was previously added to the
 504  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 505  *      is removed from the kernel lists and can be freed or reused once this
 506  *      function returns.
 507  *
 508  *      The packet type might still be in use by receivers
 509  *      and must not be freed until after all the CPU's have gone
 510  *      through a quiescent state.
 511  */
 512 void __dev_remove_offload(struct packet_offload *po)
 513 {
 514         struct list_head *head = &offload_base;
 515         struct packet_offload *po1;
 516
 517         spin_lock(&offload_lock);
 518
 519         list_for_each_entry(po1, head, list) {
 520                 if (po == po1) {
 521                         list_del_rcu(&po->list);
 522                         goto out;
 523                 }
 524         }
 525
 526         pr_warn("dev_remove_offload: %p not found\n", po);
 527 out:
 528         spin_unlock(&offload_lock);
 529 }
 530 EXPORT_SYMBOL(__dev_remove_offload);
 531
/**
 *      dev_remove_offload       - remove packet offload handler
 *      @po: packet offload declaration
 *
 *      Remove a packet offload handler that was previously added to the kernel
 *      offload handlers by dev_add_offload(). The passed &packet_offload is
 *      removed from the kernel lists via __dev_remove_offload() and can be
 *      freed or reused once this function returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the offload
 *      handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
        __dev_remove_offload(po);

        /* Wait for in-flight receivers before letting the caller reuse po. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
 551
 552 /******************************************************************************
 553
 554                       Device Boot-time Settings Routines
 555
 556 *******************************************************************************/
 557
 558 /* Boot time configuration table */
 559 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 560
 561 /**
 562  *      netdev_boot_setup_add   - add new setup entry
 563  *      @name: name of the device
 564  *      @map: configured settings for the device
 565  *
 566  *      Adds new setup entry to the dev_boot_setup list.  The function
 567  *      returns 0 on error and 1 on success.  This is a generic routine to
 568  *      all netdevices.
 569  */
 570 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 571 {
 572         struct netdev_boot_setup *s;
 573         int i;
 574
 575         s = dev_boot_setup;
 576         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 577                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 578                         memset(s[i].name, 0, sizeof(s[i].name));
 579                         strlcpy(s[i].name, name, IFNAMSIZ);
 580                         memcpy(&s[i].map, map, sizeof(s[i].map));
 581                         break;
 582                 }
 583         }
 584
 585         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 586 }
 587
 588 /**
 589  *      netdev_boot_setup_check - check boot time settings
 590  *      @dev: the netdevice
 591  *
 592  *      Check boot time settings for the device.
 593  *      The found settings are set for the device to be used
 594  *      later in the device probing.
 595  *      Returns 0 if no settings found, 1 if they are.
 596  */
 597 int netdev_boot_setup_check(struct net_device *dev)
 598 {
 599         struct netdev_boot_setup *s = dev_boot_setup;
 600         int i;
 601
 602         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 603                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 604                     !strcmp(dev->name, s[i].name)) {
 605                         dev->irq        = s[i].map.irq;
 606                         dev->base_addr  = s[i].map.base_addr;
 607                         dev->mem_start  = s[i].map.mem_start;
 608                         dev->mem_end    = s[i].map.mem_end;
 609                         return 1;
 610                 }
 611         }
 612         return 0;
 613 }
 614 EXPORT_SYMBOL(netdev_boot_setup_check);
 615
 616
 617 /**
 618  *      netdev_boot_base        - get address from boot time settings
 619  *      @prefix: prefix for network device
 620  *      @unit: id for network device
 621  *
 622  *      Check boot time settings for the base address of device.
 623  *      The found settings are set for the device to be used
 624  *      later in the device probing.
 625  *      Returns 0 if no settings found.
 626  */
 627 unsigned long netdev_boot_base(const char *prefix, int unit)
 628 {
 629         const struct netdev_boot_setup *s = dev_boot_setup;
 630         char name[IFNAMSIZ];
 631         int i;
 632
 633         sprintf(name, "%s%d", prefix, unit);
 634
 635         /*
 636          * If device already registered then return base of 1
 637          * to indicate not to probe for this interface
 638          */
 639         if (__dev_get_by_name(&init_net, name))
 640                 return 1;
 641
 642         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 643                 if (!strcmp(name, s[i].name))
 644                         return s[i].map.base_addr;
 645         return 0;
 646 }
 647
 648 /*
 649  * Saves at boot time configured settings for any netdevice.
 650  */
 651 int __init netdev_boot_setup(char *str)
 652 {
 653         int ints[5];
 654         struct ifmap map;
 655
 656         str = get_options(str, ARRAY_SIZE(ints), ints);
 657         if (!str || !*str)
 658                 return 0;
 659
 660         /* Save settings */
 661         memset(&map, 0, sizeof(map));
 662         if (ints[0] > 0)
 663                 map.irq = ints[1];
 664         if (ints[0] > 1)
 665                 map.base_addr = ints[2];
 666         if (ints[0] > 2)
 667                 map.mem_start = ints[3];
 668         if (ints[0] > 3)
 669                 map.mem_end = ints[4];
 670
 671         /* Add new entry to the list */
 672         return netdev_boot_setup_add(str, &map);
 673 }
 674
 675 __setup("netdev=", netdev_boot_setup);
 676
 677 /*******************************************************************************
 678
 679                             Device Interface Subroutines
 680
 681 *******************************************************************************/
 682
 683 /**
 684  *      __dev_get_by_name       - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name. Must be called under RTNL semaphore
 689  *      or @dev_base_lock. If the name is found a pointer to the device
 690  *      is returned. If the name is not found then %NULL is returned. The
 691  *      reference counters are not incremented so the caller must be
 692  *      careful with locks.
 693  */
 694
 695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 696 {
 697         struct hlist_node *p;
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_name_hash(net, name);
 700
 701         hlist_for_each_entry(dev, p, head, name_hlist)
 702                 if (!strncmp(dev->name, name, IFNAMSIZ))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(__dev_get_by_name);
 708
 709 /**
 710  *      dev_get_by_name_rcu     - find a device by its name
 711  *      @net: the applicable net namespace
 712  *      @name: name to find
 713  *
 714  *      Find an interface by name.
 715  *      If the name is found a pointer to the device is returned.
 716  *      If the name is not found then %NULL is returned.
 717  *      The reference counters are not incremented so the caller must be
 718  *      careful with locks. The caller must hold RCU lock.
 719  */
 720
 721 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 722 {
 723         struct hlist_node *p;
 724         struct net_device *dev;
 725         struct hlist_head *head = dev_name_hash(net, name);
 726
 727         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 728                 if (!strncmp(dev->name, name, IFNAMSIZ))
 729                         return dev;
 730
 731         return NULL;
 732 }
 733 EXPORT_SYMBOL(dev_get_by_name_rcu);
 734
 735 /**
 736  *      dev_get_by_name         - find a device by its name
 737  *      @net: the applicable net namespace
 738  *      @name: name to find
 739  *
 740  *      Find an interface by name. This can be called from any
 741  *      context and does its own locking. The returned handle has
 742  *      the usage count incremented and the caller must use dev_put() to
 743  *      release it when it is no longer needed. %NULL is returned if no
 744  *      matching device is found.
 745  */
 746
 747 struct net_device *dev_get_by_name(struct net *net, const char *name)
 748 {
 749         struct net_device *dev;
 750
 751         rcu_read_lock();
 752         dev = dev_get_by_name_rcu(net, name);
 753         if (dev)
 754                 dev_hold(dev);
 755         rcu_read_unlock();
 756         return dev;
 757 }
 758 EXPORT_SYMBOL(dev_get_by_name);
 759
 760 /**
 761  *      __dev_get_by_index - find a device by its ifindex
 762  *      @net: the applicable net namespace
 763  *      @ifindex: index of device
 764  *
 765  *      Search for an interface by index. Returns %NULL if the device
 766  *      is not found or a pointer to the device. The device has not
 767  *      had its reference counter increased so the caller must be careful
 768  *      about locking. The caller must hold either the RTNL semaphore
 769  *      or @dev_base_lock.
 770  */
 771
 772 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 773 {
 774         struct hlist_node *p;
 775         struct net_device *dev;
 776         struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778         hlist_for_each_entry(dev, p, head, index_hlist)
 779                 if (dev->ifindex == ifindex)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(__dev_get_by_index);
 785
 786 /**
 787  *      dev_get_by_index_rcu - find a device by its ifindex
 788  *      @net: the applicable net namespace
 789  *      @ifindex: index of device
 790  *
 791  *      Search for an interface by index. Returns %NULL if the device
 792  *      is not found or a pointer to the device. The device has not
 793  *      had its reference counter increased so the caller must be careful
 794  *      about locking. The caller must hold RCU lock.
 795  */
 796
 797 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 798 {
 799         struct hlist_node *p;
 800         struct net_device *dev;
 801         struct hlist_head *head = dev_index_hash(net, ifindex);
 802
 803         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 804                 if (dev->ifindex == ifindex)
 805                         return dev;
 806
 807         return NULL;
 808 }
 809 EXPORT_SYMBOL(dev_get_by_index_rcu);
 810
 811
 812 /**
 813  *      dev_get_by_index - find a device by its ifindex
 814  *      @net: the applicable net namespace
 815  *      @ifindex: index of device
 816  *
 817  *      Search for an interface by index. Returns NULL if the device
 818  *      is not found or a pointer to the device. The device returned has
 819  *      had a reference added and the pointer is safe until the user calls
 820  *      dev_put to indicate they have finished with it.
 821  */
 822
 823 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 824 {
 825         struct net_device *dev;
 826
 827         rcu_read_lock();
 828         dev = dev_get_by_index_rcu(net, ifindex);
 829         if (dev)
 830                 dev_hold(dev);
 831         rcu_read_unlock();
 832         return dev;
 833 }
 834 EXPORT_SYMBOL(dev_get_by_index);
 835
 836 /**
 837  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 838  *      @net: the applicable net namespace
 839  *      @type: media type of device
 840  *      @ha: hardware address
 841  *
 842  *      Search for an interface by MAC address. Returns NULL if the device
 843  *      is not found or a pointer to the device.
 844  *      The caller must hold RCU or RTNL.
 845  *      The returned device has not had its ref count increased
 846  *      and the caller must therefore be careful about locking
 847  *
 848  */
 849
 850 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 851                                        const char *ha)
 852 {
 853         struct net_device *dev;
 854
 855         for_each_netdev_rcu(net, dev)
 856                 if (dev->type == type &&
 857                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 858                         return dev;
 859
 860         return NULL;
 861 }
 862 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 863
 864 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 865 {
 866         struct net_device *dev;
 867
 868         ASSERT_RTNL();
 869         for_each_netdev(net, dev)
 870                 if (dev->type == type)
 871                         return dev;
 872
 873         return NULL;
 874 }
 875 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 876
 877 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 878 {
 879         struct net_device *dev, *ret = NULL;
 880
 881         rcu_read_lock();
 882         for_each_netdev_rcu(net, dev)
 883                 if (dev->type == type) {
 884                         dev_hold(dev);
 885                         ret = dev;
 886                         break;
 887                 }
 888         rcu_read_unlock();
 889         return ret;
 890 }
 891 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 892
 893 /**
 894  *      dev_get_by_flags_rcu - find any device with given flags
 895  *      @net: the applicable net namespace
 896  *      @if_flags: IFF_* values
 897  *      @mask: bitmask of bits in if_flags to check
 898  *
 899  *      Search for any interface with the given flags. Returns NULL if a device
 900  *      is not found or a pointer to the device. Must be called inside
 901  *      rcu_read_lock(), and result refcount is unchanged.
 902  */
 903
 904 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 905                                     unsigned short mask)
 906 {
 907         struct net_device *dev, *ret;
 908
 909         ret = NULL;
 910         for_each_netdev_rcu(net, dev) {
 911                 if (((dev->flags ^ if_flags) & mask) == 0) {
 912                         ret = dev;
 913                         break;
 914                 }
 915         }
 916         return ret;
 917 }
 918 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 919
 920 /**
 921  *      dev_valid_name - check if name is okay for network device
 922  *      @name: name string
 923  *
 924  *      Network device names need to be valid file names to
 925  *      to allow sysfs to work.  We also disallow any kind of
 926  *      whitespace.
 927  */
 928 bool dev_valid_name(const char *name)
 929 {
 930         if (*name == '\0')
 931                 return false;
 932         if (strlen(name) >= IFNAMSIZ)
 933                 return false;
 934         if (!strcmp(name, ".") || !strcmp(name, ".."))
 935                 return false;
 936
 937         while (*name) {
 938                 if (*name == '/' || isspace(*name))
 939                         return false;
 940                 name++;
 941         }
 942         return true;
 943 }
 944 EXPORT_SYMBOL(dev_valid_name);
 945
 946 /**
 947  *      __dev_alloc_name - allocate a name for a device
 948  *      @net: network namespace to allocate the device name in
 949  *      @name: name format string
 950  *      @buf:  scratch buffer and result name string
 951  *
 952  *      Passed a format string - eg "lt%d" it will try and find a suitable
 953  *      id. It scans list of devices to build up a free map, then chooses
 954  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 955  *      while allocating the name and adding the device in order to avoid
 956  *      duplicates.
 957  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 958  *      Returns the number of the unit assigned or a negative errno code.
 959  */
 960
 961 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 962 {
 963         int i = 0;
 964         const char *p;
 965         const int max_netdevices = 8*PAGE_SIZE;
 966         unsigned long *inuse;
 967         struct net_device *d;
 968
 969         p = strnchr(name, IFNAMSIZ-1, '%');
 970         if (p) {
 971                 /*
 972                  * Verify the string as this thing may have come from
 973                  * the user.  There must be either one "%d" and no other "%"
 974                  * characters.
 975                  */
 976                 if (p[1] != 'd' || strchr(p + 2, '%'))
 977                         return -EINVAL;
 978
 979                 /* Use one page as a bit array of possible slots */
 980                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 981                 if (!inuse)
 982                         return -ENOMEM;
 983
 984                 for_each_netdev(net, d) {
 985                         if (!sscanf(d->name, name, &i))
 986                                 continue;
 987                         if (i < 0 || i >= max_netdevices)
 988                                 continue;
 989
 990                         /*  avoid cases where sscanf is not exact inverse of printf */
 991                         snprintf(buf, IFNAMSIZ, name, i);
 992                         if (!strncmp(buf, d->name, IFNAMSIZ))
 993                                 set_bit(i, inuse);
 994                 }
 995
 996                 i = find_first_zero_bit(inuse, max_netdevices);
 997                 free_page((unsigned long) inuse);
 998         }
 999
1000         if (buf != name)
1001                 snprintf(buf, IFNAMSIZ, name, i);
1002         if (!__dev_get_by_name(net, buf))
1003                 return i;
1004
1005         /* It is possible to run out of possible slots
1006          * when the name is long and there isn't enough space left
1007          * for the digits, or if all bits are used.
1008          */
1009         return -ENFILE;
1010 }
1011
1012 /**
1013  *      dev_alloc_name - allocate a name for a device
1014  *      @dev: device
1015  *      @name: name format string
1016  *
1017  *      Passed a format string - eg "lt%d" it will try and find a suitable
1018  *      id. It scans list of devices to build up a free map, then chooses
1019  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1020  *      while allocating the name and adding the device in order to avoid
1021  *      duplicates.
1022  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023  *      Returns the number of the unit assigned or a negative errno code.
1024  */
1025
1026 int dev_alloc_name(struct net_device *dev, const char *name)
1027 {
1028         char buf[IFNAMSIZ];
1029         struct net *net;
1030         int ret;
1031
1032         BUG_ON(!dev_net(dev));
1033         net = dev_net(dev);
1034         ret = __dev_alloc_name(net, name, buf);
1035         if (ret >= 0)
1036                 strlcpy(dev->name, buf, IFNAMSIZ);
1037         return ret;
1038 }
1039 EXPORT_SYMBOL(dev_alloc_name);
1040
1041 static int dev_alloc_name_ns(struct net *net,
1042                              struct net_device *dev,
1043                              const char *name)
1044 {
1045         char buf[IFNAMSIZ];
1046         int ret;
1047
1048         ret = __dev_alloc_name(net, name, buf);
1049         if (ret >= 0)
1050                 strlcpy(dev->name, buf, IFNAMSIZ);
1051         return ret;
1052 }
1053
1054 static int dev_get_valid_name(struct net *net,
1055                               struct net_device *dev,
1056                               const char *name)
1057 {
1058         BUG_ON(!net);
1059
1060         if (!dev_valid_name(name))
1061                 return -EINVAL;
1062
1063         if (strchr(name, '%'))
1064                 return dev_alloc_name_ns(net, dev, name);
1065         else if (__dev_get_by_name(net, name))
1066                 return -EEXIST;
1067         else if (dev->name != name)
1068                 strlcpy(dev->name, name, IFNAMSIZ);
1069
1070         return 0;
1071 }
1072
1073 /**
1074  *      dev_change_name - change name of a device
1075  *      @dev: device
1076  *      @newname: name (or format string) must be at least IFNAMSIZ
1077  *
1078  *      Change name of a device, can pass format strings "eth%d".
1079  *      for wildcarding.
1080  */
1081 int dev_change_name(struct net_device *dev, const char *newname)
1082 {
1083         char oldname[IFNAMSIZ];
1084         int err = 0;
1085         int ret;
1086         struct net *net;
1087
1088         ASSERT_RTNL();
1089         BUG_ON(!dev_net(dev));
1090
1091         net = dev_net(dev);
1092         if (dev->flags & IFF_UP)
1093                 return -EBUSY;
1094
1095         write_seqcount_begin(&devnet_rename_seq);
1096
1097         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1098                 write_seqcount_end(&devnet_rename_seq);
1099                 return 0;
1100         }
1101
1102         memcpy(oldname, dev->name, IFNAMSIZ);
1103
1104         err = dev_get_valid_name(net, dev, newname);
1105         if (err < 0) {
1106                 write_seqcount_end(&devnet_rename_seq);
1107                 return err;
1108         }
1109
1110 rollback:
1111         ret = device_rename(&dev->dev, dev->name);
1112         if (ret) {
1113                 memcpy(dev->name, oldname, IFNAMSIZ);
1114                 write_seqcount_end(&devnet_rename_seq);
1115                 return ret;
1116         }
1117
1118         write_seqcount_end(&devnet_rename_seq);
1119
1120         write_lock_bh(&dev_base_lock);
1121         hlist_del_rcu(&dev->name_hlist);
1122         write_unlock_bh(&dev_base_lock);
1123
1124         synchronize_rcu();
1125
1126         write_lock_bh(&dev_base_lock);
1127         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1128         write_unlock_bh(&dev_base_lock);
1129
1130         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1131         ret = notifier_to_errno(ret);
1132
1133         if (ret) {
1134                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135                 if (err >= 0) {
1136                         err = ret;
1137                         write_seqcount_begin(&devnet_rename_seq);
1138                         memcpy(dev->name, oldname, IFNAMSIZ);
1139                         goto rollback;
1140                 } else {
1141                         pr_err("%s: name change rollback failed: %d\n",
1142                                dev->name, ret);
1143                 }
1144         }
1145
1146         return err;
1147 }
1148
1149 /**
1150  *      dev_set_alias - change ifalias of a device
1151  *      @dev: device
1152  *      @alias: name up to IFALIASZ
1153  *      @len: limit of bytes to copy from info
1154  *
1155  *      Set ifalias for a device,
1156  */
1157 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1158 {
1159         char *new_ifalias;
1160
1161         ASSERT_RTNL();
1162
1163         if (len >= IFALIASZ)
1164                 return -EINVAL;
1165
1166         if (!len) {
1167                 kfree(dev->ifalias);
1168                 dev->ifalias = NULL;
1169                 return 0;
1170         }
1171
1172         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173         if (!new_ifalias)
1174                 return -ENOMEM;
1175         dev->ifalias = new_ifalias;
1176
1177         strlcpy(dev->ifalias, alias, len+1);
1178         return len;
1179 }
1180
1181
1182 /**
1183  *      netdev_features_change - device changes features
1184  *      @dev: device to cause notification
1185  *
1186  *      Called to indicate a device has changed features.
1187  */
1188 void netdev_features_change(struct net_device *dev)
1189 {
1190         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1191 }
1192 EXPORT_SYMBOL(netdev_features_change);
1193
1194 /**
1195  *      netdev_state_change - device changes state
1196  *      @dev: device to cause notification
1197  *
1198  *      Called to indicate a device has changed state. This function calls
1199  *      the notifier chains for netdev_chain and sends a NEWLINK message
1200  *      to the routing socket.
1201  */
1202 void netdev_state_change(struct net_device *dev)
1203 {
1204         if (dev->flags & IFF_UP) {
1205                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1206                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1207         }
1208 }
1209 EXPORT_SYMBOL(netdev_state_change);
1210
1211 /**
1212  *      netdev_notify_peers - notify network peers about existence of @dev
1213  *      @dev: network device
1214  *
1215  * Generate traffic such that interested network peers are aware of
1216  * @dev, such as by generating a gratuitous ARP. This may be used when
1217  * a device wants to inform the rest of the network about some sort of
1218  * reconfiguration such as a failover event or virtual machine
1219  * migration.
1220  */
1221 void netdev_notify_peers(struct net_device *dev)
1222 {
1223         rtnl_lock();
1224         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225         rtnl_unlock();
1226 }
1227 EXPORT_SYMBOL(netdev_notify_peers);
1228
1229 /**
1230  *      dev_load        - load a network module
1231  *      @net: the applicable net namespace
1232  *      @name: name of interface
1233  *
1234  *      If a network interface is not present and the process has suitable
1235  *      privileges this function loads the module. If module loading is not
1236  *      available in this kernel then it becomes a nop.
1237  */
1238
1239 void dev_load(struct net *net, const char *name)
1240 {
1241         struct net_device *dev;
1242         int no_module;
1243
1244         rcu_read_lock();
1245         dev = dev_get_by_name_rcu(net, name);
1246         rcu_read_unlock();
1247
1248         no_module = !dev;
1249         if (no_module && capable(CAP_NET_ADMIN))
1250                 no_module = request_module("netdev-%s", name);
1251         if (no_module && capable(CAP_SYS_MODULE)) {
1252                 if (!request_module("%s", name))
1253                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1254                                 name);
1255         }
1256 }
1257 EXPORT_SYMBOL(dev_load);
1258
1259 static int __dev_open(struct net_device *dev)
1260 {
1261         const struct net_device_ops *ops = dev->netdev_ops;
1262         int ret;
1263
1264         ASSERT_RTNL();
1265
1266         if (!netif_device_present(dev))
1267                 return -ENODEV;
1268
1269         /* Block netpoll from trying to do any rx path servicing.
1270          * If we don't do this there is a chance ndo_poll_controller
1271          * or ndo_poll may be running while we open the device
1272          */
1273         ret = netpoll_rx_disable(dev);
1274         if (ret)
1275                 return ret;
1276
1277         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1278         ret = notifier_to_errno(ret);
1279         if (ret)
1280                 return ret;
1281
1282         set_bit(__LINK_STATE_START, &dev->state);
1283
1284         if (ops->ndo_validate_addr)
1285                 ret = ops->ndo_validate_addr(dev);
1286
1287         if (!ret && ops->ndo_open)
1288                 ret = ops->ndo_open(dev);
1289
1290         netpoll_rx_enable(dev);
1291
1292         if (ret)
1293                 clear_bit(__LINK_STATE_START, &dev->state);
1294         else {
1295                 dev->flags |= IFF_UP;
1296                 net_dmaengine_get();
1297                 dev_set_rx_mode(dev);
1298                 dev_activate(dev);
1299                 add_device_randomness(dev->dev_addr, dev->addr_len);
1300         }
1301
1302         return ret;
1303 }
1304
1305 /**
1306  *      dev_open        - prepare an interface for use.
1307  *      @dev:   device to open
1308  *
1309  *      Takes a device from down to up state. The device's private open
1310  *      function is invoked and then the multicast lists are loaded. Finally
1311  *      the device is moved into the up state and a %NETDEV_UP message is
1312  *      sent to the netdev notifier chain.
1313  *
1314  *      Calling this function on an active interface is a nop. On a failure
1315  *      a negative errno code is returned.
1316  */
1317 int dev_open(struct net_device *dev)
1318 {
1319         int ret;
1320
1321         if (dev->flags & IFF_UP)
1322                 return 0;
1323
1324         ret = __dev_open(dev);
1325         if (ret < 0)
1326                 return ret;
1327
1328         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1329         call_netdevice_notifiers(NETDEV_UP, dev);
1330
1331         return ret;
1332 }
1333 EXPORT_SYMBOL(dev_open);
1334
1335 static int __dev_close_many(struct list_head *head)
1336 {
1337         struct net_device *dev;
1338
1339         ASSERT_RTNL();
1340         might_sleep();
1341
1342         list_for_each_entry(dev, head, unreg_list) {
1343                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1344
1345                 clear_bit(__LINK_STATE_START, &dev->state);
1346
1347                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1348                  * can be even on different cpu. So just clear netif_running().
1349                  *
1350                  * dev->stop() will invoke napi_disable() on all of it's
1351                  * napi_struct instances on this device.
1352                  */
1353                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1354         }
1355
1356         dev_deactivate_many(head);
1357
1358         list_for_each_entry(dev, head, unreg_list) {
1359                 const struct net_device_ops *ops = dev->netdev_ops;
1360
1361                 /*
1362                  *      Call the device specific close. This cannot fail.
1363                  *      Only if device is UP
1364                  *
1365                  *      We allow it to be called even after a DETACH hot-plug
1366                  *      event.
1367                  */
1368                 if (ops->ndo_stop)
1369                         ops->ndo_stop(dev);
1370
1371                 dev->flags &= ~IFF_UP;
1372                 net_dmaengine_put();
1373         }
1374
1375         return 0;
1376 }
1377
1378 static int __dev_close(struct net_device *dev)
1379 {
1380         int retval;
1381         LIST_HEAD(single);
1382
1383         /* Temporarily disable netpoll until the interface is down */
1384         retval = netpoll_rx_disable(dev);
1385         if (retval)
1386                 return retval;
1387
1388         list_add(&dev->unreg_list, &single);
1389         retval = __dev_close_many(&single);
1390         list_del(&single);
1391
1392         netpoll_rx_enable(dev);
1393         return retval;
1394 }
1395
1396 static int dev_close_many(struct list_head *head)
1397 {
1398         struct net_device *dev, *tmp;
1399         LIST_HEAD(tmp_list);
1400
1401         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1402                 if (!(dev->flags & IFF_UP))
1403                         list_move(&dev->unreg_list, &tmp_list);
1404
1405         __dev_close_many(head);
1406
1407         list_for_each_entry(dev, head, unreg_list) {
1408                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1409                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1410         }
1411
1412         /* rollback_registered_many needs the complete original list */
1413         list_splice(&tmp_list, head);
1414         return 0;
1415 }
1416
1417 /**
1418  *      dev_close - shutdown an interface.
1419  *      @dev: device to shutdown
1420  *
1421  *      This function moves an active device into down state. A
1422  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1423  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1424  *      chain.
1425  */
1426 int dev_close(struct net_device *dev)
1427 {
1428         int ret = 0;
1429         if (dev->flags & IFF_UP) {
1430                 LIST_HEAD(single);
1431
1432                 /* Block netpoll rx while the interface is going down */
1433                 ret = netpoll_rx_disable(dev);
1434                 if (ret)
1435                         return ret;
1436
1437                 list_add(&dev->unreg_list, &single);
1438                 dev_close_many(&single);
1439                 list_del(&single);
1440
1441                 netpoll_rx_enable(dev);
1442         }
1443         return ret;
1444 }
1445 EXPORT_SYMBOL(dev_close);
1446
1447
1448 /**
1449  *      dev_disable_lro - disable Large Receive Offload on a device
1450  *      @dev: device
1451  *
1452  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1453  *      called under RTNL.  This is needed if received packets may be
1454  *      forwarded to another interface.
1455  */
1456 void dev_disable_lro(struct net_device *dev)
1457 {
1458         /*
1459          * If we're trying to disable lro on a vlan device
1460          * use the underlying physical device instead
1461          */
1462         if (is_vlan_dev(dev))
1463                 dev = vlan_dev_real_dev(dev);
1464
1465         dev->wanted_features &= ~NETIF_F_LRO;
1466         netdev_update_features(dev);
1467
1468         if (unlikely(dev->features & NETIF_F_LRO))
1469                 netdev_WARN(dev, "failed to disable LRO!\n");
1470 }
1471 EXPORT_SYMBOL(dev_disable_lro);
1472
1473
1474 static int dev_boot_phase = 1;
1475
1476 /**
1477  *      register_netdevice_notifier - register a network notifier block
1478  *      @nb: notifier
1479  *
1480  *      Register a notifier to be called when network device events occur.
1481  *      The notifier passed is linked into the kernel structures and must
1482  *      not be reused until it has been unregistered. A negative errno code
1483  *      is returned on a failure.
1484  *
1485  *      When registered all registration and up events are replayed
1486  *      to the new notifier to allow device to have a race free
1487  *      view of the network device list.
1488  */
1489
1490 int register_netdevice_notifier(struct notifier_block *nb)
1491 {
1492         struct net_device *dev;
1493         struct net_device *last;
1494         struct net *net;
1495         int err;
1496
1497         rtnl_lock();
1498         err = raw_notifier_chain_register(&netdev_chain, nb);
1499         if (err)
1500                 goto unlock;
1501         if (dev_boot_phase)
1502                 goto unlock;
1503         for_each_net(net) {
1504                 for_each_netdev(net, dev) {
1505                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1506                         err = notifier_to_errno(err);
1507                         if (err)
1508                                 goto rollback;
1509
1510                         if (!(dev->flags & IFF_UP))
1511                                 continue;
1512
1513                         nb->notifier_call(nb, NETDEV_UP, dev);
1514                 }
1515         }
1516
1517 unlock:
1518         rtnl_unlock();
1519         return err;
1520
1521 rollback:
1522         last = dev;
1523         for_each_net(net) {
1524                 for_each_netdev(net, dev) {
1525                         if (dev == last)
1526                                 goto outroll;
1527
1528                         if (dev->flags & IFF_UP) {
1529                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1530                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1531                         }
1532                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1533                 }
1534         }
1535
1536 outroll:
1537         raw_notifier_chain_unregister(&netdev_chain, nb);
1538         goto unlock;
1539 }
1540 EXPORT_SYMBOL(register_netdevice_notifier);
1541
1542 /**
1543  *      unregister_netdevice_notifier - unregister a network notifier block
1544  *      @nb: notifier
1545  *
1546  *      Unregister a notifier previously registered by
1547  *      register_netdevice_notifier(). The notifier is unlinked into the
1548  *      kernel structures and may then be reused. A negative errno code
1549  *      is returned on a failure.
1550  *
1551  *      After unregistering unregister and down device events are synthesized
1552  *      for all devices on the device list to the removed notifier to remove
1553  *      the need for special case cleanup code.
1554  */
1555
1556 int unregister_netdevice_notifier(struct notifier_block *nb)
1557 {
1558         struct net_device *dev;
1559         struct net *net;
1560         int err;
1561
1562         rtnl_lock();
1563         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1564         if (err)
1565                 goto unlock;
1566
1567         for_each_net(net) {
1568                 for_each_netdev(net, dev) {
1569                         if (dev->flags & IFF_UP) {
1570                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1571                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1572                         }
1573                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1574                 }
1575         }
1576 unlock:
1577         rtnl_unlock();
1578         return err;
1579 }
1580 EXPORT_SYMBOL(unregister_netdevice_notifier);
1581
1582 /**
1583  *      call_netdevice_notifiers - call all network notifier blocks
1584  *      @val: value passed unmodified to notifier function
1585  *      @dev: net_device pointer passed unmodified to notifier function
1586  *
1587  *      Call all network notifier blocks.  Parameters and return value
1588  *      are as for raw_notifier_call_chain().
1589  */
1590
1591 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1592 {
1593         ASSERT_RTNL();
1594         return raw_notifier_call_chain(&netdev_chain, val, dev);
1595 }
1596 EXPORT_SYMBOL(call_netdevice_notifiers);
1597
1598 static struct static_key netstamp_needed __read_mostly;
1599 #ifdef HAVE_JUMP_LABEL
1600 /* We are not allowed to call static_key_slow_dec() from irq context
1601  * If net_disable_timestamp() is called from irq context, defer the
1602  * static_key_slow_dec() calls.
1603  */
1604 static atomic_t netstamp_needed_deferred;
1605 #endif
1606
1607 void net_enable_timestamp(void)
1608 {
1609 #ifdef HAVE_JUMP_LABEL
1610         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1611
1612         if (deferred) {
1613                 while (--deferred)
1614                         static_key_slow_dec(&netstamp_needed);
1615                 return;
1616         }
1617 #endif
1618         WARN_ON(in_interrupt());
1619         static_key_slow_inc(&netstamp_needed);
1620 }
1621 EXPORT_SYMBOL(net_enable_timestamp);
1622
1623 void net_disable_timestamp(void)
1624 {
1625 #ifdef HAVE_JUMP_LABEL
1626         if (in_interrupt()) {
1627                 atomic_inc(&netstamp_needed_deferred);
1628                 return;
1629         }
1630 #endif
1631         static_key_slow_dec(&netstamp_needed);
1632 }
1633 EXPORT_SYMBOL(net_disable_timestamp);
1634
1635 static inline void net_timestamp_set(struct sk_buff *skb)
1636 {
1637         skb->tstamp.tv64 = 0;
1638         if (static_key_false(&netstamp_needed))
1639                 __net_timestamp(skb);
1640 }
1641
/*
 * Stamp SKB via __net_timestamp() when the netstamp_needed key is enabled,
 * COND holds and the skb carries no timestamp yet.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and cannot capture a following "else". Invoke it with a trailing
 * semicolon.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1647
1648 static int net_hwtstamp_validate(struct ifreq *ifr)
1649 {
1650         struct hwtstamp_config cfg;
1651         enum hwtstamp_tx_types tx_type;
1652         enum hwtstamp_rx_filters rx_filter;
1653         int tx_type_valid = 0;
1654         int rx_filter_valid = 0;
1655
1656         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1657                 return -EFAULT;
1658
1659         if (cfg.flags) /* reserved for future extensions */
1660                 return -EINVAL;
1661
1662         tx_type = cfg.tx_type;
1663         rx_filter = cfg.rx_filter;
1664
1665         switch (tx_type) {
1666         case HWTSTAMP_TX_OFF:
1667         case HWTSTAMP_TX_ON:
1668         case HWTSTAMP_TX_ONESTEP_SYNC:
1669                 tx_type_valid = 1;
1670                 break;
1671         }
1672
1673         switch (rx_filter) {
1674         case HWTSTAMP_FILTER_NONE:
1675         case HWTSTAMP_FILTER_ALL:
1676         case HWTSTAMP_FILTER_SOME:
1677         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1678         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1679         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1680         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1681         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1682         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1683         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1684         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1685         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1686         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1687         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1688         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1689                 rx_filter_valid = 1;
1690                 break;
1691         }
1692
1693         if (!tx_type_valid || !rx_filter_valid)
1694                 return -ERANGE;
1695
1696         return 0;
1697 }
1698
1699 static inline bool is_skb_forwardable(struct net_device *dev,
1700                                       struct sk_buff *skb)
1701 {
1702         unsigned int len;
1703
1704         if (!(dev->flags & IFF_UP))
1705                 return false;
1706
1707         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1708         if (skb->len <= len)
1709                 return true;
1710
1711         /* if TSO is enabled, we don't care about the length as the packet
1712          * could be forwarded without being segmented before
1713          */
1714         if (skb_is_gso(skb))
1715                 return true;
1716
1717         return false;
1718 }
1719
1720 /**
1721  * dev_forward_skb - loopback an skb to another netif
1722  *
1723  * @dev: destination network device
1724  * @skb: buffer to forward
1725  *
1726  * return values:
1727  *      NET_RX_SUCCESS  (no congestion)
1728  *      NET_RX_DROP     (packet was dropped, but freed)
1729  *
1730  * dev_forward_skb can be used for injecting an skb from the
1731  * start_xmit function of one device into the receive queue
1732  * of another device.
1733  *
1734  * The receiving device may be in another namespace, so
1735  * we have to clear all information in the skb that could
1736  * impact namespace isolation.
1737  */
1738 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1739 {
1740         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1741                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1742                         atomic_long_inc(&dev->rx_dropped);
1743                         kfree_skb(skb);
1744                         return NET_RX_DROP;
1745                 }
1746         }
1747
1748         skb_orphan(skb);
1749         nf_reset(skb);
1750
1751         if (unlikely(!is_skb_forwardable(dev, skb))) {
1752                 atomic_long_inc(&dev->rx_dropped);
1753                 kfree_skb(skb);
1754                 return NET_RX_DROP;
1755         }
1756         skb->skb_iif = 0;
1757         skb->dev = dev;
1758         skb_dst_drop(skb);
1759         skb->tstamp.tv64 = 0;
1760         skb->pkt_type = PACKET_HOST;
1761         skb->protocol = eth_type_trans(skb, dev);
1762         skb->mark = 0;
1763         secpath_reset(skb);
1764         nf_reset(skb);
1765         return netif_rx(skb);
1766 }
1767 EXPORT_SYMBOL_GPL(dev_forward_skb);
1768
1769 static inline int deliver_skb(struct sk_buff *skb,
1770                               struct packet_type *pt_prev,
1771                               struct net_device *orig_dev)
1772 {
1773         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1774                 return -ENOMEM;
1775         atomic_inc(&skb->users);
1776         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1777 }
1778
1779 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1780 {
1781         if (!ptype->af_packet_priv || !skb->sk)
1782                 return false;
1783
1784         if (ptype->id_match)
1785                 return ptype->id_match(ptype, skb->sk);
1786         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1787                 return true;
1788
1789         return false;
1790 }
1791
1792 /*
1793  *      Support routine. Sends outgoing frames to any network
1794  *      taps currently in use.
1795  */
1796
1797 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1798 {
1799         struct packet_type *ptype;
1800         struct sk_buff *skb2 = NULL;
1801         struct packet_type *pt_prev = NULL;
1802
1803         rcu_read_lock();
1804         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1805                 /* Never send packets back to the socket
1806                  * they originated from - MvS (miquels@drinkel.ow.org)
1807                  */
1808                 if ((ptype->dev == dev || !ptype->dev) &&
1809                     (!skb_loop_sk(ptype, skb))) {
1810                         if (pt_prev) {
1811                                 deliver_skb(skb2, pt_prev, skb->dev);
1812                                 pt_prev = ptype;
1813                                 continue;
1814                         }
1815
1816                         skb2 = skb_clone(skb, GFP_ATOMIC);
1817                         if (!skb2)
1818                                 break;
1819
1820                         net_timestamp_set(skb2);
1821
1822                         /* skb->nh should be correctly
1823                            set by sender, so that the second statement is
1824                            just protection against buggy protocols.
1825                          */
1826                         skb_reset_mac_header(skb2);
1827
1828                         if (skb_network_header(skb2) < skb2->data ||
1829                             skb2->network_header > skb2->tail) {
1830                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1831                                                      ntohs(skb2->protocol),
1832                                                      dev->name);
1833                                 skb_reset_network_header(skb2);
1834                         }
1835
1836                         skb2->transport_header = skb2->network_header;
1837                         skb2->pkt_type = PACKET_OUTGOING;
1838                         pt_prev = ptype;
1839                 }
1840         }
1841         if (pt_prev)
1842                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1843         rcu_read_unlock();
1844 }
1845
1846 /**
1847  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1848  * @dev: Network device
1849  * @txq: number of queues available
1850  *
1851  * If real_num_tx_queues is changed the tc mappings may no longer be
1852  * valid. To resolve this verify the tc mapping remains valid and if
1853  * not NULL the mapping. With no priorities mapping to this
1854  * offset/count pair it will no longer be used. In the worst case TC0
1855  * is invalid nothing can be done so disable priority mappings. If is
1856  * expected that drivers will fix this mapping if they can before
1857  * calling netif_set_real_num_tx_queues.
1858  */
1859 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1860 {
1861         int i;
1862         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1863
1864         /* If TC0 is invalidated disable TC mapping */
1865         if (tc->offset + tc->count > txq) {
1866                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1867                 dev->num_tc = 0;
1868                 return;
1869         }
1870
1871         /* Invalidated prio to tc mappings set to TC0 */
1872         for (i = 1; i < TC_BITMASK + 1; i++) {
1873                 int q = netdev_get_prio_tc_map(dev, i);
1874
1875                 tc = &dev->tc_to_txq[q];
1876                 if (tc->offset + tc->count > txq) {
1877                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1878                                 i, q);
1879                         netdev_set_prio_tc_map(dev, i, 0);
1880                 }
1881         }
1882 }
1883
1884 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
1885 static DEFINE_MUTEX(xps_map_mutex);
/*
 * Update-side dereference of an RCU-protected XPS pointer; tells
 * lockdep that xps_map_mutex is expected to be held.
 */
1886 #define xmap_dereference(P)             \
1887         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1888
1889 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1890                                         int cpu, u16 index)
1891 {
1892         struct xps_map *map = NULL;
1893         int pos;
1894
1895         if (dev_maps)
1896                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1897
1898         for (pos = 0; map && pos < map->len; pos++) {
1899                 if (map->queues[pos] == index) {
1900                         if (map->len > 1) {
1901                                 map->queues[pos] = map->queues[--map->len];
1902                         } else {
1903                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1904                                 kfree_rcu(map, rcu);
1905                                 map = NULL;
1906                         }
1907                         break;
1908                 }
1909         }
1910
1911         return map;
1912 }
1913
1914 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1915 {
1916         struct xps_dev_maps *dev_maps;
1917         int cpu, i;
1918         bool active = false;
1919
1920         mutex_lock(&xps_map_mutex);
1921         dev_maps = xmap_dereference(dev->xps_maps);
1922
1923         if (!dev_maps)
1924                 goto out_no_maps;
1925
1926         for_each_possible_cpu(cpu) {
1927                 for (i = index; i < dev->num_tx_queues; i++) {
1928                         if (!remove_xps_queue(dev_maps, cpu, i))
1929                                 break;
1930                 }
1931                 if (i == dev->num_tx_queues)
1932                         active = true;
1933         }
1934
1935         if (!active) {
1936                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1937                 kfree_rcu(dev_maps, rcu);
1938         }
1939
1940         for (i = index; i < dev->num_tx_queues; i++)
1941                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1942                                              NUMA_NO_NODE);
1943
1944 out_no_maps:
1945         mutex_unlock(&xps_map_mutex);
1946 }
1947
1948 static struct xps_map *expand_xps_map(struct xps_map *map,
1949                                       int cpu, u16 index)
1950 {
1951         struct xps_map *new_map;
1952         int alloc_len = XPS_MIN_MAP_ALLOC;
1953         int i, pos;
1954
1955         for (pos = 0; map && pos < map->len; pos++) {
1956                 if (map->queues[pos] != index)
1957                         continue;
1958                 return map;
1959         }
1960
1961         /* Need to add queue to this CPU's existing map */
1962         if (map) {
1963                 if (pos < map->alloc_len)
1964                         return map;
1965
1966                 alloc_len = map->alloc_len * 2;
1967         }
1968
1969         /* Need to allocate new map to store queue on this CPU's map */
1970         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1971                                cpu_to_node(cpu));
1972         if (!new_map)
1973                 return NULL;
1974
1975         for (i = 0; i < pos; i++)
1976                 new_map->queues[i] = map->queues[i];
1977         new_map->alloc_len = alloc_len;
1978         new_map->len = pos;
1979
1980         return new_map;
1981 }
1982
1983 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1984 {
1985         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1986         struct xps_map *map, *new_map;
1987         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1988         int cpu, numa_node_id = -2;
1989         bool active = false;
1990
1991         mutex_lock(&xps_map_mutex);
1992
1993         dev_maps = xmap_dereference(dev->xps_maps);
1994
1995         /* allocate memory for queue storage */
1996         for_each_online_cpu(cpu) {
1997                 if (!cpumask_test_cpu(cpu, mask))
1998                         continue;
1999
2000                 if (!new_dev_maps)
2001                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2002                 if (!new_dev_maps)
2003                         return -ENOMEM;
2004
2005                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2006                                  NULL;
2007
2008                 map = expand_xps_map(map, cpu, index);
2009                 if (!map)
2010                         goto error;
2011
2012                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2013         }
2014
2015         if (!new_dev_maps)
2016                 goto out_no_new_maps;
2017
2018         for_each_possible_cpu(cpu) {
2019                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2020                         /* add queue to CPU maps */
2021                         int pos = 0;
2022
2023                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2024                         while ((pos < map->len) && (map->queues[pos] != index))
2025                                 pos++;
2026
2027                         if (pos == map->len)
2028                                 map->queues[map->len++] = index;
2029 #ifdef CONFIG_NUMA
2030                         if (numa_node_id == -2)
2031                                 numa_node_id = cpu_to_node(cpu);
2032                         else if (numa_node_id != cpu_to_node(cpu))
2033                                 numa_node_id = -1;
2034 #endif
2035                 } else if (dev_maps) {
2036                         /* fill in the new device map from the old device map */
2037                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2038                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2039                 }
2040
2041         }
2042
2043         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2044
2045         /* Cleanup old maps */
2046         if (dev_maps) {
2047                 for_each_possible_cpu(cpu) {
2048                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2049                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2050                         if (map && map != new_map)
2051                                 kfree_rcu(map, rcu);
2052                 }
2053
2054                 kfree_rcu(dev_maps, rcu);
2055         }
2056
2057         dev_maps = new_dev_maps;
2058         active = true;
2059
2060 out_no_new_maps:
2061         /* update Tx queue numa node */
2062         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2063                                      (numa_node_id >= 0) ? numa_node_id :
2064                                      NUMA_NO_NODE);
2065
2066         if (!dev_maps)
2067                 goto out_no_maps;
2068
2069         /* removes queue from unused CPUs */
2070         for_each_possible_cpu(cpu) {
2071                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2072                         continue;
2073
2074                 if (remove_xps_queue(dev_maps, cpu, index))
2075                         active = true;
2076         }
2077
2078         /* free map if not active */
2079         if (!active) {
2080                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2081                 kfree_rcu(dev_maps, rcu);
2082         }
2083
2084 out_no_maps:
2085         mutex_unlock(&xps_map_mutex);
2086
2087         return 0;
2088 error:
2089         /* remove any maps that we added */
2090         for_each_possible_cpu(cpu) {
2091                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2092                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2093                                  NULL;
2094                 if (new_map && new_map != map)
2095                         kfree(new_map);
2096         }
2097
2098         mutex_unlock(&xps_map_mutex);
2099
2100         kfree(new_dev_maps);
2101         return -ENOMEM;
2102 }
2103 EXPORT_SYMBOL(netif_set_xps_queue);
2104
2105 #endif
2106 /*
2107  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2108  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2109  */
2110 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2111 {
2112         int rc;
2113
2114         if (txq < 1 || txq > dev->num_tx_queues)
2115                 return -EINVAL;
2116
2117         if (dev->reg_state == NETREG_REGISTERED ||
2118             dev->reg_state == NETREG_UNREGISTERING) {
2119                 ASSERT_RTNL();
2120
2121                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2122                                                   txq);
2123                 if (rc)
2124                         return rc;
2125
2126                 if (dev->num_tc)
2127                         netif_setup_tc(dev, txq);
2128
2129                 if (txq < dev->real_num_tx_queues) {
2130                         qdisc_reset_all_tx_gt(dev, txq);
2131 #ifdef CONFIG_XPS
2132                         netif_reset_xps_queues_gt(dev, txq);
2133 #endif
2134                 }
2135         }
2136
2137         dev->real_num_tx_queues = txq;
2138         return 0;
2139 }
2140 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2141
2142 #ifdef CONFIG_RPS
2143 /**
2144  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2145  *      @dev: Network device
2146  *      @rxq: Actual number of RX queues
2147  *
2148  *      This must be called either with the rtnl_lock held or before
2149  *      registration of the net device.  Returns 0 on success, or a
2150  *      negative error code.  If called before registration, it always
2151  *      succeeds.
2152  */
2153 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2154 {
2155         int rc;
2156
2157         if (rxq < 1 || rxq > dev->num_rx_queues)
2158                 return -EINVAL;
2159
2160         if (dev->reg_state == NETREG_REGISTERED) {
2161                 ASSERT_RTNL();
2162
2163                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2164                                                   rxq);
2165                 if (rc)
2166                         return rc;
2167         }
2168
2169         dev->real_num_rx_queues = rxq;
2170         return 0;
2171 }
2172 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2173 #endif
2174
2175 /**
2176  * netif_get_num_default_rss_queues - default number of RSS queues
2177  *
2178  * This routine should set an upper limit on the number of RSS queues
2179  * used by default by multiqueue devices.
2180  */
2181 int netif_get_num_default_rss_queues(void)
2182 {
2183         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2184 }
2185 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2186
2187 static inline void __netif_reschedule(struct Qdisc *q)
2188 {
2189         struct softnet_data *sd;
2190         unsigned long flags;
2191
2192         local_irq_save(flags);
2193         sd = &__get_cpu_var(softnet_data);
2194         q->next_sched = NULL;
2195         *sd->output_queue_tailp = q;
2196         sd->output_queue_tailp = &q->next_sched;
2197         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2198         local_irq_restore(flags);
2199 }
2200
2201 void __netif_schedule(struct Qdisc *q)
2202 {
2203         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2204                 __netif_reschedule(q);
2205 }
2206 EXPORT_SYMBOL(__netif_schedule);
2207
2208 void dev_kfree_skb_irq(struct sk_buff *skb)
2209 {
2210         if (atomic_dec_and_test(&skb->users)) {
2211                 struct softnet_data *sd;
2212                 unsigned long flags;
2213
2214                 local_irq_save(flags);
2215                 sd = &__get_cpu_var(softnet_data);
2216                 skb->next = sd->completion_queue;
2217                 sd->completion_queue = skb;
2218                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2219                 local_irq_restore(flags);
2220         }
2221 }
2222 EXPORT_SYMBOL(dev_kfree_skb_irq);
2223
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard-irq context or with interrupts disabled, and dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2232
2233
2234 /**
2235  * netif_device_detach - mark device as removed
2236  * @dev: network device
2237  *
2238  * Mark device as removed from system and therefore no longer available.
2239  */
2240 void netif_device_detach(struct net_device *dev)
2241 {
2242         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2243             netif_running(dev)) {
2244                 netif_tx_stop_all_queues(dev);
2245         }
2246 }
2247 EXPORT_SYMBOL(netif_device_detach);
2248
2249 /**
2250  * netif_device_attach - mark device as attached
2251  * @dev: network device
2252  *
2253  * Mark device as attached from system and restart if needed.
2254  */
2255 void netif_device_attach(struct net_device *dev)
2256 {
2257         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2258             netif_running(dev)) {
2259                 netif_tx_wake_all_queues(dev);
2260                 __netdev_watchdog_up(dev);
2261         }
2262 }
2263 EXPORT_SYMBOL(netif_device_attach);
2264
2265 static void skb_warn_bad_offload(const struct sk_buff *skb)
2266 {
2267         static const netdev_features_t null_features = 0;
2268         struct net_device *dev = skb->dev;
2269         const char *driver = "";
2270
2271         if (dev && dev->dev.parent)
2272                 driver = dev_driver_string(dev->dev.parent);
2273
2274         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2275              "gso_type=%d ip_summed=%d\n",
2276              driver, dev ? &dev->features : &null_features,
2277              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2278              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2279              skb_shinfo(skb)->gso_type, skb->ip_summed);
2280 }
2281
2282 /*
2283  * Invalidate hardware checksum when packet is to be mangled, and
2284  * complete checksum manually on outgoing path.
2285  */
2286 int skb_checksum_help(struct sk_buff *skb)
2287 {
2288         __wsum csum;
2289         int ret = 0, offset;
2290
2291         if (skb->ip_summed == CHECKSUM_COMPLETE)
2292                 goto out_set_summed;
2293
2294         if (unlikely(skb_shinfo(skb)->gso_size)) {
2295                 skb_warn_bad_offload(skb);
2296                 return -EINVAL;
2297         }
2298
2299         /* Before computing a checksum, we should make sure no frag could
2300          * be modified by an external entity : checksum could be wrong.
2301          */
2302         if (skb_has_shared_frag(skb)) {
2303                 ret = __skb_linearize(skb);
2304                 if (ret)
2305                         goto out;
2306         }
2307
2308         offset = skb_checksum_start_offset(skb);
2309         BUG_ON(offset >= skb_headlen(skb));
2310         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2311
2312         offset += skb->csum_offset;
2313         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2314
2315         if (skb_cloned(skb) &&
2316             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2317                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2318                 if (ret)
2319                         goto out;
2320         }
2321
2322         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2323 out_set_summed:
2324         skb->ip_summed = CHECKSUM_NONE;
2325 out:
2326         return ret;
2327 }
2328 EXPORT_SYMBOL(skb_checksum_help);
2329
2330 /**
2331  *      skb_mac_gso_segment - mac layer segmentation handler.
2332  *      @skb: buffer to segment
2333  *      @features: features for the output path (see dev->features)
2334  */
2335 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2336                                     netdev_features_t features)
2337 {
2338         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2339         struct packet_offload *ptype;
2340         __be16 type = skb->protocol;
2341
2342         while (type == htons(ETH_P_8021Q)) {
2343                 int vlan_depth = ETH_HLEN;
2344                 struct vlan_hdr *vh;
2345
2346                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2347                         return ERR_PTR(-EINVAL);
2348
2349                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2350                 type = vh->h_vlan_encapsulated_proto;
2351                 vlan_depth += VLAN_HLEN;
2352         }
2353
2354         __skb_pull(skb, skb->mac_len);
2355
2356         rcu_read_lock();
2357         list_for_each_entry_rcu(ptype, &offload_base, list) {
2358                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2359                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2360                                 int err;
2361
2362                                 err = ptype->callbacks.gso_send_check(skb);
2363                                 segs = ERR_PTR(err);
2364                                 if (err || skb_gso_ok(skb, features))
2365                                         break;
2366                                 __skb_push(skb, (skb->data -
2367                                                  skb_network_header(skb)));
2368                         }
2369                         segs = ptype->callbacks.gso_segment(skb, features);
2370                         break;
2371                 }
2372         }
2373         rcu_read_unlock();
2374
2375         __skb_push(skb, skb->data - skb_mac_header(skb));
2376
2377         return segs;
2378 }
2379 EXPORT_SYMBOL(skb_mac_gso_segment);
2380
2381
2382 /* openvswitch calls this on rx path, so we need a different check.
2383  */
2384 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2385 {
2386         if (tx_path)
2387                 return skb->ip_summed != CHECKSUM_PARTIAL;
2388         else
2389                 return skb->ip_summed == CHECKSUM_NONE;
2390 }
2391
2392 /**
2393  *      __skb_gso_segment - Perform segmentation on skb.
2394  *      @skb: buffer to segment
2395  *      @features: features for the output path (see dev->features)
2396  *      @tx_path: whether it is called in TX path
2397  *
2398  *      This function segments the given skb and returns a list of segments.
2399  *
2400  *      It may return NULL if the skb requires no segmentation.  This is
2401  *      only possible when GSO is used for verifying header integrity.
2402  */
2403 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2404                                   netdev_features_t features, bool tx_path)
2405 {
2406         if (unlikely(skb_needs_check(skb, tx_path))) {
2407                 int err;
2408
2409                 skb_warn_bad_offload(skb);
2410
2411                 if (skb_header_cloned(skb) &&
2412                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2413                         return ERR_PTR(err);
2414         }
2415
2416         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2417         skb_reset_mac_header(skb);
2418         skb_reset_mac_len(skb);
2419
2420         return skb_mac_gso_segment(skb, features);
2421 }
2422 EXPORT_SYMBOL(__skb_gso_segment);
2423
2424 /* Take action when hardware reception checksum errors are detected. */
2425 #ifdef CONFIG_BUG
2426 void netdev_rx_csum_fault(struct net_device *dev)
2427 {
2428         if (net_ratelimit()) {
2429                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2430                 dump_stack();
2431         }
2432 }
2433 EXPORT_SYMBOL(netdev_rx_csum_fault);
2434 #endif
2435
/* This check could be eliminated as soon as it is known that:
 * 1. an IOMMU is present and can map all of memory, and
 * 2. no high memory really exists on this machine.
 *
 * Returns 1 if some page fragment of @skb cannot be DMA'd by @dev:
 * either it sits in high memory while the device lacks
 * NETIF_F_HIGHDMA, or (when bus addresses are physical) it lies beyond
 * the parent device's DMA mask.  Returns 0 otherwise, and always 0
 * without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			/* No mask at all, or page end beyond the mask. */
			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2468
/*
 * Private control-block layout used while an skb carries a list of GSO
 * segments: holds the skb's previous destructor, which
 * dev_gso_segment() saves here before installing
 * dev_gso_skb_destructor().
 */
2469 struct dev_gso_cb {
2470         void (*destructor)(struct sk_buff *skb);
2471 };
2472
/* View the skb control buffer (skb->cb) as a struct dev_gso_cb. */
2473 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2474
2475 static void dev_gso_skb_destructor(struct sk_buff *skb)
2476 {
2477         struct dev_gso_cb *cb;
2478
2479         do {
2480                 struct sk_buff *nskb = skb->next;
2481
2482                 skb->next = nskb->next;
2483                 nskb->next = NULL;
2484                 kfree_skb(nskb);
2485         } while (skb->next);
2486
2487         cb = DEV_GSO_CB(skb);
2488         if (cb->destructor)
2489                 cb->destructor(skb);
2490 }
2491
2492 /**
2493  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2494  *      @skb: buffer to segment
2495  *      @features: device features as applicable to this skb
2496  *
2497  *      This function segments the given skb and stores the list of segments
2498  *      in skb->next.
2499  */
2500 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2501 {
2502         struct sk_buff *segs;
2503
2504         segs = skb_gso_segment(skb, features);
2505
2506         /* Verifying header integrity only. */
2507         if (!segs)
2508                 return 0;
2509
2510         if (IS_ERR(segs))
2511                 return PTR_ERR(segs);
2512
2513         skb->next = segs;
2514         DEV_GSO_CB(skb)->destructor = skb->destructor;
2515         skb->destructor = dev_gso_skb_destructor;
2516
2517         return 0;
2518 }
2519
2520 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2521 {
2522         return ((features & NETIF_F_GEN_CSUM) ||
2523                 ((features & NETIF_F_V4_CSUM) &&
2524                  protocol == htons(ETH_P_IP)) ||
2525                 ((features & NETIF_F_V6_CSUM) &&
2526                  protocol == htons(ETH_P_IPV6)) ||
2527                 ((features & NETIF_F_FCOE_CRC) &&
2528                  protocol == htons(ETH_P_FCOE)));
2529 }
2530
2531 static netdev_features_t harmonize_features(struct sk_buff *skb,
2532         __be16 protocol, netdev_features_t features)
2533 {
2534         if (skb->ip_summed != CHECKSUM_NONE &&
2535             !can_checksum_protocol(features, protocol)) {
2536                 features &= ~NETIF_F_ALL_CSUM;
2537                 features &= ~NETIF_F_SG;
2538         } else if (illegal_highdma(skb->dev, skb)) {
2539                 features &= ~NETIF_F_SG;
2540         }
2541
2542         return features;
2543 }
2544
2545 netdev_features_t netif_skb_features(struct sk_buff *skb)
2546 {
2547         __be16 protocol = skb->protocol;
2548         netdev_features_t features = skb->dev->features;
2549
2550         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2551                 features &= ~NETIF_F_GSO_MASK;
2552
2553         if (protocol == htons(ETH_P_8021Q)) {
2554                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2555                 protocol = veh->h_vlan_encapsulated_proto;
2556         } else if (!vlan_tx_tag_present(skb)) {
2557                 return harmonize_features(skb, protocol, features);
2558         }
2559
2560         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2561
2562         if (protocol != htons(ETH_P_8021Q)) {
2563                 return harmonize_features(skb, protocol, features);
2564         } else {
2565                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2566                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2567                 return harmonize_features(skb, protocol, features);
2568         }
2569 }
2570 EXPORT_SYMBOL(netif_skb_features);
2571
2572 /*
2573  * Returns true if either:
2574  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2575  *      2. skb is fragmented and the device does not support SG.
2576  */
2577 static inline int skb_needs_linearize(struct sk_buff *skb,
2578                                       int features)
2579 {
2580         return skb_is_nonlinear(skb) &&
2581                         ((skb_has_frag_list(skb) &&
2582                                 !(features & NETIF_F_FRAGLIST)) ||
2583                         (skb_shinfo(skb)->nr_frags &&
2584                                 !(features & NETIF_F_SG)));
2585 }
2586
2587 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2588                         struct netdev_queue *txq)
2589 {
2590         const struct net_device_ops *ops = dev->netdev_ops;
2591         int rc = NETDEV_TX_OK;
2592         unsigned int skb_len;
2593
2594         if (likely(!skb->next)) {
2595                 netdev_features_t features;
2596
2597                 /*
2598                  * If device doesn't need skb->dst, release it right now while
2599                  * its hot in this cpu cache
2600                  */
2601                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2602                         skb_dst_drop(skb);
2603
2604                 features = netif_skb_features(skb);
2605
2606                 if (vlan_tx_tag_present(skb) &&
2607                     !(features & NETIF_F_HW_VLAN_TX)) {
2608                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2609                         if (unlikely(!skb))
2610                                 goto out;
2611
2612                         skb->vlan_tci = 0;
2613                 }
2614
2615                 /* If encapsulation offload request, verify we are testing
2616                  * hardware encapsulation features instead of standard
2617                  * features for the netdev
2618                  */
2619                 if (skb->encapsulation)
2620                         features &= dev->hw_enc_features;
2621
2622                 if (netif_needs_gso(skb, features)) {
2623                         if (unlikely(dev_gso_segment(skb, features)))
2624                                 goto out_kfree_skb;
2625                         if (skb->next)
2626                                 goto gso;
2627                 } else {
2628                         if (skb_needs_linearize(skb, features) &&
2629                             __skb_linearize(skb))
2630                                 goto out_kfree_skb;
2631
2632                         /* If packet is not checksummed and device does not
2633                          * support checksumming for this protocol, complete
2634                          * checksumming here.
2635                          */
2636                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2637                                 if (skb->encapsulation)
2638                                         skb_set_inner_transport_header(skb,
2639                                                 skb_checksum_start_offset(skb));
2640                                 else
2641                                         skb_set_transport_header(skb,
2642                                                 skb_checksum_start_offset(skb));
2643                                 if (!(features & NETIF_F_ALL_CSUM) &&
2644                                      skb_checksum_help(skb))
2645                                         goto out_kfree_skb;
2646                         }
2647                 }
2648
2649                 if (!list_empty(&ptype_all))
2650                         dev_queue_xmit_nit(skb, dev);
2651
2652                 skb_len = skb->len;
2653                 rc = ops->ndo_start_xmit(skb, dev);
2654                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2655                 if (rc == NETDEV_TX_OK)
2656                         txq_trans_update(txq);
2657                 return rc;
2658         }
2659
2660 gso:
2661         do {
2662                 struct sk_buff *nskb = skb->next;
2663
2664                 skb->next = nskb->next;
2665                 nskb->next = NULL;
2666
2667                 /*
2668                  * If device doesn't need nskb->dst, release it right now while
2669                  * its hot in this cpu cache
2670                  */
2671                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2672                         skb_dst_drop(nskb);
2673
2674                 if (!list_empty(&ptype_all))
2675                         dev_queue_xmit_nit(nskb, dev);
2676
2677                 skb_len = nskb->len;
2678                 rc = ops->ndo_start_xmit(nskb, dev);
2679                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2680                 if (unlikely(rc != NETDEV_TX_OK)) {
2681                         if (rc & ~NETDEV_TX_MASK)
2682                                 goto out_kfree_gso_skb;
2683                         nskb->next = skb->next;
2684                         skb->next = nskb;
2685                         return rc;
2686                 }
2687                 txq_trans_update(txq);
2688                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2689                         return NETDEV_TX_BUSY;
2690         } while (skb->next);
2691
2692 out_kfree_gso_skb:
2693         if (likely(skb->next == NULL))
2694                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2695 out_kfree_skb:
2696         kfree_skb(skb);
2697 out:
2698         return rc;
2699 }
2700
2701 static void qdisc_pkt_len_init(struct sk_buff *skb)
2702 {
2703         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2704
2705         qdisc_skb_cb(skb)->pkt_len = skb->len;
2706
2707         /* To get more precise estimation of bytes sent on wire,
2708          * we add to pkt_len the headers size of all segments
2709          */
2710         if (shinfo->gso_size)  {
2711                 unsigned int hdr_len;
2712
2713                 /* mac layer + network layer */
2714                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2715
2716                 /* + transport layer */
2717                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2718                         hdr_len += tcp_hdrlen(skb);
2719                 else
2720                         hdr_len += sizeof(struct udphdr);
2721                 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2722         }
2723 }
2724
2725 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2726                                  struct net_device *dev,
2727                                  struct netdev_queue *txq)
2728 {
2729         spinlock_t *root_lock = qdisc_lock(q);
2730         bool contended;
2731         int rc;
2732
2733         qdisc_pkt_len_init(skb);
2734         qdisc_calculate_pkt_len(skb, q);
2735         /*
2736          * Heuristic to force contended enqueues to serialize on a
2737          * separate lock before trying to get qdisc main lock.
2738          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2739          * and dequeue packets faster.
2740          */
2741         contended = qdisc_is_running(q);
2742         if (unlikely(contended))
2743                 spin_lock(&q->busylock);
2744
2745         spin_lock(root_lock);
2746         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2747                 kfree_skb(skb);
2748                 rc = NET_XMIT_DROP;
2749         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2750                    qdisc_run_begin(q)) {
2751                 /*
2752                  * This is a work-conserving queue; there are no old skbs
2753                  * waiting to be sent out; and the qdisc is not running -
2754                  * xmit the skb directly.
2755                  */
2756                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2757                         skb_dst_force(skb);
2758
2759                 qdisc_bstats_update(q, skb);
2760
2761                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2762                         if (unlikely(contended)) {
2763                                 spin_unlock(&q->busylock);
2764                                 contended = false;
2765                         }
2766                         __qdisc_run(q);
2767                 } else
2768                         qdisc_run_end(q);
2769
2770                 rc = NET_XMIT_SUCCESS;
2771         } else {
2772                 skb_dst_force(skb);
2773                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2774                 if (qdisc_run_begin(q)) {
2775                         if (unlikely(contended)) {
2776                                 spin_unlock(&q->busylock);
2777                                 contended = false;
2778                         }
2779                         __qdisc_run(q);
2780                 }
2781         }
2782         spin_unlock(root_lock);
2783         if (unlikely(contended))
2784                 spin_unlock(&q->busylock);
2785         return rc;
2786 }
2787
2788 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2789 static void skb_update_prio(struct sk_buff *skb)
2790 {
2791         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2792
2793         if (!skb->priority && skb->sk && map) {
2794                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2795
2796                 if (prioidx < map->priomap_len)
2797                         skb->priority = map->priomap[prioidx];
2798         }
2799 }
2800 #else
2801 #define skb_update_prio(skb)
2802 #endif
2803
2804 static DEFINE_PER_CPU(int, xmit_recursion);
2805 #define RECURSION_LIMIT 10
2806
2807 /**
2808  *      dev_loopback_xmit - loop back @skb
2809  *      @skb: buffer to transmit
2810  */
2811 int dev_loopback_xmit(struct sk_buff *skb)
2812 {
2813         skb_reset_mac_header(skb);
2814         __skb_pull(skb, skb_network_offset(skb));
2815         skb->pkt_type = PACKET_LOOPBACK;
2816         skb->ip_summed = CHECKSUM_UNNECESSARY;
2817         WARN_ON(!skb_dst(skb));
2818         skb_dst_force(skb);
2819         netif_rx_ni(skb);
2820         return 0;
2821 }
2822 EXPORT_SYMBOL(dev_loopback_xmit);
2823
2824 /**
2825  *      dev_queue_xmit - transmit a buffer
2826  *      @skb: buffer to transmit
2827  *
2828  *      Queue a buffer for transmission to a network device. The caller must
2829  *      have set the device and priority and built the buffer before calling
2830  *      this function. The function can be called from an interrupt.
2831  *
2832  *      A negative errno code is returned on a failure. A success does not
2833  *      guarantee the frame will be transmitted as it may be dropped due
2834  *      to congestion or traffic shaping.
2835  *
2836  * -----------------------------------------------------------------------------------
2837  *      I notice this method can also return errors from the queue disciplines,
2838  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2839  *      be positive.
2840  *
2841  *      Regardless of the return value, the skb is consumed, so it is currently
2842  *      difficult to retry a send to this method.  (You can bump the ref count
2843  *      before sending to hold a reference for retry if you are careful.)
2844  *
2845  *      When calling this method, interrupts MUST be enabled.  This is because
2846  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2847  *          --BLG
2848  */
2849 int dev_queue_xmit(struct sk_buff *skb)
2850 {
2851         struct net_device *dev = skb->dev;
2852         struct netdev_queue *txq;
2853         struct Qdisc *q;
2854         int rc = -ENOMEM;
2855
2856         skb_reset_mac_header(skb);
2857
2858         /* Disable soft irqs for various locks below. Also
2859          * stops preemption for RCU.
2860          */
2861         rcu_read_lock_bh();
2862
2863         skb_update_prio(skb);
2864
2865         txq = netdev_pick_tx(dev, skb);
2866         q = rcu_dereference_bh(txq->qdisc);
2867
2868 #ifdef CONFIG_NET_CLS_ACT
2869         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2870 #endif
2871         trace_net_dev_queue(skb);
2872         if (q->enqueue) {
2873                 rc = __dev_xmit_skb(skb, q, dev, txq);
2874                 goto out;
2875         }
2876
2877         /* The device has no queue. Common case for software devices:
2878            loopback, all the sorts of tunnels...
2879
2880            Really, it is unlikely that netif_tx_lock protection is necessary
2881            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2882            counters.)
2883            However, it is possible, that they rely on protection
2884            made by us here.
2885
2886            Check this and shot the lock. It is not prone from deadlocks.
2887            Either shot noqueue qdisc, it is even simpler 8)
2888          */
2889         if (dev->flags & IFF_UP) {
2890                 int cpu = smp_processor_id(); /* ok because BHs are off */
2891
2892                 if (txq->xmit_lock_owner != cpu) {
2893
2894                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2895                                 goto recursion_alert;
2896
2897                         HARD_TX_LOCK(dev, txq, cpu);
2898
2899                         if (!netif_xmit_stopped(txq)) {
2900                                 __this_cpu_inc(xmit_recursion);
2901                                 rc = dev_hard_start_xmit(skb, dev, txq);
2902                                 __this_cpu_dec(xmit_recursion);
2903                                 if (dev_xmit_complete(rc)) {
2904                                         HARD_TX_UNLOCK(dev, txq);
2905                                         goto out;
2906                                 }
2907                         }
2908                         HARD_TX_UNLOCK(dev, txq);
2909                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2910                                              dev->name);
2911                 } else {
2912                         /* Recursion is detected! It is possible,
2913                          * unfortunately
2914                          */
2915 recursion_alert:
2916                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2917                                              dev->name);
2918                 }
2919         }
2920
2921         rc = -ENETDOWN;
2922         rcu_read_unlock_bh();
2923
2924         kfree_skb(skb);
2925         return rc;
2926 out:
2927         rcu_read_unlock_bh();
2928         return rc;
2929 }
2930 EXPORT_SYMBOL(dev_queue_xmit);
2931
2932
2933 /*=======================================================================
2934                         Receiver routines
2935   =======================================================================*/
2936
2937 int netdev_max_backlog __read_mostly = 1000;
2938 EXPORT_SYMBOL(netdev_max_backlog);
2939
2940 int netdev_tstamp_prequeue __read_mostly = 1;
2941 int netdev_budget __read_mostly = 300;
2942 int weight_p __read_mostly = 64;            /* old backlog weight */
2943
2944 /* Called with irq disabled */
2945 static inline void ____napi_schedule(struct softnet_data *sd,
2946                                      struct napi_struct *napi)
2947 {
2948         list_add_tail(&napi->poll_list, &sd->poll_list);
2949         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2950 }
2951
2952 #ifdef CONFIG_RPS
2953
2954 /* One global table that all flow-based protocols share. */
2955 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2956 EXPORT_SYMBOL(rps_sock_flow_table);
2957
2958 struct static_key rps_needed __read_mostly;
2959
2960 static struct rps_dev_flow *
2961 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2962             struct rps_dev_flow *rflow, u16 next_cpu)
2963 {
2964         if (next_cpu != RPS_NO_CPU) {
2965 #ifdef CONFIG_RFS_ACCEL
2966                 struct netdev_rx_queue *rxqueue;
2967                 struct rps_dev_flow_table *flow_table;
2968                 struct rps_dev_flow *old_rflow;
2969                 u32 flow_id;
2970                 u16 rxq_index;
2971                 int rc;
2972
2973                 /* Should we steer this flow to a different hardware queue? */
2974                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2975                     !(dev->features & NETIF_F_NTUPLE))
2976                         goto out;
2977                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2978                 if (rxq_index == skb_get_rx_queue(skb))
2979                         goto out;
2980
2981                 rxqueue = dev->_rx + rxq_index;
2982                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2983                 if (!flow_table)
2984                         goto out;
2985                 flow_id = skb->rxhash & flow_table->mask;
2986                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2987                                                         rxq_index, flow_id);
2988                 if (rc < 0)
2989                         goto out;
2990                 old_rflow = rflow;
2991                 rflow = &flow_table->flows[flow_id];
2992                 rflow->filter = rc;
2993                 if (old_rflow->filter == rflow->filter)
2994                         old_rflow->filter = RPS_NO_FILTER;
2995         out:
2996 #endif
2997                 rflow->last_qtail =
2998                         per_cpu(softnet_data, next_cpu).input_queue_head;
2999         }
3000
3001         rflow->cpu = next_cpu;
3002         return rflow;
3003 }
3004
3005 /*
3006  * get_rps_cpu is called from netif_receive_skb and returns the target
3007  * CPU from the RPS map of the receiving queue for a given skb.
3008  * rcu_read_lock must be held on entry.
3009  */
3010 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3011                        struct rps_dev_flow **rflowp)
3012 {
3013         struct netdev_rx_queue *rxqueue;
3014         struct rps_map *map;
3015         struct rps_dev_flow_table *flow_table;
3016         struct rps_sock_flow_table *sock_flow_table;
3017         int cpu = -1;
3018         u16 tcpu;
3019
3020         if (skb_rx_queue_recorded(skb)) {
3021                 u16 index = skb_get_rx_queue(skb);
3022                 if (unlikely(index >= dev->real_num_rx_queues)) {
3023                         WARN_ONCE(dev->real_num_rx_queues > 1,
3024                                   "%s received packet on queue %u, but number "
3025                                   "of RX queues is %u\n",
3026                                   dev->name, index, dev->real_num_rx_queues);
3027                         goto done;
3028                 }
3029                 rxqueue = dev->_rx + index;
3030         } else
3031                 rxqueue = dev->_rx;
3032
3033         map = rcu_dereference(rxqueue->rps_map);
3034         if (map) {
3035                 if (map->len == 1 &&
3036                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3037                         tcpu = map->cpus[0];
3038                         if (cpu_online(tcpu))
3039                                 cpu = tcpu;
3040                         goto done;
3041                 }
3042         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3043                 goto done;
3044         }
3045
3046         skb_reset_network_header(skb);
3047         if (!skb_get_rxhash(skb))
3048                 goto done;
3049
3050         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3051         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3052         if (flow_table && sock_flow_table) {
3053                 u16 next_cpu;
3054                 struct rps_dev_flow *rflow;
3055
3056                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3057                 tcpu = rflow->cpu;
3058
3059                 next_cpu = sock_flow_table->ents[skb->rxhash &
3060                     sock_flow_table->mask];
3061
3062                 /*
3063                  * If the desired CPU (where last recvmsg was done) is
3064                  * different from current CPU (one in the rx-queue flow
3065                  * table entry), switch if one of the following holds:
3066                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3067                  *   - Current CPU is offline.
3068                  *   - The current CPU's queue tail has advanced beyond the
3069                  *     last packet that was enqueued using this table entry.
3070                  *     This guarantees that all previous packets for the flow
3071                  *     have been dequeued, thus preserving in order delivery.
3072                  */
3073                 if (unlikely(tcpu != next_cpu) &&
3074                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3075                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3076                       rflow->last_qtail)) >= 0)) {
3077                         tcpu = next_cpu;
3078                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3079                 }
3080
3081                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3082                         *rflowp = rflow;
3083                         cpu = tcpu;
3084                         goto done;
3085                 }
3086         }
3087
3088         if (map) {
3089                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3090
3091                 if (cpu_online(tcpu)) {
3092                         cpu = tcpu;
3093                         goto done;
3094                 }
3095         }
3096
3097 done:
3098         return cpu;
3099 }
3100
3101 #ifdef CONFIG_RFS_ACCEL
3102
3103 /**
3104  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3105  * @dev: Device on which the filter was set
3106  * @rxq_index: RX queue index
3107  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3108  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3109  *
3110  * Drivers that implement ndo_rx_flow_steer() should periodically call
3111  * this function for each installed filter and remove the filters for
3112  * which it returns %true.
3113  */
3114 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3115                          u32 flow_id, u16 filter_id)
3116 {
3117         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3118         struct rps_dev_flow_table *flow_table;
3119         struct rps_dev_flow *rflow;
3120         bool expire = true;
3121         int cpu;
3122
3123         rcu_read_lock();
3124         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3125         if (flow_table && flow_id <= flow_table->mask) {
3126                 rflow = &flow_table->flows[flow_id];
3127                 cpu = ACCESS_ONCE(rflow->cpu);
3128                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3129                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3130                            rflow->last_qtail) <
3131                      (int)(10 * flow_table->mask)))
3132                         expire = false;
3133         }
3134         rcu_read_unlock();
3135         return expire;
3136 }
3137 EXPORT_SYMBOL(rps_may_expire_flow);
3138
3139 #endif /* CONFIG_RFS_ACCEL */
3140
3141 /* Called from hardirq (IPI) context */
3142 static void rps_trigger_softirq(void *data)
3143 {
3144         struct softnet_data *sd = data;
3145
3146         ____napi_schedule(sd, &sd->backlog);
3147         sd->received_rps++;
3148 }
3149
3150 #endif /* CONFIG_RPS */
3151
/*
 * Check whether @sd belongs to another cpu.
 * If it does, push it onto this cpu's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3172
3173 /*
3174  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3175  * queue (may be a remote CPU queue).
3176  */
3177 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3178                               unsigned int *qtail)
3179 {
3180         struct softnet_data *sd;
3181         unsigned long flags;
3182
3183         sd = &per_cpu(softnet_data, cpu);
3184
3185         local_irq_save(flags);
3186
3187         rps_lock(sd);
3188         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3189                 if (skb_queue_len(&sd->input_pkt_queue)) {
3190 enqueue:
3191                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3192                         input_queue_tail_incr_save(sd, qtail);
3193                         rps_unlock(sd);
3194                         local_irq_restore(flags);
3195                         return NET_RX_SUCCESS;
3196                 }
3197
3198                 /* Schedule NAPI for backlog device
3199                  * We can use non atomic operation since we own the queue lock
3200                  */
3201                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3202                         if (!rps_ipi_queued(sd))
3203                                 ____napi_schedule(sd, &sd->backlog);
3204                 }
3205                 goto enqueue;
3206         }
3207
3208         sd->dropped++;
3209         rps_unlock(sd);
3210
3211         local_irq_restore(flags);
3212
3213         atomic_long_inc(&skb->dev->rx_dropped);
3214         kfree_skb(skb);
3215         return NET_RX_DROP;
3216 }
3217
3218 /**
3219  *      netif_rx        -       post buffer to the network code
3220  *      @skb: buffer to post
3221  *
3222  *      This function receives a packet from a device driver and queues it for
3223  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3224  *      may be dropped during processing for congestion control or by the
3225  *      protocol layers.
3226  *
3227  *      return values:
3228  *      NET_RX_SUCCESS  (no congestion)
3229  *      NET_RX_DROP     (packet was dropped)
3230  *
3231  */
3232
3233 int netif_rx(struct sk_buff *skb)
3234 {
3235         int ret;
3236
3237         /* if netpoll wants it, pretend we never saw it */
3238         if (netpoll_rx(skb))
3239                 return NET_RX_DROP;
3240
3241         net_timestamp_check(netdev_tstamp_prequeue, skb);
3242
3243         trace_netif_rx(skb);
3244 #ifdef CONFIG_RPS
3245         if (static_key_false(&rps_needed)) {
3246                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3247                 int cpu;
3248
3249                 preempt_disable();
3250                 rcu_read_lock();
3251
3252                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3253                 if (cpu < 0)
3254                         cpu = smp_processor_id();
3255
3256                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3257
3258                 rcu_read_unlock();
3259                 preempt_enable();
3260         } else
3261 #endif
3262         {
3263                 unsigned int qtail;
3264                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3265                 put_cpu();
3266         }
3267         return ret;
3268 }
3269 EXPORT_SYMBOL(netif_rx);
3270
/*
 * Variant of netif_rx() that, with preemption disabled, also runs
 * do_softirq() right away if a softirq is pending after the enqueue.
 * Returns the result of netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();
	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3284
3285 static void net_tx_action(struct softirq_action *h)
3286 {
3287         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3288
3289         if (sd->completion_queue) {
3290                 struct sk_buff *clist;
3291
3292                 local_irq_disable();
3293                 clist = sd->completion_queue;
3294                 sd->completion_queue = NULL;
3295                 local_irq_enable();
3296
3297                 while (clist) {
3298                         struct sk_buff *skb = clist;
3299                         clist = clist->next;
3300
3301                         WARN_ON(atomic_read(&skb->users));
3302                         trace_kfree_skb(skb, net_tx_action);
3303                         __kfree_skb(skb);
3304                 }
3305         }
3306
3307         if (sd->output_queue) {
3308                 struct Qdisc *head;
3309
3310                 local_irq_disable();
3311                 head = sd->output_queue;
3312                 sd->output_queue = NULL;
3313                 sd->output_queue_tailp = &sd->output_queue;
3314                 local_irq_enable();
3315
3316                 while (head) {
3317                         struct Qdisc *q = head;
3318                         spinlock_t *root_lock;
3319
3320                         head = head->next_sched;
3321
3322                         root_lock = qdisc_lock(q);
3323                         if (spin_trylock(root_lock)) {
3324                                 smp_mb__before_clear_bit();
3325                                 clear_bit(__QDISC_STATE_SCHED,
3326                                           &q->state);
3327                                 qdisc_run(q);
3328                                 spin_unlock(root_lock);
3329                         } else {
3330                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3331                                               &q->state)) {
3332                                         __netif_reschedule(q);
3333                                 } else {
3334                                         smp_mb__before_clear_bit();
3335                                         clear_bit(__QDISC_STATE_SCHED,
3336                                                   &q->state);
3337                                 }
3338                         }
3339                 }
3340         }
3341 }
3342
3343 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3344     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3345 /* This hook is defined here for ATM LANE */
3346 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3347                              unsigned char *addr) __read_mostly;
3348 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3349 #endif
3350
3351 #ifdef CONFIG_NET_CLS_ACT
3352 /* TODO: Maybe we should just force sch_ingress to be compiled in
3353  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3354  * a compare and 2 stores extra right now if we dont have it on
3355  * but have CONFIG_NET_CLS_ACT
3356  * NOTE: This doesn't stop any functionality; if you dont have
3357  * the ingress scheduler, you just can't add policies on ingress.
3358  *
3359  */
3360 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3361 {
3362         struct net_device *dev = skb->dev;
3363         u32 ttl = G_TC_RTTL(skb->tc_verd);
3364         int result = TC_ACT_OK;
3365         struct Qdisc *q;
3366
3367         if (unlikely(MAX_RED_LOOP < ttl++)) {
3368                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3369                                      skb->skb_iif, dev->ifindex);
3370                 return TC_ACT_SHOT;
3371         }
3372
3373         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3374         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3375
3376         q = rxq->qdisc;
3377         if (q != &noop_qdisc) {
3378                 spin_lock(qdisc_lock(q));
3379                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3380                         result = qdisc_enqueue_root(skb, q);
3381                 spin_unlock(qdisc_lock(q));
3382         }
3383
3384         return result;
3385 }
3386
3387 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3388                                          struct packet_type **pt_prev,
3389                                          int *ret, struct net_device *orig_dev)
3390 {
3391         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3392
3393         if (!rxq || rxq->qdisc == &noop_qdisc)
3394                 goto out;
3395
3396         if (*pt_prev) {
3397                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3398                 *pt_prev = NULL;
3399         }
3400
3401         switch (ing_filter(skb, rxq)) {
3402         case TC_ACT_SHOT:
3403         case TC_ACT_STOLEN:
3404                 kfree_skb(skb);
3405                 return NULL;
3406         }
3407
3408 out:
3409         skb->tc_verd = 0;
3410         return skb;
3411 }
3412 #endif
3413
3414 /**
3415  *      netdev_rx_handler_register - register receive handler
3416  *      @dev: device to register a handler for
3417  *      @rx_handler: receive handler to register
3418  *      @rx_handler_data: data pointer that is used by rx handler
3419  *
3420  *      Register a receive hander for a device. This handler will then be
3421  *      called from __netif_receive_skb. A negative errno code is returned
3422  *      on a failure.
3423  *
3424  *      The caller must hold the rtnl_mutex.
3425  *
3426  *      For a general description of rx_handler, see enum rx_handler_result.
3427  */
3428 int netdev_rx_handler_register(struct net_device *dev,
3429                                rx_handler_func_t *rx_handler,
3430                                void *rx_handler_data)
3431 {
3432         ASSERT_RTNL();
3433
3434         if (dev->rx_handler)
3435                 return -EBUSY;
3436
3437         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3438         rcu_assign_pointer(dev->rx_handler, rx_handler);
3439
3440         return 0;
3441 }
3442 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3443
3444 /**
3445  *      netdev_rx_handler_unregister - unregister receive handler
3446  *      @dev: device to unregister a handler from
3447  *
3448  *      Unregister a receive hander from a device.
3449  *
3450  *      The caller must hold the rtnl_mutex.
3451  */
3452 void netdev_rx_handler_unregister(struct net_device *dev)
3453 {
3454
3455         ASSERT_RTNL();
3456         RCU_INIT_POINTER(dev->rx_handler, NULL);
3457         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3458 }
3459 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3460
3461 /*
3462  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3463  * the special handling of PFMEMALLOC skbs.
3464  */
3465 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3466 {
3467         switch (skb->protocol) {
3468         case __constant_htons(ETH_P_ARP):
3469         case __constant_htons(ETH_P_IP):
3470         case __constant_htons(ETH_P_IPV6):
3471         case __constant_htons(ETH_P_8021Q):
3472                 return true;
3473         default:
3474                 return false;
3475         }
3476 }
3477
     /*
      * __netif_receive_skb_core - run one received skb through the receive path
      * @skb: packet to process
      * @pfmemalloc: true when the caller treats @skb as a PFMEMALLOC packet
      *
      * In order, the skb is offered to: the ptype_all taps (skipped when
      * @pfmemalloc is set), the ingress classifier (CONFIG_NET_CLS_ACT only),
      * VLAN receive processing, the rx_handler of skb->dev and finally the
      * ptype_base handlers whose type equals skb->protocol.
      *
      * Delivery is deferred by one match: each matching packet_type is
      * remembered in pt_prev and only passed to deliver_skb() once the next
      * match (or the next stage) is reached; the last one is called directly
      * through pt_prev->func at the end.
      *
      * Returns NET_RX_DROP unless one of the delivery calls overwrote ret.
      */
3478 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3479 {
3480         struct packet_type *ptype, *pt_prev;
3481         rx_handler_func_t *rx_handler;
3482         struct net_device *orig_dev;
3483         struct net_device *null_or_dev;
3484         bool deliver_exact = false;
3485         int ret = NET_RX_DROP;
3486         __be16 type;
3487
3488         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3489
3490         trace_netif_receive_skb(skb);
3491
3492         /* if we've gotten here through NAPI, check netpoll */
3493         if (netpoll_receive_skb(skb))
3494                 goto out;
3495
             /* Remember the arrival device; skb->dev is re-read on every round. */
3496         orig_dev = skb->dev;
3497
3498         skb_reset_network_header(skb);
3499         if (!skb_transport_header_was_set(skb))
3500                 skb_reset_transport_header(skb);
3501         skb_reset_mac_len(skb);
3502
3503         pt_prev = NULL;
3504
3505         rcu_read_lock();
3506
             /* Re-entered when vlan_do_receive() returns true or the rx_handler
              * returns RX_HANDLER_ANOTHER (presumably after skb->dev was
              * changed -- confirm against those helpers).
              */
3507 another_round:
3508         skb->skb_iif = skb->dev->ifindex;
3509
3510         __this_cpu_inc(softnet_data.processed);
3511
3512         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3513                 skb = vlan_untag(skb);
3514                 if (unlikely(!skb))
3515                         goto unlock;
3516         }
3517
3518 #ifdef CONFIG_NET_CLS_ACT
             /* skb flagged TC_NCLS: clear the flag and bypass both the taps
              * and the ingress hook.
              */
3519         if (skb->tc_verd & TC_NCLS) {
3520                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3521                 goto ncls;
3522         }
3523 #endif
3524
3525         if (pfmemalloc)
3526                 goto skip_taps;
3527
             /* Taps: entries bound to no device or to skb->dev see the packet. */
3528         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3529                 if (!ptype->dev || ptype->dev == skb->dev) {
3530                         if (pt_prev)
3531                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3532                         pt_prev = ptype;
3533                 }
3534         }
3535
3536 skip_taps:
3537 #ifdef CONFIG_NET_CLS_ACT
             /* handle_ing() returns NULL once it has freed the skb. */
3538         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3539         if (!skb)
3540                 goto unlock;
3541 ncls:
3542 #endif
3543
3544         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3545                 goto drop;
3546
3547         if (vlan_tx_tag_present(skb)) {
3548                 if (pt_prev) {
3549                         ret = deliver_skb(skb, pt_prev, orig_dev);
3550                         pt_prev = NULL;
3551                 }
3552                 if (vlan_do_receive(&skb))
3553                         goto another_round;
3554                 else if (unlikely(!skb))
3555                         goto unlock;
3556         }
3557
3558         rx_handler = rcu_dereference(skb->dev->rx_handler);
3559         if (rx_handler) {
3560                 if (pt_prev) {
3561                         ret = deliver_skb(skb, pt_prev, orig_dev);
3562                         pt_prev = NULL;
3563                 }
3564                 switch (rx_handler(&skb)) {
3565                 case RX_HANDLER_CONSUMED:
3566                         goto unlock;
3567                 case RX_HANDLER_ANOTHER:
3568                         goto another_round;
3569                 case RX_HANDLER_EXACT:
3570                         deliver_exact = true;
                             /* fall through */
3571                 case RX_HANDLER_PASS:
3572                         break;
3573                 default:
3574                         BUG();
3575                 }
3576         }
3577
3578         if (vlan_tx_nonzero_tag_present(skb))
3579                 skb->pkt_type = PACKET_OTHERHOST;
3580
3581         /* deliver only exact match when indicated */
3582         null_or_dev = deliver_exact ? skb->dev : NULL;
3583
             /* Protocol handlers: the type must match and the handler must be
              * bound to null_or_dev, skb->dev or orig_dev.
              */
3584         type = skb->protocol;
3585         list_for_each_entry_rcu(ptype,
3586                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3587                 if (ptype->type == type &&
3588                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3589                      ptype->dev == orig_dev)) {
3590                         if (pt_prev)
3591                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3592                         pt_prev = ptype;
3593                 }
3594         }
3595
             /* The last pending handler is called directly instead of through
              * deliver_skb().  With no handler at all the skb is dropped.
              */
3596         if (pt_prev) {
3597                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3598                         goto drop;
3599                 else
3600                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3601         } else {
                     /* Also reached by goto from the pfmemalloc protocol check
                      * and from the skb_orphan_frags() failure above.
                      */
3602 drop:
3603                 atomic_long_inc(&skb->dev->rx_dropped);
3604                 kfree_skb(skb);
3605                 /* Jamal, now you will not able to escape explaining
3606                  * me how you were going to use this. :-)
3607                  */
3608                 ret = NET_RX_DROP;
3609         }
3610
3611 unlock:
3612         rcu_read_unlock();
3613 out:
3614         return ret;
3615 }
3616
3617 static int __netif_receive_skb(struct sk_buff *skb)
3618 {
3619         int ret;
3620
3621         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3622                 unsigned long pflags = current->flags;
3623
3624                 /*
3625                  * PFMEMALLOC skbs are special, they should
3626                  * - be delivered to SOCK_MEMALLOC sockets only
3627                  * - stay away from userspace
3628                  * - have bounded memory usage
3629                  *
3630                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3631                  * context down to all allocation sites.
3632                  */
3633                 current->flags |= PF_MEMALLOC;
3634                 ret = __netif_receive_skb_core(skb, true);
3635                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3636         } else
3637                 ret = __netif_receive_skb_core(skb, false);
3638
3639         return ret;
3640 }
3641
3642 /**
3643  *      netif_receive_skb - process receive buffer from network
3644  *      @skb: buffer to process
3645  *
3646  *      netif_receive_skb() is the main receive data processing function.
3647  *      It always succeeds. The buffer may be dropped during processing
3648  *      for congestion control or by the protocol layers.
3649  *
3650  *      This function may only be called from softirq context and interrupts
3651  *      should be enabled.
3652  *
3653  *      Return values (usually ignored):
3654  *      NET_RX_SUCCESS: no congestion
3655  *      NET_RX_DROP: packet was dropped
3656  */
3657 int netif_receive_skb(struct sk_buff *skb)
3658 {
3659         net_timestamp_check(netdev_tstamp_prequeue, skb);
3660
3661         if (skb_defer_rx_timestamp(skb))
3662                 return NET_RX_SUCCESS;
3663
3664 #ifdef CONFIG_RPS
3665         if (static_key_false(&rps_needed)) {
3666                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3667                 int cpu, ret;
3668
3669                 rcu_read_lock();
3670
3671                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3672
3673                 if (cpu >= 0) {
3674                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3675                         rcu_read_unlock();
3676                         return ret;
3677                 }
3678                 rcu_read_unlock();
3679         }
3680 #endif
3681         return __netif_receive_skb(skb);
3682 }
3683 EXPORT_SYMBOL(netif_receive_skb);
3684
3685 /* Network device is going away, flush any packets still pending
3686  * Called with irqs disabled.
3687  */
3688 static void flush_backlog(void *arg)
3689 {
3690         struct net_device *dev = arg;
3691         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3692         struct sk_buff *skb, *tmp;
3693
3694         rps_lock(sd);
3695         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3696                 if (skb->dev == dev) {
3697                         __skb_unlink(skb, &sd->input_pkt_queue);
3698                         kfree_skb(skb);
3699                         input_queue_head_incr(sd);
3700                 }
3701         }
3702         rps_unlock(sd);
3703
3704         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3705                 if (skb->dev == dev) {
3706                         __skb_unlink(skb, &sd->process_queue);
3707                         kfree_skb(skb);
3708                         input_queue_head_incr(sd);
3709                 }
3710         }
3711 }
3712
3713 static int napi_gro_complete(struct sk_buff *skb)
3714 {
3715         struct packet_offload *ptype;
3716         __be16 type = skb->protocol;
3717         struct list_head *head = &offload_base;
3718         int err = -ENOENT;
3719
3720         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3721
3722         if (NAPI_GRO_CB(skb)->count == 1) {
3723                 skb_shinfo(skb)->gso_size = 0;
3724                 goto out;
3725         }
3726
3727         rcu_read_lock();
3728         list_for_each_entry_rcu(ptype, head, list) {
3729                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3730                         continue;
3731
3732                 err = ptype->callbacks.gro_complete(skb);
3733                 break;
3734         }
3735         rcu_read_unlock();
3736
3737         if (err) {
3738                 WARN_ON(&ptype->list == head);
3739                 kfree_skb(skb);
3740                 return NET_RX_SUCCESS;
3741         }
3742
3743 out:
3744         return netif_receive_skb(skb);
3745 }
3746
3747 /* napi->gro_list contains packets ordered by age.
3748  * youngest packets at the head of it.
3749  * Complete skbs in reverse order to reduce latencies.
3750  */
3751 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3752 {
3753         struct sk_buff *skb, *prev = NULL;
3754
3755         /* scan list and build reverse chain */
3756         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3757                 skb->prev = prev;
3758                 prev = skb;
3759         }
3760
3761         for (skb = prev; skb; skb = prev) {
3762                 skb->next = NULL;
3763
3764                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3765                         return;
3766
3767                 prev = skb->prev;
3768                 napi_gro_complete(skb);
3769                 napi->gro_count--;
3770         }
3771
3772         napi->gro_list = NULL;
3773 }
3774 EXPORT_SYMBOL(napi_gro_flush);
3775
3776 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3777 {
3778         struct sk_buff *p;
3779         unsigned int maclen = skb->dev->hard_header_len;
3780
3781         for (p = napi->gro_list; p; p = p->next) {
3782                 unsigned long diffs;
3783
3784                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3785                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3786                 if (maclen == ETH_HLEN)
3787                         diffs |= compare_ether_header(skb_mac_header(p),
3788                                                       skb_gro_mac_header(skb));
3789                 else if (!diffs)
3790                         diffs = memcmp(skb_mac_header(p),
3791                                        skb_gro_mac_header(skb),
3792                                        maclen);
3793                 NAPI_GRO_CB(p)->same_flow = !diffs;
3794                 NAPI_GRO_CB(p)->flush = 0;
3795         }
3796 }
3797
     /*
      * dev_gro_receive - try to merge a freshly received skb via GRO
      * @napi: NAPI context holding the list of skbs being aggregated
      * @skb: newly received skb
      *
      * Returns GRO_NORMAL when GRO does not apply (feature off, netpoll active,
      * skb already GSO or with a frag list, no matching offload, flush
      * requested, or the held list is full), GRO_MERGED / GRO_MERGED_FREE when
      * the protocol callback reported the skb as part of an existing flow, and
      * GRO_HELD when the skb was put on napi->gro_list as a new flow.
      */
3798 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3799 {
3800         struct sk_buff **pp = NULL;
3801         struct packet_offload *ptype;
3802         __be16 type = skb->protocol;
3803         struct list_head *head = &offload_base;
3804         int same_flow;
3805         int mac_len;
3806         enum gro_result ret;
3807
3808         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3809                 goto normal;
3810
3811         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3812                 goto normal;
3813
3814         gro_list_prepare(napi, skb);
3815
             /* Only the first offload matching the protocol and providing a
              * gro_receive callback is consulted.
              */
3816         rcu_read_lock();
3817         list_for_each_entry_rcu(ptype, head, list) {
3818                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3819                         continue;
3820
3821                 skb_set_network_header(skb, skb_gro_offset(skb));
3822                 mac_len = skb->network_header - skb->mac_header;
3823                 skb->mac_len = mac_len;
3824                 NAPI_GRO_CB(skb)->same_flow = 0;
3825                 NAPI_GRO_CB(skb)->flush = 0;
3826                 NAPI_GRO_CB(skb)->free = 0;
3827
3828                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3829                 break;
3830         }
3831         rcu_read_unlock();
3832
             /* The cursor resting on the list head means no offload matched. */
3833         if (&ptype->list == head)
3834                 goto normal;
3835
3836         same_flow = NAPI_GRO_CB(skb)->same_flow;
3837         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3838
             /* A non-NULL pp designates a held skb to take off the list and
              * complete now.
              */
3839         if (pp) {
3840                 struct sk_buff *nskb = *pp;
3841
3842                 *pp = nskb->next;
3843                 nskb->next = NULL;
3844                 napi_gro_complete(nskb);
3845                 napi->gro_count--;
3846         }
3847
3848         if (same_flow)
3849                 goto ok;
3850
3851         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3852                 goto normal;
3853
             /* Start a new flow: the skb becomes the head of the held list. */
3854         napi->gro_count++;
3855         NAPI_GRO_CB(skb)->count = 1;
3856         NAPI_GRO_CB(skb)->age = jiffies;
3857         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3858         skb->next = napi->gro_list;
3859         napi->gro_list = skb;
3860         ret = GRO_HELD;
3861
             /* If the GRO offset lies beyond the linear area, copy the missing
              * bytes from frag0 into the linear area and shrink (or drop)
              * frags[0] accordingly.
              */
3862 pull:
3863         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3864                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3865
3866                 BUG_ON(skb->end - skb->tail < grow);
3867
3868                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3869
3870                 skb->tail += grow;
3871                 skb->data_len -= grow;
3872
3873                 skb_shinfo(skb)->frags[0].page_offset += grow;
3874                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3875
                     /* frags[0] fully consumed: release it and shift the rest down */
3876                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3877                         skb_frag_unref(skb, 0);
3878                         memmove(skb_shinfo(skb)->frags,
3879                                 skb_shinfo(skb)->frags + 1,
3880                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3881                 }
3882         }
3883
3884 ok:
3885         return ret;
3886
             /* GRO not applicable: report GRO_NORMAL, but still run the pull step. */
3887 normal:
3888         ret = GRO_NORMAL;
3889         goto pull;
3890 }
3891
3892
3893 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3894 {
3895         switch (ret) {
3896         case GRO_NORMAL:
3897                 if (netif_receive_skb(skb))
3898                         ret = GRO_DROP;
3899                 break;
3900
3901         case GRO_DROP:
3902                 kfree_skb(skb);
3903                 break;
3904
3905         case GRO_MERGED_FREE:
3906                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3907                         kmem_cache_free(skbuff_head_cache, skb);
3908                 else
3909                         __kfree_skb(skb);
3910                 break;
3911
3912         case GRO_HELD:
3913         case GRO_MERGED:
3914                 break;
3915         }
3916
3917         return ret;
3918 }
3919
3920 static void skb_gro_reset_offset(struct sk_buff *skb)
3921 {
3922         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3923         const skb_frag_t *frag0 = &pinfo->frags[0];
3924
3925         NAPI_GRO_CB(skb)->data_offset = 0;
3926         NAPI_GRO_CB(skb)->frag0 = NULL;
3927         NAPI_GRO_CB(skb)->frag0_len = 0;
3928
3929         if (skb->mac_header == skb->tail &&
3930             pinfo->nr_frags &&
3931             !PageHighMem(skb_frag_page(frag0))) {
3932                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3933                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3934         }
3935 }
3936
3937 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3938 {
3939         skb_gro_reset_offset(skb);
3940
3941         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3942 }
3943 EXPORT_SYMBOL(napi_gro_receive);
3944
3945 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3946 {
3947         __skb_pull(skb, skb_headlen(skb));
3948         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3949         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3950         skb->vlan_tci = 0;
3951         skb->dev = napi->dev;
3952         skb->skb_iif = 0;
3953
3954         napi->skb = skb;
3955 }
3956
3957 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3958 {
3959         struct sk_buff *skb = napi->skb;
3960
3961         if (!skb) {
3962                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3963                 if (skb)
3964                         napi->skb = skb;
3965         }
3966         return skb;
3967 }
3968 EXPORT_SYMBOL(napi_get_frags);
3969
3970 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3971                                gro_result_t ret)
3972 {
3973         switch (ret) {
3974         case GRO_NORMAL:
3975         case GRO_HELD:
3976                 skb->protocol = eth_type_trans(skb, skb->dev);
3977
3978                 if (ret == GRO_HELD)
3979                         skb_gro_pull(skb, -ETH_HLEN);
3980                 else if (netif_receive_skb(skb))
3981                         ret = GRO_DROP;
3982                 break;
3983
3984         case GRO_DROP:
3985         case GRO_MERGED_FREE:
3986                 napi_reuse_skb(napi, skb);
3987                 break;
3988
3989         case GRO_MERGED:
3990                 break;
3991         }
3992
3993         return ret;
3994 }
3995
3996 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3997 {
3998         struct sk_buff *skb = napi->skb;
3999         struct ethhdr *eth;
4000         unsigned int hlen;
4001         unsigned int off;
4002
4003         napi->skb = NULL;
4004
4005         skb_reset_mac_header(skb);
4006         skb_gro_reset_offset(skb);
4007
4008         off = skb_gro_offset(skb);
4009         hlen = off + sizeof(*eth);
4010         eth = skb_gro_header_fast(skb, off);
4011         if (skb_gro_header_hard(skb, hlen)) {
4012                 eth = skb_gro_header_slow(skb, hlen, off);
4013                 if (unlikely(!eth)) {
4014                         napi_reuse_skb(napi, skb);
4015                         skb = NULL;
4016                         goto out;
4017                 }
4018         }
4019
4020         skb_gro_pull(skb, sizeof(*eth));
4021
4022         /*
4023          * This works because the only protocols we care about don't require
4024          * special handling.  We'll fix it up properly at the end.
4025          */
4026         skb->protocol = eth->h_proto;
4027
4028 out:
4029         return skb;
4030 }
4031
4032 gro_result_t napi_gro_frags(struct napi_struct *napi)
4033 {
4034         struct sk_buff *skb = napi_frags_skb(napi);
4035
4036         if (!skb)
4037                 return GRO_DROP;
4038
4039         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4040 }
4041 EXPORT_SYMBOL(napi_gro_frags);
4042
/*
 * net_rps_action_and_irq_enable - send any pending IPIs for RPS.
 * @sd: softnet data of the local CPU
 *
 * With CONFIG_RPS, the list at sd->rps_ipi_list is detached and, after
 * interrupts have been re-enabled, every online CPU on it is sent an IPI.
 *
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		struct softnet_data *next;

		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		for (; remsd; remsd = next) {
			/* fetch the link before the IPI is sent */
			next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				__smp_call_function_single(remsd->cpu,
							   &remsd->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
4070
     /*
      * process_backlog - poll function of the per-CPU backlog NAPI instance
      * @napi: the backlog napi_struct embedded in a softnet_data
      * @quota: maximum number of packets to process in this call
      *
      * Packets are taken from sd->process_queue and passed to
      * __netif_receive_skb() with interrupts enabled.  When that queue runs
      * dry it is refilled from sd->input_pkt_queue under rps_lock().
      * Returns the number of packets processed.
      */
4071 static int process_backlog(struct napi_struct *napi, int quota)
4072 {
4073         int work = 0;
4074         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4075
4076 #ifdef CONFIG_RPS
4077         /* Check if we have pending ipi, its better to send them now,
4078          * not waiting net_rx_action() end.
4079          */
4080         if (sd->rps_ipi_list) {
4081                 local_irq_disable();
4082                 net_rps_action_and_irq_enable(sd);
4083         }
4084 #endif
             /* The weight is refreshed from weight_p on every poll. */
4085         napi->weight = weight_p;
4086         local_irq_disable();
4087         while (work < quota) {
4088                 struct sk_buff *skb;
4089                 unsigned int qlen;
4090
                     /* Interrupts are off while dequeuing, on while the packet
                      * is being processed.
                      */
4091                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4092                         local_irq_enable();
4093                         __netif_receive_skb(skb);
4094                         local_irq_disable();
4095                         input_queue_head_incr(sd);
4096                         if (++work >= quota) {
4097                                 local_irq_enable();
4098                                 return work;
4099                         }
4100                 }
4101
                     /* Move everything queued so far over to process_queue. */
4102                 rps_lock(sd);
4103                 qlen = skb_queue_len(&sd->input_pkt_queue);
4104                 if (qlen)
4105                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4106                                                    &sd->process_queue);
4107
                     /* Fewer packets than remaining quota: complete the NAPI
                      * instance now and shrink quota so the outer loop stops
                      * once the packets just spliced have been processed.
                      */
4108                 if (qlen < quota - work) {
4109                         /*
4110                          * Inline a custom version of __napi_complete().
4111                          * only current cpu owns and manipulates this napi,
4112                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4113                          * we can use a plain write instead of clear_bit(),
4114                          * and we dont need an smp_mb() memory barrier.
4115                          */
4116                         list_del(&napi->poll_list);
4117                         napi->state = 0;
4118
4119                         quota = work + qlen;
4120                 }
4121                 rps_unlock(sd);
4122         }
4123         local_irq_enable();
4124
4125         return work;
4126 }
4127
4128 /**
4129  * __napi_schedule - schedule for receive
4130  * @n: entry to schedule
4131  *
4132  * The entry's receive function will be scheduled to run
4133  */
4134 void __napi_schedule(struct napi_struct *n)
4135 {
4136         unsigned long flags;
4137
4138         local_irq_save(flags);
4139         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4140         local_irq_restore(flags);
4141 }
4142 EXPORT_SYMBOL(__napi_schedule);
4143
4144 void __napi_complete(struct napi_struct *n)
4145 {
4146         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4147         BUG_ON(n->gro_list);
4148
4149         list_del(&n->poll_list);
4150         smp_mb__before_clear_bit();
4151         clear_bit(NAPI_STATE_SCHED, &n->state);
4152 }
4153 EXPORT_SYMBOL(__napi_complete);
4154
4155 void napi_complete(struct napi_struct *n)
4156 {
4157         unsigned long flags;
4158
4159         /*
4160          * don't let napi dequeue from the cpu poll list
4161          * just in case its running on a different cpu
4162          */
4163         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4164                 return;
4165
4166         napi_gro_flush(n, false);
4167         local_irq_save(flags);
4168         __napi_complete(n);
4169         local_irq_restore(flags);
4170 }
4171 EXPORT_SYMBOL(napi_complete);
4172
4173 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4174                     int (*poll)(struct napi_struct *, int), int weight)
4175 {
4176         INIT_LIST_HEAD(&napi->poll_list);
4177         napi->gro_count = 0;
4178         napi->gro_list = NULL;
4179         napi->skb = NULL;
4180         napi->poll = poll;
4181         napi->weight = weight;
4182         list_add(&napi->dev_list, &dev->napi_list);
4183         napi->dev = dev;
4184 #ifdef CONFIG_NETPOLL
4185         spin_lock_init(&napi->poll_lock);
4186         napi->poll_owner = -1;
4187 #endif
4188         set_bit(NAPI_STATE_SCHED, &napi->state);
4189 }
4190 EXPORT_SYMBOL(netif_napi_add);
4191
4192 void netif_napi_del(struct napi_struct *napi)
4193 {
4194         struct sk_buff *skb, *next;
4195
4196         list_del_init(&napi->dev_list);
4197         napi_free_frags(napi);
4198
4199         for (skb = napi->gro_list; skb; skb = next) {
4200                 next = skb->next;
4201                 skb->next = NULL;
4202                 kfree_skb(skb);
4203         }
4204
4205         napi->gro_list = NULL;
4206         napi->gro_count = 0;
4207 }
4208 EXPORT_SYMBOL(netif_napi_del);
4209
     /*
      * net_rx_action - poll the NAPI instances queued on this CPU
      * @h: softirq action (not used in the body)
      *
      * Presumably registered as the NET_RX_SOFTIRQ handler -- the registration
      * is not visible here.  Polls the entries of sd->poll_list one after
      * another until the list is empty, the budget (netdev_budget) is used up,
      * or jiffies has passed the starting value plus 2.  In the latter two
      * cases time_squeeze is counted and the softirq is raised again.
      */
4210 static void net_rx_action(struct softirq_action *h)
4211 {
4212         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4213         unsigned long time_limit = jiffies + 2;
4214         int budget = netdev_budget;
4215         void *have;
4216
4217         local_irq_disable();
4218
4219         while (!list_empty(&sd->poll_list)) {
4220                 struct napi_struct *n;
4221                 int work, weight;
4222
4223                 /* If softirq window is exhausted then punt.
4224                  * Allow this to run for 2 jiffies since which will allow
4225                  * an average latency of 1.5/HZ.
4226                  */
4227                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4228                         goto softnet_break;
4229
4230                 local_irq_enable();
4231
4232                 /* Even though interrupts have been re-enabled, this
4233                  * access is safe because interrupts can only add new
4234                  * entries to the tail of this list, and only ->poll()
4235                  * calls can remove this head entry from the list.
4236                  */
4237                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4238
4239                 have = netpoll_poll_lock(n);
4240
4241                 weight = n->weight;
4242
4243                 /* This NAPI_STATE_SCHED test is for avoiding a race
4244                  * with netpoll's poll_napi().  Only the entity which
4245                  * obtains the lock and sees NAPI_STATE_SCHED set will
4246                  * actually make the ->poll() call.  Therefore we avoid
4247                  * accidentally calling ->poll() when NAPI is not scheduled.
4248                  */
4249                 work = 0;
4250                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4251                         work = n->poll(n, weight);
4252                         trace_napi_poll(n);
4253                 }
4254
                     /* A poll function must never report more work than its weight. */
4255                 WARN_ON_ONCE(work > weight);
4256
4257                 budget -= work;
4258
4259                 local_irq_disable();
4260
4261                 /* Drivers must not modify the NAPI state if they
4262                  * consume the entire weight.  In such cases this code
4263                  * still "owns" the NAPI instance and therefore can
4264                  * move the instance around on the list at-will.
4265                  */
4266                 if (unlikely(work == weight)) {
4267                         if (unlikely(napi_disable_pending(n))) {
                                     /* napi_complete() is called with interrupts
                                      * enabled.
                                      */
4268                                 local_irq_enable();
4269                                 napi_complete(n);
4270                                 local_irq_disable();
4271                         } else {
4272                                 if (n->gro_list) {
4273                                         /* flush too old packets
4274                                          * If HZ < 1000, flush all packets.
4275                                          */
4276                                         local_irq_enable();
4277                                         napi_gro_flush(n, HZ >= 1000);
4278                                         local_irq_disable();
4279                                 }
                                     /* Requeue at the tail so the other entries
                                      * are polled first.
                                      */
4280                                 list_move_tail(&n->poll_list, &sd->poll_list);
4281                         }
4282                 }
4283
4284                 netpoll_poll_unlock(have);
4285         }
4286 out:
             /* Sends pending RPS IPIs and re-enables local interrupts. */
4287         net_rps_action_and_irq_enable(sd);
4288
4289 #ifdef CONFIG_NET_DMA
4290         /*
4291          * There may not be any more sk_buffs coming right now, so push
4292          * any pending DMA copies to hardware
4293          */
4294         dma_issue_pending_all();
4295 #endif
4296
4297         return;
4298
             /* Budget or time window used up: count the squeeze and ask for the
              * softirq to run again.
              */
4299 softnet_break:
4300         sd->time_squeeze++;
4301         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4302         goto out;
4303 }
4304
4305 static gifconf_func_t *gifconf_list[NPROTO];    /* one handler slot per address family; filled by register_gifconf() */
4306
4307 /**
4308  *      register_gifconf        -       register a SIOCGIF handler
4309  *      @family: Address family
4310  *      @gifconf: Function handler
4311  *
4312  *      Register protocol dependent address dumping routines. The handler
4313  *      that is passed must not be freed or reused until it has been replaced
4314  *      by another handler.
      *
      *      Whatever handler was previously stored for @family is overwritten.
      *      Returns 0 on success, or -EINVAL if @family is not below NPROTO.
4315  */
4316 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4317 {
4318         if (family >= NPROTO)
4319                 return -EINVAL;
4320         gifconf_list[family] = gifconf;
4321         return 0;
4322 }
4323 EXPORT_SYMBOL(register_gifconf);
4324
4325
4326 /*
4327  *      Map an interface index to its name (SIOCGIFNAME)
4328  */
4329
4330 /*
4331  *      We need this ioctl for efficient implementation of the
4332  *      if_indextoname() function required by the IPv6 API.  Without
4333  *      it, we would have to search all the interfaces to find a
4334  *      match.  --pb
      *
      *      The index is taken from ifr_ifindex of the user's ifreq and the
      *      name is written back into ifr_name of the same structure.
      *      The lookup runs under rcu_read_lock() and is repeated whenever
      *      read_seqcount_retry() reports that devnet_rename_seq moved since
      *      the lookup began (presumably a concurrent rename; the writer side
      *      is not visible in this part of the file).
      *
      *      Returns 0 on success, -EFAULT if the user buffer cannot be read
      *      or written, -ENODEV if no device in @net has the given index.
4335  */
4336
4337 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4338 {
4339         struct net_device *dev;
4340         struct ifreq ifr;
4341         unsigned seq;
4342
4343         /*
4344          *      Fetch the caller's info block.
4345          */
4346
4347         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4348                 return -EFAULT;
4349
4350 retry:
4351         seq = read_seqcount_begin(&devnet_rename_seq);
4352         rcu_read_lock();
4353         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4354         if (!dev) {
4355                 rcu_read_unlock();
4356                 return -ENODEV;
4357         }
4358
4359         strcpy(ifr.ifr_name, dev->name);
4360         rcu_read_unlock();
             /* The copy may have raced with a writer; start over if so. */
4361         if (read_seqcount_retry(&devnet_rename_seq, seq))
4362                 goto retry;
4363
4364         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4365                 return -EFAULT;
4366         return 0;
4367 }
4368
4369 /*
4370  *      Perform a SIOCGIFCONF call. This structure will change
4371  *      size eventually, and there is nothing I can do about it.
4372  *      Thus we will need a 'compatibility mode'.
      *
      *      Every handler registered in gifconf_list[] is called once for
      *      every device in @net. Each call is handed the not yet used tail
      *      of the user buffer (ifc_buf + total, ifc_len - total) and its
      *      non-negative return value is added to the running total.
      *      If ifc_buf is NULL the handlers are called with a NULL buffer
      *      and length 0 instead; the total is still accumulated
      *      (presumably this is the "how much space do I need" query;
      *      that depends on the handlers, which are not visible here).
      *
      *      On return ifc_len in the user's struct ifconf holds the total.
      *      Returns 0 on success and -EFAULT if the control block cannot be
      *      copied in or out, or if any handler returns a negative value.
4373  */
4374
4375 static int dev_ifconf(struct net *net, char __user *arg)
4376 {
4377         struct ifconf ifc;
4378         struct net_device *dev;
4379         char __user *pos;
4380         int len;
4381         int total;
4382         int i;
4383
4384         /*
4385          *      Fetch the caller's info block.
4386          */
4387
4388         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4389                 return -EFAULT;
4390
4391         pos = ifc.ifc_buf;
4392         len = ifc.ifc_len;
4393
4394         /*
4395          *      Loop over the interfaces, and write an info block for each.
4396          */
4397
4398         total = 0;
4399         for_each_netdev(net, dev) {
4400                 for (i = 0; i < NPROTO; i++) {
4401                         if (gifconf_list[i]) {
4402                                 int done;
4403                                 if (!pos)
4404                                         done = gifconf_list[i](dev, NULL, 0);
4405                                 else
4406                                         done = gifconf_list[i](dev, pos + total,
4407                                                                len - total);
                                     /* Any negative handler result is reported as -EFAULT. */
4408                                 if (done < 0)
4409                                         return -EFAULT;
4410                                 total += done;
4411                         }
4412                 }
4413         }
4414
4415         /*
4416          *      All done.  Write the updated control block back to the caller.
4417          */
4418         ifc.ifc_len = total;
4419
4420         /*
4421          *      Both BSD and Solaris return 0 here, so we do too.
4422          */
4423         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4424 }
4425
4426 #ifdef CONFIG_PROC_FS
4427
4428 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
     /*
      * The /proc/net/dev seq_file position packs two values into one number:
      * the index of a dev_name_head[] bucket in the bits above BUCKET_SPACE
      * and an offset inside that bucket in the low BUCKET_SPACE bits.
      * Offsets are 1-based (see dev_from_same_bucket()); position 0 is
      * reserved by dev_seq_start() for the header line.
      */
4429
4430 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4431 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4432 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4433
4434 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4435 {
             /*
              * Return the device found at the offset encoded in *pos inside
              * the dev_name_head[] bucket encoded in *pos, or NULL if the
              * bucket holds fewer entries. The first entry has offset 1,
              * so an offset of 0 never matches. *pos is only read here.
              */
4436         struct net *net = seq_file_net(seq);
4437         struct net_device *dev;
4438         struct hlist_node *p;
4439         struct hlist_head *h;
4440         unsigned int count = 0, offset = get_offset(*pos);
4441
4442         h = &net->dev_name_head[get_bucket(*pos)];
4443         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4444                 if (++count == offset)
4445                         return dev;
4446         }
4447
4448         return NULL;
4449 }
4450
4451 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4452 {
             /*
              * Return the device at position *pos. If the current bucket has
              * no entry at that offset, *pos is moved to offset 1 of the next
              * bucket and the lookup is retried, until the bucket index
              * reaches NETDEV_HASHENTRIES. In that case NULL is returned and
              * *pos is left pointing at the out-of-range bucket.
              */
4453         struct net_device *dev;
4454         unsigned int bucket;
4455
4456         do {
4457                 dev = dev_from_same_bucket(seq, pos);
4458                 if (dev)
4459                         return dev;
4460
4461                 bucket = get_bucket(*pos) + 1;
4462                 *pos = set_bucket_offset(bucket, 1);
4463         } while (bucket < NETDEV_HASHENTRIES);
4464
4465         return NULL;
4466 }
4467
4468 /*
4469  *      This is invoked by the /proc filesystem handler to display a device
4470  *      in detail.
      *
      *      Takes rcu_read_lock() on every path, including the ones that
      *      return NULL; the matching unlock is in dev_seq_stop().
      *      Position 0 yields SEQ_START_TOKEN (the header line), a position
      *      whose bucket index is out of range yields NULL, anything else is
      *      resolved by dev_from_bucket(), which may advance *pos.
4471  */
4472 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4473         __acquires(RCU)
4474 {
4475         rcu_read_lock();
4476         if (!*pos)
4477                 return SEQ_START_TOKEN;
4478
4479         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4480                 return NULL;
4481
4482         return dev_from_bucket(seq, pos);
4483 }
4484
4485 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4486 {
             /*
              * Step to the following position and look it up. Incrementing
              * *pos bumps the in-bucket offset; dev_from_bucket() moves on to
              * later buckets when that offset does not exist. @v is unused.
              */
4487         ++*pos;
4488         return dev_from_bucket(seq, pos);
4489 }
4490
4491 void dev_seq_stop(struct seq_file *seq, void *v)
4492         __releases(RCU)
4493 {
             /* Drops the RCU read lock taken in dev_seq_start(). */
4494         rcu_read_unlock();
4495 }
4496
4497 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4498 {
             /*
              * Emit one /proc/net/dev line for @dev. The counters come from
              * dev_get_stats(), with @temp as caller-provided storage.
              * Several columns are sums of more than one counter:
              *   receive "drop"  = rx_dropped + rx_missed_errors
              *   receive "frame" = rx_length_errors + rx_over_errors +
              *                     rx_crc_errors + rx_frame_errors
              *   transmit "carrier" = tx_carrier_errors + tx_aborted_errors +
              *                     tx_window_errors + tx_heartbeat_errors
              * (column names as printed by dev_seq_show()).
              */
4499         struct rtnl_link_stats64 temp;
4500         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4501
4502         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4503                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4504                    dev->name, stats->rx_bytes, stats->rx_packets,
4505                    stats->rx_errors,
4506                    stats->rx_dropped + stats->rx_missed_errors,
4507                    stats->rx_fifo_errors,
4508                    stats->rx_length_errors + stats->rx_over_errors +
4509                     stats->rx_crc_errors + stats->rx_frame_errors,
4510                    stats->rx_compressed, stats->multicast,
4511                    stats->tx_bytes, stats->tx_packets,
4512                    stats->tx_errors, stats->tx_dropped,
4513                    stats->tx_fifo_errors, stats->collisions,
4514                    stats->tx_carrier_errors +
4515                     stats->tx_aborted_errors +
4516                     stats->tx_window_errors +
4517                     stats->tx_heartbeat_errors,
4518                    stats->tx_compressed);
4519 }
4520
4521 /*
4522  *      Called from the PROCfs module. This now uses the new arbitrary sized
4523  *      /proc/net interface to create /proc/net/dev
      *
      *      @v is either SEQ_START_TOKEN, for which the two header lines are
      *      printed, or a struct net_device pointer, for which one line of
      *      statistics is printed. Always returns 0.
4524  */
4525 static int dev_seq_show(struct seq_file *seq, void *v)
4526 {
4527         if (v == SEQ_START_TOKEN)
4528                 seq_puts(seq, "Inter-|   Receive                            "
4529                               "                    |  Transmit\n"
4530                               " face |bytes    packets errs drop fifo frame "
4531                               "compressed multicast|bytes    packets errs "
4532                               "drop fifo colls carrier compressed\n");
4533         else
4534                 dev_seq_printf_stats(seq, v);
4535         return 0;
4536 }
4537
4538 static struct softnet_data *softnet_get_online(loff_t *pos)
4539 {
             /*
              * Treat *pos as a CPU number: advance it to the first online CPU
              * at or after its current value and return that CPU's
              * softnet_data. Returns NULL, with *pos == nr_cpu_ids or beyond,
              * when no such CPU is left.
              */
4540         struct softnet_data *sd = NULL;
4541
4542         while (*pos < nr_cpu_ids)
4543                 if (cpu_online(*pos)) {
4544                         sd = &per_cpu(softnet_data, *pos);
4545                         break;
4546                 } else
4547                         ++*pos;
4548         return sd;
4549 }
4550
4551 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4552 {
             /* First online CPU at or after *pos; no header token, no locking. */
4553         return softnet_get_online(pos);
4554 }
4555
4556 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4557 {
             /* Move past the current CPU, then skip any offline ones. */
4558         ++*pos;
4559         return softnet_get_online(pos);
4560 }
4561
4562 static void softnet_seq_stop(struct seq_file *seq, void *v)
4563 {
             /* Nothing to undo: softnet_seq_start() acquires nothing. */
4564 }
4565
4566 static int softnet_seq_show(struct seq_file *seq, void *v)
4567 {
             /*
              * Print one line for the CPU whose softnet_data is @v: ten
              * 8-digit hex columns. Columns 1-3 are processed, dropped and
              * time_squeeze, columns 4-8 are always zero (kept as
              * placeholders), columns 9-10 are cpu_collision and
              * received_rps. Always returns 0.
              */
4568         struct softnet_data *sd = v;
4569
4570         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4571                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4572                    0, 0, 0, 0, /* was fastroute */
4573                    sd->cpu_collision, sd->received_rps);
4574         return 0;
4575 }
4576
4577 static const struct seq_operations dev_seq_ops = {
             /* Iterator behind the "dev" proc entry (see dev_seq_open()). */
4578         .start = dev_seq_start,
4579         .next  = dev_seq_next,
4580         .stop  = dev_seq_stop,
4581         .show  = dev_seq_show,
4582 };
4583
4584 static int dev_seq_open(struct inode *inode, struct file *file)
4585 {
             /*
              * Open with the namespace-aware helper so that dev_seq_ops can
              * retrieve the namespace through seq_file_net().
              */
4586         return seq_open_net(inode, file, &dev_seq_ops,
4587                             sizeof(struct seq_net_private));
4588 }
4589
4590 static const struct file_operations dev_seq_fops = {
             /* File operations registered for the "dev" proc entry. */
4591         .owner   = THIS_MODULE,
4592         .open    = dev_seq_open,
4593         .read    = seq_read,
4594         .llseek  = seq_lseek,
4595         .release = seq_release_net,     /* pairs with seq_open_net() in dev_seq_open() */
4596 };
4597
4598 static const struct seq_operations softnet_seq_ops = {
             /* Iterator behind the "softnet_stat" proc entry: one record per online CPU. */
4599         .start = softnet_seq_start,
4600         .next  = softnet_seq_next,
4601         .stop  = softnet_seq_stop,
4602         .show  = softnet_seq_show,
4603 };
4604
4605 static int softnet_seq_open(struct inode *inode, struct file *file)
4606 {
             /* Plain seq_open(): the softnet iterator does not look at the namespace. */
4607         return seq_open(file, &softnet_seq_ops);
4608 }
4609
4610 static const struct file_operations softnet_seq_fops = {
             /* File operations registered for the "softnet_stat" proc entry. */
4611         .owner   = THIS_MODULE,
4612         .open    = softnet_seq_open,
4613         .read    = seq_read,
4614         .llseek  = seq_lseek,
4615         .release = seq_release,         /* pairs with seq_open() in softnet_seq_open() */
4616 };
4617
4618 static void *ptype_get_idx(loff_t pos)
4619 {
             /*
              * Return the packet_type with 0-based index @pos, or NULL if
              * there are not that many. Entries are counted in a fixed order:
              * first everything on ptype_all, then the ptype_base[] buckets
              * from 0 to PTYPE_HASH_SIZE - 1. The lists are walked with the
              * _rcu iterators; the visible callers (ptype_seq_start/next)
              * run between rcu_read_lock() and rcu_read_unlock().
              */
4620         struct packet_type *pt = NULL;
4621         loff_t i = 0;
4622         int t;
4623
4624         list_for_each_entry_rcu(pt, &ptype_all, list) {
4625                 if (i == pos)
4626                         return pt;
4627                 ++i;
4628         }
4629
4630         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4631                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4632                         if (i == pos)
4633                                 return pt;
4634                         ++i;
4635                 }
4636         }
4637         return NULL;
4638 }
4639
4640 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4641         __acquires(RCU)
4642 {
             /*
              * Position 0 is the header token; position N > 0 is entry N - 1
              * in ptype_get_idx() order. The RCU read lock taken here is
              * dropped in ptype_seq_stop().
              */
4643         rcu_read_lock();
4644         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4645 }
4646
4647 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4648 {
             /*
              * Return the entry after @v in the same order ptype_get_idx()
              * uses (ptype_all first, then ptype_base[0..]), or NULL at the
              * end. Unlike ptype_get_idx() this does not recount from the
              * start: it follows @v's own list pointer and works out which
              * list @v is on from its type.
              */
4649         struct packet_type *pt;
4650         struct list_head *nxt;
4651         int hash;
4652
4653         ++*pos;
4654         if (v == SEQ_START_TOKEN)
4655                 return ptype_get_idx(0);
4656
4657         pt = v;
4658         nxt = pt->list.next;
             /* An ETH_P_ALL entry is taken to live on ptype_all. */
4659         if (pt->type == htons(ETH_P_ALL)) {
4660                 if (nxt != &ptype_all)
4661                         goto found;
                     /* End of ptype_all: continue with the first hash bucket. */
4662                 hash = 0;
4663                 nxt = ptype_base[0].next;
4664         } else
4665                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4666
             /*
              * nxt pointing at the bucket head means the bucket is exhausted
              * (or empty); move on to the following bucket.
              */
4667         while (nxt == &ptype_base[hash]) {
4668                 if (++hash >= PTYPE_HASH_SIZE)
4669                         return NULL;
4670                 nxt = ptype_base[hash].next;
4671         }
4672 found:
4673         return list_entry(nxt, struct packet_type, list);
4674 }
4675
4676 static void ptype_seq_stop(struct seq_file *seq, void *v)
4677         __releases(RCU)
4678 {
             /* Drops the RCU read lock taken in ptype_seq_start(). */
4679         rcu_read_unlock();
4680 }
4681
4682 static int ptype_seq_show(struct seq_file *seq, void *v)
4683 {
             /*
              * Print the header for SEQ_START_TOKEN, otherwise one line for
              * the packet_type @v: its type ("ALL " for ETH_P_ALL, else four
              * hex digits), the bound device name (empty if none) and the
              * handler function. An entry bound to a device of a different
              * network namespace than this seq_file's produces no output.
              * Always returns 0.
              */
4684         struct packet_type *pt = v;
4685
4686         if (v == SEQ_START_TOKEN)
4687                 seq_puts(seq, "Type Device      Function\n");
4688         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4689                 if (pt->type == htons(ETH_P_ALL))
4690                         seq_puts(seq, "ALL ");
4691                 else
4692                         seq_printf(seq, "%04x", ntohs(pt->type));
4693
4694                 seq_printf(seq, " %-8s %pF\n",
4695                            pt->dev ? pt->dev->name : "", pt->func);
4696         }
4697
4698         return 0;
4699 }
4700
4701 static const struct seq_operations ptype_seq_ops = {
             /* Iterator behind the "ptype" proc entry (see ptype_seq_open()). */
4702         .start = ptype_seq_start,
4703         .next  = ptype_seq_next,
4704         .stop  = ptype_seq_stop,
4705         .show  = ptype_seq_show,
4706 };
4707
4708 static int ptype_seq_open(struct inode *inode, struct file *file)
4709 {
             /*
              * Namespace-aware open: ptype_seq_show() filters entries with
              * seq_file_net().
              */
4710         return seq_open_net(inode, file, &ptype_seq_ops,
4711                         sizeof(struct seq_net_private));
4712 }
4713
4714 static const struct file_operations ptype_seq_fops = {
             /* File operations registered for the "ptype" proc entry. */
4715         .owner   = THIS_MODULE,
4716         .open    = ptype_seq_open,
4717         .read    = seq_read,
4718         .llseek  = seq_lseek,
4719         .release = seq_release_net,     /* pairs with seq_open_net() in ptype_seq_open() */
4720 };
4721
4722
4723 static int __net_init dev_proc_net_init(struct net *net)
4724 {
             /*
              * Create the "dev", "softnet_stat" and "ptype" proc entries for
              * @net, all read-only (S_IRUGO), then call wext_proc_init().
              * If any step fails, the entries created so far are removed in
              * reverse order. Returns 0 on success; every failure, including
              * one from wext_proc_init(), is reported as -ENOMEM.
              */
4725         int rc = -ENOMEM;
4726
4727         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4728                 goto out;
4729         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4730                 goto out_dev;
4731         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4732                 goto out_softnet;
4733
4734         if (wext_proc_init(net))
4735                 goto out_ptype;
4736         rc = 0;
4737 out:
4738         return rc;
             /* Error unwinding: each label falls through to the ones below it. */
4739 out_ptype:
4740         proc_net_remove(net, "ptype");
4741 out_softnet:
4742         proc_net_remove(net, "softnet_stat");
4743 out_dev:
4744         proc_net_remove(net, "dev");
4745         goto out;
4746 }
4747
4748 static void __net_exit dev_proc_net_exit(struct net *net)
4749 {
             /* Undo dev_proc_net_init() for @net, in reverse order of setup. */
4750         wext_proc_exit(net);
4751
4752         proc_net_remove(net, "ptype");
4753         proc_net_remove(net, "softnet_stat");
4754         proc_net_remove(net, "dev");
4755 }
4756
4757 static struct pernet_operations __net_initdata dev_proc_ops = {
             /* Setup/teardown hooks handed to register_pernet_subsys() in dev_proc_init(). */
4758         .init = dev_proc_net_init,
4759         .exit = dev_proc_net_exit,
4760 };
4761
4762 static int __init dev_proc_init(void)
4763 {
             /*
              * Register the proc setup/teardown hooks and return the result
              * of the registration. Without CONFIG_PROC_FS this function is
              * replaced by a macro that evaluates to 0.
              */
4764         return register_pernet_subsys(&dev_proc_ops);
4765 }
4766 #else
4767 #define dev_proc_init() 0
4768 #endif  /* CONFIG_PROC_FS */
4769
4770
4771 struct netdev_upper {
             /* One "upper device" link, kept on the lower device's upper_dev_list. */
4772         struct net_device *dev;         /* the upper device; a reference is held while linked */
4773         bool master;                    /* true if this is the master link */
4774         struct list_head list;          /* entry in the lower device's upper_dev_list */
4775         struct rcu_head rcu;            /* used by kfree_rcu() on unlink */
4776         struct list_head search_list;   /* scratch linkage for __netdev_search_upper_dev(); empty when idle */
4777 };
4778
4779 static void __append_search_uppers(struct list_head *search_list,
4780                                    struct net_device *dev)
4781 {
             /*
              * Queue every upper link of @dev at the tail of @search_list.
              * A link whose search_list node is not empty is already queued
              * and is skipped, so no link is queued twice.
              */
4782         struct netdev_upper *upper;
4783
4784         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4785                 /* check if this upper is not already in search list */
4786                 if (list_empty(&upper->search_list))
4787                         list_add_tail(&upper->search_list, search_list);
4788         }
4789 }
4790
4791 static bool __netdev_search_upper_dev(struct net_device *dev,
4792                                       struct net_device *upper_dev)
4793 {
             /*
              * Return true if @upper_dev can be reached from @dev by following
              * upper links any number of levels up, false otherwise.
              *
              * The walk uses a work list on the stack: it starts with the
              * direct uppers of @dev, and while the list is being traversed
              * the uppers of each visited device are appended to its tail, so
              * the traversal naturally continues into them.
              */
4794         LIST_HEAD(search_list);
4795         struct netdev_upper *upper;
4796         struct netdev_upper *tmp;
4797         bool ret = false;
4798
4799         __append_search_uppers(&search_list, dev);
4800         list_for_each_entry(upper, &search_list, search_list) {
4801                 if (upper->dev == upper_dev) {
4802                         ret = true;
4803                         break;
4804                 }
4805                 __append_search_uppers(&search_list, upper->dev);
4806         }
             /*
              * Put every queued node back into the "empty" state that
              * __append_search_uppers() tests for, ready for the next search.
              */
4807         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4808                 INIT_LIST_HEAD(&upper->search_list);
4809         return ret;
4810 }
4811
4812 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4813                                                 struct net_device *upper_dev)
4814 {
             /*
              * Return the link on @dev's upper_dev_list that points at
              * @upper_dev, or NULL if there is none. Only direct links of
              * @dev are examined.
              */
4815         struct netdev_upper *upper;
4816
4817         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4818                 if (upper->dev == upper_dev)
4819                         return upper;
4820         }
4821         return NULL;
4822 }
4823
4824 /**
4825  * netdev_has_upper_dev - Check if device is linked to an upper device
4826  * @dev: device
4827  * @upper_dev: upper device to check
4828  *
4829  * Find out if a device is linked to specified upper device and return true
4830  * in case it is. Note that this checks only immediate upper device,
4831  * not through a complete stack of devices. The caller must hold the RTNL lock.
4832  */
4833 bool netdev_has_upper_dev(struct net_device *dev,
4834                           struct net_device *upper_dev)
4835 {
4836         ASSERT_RTNL();
4837
             /* The link pointer (or NULL) is converted to bool by the return type. */
4838         return __netdev_find_upper(dev, upper_dev);
4839 }
4840 EXPORT_SYMBOL(netdev_has_upper_dev);
4841
4842 /**
4843  * netdev_has_any_upper_dev - Check if device is linked to some device
4844  * @dev: device
4845  *
4846  * Find out if a device is linked to an upper device and return true in case
4847  * it is. The caller must hold the RTNL lock.
      *
      * Master and non-master links both count.
4848  */
4849 bool netdev_has_any_upper_dev(struct net_device *dev)
4850 {
4851         ASSERT_RTNL();
4852
4853         return !list_empty(&dev->upper_dev_list);
4854 }
4855 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4856
4857 /**
4858  * netdev_master_upper_dev_get - Get master upper device
4859  * @dev: device
4860  *
4861  * Find a master upper device and return pointer to it or NULL in case
4862  * it's not there. The caller must hold the RTNL lock.
      *
      * No additional reference is taken on the returned device.
4863  */
4864 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4865 {
4866         struct netdev_upper *upper;
4867
4868         ASSERT_RTNL();
4869
4870         if (list_empty(&dev->upper_dev_list))
4871                 return NULL;
4872
             /*
              * Only the first entry needs checking: __netdev_upper_dev_link()
              * always inserts a master link at the head of the list.
              */
4873         upper = list_first_entry(&dev->upper_dev_list,
4874                                  struct netdev_upper, list);
4875         if (likely(upper->master))
4876                 return upper->dev;
4877         return NULL;
4878 }
4879 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4880
4881 /**
4882  * netdev_master_upper_dev_get_rcu - Get master upper device
4883  * @dev: device
4884  *
4885  * Find a master upper device and return pointer to it or NULL in case
4886  * it's not there. The caller must hold the RCU read lock.
      *
      * Same lookup as netdev_master_upper_dev_get(), but the first list entry
      * is fetched with list_first_or_null_rcu() and RTNL is not asserted.
4887  */
4888 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4889 {
4890         struct netdev_upper *upper;
4891
4892         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4893                                        struct netdev_upper, list);
4894         if (upper && likely(upper->master))
4895                 return upper->dev;
4896         return NULL;
4897 }
4898 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4899
4900 static int __netdev_upper_dev_link(struct net_device *dev,
4901                                    struct net_device *upper_dev, bool master)
4902 {
             /*
              * Common implementation of netdev_upper_dev_link() and
              * netdev_master_upper_dev_link(): record @upper_dev as an upper
              * device of @dev, as master link if @master is true.
              *
              * Returns 0 on success, or:
              *   -EBUSY   @dev and @upper_dev are the same device, @dev is
              *            already reachable as an upper of @upper_dev (the
              *            link would form a loop), or a master link was
              *            requested and @dev already has a master
              *   -EEXIST  @dev is already directly linked to @upper_dev
              *   -ENOMEM  the link structure could not be allocated
              * On success a reference on @upper_dev is taken with dev_hold().
              */
4903         struct netdev_upper *upper;
4904
4905         ASSERT_RTNL();
4906
4907         if (dev == upper_dev)
4908                 return -EBUSY;
4909
4910         /* To prevent loops, check if dev is not upper device to upper_dev. */
4911         if (__netdev_search_upper_dev(upper_dev, dev))
4912                 return -EBUSY;
4913
4914         if (__netdev_find_upper(dev, upper_dev))
4915                 return -EEXIST;
4916
4917         if (master && netdev_master_upper_dev_get(dev))
4918                 return -EBUSY;
4919
4920         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4921         if (!upper)
4922                 return -ENOMEM;
4923
4924         upper->dev = upper_dev;
4925         upper->master = master;
             /* Start in the "not queued" state __append_search_uppers() checks for. */
4926         INIT_LIST_HEAD(&upper->search_list);
4927
4928         /* Ensure that master upper link is always the first item in list. */
4929         if (master)
4930                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4931         else
4932                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4933         dev_hold(upper_dev);
4934
4935         return 0;
4936 }
4937
4938 /**
4939  * netdev_upper_dev_link - Add a link to the upper device
4940  * @dev: device
4941  * @upper_dev: new upper device
4942  *
4943  * Adds a link to device which is upper to this one. The caller must hold
4944  * the RTNL lock. On a failure a negative errno code is returned.
4945  * On success the reference counts are adjusted and the function
4946  * returns zero.
      *
      * The link is created as a non-master link.
4947  */
4948 int netdev_upper_dev_link(struct net_device *dev,
4949                           struct net_device *upper_dev)
4950 {
4951         return __netdev_upper_dev_link(dev, upper_dev, false);
4952 }
4953 EXPORT_SYMBOL(netdev_upper_dev_link);
4954
4955 /**
4956  * netdev_master_upper_dev_link - Add a master link to the upper device
4957  * @dev: device
4958  * @upper_dev: new upper device
4959  *
4960  * Adds a link to device which is upper to this one. In this case, only
4961  * one master upper device can be linked, although other non-master devices
4962  * might be linked as well. The caller must hold the RTNL lock.
4963  * On a failure a negative errno code is returned. On success the reference
4964  * counts are adjusted and the function returns zero.
      *
      * If @dev already has a master upper device the call fails with -EBUSY.
4965  */
4966 int netdev_master_upper_dev_link(struct net_device *dev,
4967                                  struct net_device *upper_dev)
4968 {
4969         return __netdev_upper_dev_link(dev, upper_dev, true);
4970 }
4971 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4972
4973 /**
4974  * netdev_upper_dev_unlink - Removes a link to upper device
4975  * @dev: device
4976  * @upper_dev: upper device to unlink
4977  *
4978  * Removes a link to device which is upper to this one. The caller must hold
4979  * the RTNL lock.
      *
      * Does nothing if @dev has no direct link to @upper_dev. Otherwise the
      * link is removed from the list, the reference on @upper_dev is dropped
      * and the link structure is freed through kfree_rcu().
4980  */
4981 void netdev_upper_dev_unlink(struct net_device *dev,
4982                              struct net_device *upper_dev)
4983 {
4984         struct netdev_upper *upper;
4985
4986         ASSERT_RTNL();
4987
4988         upper = __netdev_find_upper(dev, upper_dev);
4989         if (!upper)
4990                 return;
4991         list_del_rcu(&upper->list);
4992         dev_put(upper_dev);
4993         kfree_rcu(upper, rcu);
4994 }
4995 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4996
4997 static void dev_change_rx_flags(struct net_device *dev, int flags)
4998 {
             /*
              * Pass @flags to the driver's ndo_change_rx_flags hook. Nothing
              * is done when the device is not IFF_UP or the driver does not
              * provide the hook.
              */
4999         const struct net_device_ops *ops = dev->netdev_ops;
5000
5001         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
5002                 ops->ndo_change_rx_flags(dev, flags);
5003 }
5004
5005 static int __dev_set_promiscuity(struct net_device *dev, int inc)
5006 {
             /*
              * Add @inc to the device's promiscuity count and keep
              * IFF_PROMISC in dev->flags in step with it. Requires RTNL.
              *
              * Returns 0 on success, or -EOVERFLOW if a non-negative @inc
              * leaves the count at zero; in that case the count is restored.
              * When IFF_PROMISC actually changes, the transition is logged
              * (and audited if auditing is enabled) and the driver is told
              * through dev_change_rx_flags(). Unlike dev_set_promiscuity(),
              * this function does not call dev_set_rx_mode().
              */
5007         unsigned int old_flags = dev->flags;
5008         kuid_t uid;
5009         kgid_t gid;
5010
5011         ASSERT_RTNL();
5012
             /* Set the flag first; it is cleared again below if the count drops to zero. */
5013         dev->flags |= IFF_PROMISC;
5014         dev->promiscuity += inc;
5015         if (dev->promiscuity == 0) {
5016                 /*
5017                  * Avoid overflow.
5018                  * If inc causes overflow, untouch promisc and return error.
5019                  */
5020                 if (inc < 0)
5021                         dev->flags &= ~IFF_PROMISC;
5022                 else {
5023                         dev->promiscuity -= inc;
5024                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5025                                 dev->name);
5026                         return -EOVERFLOW;
5027                 }
5028         }
5029         if (dev->flags != old_flags) {
5030                 pr_info("device %s %s promiscuous mode\n",
5031                         dev->name,
5032                         dev->flags & IFF_PROMISC ? "entered" : "left");
5033                 if (audit_enabled) {
5034                         current_uid_gid(&uid, &gid);
5035                         audit_log(current->audit_context, GFP_ATOMIC,
5036                                 AUDIT_ANOM_PROMISCUOUS,
5037                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5038                                 dev->name, (dev->flags & IFF_PROMISC),
5039                                 (old_flags & IFF_PROMISC),
5040                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5041                                 from_kuid(&init_user_ns, uid),
5042                                 from_kgid(&init_user_ns, gid),
5043                                 audit_get_sessionid(current));
5044                 }
5045
5046                 dev_change_rx_flags(dev, IFF_PROMISC);
5047         }
5048         return 0;
5049 }
5050
5051 /**
5052  *      dev_set_promiscuity     - update promiscuity count on a device
5053  *      @dev: device
5054  *      @inc: modifier
5055  *
5056  *      Add or remove promiscuity from a device. While the count in the device
5057  *      remains above zero the interface remains promiscuous. Once it hits zero
5058  *      the device reverts back to normal filtering operation. A negative inc
5059  *      value is used to drop promiscuity on the device.
5060  *      Return 0 if successful or a negative errno code on error.
      *
      *      The receive mode is reloaded with dev_set_rx_mode() only when
      *      the call actually changed dev->flags.
5061  */
5062 int dev_set_promiscuity(struct net_device *dev, int inc)
5063 {
5064         unsigned int old_flags = dev->flags;
5065         int err;
5066
5067         err = __dev_set_promiscuity(dev, inc);
5068         if (err < 0)
5069                 return err;
5070         if (dev->flags != old_flags)
5071                 dev_set_rx_mode(dev);
5072         return err;
5073 }
5074 EXPORT_SYMBOL(dev_set_promiscuity);
5075
5076 /**
5077  *      dev_set_allmulti        - update allmulti count on a device
5078  *      @dev: device
5079  *      @inc: modifier
5080  *
5081  *      Add or remove reception of all multicast frames to a device. While the
5082  *      count in the device remains above zero the interface keeps receiving
5083  *      all multicast frames. Once it hits zero the device reverts back to normal
5084  *      filtering operation. A negative @inc value is used to drop the counter
5085  *      when releasing a resource needing all multicasts.
5086  *      Return 0 if successful or a negative errno code on error.
      *
      *      The caller must hold the RTNL lock. -EOVERFLOW is returned, and
      *      the count restored, if a non-negative @inc leaves it at zero.
5087  */
5088
5089 int dev_set_allmulti(struct net_device *dev, int inc)
5090 {
5091         unsigned int old_flags = dev->flags;
5092
5093         ASSERT_RTNL();
5094
             /* Set the flag first; it is cleared again below if the count drops to zero. */
5095         dev->flags |= IFF_ALLMULTI;
5096         dev->allmulti += inc;
5097         if (dev->allmulti == 0) {
5098                 /*
5099                  * Avoid overflow.
5100                  * If inc causes overflow, untouch allmulti and return error.
5101                  */
5102                 if (inc < 0)
5103                         dev->flags &= ~IFF_ALLMULTI;
5104                 else {
5105                         dev->allmulti -= inc;
5106                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5107                                 dev->name);
5108                         return -EOVERFLOW;
5109                 }
5110         }
             /* Only bother the driver when the flag really changed. */
5111         if (dev->flags ^ old_flags) {
5112                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5113                 dev_set_rx_mode(dev);
5114         }
5115         return 0;
5116 }
5117 EXPORT_SYMBOL(dev_set_allmulti);
5118
5119 /*
5120  *      Upload unicast and multicast address lists to device and
5121  *      configure RX filtering. When the device doesn't support unicast
5122  *      filtering it is put in promiscuous mode while unicast addresses
5123  *      are present.
      *
      *      Nothing is done if the device is not IFF_UP or not present.
      *      dev->uc_promisc records whether this function currently holds
      *      a promiscuity count on behalf of the unicast list.
      *      This variant takes no lock itself; dev_set_rx_mode() is the
      *      wrapper that calls it with the address lock held.
5124  */
5125 void __dev_set_rx_mode(struct net_device *dev)
5126 {
5127         const struct net_device_ops *ops = dev->netdev_ops;
5128
5129         /* dev_open will call this function so the list will stay sane. */
5130         if (!(dev->flags&IFF_UP))
5131                 return;
5132
5133         if (!netif_device_present(dev))
5134                 return;
5135
5136         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5137                 /* Unicast addresses changes may only happen under the rtnl,
5138                  * therefore calling __dev_set_promiscuity here is safe.
5139                  */
                     /*
                      * NOTE(review): the return value of __dev_set_promiscuity()
                      * is ignored, so uc_promisc is updated even if the call
                      * failed with -EOVERFLOW.
                      */
5140                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5141                         __dev_set_promiscuity(dev, 1);
5142                         dev->uc_promisc = true;
5143                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5144                         __dev_set_promiscuity(dev, -1);
5145                         dev->uc_promisc = false;
5146                 }
5147         }
5148
5149         if (ops->ndo_set_rx_mode)
5150                 ops->ndo_set_rx_mode(dev);
5151 }
5152
5153 void dev_set_rx_mode(struct net_device *dev)
5154 {
             /* Run __dev_set_rx_mode() with the device's address lock held (BH variant). */
5155         netif_addr_lock_bh(dev);
5156         __dev_set_rx_mode(dev);
5157         netif_addr_unlock_bh(dev);
5158 }
5159
5160 /**
5161  *      dev_get_flags - get flags reported to userspace
5162  *      @dev: device
5163  *
5164  *      Get the combination of flag bits exported through APIs to userspace.
      *
      *      IFF_PROMISC and IFF_ALLMULTI are taken from dev->gflags rather
      *      than dev->flags. IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are
      *      not copied from dev->flags either: they are derived from
      *      netif_oper_up(), netif_carrier_ok() and netif_dormant(), and are
      *      only ever set while netif_running() is true. All remaining bits
      *      come from dev->flags unchanged.
5165  */
5166 unsigned int dev_get_flags(const struct net_device *dev)
5167 {
5168         unsigned int flags;
5169
5170         flags = (dev->flags & ~(IFF_PROMISC |
5171                                 IFF_ALLMULTI |
5172                                 IFF_RUNNING |
5173                                 IFF_LOWER_UP |
5174                                 IFF_DORMANT)) |
5175                 (dev->gflags & (IFF_PROMISC |
5176                                 IFF_ALLMULTI));
5177
5178         if (netif_running(dev)) {
5179                 if (netif_oper_up(dev))
5180                         flags |= IFF_RUNNING;
5181                 if (netif_carrier_ok(dev))
5182                         flags |= IFF_LOWER_UP;
5183                 if (netif_dormant(dev))
5184                         flags |= IFF_DORMANT;
5185         }
5186
5187         return flags;
5188 }
5189 EXPORT_SYMBOL(dev_get_flags);
5190
5191 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5192 {
5193         unsigned int old_flags = dev->flags;
5194         int ret;
5195
5196         ASSERT_RTNL();
5197
5198         /*
5199          *      Set the flags on our device.
5200          */
5201
5202         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5203                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5204                                IFF_AUTOMEDIA)) |
5205                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5206                                     IFF_ALLMULTI));
5207
5208         /*
5209          *      Load in the correct multicast list now the flags have changed.
5210          */
5211
5212         if ((old_flags ^ flags) & IFF_MULTICAST)
5213                 dev_change_rx_flags(dev, IFF_MULTICAST);
5214
5215         dev_set_rx_mode(dev);
5216
5217         /*
5218          *      Have we downed the interface. We handle IFF_UP ourselves
5219          *      according to user attempts to set it, rather than blindly
5220          *      setting it.
5221          */
5222
5223         ret = 0;
5224         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5225                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5226
5227                 if (!ret)
5228                         dev_set_rx_mode(dev);
5229         }
5230
5231         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5232                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5233
5234                 dev->gflags ^= IFF_PROMISC;
5235                 dev_set_promiscuity(dev, inc);
5236         }
5237
5238         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5239            is important. Some (broken) drivers set IFF_PROMISC, when
5240            IFF_ALLMULTI is requested not asking us and not reporting.
5241          */
5242         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5243                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5244
5245                 dev->gflags ^= IFF_ALLMULTI;
5246                 dev_set_allmulti(dev, inc);
5247         }
5248
5249         return ret;
5250 }
5251
5252 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5253 {
5254         unsigned int changes = dev->flags ^ old_flags;
5255
5256         if (changes & IFF_UP) {
5257                 if (dev->flags & IFF_UP)
5258                         call_netdevice_notifiers(NETDEV_UP, dev);
5259                 else
5260                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5261         }
5262
5263         if (dev->flags & IFF_UP &&
5264             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5265                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5266 }
5267
5268 /**
5269  *      dev_change_flags - change device settings
5270  *      @dev: device
5271  *      @flags: device state flags
5272  *
5273  *      Change settings on device based state flags. The flags are
5274  *      in the userspace exported format.
5275  */
5276 int dev_change_flags(struct net_device *dev, unsigned int flags)
5277 {
5278         int ret;
5279         unsigned int changes, old_flags = dev->flags;
5280
5281         ret = __dev_change_flags(dev, flags);
5282         if (ret < 0)
5283                 return ret;
5284
5285         changes = old_flags ^ dev->flags;
5286         if (changes)
5287                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5288
5289         __dev_notify_flags(dev, old_flags);
5290         return ret;
5291 }
5292 EXPORT_SYMBOL(dev_change_flags);
5293
5294 /**
5295  *      dev_set_mtu - Change maximum transfer unit
5296  *      @dev: device
5297  *      @new_mtu: new transfer unit
5298  *
5299  *      Change the maximum transfer size of the network device.
5300  */
5301 int dev_set_mtu(struct net_device *dev, int new_mtu)
5302 {
5303         const struct net_device_ops *ops = dev->netdev_ops;
5304         int err;
5305
5306         if (new_mtu == dev->mtu)
5307                 return 0;
5308
5309         /*      MTU must be positive.    */
5310         if (new_mtu < 0)
5311                 return -EINVAL;
5312
5313         if (!netif_device_present(dev))
5314                 return -ENODEV;
5315
5316         err = 0;
5317         if (ops->ndo_change_mtu)
5318                 err = ops->ndo_change_mtu(dev, new_mtu);
5319         else
5320                 dev->mtu = new_mtu;
5321
5322         if (!err)
5323                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5324         return err;
5325 }
5326 EXPORT_SYMBOL(dev_set_mtu);
5327
5328 /**
5329  *      dev_set_group - Change group this device belongs to
5330  *      @dev: device
5331  *      @new_group: group this device should belong to
5332  */
5333 void dev_set_group(struct net_device *dev, int new_group)
5334 {
5335         dev->group = new_group;
5336 }
5337 EXPORT_SYMBOL(dev_set_group);
5338
5339 /**
5340  *      dev_set_mac_address - Change Media Access Control Address
5341  *      @dev: device
5342  *      @sa: new address
5343  *
5344  *      Change the hardware (MAC) address of the device
5345  */
5346 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5347 {
5348         const struct net_device_ops *ops = dev->netdev_ops;
5349         int err;
5350
5351         if (!ops->ndo_set_mac_address)
5352                 return -EOPNOTSUPP;
5353         if (sa->sa_family != dev->type)
5354                 return -EINVAL;
5355         if (!netif_device_present(dev))
5356                 return -ENODEV;
5357         err = ops->ndo_set_mac_address(dev, sa);
5358         if (err)
5359                 return err;
5360         dev->addr_assign_type = NET_ADDR_SET;
5361         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5362         add_device_randomness(dev->dev_addr, dev->addr_len);
5363         return 0;
5364 }
5365 EXPORT_SYMBOL(dev_set_mac_address);
5366
5367 /**
5368  *      dev_change_carrier - Change device carrier
5369  *      @dev: device
5370  *      @new_carries: new value
5371  *
5372  *      Change device carrier
5373  */
5374 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5375 {
5376         const struct net_device_ops *ops = dev->netdev_ops;
5377
5378         if (!ops->ndo_change_carrier)
5379                 return -EOPNOTSUPP;
5380         if (!netif_device_present(dev))
5381                 return -ENODEV;
5382         return ops->ndo_change_carrier(dev, new_carrier);
5383 }
5384 EXPORT_SYMBOL(dev_change_carrier);
5385
5386 /*
5387  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5388  */
5389 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5390 {
5391         int err;
5392         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5393
5394         if (!dev)
5395                 return -ENODEV;
5396
5397         switch (cmd) {
5398         case SIOCGIFFLAGS:      /* Get interface flags */
5399                 ifr->ifr_flags = (short) dev_get_flags(dev);
5400                 return 0;
5401
5402         case SIOCGIFMETRIC:     /* Get the metric on the interface
5403                                    (currently unused) */
5404                 ifr->ifr_metric = 0;
5405                 return 0;
5406
5407         case SIOCGIFMTU:        /* Get the MTU of a device */
5408                 ifr->ifr_mtu = dev->mtu;
5409                 return 0;
5410
5411         case SIOCGIFHWADDR:
5412                 if (!dev->addr_len)
5413                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5414                 else
5415                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5416                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5417                 ifr->ifr_hwaddr.sa_family = dev->type;
5418                 return 0;
5419
5420         case SIOCGIFSLAVE:
5421                 err = -EINVAL;
5422                 break;
5423
5424         case SIOCGIFMAP:
5425                 ifr->ifr_map.mem_start = dev->mem_start;
5426                 ifr->ifr_map.mem_end   = dev->mem_end;
5427                 ifr->ifr_map.base_addr = dev->base_addr;
5428                 ifr->ifr_map.irq       = dev->irq;
5429                 ifr->ifr_map.dma       = dev->dma;
5430                 ifr->ifr_map.port      = dev->if_port;
5431                 return 0;
5432
5433         case SIOCGIFINDEX:
5434                 ifr->ifr_ifindex = dev->ifindex;
5435                 return 0;
5436
5437         case SIOCGIFTXQLEN:
5438                 ifr->ifr_qlen = dev->tx_queue_len;
5439                 return 0;
5440
5441         default:
5442                 /* dev_ioctl() should ensure this case
5443                  * is never reached
5444                  */
5445                 WARN_ON(1);
5446                 err = -ENOTTY;
5447                 break;
5448
5449         }
5450         return err;
5451 }
5452
5453 /*
5454  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5455  */
5456 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5457 {
5458         int err;
5459         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5460         const struct net_device_ops *ops;
5461
5462         if (!dev)
5463                 return -ENODEV;
5464
5465         ops = dev->netdev_ops;
5466
5467         switch (cmd) {
5468         case SIOCSIFFLAGS:      /* Set interface flags */
5469                 return dev_change_flags(dev, ifr->ifr_flags);
5470
5471         case SIOCSIFMETRIC:     /* Set the metric on the interface
5472                                    (currently unused) */
5473                 return -EOPNOTSUPP;
5474
5475         case SIOCSIFMTU:        /* Set the MTU of a device */
5476                 return dev_set_mtu(dev, ifr->ifr_mtu);
5477
5478         case SIOCSIFHWADDR:
5479                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5480
5481         case SIOCSIFHWBROADCAST:
5482                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5483                         return -EINVAL;
5484                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5485                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5486                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5487                 return 0;
5488
5489         case SIOCSIFMAP:
5490                 if (ops->ndo_set_config) {
5491                         if (!netif_device_present(dev))
5492                                 return -ENODEV;
5493                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5494                 }
5495                 return -EOPNOTSUPP;
5496
5497         case SIOCADDMULTI:
5498                 if (!ops->ndo_set_rx_mode ||
5499                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5500                         return -EINVAL;
5501                 if (!netif_device_present(dev))
5502                         return -ENODEV;
5503                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5504
5505         case SIOCDELMULTI:
5506                 if (!ops->ndo_set_rx_mode ||
5507                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5508                         return -EINVAL;
5509                 if (!netif_device_present(dev))
5510                         return -ENODEV;
5511                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5512
5513         case SIOCSIFTXQLEN:
5514                 if (ifr->ifr_qlen < 0)
5515                         return -EINVAL;
5516                 dev->tx_queue_len = ifr->ifr_qlen;
5517                 return 0;
5518
5519         case SIOCSIFNAME:
5520                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5521                 return dev_change_name(dev, ifr->ifr_newname);
5522
5523         case SIOCSHWTSTAMP:
5524                 err = net_hwtstamp_validate(ifr);
5525                 if (err)
5526                         return err;
5527                 /* fall through */
5528
5529         /*
5530          *      Unknown or private ioctl
5531          */
5532         default:
5533                 if ((cmd >= SIOCDEVPRIVATE &&
5534                     cmd <= SIOCDEVPRIVATE + 15) ||
5535                     cmd == SIOCBONDENSLAVE ||
5536                     cmd == SIOCBONDRELEASE ||
5537                     cmd == SIOCBONDSETHWADDR ||
5538                     cmd == SIOCBONDSLAVEINFOQUERY ||
5539                     cmd == SIOCBONDINFOQUERY ||
5540                     cmd == SIOCBONDCHANGEACTIVE ||
5541                     cmd == SIOCGMIIPHY ||
5542                     cmd == SIOCGMIIREG ||
5543                     cmd == SIOCSMIIREG ||
5544                     cmd == SIOCBRADDIF ||
5545                     cmd == SIOCBRDELIF ||
5546                     cmd == SIOCSHWTSTAMP ||
5547                     cmd == SIOCWANDEV) {
5548                         err = -EOPNOTSUPP;
5549                         if (ops->ndo_do_ioctl) {
5550                                 if (netif_device_present(dev))
5551                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5552                                 else
5553                                         err = -ENODEV;
5554                         }
5555                 } else
5556                         err = -EINVAL;
5557
5558         }
5559         return err;
5560 }
5561
5562 /*
5563  *      This function handles all "interface"-type I/O control requests. The actual
5564  *      'doing' part of this is dev_ifsioc above.
5565  */
5566
5567 /**
5568  *      dev_ioctl       -       network device ioctl
5569  *      @net: the applicable net namespace
5570  *      @cmd: command to issue
5571  *      @arg: pointer to a struct ifreq in user space
5572  *
5573  *      Issue ioctl functions to devices. This is normally called by the
5574  *      user space syscall interfaces but can sometimes be useful for
5575  *      other purposes. The return value is the return from the syscall if
5576  *      positive or a negative errno code on error.
5577  */
5578
5579 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5580 {
5581         struct ifreq ifr;
5582         int ret;
5583         char *colon;
5584
5585         /* One special case: SIOCGIFCONF takes ifconf argument
5586            and requires shared lock, because it sleeps writing
5587            to user space.
5588          */
5589
5590         if (cmd == SIOCGIFCONF) {
5591                 rtnl_lock();
5592                 ret = dev_ifconf(net, (char __user *) arg);
5593                 rtnl_unlock();
5594                 return ret;
5595         }
5596         if (cmd == SIOCGIFNAME)
5597                 return dev_ifname(net, (struct ifreq __user *)arg);
5598
5599         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5600                 return -EFAULT;
5601
5602         ifr.ifr_name[IFNAMSIZ-1] = 0;
5603
5604         colon = strchr(ifr.ifr_name, ':');
5605         if (colon)
5606                 *colon = 0;
5607
5608         /*
5609          *      See which interface the caller is talking about.
5610          */
5611
5612         switch (cmd) {
5613         /*
5614          *      These ioctl calls:
5615          *      - can be done by all.
5616          *      - atomic and do not require locking.
5617          *      - return a value
5618          */
5619         case SIOCGIFFLAGS:
5620         case SIOCGIFMETRIC:
5621         case SIOCGIFMTU:
5622         case SIOCGIFHWADDR:
5623         case SIOCGIFSLAVE:
5624         case SIOCGIFMAP:
5625         case SIOCGIFINDEX:
5626         case SIOCGIFTXQLEN:
5627                 dev_load(net, ifr.ifr_name);
5628                 rcu_read_lock();
5629                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5630                 rcu_read_unlock();
5631                 if (!ret) {
5632                         if (colon)
5633                                 *colon = ':';
5634                         if (copy_to_user(arg, &ifr,
5635                                          sizeof(struct ifreq)))
5636                                 ret = -EFAULT;
5637                 }
5638                 return ret;
5639
5640         case SIOCETHTOOL:
5641                 dev_load(net, ifr.ifr_name);
5642                 rtnl_lock();
5643                 ret = dev_ethtool(net, &ifr);
5644                 rtnl_unlock();
5645                 if (!ret) {
5646                         if (colon)
5647                                 *colon = ':';
5648                         if (copy_to_user(arg, &ifr,
5649                                          sizeof(struct ifreq)))
5650                                 ret = -EFAULT;
5651                 }
5652                 return ret;
5653
5654         /*
5655          *      These ioctl calls:
5656          *      - require superuser power.
5657          *      - require strict serialization.
5658          *      - return a value
5659          */
5660         case SIOCGMIIPHY:
5661         case SIOCGMIIREG:
5662         case SIOCSIFNAME:
5663                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5664                         return -EPERM;
5665                 dev_load(net, ifr.ifr_name);
5666                 rtnl_lock();
5667                 ret = dev_ifsioc(net, &ifr, cmd);
5668                 rtnl_unlock();
5669                 if (!ret) {
5670                         if (colon)
5671                                 *colon = ':';
5672                         if (copy_to_user(arg, &ifr,
5673                                          sizeof(struct ifreq)))
5674                                 ret = -EFAULT;
5675                 }
5676                 return ret;
5677
5678         /*
5679          *      These ioctl calls:
5680          *      - require superuser power.
5681          *      - require strict serialization.
5682          *      - do not return a value
5683          */
5684         case SIOCSIFMAP:
5685         case SIOCSIFTXQLEN:
5686                 if (!capable(CAP_NET_ADMIN))
5687                         return -EPERM;
5688                 /* fall through */
5689         /*
5690          *      These ioctl calls:
5691          *      - require local superuser power.
5692          *      - require strict serialization.
5693          *      - do not return a value
5694          */
5695         case SIOCSIFFLAGS:
5696         case SIOCSIFMETRIC:
5697         case SIOCSIFMTU:
5698         case SIOCSIFHWADDR:
5699         case SIOCSIFSLAVE:
5700         case SIOCADDMULTI:
5701         case SIOCDELMULTI:
5702         case SIOCSIFHWBROADCAST:
5703         case SIOCSMIIREG:
5704         case SIOCBONDENSLAVE:
5705         case SIOCBONDRELEASE:
5706         case SIOCBONDSETHWADDR:
5707         case SIOCBONDCHANGEACTIVE:
5708         case SIOCBRADDIF:
5709         case SIOCBRDELIF:
5710         case SIOCSHWTSTAMP:
5711                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5712                         return -EPERM;
5713                 /* fall through */
5714         case SIOCBONDSLAVEINFOQUERY:
5715         case SIOCBONDINFOQUERY:
5716                 dev_load(net, ifr.ifr_name);
5717                 rtnl_lock();
5718                 ret = dev_ifsioc(net, &ifr, cmd);
5719                 rtnl_unlock();
5720                 return ret;
5721
5722         case SIOCGIFMEM:
5723                 /* Get the per device memory space. We can add this but
5724                  * currently do not support it */
5725         case SIOCSIFMEM:
5726                 /* Set the per device memory buffer space.
5727                  * Not applicable in our case */
5728         case SIOCSIFLINK:
5729                 return -ENOTTY;
5730
5731         /*
5732          *      Unknown or private ioctl.
5733          */
5734         default:
5735                 if (cmd == SIOCWANDEV ||
5736                     (cmd >= SIOCDEVPRIVATE &&
5737                      cmd <= SIOCDEVPRIVATE + 15)) {
5738                         dev_load(net, ifr.ifr_name);
5739                         rtnl_lock();
5740                         ret = dev_ifsioc(net, &ifr, cmd);
5741                         rtnl_unlock();
5742                         if (!ret && copy_to_user(arg, &ifr,
5743                                                  sizeof(struct ifreq)))
5744                                 ret = -EFAULT;
5745                         return ret;
5746                 }
5747                 /* Take care of Wireless Extensions */
5748                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5749                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5750                 return -ENOTTY;
5751         }
5752 }
5753
5754
5755 /**
5756  *      dev_new_index   -       allocate an ifindex
5757  *      @net: the applicable net namespace
5758  *
5759  *      Returns a suitable unique value for a new device interface
5760  *      number.  The caller must hold the rtnl semaphore or the
5761  *      dev_base_lock to be sure it remains unique.
5762  */
5763 static int dev_new_index(struct net *net)
5764 {
5765         int ifindex = net->ifindex;
5766         for (;;) {
5767                 if (++ifindex <= 0)
5768                         ifindex = 1;
5769                 if (!__dev_get_by_index(net, ifindex))
5770                         return net->ifindex = ifindex;
5771         }
5772 }
5773
5774 /* Delayed registration/unregisteration */
5775 static LIST_HEAD(net_todo_list);
5776
5777 static void net_set_todo(struct net_device *dev)
5778 {
5779         list_add_tail(&dev->todo_list, &net_todo_list);
5780 }
5781
5782 static void rollback_registered_many(struct list_head *head)
5783 {
5784         struct net_device *dev, *tmp;
5785
5786         BUG_ON(dev_boot_phase);
5787         ASSERT_RTNL();
5788
5789         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5790                 /* Some devices call without registering
5791                  * for initialization unwind. Remove those
5792                  * devices and proceed with the remaining.
5793                  */
5794                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5795                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5796                                  dev->name, dev);
5797
5798                         WARN_ON(1);
5799                         list_del(&dev->unreg_list);
5800                         continue;
5801                 }
5802                 dev->dismantle = true;
5803                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5804         }
5805
5806         /* If device is running, close it first. */
5807         dev_close_many(head);
5808
5809         list_for_each_entry(dev, head, unreg_list) {
5810                 /* And unlink it from device chain. */
5811                 unlist_netdevice(dev);
5812
5813                 dev->reg_state = NETREG_UNREGISTERING;
5814         }
5815
5816         synchronize_net();
5817
5818         list_for_each_entry(dev, head, unreg_list) {
5819                 /* Shutdown queueing discipline. */
5820                 dev_shutdown(dev);
5821
5822
5823                 /* Notify protocols, that we are about to destroy
5824                    this device. They should clean all the things.
5825                 */
5826                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5827
5828                 if (!dev->rtnl_link_ops ||
5829                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5830                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5831
5832                 /*
5833                  *      Flush the unicast and multicast chains
5834                  */
5835                 dev_uc_flush(dev);
5836                 dev_mc_flush(dev);
5837
5838                 if (dev->netdev_ops->ndo_uninit)
5839                         dev->netdev_ops->ndo_uninit(dev);
5840
5841                 /* Notifier chain MUST detach us all upper devices. */
5842                 WARN_ON(netdev_has_any_upper_dev(dev));
5843
5844                 /* Remove entries from kobject tree */
5845                 netdev_unregister_kobject(dev);
5846 #ifdef CONFIG_XPS
5847                 /* Remove XPS queueing entries */
5848                 netif_reset_xps_queues_gt(dev, 0);
5849 #endif
5850         }
5851
5852         synchronize_net();
5853
5854         list_for_each_entry(dev, head, unreg_list)
5855                 dev_put(dev);
5856 }
5857
5858 static void rollback_registered(struct net_device *dev)
5859 {
5860         LIST_HEAD(single);
5861
5862         list_add(&dev->unreg_list, &single);
5863         rollback_registered_many(&single);
5864         list_del(&single);
5865 }
5866
5867 static netdev_features_t netdev_fix_features(struct net_device *dev,
5868         netdev_features_t features)
5869 {
5870         /* Fix illegal checksum combinations */
5871         if ((features & NETIF_F_HW_CSUM) &&
5872             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5873                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5874                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5875         }
5876
5877         /* Fix illegal SG+CSUM combinations. */
5878         if ((features & NETIF_F_SG) &&
5879             !(features & NETIF_F_ALL_CSUM)) {
5880                 netdev_dbg(dev,
5881                         "Dropping NETIF_F_SG since no checksum feature.\n");
5882                 features &= ~NETIF_F_SG;
5883         }
5884
5885         /* TSO requires that SG is present as well. */
5886         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5887                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5888                 features &= ~NETIF_F_ALL_TSO;
5889         }
5890
5891         /* TSO ECN requires that TSO is present as well. */
5892         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5893                 features &= ~NETIF_F_TSO_ECN;
5894
5895         /* Software GSO depends on SG. */
5896         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5897                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5898                 features &= ~NETIF_F_GSO;
5899         }
5900
5901         /* UFO needs SG and checksumming */
5902         if (features & NETIF_F_UFO) {
5903                 /* maybe split UFO into V4 and V6? */
5904                 if (!((features & NETIF_F_GEN_CSUM) ||
5905                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5906                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5907                         netdev_dbg(dev,
5908                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5909                         features &= ~NETIF_F_UFO;
5910                 }
5911
5912                 if (!(features & NETIF_F_SG)) {
5913                         netdev_dbg(dev,
5914                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5915                         features &= ~NETIF_F_UFO;
5916                 }
5917         }
5918
5919         return features;
5920 }
5921
5922 int __netdev_update_features(struct net_device *dev)
5923 {
5924         netdev_features_t features;
5925         int err = 0;
5926
5927         ASSERT_RTNL();
5928
5929         features = netdev_get_wanted_features(dev);
5930
5931         if (dev->netdev_ops->ndo_fix_features)
5932                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5933
5934         /* driver might be less strict about feature dependencies */
5935         features = netdev_fix_features(dev, features);
5936
5937         if (dev->features == features)
5938                 return 0;
5939
5940         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5941                 &dev->features, &features);
5942
5943         if (dev->netdev_ops->ndo_set_features)
5944                 err = dev->netdev_ops->ndo_set_features(dev, features);
5945
5946         if (unlikely(err < 0)) {
5947                 netdev_err(dev,
5948                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5949                         err, &features, &dev->features);
5950                 return -1;
5951         }
5952
5953         if (!err)
5954                 dev->features = features;
5955
5956         return 1;
5957 }
5958
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send a notification
 *      whenever __netdev_update_features() returns non-zero. Should be
 *      called after driver or hardware dependent conditions might have
 *      changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5973
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send a notification
 *      unconditionally, whatever the outcome of the recalculation.
 *      Should be called instead of netdev_update_features() if
 *      dev->vlan_features might have changed as well, so that the
 *      changes can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* result deliberately unused: the notification is always sent */
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5990
5991 /**
5992  *      netif_stacked_transfer_operstate -      transfer operstate
5993  *      @rootdev: the root or lower level device to transfer state from
5994  *      @dev: the device to transfer operstate to
5995  *
5996  *      Transfer operational state from root to device. This is normally
5997  *      called when a stacking relationship exists between the root
5998  *      device and the device(a leaf device).
5999  */
6000 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6001                                         struct net_device *dev)
6002 {
6003         if (rootdev->operstate == IF_OPER_DORMANT)
6004                 netif_dormant_on(dev);
6005         else
6006                 netif_dormant_off(dev);
6007
6008         if (netif_carrier_ok(rootdev)) {
6009                 if (!netif_carrier_ok(dev))
6010                         netif_carrier_on(dev);
6011         } else {
6012                 if (netif_carrier_ok(dev))
6013                         netif_carrier_off(dev);
6014         }
6015 }
6016 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6017
#ifdef CONFIG_RPS
/*
 *      Allocate the zeroed array of dev->num_rx_queues RX queue
 *      structures, publish it as dev->_rx and point every entry back
 *      at @dev.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nr = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nr < 1);

        queues = kcalloc(nr, sizeof(struct netdev_rx_queue), GFP_KERNEL);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (idx = 0; idx < nr; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
6037
6038 static void netdev_init_one_queue(struct net_device *dev,
6039                                   struct netdev_queue *queue, void *_unused)
6040 {
6041         /* Initialize queue lock */
6042         spin_lock_init(&queue->_xmit_lock);
6043         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6044         queue->xmit_lock_owner = -1;
6045         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6046         queue->dev = dev;
6047 #ifdef CONFIG_BQL
6048         dql_init(&queue->dql, HZ);
6049 #endif
6050 }
6051
6052 static int netif_alloc_netdev_queues(struct net_device *dev)
6053 {
6054         unsigned int count = dev->num_tx_queues;
6055         struct netdev_queue *tx;
6056
6057         BUG_ON(count < 1);
6058
6059         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
6060         if (!tx)
6061                 return -ENOMEM;
6062
6063         dev->_tx = tx;
6064
6065         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6066         spin_lock_init(&dev->tx_global_lock);
6067
6068         return 0;
6069 }
6070
6071 /**
6072  *      register_netdevice      - register a network device
6073  *      @dev: device to register
6074  *
6075  *      Take a completed network device structure and add it to the kernel
6076  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6077  *      chain. 0 is returned on success. A negative errno code is returned
6078  *      on a failure to set up the device, or if the name is a duplicate.
6079  *
6080  *      Callers must hold the rtnl semaphore. You may want
6081  *      register_netdev() instead of this.
6082  *
6083  *      BUGS:
6084  *      The locking appears insufficient to guarantee two parallel registers
6085  *      will not get the same name.
6086  */
6087
6088 int register_netdevice(struct net_device *dev)
6089 {
6090         int ret;
6091         struct net *net = dev_net(dev);
6092
6093         BUG_ON(dev_boot_phase);
6094         ASSERT_RTNL();
6095
6096         might_sleep();
6097
6098         /* When net_device's are persistent, this will be fatal. */
6099         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6100         BUG_ON(!net);
6101
6102         spin_lock_init(&dev->addr_list_lock);
6103         netdev_set_addr_lockdep_class(dev);
6104
6105         dev->iflink = -1;
6106
6107         ret = dev_get_valid_name(net, dev, dev->name);
6108         if (ret < 0)
6109                 goto out;
6110
6111         /* Init, if this function is available */
6112         if (dev->netdev_ops->ndo_init) {
6113                 ret = dev->netdev_ops->ndo_init(dev);
6114                 if (ret) {
6115                         if (ret > 0)
6116                                 ret = -EIO;
6117                         goto out;
6118                 }
6119         }
6120
6121         if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
6122             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6123              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6124                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6125                 ret = -EINVAL;
6126                 goto err_uninit;
6127         }
6128
6129         ret = -EBUSY;
6130         if (!dev->ifindex)
6131                 dev->ifindex = dev_new_index(net);
6132         else if (__dev_get_by_index(net, dev->ifindex))
6133                 goto err_uninit;
6134
6135         if (dev->iflink == -1)
6136                 dev->iflink = dev->ifindex;
6137
6138         /* Transfer changeable features to wanted_features and enable
6139          * software offloads (GSO and GRO).
6140          */
6141         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6142         dev->features |= NETIF_F_SOFT_FEATURES;
6143         dev->wanted_features = dev->features & dev->hw_features;
6144
6145         /* Turn on no cache copy if HW is doing checksum */
6146         if (!(dev->flags & IFF_LOOPBACK)) {
6147                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6148                 if (dev->features & NETIF_F_ALL_CSUM) {
6149                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
6150                         dev->features |= NETIF_F_NOCACHE_COPY;
6151                 }
6152         }
6153
6154         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6155          */
6156         dev->vlan_features |= NETIF_F_HIGHDMA;
6157
6158         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6159         ret = notifier_to_errno(ret);
6160         if (ret)
6161                 goto err_uninit;
6162
6163         ret = netdev_register_kobject(dev);
6164         if (ret)
6165                 goto err_uninit;
6166         dev->reg_state = NETREG_REGISTERED;
6167
6168         __netdev_update_features(dev);
6169
6170         /*
6171          *      Default initial state at registry is that the
6172          *      device is present.
6173          */
6174
6175         set_bit(__LINK_STATE_PRESENT, &dev->state);
6176
6177         linkwatch_init_dev(dev);
6178
6179         dev_init_scheduler(dev);
6180         dev_hold(dev);
6181         list_netdevice(dev);
6182         add_device_randomness(dev->dev_addr, dev->addr_len);
6183
6184         /* If the device has permanent device address, driver should
6185          * set dev_addr and also addr_assign_type should be set to
6186          * NET_ADDR_PERM (default value).
6187          */
6188         if (dev->addr_assign_type == NET_ADDR_PERM)
6189                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6190
6191         /* Notify protocols, that a new device appeared. */
6192         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6193         ret = notifier_to_errno(ret);
6194         if (ret) {
6195                 rollback_registered(dev);
6196                 dev->reg_state = NETREG_UNREGISTERED;
6197         }
6198         /*
6199          *      Prevent userspace races by waiting until the network
6200          *      device is fully setup before sending notifications.
6201          */
6202         if (!dev->rtnl_link_ops ||
6203             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6204                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6205
6206 out:
6207         return ret;
6208
6209 err_uninit:
6210         if (dev->netdev_ops->ndo_uninit)
6211                 dev->netdev_ops->ndo_uninit(dev);
6212         goto out;
6213 }
6214 EXPORT_SYMBOL(register_netdevice);
6215
6216 /**
6217  *      init_dummy_netdev       - init a dummy network device for NAPI
6218  *      @dev: device to init
6219  *
6220  *      This takes a network device structure and initialize the minimum
6221  *      amount of fields so it can be used to schedule NAPI polls without
6222  *      registering a full blown interface. This is to be used by drivers
6223  *      that need to tie several hardware interfaces to a single NAPI
6224  *      poll scheduler due to HW limitations.
6225  */
6226 int init_dummy_netdev(struct net_device *dev)
6227 {
6228         /* Clear everything. Note we don't initialize spinlocks
6229          * are they aren't supposed to be taken by any of the
6230          * NAPI code and this dummy netdev is supposed to be
6231          * only ever used for NAPI polls
6232          */
6233         memset(dev, 0, sizeof(struct net_device));
6234
6235         /* make sure we BUG if trying to hit standard
6236          * register/unregister code path
6237          */
6238         dev->reg_state = NETREG_DUMMY;
6239
6240         /* NAPI wants this */
6241         INIT_LIST_HEAD(&dev->napi_list);
6242
6243         /* a dummy interface is started by default */
6244         set_bit(__LINK_STATE_PRESENT, &dev->state);
6245         set_bit(__LINK_STATE_START, &dev->state);
6246
6247         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6248          * because users of this 'device' dont need to change
6249          * its refcount.
6250          */
6251
6252         return 0;
6253 }
6254 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6255
6256
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Locked wrapper around register_netdevice(): takes the rtnl
 *      semaphore, registers @dev and releases the semaphore again.
 *      The device name given to alloc_netdev may be a format string;
 *      it is resolved during registration.
 *
 *      Returns whatever register_netdevice() returned: 0 on success,
 *      a negative errno code on failure to set up the device or if the
 *      name is a duplicate.
 */
int register_netdev(struct net_device *dev)
{
        int ret;

        rtnl_lock();
        ret = register_netdevice(dev);
        rtnl_unlock();

        return ret;
}
EXPORT_SYMBOL(register_netdev);
6280
6281 int netdev_refcnt_read(const struct net_device *dev)
6282 {
6283         int i, refcnt = 0;
6284
6285         for_each_possible_cpu(i)
6286                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6287         return refcnt;
6288 }
6289 EXPORT_SYMBOL(netdev_refcnt_read);
6290
6291 /**
6292  * netdev_wait_allrefs - wait until all references are gone.
6293  * @dev: target net_device
6294  *
6295  * This is called when unregistering network devices.
6296  *
6297  * Any protocol or device that holds a reference should register
6298  * for netdevice notification, and cleanup and put back the
6299  * reference if they receive an UNREGISTER event.
6300  * We can get stuck here if buggy protocols don't correctly
6301  * call dev_put.
6302  */
6303 static void netdev_wait_allrefs(struct net_device *dev)
6304 {
6305         unsigned long rebroadcast_time, warning_time;
6306         int refcnt;
6307
6308         linkwatch_forget_dev(dev);
6309
6310         rebroadcast_time = warning_time = jiffies;
6311         refcnt = netdev_refcnt_read(dev);
6312
6313         while (refcnt != 0) {
6314                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6315                         rtnl_lock();
6316
6317                         /* Rebroadcast unregister notification */
6318                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6319
6320                         __rtnl_unlock();
6321                         rcu_barrier();
6322                         rtnl_lock();
6323
6324                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6325                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6326                                      &dev->state)) {
6327                                 /* We must not have linkwatch events
6328                                  * pending on unregister. If this
6329                                  * happens, we simply run the queue
6330                                  * unscheduled, resulting in a noop
6331                                  * for this device.
6332                                  */
6333                                 linkwatch_run_queue();
6334                         }
6335
6336                         __rtnl_unlock();
6337
6338                         rebroadcast_time = jiffies;
6339                 }
6340
6341                 msleep(250);
6342
6343                 refcnt = netdev_refcnt_read(dev);
6344
6345                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6346                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6347                                  dev->name, refcnt);
6348                         warning_time = jiffies;
6349                 }
6350         }
6351 }
6352
6353 /* The sequence is:
6354  *
6355  *      rtnl_lock();
6356  *      ...
6357  *      register_netdevice(x1);
6358  *      register_netdevice(x2);
6359  *      ...
6360  *      unregister_netdevice(y1);
6361  *      unregister_netdevice(y2);
6362  *      ...
6363  *      rtnl_unlock();
6364  *      free_netdev(y1);
6365  *      free_netdev(y2);
6366  *
6367  * We are invoked by rtnl_unlock().
6368  * This allows us to deal with problems:
6369  * 1) We can delete sysfs objects which invoke hotplug
6370  *    without deadlocking with linkwatch via keventd.
6371  * 2) Since we run with the RTNL semaphore not held, we can sleep
6372  *    safely in order to wait for the netdev refcnt to drop to zero.
6373  *
6374  * We must not return until all unregister events added during
6375  * the interval the lock was held have been completed.
6376  */
6377 void netdev_run_todo(void)
6378 {
6379         struct list_head list;
6380
6381         /* Snapshot list, allow later requests */
6382         list_replace_init(&net_todo_list, &list);
6383
6384         __rtnl_unlock();
6385
6386
6387         /* Wait for rcu callbacks to finish before next phase */
6388         if (!list_empty(&list))
6389                 rcu_barrier();
6390
6391         while (!list_empty(&list)) {
6392                 struct net_device *dev
6393                         = list_first_entry(&list, struct net_device, todo_list);
6394                 list_del(&dev->todo_list);
6395
6396                 rtnl_lock();
6397                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6398                 __rtnl_unlock();
6399
6400                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6401                         pr_err("network todo '%s' but state %d\n",
6402                                dev->name, dev->reg_state);
6403                         dump_stack();
6404                         continue;
6405                 }
6406
6407                 dev->reg_state = NETREG_UNREGISTERED;
6408
6409                 on_each_cpu(flush_backlog, dev, 1);
6410
6411                 netdev_wait_allrefs(dev);
6412
6413                 /* paranoia */
6414                 BUG_ON(netdev_refcnt_read(dev));
6415                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6416                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6417                 WARN_ON(dev->dn_ptr);
6418
6419                 if (dev->destructor)
6420                         dev->destructor(dev);
6421
6422                 /* Free network device */
6423                 kobject_put(&dev->dev.kobj);
6424         }
6425 }
6426
6427 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6428  * fields in the same order, with only the type differing.
6429  */
6430 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6431                              const struct net_device_stats *netdev_stats)
6432 {
6433 #if BITS_PER_LONG == 64
6434         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6435         memcpy(stats64, netdev_stats, sizeof(*stats64));
6436 #else
6437         size_t i, n = sizeof(*stats64) / sizeof(u64);
6438         const unsigned long *src = (const unsigned long *)netdev_stats;
6439         u64 *dst = (u64 *)stats64;
6440
6441         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6442                      sizeof(*stats64) / sizeof(u64));
6443         for (i = 0; i < n; i++)
6444                 dst[i] = src[i];
6445 #endif
6446 }
6447 EXPORT_SYMBOL(netdev_stats_to_stats64);
6448
6449 /**
6450  *      dev_get_stats   - get network device statistics
6451  *      @dev: device to get statistics from
6452  *      @storage: place to store stats
6453  *
6454  *      Get network statistics from device. Return @storage.
6455  *      The device driver may provide its own method by setting
6456  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6457  *      otherwise the internal statistics structure is used.
6458  */
6459 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6460                                         struct rtnl_link_stats64 *storage)
6461 {
6462         const struct net_device_ops *ops = dev->netdev_ops;
6463
6464         if (ops->ndo_get_stats64) {
6465                 memset(storage, 0, sizeof(*storage));
6466                 ops->ndo_get_stats64(dev, storage);
6467         } else if (ops->ndo_get_stats) {
6468                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6469         } else {
6470                 netdev_stats_to_stats64(storage, &dev->stats);
6471         }
6472         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6473         return storage;
6474 }
6475 EXPORT_SYMBOL(dev_get_stats);
6476
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialized with noop_qdisc and published in
 * dev->ingress_queue first; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only the fully set up queue */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
6494
6495 static const struct ethtool_ops default_ethtool_ops;
6496
6497 void netdev_set_default_ethtool_ops(struct net_device *dev,
6498                                     const struct ethtool_ops *ops)
6499 {
6500         if (dev->ethtool_ops == &default_ethtool_ops)
6501                 dev->ethtool_ops = ops;
6502 }
6503 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6504
6505 /**
6506  *      alloc_netdev_mqs - allocate network device
6507  *      @sizeof_priv:   size of private data to allocate space for
6508  *      @name:          device name format string
6509  *      @setup:         callback to initialize device
6510  *      @txqs:          the number of TX subqueues to allocate
6511  *      @rxqs:          the number of RX subqueues to allocate
6512  *
6513  *      Allocates a struct net_device with private data area for driver use
6514  *      and performs basic initialization.  Also allocates subquue structs
6515  *      for each queue on the device.
6516  */
6517 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6518                 void (*setup)(struct net_device *),
6519                 unsigned int txqs, unsigned int rxqs)
6520 {
6521         struct net_device *dev;
6522         size_t alloc_size;
6523         struct net_device *p;
6524
6525         BUG_ON(strlen(name) >= sizeof(dev->name));
6526
6527         if (txqs < 1) {
6528                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6529                 return NULL;
6530         }
6531
6532 #ifdef CONFIG_RPS
6533         if (rxqs < 1) {
6534                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6535                 return NULL;
6536         }
6537 #endif
6538
6539         alloc_size = sizeof(struct net_device);
6540         if (sizeof_priv) {
6541                 /* ensure 32-byte alignment of private area */
6542                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6543                 alloc_size += sizeof_priv;
6544         }
6545         /* ensure 32-byte alignment of whole construct */
6546         alloc_size += NETDEV_ALIGN - 1;
6547
6548         p = kzalloc(alloc_size, GFP_KERNEL);
6549         if (!p)
6550                 return NULL;
6551
6552         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6553         dev->padded = (char *)dev - (char *)p;
6554
6555         dev->pcpu_refcnt = alloc_percpu(int);
6556         if (!dev->pcpu_refcnt)
6557                 goto free_p;
6558
6559         if (dev_addr_init(dev))
6560                 goto free_pcpu;
6561
6562         dev_mc_init(dev);
6563         dev_uc_init(dev);
6564
6565         dev_net_set(dev, &init_net);
6566
6567         dev->gso_max_size = GSO_MAX_SIZE;
6568         dev->gso_max_segs = GSO_MAX_SEGS;
6569
6570         INIT_LIST_HEAD(&dev->napi_list);
6571         INIT_LIST_HEAD(&dev->unreg_list);
6572         INIT_LIST_HEAD(&dev->link_watch_list);
6573         INIT_LIST_HEAD(&dev->upper_dev_list);
6574         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6575         setup(dev);
6576
6577         dev->num_tx_queues = txqs;
6578         dev->real_num_tx_queues = txqs;
6579         if (netif_alloc_netdev_queues(dev))
6580                 goto free_all;
6581
6582 #ifdef CONFIG_RPS
6583         dev->num_rx_queues = rxqs;
6584         dev->real_num_rx_queues = rxqs;
6585         if (netif_alloc_rx_queues(dev))
6586                 goto free_all;
6587 #endif
6588
6589         strcpy(dev->name, name);
6590         dev->group = INIT_NETDEV_GROUP;
6591         if (!dev->ethtool_ops)
6592                 dev->ethtool_ops = &default_ethtool_ops;
6593         return dev;
6594
6595 free_all:
6596         free_netdev(dev);
6597         return NULL;
6598
6599 free_pcpu:
6600         free_percpu(dev->pcpu_refcnt);
6601         kfree(dev->_tx);
6602 #ifdef CONFIG_RPS
6603         kfree(dev->_rx);
6604 #endif
6605
6606 free_p:
6607         kfree(p);
6608         return NULL;
6609 }
6610 EXPORT_SYMBOL(alloc_netdev_mqs);
6611
6612 /**
6613  *      free_netdev - free network device
6614  *      @dev: device
6615  *
6616  *      This function does the last stage of destroying an allocated device
6617  *      interface. The reference to the device object is released.
6618  *      If this is the last reference then it will be freed.
6619  */
6620 void free_netdev(struct net_device *dev)
6621 {
6622         struct napi_struct *p, *n;
6623
6624         release_net(dev_net(dev));
6625
6626         kfree(dev->_tx);
6627 #ifdef CONFIG_RPS
6628         kfree(dev->_rx);
6629 #endif
6630
6631         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6632
6633         /* Flush device addresses */
6634         dev_addr_flush(dev);
6635
6636         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6637                 netif_napi_del(p);
6638
6639         free_percpu(dev->pcpu_refcnt);
6640         dev->pcpu_refcnt = NULL;
6641
6642         /*  Compatibility with error handling in drivers */
6643         if (dev->reg_state == NETREG_UNINITIALIZED) {
6644                 kfree((char *)dev - dev->padded);
6645                 return;
6646         }
6647
6648         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6649         dev->reg_state = NETREG_RELEASED;
6650
6651         /* will free via device release */
6652         put_device(&dev->dev);
6653 }
6654 EXPORT_SYMBOL(free_netdev);
6655
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait until packets that are being received right now are done.
 *      Packets that start later are not held back. The expedited RCU
 *      grace period is used while the rtnl lock is held, the normal
 *      one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6671
6672 /**
6673  *      unregister_netdevice_queue - remove device from the kernel
6674  *      @dev: device
6675  *      @head: list
6676  *
6677  *      This function shuts down a device interface and removes it
6678  *      from the kernel tables.
6679  *      If head not NULL, device is queued to be unregistered later.
6680  *
6681  *      Callers must hold the rtnl semaphore.  You may want
6682  *      unregister_netdev() instead of this.
6683  */
6684
6685 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6686 {
6687         ASSERT_RTNL();
6688
6689         if (head) {
6690                 list_move_tail(&dev->unreg_list, head);
6691         } else {
6692                 rollback_registered(dev);
6693                 /* Finish processing unregister after unlock */
6694                 net_set_todo(dev);
6695         }
6696 }
6697 EXPORT_SYMBOL(unregister_netdevice_queue);
6698
6699 /**
6700  *      unregister_netdevice_many - unregister many devices
6701  *      @head: list of devices
6702  */
6703 void unregister_netdevice_many(struct list_head *head)
6704 {
6705         struct net_device *dev;
6706
6707         if (!list_empty(head)) {
6708                 rollback_registered_many(head);
6709                 list_for_each_entry(dev, head, unreg_list)
6710                         net_set_todo(dev);
6711         }
6712 }
6713 EXPORT_SYMBOL(unregister_netdevice_many);
6714
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shut down a device interface and remove it from the kernel
 *      tables.
 *
 *      Locked wrapper around unregister_netdevice(): the rtnl semaphore
 *      is taken for the duration of the call.  In general this is the
 *      function to use, not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6733
6734 /**
6735  *      dev_change_net_namespace - move device to different nethost namespace
6736  *      @dev: device
6737  *      @net: network namespace
6738  *      @pat: If not NULL name pattern to try if the current device name
6739  *            is already taken in the destination network namespace.
6740  *
6741  *      This function shuts down a device interface and moves it
6742  *      to a new network namespace. On success 0 is returned, on
6743  *      a failure a netagive errno code is returned.
6744  *
6745  *      Callers must hold the rtnl semaphore.
6746  */
6747
6748 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6749 {
6750         int err;
6751
6752         ASSERT_RTNL();
6753
6754         /* Don't allow namespace local devices to be moved. */
6755         err = -EINVAL;
6756         if (dev->features & NETIF_F_NETNS_LOCAL)
6757                 goto out;
6758
6759         /* Ensure the device has been registrered */
6760         if (dev->reg_state != NETREG_REGISTERED)
6761                 goto out;
6762
6763         /* Get out if there is nothing todo */
6764         err = 0;
6765         if (net_eq(dev_net(dev), net))
6766                 goto out;
6767
6768         /* Pick the destination device name, and ensure
6769          * we can use it in the destination network namespace.
6770          */
6771         err = -EEXIST;
6772         if (__dev_get_by_name(net, dev->name)) {
6773                 /* We get here if we can't use the current device name */
6774                 if (!pat)
6775                         goto out;
6776                 if (dev_get_valid_name(net, dev, pat) < 0)
6777                         goto out;
6778         }
6779
6780         /*
6781          * And now a mini version of register_netdevice unregister_netdevice.
6782          */
6783
6784         /* If device is running close it first. */
6785         dev_close(dev);
6786
6787         /* And unlink it from device chain */
6788         err = -ENODEV;
6789         unlist_netdevice(dev);
6790
6791         synchronize_net();
6792
6793         /* Shutdown queueing discipline. */
6794         dev_shutdown(dev);
6795
6796         /* Notify protocols, that we are about to destroy
6797            this device. They should clean all the things.
6798
6799            Note that dev->reg_state stays at NETREG_REGISTERED.
6800            This is wanted because this way 8021q and macvlan know
6801            the device is just moving and can keep their slaves up.
6802         */
6803         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6804         rcu_barrier();
6805         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6806         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6807
6808         /*
6809          *      Flush the unicast and multicast chains
6810          */
6811         dev_uc_flush(dev);
6812         dev_mc_flush(dev);
6813
6814         /* Send a netdev-removed uevent to the old namespace */
6815         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6816
6817         /* Actually switch the network namespace */
6818         dev_net_set(dev, net);
6819
6820         /* If there is an ifindex conflict assign a new one */
6821         if (__dev_get_by_index(net, dev->ifindex)) {
6822                 int iflink = (dev->iflink == dev->ifindex);
6823                 dev->ifindex = dev_new_index(net);
6824                 if (iflink)
6825                         dev->iflink = dev->ifindex;
6826         }
6827
6828         /* Send a netdev-add uevent to the new namespace */
6829         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6830
6831         /* Fixup kobjects */
6832         err = device_rename(&dev->dev, dev->name);
6833         WARN_ON(err);
6834
6835         /* Add the device back in the hashes */
6836         list_netdevice(dev);
6837
6838         /* Notify protocols, that a new device appeared. */
6839         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6840
6841         /*
6842          *      Prevent userspace races by waiting until the network
6843          *      device is fully setup before sending notifications.
6844          */
6845         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6846
6847         synchronize_net();
6848         err = 0;
6849 out:
6850         return err;
6851 }
6852 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6853
6854 static int dev_cpu_callback(struct notifier_block *nfb,
6855                             unsigned long action,
6856                             void *ocpu)
6857 {
6858         struct sk_buff **list_skb;
6859         struct sk_buff *skb;
6860         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6861         struct softnet_data *sd, *oldsd;
6862
6863         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6864                 return NOTIFY_OK;
6865
6866         local_irq_disable();
6867         cpu = smp_processor_id();
6868         sd = &per_cpu(softnet_data, cpu);
6869         oldsd = &per_cpu(softnet_data, oldcpu);
6870
6871         /* Find end of our completion_queue. */
6872         list_skb = &sd->completion_queue;
6873         while (*list_skb)
6874                 list_skb = &(*list_skb)->next;
6875         /* Append completion queue from offline CPU. */
6876         *list_skb = oldsd->completion_queue;
6877         oldsd->completion_queue = NULL;
6878
6879         /* Append output queue from offline CPU. */
6880         if (oldsd->output_queue) {
6881                 *sd->output_queue_tailp = oldsd->output_queue;
6882                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6883                 oldsd->output_queue = NULL;
6884                 oldsd->output_queue_tailp = &oldsd->output_queue;
6885         }
6886         /* Append NAPI poll list from offline CPU. */
6887         if (!list_empty(&oldsd->poll_list)) {
6888                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6889                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6890         }
6891
6892         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6893         local_irq_enable();
6894
6895         /* Process offline CPU's input_pkt_queue */
6896         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6897                 netif_rx(skb);
6898                 input_queue_head_incr(oldsd);
6899         }
6900         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6901                 netif_rx(skb);
6902                 input_queue_head_incr(oldsd);
6903         }
6904
6905         return NOTIFY_OK;
6906 }
6907
6908
6909 /**
6910  *      netdev_increment_features - increment feature set by one
6911  *      @all: current feature set
6912  *      @one: new feature set
6913  *      @mask: mask feature set
6914  *
6915  *      Computes a new feature set after adding a device with feature set
6916  *      @one to the master device with current feature set @all.  Will not
6917  *      enable anything that is off in @mask. Returns the new feature set.
6918  */
6919 netdev_features_t netdev_increment_features(netdev_features_t all,
6920         netdev_features_t one, netdev_features_t mask)
6921 {
6922         if (mask & NETIF_F_GEN_CSUM)
6923                 mask |= NETIF_F_ALL_CSUM;
6924         mask |= NETIF_F_VLAN_CHALLENGED;
6925
6926         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6927         all &= one | ~NETIF_F_ALL_FOR_ALL;
6928
6929         /* If one device supports hw checksumming, set for all. */
6930         if (all & NETIF_F_GEN_CSUM)
6931                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6932
6933         return all;
6934 }
6935 EXPORT_SYMBOL(netdev_increment_features);
6936
6937 static struct hlist_head *netdev_create_hash(void)
6938 {
6939         int i;
6940         struct hlist_head *hash;
6941
6942         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6943         if (hash != NULL)
6944                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6945                         INIT_HLIST_HEAD(&hash[i]);
6946
6947         return hash;
6948 }
6949
6950 /* Initialize per network namespace state */
6951 static int __net_init netdev_init(struct net *net)
6952 {
6953         if (net != &init_net)
6954                 INIT_LIST_HEAD(&net->dev_base_head);
6955
6956         net->dev_name_head = netdev_create_hash();
6957         if (net->dev_name_head == NULL)
6958                 goto err_name;
6959
6960         net->dev_index_head = netdev_create_hash();
6961         if (net->dev_index_head == NULL)
6962                 goto err_idx;
6963
6964         return 0;
6965
6966 err_idx:
6967         kfree(net->dev_name_head);
6968 err_name:
6969         return -ENOMEM;
6970 }
6971
6972 /**
6973  *      netdev_drivername - network driver for the device
6974  *      @dev: network device
6975  *
6976  *      Determine network driver for device.
6977  */
6978 const char *netdev_drivername(const struct net_device *dev)
6979 {
6980         const struct device_driver *driver;
6981         const struct device *parent;
6982         const char *empty = "";
6983
6984         parent = dev->dev.parent;
6985         if (!parent)
6986                 return empty;
6987
6988         driver = parent->driver;
6989         if (driver && driver->name)
6990                 return driver->name;
6991         return empty;
6992 }
6993
6994 static int __netdev_printk(const char *level, const struct net_device *dev,
6995                            struct va_format *vaf)
6996 {
6997         int r;
6998
6999         if (dev && dev->dev.parent) {
7000                 r = dev_printk_emit(level[1] - '0',
7001                                     dev->dev.parent,
7002                                     "%s %s %s: %pV",
7003                                     dev_driver_string(dev->dev.parent),
7004                                     dev_name(dev->dev.parent),
7005                                     netdev_name(dev), vaf);
7006         } else if (dev) {
7007                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
7008         } else {
7009                 r = printk("%s(NULL net_device): %pV", level, vaf);
7010         }
7011
7012         return r;
7013 }
7014
7015 int netdev_printk(const char *level, const struct net_device *dev,
7016                   const char *format, ...)
7017 {
7018         struct va_format vaf;
7019         va_list args;
7020         int r;
7021
7022         va_start(args, format);
7023
7024         vaf.fmt = format;
7025         vaf.va = &args;
7026
7027         r = __netdev_printk(level, dev, &vaf);
7028
7029         va_end(args);
7030
7031         return r;
7032 }
7033 EXPORT_SYMBOL(netdev_printk);
7034
7035 #define define_netdev_printk_level(func, level)                 \
7036 int func(const struct net_device *dev, const char *fmt, ...)    \
7037 {                                                               \
7038         int r;                                                  \
7039         struct va_format vaf;                                   \
7040         va_list args;                                           \
7041                                                                 \
7042         va_start(args, fmt);                                    \
7043                                                                 \
7044         vaf.fmt = fmt;                                          \
7045         vaf.va = &args;                                         \
7046                                                                 \
7047         r = __netdev_printk(level, dev, &vaf);                  \
7048                                                                 \
7049         va_end(args);                                           \
7050                                                                 \
7051         return r;                                               \
7052 }                                                               \
7053 EXPORT_SYMBOL(func);
7054
7055 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7056 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7057 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7058 define_netdev_printk_level(netdev_err, KERN_ERR);
7059 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7060 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7061 define_netdev_printk_level(netdev_info, KERN_INFO);
7062
7063 static void __net_exit netdev_exit(struct net *net)
7064 {
7065         kfree(net->dev_name_head);
7066         kfree(net->dev_index_head);
7067 }
7068
7069 static struct pernet_operations __net_initdata netdev_net_ops = {
7070         .init = netdev_init,
7071         .exit = netdev_exit,
7072 };
7073
7074 static void __net_exit default_device_exit(struct net *net)
7075 {
7076         struct net_device *dev, *aux;
7077         /*
7078          * Push all migratable network devices back to the
7079          * initial network namespace
7080          */
7081         rtnl_lock();
7082         for_each_netdev_safe(net, dev, aux) {
7083                 int err;
7084                 char fb_name[IFNAMSIZ];
7085
7086                 /* Ignore unmoveable devices (i.e. loopback) */
7087                 if (dev->features & NETIF_F_NETNS_LOCAL)
7088                         continue;
7089
7090                 /* Leave virtual devices for the generic cleanup */
7091                 if (dev->rtnl_link_ops)
7092                         continue;
7093
7094                 /* Push remaining network devices to init_net */
7095                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7096                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7097                 if (err) {
7098                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7099                                  __func__, dev->name, err);
7100                         BUG();
7101                 }
7102         }
7103         rtnl_unlock();
7104 }
7105
7106 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7107 {
7108         /* At exit all network devices most be removed from a network
7109          * namespace.  Do this in the reverse order of registration.
7110          * Do this across as many network namespaces as possible to
7111          * improve batching efficiency.
7112          */
7113         struct net_device *dev;
7114         struct net *net;
7115         LIST_HEAD(dev_kill_list);
7116
7117         rtnl_lock();
7118         list_for_each_entry(net, net_list, exit_list) {
7119                 for_each_netdev_reverse(net, dev) {
7120                         if (dev->rtnl_link_ops)
7121                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7122                         else
7123                                 unregister_netdevice_queue(dev, &dev_kill_list);
7124                 }
7125         }
7126         unregister_netdevice_many(&dev_kill_list);
7127         list_del(&dev_kill_list);
7128         rtnl_unlock();
7129 }
7130
7131 static struct pernet_operations __net_initdata default_device_ops = {
7132         .exit = default_device_exit,
7133         .exit_batch = default_device_exit_batch,
7134 };
7135
7136 /*
7137  *      Initialize the DEV module. At boot time this walks the device list and
7138  *      unhooks any devices that fail to initialise (normally hardware not
7139  *      present) and leaves us with a valid list of present and active devices.
7140  *
7141  */
7142
7143 /*
7144  *       This is called single threaded during boot, so no need
7145  *       to take the rtnl semaphore.
7146  */
7147 static int __init net_dev_init(void)
7148 {
7149         int i, rc = -ENOMEM;
7150
7151         BUG_ON(!dev_boot_phase);
7152
7153         if (dev_proc_init())
7154                 goto out;
7155
7156         if (netdev_kobject_init())
7157                 goto out;
7158
7159         INIT_LIST_HEAD(&ptype_all);
7160         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7161                 INIT_LIST_HEAD(&ptype_base[i]);
7162
7163         INIT_LIST_HEAD(&offload_base);
7164
7165         if (register_pernet_subsys(&netdev_net_ops))
7166                 goto out;
7167
7168         /*
7169          *      Initialise the packet receive queues.
7170          */
7171
7172         for_each_possible_cpu(i) {
7173                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7174
7175                 memset(sd, 0, sizeof(*sd));
7176                 skb_queue_head_init(&sd->input_pkt_queue);
7177                 skb_queue_head_init(&sd->process_queue);
7178                 sd->completion_queue = NULL;
7179                 INIT_LIST_HEAD(&sd->poll_list);
7180                 sd->output_queue = NULL;
7181                 sd->output_queue_tailp = &sd->output_queue;
7182 #ifdef CONFIG_RPS
7183                 sd->csd.func = rps_trigger_softirq;
7184                 sd->csd.info = sd;
7185                 sd->csd.flags = 0;
7186                 sd->cpu = i;
7187 #endif
7188
7189                 sd->backlog.poll = process_backlog;
7190                 sd->backlog.weight = weight_p;
7191                 sd->backlog.gro_list = NULL;
7192                 sd->backlog.gro_count = 0;
7193         }
7194
7195         dev_boot_phase = 0;
7196
7197         /* The loopback device is special if any other network devices
7198          * is present in a network namespace the loopback device must
7199          * be present. Since we now dynamically allocate and free the
7200          * loopback device ensure this invariant is maintained by
7201          * keeping the loopback device as the first device on the
7202          * list of network devices.  Ensuring the loopback devices
7203          * is the first device that appears and the last network device
7204          * that disappears.
7205          */
7206         if (register_pernet_device(&loopback_net_ops))
7207                 goto out;
7208
7209         if (register_pernet_device(&default_device_ops))
7210                 goto out;
7211
7212         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7213         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7214
7215         hotcpu_notifier(dev_cpu_callback, 0);
7216         dst_init();
7217         dev_mcast_init();
7218         rc = 0;
7219 out:
7220         return rc;
7221 }
7222
7223 subsys_initcall(net_dev_init);