net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130
 131 #include "net-sysfs.h"
 132
 133 /* Instead of increasing this, you should create a hash table. */
 134 #define MAX_GRO_SKBS 8
 135
 136 /* This should be increased if a protocol with a bigger head is added. */
 137 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 138
 139 /*
 140  *      The list of packet types we will receive (as opposed to discard)
 141  *      and the routines to invoke.
 142  *
 143  *      Why 16. Because with 16 the only overlap we get on a hash of the
 144  *      low nibble of the protocol value is RARP/SNAP/X.25.
 145  *
 146  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 147  *             sure which should go first, but I bet it won't make much
 148  *             difference if we are running VLANs.  The good news is that
 149  *             this protocol won't be in the list unless compiled in, so
 150  *             the average user (w/out VLANs) will not be adversely affected.
 151  *             --BLG
 152  *
 153  *              0800    IP
 154  *              8100    802.1Q VLAN
 155  *              0001    802.3
 156  *              0002    AX.25
 157  *              0004    802.2
 158  *              8035    RARP
 159  *              0005    SNAP
 160  *              0805    X.25
 161  *              0806    ARP
 162  *              8137    IPX
 163  *              0009    Localtalk
 164  *              86DD    IPv6
 165  */
 166
 167 #define PTYPE_HASH_SIZE (16)
 168 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 169
 170 static DEFINE_SPINLOCK(ptype_lock);
 171 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 172 static struct list_head ptype_all __read_mostly;        /* Taps */
 173
 174 /*
 175  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 176  * semaphore.
 177  *
 178  * Pure readers hold dev_base_lock for reading.
 179  *
 180  * Writers must hold the rtnl semaphore while they loop through the
 181  * dev_base_head list, and hold dev_base_lock for writing when they do the
 182  * actual updates.  This allows pure readers to access the list even
 183  * while a writer is preparing to update it.
 184  *
 185  * To put it another way, dev_base_lock is held for writing only to
 186  * protect against pure readers; the rtnl semaphore provides the
 187  * protection against other writers.
 188  *
 189  * See, for example usages, register_netdevice() and
 190  * unregister_netdevice(), which must be called with the rtnl
 191  * semaphore held.
 192  */
 193 DEFINE_RWLOCK(dev_base_lock);
 194 EXPORT_SYMBOL(dev_base_lock);
 195
 196 #define NETDEV_HASHBITS 8
 197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 208 }
 209
 210 /* Device list insertion */
 211 static int list_netdevice(struct net_device *dev)
 212 {
 213         struct net *net = dev_net(dev);
 214
 215         ASSERT_RTNL();
 216
 217         write_lock_bh(&dev_base_lock);
 218         list_add_tail(&dev->dev_list, &net->dev_base_head);
 219         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 220         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 221         write_unlock_bh(&dev_base_lock);
 222         return 0;
 223 }
 224
 225 /* Device list removal */
 226 static void unlist_netdevice(struct net_device *dev)
 227 {
 228         ASSERT_RTNL();
 229
 230         /* Unlink dev from the device chain */
 231         write_lock_bh(&dev_base_lock);
 232         list_del(&dev->dev_list);
 233         hlist_del(&dev->name_hlist);
 234         hlist_del(&dev->index_hlist);
 235         write_unlock_bh(&dev_base_lock);
 236 }
 237
 238 /*
 239  *      Our notifier list
 240  */
 241
 242 static RAW_NOTIFIER_HEAD(netdev_chain);
 243
 244 /*
 245  *      Device drivers call our routines to queue packets here. We empty the
 246  *      queue in the local softnet handler.
 247  */
 248
 249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 250 EXPORT_PER_CPU_SYMBOL(softnet_data);
 251
 252 #ifdef CONFIG_LOCKDEP
 253 /*
 254  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 255  * according to dev->type
 256  */
 257 static const unsigned short netdev_lock_type[] =
 258         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 259          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 260          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 261          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 262          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 263          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 264          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 265          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 266          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 267          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 268          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 269          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 270          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 271          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 272          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 273          ARPHRD_VOID, ARPHRD_NONE};
 274
 275 static const char *const netdev_lock_name[] =
 276         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 277          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 278          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 279          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 280          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 281          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 282          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 283          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 284          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 285          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 286          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 287          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 288          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 289          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 290          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 291          "_xmit_VOID", "_xmit_NONE"};
 292
 293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 295
 296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 297 {
 298         int i;
 299
 300         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 301                 if (netdev_lock_type[i] == dev_type)
 302                         return i;
 303         /* the last key is used by default */
 304         return ARRAY_SIZE(netdev_lock_type) - 1;
 305 }
 306
 307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 308                                                  unsigned short dev_type)
 309 {
 310         int i;
 311
 312         i = netdev_lock_pos(dev_type);
 313         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 314                                    netdev_lock_name[i]);
 315 }
 316
 317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 318 {
 319         int i;
 320
 321         i = netdev_lock_pos(dev->type);
 322         lockdep_set_class_and_name(&dev->addr_list_lock,
 323                                    &netdev_addr_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326 #else
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330 }
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333 }
 334 #endif
 335
 336 /*******************************************************************************
 337
 338                 Protocol management and registration routines
 339
 340 *******************************************************************************/
 341
 342 /*
 343  *      Add a protocol ID to the list. Now that the input handler is
 344  *      smarter we can dispense with all the messy stuff that used to be
 345  *      here.
 346  *
 347  *      BEWARE!!! Protocol handlers, mangling input packets,
 348  *      MUST BE last in hash buckets and checking protocol handlers
 349  *      MUST start from promiscuous ptype_all chain in net_bh.
 350  *      It is true now, do not change it.
 351  *      Explanation follows: if protocol handler, mangling packet, will
 352  *      be the first on list, it is not able to sense, that packet
 353  *      is cloned and should be copied-on-write, so that it will
 354  *      change it and subsequent readers will get broken packet.
 355  *                                                      --ANK (980803)
 356  */
 357
 358 /**
 359  *      dev_add_pack - add packet handler
 360  *      @pt: packet type declaration
 361  *
 362  *      Add a protocol handler to the networking stack. The passed &packet_type
 363  *      is linked into kernel lists and may not be freed until it has been
 364  *      removed from the kernel lists.
 365  *
 366  *      This call does not sleep therefore it can not
 367  *      guarantee all CPU's that are in middle of receiving packets
 368  *      will see the new packet type (until the next received packet).
 369  */
 370
 371 void dev_add_pack(struct packet_type *pt)
 372 {
 373         int hash;
 374
 375         spin_lock_bh(&ptype_lock);
 376         if (pt->type == htons(ETH_P_ALL))
 377                 list_add_rcu(&pt->list, &ptype_all);
 378         else {
 379                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 380                 list_add_rcu(&pt->list, &ptype_base[hash]);
 381         }
 382         spin_unlock_bh(&ptype_lock);
 383 }
 384 EXPORT_SYMBOL(dev_add_pack);
 385
 386 /**
 387  *      __dev_remove_pack        - remove packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Remove a protocol handler that was previously added to the kernel
 391  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 392  *      from the kernel lists and can be freed or reused once this function
 393  *      returns.
 394  *
 395  *      The packet type might still be in use by receivers
 396  *      and must not be freed until after all the CPU's have gone
 397  *      through a quiescent state.
 398  */
 399 void __dev_remove_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head;
 402         struct packet_type *pt1;
 403
 404         spin_lock_bh(&ptype_lock);
 405
 406         if (pt->type == htons(ETH_P_ALL))
 407                 head = &ptype_all;
 408         else
 409                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 410
 411         list_for_each_entry(pt1, head, list) {
 412                 if (pt == pt1) {
 413                         list_del_rcu(&pt->list);
 414                         goto out;
 415                 }
 416         }
 417
 418         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 419 out:
 420         spin_unlock_bh(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(__dev_remove_pack);
 423
 424 /**
 425  *      dev_remove_pack  - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      This call sleeps to guarantee that no CPU is looking at the packet
 434  *      type after return.
 435  */
 436 void dev_remove_pack(struct packet_type *pt)
 437 {
 438         __dev_remove_pack(pt);
 439
 440         synchronize_net();
 441 }
 442 EXPORT_SYMBOL(dev_remove_pack);
 443
 444 /******************************************************************************
 445
 446                       Device Boot-time Settings Routines
 447
 448 *******************************************************************************/
 449
 450 /* Boot time configuration table */
 451 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 452
 453 /**
 454  *      netdev_boot_setup_add   - add new setup entry
 455  *      @name: name of the device
 456  *      @map: configured settings for the device
 457  *
 458  *      Adds new setup entry to the dev_boot_setup list.  The function
 459  *      returns 0 on error and 1 on success.  This is a generic routine to
 460  *      all netdevices.
 461  */
 462 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 463 {
 464         struct netdev_boot_setup *s;
 465         int i;
 466
 467         s = dev_boot_setup;
 468         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 469                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 470                         memset(s[i].name, 0, sizeof(s[i].name));
 471                         strlcpy(s[i].name, name, IFNAMSIZ);
 472                         memcpy(&s[i].map, map, sizeof(s[i].map));
 473                         break;
 474                 }
 475         }
 476
 477         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 478 }
 479
 480 /**
 481  *      netdev_boot_setup_check - check boot time settings
 482  *      @dev: the netdevice
 483  *
 484  *      Check boot time settings for the device.
 485  *      The found settings are set for the device to be used
 486  *      later in the device probing.
 487  *      Returns 0 if no settings found, 1 if they are.
 488  */
 489 int netdev_boot_setup_check(struct net_device *dev)
 490 {
 491         struct netdev_boot_setup *s = dev_boot_setup;
 492         int i;
 493
 494         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 495                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 496                     !strcmp(dev->name, s[i].name)) {
 497                         dev->irq        = s[i].map.irq;
 498                         dev->base_addr  = s[i].map.base_addr;
 499                         dev->mem_start  = s[i].map.mem_start;
 500                         dev->mem_end    = s[i].map.mem_end;
 501                         return 1;
 502                 }
 503         }
 504         return 0;
 505 }
 506 EXPORT_SYMBOL(netdev_boot_setup_check);
 507
 508
 509 /**
 510  *      netdev_boot_base        - get address from boot time settings
 511  *      @prefix: prefix for network device
 512  *      @unit: id for network device
 513  *
 514  *      Check boot time settings for the base address of device.
 515  *      The found settings are set for the device to be used
 516  *      later in the device probing.
 517  *      Returns 0 if no settings found.
 518  */
 519 unsigned long netdev_boot_base(const char *prefix, int unit)
 520 {
 521         const struct netdev_boot_setup *s = dev_boot_setup;
 522         char name[IFNAMSIZ];
 523         int i;
 524
 525         sprintf(name, "%s%d", prefix, unit);
 526
 527         /*
 528          * If device already registered then return base of 1
 529          * to indicate not to probe for this interface
 530          */
 531         if (__dev_get_by_name(&init_net, name))
 532                 return 1;
 533
 534         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 535                 if (!strcmp(name, s[i].name))
 536                         return s[i].map.base_addr;
 537         return 0;
 538 }
 539
 540 /*
 541  * Saves at boot time configured settings for any netdevice.
 542  */
 543 int __init netdev_boot_setup(char *str)
 544 {
 545         int ints[5];
 546         struct ifmap map;
 547
 548         str = get_options(str, ARRAY_SIZE(ints), ints);
 549         if (!str || !*str)
 550                 return 0;
 551
 552         /* Save settings */
 553         memset(&map, 0, sizeof(map));
 554         if (ints[0] > 0)
 555                 map.irq = ints[1];
 556         if (ints[0] > 1)
 557                 map.base_addr = ints[2];
 558         if (ints[0] > 2)
 559                 map.mem_start = ints[3];
 560         if (ints[0] > 3)
 561                 map.mem_end = ints[4];
 562
 563         /* Add new entry to the list */
 564         return netdev_boot_setup_add(str, &map);
 565 }
 566
 567 __setup("netdev=", netdev_boot_setup);
 568
 569 /*******************************************************************************
 570
 571                             Device Interface Subroutines
 572
 573 *******************************************************************************/
 574
 575 /**
 576  *      __dev_get_by_name       - find a device by its name
 577  *      @net: the applicable net namespace
 578  *      @name: name to find
 579  *
 580  *      Find an interface by name. Must be called under RTNL semaphore
 581  *      or @dev_base_lock. If the name is found a pointer to the device
 582  *      is returned. If the name is not found then %NULL is returned. The
 583  *      reference counters are not incremented so the caller must be
 584  *      careful with locks.
 585  */
 586
 587 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 588 {
 589         struct hlist_node *p;
 590
 591         hlist_for_each(p, dev_name_hash(net, name)) {
 592                 struct net_device *dev
 593                         = hlist_entry(p, struct net_device, name_hlist);
 594                 if (!strncmp(dev->name, name, IFNAMSIZ))
 595                         return dev;
 596         }
 597         return NULL;
 598 }
 599 EXPORT_SYMBOL(__dev_get_by_name);
 600
 601 /**
 602  *      dev_get_by_name         - find a device by its name
 603  *      @net: the applicable net namespace
 604  *      @name: name to find
 605  *
 606  *      Find an interface by name. This can be called from any
 607  *      context and does its own locking. The returned handle has
 608  *      the usage count incremented and the caller must use dev_put() to
 609  *      release it when it is no longer needed. %NULL is returned if no
 610  *      matching device is found.
 611  */
 612
 613 struct net_device *dev_get_by_name(struct net *net, const char *name)
 614 {
 615         struct net_device *dev;
 616
 617         read_lock(&dev_base_lock);
 618         dev = __dev_get_by_name(net, name);
 619         if (dev)
 620                 dev_hold(dev);
 621         read_unlock(&dev_base_lock);
 622         return dev;
 623 }
 624 EXPORT_SYMBOL(dev_get_by_name);
 625
 626 /**
 627  *      __dev_get_by_index - find a device by its ifindex
 628  *      @net: the applicable net namespace
 629  *      @ifindex: index of device
 630  *
 631  *      Search for an interface by index. Returns %NULL if the device
 632  *      is not found or a pointer to the device. The device has not
 633  *      had its reference counter increased so the caller must be careful
 634  *      about locking. The caller must hold either the RTNL semaphore
 635  *      or @dev_base_lock.
 636  */
 637
 638 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 639 {
 640         struct hlist_node *p;
 641
 642         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 643                 struct net_device *dev
 644                         = hlist_entry(p, struct net_device, index_hlist);
 645                 if (dev->ifindex == ifindex)
 646                         return dev;
 647         }
 648         return NULL;
 649 }
 650 EXPORT_SYMBOL(__dev_get_by_index);
 651
 652
 653 /**
 654  *      dev_get_by_index - find a device by its ifindex
 655  *      @net: the applicable net namespace
 656  *      @ifindex: index of device
 657  *
 658  *      Search for an interface by index. Returns NULL if the device
 659  *      is not found or a pointer to the device. The device returned has
 660  *      had a reference added and the pointer is safe until the user calls
 661  *      dev_put to indicate they have finished with it.
 662  */
 663
 664 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 665 {
 666         struct net_device *dev;
 667
 668         read_lock(&dev_base_lock);
 669         dev = __dev_get_by_index(net, ifindex);
 670         if (dev)
 671                 dev_hold(dev);
 672         read_unlock(&dev_base_lock);
 673         return dev;
 674 }
 675 EXPORT_SYMBOL(dev_get_by_index);
 676
 677 /**
 678  *      dev_getbyhwaddr - find a device by its hardware address
 679  *      @net: the applicable net namespace
 680  *      @type: media type of device
 681  *      @ha: hardware address
 682  *
 683  *      Search for an interface by MAC address. Returns NULL if the device
 684  *      is not found or a pointer to the device. The caller must hold the
 685  *      rtnl semaphore. The returned device has not had its ref count increased
 686  *      and the caller must therefore be careful about locking
 687  *
 688  *      BUGS:
 689  *      If the API was consistent this would be __dev_get_by_hwaddr
 690  */
 691
 692 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 693 {
 694         struct net_device *dev;
 695
 696         ASSERT_RTNL();
 697
 698         for_each_netdev(net, dev)
 699                 if (dev->type == type &&
 700                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 701                         return dev;
 702
 703         return NULL;
 704 }
 705 EXPORT_SYMBOL(dev_getbyhwaddr);
 706
 707 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 708 {
 709         struct net_device *dev;
 710
 711         ASSERT_RTNL();
 712         for_each_netdev(net, dev)
 713                 if (dev->type == type)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 719
 720 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 721 {
 722         struct net_device *dev;
 723
 724         rtnl_lock();
 725         dev = __dev_getfirstbyhwtype(net, type);
 726         if (dev)
 727                 dev_hold(dev);
 728         rtnl_unlock();
 729         return dev;
 730 }
 731 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 732
 733 /**
 734  *      dev_get_by_flags - find any device with given flags
 735  *      @net: the applicable net namespace
 736  *      @if_flags: IFF_* values
 737  *      @mask: bitmask of bits in if_flags to check
 738  *
 739  *      Search for any interface with the given flags. Returns NULL if a device
 740  *      is not found or a pointer to the device. The device returned has
 741  *      had a reference added and the pointer is safe until the user calls
 742  *      dev_put to indicate they have finished with it.
 743  */
 744
 745 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 746                                     unsigned short mask)
 747 {
 748         struct net_device *dev, *ret;
 749
 750         ret = NULL;
 751         read_lock(&dev_base_lock);
 752         for_each_netdev(net, dev) {
 753                 if (((dev->flags ^ if_flags) & mask) == 0) {
 754                         dev_hold(dev);
 755                         ret = dev;
 756                         break;
 757                 }
 758         }
 759         read_unlock(&dev_base_lock);
 760         return ret;
 761 }
 762 EXPORT_SYMBOL(dev_get_by_flags);
 763
 764 /**
 765  *      dev_valid_name - check if name is okay for network device
 766  *      @name: name string
 767  *
 768  *      Network device names need to be valid file names to
 769  *      to allow sysfs to work.  We also disallow any kind of
 770  *      whitespace.
 771  */
 772 int dev_valid_name(const char *name)
 773 {
 774         if (*name == '\0')
 775                 return 0;
 776         if (strlen(name) >= IFNAMSIZ)
 777                 return 0;
 778         if (!strcmp(name, ".") || !strcmp(name, ".."))
 779                 return 0;
 780
 781         while (*name) {
 782                 if (*name == '/' || isspace(*name))
 783                         return 0;
 784                 name++;
 785         }
 786         return 1;
 787 }
 788 EXPORT_SYMBOL(dev_valid_name);
 789
 790 /**
 791  *      __dev_alloc_name - allocate a name for a device
 792  *      @net: network namespace to allocate the device name in
 793  *      @name: name format string
 794  *      @buf:  scratch buffer and result name string
 795  *
 796  *      Passed a format string - eg "lt%d" it will try and find a suitable
 797  *      id. It scans list of devices to build up a free map, then chooses
 798  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 799  *      while allocating the name and adding the device in order to avoid
 800  *      duplicates.
 801  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 802  *      Returns the number of the unit assigned or a negative errno code.
 803  */
 804
 805 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 806 {
 807         int i = 0;
 808         const char *p;
 809         const int max_netdevices = 8*PAGE_SIZE;
 810         unsigned long *inuse;
 811         struct net_device *d;
 812
 813         p = strnchr(name, IFNAMSIZ-1, '%');
 814         if (p) {
 815                 /*
 816                  * Verify the string as this thing may have come from
 817                  * the user.  There must be either one "%d" and no other "%"
 818                  * characters.
 819                  */
 820                 if (p[1] != 'd' || strchr(p + 2, '%'))
 821                         return -EINVAL;
 822
 823                 /* Use one page as a bit array of possible slots */
 824                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 825                 if (!inuse)
 826                         return -ENOMEM;
 827
 828                 for_each_netdev(net, d) {
 829                         if (!sscanf(d->name, name, &i))
 830                                 continue;
 831                         if (i < 0 || i >= max_netdevices)
 832                                 continue;
 833
 834                         /*  avoid cases where sscanf is not exact inverse of printf */
 835                         snprintf(buf, IFNAMSIZ, name, i);
 836                         if (!strncmp(buf, d->name, IFNAMSIZ))
 837                                 set_bit(i, inuse);
 838                 }
 839
 840                 i = find_first_zero_bit(inuse, max_netdevices);
 841                 free_page((unsigned long) inuse);
 842         }
 843
 844         snprintf(buf, IFNAMSIZ, name, i);
 845         if (!__dev_get_by_name(net, buf))
 846                 return i;
 847
 848         /* It is possible to run out of possible slots
 849          * when the name is long and there isn't enough space left
 850          * for the digits, or if all bits are used.
 851          */
 852         return -ENFILE;
 853 }
 854
 855 /**
 856  *      dev_alloc_name - allocate a name for a device
 857  *      @dev: device
 858  *      @name: name format string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 int dev_alloc_name(struct net_device *dev, const char *name)
 870 {
 871         char buf[IFNAMSIZ];
 872         struct net *net;
 873         int ret;
 874
 875         BUG_ON(!dev_net(dev));
 876         net = dev_net(dev);
 877         ret = __dev_alloc_name(net, name, buf);
 878         if (ret >= 0)
 879                 strlcpy(dev->name, buf, IFNAMSIZ);
 880         return ret;
 881 }
 882 EXPORT_SYMBOL(dev_alloc_name);
 883
 884
 885 /**
 886  *      dev_change_name - change name of a device
 887  *      @dev: device
 888  *      @newname: name (or format string) must be at least IFNAMSIZ
 889  *
 890  *      Change name of a device, can pass format strings "eth%d".
 891  *      for wildcarding.
 892  */
 893 int dev_change_name(struct net_device *dev, const char *newname)
 894 {
 895         char oldname[IFNAMSIZ];
 896         int err = 0;
 897         int ret;
 898         struct net *net;
 899
 900         ASSERT_RTNL();
 901         BUG_ON(!dev_net(dev));
 902
 903         net = dev_net(dev);
 904         if (dev->flags & IFF_UP)
 905                 return -EBUSY;
 906
 907         if (!dev_valid_name(newname))
 908                 return -EINVAL;
 909
 910         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 911                 return 0;
 912
 913         memcpy(oldname, dev->name, IFNAMSIZ);
 914
 915         if (strchr(newname, '%')) {
 916                 err = dev_alloc_name(dev, newname);
 917                 if (err < 0)
 918                         return err;
 919         } else if (__dev_get_by_name(net, newname))
 920                 return -EEXIST;
 921         else
 922                 strlcpy(dev->name, newname, IFNAMSIZ);
 923
 924 rollback:
 925         /* For now only devices in the initial network namespace
 926          * are in sysfs.
 927          */
 928         if (net == &init_net) {
 929                 ret = device_rename(&dev->dev, dev->name);
 930                 if (ret) {
 931                         memcpy(dev->name, oldname, IFNAMSIZ);
 932                         return ret;
 933                 }
 934         }
 935
 936         write_lock_bh(&dev_base_lock);
 937         hlist_del(&dev->name_hlist);
 938         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 939         write_unlock_bh(&dev_base_lock);
 940
 941         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 942         ret = notifier_to_errno(ret);
 943
 944         if (ret) {
 945                 /* err >= 0 after dev_alloc_name() or stores the first errno */
 946                 if (err >= 0) {
 947                         err = ret;
 948                         memcpy(dev->name, oldname, IFNAMSIZ);
 949                         goto rollback;
 950                 } else {
 951                         printk(KERN_ERR
 952                                "%s: name change rollback failed: %d.\n",
 953                                dev->name, ret);
 954                 }
 955         }
 956
 957         return err;
 958 }
 959
 960 /**
 961  *      dev_set_alias - change ifalias of a device
 962  *      @dev: device
 963  *      @alias: name up to IFALIASZ
 964  *      @len: limit of bytes to copy from info
 965  *
 966  *      Set ifalias for a device,
 967  */
 968 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 969 {
 970         ASSERT_RTNL();
 971
 972         if (len >= IFALIASZ)
 973                 return -EINVAL;
 974
 975         if (!len) {
 976                 if (dev->ifalias) {
 977                         kfree(dev->ifalias);
 978                         dev->ifalias = NULL;
 979                 }
 980                 return 0;
 981         }
 982
 983         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
 984         if (!dev->ifalias)
 985                 return -ENOMEM;
 986
 987         strlcpy(dev->ifalias, alias, len+1);
 988         return len;
 989 }
 990
 991
 992 /**
 993  *      netdev_features_change - device changes features
 994  *      @dev: device to cause notification
 995  *
 996  *      Called to indicate a device has changed features.
 997  */
 998 void netdev_features_change(struct net_device *dev)
 999 {
1000         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1001 }
1002 EXPORT_SYMBOL(netdev_features_change);
1003
1004 /**
1005  *      netdev_state_change - device changes state
1006  *      @dev: device to cause notification
1007  *
1008  *      Called to indicate a device has changed state. This function calls
1009  *      the notifier chains for netdev_chain and sends a NEWLINK message
1010  *      to the routing socket.
1011  */
1012 void netdev_state_change(struct net_device *dev)
1013 {
1014         if (dev->flags & IFF_UP) {
1015                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1016                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1017         }
1018 }
1019 EXPORT_SYMBOL(netdev_state_change);
1020
/**
 *      netdev_bonding_change - send an event down the netdev notifier chain
 *      @dev: device the event refers to
 *      @event: notifier event value, passed through unmodified
 *
 *      Thin exported wrapper around call_netdevice_notifiers(); the
 *      result of the notifier chain is discarded.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1026
1027 /**
1028  *      dev_load        - load a network module
1029  *      @net: the applicable net namespace
1030  *      @name: name of interface
1031  *
1032  *      If a network interface is not present and the process has suitable
1033  *      privileges this function loads the module. If module loading is not
1034  *      available in this kernel then it becomes a nop.
1035  */
1036
1037 void dev_load(struct net *net, const char *name)
1038 {
1039         struct net_device *dev;
1040         int no_module;
1041
1042         read_lock(&dev_base_lock);
1043         dev = __dev_get_by_name(net, name);
1044         read_unlock(&dev_base_lock);
1045
1046         no_module = !dev;
1047         if (no_module && capable(CAP_NET_ADMIN))
1048                 no_module = request_module("netdev-%s", name);
1049         if (no_module && capable(CAP_SYS_MODULE)) {
1050                 if (!request_module("%s", name))
1051                         pr_err("Loading kernel module for a network device "
1052 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1053 "instead\n", name);
1054         }
1055 }
1056 EXPORT_SYMBOL(dev_load);
1057
1058 /**
1059  *      dev_open        - prepare an interface for use.
1060  *      @dev:   device to open
1061  *
1062  *      Takes a device from down to up state. The device's private open
1063  *      function is invoked and then the multicast lists are loaded. Finally
1064  *      the device is moved into the up state and a %NETDEV_UP message is
1065  *      sent to the netdev notifier chain.
1066  *
1067  *      Calling this function on an active interface is a nop. On a failure
1068  *      a negative errno code is returned.
1069  */
1070 int dev_open(struct net_device *dev)
1071 {
1072         const struct net_device_ops *ops = dev->netdev_ops;
1073         int ret;
1074
1075         ASSERT_RTNL();
1076
1077         /*
1078          *      Is it already up?
1079          */
1080
1081         if (dev->flags & IFF_UP)
1082                 return 0;
1083
1084         /*
1085          *      Is it even present?
1086          */
1087         if (!netif_device_present(dev))
1088                 return -ENODEV;
1089
1090         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1091         ret = notifier_to_errno(ret);
1092         if (ret)
1093                 return ret;
1094
1095         /*
1096          *      Call device private open method
1097          */
1098         set_bit(__LINK_STATE_START, &dev->state);
1099
1100         if (ops->ndo_validate_addr)
1101                 ret = ops->ndo_validate_addr(dev);
1102
1103         if (!ret && ops->ndo_open)
1104                 ret = ops->ndo_open(dev);
1105
1106         /*
1107          *      If it went open OK then:
1108          */
1109
1110         if (ret)
1111                 clear_bit(__LINK_STATE_START, &dev->state);
1112         else {
1113                 /*
1114                  *      Set the flags.
1115                  */
1116                 dev->flags |= IFF_UP;
1117
1118                 /*
1119                  *      Enable NET_DMA
1120                  */
1121                 net_dmaengine_get();
1122
1123                 /*
1124                  *      Initialize multicasting status
1125                  */
1126                 dev_set_rx_mode(dev);
1127
1128                 /*
1129                  *      Wakeup transmit queue engine
1130                  */
1131                 dev_activate(dev);
1132
1133                 /*
1134                  *      ... and announce new interface.
1135                  */
1136                 call_netdevice_notifiers(NETDEV_UP, dev);
1137         }
1138
1139         return ret;
1140 }
1141 EXPORT_SYMBOL(dev_open);
1142
1143 /**
1144  *      dev_close - shutdown an interface.
1145  *      @dev: device to shutdown
1146  *
1147  *      This function moves an active device into down state. A
1148  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1149  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1150  *      chain.
1151  */
1152 int dev_close(struct net_device *dev)
1153 {
1154         const struct net_device_ops *ops = dev->netdev_ops;
1155         ASSERT_RTNL();
1156
1157         might_sleep();
1158
1159         if (!(dev->flags & IFF_UP))
1160                 return 0;
1161
1162         /*
1163          *      Tell people we are going down, so that they can
1164          *      prepare to death, when device is still operating.
1165          */
1166         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1167
1168         clear_bit(__LINK_STATE_START, &dev->state);
1169
1170         /* Synchronize to scheduled poll. We cannot touch poll list,
1171          * it can be even on different cpu. So just clear netif_running().
1172          *
1173          * dev->stop() will invoke napi_disable() on all of it's
1174          * napi_struct instances on this device.
1175          */
1176         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1177
1178         dev_deactivate(dev);
1179
1180         /*
1181          *      Call the device specific close. This cannot fail.
1182          *      Only if device is UP
1183          *
1184          *      We allow it to be called even after a DETACH hot-plug
1185          *      event.
1186          */
1187         if (ops->ndo_stop)
1188                 ops->ndo_stop(dev);
1189
1190         /*
1191          *      Device is now down.
1192          */
1193
1194         dev->flags &= ~IFF_UP;
1195
1196         /*
1197          * Tell people we are down
1198          */
1199         call_netdevice_notifiers(NETDEV_DOWN, dev);
1200
1201         /*
1202          *      Shutdown NET_DMA
1203          */
1204         net_dmaengine_put();
1205
1206         return 0;
1207 }
1208 EXPORT_SYMBOL(dev_close);
1209
1210
1211 /**
1212  *      dev_disable_lro - disable Large Receive Offload on a device
1213  *      @dev: device
1214  *
1215  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1216  *      called under RTNL.  This is needed if received packets may be
1217  *      forwarded to another interface.
1218  */
1219 void dev_disable_lro(struct net_device *dev)
1220 {
1221         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1222             dev->ethtool_ops->set_flags) {
1223                 u32 flags = dev->ethtool_ops->get_flags(dev);
1224                 if (flags & ETH_FLAG_LRO) {
1225                         flags &= ~ETH_FLAG_LRO;
1226                         dev->ethtool_ops->set_flags(dev, flags);
1227                 }
1228         }
1229         WARN_ON(dev->features & NETIF_F_LRO);
1230 }
1231 EXPORT_SYMBOL(dev_disable_lro);
1232
1233
1234 static int dev_boot_phase = 1;
1235
1236 /*
1237  *      Device change register/unregister. These are not inline or static
1238  *      as we export them to the world.
1239  */
1240
1241 /**
1242  *      register_netdevice_notifier - register a network notifier block
1243  *      @nb: notifier
1244  *
1245  *      Register a notifier to be called when network device events occur.
1246  *      The notifier passed is linked into the kernel structures and must
1247  *      not be reused until it has been unregistered. A negative errno code
1248  *      is returned on a failure.
1249  *
1250  *      When registered all registration and up events are replayed
1251  *      to the new notifier to allow device to have a race free
1252  *      view of the network device list.
1253  */
1254
1255 int register_netdevice_notifier(struct notifier_block *nb)
1256 {
1257         struct net_device *dev;
1258         struct net_device *last;
1259         struct net *net;
1260         int err;
1261
1262         rtnl_lock();
1263         err = raw_notifier_chain_register(&netdev_chain, nb);
1264         if (err)
1265                 goto unlock;
1266         if (dev_boot_phase)
1267                 goto unlock;
1268         for_each_net(net) {
1269                 for_each_netdev(net, dev) {
1270                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1271                         err = notifier_to_errno(err);
1272                         if (err)
1273                                 goto rollback;
1274
1275                         if (!(dev->flags & IFF_UP))
1276                                 continue;
1277
1278                         nb->notifier_call(nb, NETDEV_UP, dev);
1279                 }
1280         }
1281
1282 unlock:
1283         rtnl_unlock();
1284         return err;
1285
1286 rollback:
1287         last = dev;
1288         for_each_net(net) {
1289                 for_each_netdev(net, dev) {
1290                         if (dev == last)
1291                                 break;
1292
1293                         if (dev->flags & IFF_UP) {
1294                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1295                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1296                         }
1297                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1298                 }
1299         }
1300
1301         raw_notifier_chain_unregister(&netdev_chain, nb);
1302         goto unlock;
1303 }
1304 EXPORT_SYMBOL(register_netdevice_notifier);
1305
1306 /**
1307  *      unregister_netdevice_notifier - unregister a network notifier block
1308  *      @nb: notifier
1309  *
1310  *      Unregister a notifier previously registered by
1311  *      register_netdevice_notifier(). The notifier is unlinked into the
1312  *      kernel structures and may then be reused. A negative errno code
1313  *      is returned on a failure.
1314  */
1315
1316 int unregister_netdevice_notifier(struct notifier_block *nb)
1317 {
1318         int err;
1319
1320         rtnl_lock();
1321         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1322         rtnl_unlock();
1323         return err;
1324 }
1325 EXPORT_SYMBOL(unregister_netdevice_notifier);
1326
1327 /**
1328  *      call_netdevice_notifiers - call all network notifier blocks
1329  *      @val: value passed unmodified to notifier function
1330  *      @dev: net_device pointer passed unmodified to notifier function
1331  *
1332  *      Call all network notifier blocks.  Parameters and return value
1333  *      are as for raw_notifier_call_chain().
1334  */
1335
1336 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1337 {
1338         return raw_notifier_call_chain(&netdev_chain, val, dev);
1339 }
1340
1341 /* When > 0 there are consumers of rx skb time stamps */
1342 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1343
1344 void net_enable_timestamp(void)
1345 {
1346         atomic_inc(&netstamp_needed);
1347 }
1348 EXPORT_SYMBOL(net_enable_timestamp);
1349
1350 void net_disable_timestamp(void)
1351 {
1352         atomic_dec(&netstamp_needed);
1353 }
1354 EXPORT_SYMBOL(net_disable_timestamp);
1355
1356 static inline void net_timestamp(struct sk_buff *skb)
1357 {
1358         if (atomic_read(&netstamp_needed))
1359                 __net_timestamp(skb);
1360         else
1361                 skb->tstamp.tv64 = 0;
1362 }
1363
1364 /*
1365  *      Support routine. Sends outgoing frames to any network
1366  *      taps currently in use.
1367  */
1368
1369 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1370 {
1371         struct packet_type *ptype;
1372
1373 #ifdef CONFIG_NET_CLS_ACT
1374         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1375                 net_timestamp(skb);
1376 #else
1377         net_timestamp(skb);
1378 #endif
1379
1380         rcu_read_lock();
1381         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1382                 /* Never send packets back to the socket
1383                  * they originated from - MvS (miquels@drinkel.ow.org)
1384                  */
1385                 if ((ptype->dev == dev || !ptype->dev) &&
1386                     (ptype->af_packet_priv == NULL ||
1387                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1388                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1389                         if (!skb2)
1390                                 break;
1391
1392                         /* skb->nh should be correctly
1393                            set by sender, so that the second statement is
1394                            just protection against buggy protocols.
1395                          */
1396                         skb_reset_mac_header(skb2);
1397
1398                         if (skb_network_header(skb2) < skb2->data ||
1399                             skb2->network_header > skb2->tail) {
1400                                 if (net_ratelimit())
1401                                         printk(KERN_CRIT "protocol %04x is "
1402                                                "buggy, dev %s\n",
1403                                                skb2->protocol, dev->name);
1404                                 skb_reset_network_header(skb2);
1405                         }
1406
1407                         skb2->transport_header = skb2->network_header;
1408                         skb2->pkt_type = PACKET_OUTGOING;
1409                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1410                 }
1411         }
1412         rcu_read_unlock();
1413 }
1414
1415
1416 static inline void __netif_reschedule(struct Qdisc *q)
1417 {
1418         struct softnet_data *sd;
1419         unsigned long flags;
1420
1421         local_irq_save(flags);
1422         sd = &__get_cpu_var(softnet_data);
1423         q->next_sched = sd->output_queue;
1424         sd->output_queue = q;
1425         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1426         local_irq_restore(flags);
1427 }
1428
1429 void __netif_schedule(struct Qdisc *q)
1430 {
1431         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1432                 __netif_reschedule(q);
1433 }
1434 EXPORT_SYMBOL(__netif_schedule);
1435
1436 void dev_kfree_skb_irq(struct sk_buff *skb)
1437 {
1438         if (atomic_dec_and_test(&skb->users)) {
1439                 struct softnet_data *sd;
1440                 unsigned long flags;
1441
1442                 local_irq_save(flags);
1443                 sd = &__get_cpu_var(softnet_data);
1444                 skb->next = sd->completion_queue;
1445                 sd->completion_queue = skb;
1446                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1447                 local_irq_restore(flags);
1448         }
1449 }
1450 EXPORT_SYMBOL(dev_kfree_skb_irq);
1451
/*
 * Release @skb from any context: use dev_kfree_skb_irq() when called
 * from hard interrupt context or with interrupts disabled, and
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1460
1461
1462 /**
1463  * netif_device_detach - mark device as removed
1464  * @dev: network device
1465  *
1466  * Mark device as removed from system and therefore no longer available.
1467  */
1468 void netif_device_detach(struct net_device *dev)
1469 {
1470         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1471             netif_running(dev)) {
1472                 netif_tx_stop_all_queues(dev);
1473         }
1474 }
1475 EXPORT_SYMBOL(netif_device_detach);
1476
1477 /**
1478  * netif_device_attach - mark device as attached
1479  * @dev: network device
1480  *
1481  * Mark device as attached from system and restart if needed.
1482  */
1483 void netif_device_attach(struct net_device *dev)
1484 {
1485         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1486             netif_running(dev)) {
1487                 netif_tx_wake_all_queues(dev);
1488                 __netdev_watchdog_up(dev);
1489         }
1490 }
1491 EXPORT_SYMBOL(netif_device_attach);
1492
1493 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1494 {
1495         return ((features & NETIF_F_NO_CSUM) ||
1496                 ((features & NETIF_F_V4_CSUM) &&
1497                  protocol == htons(ETH_P_IP)) ||
1498                 ((features & NETIF_F_V6_CSUM) &&
1499                  protocol == htons(ETH_P_IPV6)) ||
1500                 ((features & NETIF_F_FCOE_CRC) &&
1501                  protocol == htons(ETH_P_FCOE)));
1502 }
1503
1504 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1505 {
1506         if (can_checksum_protocol(dev->features, skb->protocol))
1507                 return true;
1508
1509         if (skb->protocol == htons(ETH_P_8021Q)) {
1510                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1511                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1512                                           veh->h_vlan_encapsulated_proto))
1513                         return true;
1514         }
1515
1516         return false;
1517 }
1518
1519 /*
1520  * Invalidate hardware checksum when packet is to be mangled, and
1521  * complete checksum manually on outgoing path.
1522  */
1523 int skb_checksum_help(struct sk_buff *skb)
1524 {
1525         __wsum csum;
1526         int ret = 0, offset;
1527
1528         if (skb->ip_summed == CHECKSUM_COMPLETE)
1529                 goto out_set_summed;
1530
1531         if (unlikely(skb_shinfo(skb)->gso_size)) {
1532                 /* Let GSO fix up the checksum. */
1533                 goto out_set_summed;
1534         }
1535
1536         offset = skb->csum_start - skb_headroom(skb);
1537         BUG_ON(offset >= skb_headlen(skb));
1538         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1539
1540         offset += skb->csum_offset;
1541         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1542
1543         if (skb_cloned(skb) &&
1544             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1545                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1546                 if (ret)
1547                         goto out;
1548         }
1549
1550         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1551 out_set_summed:
1552         skb->ip_summed = CHECKSUM_NONE;
1553 out:
1554         return ret;
1555 }
1556 EXPORT_SYMBOL(skb_checksum_help);
1557
1558 /**
1559  *      skb_gso_segment - Perform segmentation on skb.
1560  *      @skb: buffer to segment
1561  *      @features: features for the output path (see dev->features)
1562  *
1563  *      This function segments the given skb and returns a list of segments.
1564  *
1565  *      It may return NULL if the skb requires no segmentation.  This is
1566  *      only possible when GSO is used for verifying header integrity.
1567  */
1568 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1569 {
1570         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1571         struct packet_type *ptype;
1572         __be16 type = skb->protocol;
1573         int err;
1574
1575         skb_reset_mac_header(skb);
1576         skb->mac_len = skb->network_header - skb->mac_header;
1577         __skb_pull(skb, skb->mac_len);
1578
1579         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1580                 struct net_device *dev = skb->dev;
1581                 struct ethtool_drvinfo info = {};
1582
1583                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1584                         dev->ethtool_ops->get_drvinfo(dev, &info);
1585
1586                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1587                         "ip_summed=%d",
1588                      info.driver, dev ? dev->features : 0L,
1589                      skb->sk ? skb->sk->sk_route_caps : 0L,
1590                      skb->len, skb->data_len, skb->ip_summed);
1591
1592                 if (skb_header_cloned(skb) &&
1593                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1594                         return ERR_PTR(err);
1595         }
1596
1597         rcu_read_lock();
1598         list_for_each_entry_rcu(ptype,
1599                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1600                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1601                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1602                                 err = ptype->gso_send_check(skb);
1603                                 segs = ERR_PTR(err);
1604                                 if (err || skb_gso_ok(skb, features))
1605                                         break;
1606                                 __skb_push(skb, (skb->data -
1607                                                  skb_network_header(skb)));
1608                         }
1609                         segs = ptype->gso_segment(skb, features);
1610                         break;
1611                 }
1612         }
1613         rcu_read_unlock();
1614
1615         __skb_push(skb, skb->data - skb_mac_header(skb));
1616
1617         return segs;
1618 }
1619 EXPORT_SYMBOL(skb_gso_segment);
1620
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack.  The output is rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        if (!net_ratelimit())
                return;

        printk(KERN_ERR "%s: hw csum failure.\n",
                dev ? dev->name : "<unknown>");
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1633
/*
 * Returns 1 if @skb has a page fragment in high memory while @dev does
 * not advertise NETIF_F_HIGHDMA, and 0 otherwise.  Always 0 on kernels
 * built without CONFIG_HIGHMEM.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int frag;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                                return 1;
                }
        }

#endif
        return 0;
}
1654
/*
 * Per-skb state kept in skb->cb while a list of GSO segments hangs off
 * the skb: the destructor the skb had before dev_gso_skb_destructor
 * was installed in its place.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1660
1661 static void dev_gso_skb_destructor(struct sk_buff *skb)
1662 {
1663         struct dev_gso_cb *cb;
1664
1665         do {
1666                 struct sk_buff *nskb = skb->next;
1667
1668                 skb->next = nskb->next;
1669                 nskb->next = NULL;
1670                 kfree_skb(nskb);
1671         } while (skb->next);
1672
1673         cb = DEV_GSO_CB(skb);
1674         if (cb->destructor)
1675                 cb->destructor(skb);
1676 }
1677
1678 /**
1679  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1680  *      @skb: buffer to segment
1681  *
1682  *      This function segments the given skb and stores the list of segments
1683  *      in skb->next.
1684  */
1685 static int dev_gso_segment(struct sk_buff *skb)
1686 {
1687         struct net_device *dev = skb->dev;
1688         struct sk_buff *segs;
1689         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1690                                          NETIF_F_SG : 0);
1691
1692         segs = skb_gso_segment(skb, features);
1693
1694         /* Verifying header integrity only. */
1695         if (!segs)
1696                 return 0;
1697
1698         if (IS_ERR(segs))
1699                 return PTR_ERR(segs);
1700
1701         skb->next = segs;
1702         DEV_GSO_CB(skb)->destructor = skb->destructor;
1703         skb->destructor = dev_gso_skb_destructor;
1704
1705         return 0;
1706 }
1707
/*
 * dev_hard_start_xmit - hand an skb, or its list of GSO segments, to the
 * driver's ndo_start_xmit.
 * @skb: buffer to send; a non-NULL skb->next heads a list of segments
 * @dev: device whose netdev_ops->ndo_start_xmit is called
 * @txq: transmit queue; txq_trans_update() is called on it after each
 *       NETDEV_TX_OK
 *
 * Returns the driver's return code for the frame at which sending stopped,
 * NETDEV_TX_BUSY if the queue got stopped while segments were still
 * pending, or NETDEV_TX_OK once all segments were sent (or segmentation
 * failed and the skb was freed).
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int rc;

        /* Nothing chained on skb->next: no segment list has been built. */
        if (likely(!skb->next)) {
                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                if (netif_needs_gso(dev, skb)) {
                        if (unlikely(dev_gso_segment(skb)))
                                goto out_kfree_skb;
                        /* Segmentation produced a list: send it piecewise. */
                        if (skb->next)
                                goto gso;
                }

                /*
                 * If device doesn't need skb->dst, release it right now while
                 * it's hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(skb);

                rc = ops->ndo_start_xmit(skb, dev);
                if (rc == NETDEV_TX_OK)
                        txq_trans_update(txq);
                /*
                 * TODO: if skb_orphan() was called by
                 * dev->hard_start_xmit() (for example, the unmodified
                 * igb driver does that; bnx2 doesn't), then
                 * skb_tx_software_timestamp() will be unable to send
                 * back the time stamp.
                 *
                 * How can this be prevented? Always create another
                 * reference to the socket before calling
                 * dev->hard_start_xmit()? Prevent that skb_orphan()
                 * does anything in dev->hard_start_xmit() by clearing
                 * the skb destructor before the call and restoring it
                 * afterwards, then doing the skb_orphan() ourselves?
                 */
                return rc;
        }

gso:
        do {
                /* Detach the head segment from the list before sending it. */
                struct sk_buff *nskb = skb->next;

                skb->next = nskb->next;
                nskb->next = NULL;

                /*
                 * If device doesn't need nskb->dst, release it right now while
                 * it's hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(nskb);

                rc = ops->ndo_start_xmit(nskb, dev);
                if (unlikely(rc != NETDEV_TX_OK)) {
                        /* Not sent: relink it at the head of the list. */
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                txq_trans_update(txq);
                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

        /* List drained: restore the destructor saved by dev_gso_segment(). */
        skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
        kfree_skb(skb);
        return NETDEV_TX_OK;
}
1783
1784 static u32 skb_tx_hashrnd;
1785
1786 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1787 {
1788         u32 hash;
1789
1790         if (skb_rx_queue_recorded(skb)) {
1791                 hash = skb_get_rx_queue(skb);
1792                 while (unlikely(hash >= dev->real_num_tx_queues))
1793                         hash -= dev->real_num_tx_queues;
1794                 return hash;
1795         }
1796
1797         if (skb->sk && skb->sk->sk_hash)
1798                 hash = skb->sk->sk_hash;
1799         else
1800                 hash = skb->protocol;
1801
1802         hash = jhash_1word(hash, skb_tx_hashrnd);
1803
1804         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1805 }
1806 EXPORT_SYMBOL(skb_tx_hash);
1807
1808 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1809                                         struct sk_buff *skb)
1810 {
1811         const struct net_device_ops *ops = dev->netdev_ops;
1812         u16 queue_index = 0;
1813
1814         if (ops->ndo_select_queue)
1815                 queue_index = ops->ndo_select_queue(dev, skb);
1816         else if (dev->real_num_tx_queues > 1)
1817                 queue_index = skb_tx_hash(dev, skb);
1818
1819         skb_set_queue_mapping(skb, queue_index);
1820         return netdev_get_tx_queue(dev, queue_index);
1821 }
1822
1823 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1824                                  struct net_device *dev,
1825                                  struct netdev_queue *txq)
1826 {
1827         spinlock_t *root_lock = qdisc_lock(q);
1828         int rc;
1829
1830         spin_lock(root_lock);
1831         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1832                 kfree_skb(skb);
1833                 rc = NET_XMIT_DROP;
1834         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1835                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1836                 /*
1837                  * This is a work-conserving queue; there are no old skbs
1838                  * waiting to be sent out; and the qdisc is not running -
1839                  * xmit the skb directly.
1840                  */
1841                 __qdisc_update_bstats(q, skb->len);
1842                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1843                         __qdisc_run(q);
1844                 else
1845                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
1846
1847                 rc = NET_XMIT_SUCCESS;
1848         } else {
1849                 rc = qdisc_enqueue_root(skb, q);
1850                 qdisc_run(q);
1851         }
1852         spin_unlock(root_lock);
1853
1854         return rc;
1855 }
1856
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      This function can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can
 *      also be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        /* Returned as-is when linearizing or checksumming below fails. */
        int rc = -ENOMEM;

        /* GSO will handle the following emulations directly. */
        if (netif_needs_gso(dev, skb))
                goto gso;

        /* Linearize a frag list the device cannot take. */
        if (skb_has_frags(skb) &&
            !(dev->features & NETIF_F_FRAGLIST) &&
            __skb_linearize(skb))
                goto out_kfree_skb;

        /* Fragmented skb is linearized if device does not support SG,
         * or if at least one of fragments is in highmem and device
         * does not support DMA from it.
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
            __skb_linearize(skb))
                goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                skb_set_transport_header(skb, skb->csum_start -
                                              skb_headroom(skb));
                if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
                        goto out_kfree_skb;
        }

gso:
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        /* The qdisc has an enqueue operation: go through it. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* This CPU being the recorded lock owner means recursion. */
                if (txq->xmit_lock_owner != cpu) {

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_tx_queue_stopped(txq)) {
                                rc = NET_XMIT_SUCCESS;
                                /* A zero return means the skb was handed over. */
                                if (!dev_hard_start_xmit(skb, dev, txq)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Device down, queue stopped, driver refusal or recursion. */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        /* Failure exit: the skb is freed here; rc holds the error. */
out_kfree_skb:
        kfree_skb(skb);
        return rc;
        /* Exit for paths on which the skb was already handed over. */
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
1984
1985
1986 /*=======================================================================
1987                         Receiver routines
1988   =======================================================================*/
1989
/* netif_rx() drops packets once the per-CPU input queue is longer than this. */
int netdev_max_backlog __read_mostly = 1000;
/* NOTE(review): not used in this part of the file; presumably the NAPI
 * processing budget - confirm against its users. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-CPU receive counters; netif_rx() updates .total and .dropped. */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1995
1996
1997 /**
1998  *      netif_rx        -       post buffer to the network code
1999  *      @skb: buffer to post
2000  *
2001  *      This function receives a packet from a device driver and queues it for
2002  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2003  *      may be dropped during processing for congestion control or by the
2004  *      protocol layers.
2005  *
2006  *      return values:
2007  *      NET_RX_SUCCESS  (no congestion)
2008  *      NET_RX_DROP     (packet was dropped)
2009  *
2010  */
2011
2012 int netif_rx(struct sk_buff *skb)
2013 {
2014         struct softnet_data *queue;
2015         unsigned long flags;
2016
2017         /* if netpoll wants it, pretend we never saw it */
2018         if (netpoll_rx(skb))
2019                 return NET_RX_DROP;
2020
2021         if (!skb->tstamp.tv64)
2022                 net_timestamp(skb);
2023
2024         /*
2025          * The code is rearranged so that the path is the most
2026          * short when CPU is congested, but is still operating.
2027          */
2028         local_irq_save(flags);
2029         queue = &__get_cpu_var(softnet_data);
2030
2031         __get_cpu_var(netdev_rx_stat).total++;
2032         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2033                 if (queue->input_pkt_queue.qlen) {
2034 enqueue:
2035                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2036                         local_irq_restore(flags);
2037                         return NET_RX_SUCCESS;
2038                 }
2039
2040                 napi_schedule(&queue->backlog);
2041                 goto enqueue;
2042         }
2043
2044         __get_cpu_var(netdev_rx_stat).dropped++;
2045         local_irq_restore(flags);
2046
2047         kfree_skb(skb);
2048         return NET_RX_DROP;
2049 }
2050 EXPORT_SYMBOL(netif_rx);
2051
/*
 * netif_rx_ni - netif_rx() wrapper that runs pending softirqs.
 *
 * With preemption disabled, posts @skb through netif_rx() and then calls
 * do_softirq() if any softirq is pending. Returns netif_rx()'s result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2065
2066 static void net_tx_action(struct softirq_action *h)
2067 {
2068         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2069
2070         if (sd->completion_queue) {
2071                 struct sk_buff *clist;
2072
2073                 local_irq_disable();
2074                 clist = sd->completion_queue;
2075                 sd->completion_queue = NULL;
2076                 local_irq_enable();
2077
2078                 while (clist) {
2079                         struct sk_buff *skb = clist;
2080                         clist = clist->next;
2081
2082                         WARN_ON(atomic_read(&skb->users));
2083                         __kfree_skb(skb);
2084                 }
2085         }
2086
2087         if (sd->output_queue) {
2088                 struct Qdisc *head;
2089
2090                 local_irq_disable();
2091                 head = sd->output_queue;
2092                 sd->output_queue = NULL;
2093                 local_irq_enable();
2094
2095                 while (head) {
2096                         struct Qdisc *q = head;
2097                         spinlock_t *root_lock;
2098
2099                         head = head->next_sched;
2100
2101                         root_lock = qdisc_lock(q);
2102                         if (spin_trylock(root_lock)) {
2103                                 smp_mb__before_clear_bit();
2104                                 clear_bit(__QDISC_STATE_SCHED,
2105                                           &q->state);
2106                                 qdisc_run(q);
2107                                 spin_unlock(root_lock);
2108                         } else {
2109                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2110                                               &q->state)) {
2111                                         __netif_reschedule(q);
2112                                 } else {
2113                                         smp_mb__before_clear_bit();
2114                                         clear_bit(__QDISC_STATE_SCHED,
2115                                                   &q->state);
2116                                 }
2117                         }
2118                 }
2119         }
2120 }
2121
2122 static inline int deliver_skb(struct sk_buff *skb,
2123                               struct packet_type *pt_prev,
2124                               struct net_device *orig_dev)
2125 {
2126         atomic_inc(&skb->users);
2127         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2128 }
2129
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
                                        struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * handle_bridge - give the bridge hook a chance to take @skb.
 *
 * Loopback packets and packets whose device has no bridge port are
 * returned untouched. Otherwise a pending handler in *pt_prev is
 * delivered first (its result goes to *ret, *pt_prev is cleared) and the
 * result of br_handle_frame_hook() is returned.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
                                            struct packet_type **pt_prev, int *ret,
                                            struct net_device *orig_dev)
{
        struct net_bridge_port *port;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return skb;

        port = rcu_dereference(skb->dev->br_port);
        if (!port)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
#endif
2167
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * handle_macvlan - give the macvlan hook a chance to take @skb.
 *
 * Packets whose device has no macvlan port are returned untouched.
 * Otherwise a pending handler in *pt_prev is delivered first (its result
 * goes to *ret, *pt_prev is cleared) and the result of
 * macvlan_handle_frame_hook() is returned.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
                                             struct packet_type **pt_prev,
                                             int *ret,
                                             struct net_device *orig_dev)
{
        struct packet_type *pending;

        if (!skb->dev->macvlan_port)
                return skb;

        pending = *pt_prev;
        if (pending) {
                *ret = deliver_skb(skb, pending, orig_dev);
                *pt_prev = NULL;
        }

        return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
#endif
2189
2190 #ifdef CONFIG_NET_CLS_ACT
2191 /* TODO: Maybe we should just force sch_ingress to be compiled in
2192  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2193  * a compare and 2 stores extra right now if we dont have it on
2194  * but have CONFIG_NET_CLS_ACT
2195  * NOTE: This doesnt stop any functionality; if you dont have
2196  * the ingress scheduler, you just cant add policies on ingress.
2197  *
2198  */
2199 static int ing_filter(struct sk_buff *skb)
2200 {
2201         struct net_device *dev = skb->dev;
2202         u32 ttl = G_TC_RTTL(skb->tc_verd);
2203         struct netdev_queue *rxq;
2204         int result = TC_ACT_OK;
2205         struct Qdisc *q;
2206
2207         if (MAX_RED_LOOP < ttl++) {
2208                 printk(KERN_WARNING
2209                        "Redir loop detected Dropping packet (%d->%d)\n",
2210                        skb->iif, dev->ifindex);
2211                 return TC_ACT_SHOT;
2212         }
2213
2214         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2215         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2216
2217         rxq = &dev->rx_queue;
2218
2219         q = rxq->qdisc;
2220         if (q != &noop_qdisc) {
2221                 spin_lock(qdisc_lock(q));
2222                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2223                         result = qdisc_enqueue_root(skb, q);
2224                 spin_unlock(qdisc_lock(q));
2225         }
2226
2227         return result;
2228 }
2229
2230 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2231                                          struct packet_type **pt_prev,
2232                                          int *ret, struct net_device *orig_dev)
2233 {
2234         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2235                 goto out;
2236
2237         if (*pt_prev) {
2238                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2239                 *pt_prev = NULL;
2240         } else {
2241                 /* Huh? Why does turning on AF_PACKET affect this? */
2242                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2243         }
2244
2245         switch (ing_filter(skb)) {
2246         case TC_ACT_SHOT:
2247         case TC_ACT_STOLEN:
2248                 kfree_skb(skb);
2249                 return NULL;
2250         }
2251
2252 out:
2253         skb->tc_verd = 0;
2254         return skb;
2255 }
2256 #endif
2257
2258 /*
2259  *      netif_nit_deliver - deliver received packets to network taps
2260  *      @skb: buffer
2261  *
2262  *      This function is used to deliver incoming packets to network
2263  *      taps. It should be used when the normal netif_receive_skb path
2264  *      is bypassed, for example because of VLAN acceleration.
2265  */
2266 void netif_nit_deliver(struct sk_buff *skb)
2267 {
2268         struct packet_type *ptype;
2269
2270         if (list_empty(&ptype_all))
2271                 return;
2272
2273         skb_reset_network_header(skb);
2274         skb_reset_transport_header(skb);
2275         skb->mac_len = skb->network_header - skb->mac_header;
2276
2277         rcu_read_lock();
2278         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2279                 if (!ptype->dev || ptype->dev == skb->dev)
2280                         deliver_skb(skb, ptype, skb->dev);
2281         }
2282         rcu_read_unlock();
2283 }
2284
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      netif_receive_skb() is the main receive data processing function.
 *      It always succeeds. The buffer may be dropped during processing
 *      for congestion control or by the protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        /*
         * pt_prev holds the most recently matched handler. Each earlier
         * match is delivered through deliver_skb(); the last one is
         * called directly at the end of the function.
         */
        struct packet_type *ptype, *pt_prev;
        struct net_device *orig_dev;
        struct net_device *null_or_orig;
        int ret = NET_RX_DROP;
        __be16 type;

        if (!skb->tstamp.tv64)
                net_timestamp(skb);

        /* A non-zero return from vlan_hwaccel_do_receive() ends processing. */
        if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
                return NET_RX_SUCCESS;

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        if (!skb->iif)
                skb->iif = skb->dev->ifindex;

        /*
         * With null_or_orig == NULL, handlers not bound to a device match
         * below. For a device with a master, either restrict delivery to
         * handlers bound to the original device, or retarget the skb to
         * the master.
         */
        null_or_orig = NULL;
        orig_dev = skb->dev;
        if (orig_dev->master) {
                if (skb_bond_should_drop(skb))
                        null_or_orig = orig_dev; /* deliver only exact match */
                else
                        skb->dev = orig_dev->master;
        }

        __get_cpu_var(netdev_rx_stat).total++;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;

        pt_prev = NULL;

        rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the ptype_all and ingress steps. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* First pass: handlers registered on ptype_all. */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                    ptype->dev == orig_dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        /* Each hook below returns NULL when it has taken the packet. */
        skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
        skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;

        /* Second pass: handlers hashed by protocol type in ptype_base. */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        /* Last match is called without taking an extra reference. */
        if (pt_prev) {
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
2397
2398 /* Network device is going away, flush any packets still pending  */
2399 static void flush_backlog(void *arg)
2400 {
2401         struct net_device *dev = arg;
2402         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2403         struct sk_buff *skb, *tmp;
2404
2405         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2406                 if (skb->dev == dev) {
2407                         __skb_unlink(skb, &queue->input_pkt_queue);
2408                         kfree_skb(skb);
2409                 }
2410 }
2411
2412 static int napi_gro_complete(struct sk_buff *skb)
2413 {
2414         struct packet_type *ptype;
2415         __be16 type = skb->protocol;
2416         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2417         int err = -ENOENT;
2418
2419         if (NAPI_GRO_CB(skb)->count == 1) {
2420                 skb_shinfo(skb)->gso_size = 0;
2421                 goto out;
2422         }
2423
2424         rcu_read_lock();
2425         list_for_each_entry_rcu(ptype, head, list) {
2426                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2427                         continue;
2428
2429                 err = ptype->gro_complete(skb);
2430                 break;
2431         }
2432         rcu_read_unlock();
2433
2434         if (err) {
2435                 WARN_ON(&ptype->list == head);
2436                 kfree_skb(skb);
2437                 return NET_RX_SUCCESS;
2438         }
2439
2440 out:
2441         return netif_receive_skb(skb);
2442 }
2443
2444 void napi_gro_flush(struct napi_struct *napi)
2445 {
2446         struct sk_buff *skb, *next;
2447
2448         for (skb = napi->gro_list; skb; skb = next) {
2449                 next = skb->next;
2450                 skb->next = NULL;
2451                 napi_gro_complete(skb);
2452         }
2453
2454         napi->gro_count = 0;
2455         napi->gro_list = NULL;
2456 }
2457 EXPORT_SYMBOL(napi_gro_flush);
2458
/*
 * dev_gro_receive - offer @skb to the GRO handler of its protocol.
 *
 * Returns GRO_NORMAL when GRO does not apply or the skb must not be held,
 * GRO_HELD when the skb was put on napi->gro_list, and GRO_MERGED or
 * GRO_MERGED_FREE (depending on the ->free flag left in the skb's GRO
 * control block) when it matched an existing flow.
 */
int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        int ret;

        if (!(skb->dev->features & NETIF_F_GRO))
                goto normal;

        if (skb_is_gso(skb) || skb_has_frags(skb))
                goto normal;

        /* Use the first device-independent handler with a gro_receive hook. */
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk ran off the end of the list: no handler was found. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* The handler pointed at a held skb: unlink it and complete it. */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Hold the skb: it becomes the new head of the GRO list. */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * The linear area is shorter than the GRO offset: copy the missing
         * bytes from frag0 to the tail and trim them off the first
         * fragment.
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_shinfo(skb)->frags[0].size -= grow;

                /* First fragment is now empty: release it, shift the rest. */
                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
2550
2551 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2552 {
2553         struct sk_buff *p;
2554
2555         if (netpoll_rx_on(skb))
2556                 return GRO_NORMAL;
2557
2558         for (p = napi->gro_list; p; p = p->next) {
2559                 NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2560                         && !compare_ether_header(skb_mac_header(p),
2561                                                  skb_gro_mac_header(skb));
2562                 NAPI_GRO_CB(p)->flush = 0;
2563         }
2564
2565         return dev_gro_receive(napi, skb);
2566 }
2567
2568 int napi_skb_finish(int ret, struct sk_buff *skb)
2569 {
2570         int err = NET_RX_SUCCESS;
2571
2572         switch (ret) {
2573         case GRO_NORMAL:
2574                 return netif_receive_skb(skb);
2575
2576         case GRO_DROP:
2577                 err = NET_RX_DROP;
2578                 /* fall through */
2579
2580         case GRO_MERGED_FREE:
2581                 kfree_skb(skb);
2582                 break;
2583         }
2584
2585         return err;
2586 }
2587 EXPORT_SYMBOL(napi_skb_finish);
2588
2589 void skb_gro_reset_offset(struct sk_buff *skb)
2590 {
2591         NAPI_GRO_CB(skb)->data_offset = 0;
2592         NAPI_GRO_CB(skb)->frag0 = NULL;
2593         NAPI_GRO_CB(skb)->frag0_len = 0;
2594
2595         if (skb->mac_header == skb->tail &&
2596             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2597                 NAPI_GRO_CB(skb)->frag0 =
2598                         page_address(skb_shinfo(skb)->frags[0].page) +
2599                         skb_shinfo(skb)->frags[0].page_offset;
2600                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2601         }
2602 }
2603 EXPORT_SYMBOL(skb_gro_reset_offset);
2604
/*
 * GRO entry point for a complete skb: reset the GRO offsets, run the
 * packet through __napi_gro_receive() and let napi_skb_finish() act on
 * the resulting verdict.  Returns napi_skb_finish()'s result.
 */
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	int verdict;

	skb_gro_reset_offset(skb);
	verdict = __napi_gro_receive(napi, skb);

	return napi_skb_finish(verdict, skb);
}
EXPORT_SYMBOL(napi_gro_receive);
2612
2613 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2614 {
2615         __skb_pull(skb, skb_headlen(skb));
2616         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2617         skb->dev = napi->dev;
2618         skb->iif = 0;
2619
2620         napi->skb = skb;
2621 }
2622 EXPORT_SYMBOL(napi_reuse_skb);
2623
2624 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2625 {
2626         struct net_device *dev = napi->dev;
2627         struct sk_buff *skb = napi->skb;
2628
2629         if (!skb) {
2630                 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2631                 if (!skb)
2632                         goto out;
2633
2634                 skb_reserve(skb, NET_IP_ALIGN);
2635
2636                 napi->skb = skb;
2637         }
2638
2639 out:
2640         return skb;
2641 }
2642 EXPORT_SYMBOL(napi_get_frags);
2643
2644 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2645 {
2646         int err = NET_RX_SUCCESS;
2647
2648         switch (ret) {
2649         case GRO_NORMAL:
2650         case GRO_HELD:
2651                 skb->protocol = eth_type_trans(skb, skb->dev);
2652
2653                 if (ret == GRO_NORMAL)
2654                         return netif_receive_skb(skb);
2655
2656                 skb_gro_pull(skb, -ETH_HLEN);
2657                 break;
2658
2659         case GRO_DROP:
2660                 err = NET_RX_DROP;
2661                 /* fall through */
2662
2663         case GRO_MERGED_FREE:
2664                 napi_reuse_skb(napi, skb);
2665                 break;
2666         }
2667
2668         return err;
2669 }
2670 EXPORT_SYMBOL(napi_frags_finish);
2671
2672 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2673 {
2674         struct sk_buff *skb = napi->skb;
2675         struct ethhdr *eth;
2676         unsigned int hlen;
2677         unsigned int off;
2678
2679         napi->skb = NULL;
2680
2681         skb_reset_mac_header(skb);
2682         skb_gro_reset_offset(skb);
2683
2684         off = skb_gro_offset(skb);
2685         hlen = off + sizeof(*eth);
2686         eth = skb_gro_header_fast(skb, off);
2687         if (skb_gro_header_hard(skb, hlen)) {
2688                 eth = skb_gro_header_slow(skb, hlen, off);
2689                 if (unlikely(!eth)) {
2690                         napi_reuse_skb(napi, skb);
2691                         skb = NULL;
2692                         goto out;
2693                 }
2694         }
2695
2696         skb_gro_pull(skb, sizeof(*eth));
2697
2698         /*
2699          * This works because the only protocols we care about don't require
2700          * special handling.  We'll fix it up properly at the end.
2701          */
2702         skb->protocol = eth->h_proto;
2703
2704 out:
2705         return skb;
2706 }
2707 EXPORT_SYMBOL(napi_frags_skb);
2708
2709 int napi_gro_frags(struct napi_struct *napi)
2710 {
2711         struct sk_buff *skb = napi_frags_skb(napi);
2712
2713         if (!skb)
2714                 return NET_RX_DROP;
2715
2716         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2717 }
2718 EXPORT_SYMBOL(napi_gro_frags);
2719
2720 static int process_backlog(struct napi_struct *napi, int quota)
2721 {
2722         int work = 0;
2723         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2724         unsigned long start_time = jiffies;
2725
2726         napi->weight = weight_p;
2727         do {
2728                 struct sk_buff *skb;
2729
2730                 local_irq_disable();
2731                 skb = __skb_dequeue(&queue->input_pkt_queue);
2732                 if (!skb) {
2733                         __napi_complete(napi);
2734                         local_irq_enable();
2735                         break;
2736                 }
2737                 local_irq_enable();
2738
2739                 netif_receive_skb(skb);
2740         } while (++work < quota && jiffies == start_time);
2741
2742         return work;
2743 }
2744
2745 /**
2746  * __napi_schedule - schedule for receive
2747  * @n: entry to schedule
2748  *
2749  * The entry's receive function will be scheduled to run
2750  */
2751 void __napi_schedule(struct napi_struct *n)
2752 {
2753         unsigned long flags;
2754
2755         local_irq_save(flags);
2756         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2757         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2758         local_irq_restore(flags);
2759 }
2760 EXPORT_SYMBOL(__napi_schedule);
2761
2762 void __napi_complete(struct napi_struct *n)
2763 {
2764         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2765         BUG_ON(n->gro_list);
2766
2767         list_del(&n->poll_list);
2768         smp_mb__before_clear_bit();
2769         clear_bit(NAPI_STATE_SCHED, &n->state);
2770 }
2771 EXPORT_SYMBOL(__napi_complete);
2772
2773 void napi_complete(struct napi_struct *n)
2774 {
2775         unsigned long flags;
2776
2777         /*
2778          * don't let napi dequeue from the cpu poll list
2779          * just in case its running on a different cpu
2780          */
2781         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2782                 return;
2783
2784         napi_gro_flush(n);
2785         local_irq_save(flags);
2786         __napi_complete(n);
2787         local_irq_restore(flags);
2788 }
2789 EXPORT_SYMBOL(napi_complete);
2790
2791 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2792                     int (*poll)(struct napi_struct *, int), int weight)
2793 {
2794         INIT_LIST_HEAD(&napi->poll_list);
2795         napi->gro_count = 0;
2796         napi->gro_list = NULL;
2797         napi->skb = NULL;
2798         napi->poll = poll;
2799         napi->weight = weight;
2800         list_add(&napi->dev_list, &dev->napi_list);
2801         napi->dev = dev;
2802 #ifdef CONFIG_NETPOLL
2803         spin_lock_init(&napi->poll_lock);
2804         napi->poll_owner = -1;
2805 #endif
2806         set_bit(NAPI_STATE_SCHED, &napi->state);
2807 }
2808 EXPORT_SYMBOL(netif_napi_add);
2809
2810 void netif_napi_del(struct napi_struct *napi)
2811 {
2812         struct sk_buff *skb, *next;
2813
2814         list_del_init(&napi->dev_list);
2815         napi_free_frags(napi);
2816
2817         for (skb = napi->gro_list; skb; skb = next) {
2818                 next = skb->next;
2819                 skb->next = NULL;
2820                 kfree_skb(skb);
2821         }
2822
2823         napi->gro_list = NULL;
2824         napi->gro_count = 0;
2825 }
2826 EXPORT_SYMBOL(netif_napi_del);
2827
2828
2829 static void net_rx_action(struct softirq_action *h)
2830 {
2831         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2832         unsigned long time_limit = jiffies + 2;
2833         int budget = netdev_budget;
2834         void *have;
2835
2836         local_irq_disable();
2837
2838         while (!list_empty(list)) {
2839                 struct napi_struct *n;
2840                 int work, weight;
2841
2842                 /* If softirq window is exhuasted then punt.
2843                  * Allow this to run for 2 jiffies since which will allow
2844                  * an average latency of 1.5/HZ.
2845                  */
2846                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2847                         goto softnet_break;
2848
2849                 local_irq_enable();
2850
2851                 /* Even though interrupts have been re-enabled, this
2852                  * access is safe because interrupts can only add new
2853                  * entries to the tail of this list, and only ->poll()
2854                  * calls can remove this head entry from the list.
2855                  */
2856                 n = list_entry(list->next, struct napi_struct, poll_list);
2857
2858                 have = netpoll_poll_lock(n);
2859
2860                 weight = n->weight;
2861
2862                 /* This NAPI_STATE_SCHED test is for avoiding a race
2863                  * with netpoll's poll_napi().  Only the entity which
2864                  * obtains the lock and sees NAPI_STATE_SCHED set will
2865                  * actually make the ->poll() call.  Therefore we avoid
2866                  * accidently calling ->poll() when NAPI is not scheduled.
2867                  */
2868                 work = 0;
2869                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2870                         work = n->poll(n, weight);
2871                         trace_napi_poll(n);
2872                 }
2873
2874                 WARN_ON_ONCE(work > weight);
2875
2876                 budget -= work;
2877
2878                 local_irq_disable();
2879
2880                 /* Drivers must not modify the NAPI state if they
2881                  * consume the entire weight.  In such cases this code
2882                  * still "owns" the NAPI instance and therefore can
2883                  * move the instance around on the list at-will.
2884                  */
2885                 if (unlikely(work == weight)) {
2886                         if (unlikely(napi_disable_pending(n))) {
2887                                 local_irq_enable();
2888                                 napi_complete(n);
2889                                 local_irq_disable();
2890                         } else
2891                                 list_move_tail(&n->poll_list, list);
2892                 }
2893
2894                 netpoll_poll_unlock(have);
2895         }
2896 out:
2897         local_irq_enable();
2898
2899 #ifdef CONFIG_NET_DMA
2900         /*
2901          * There may not be any more sk_buffs coming right now, so push
2902          * any pending DMA copies to hardware
2903          */
2904         dma_issue_pending_all();
2905 #endif
2906
2907         return;
2908
2909 softnet_break:
2910         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2911         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2912         goto out;
2913 }
2914
2915 static gifconf_func_t *gifconf_list[NPROTO];
2916
2917 /**
2918  *      register_gifconf        -       register a SIOCGIF handler
2919  *      @family: Address family
2920  *      @gifconf: Function handler
2921  *
2922  *      Register protocol dependent address dumping routines. The handler
2923  *      that is passed must not be freed or reused until it has been replaced
2924  *      by another handler.
2925  */
2926 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2927 {
2928         if (family >= NPROTO)
2929                 return -EINVAL;
2930         gifconf_list[family] = gifconf;
2931         return 0;
2932 }
2933 EXPORT_SYMBOL(register_gifconf);
2934
2935
2936 /*
2937  *      Map an interface index to its name (SIOCGIFNAME)
2938  */
2939
2940 /*
2941  *      We need this ioctl for efficient implementation of the
2942  *      if_indextoname() function required by the IPv6 API.  Without
2943  *      it, we would have to search all the interfaces to find a
2944  *      match.  --pb
2945  */
2946
2947 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2948 {
2949         struct net_device *dev;
2950         struct ifreq ifr;
2951
2952         /*
2953          *      Fetch the caller's info block.
2954          */
2955
2956         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2957                 return -EFAULT;
2958
2959         read_lock(&dev_base_lock);
2960         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2961         if (!dev) {
2962                 read_unlock(&dev_base_lock);
2963                 return -ENODEV;
2964         }
2965
2966         strcpy(ifr.ifr_name, dev->name);
2967         read_unlock(&dev_base_lock);
2968
2969         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2970                 return -EFAULT;
2971         return 0;
2972 }
2973
2974 /*
2975  *      Perform a SIOCGIFCONF call. This structure will change
2976  *      size eventually, and there is nothing I can do about it.
2977  *      Thus we will need a 'compatibility mode'.
2978  */
2979
2980 static int dev_ifconf(struct net *net, char __user *arg)
2981 {
2982         struct ifconf ifc;
2983         struct net_device *dev;
2984         char __user *pos;
2985         int len;
2986         int total;
2987         int i;
2988
2989         /*
2990          *      Fetch the caller's info block.
2991          */
2992
2993         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2994                 return -EFAULT;
2995
2996         pos = ifc.ifc_buf;
2997         len = ifc.ifc_len;
2998
2999         /*
3000          *      Loop over the interfaces, and write an info block for each.
3001          */
3002
3003         total = 0;
3004         for_each_netdev(net, dev) {
3005                 for (i = 0; i < NPROTO; i++) {
3006                         if (gifconf_list[i]) {
3007                                 int done;
3008                                 if (!pos)
3009                                         done = gifconf_list[i](dev, NULL, 0);
3010                                 else
3011                                         done = gifconf_list[i](dev, pos + total,
3012                                                                len - total);
3013                                 if (done < 0)
3014                                         return -EFAULT;
3015                                 total += done;
3016                         }
3017                 }
3018         }
3019
3020         /*
3021          *      All done.  Write the updated control block back to the caller.
3022          */
3023         ifc.ifc_len = total;
3024
3025         /*
3026          *      Both BSD and Solaris return 0 here, so we do too.
3027          */
3028         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3029 }
3030
3031 #ifdef CONFIG_PROC_FS
3032 /*
3033  *      This is invoked by the /proc filesystem handler to display a device
3034  *      in detail.
3035  */
3036 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3037         __acquires(dev_base_lock)
3038 {
3039         struct net *net = seq_file_net(seq);
3040         loff_t off;
3041         struct net_device *dev;
3042
3043         read_lock(&dev_base_lock);
3044         if (!*pos)
3045                 return SEQ_START_TOKEN;
3046
3047         off = 1;
3048         for_each_netdev(net, dev)
3049                 if (off++ == *pos)
3050                         return dev;
3051
3052         return NULL;
3053 }
3054
3055 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3056 {
3057         struct net *net = seq_file_net(seq);
3058         ++*pos;
3059         return v == SEQ_START_TOKEN ?
3060                 first_net_device(net) : next_net_device((struct net_device *)v);
3061 }
3062
3063 void dev_seq_stop(struct seq_file *seq, void *v)
3064         __releases(dev_base_lock)
3065 {
3066         read_unlock(&dev_base_lock);
3067 }
3068
3069 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3070 {
3071         const struct net_device_stats *stats = dev_get_stats(dev);
3072
3073         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3074                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3075                    dev->name, stats->rx_bytes, stats->rx_packets,
3076                    stats->rx_errors,
3077                    stats->rx_dropped + stats->rx_missed_errors,
3078                    stats->rx_fifo_errors,
3079                    stats->rx_length_errors + stats->rx_over_errors +
3080                     stats->rx_crc_errors + stats->rx_frame_errors,
3081                    stats->rx_compressed, stats->multicast,
3082                    stats->tx_bytes, stats->tx_packets,
3083                    stats->tx_errors, stats->tx_dropped,
3084                    stats->tx_fifo_errors, stats->collisions,
3085                    stats->tx_carrier_errors +
3086                     stats->tx_aborted_errors +
3087                     stats->tx_window_errors +
3088                     stats->tx_heartbeat_errors,
3089                    stats->tx_compressed);
3090 }
3091
3092 /*
3093  *      Called from the PROCfs module. This now uses the new arbitrary sized
3094  *      /proc/net interface to create /proc/net/dev
3095  */
3096 static int dev_seq_show(struct seq_file *seq, void *v)
3097 {
3098         if (v == SEQ_START_TOKEN)
3099                 seq_puts(seq, "Inter-|   Receive                            "
3100                               "                    |  Transmit\n"
3101                               " face |bytes    packets errs drop fifo frame "
3102                               "compressed multicast|bytes    packets errs "
3103                               "drop fifo colls carrier compressed\n");
3104         else
3105                 dev_seq_printf_stats(seq, v);
3106         return 0;
3107 }
3108
3109 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3110 {
3111         struct netif_rx_stats *rc = NULL;
3112
3113         while (*pos < nr_cpu_ids)
3114                 if (cpu_online(*pos)) {
3115                         rc = &per_cpu(netdev_rx_stat, *pos);
3116                         break;
3117                 } else
3118                         ++*pos;
3119         return rc;
3120 }
3121
3122 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3123 {
3124         return softnet_get_online(pos);
3125 }
3126
3127 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3128 {
3129         ++*pos;
3130         return softnet_get_online(pos);
3131 }
3132
/* The softnet_stat walk takes no locks, so there is nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3136
3137 static int softnet_seq_show(struct seq_file *seq, void *v)
3138 {
3139         struct netif_rx_stats *s = v;
3140
3141         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3142                    s->total, s->dropped, s->time_squeeze, 0,
3143                    0, 0, 0, 0, /* was fastroute */
3144                    s->cpu_collision);
3145         return 0;
3146 }
3147
3148 static const struct seq_operations dev_seq_ops = {
3149         .start = dev_seq_start,
3150         .next  = dev_seq_next,
3151         .stop  = dev_seq_stop,
3152         .show  = dev_seq_show,
3153 };
3154
3155 static int dev_seq_open(struct inode *inode, struct file *file)
3156 {
3157         return seq_open_net(inode, file, &dev_seq_ops,
3158                             sizeof(struct seq_net_private));
3159 }
3160
3161 static const struct file_operations dev_seq_fops = {
3162         .owner   = THIS_MODULE,
3163         .open    = dev_seq_open,
3164         .read    = seq_read,
3165         .llseek  = seq_lseek,
3166         .release = seq_release_net,
3167 };
3168
3169 static const struct seq_operations softnet_seq_ops = {
3170         .start = softnet_seq_start,
3171         .next  = softnet_seq_next,
3172         .stop  = softnet_seq_stop,
3173         .show  = softnet_seq_show,
3174 };
3175
3176 static int softnet_seq_open(struct inode *inode, struct file *file)
3177 {
3178         return seq_open(file, &softnet_seq_ops);
3179 }
3180
3181 static const struct file_operations softnet_seq_fops = {
3182         .owner   = THIS_MODULE,
3183         .open    = softnet_seq_open,
3184         .read    = seq_read,
3185         .llseek  = seq_lseek,
3186         .release = seq_release,
3187 };
3188
3189 static void *ptype_get_idx(loff_t pos)
3190 {
3191         struct packet_type *pt = NULL;
3192         loff_t i = 0;
3193         int t;
3194
3195         list_for_each_entry_rcu(pt, &ptype_all, list) {
3196                 if (i == pos)
3197                         return pt;
3198                 ++i;
3199         }
3200
3201         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3202                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3203                         if (i == pos)
3204                                 return pt;
3205                         ++i;
3206                 }
3207         }
3208         return NULL;
3209 }
3210
3211 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3212         __acquires(RCU)
3213 {
3214         rcu_read_lock();
3215         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3216 }
3217
3218 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3219 {
3220         struct packet_type *pt;
3221         struct list_head *nxt;
3222         int hash;
3223
3224         ++*pos;
3225         if (v == SEQ_START_TOKEN)
3226                 return ptype_get_idx(0);
3227
3228         pt = v;
3229         nxt = pt->list.next;
3230         if (pt->type == htons(ETH_P_ALL)) {
3231                 if (nxt != &ptype_all)
3232                         goto found;
3233                 hash = 0;
3234                 nxt = ptype_base[0].next;
3235         } else
3236                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3237
3238         while (nxt == &ptype_base[hash]) {
3239                 if (++hash >= PTYPE_HASH_SIZE)
3240                         return NULL;
3241                 nxt = ptype_base[hash].next;
3242         }
3243 found:
3244         return list_entry(nxt, struct packet_type, list);
3245 }
3246
3247 static void ptype_seq_stop(struct seq_file *seq, void *v)
3248         __releases(RCU)
3249 {
3250         rcu_read_unlock();
3251 }
3252
3253 static int ptype_seq_show(struct seq_file *seq, void *v)
3254 {
3255         struct packet_type *pt = v;
3256
3257         if (v == SEQ_START_TOKEN)
3258                 seq_puts(seq, "Type Device      Function\n");
3259         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3260                 if (pt->type == htons(ETH_P_ALL))
3261                         seq_puts(seq, "ALL ");
3262                 else
3263                         seq_printf(seq, "%04x", ntohs(pt->type));
3264
3265                 seq_printf(seq, " %-8s %pF\n",
3266                            pt->dev ? pt->dev->name : "", pt->func);
3267         }
3268
3269         return 0;
3270 }
3271
3272 static const struct seq_operations ptype_seq_ops = {
3273         .start = ptype_seq_start,
3274         .next  = ptype_seq_next,
3275         .stop  = ptype_seq_stop,
3276         .show  = ptype_seq_show,
3277 };
3278
3279 static int ptype_seq_open(struct inode *inode, struct file *file)
3280 {
3281         return seq_open_net(inode, file, &ptype_seq_ops,
3282                         sizeof(struct seq_net_private));
3283 }
3284
3285 static const struct file_operations ptype_seq_fops = {
3286         .owner   = THIS_MODULE,
3287         .open    = ptype_seq_open,
3288         .read    = seq_read,
3289         .llseek  = seq_lseek,
3290         .release = seq_release_net,
3291 };
3292
3293
3294 static int __net_init dev_proc_net_init(struct net *net)
3295 {
3296         int rc = -ENOMEM;
3297
3298         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3299                 goto out;
3300         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3301                 goto out_dev;
3302         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3303                 goto out_softnet;
3304
3305         if (wext_proc_init(net))
3306                 goto out_ptype;
3307         rc = 0;
3308 out:
3309         return rc;
3310 out_ptype:
3311         proc_net_remove(net, "ptype");
3312 out_softnet:
3313         proc_net_remove(net, "softnet_stat");
3314 out_dev:
3315         proc_net_remove(net, "dev");
3316         goto out;
3317 }
3318
3319 static void __net_exit dev_proc_net_exit(struct net *net)
3320 {
3321         wext_proc_exit(net);
3322
3323         proc_net_remove(net, "ptype");
3324         proc_net_remove(net, "softnet_stat");
3325         proc_net_remove(net, "dev");
3326 }
3327
3328 static struct pernet_operations __net_initdata dev_proc_ops = {
3329         .init = dev_proc_net_init,
3330         .exit = dev_proc_net_exit,
3331 };
3332
3333 static int __init dev_proc_init(void)
3334 {
3335         return register_pernet_subsys(&dev_proc_ops);
3336 }
3337 #else
3338 #define dev_proc_init() 0
3339 #endif  /* CONFIG_PROC_FS */
3340
3341
3342 /**
3343  *      netdev_set_master       -       set up master/slave pair
3344  *      @slave: slave device
3345  *      @master: new master device
3346  *
3347  *      Changes the master device of the slave. Pass %NULL to break the
3348  *      bonding. The caller must hold the RTNL semaphore. On a failure
3349  *      a negative errno code is returned. On success the reference counts
3350  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3351  *      function returns zero.
3352  */
3353 int netdev_set_master(struct net_device *slave, struct net_device *master)
3354 {
3355         struct net_device *old = slave->master;
3356
3357         ASSERT_RTNL();
3358
3359         if (master) {
3360                 if (old)
3361                         return -EBUSY;
3362                 dev_hold(master);
3363         }
3364
3365         slave->master = master;
3366
3367         synchronize_net();
3368
3369         if (old)
3370                 dev_put(old);
3371
3372         if (master)
3373                 slave->flags |= IFF_SLAVE;
3374         else
3375                 slave->flags &= ~IFF_SLAVE;
3376
3377         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3378         return 0;
3379 }
3380 EXPORT_SYMBOL(netdev_set_master);
3381
3382 static void dev_change_rx_flags(struct net_device *dev, int flags)
3383 {
3384         const struct net_device_ops *ops = dev->netdev_ops;
3385
3386         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3387                 ops->ndo_change_rx_flags(dev, flags);
3388 }
3389
3390 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3391 {
3392         unsigned short old_flags = dev->flags;
3393         uid_t uid;
3394         gid_t gid;
3395
3396         ASSERT_RTNL();
3397
3398         dev->flags |= IFF_PROMISC;
3399         dev->promiscuity += inc;
3400         if (dev->promiscuity == 0) {
3401                 /*
3402                  * Avoid overflow.
3403                  * If inc causes overflow, untouch promisc and return error.
3404                  */
3405                 if (inc < 0)
3406                         dev->flags &= ~IFF_PROMISC;
3407                 else {
3408                         dev->promiscuity -= inc;
3409                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3410                                 "set promiscuity failed, promiscuity feature "
3411                                 "of device might be broken.\n", dev->name);
3412                         return -EOVERFLOW;
3413                 }
3414         }
3415         if (dev->flags != old_flags) {
3416                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3417                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3418                                                                "left");
3419                 if (audit_enabled) {
3420                         current_uid_gid(&uid, &gid);
3421                         audit_log(current->audit_context, GFP_ATOMIC,
3422                                 AUDIT_ANOM_PROMISCUOUS,
3423                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3424                                 dev->name, (dev->flags & IFF_PROMISC),
3425                                 (old_flags & IFF_PROMISC),
3426                                 audit_get_loginuid(current),
3427                                 uid, gid,
3428                                 audit_get_sessionid(current));
3429                 }
3430
3431                 dev_change_rx_flags(dev, IFF_PROMISC);
3432         }
3433         return 0;
3434 }
3435
3436 /**
3437  *      dev_set_promiscuity     - update promiscuity count on a device
3438  *      @dev: device
3439  *      @inc: modifier
3440  *
3441  *      Add or remove promiscuity from a device. While the count in the device
3442  *      remains above zero the interface remains promiscuous. Once it hits zero
3443  *      the device reverts back to normal filtering operation. A negative inc
3444  *      value is used to drop promiscuity on the device.
3445  *      Return 0 if successful or a negative errno code on error.
3446  */
3447 int dev_set_promiscuity(struct net_device *dev, int inc)
3448 {
3449         unsigned short old_flags = dev->flags;
3450         int err;
3451
3452         err = __dev_set_promiscuity(dev, inc);
3453         if (err < 0)
3454                 return err;
3455         if (dev->flags != old_flags)
3456                 dev_set_rx_mode(dev);
3457         return err;
3458 }
3459 EXPORT_SYMBOL(dev_set_promiscuity);
3460
3461 /**
3462  *      dev_set_allmulti        - update allmulti count on a device
3463  *      @dev: device
3464  *      @inc: modifier
3465  *
3466  *      Add or remove reception of all multicast frames to a device. While the
3467  *      count in the device remains above zero the interface remains listening
3468  *      to all interfaces. Once it hits zero the device reverts back to normal
3469  *      filtering operation. A negative @inc value is used to drop the counter
3470  *      when releasing a resource needing all multicasts.
3471  *      Return 0 if successful or a negative errno code on error.
3472  */
3473
3474 int dev_set_allmulti(struct net_device *dev, int inc)
3475 {
3476         unsigned short old_flags = dev->flags;
3477
3478         ASSERT_RTNL();
3479
3480         dev->flags |= IFF_ALLMULTI;
3481         dev->allmulti += inc;
3482         if (dev->allmulti == 0) {
3483                 /*
3484                  * Avoid overflow.
3485                  * If inc causes overflow, untouch allmulti and return error.
3486                  */
3487                 if (inc < 0)
3488                         dev->flags &= ~IFF_ALLMULTI;
3489                 else {
3490                         dev->allmulti -= inc;
3491                         printk(KERN_WARNING "%s: allmulti touches roof, "
3492                                 "set allmulti failed, allmulti feature of "
3493                                 "device might be broken.\n", dev->name);
3494                         return -EOVERFLOW;
3495                 }
3496         }
3497         if (dev->flags ^ old_flags) {
3498                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3499                 dev_set_rx_mode(dev);
3500         }
3501         return 0;
3502 }
3503 EXPORT_SYMBOL(dev_set_allmulti);
3504
3505 /*
3506  *      Upload unicast and multicast address lists to device and
3507  *      configure RX filtering. When the device doesn't support unicast
3508  *      filtering it is put in promiscuous mode while unicast addresses
3509  *      are present.
3510  */
3511 void __dev_set_rx_mode(struct net_device *dev)
3512 {
3513         const struct net_device_ops *ops = dev->netdev_ops;
3514
3515         /* dev_open will call this function so the list will stay sane. */
3516         if (!(dev->flags&IFF_UP))
3517                 return;
3518
3519         if (!netif_device_present(dev))
3520                 return;
3521
3522         if (ops->ndo_set_rx_mode)
3523                 ops->ndo_set_rx_mode(dev);
3524         else {
3525                 /* Unicast addresses changes may only happen under the rtnl,
3526                  * therefore calling __dev_set_promiscuity here is safe.
3527                  */
3528                 if (dev->uc.count > 0 && !dev->uc_promisc) {
3529                         __dev_set_promiscuity(dev, 1);
3530                         dev->uc_promisc = 1;
3531                 } else if (dev->uc.count == 0 && dev->uc_promisc) {
3532                         __dev_set_promiscuity(dev, -1);
3533                         dev->uc_promisc = 0;
3534                 }
3535
3536                 if (ops->ndo_set_multicast_list)
3537                         ops->ndo_set_multicast_list(dev);
3538         }
3539 }
3540
/*
 *	Locked wrapper around __dev_set_rx_mode(): takes the device address
 *	lock (BH-disabling variant) around the RX filter update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
3547
3548 /* hw addresses list handling functions */
3549
3550 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3551                          int addr_len, unsigned char addr_type)
3552 {
3553         struct netdev_hw_addr *ha;
3554         int alloc_size;
3555
3556         if (addr_len > MAX_ADDR_LEN)
3557                 return -EINVAL;
3558
3559         list_for_each_entry(ha, &list->list, list) {
3560                 if (!memcmp(ha->addr, addr, addr_len) &&
3561                     ha->type == addr_type) {
3562                         ha->refcount++;
3563                         return 0;
3564                 }
3565         }
3566
3567
3568         alloc_size = sizeof(*ha);
3569         if (alloc_size < L1_CACHE_BYTES)
3570                 alloc_size = L1_CACHE_BYTES;
3571         ha = kmalloc(alloc_size, GFP_ATOMIC);
3572         if (!ha)
3573                 return -ENOMEM;
3574         memcpy(ha->addr, addr, addr_len);
3575         ha->type = addr_type;
3576         ha->refcount = 1;
3577         ha->synced = false;
3578         list_add_tail_rcu(&ha->list, &list->list);
3579         list->count++;
3580         return 0;
3581 }
3582
3583 static void ha_rcu_free(struct rcu_head *head)
3584 {
3585         struct netdev_hw_addr *ha;
3586
3587         ha = container_of(head, struct netdev_hw_addr, rcu_head);
3588         kfree(ha);
3589 }
3590
3591 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3592                          int addr_len, unsigned char addr_type)
3593 {
3594         struct netdev_hw_addr *ha;
3595
3596         list_for_each_entry(ha, &list->list, list) {
3597                 if (!memcmp(ha->addr, addr, addr_len) &&
3598                     (ha->type == addr_type || !addr_type)) {
3599                         if (--ha->refcount)
3600                                 return 0;
3601                         list_del_rcu(&ha->list);
3602                         call_rcu(&ha->rcu_head, ha_rcu_free);
3603                         list->count--;
3604                         return 0;
3605                 }
3606         }
3607         return -ENOENT;
3608 }
3609
3610 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3611                                   struct netdev_hw_addr_list *from_list,
3612                                   int addr_len,
3613                                   unsigned char addr_type)
3614 {
3615         int err;
3616         struct netdev_hw_addr *ha, *ha2;
3617         unsigned char type;
3618
3619         list_for_each_entry(ha, &from_list->list, list) {
3620                 type = addr_type ? addr_type : ha->type;
3621                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3622                 if (err)
3623                         goto unroll;
3624         }
3625         return 0;
3626
3627 unroll:
3628         list_for_each_entry(ha2, &from_list->list, list) {
3629                 if (ha2 == ha)
3630                         break;
3631                 type = addr_type ? addr_type : ha2->type;
3632                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3633         }
3634         return err;
3635 }
3636
3637 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3638                                    struct netdev_hw_addr_list *from_list,
3639                                    int addr_len,
3640                                    unsigned char addr_type)
3641 {
3642         struct netdev_hw_addr *ha;
3643         unsigned char type;
3644
3645         list_for_each_entry(ha, &from_list->list, list) {
3646                 type = addr_type ? addr_type : ha->type;
3647                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3648         }
3649 }
3650
3651 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3652                           struct netdev_hw_addr_list *from_list,
3653                           int addr_len)
3654 {
3655         int err = 0;
3656         struct netdev_hw_addr *ha, *tmp;
3657
3658         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3659                 if (!ha->synced) {
3660                         err = __hw_addr_add(to_list, ha->addr,
3661                                             addr_len, ha->type);
3662                         if (err)
3663                                 break;
3664                         ha->synced = true;
3665                         ha->refcount++;
3666                 } else if (ha->refcount == 1) {
3667                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3668                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3669                 }
3670         }
3671         return err;
3672 }
3673
3674 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3675                              struct netdev_hw_addr_list *from_list,
3676                              int addr_len)
3677 {
3678         struct netdev_hw_addr *ha, *tmp;
3679
3680         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3681                 if (ha->synced) {
3682                         __hw_addr_del(to_list, ha->addr,
3683                                       addr_len, ha->type);
3684                         ha->synced = false;
3685                         __hw_addr_del(from_list, ha->addr,
3686                                       addr_len, ha->type);
3687                 }
3688         }
3689 }
3690
3691 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3692 {
3693         struct netdev_hw_addr *ha, *tmp;
3694
3695         list_for_each_entry_safe(ha, tmp, &list->list, list) {
3696                 list_del_rcu(&ha->list);
3697                 call_rcu(&ha->rcu_head, ha_rcu_free);
3698         }
3699         list->count = 0;
3700 }
3701
3702 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3703 {
3704         INIT_LIST_HEAD(&list->list);
3705         list->count = 0;
3706 }
3707
3708 /* Device addresses handling functions */
3709
3710 static void dev_addr_flush(struct net_device *dev)
3711 {
3712         /* rtnl_mutex must be held here */
3713
3714         __hw_addr_flush(&dev->dev_addrs);
3715         dev->dev_addr = NULL;
3716 }
3717
3718 static int dev_addr_init(struct net_device *dev)
3719 {
3720         unsigned char addr[MAX_ADDR_LEN];
3721         struct netdev_hw_addr *ha;
3722         int err;
3723
3724         /* rtnl_mutex must be held here */
3725
3726         __hw_addr_init(&dev->dev_addrs);
3727         memset(addr, 0, sizeof(addr));
3728         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3729                             NETDEV_HW_ADDR_T_LAN);
3730         if (!err) {
3731                 /*
3732                  * Get the first (previously created) address from the list
3733                  * and set dev_addr pointer to this location.
3734                  */
3735                 ha = list_first_entry(&dev->dev_addrs.list,
3736                                       struct netdev_hw_addr, list);
3737                 dev->dev_addr = ha->addr;
3738         }
3739         return err;
3740 }
3741
3742 /**
3743  *      dev_addr_add    - Add a device address
3744  *      @dev: device
3745  *      @addr: address to add
3746  *      @addr_type: address type
3747  *
3748  *      Add a device address to the device or increase the reference count if
3749  *      it already exists.
3750  *
3751  *      The caller must hold the rtnl_mutex.
3752  */
3753 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3754                  unsigned char addr_type)
3755 {
3756         int err;
3757
3758         ASSERT_RTNL();
3759
3760         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3761         if (!err)
3762                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3763         return err;
3764 }
3765 EXPORT_SYMBOL(dev_addr_add);
3766
3767 /**
3768  *      dev_addr_del    - Release a device address.
3769  *      @dev: device
3770  *      @addr: address to delete
3771  *      @addr_type: address type
3772  *
3773  *      Release reference to a device address and remove it from the device
3774  *      if the reference count drops to zero.
3775  *
3776  *      The caller must hold the rtnl_mutex.
3777  */
3778 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3779                  unsigned char addr_type)
3780 {
3781         int err;
3782         struct netdev_hw_addr *ha;
3783
3784         ASSERT_RTNL();
3785
3786         /*
3787          * We can not remove the first address from the list because
3788          * dev->dev_addr points to that.
3789          */
3790         ha = list_first_entry(&dev->dev_addrs.list,
3791                               struct netdev_hw_addr, list);
3792         if (ha->addr == dev->dev_addr && ha->refcount == 1)
3793                 return -ENOENT;
3794
3795         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3796                             addr_type);
3797         if (!err)
3798                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3799         return err;
3800 }
3801 EXPORT_SYMBOL(dev_addr_del);
3802
3803 /**
3804  *      dev_addr_add_multiple   - Add device addresses from another device
3805  *      @to_dev: device to which addresses will be added
3806  *      @from_dev: device from which addresses will be added
3807  *      @addr_type: address type - 0 means type will be used from from_dev
3808  *
3809  *      Add device addresses of the one device to another.
3810  **
3811  *      The caller must hold the rtnl_mutex.
3812  */
3813 int dev_addr_add_multiple(struct net_device *to_dev,
3814                           struct net_device *from_dev,
3815                           unsigned char addr_type)
3816 {
3817         int err;
3818
3819         ASSERT_RTNL();
3820
3821         if (from_dev->addr_len != to_dev->addr_len)
3822                 return -EINVAL;
3823         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3824                                      to_dev->addr_len, addr_type);
3825         if (!err)
3826                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3827         return err;
3828 }
3829 EXPORT_SYMBOL(dev_addr_add_multiple);
3830
3831 /**
3832  *      dev_addr_del_multiple   - Delete device addresses by another device
3833  *      @to_dev: device where the addresses will be deleted
3834  *      @from_dev: device by which addresses the addresses will be deleted
3835  *      @addr_type: address type - 0 means type will used from from_dev
3836  *
3837  *      Deletes addresses in to device by the list of addresses in from device.
3838  *
3839  *      The caller must hold the rtnl_mutex.
3840  */
3841 int dev_addr_del_multiple(struct net_device *to_dev,
3842                           struct net_device *from_dev,
3843                           unsigned char addr_type)
3844 {
3845         ASSERT_RTNL();
3846
3847         if (from_dev->addr_len != to_dev->addr_len)
3848                 return -EINVAL;
3849         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3850                                to_dev->addr_len, addr_type);
3851         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3852         return 0;
3853 }
3854 EXPORT_SYMBOL(dev_addr_del_multiple);
3855
3856 /* multicast addresses handling functions */
3857
3858 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3859                       void *addr, int alen, int glbl)
3860 {
3861         struct dev_addr_list *da;
3862
3863         for (; (da = *list) != NULL; list = &da->next) {
3864                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3865                     alen == da->da_addrlen) {
3866                         if (glbl) {
3867                                 int old_glbl = da->da_gusers;
3868                                 da->da_gusers = 0;
3869                                 if (old_glbl == 0)
3870                                         break;
3871                         }
3872                         if (--da->da_users)
3873                                 return 0;
3874
3875                         *list = da->next;
3876                         kfree(da);
3877                         (*count)--;
3878                         return 0;
3879                 }
3880         }
3881         return -ENOENT;
3882 }
3883
3884 int __dev_addr_add(struct dev_addr_list **list, int *count,
3885                    void *addr, int alen, int glbl)
3886 {
3887         struct dev_addr_list *da;
3888
3889         for (da = *list; da != NULL; da = da->next) {
3890                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3891                     da->da_addrlen == alen) {
3892                         if (glbl) {
3893                                 int old_glbl = da->da_gusers;
3894                                 da->da_gusers = 1;
3895                                 if (old_glbl)
3896                                         return 0;
3897                         }
3898                         da->da_users++;
3899                         return 0;
3900                 }
3901         }
3902
3903         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3904         if (da == NULL)
3905                 return -ENOMEM;
3906         memcpy(da->da_addr, addr, alen);
3907         da->da_addrlen = alen;
3908         da->da_users = 1;
3909         da->da_gusers = glbl ? 1 : 0;
3910         da->next = *list;
3911         *list = da;
3912         (*count)++;
3913         return 0;
3914 }
3915
3916 /**
3917  *      dev_unicast_delete      - Release secondary unicast address.
3918  *      @dev: device
3919  *      @addr: address to delete
3920  *
3921  *      Release reference to a secondary unicast address and remove it
3922  *      from the device if the reference count drops to zero.
3923  *
3924  *      The caller must hold the rtnl_mutex.
3925  */
3926 int dev_unicast_delete(struct net_device *dev, void *addr)
3927 {
3928         int err;
3929
3930         ASSERT_RTNL();
3931
3932         netif_addr_lock_bh(dev);
3933         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
3934                             NETDEV_HW_ADDR_T_UNICAST);
3935         if (!err)
3936                 __dev_set_rx_mode(dev);
3937         netif_addr_unlock_bh(dev);
3938         return err;
3939 }
3940 EXPORT_SYMBOL(dev_unicast_delete);
3941
3942 /**
3943  *      dev_unicast_add         - add a secondary unicast address
3944  *      @dev: device
3945  *      @addr: address to add
3946  *
3947  *      Add a secondary unicast address to the device or increase
3948  *      the reference count if it already exists.
3949  *
3950  *      The caller must hold the rtnl_mutex.
3951  */
3952 int dev_unicast_add(struct net_device *dev, void *addr)
3953 {
3954         int err;
3955
3956         ASSERT_RTNL();
3957
3958         netif_addr_lock_bh(dev);
3959         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
3960                             NETDEV_HW_ADDR_T_UNICAST);
3961         if (!err)
3962                 __dev_set_rx_mode(dev);
3963         netif_addr_unlock_bh(dev);
3964         return err;
3965 }
3966 EXPORT_SYMBOL(dev_unicast_add);
3967
3968 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3969                     struct dev_addr_list **from, int *from_count)
3970 {
3971         struct dev_addr_list *da, *next;
3972         int err = 0;
3973
3974         da = *from;
3975         while (da != NULL) {
3976                 next = da->next;
3977                 if (!da->da_synced) {
3978                         err = __dev_addr_add(to, to_count,
3979                                              da->da_addr, da->da_addrlen, 0);
3980                         if (err < 0)
3981                                 break;
3982                         da->da_synced = 1;
3983                         da->da_users++;
3984                 } else if (da->da_users == 1) {
3985                         __dev_addr_delete(to, to_count,
3986                                           da->da_addr, da->da_addrlen, 0);
3987                         __dev_addr_delete(from, from_count,
3988                                           da->da_addr, da->da_addrlen, 0);
3989                 }
3990                 da = next;
3991         }
3992         return err;
3993 }
3994 EXPORT_SYMBOL_GPL(__dev_addr_sync);
3995
3996 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3997                        struct dev_addr_list **from, int *from_count)
3998 {
3999         struct dev_addr_list *da, *next;
4000
4001         da = *from;
4002         while (da != NULL) {
4003                 next = da->next;
4004                 if (da->da_synced) {
4005                         __dev_addr_delete(to, to_count,
4006                                           da->da_addr, da->da_addrlen, 0);
4007                         da->da_synced = 0;
4008                         __dev_addr_delete(from, from_count,
4009                                           da->da_addr, da->da_addrlen, 0);
4010                 }
4011                 da = next;
4012         }
4013 }
4014 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4015
4016 /**
4017  *      dev_unicast_sync - Synchronize device's unicast list to another device
4018  *      @to: destination device
4019  *      @from: source device
4020  *
4021  *      Add newly added addresses to the destination device and release
4022  *      addresses that have no users left. The source device must be
4023  *      locked by netif_tx_lock_bh.
4024  *
4025  *      This function is intended to be called from the dev->set_rx_mode
4026  *      function of layered software devices.
4027  */
4028 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4029 {
4030         int err = 0;
4031
4032         if (to->addr_len != from->addr_len)
4033                 return -EINVAL;
4034
4035         netif_addr_lock_bh(to);
4036         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4037         if (!err)
4038                 __dev_set_rx_mode(to);
4039         netif_addr_unlock_bh(to);
4040         return err;
4041 }
4042 EXPORT_SYMBOL(dev_unicast_sync);
4043
4044 /**
4045  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4046  *      @to: destination device
4047  *      @from: source device
4048  *
4049  *      Remove all addresses that were added to the destination device by
4050  *      dev_unicast_sync(). This function is intended to be called from the
4051  *      dev->stop function of layered software devices.
4052  */
4053 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4054 {
4055         if (to->addr_len != from->addr_len)
4056                 return;
4057
4058         netif_addr_lock_bh(from);
4059         netif_addr_lock(to);
4060         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4061         __dev_set_rx_mode(to);
4062         netif_addr_unlock(to);
4063         netif_addr_unlock_bh(from);
4064 }
4065 EXPORT_SYMBOL(dev_unicast_unsync);
4066
4067 static void dev_unicast_flush(struct net_device *dev)
4068 {
4069         netif_addr_lock_bh(dev);
4070         __hw_addr_flush(&dev->uc);
4071         netif_addr_unlock_bh(dev);
4072 }
4073
4074 static void dev_unicast_init(struct net_device *dev)
4075 {
4076         __hw_addr_init(&dev->uc);
4077 }
4078
4079
4080 static void __dev_addr_discard(struct dev_addr_list **list)
4081 {
4082         struct dev_addr_list *tmp;
4083
4084         while (*list != NULL) {
4085                 tmp = *list;
4086                 *list = tmp->next;
4087                 if (tmp->da_users > tmp->da_gusers)
4088                         printk("__dev_addr_discard: address leakage! "
4089                                "da_users=%d\n", tmp->da_users);
4090                 kfree(tmp);
4091         }
4092 }
4093
4094 static void dev_addr_discard(struct net_device *dev)
4095 {
4096         netif_addr_lock_bh(dev);
4097
4098         __dev_addr_discard(&dev->mc_list);
4099         dev->mc_count = 0;
4100
4101         netif_addr_unlock_bh(dev);
4102 }
4103
4104 /**
4105  *      dev_get_flags - get flags reported to userspace
4106  *      @dev: device
4107  *
4108  *      Get the combination of flag bits exported through APIs to userspace.
4109  */
4110 unsigned dev_get_flags(const struct net_device *dev)
4111 {
4112         unsigned flags;
4113
4114         flags = (dev->flags & ~(IFF_PROMISC |
4115                                 IFF_ALLMULTI |
4116                                 IFF_RUNNING |
4117                                 IFF_LOWER_UP |
4118                                 IFF_DORMANT)) |
4119                 (dev->gflags & (IFF_PROMISC |
4120                                 IFF_ALLMULTI));
4121
4122         if (netif_running(dev)) {
4123                 if (netif_oper_up(dev))
4124                         flags |= IFF_RUNNING;
4125                 if (netif_carrier_ok(dev))
4126                         flags |= IFF_LOWER_UP;
4127                 if (netif_dormant(dev))
4128                         flags |= IFF_DORMANT;
4129         }
4130
4131         return flags;
4132 }
4133 EXPORT_SYMBOL(dev_get_flags);
4134
4135 /**
4136  *      dev_change_flags - change device settings
4137  *      @dev: device
4138  *      @flags: device state flags
4139  *
4140  *      Change settings on device based state flags. The flags are
4141  *      in the userspace exported format.
4142  */
4143 int dev_change_flags(struct net_device *dev, unsigned flags)
4144 {
4145         int ret, changes;
4146         int old_flags = dev->flags;
4147
4148         ASSERT_RTNL();
4149
4150         /*
4151          *      Set the flags on our device.
4152          */
4153
4154         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4155                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4156                                IFF_AUTOMEDIA)) |
4157                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4158                                     IFF_ALLMULTI));
4159
4160         /*
4161          *      Load in the correct multicast list now the flags have changed.
4162          */
4163
4164         if ((old_flags ^ flags) & IFF_MULTICAST)
4165                 dev_change_rx_flags(dev, IFF_MULTICAST);
4166
4167         dev_set_rx_mode(dev);
4168
4169         /*
4170          *      Have we downed the interface. We handle IFF_UP ourselves
4171          *      according to user attempts to set it, rather than blindly
4172          *      setting it.
4173          */
4174
4175         ret = 0;
4176         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4177                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4178
4179                 if (!ret)
4180                         dev_set_rx_mode(dev);
4181         }
4182
4183         if (dev->flags & IFF_UP &&
4184             ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4185                                           IFF_VOLATILE)))
4186                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4187
4188         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4189                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4190
4191                 dev->gflags ^= IFF_PROMISC;
4192                 dev_set_promiscuity(dev, inc);
4193         }
4194
4195         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4196            is important. Some (broken) drivers set IFF_PROMISC, when
4197            IFF_ALLMULTI is requested not asking us and not reporting.
4198          */
4199         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4200                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4201
4202                 dev->gflags ^= IFF_ALLMULTI;
4203                 dev_set_allmulti(dev, inc);
4204         }
4205
4206         /* Exclude state transition flags, already notified */
4207         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4208         if (changes)
4209                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4210
4211         return ret;
4212 }
4213 EXPORT_SYMBOL(dev_change_flags);
4214
4215 /**
4216  *      dev_set_mtu - Change maximum transfer unit
4217  *      @dev: device
4218  *      @new_mtu: new transfer unit
4219  *
4220  *      Change the maximum transfer size of the network device.
4221  */
4222 int dev_set_mtu(struct net_device *dev, int new_mtu)
4223 {
4224         const struct net_device_ops *ops = dev->netdev_ops;
4225         int err;
4226
4227         if (new_mtu == dev->mtu)
4228                 return 0;
4229
4230         /*      MTU must be positive.    */
4231         if (new_mtu < 0)
4232                 return -EINVAL;
4233
4234         if (!netif_device_present(dev))
4235                 return -ENODEV;
4236
4237         err = 0;
4238         if (ops->ndo_change_mtu)
4239                 err = ops->ndo_change_mtu(dev, new_mtu);
4240         else
4241                 dev->mtu = new_mtu;
4242
4243         if (!err && dev->flags & IFF_UP)
4244                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4245         return err;
4246 }
4247 EXPORT_SYMBOL(dev_set_mtu);
4248
4249 /**
4250  *      dev_set_mac_address - Change Media Access Control Address
4251  *      @dev: device
4252  *      @sa: new address
4253  *
4254  *      Change the hardware (MAC) address of the device
4255  */
4256 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4257 {
4258         const struct net_device_ops *ops = dev->netdev_ops;
4259         int err;
4260
4261         if (!ops->ndo_set_mac_address)
4262                 return -EOPNOTSUPP;
4263         if (sa->sa_family != dev->type)
4264                 return -EINVAL;
4265         if (!netif_device_present(dev))
4266                 return -ENODEV;
4267         err = ops->ndo_set_mac_address(dev, sa);
4268         if (!err)
4269                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4270         return err;
4271 }
4272 EXPORT_SYMBOL(dev_set_mac_address);
4273
4274 /*
4275  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4276  */
4277 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4278 {
4279         int err;
4280         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4281
4282         if (!dev)
4283                 return -ENODEV;
4284
4285         switch (cmd) {
4286         case SIOCGIFFLAGS:      /* Get interface flags */
4287                 ifr->ifr_flags = (short) dev_get_flags(dev);
4288                 return 0;
4289
4290         case SIOCGIFMETRIC:     /* Get the metric on the interface
4291                                    (currently unused) */
4292                 ifr->ifr_metric = 0;
4293                 return 0;
4294
4295         case SIOCGIFMTU:        /* Get the MTU of a device */
4296                 ifr->ifr_mtu = dev->mtu;
4297                 return 0;
4298
4299         case SIOCGIFHWADDR:
4300                 if (!dev->addr_len)
4301                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4302                 else
4303                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4304                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4305                 ifr->ifr_hwaddr.sa_family = dev->type;
4306                 return 0;
4307
4308         case SIOCGIFSLAVE:
4309                 err = -EINVAL;
4310                 break;
4311
4312         case SIOCGIFMAP:
4313                 ifr->ifr_map.mem_start = dev->mem_start;
4314                 ifr->ifr_map.mem_end   = dev->mem_end;
4315                 ifr->ifr_map.base_addr = dev->base_addr;
4316                 ifr->ifr_map.irq       = dev->irq;
4317                 ifr->ifr_map.dma       = dev->dma;
4318                 ifr->ifr_map.port      = dev->if_port;
4319                 return 0;
4320
4321         case SIOCGIFINDEX:
4322                 ifr->ifr_ifindex = dev->ifindex;
4323                 return 0;
4324
4325         case SIOCGIFTXQLEN:
4326                 ifr->ifr_qlen = dev->tx_queue_len;
4327                 return 0;
4328
4329         default:
4330                 /* dev_ioctl() should ensure this case
4331                  * is never reached
4332                  */
4333                 WARN_ON(1);
4334                 err = -EINVAL;
4335                 break;
4336
4337         }
4338         return err;
4339 }
4340
4341 /*
4342  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4343  */
4344 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4345 {
4346         int err;
4347         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4348         const struct net_device_ops *ops;
4349
4350         if (!dev)
4351                 return -ENODEV;
4352
4353         ops = dev->netdev_ops;
4354
4355         switch (cmd) {
4356         case SIOCSIFFLAGS:      /* Set interface flags */
4357                 return dev_change_flags(dev, ifr->ifr_flags);
4358
4359         case SIOCSIFMETRIC:     /* Set the metric on the interface
4360                                    (currently unused) */
4361                 return -EOPNOTSUPP;
4362
4363         case SIOCSIFMTU:        /* Set the MTU of a device */
4364                 return dev_set_mtu(dev, ifr->ifr_mtu);
4365
4366         case SIOCSIFHWADDR:
4367                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4368
4369         case SIOCSIFHWBROADCAST:
4370                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4371                         return -EINVAL;
4372                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4373                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4374                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4375                 return 0;
4376
4377         case SIOCSIFMAP:
4378                 if (ops->ndo_set_config) {
4379                         if (!netif_device_present(dev))
4380                                 return -ENODEV;
4381                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4382                 }
4383                 return -EOPNOTSUPP;
4384
4385         case SIOCADDMULTI:
4386                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4387                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4388                         return -EINVAL;
4389                 if (!netif_device_present(dev))
4390                         return -ENODEV;
4391                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4392                                   dev->addr_len, 1);
4393
4394         case SIOCDELMULTI:
4395                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4396                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4397                         return -EINVAL;
4398                 if (!netif_device_present(dev))
4399                         return -ENODEV;
4400                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4401                                      dev->addr_len, 1);
4402
4403         case SIOCSIFTXQLEN:
4404                 if (ifr->ifr_qlen < 0)
4405                         return -EINVAL;
4406                 dev->tx_queue_len = ifr->ifr_qlen;
4407                 return 0;
4408
4409         case SIOCSIFNAME:
4410                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4411                 return dev_change_name(dev, ifr->ifr_newname);
4412
4413         /*
4414          *      Unknown or private ioctl
4415          */
4416         default:
4417                 if ((cmd >= SIOCDEVPRIVATE &&
4418                     cmd <= SIOCDEVPRIVATE + 15) ||
4419                     cmd == SIOCBONDENSLAVE ||
4420                     cmd == SIOCBONDRELEASE ||
4421                     cmd == SIOCBONDSETHWADDR ||
4422                     cmd == SIOCBONDSLAVEINFOQUERY ||
4423                     cmd == SIOCBONDINFOQUERY ||
4424                     cmd == SIOCBONDCHANGEACTIVE ||
4425                     cmd == SIOCGMIIPHY ||
4426                     cmd == SIOCGMIIREG ||
4427                     cmd == SIOCSMIIREG ||
4428                     cmd == SIOCBRADDIF ||
4429                     cmd == SIOCBRDELIF ||
4430                     cmd == SIOCSHWTSTAMP ||
4431                     cmd == SIOCWANDEV) {
4432                         err = -EOPNOTSUPP;
4433                         if (ops->ndo_do_ioctl) {
4434                                 if (netif_device_present(dev))
4435                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4436                                 else
4437                                         err = -ENODEV;
4438                         }
4439                 } else
4440                         err = -EINVAL;
4441
4442         }
4443         return err;
4444 }
4445
4446 /*
4447  *      This function handles all "interface"-type I/O control requests. The actual
4448  *      'doing' part of this is dev_ifsioc above.
4449  */
4450
4451 /**
4452  *      dev_ioctl       -       network device ioctl
4453  *      @net: the applicable net namespace
4454  *      @cmd: command to issue
4455  *      @arg: pointer to a struct ifreq in user space
4456  *
4457  *      Issue ioctl functions to devices. This is normally called by the
4458  *      user space syscall interfaces but can sometimes be useful for
4459  *      other purposes. The return value is the return from the syscall if
4460  *      positive or a negative errno code on error.
4461  */
4462
4463 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4464 {
4465         struct ifreq ifr;
4466         int ret;
4467         char *colon;
4468
4469         /* One special case: SIOCGIFCONF takes ifconf argument
4470            and requires shared lock, because it sleeps writing
4471            to user space.
4472          */
4473
4474         if (cmd == SIOCGIFCONF) {
4475                 rtnl_lock();
4476                 ret = dev_ifconf(net, (char __user *) arg);
4477                 rtnl_unlock();
4478                 return ret;
4479         }
4480         if (cmd == SIOCGIFNAME)
4481                 return dev_ifname(net, (struct ifreq __user *)arg);
4482
4483         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4484                 return -EFAULT;
4485
4486         ifr.ifr_name[IFNAMSIZ-1] = 0;
4487
4488         colon = strchr(ifr.ifr_name, ':');
4489         if (colon)
4490                 *colon = 0;
4491
4492         /*
4493          *      See which interface the caller is talking about.
4494          */
4495
4496         switch (cmd) {
4497         /*
4498          *      These ioctl calls:
4499          *      - can be done by all.
4500          *      - atomic and do not require locking.
4501          *      - return a value
4502          */
4503         case SIOCGIFFLAGS:
4504         case SIOCGIFMETRIC:
4505         case SIOCGIFMTU:
4506         case SIOCGIFHWADDR:
4507         case SIOCGIFSLAVE:
4508         case SIOCGIFMAP:
4509         case SIOCGIFINDEX:
4510         case SIOCGIFTXQLEN:
4511                 dev_load(net, ifr.ifr_name);
4512                 read_lock(&dev_base_lock);
4513                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4514                 read_unlock(&dev_base_lock);
4515                 if (!ret) {
4516                         if (colon)
4517                                 *colon = ':';
4518                         if (copy_to_user(arg, &ifr,
4519                                          sizeof(struct ifreq)))
4520                                 ret = -EFAULT;
4521                 }
4522                 return ret;
4523
4524         case SIOCETHTOOL:
4525                 dev_load(net, ifr.ifr_name);
4526                 rtnl_lock();
4527                 ret = dev_ethtool(net, &ifr);
4528                 rtnl_unlock();
4529                 if (!ret) {
4530                         if (colon)
4531                                 *colon = ':';
4532                         if (copy_to_user(arg, &ifr,
4533                                          sizeof(struct ifreq)))
4534                                 ret = -EFAULT;
4535                 }
4536                 return ret;
4537
4538         /*
4539          *      These ioctl calls:
4540          *      - require superuser power.
4541          *      - require strict serialization.
4542          *      - return a value
4543          */
4544         case SIOCGMIIPHY:
4545         case SIOCGMIIREG:
4546         case SIOCSIFNAME:
4547                 if (!capable(CAP_NET_ADMIN))
4548                         return -EPERM;
4549                 dev_load(net, ifr.ifr_name);
4550                 rtnl_lock();
4551                 ret = dev_ifsioc(net, &ifr, cmd);
4552                 rtnl_unlock();
4553                 if (!ret) {
4554                         if (colon)
4555                                 *colon = ':';
4556                         if (copy_to_user(arg, &ifr,
4557                                          sizeof(struct ifreq)))
4558                                 ret = -EFAULT;
4559                 }
4560                 return ret;
4561
4562         /*
4563          *      These ioctl calls:
4564          *      - require superuser power.
4565          *      - require strict serialization.
4566          *      - do not return a value
4567          */
4568         case SIOCSIFFLAGS:
4569         case SIOCSIFMETRIC:
4570         case SIOCSIFMTU:
4571         case SIOCSIFMAP:
4572         case SIOCSIFHWADDR:
4573         case SIOCSIFSLAVE:
4574         case SIOCADDMULTI:
4575         case SIOCDELMULTI:
4576         case SIOCSIFHWBROADCAST:
4577         case SIOCSIFTXQLEN:
4578         case SIOCSMIIREG:
4579         case SIOCBONDENSLAVE:
4580         case SIOCBONDRELEASE:
4581         case SIOCBONDSETHWADDR:
4582         case SIOCBONDCHANGEACTIVE:
4583         case SIOCBRADDIF:
4584         case SIOCBRDELIF:
4585         case SIOCSHWTSTAMP:
4586                 if (!capable(CAP_NET_ADMIN))
4587                         return -EPERM;
4588                 /* fall through */
4589         case SIOCBONDSLAVEINFOQUERY:
4590         case SIOCBONDINFOQUERY:
4591                 dev_load(net, ifr.ifr_name);
4592                 rtnl_lock();
4593                 ret = dev_ifsioc(net, &ifr, cmd);
4594                 rtnl_unlock();
4595                 return ret;
4596
4597         case SIOCGIFMEM:
4598                 /* Get the per device memory space. We can add this but
4599                  * currently do not support it */
4600         case SIOCSIFMEM:
4601                 /* Set the per device memory buffer space.
4602                  * Not applicable in our case */
4603         case SIOCSIFLINK:
4604                 return -EINVAL;
4605
4606         /*
4607          *      Unknown or private ioctl.
4608          */
4609         default:
4610                 if (cmd == SIOCWANDEV ||
4611                     (cmd >= SIOCDEVPRIVATE &&
4612                      cmd <= SIOCDEVPRIVATE + 15)) {
4613                         dev_load(net, ifr.ifr_name);
4614                         rtnl_lock();
4615                         ret = dev_ifsioc(net, &ifr, cmd);
4616                         rtnl_unlock();
4617                         if (!ret && copy_to_user(arg, &ifr,
4618                                                  sizeof(struct ifreq)))
4619                                 ret = -EFAULT;
4620                         return ret;
4621                 }
4622                 /* Take care of Wireless Extensions */
4623                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4624                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4625                 return -EINVAL;
4626         }
4627 }
4628
4629
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	/* Last candidate handed out or tried; kept across calls. */
	static int ifindex;

	do {
		/* Stay strictly positive, restarting at 1 after a wrap. */
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4648
4649 /* Delayed registration/unregisteration */
4650 static LIST_HEAD(net_todo_list);
4651
4652 static void net_set_todo(struct net_device *dev)
4653 {
4654         list_add_tail(&dev->todo_list, &net_todo_list);
4655 }
4656
4657 static void rollback_registered(struct net_device *dev)
4658 {
4659         BUG_ON(dev_boot_phase);
4660         ASSERT_RTNL();
4661
4662         /* Some devices call without registering for initialization unwind. */
4663         if (dev->reg_state == NETREG_UNINITIALIZED) {
4664                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4665                                   "was registered\n", dev->name, dev);
4666
4667                 WARN_ON(1);
4668                 return;
4669         }
4670
4671         BUG_ON(dev->reg_state != NETREG_REGISTERED);
4672
4673         /* If device is running, close it first. */
4674         dev_close(dev);
4675
4676         /* And unlink it from device chain. */
4677         unlist_netdevice(dev);
4678
4679         dev->reg_state = NETREG_UNREGISTERING;
4680
4681         synchronize_net();
4682
4683         /* Shutdown queueing discipline. */
4684         dev_shutdown(dev);
4685
4686
4687         /* Notify protocols, that we are about to destroy
4688            this device. They should clean all the things.
4689         */
4690         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4691
4692         /*
4693          *      Flush the unicast and multicast chains
4694          */
4695         dev_unicast_flush(dev);
4696         dev_addr_discard(dev);
4697
4698         if (dev->netdev_ops->ndo_uninit)
4699                 dev->netdev_ops->ndo_uninit(dev);
4700
4701         /* Notifier chain MUST detach us from master device. */
4702         WARN_ON(dev->master);
4703
4704         /* Remove entries from kobject tree */
4705         netdev_unregister_kobject(dev);
4706
4707         synchronize_net();
4708
4709         dev_put(dev);
4710 }
4711
4712 static void __netdev_init_queue_locks_one(struct net_device *dev,
4713                                           struct netdev_queue *dev_queue,
4714                                           void *_unused)
4715 {
4716         spin_lock_init(&dev_queue->_xmit_lock);
4717         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4718         dev_queue->xmit_lock_owner = -1;
4719 }
4720
4721 static void netdev_init_queue_locks(struct net_device *dev)
4722 {
4723         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4724         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4725 }
4726
4727 unsigned long netdev_fix_features(unsigned long features, const char *name)
4728 {
4729         /* Fix illegal SG+CSUM combinations. */
4730         if ((features & NETIF_F_SG) &&
4731             !(features & NETIF_F_ALL_CSUM)) {
4732                 if (name)
4733                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4734                                "checksum feature.\n", name);
4735                 features &= ~NETIF_F_SG;
4736         }
4737
4738         /* TSO requires that SG is present as well. */
4739         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4740                 if (name)
4741                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4742                                "SG feature.\n", name);
4743                 features &= ~NETIF_F_TSO;
4744         }
4745
4746         if (features & NETIF_F_UFO) {
4747                 if (!(features & NETIF_F_GEN_CSUM)) {
4748                         if (name)
4749                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4750                                        "since no NETIF_F_HW_CSUM feature.\n",
4751                                        name);
4752                         features &= ~NETIF_F_UFO;
4753                 }
4754
4755                 if (!(features & NETIF_F_SG)) {
4756                         if (name)
4757                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4758                                        "since no NETIF_F_SG feature.\n", name);
4759                         features &= ~NETIF_F_UFO;
4760                 }
4761         }
4762
4763         return features;
4764 }
4765 EXPORT_SYMBOL(netdev_fix_features);
4766
4767 /**
4768  *      register_netdevice      - register a network device
4769  *      @dev: device to register
4770  *
4771  *      Take a completed network device structure and add it to the kernel
4772  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4773  *      chain. 0 is returned on success. A negative errno code is returned
4774  *      on a failure to set up the device, or if the name is a duplicate.
4775  *
4776  *      Callers must hold the rtnl semaphore. You may want
4777  *      register_netdev() instead of this.
4778  *
4779  *      BUGS:
4780  *      The locking appears insufficient to guarantee two parallel registers
4781  *      will not get the same name.
4782  */
4783
4784 int register_netdevice(struct net_device *dev)
4785 {
4786         struct hlist_head *head;
4787         struct hlist_node *p;
4788         int ret;
4789         struct net *net = dev_net(dev);
4790
4791         BUG_ON(dev_boot_phase);
4792         ASSERT_RTNL();
4793
4794         might_sleep();
4795
4796         /* When net_device's are persistent, this will be fatal. */
4797         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4798         BUG_ON(!net);
4799
4800         spin_lock_init(&dev->addr_list_lock);
4801         netdev_set_addr_lockdep_class(dev);
4802         netdev_init_queue_locks(dev);
4803
4804         dev->iflink = -1;
4805
4806         /* Init, if this function is available */
4807         if (dev->netdev_ops->ndo_init) {
4808                 ret = dev->netdev_ops->ndo_init(dev);
4809                 if (ret) {
4810                         if (ret > 0)
4811                                 ret = -EIO;
4812                         goto out;
4813                 }
4814         }
4815
4816         if (!dev_valid_name(dev->name)) {
4817                 ret = -EINVAL;
4818                 goto err_uninit;
4819         }
4820
4821         dev->ifindex = dev_new_index(net);
4822         if (dev->iflink == -1)
4823                 dev->iflink = dev->ifindex;
4824
4825         /* Check for existence of name */
4826         head = dev_name_hash(net, dev->name);
4827         hlist_for_each(p, head) {
4828                 struct net_device *d
4829                         = hlist_entry(p, struct net_device, name_hlist);
4830                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4831                         ret = -EEXIST;
4832                         goto err_uninit;
4833                 }
4834         }
4835
4836         /* Fix illegal checksum combinations */
4837         if ((dev->features & NETIF_F_HW_CSUM) &&
4838             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4839                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4840                        dev->name);
4841                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4842         }
4843
4844         if ((dev->features & NETIF_F_NO_CSUM) &&
4845             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4846                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4847                        dev->name);
4848                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4849         }
4850
4851         dev->features = netdev_fix_features(dev->features, dev->name);
4852
4853         /* Enable software GSO if SG is supported. */
4854         if (dev->features & NETIF_F_SG)
4855                 dev->features |= NETIF_F_GSO;
4856
4857         netdev_initialize_kobject(dev);
4858         ret = netdev_register_kobject(dev);
4859         if (ret)
4860                 goto err_uninit;
4861         dev->reg_state = NETREG_REGISTERED;
4862
4863         /*
4864          *      Default initial state at registry is that the
4865          *      device is present.
4866          */
4867
4868         set_bit(__LINK_STATE_PRESENT, &dev->state);
4869
4870         dev_init_scheduler(dev);
4871         dev_hold(dev);
4872         list_netdevice(dev);
4873
4874         /* Notify protocols, that a new device appeared. */
4875         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4876         ret = notifier_to_errno(ret);
4877         if (ret) {
4878                 rollback_registered(dev);
4879                 dev->reg_state = NETREG_UNREGISTERED;
4880         }
4881         /*
4882          *      Prevent userspace races by waiting until the network
4883          *      device is fully setup before sending notifications.
4884          */
4885         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
4886
4887 out:
4888         return ret;
4889
4890 err_uninit:
4891         if (dev->netdev_ops->ndo_uninit)
4892                 dev->netdev_ops->ndo_uninit(dev);
4893         goto out;
4894 }
4895 EXPORT_SYMBOL(register_netdevice);
4896
4897 /**
4898  *      init_dummy_netdev       - init a dummy network device for NAPI
4899  *      @dev: device to init
4900  *
4901  *      This takes a network device structure and initialize the minimum
4902  *      amount of fields so it can be used to schedule NAPI polls without
4903  *      registering a full blown interface. This is to be used by drivers
4904  *      that need to tie several hardware interfaces to a single NAPI
4905  *      poll scheduler due to HW limitations.
4906  */
4907 int init_dummy_netdev(struct net_device *dev)
4908 {
4909         /* Clear everything. Note we don't initialize spinlocks
4910          * are they aren't supposed to be taken by any of the
4911          * NAPI code and this dummy netdev is supposed to be
4912          * only ever used for NAPI polls
4913          */
4914         memset(dev, 0, sizeof(struct net_device));
4915
4916         /* make sure we BUG if trying to hit standard
4917          * register/unregister code path
4918          */
4919         dev->reg_state = NETREG_DUMMY;
4920
4921         /* initialize the ref count */
4922         atomic_set(&dev->refcnt, 1);
4923
4924         /* NAPI wants this */
4925         INIT_LIST_HEAD(&dev->napi_list);
4926
4927         /* a dummy interface is started by default */
4928         set_bit(__LINK_STATE_PRESENT, &dev->state);
4929         set_bit(__LINK_STATE_START, &dev->state);
4930
4931         return 0;
4932 }
4933 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4934
4935
4936 /**
4937  *      register_netdev - register a network device
4938  *      @dev: device to register
4939  *
4940  *      Take a completed network device structure and add it to the kernel
4941  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4942  *      chain. 0 is returned on success. A negative errno code is returned
4943  *      on a failure to set up the device, or if the name is a duplicate.
4944  *
4945  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4946  *      and expands the device name if you passed a format string to
4947  *      alloc_netdev.
4948  */
4949 int register_netdev(struct net_device *dev)
4950 {
4951         int err;
4952
4953         rtnl_lock();
4954
4955         /*
4956          * If the name is a format string the caller wants us to do a
4957          * name allocation.
4958          */
4959         if (strchr(dev->name, '%')) {
4960                 err = dev_alloc_name(dev, dev->name);
4961                 if (err < 0)
4962                         goto out;
4963         }
4964
4965         err = register_netdevice(dev);
4966 out:
4967         rtnl_unlock();
4968         return err;
4969 }
4970 EXPORT_SYMBOL(register_netdev);
4971
4972 /*
4973  * netdev_wait_allrefs - wait until all references are gone.
4974  *
4975  * This is called when unregistering network devices.
4976  *
4977  * Any protocol or device that holds a reference should register
4978  * for netdevice notification, and cleanup and put back the
4979  * reference if they receive an UNREGISTER event.
4980  * We can get stuck here if buggy protocols don't correctly
4981  * call dev_put.
4982  */
4983 static void netdev_wait_allrefs(struct net_device *dev)
4984 {
4985         unsigned long rebroadcast_time, warning_time;
4986
4987         rebroadcast_time = warning_time = jiffies;
4988         while (atomic_read(&dev->refcnt) != 0) {
4989                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4990                         rtnl_lock();
4991
4992                         /* Rebroadcast unregister notification */
4993                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4994
4995                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4996                                      &dev->state)) {
4997                                 /* We must not have linkwatch events
4998                                  * pending on unregister. If this
4999                                  * happens, we simply run the queue
5000                                  * unscheduled, resulting in a noop
5001                                  * for this device.
5002                                  */
5003                                 linkwatch_run_queue();
5004                         }
5005
5006                         __rtnl_unlock();
5007
5008                         rebroadcast_time = jiffies;
5009                 }
5010
5011                 msleep(250);
5012
5013                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5014                         printk(KERN_EMERG "unregister_netdevice: "
5015                                "waiting for %s to become free. Usage "
5016                                "count = %d\n",
5017                                dev->name, atomic_read(&dev->refcnt));
5018                         warning_time = jiffies;
5019                 }
5020         }
5021 }
5022
5023 /* The sequence is:
5024  *
5025  *      rtnl_lock();
5026  *      ...
5027  *      register_netdevice(x1);
5028  *      register_netdevice(x2);
5029  *      ...
5030  *      unregister_netdevice(y1);
5031  *      unregister_netdevice(y2);
5032  *      ...
5033  *      rtnl_unlock();
5034  *      free_netdev(y1);
5035  *      free_netdev(y2);
5036  *
5037  * We are invoked by rtnl_unlock().
5038  * This allows us to deal with problems:
5039  * 1) We can delete sysfs objects which invoke hotplug
5040  *    without deadlocking with linkwatch via keventd.
5041  * 2) Since we run with the RTNL semaphore not held, we can sleep
5042  *    safely in order to wait for the netdev refcnt to drop to zero.
5043  *
5044  * We must not return until all unregister events added during
5045  * the interval the lock was held have been completed.
5046  */
5047 void netdev_run_todo(void)
5048 {
5049         struct list_head list;
5050
5051         /* Snapshot list, allow later requests */
5052         list_replace_init(&net_todo_list, &list);
5053
5054         __rtnl_unlock();
5055
5056         while (!list_empty(&list)) {
5057                 struct net_device *dev
5058                         = list_entry(list.next, struct net_device, todo_list);
5059                 list_del(&dev->todo_list);
5060
5061                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5062                         printk(KERN_ERR "network todo '%s' but state %d\n",
5063                                dev->name, dev->reg_state);
5064                         dump_stack();
5065                         continue;
5066                 }
5067
5068                 dev->reg_state = NETREG_UNREGISTERED;
5069
5070                 on_each_cpu(flush_backlog, dev, 1);
5071
5072                 netdev_wait_allrefs(dev);
5073
5074                 /* paranoia */
5075                 BUG_ON(atomic_read(&dev->refcnt));
5076                 WARN_ON(dev->ip_ptr);
5077                 WARN_ON(dev->ip6_ptr);
5078                 WARN_ON(dev->dn_ptr);
5079
5080                 if (dev->destructor)
5081                         dev->destructor(dev);
5082
5083                 /* Free network device */
5084                 kobject_put(&dev->dev.kobj);
5085         }
5086 }
5087
5088 /**
5089  *      dev_get_stats   - get network device statistics
5090  *      @dev: device to get statistics from
5091  *
5092  *      Get network statistics from device. The device driver may provide
5093  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5094  *      the internal statistics structure is used.
5095  */
5096 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5097 {
5098         const struct net_device_ops *ops = dev->netdev_ops;
5099
5100         if (ops->ndo_get_stats)
5101                 return ops->ndo_get_stats(dev);
5102         else {
5103                 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5104                 struct net_device_stats *stats = &dev->stats;
5105                 unsigned int i;
5106                 struct netdev_queue *txq;
5107
5108                 for (i = 0; i < dev->num_tx_queues; i++) {
5109                         txq = netdev_get_tx_queue(dev, i);
5110                         tx_bytes   += txq->tx_bytes;
5111                         tx_packets += txq->tx_packets;
5112                         tx_dropped += txq->tx_dropped;
5113                 }
5114                 if (tx_bytes || tx_packets || tx_dropped) {
5115                         stats->tx_bytes   = tx_bytes;
5116                         stats->tx_packets = tx_packets;
5117                         stats->tx_dropped = tx_dropped;
5118                 }
5119                 return stats;
5120         }
5121 }
5122 EXPORT_SYMBOL(dev_get_stats);
5123
5124 static void netdev_init_one_queue(struct net_device *dev,
5125                                   struct netdev_queue *queue,
5126                                   void *_unused)
5127 {
5128         queue->dev = dev;
5129 }
5130
5131 static void netdev_init_queues(struct net_device *dev)
5132 {
5133         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5134         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5135         spin_lock_init(&dev->tx_global_lock);
5136 }
5137
5138 /**
5139  *      alloc_netdev_mq - allocate network device
5140  *      @sizeof_priv:   size of private data to allocate space for
5141  *      @name:          device name format string
5142  *      @setup:         callback to initialize device
5143  *      @queue_count:   the number of subqueues to allocate
5144  *
5145  *      Allocates a struct net_device with private data area for driver use
5146  *      and performs basic initialization.  Also allocates subquue structs
5147  *      for each queue on the device at the end of the netdevice.
5148  */
5149 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5150                 void (*setup)(struct net_device *), unsigned int queue_count)
5151 {
5152         struct netdev_queue *tx;
5153         struct net_device *dev;
5154         size_t alloc_size;
5155         struct net_device *p;
5156
5157         BUG_ON(strlen(name) >= sizeof(dev->name));
5158
5159         alloc_size = sizeof(struct net_device);
5160         if (sizeof_priv) {
5161                 /* ensure 32-byte alignment of private area */
5162                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5163                 alloc_size += sizeof_priv;
5164         }
5165         /* ensure 32-byte alignment of whole construct */
5166         alloc_size += NETDEV_ALIGN - 1;
5167
5168         p = kzalloc(alloc_size, GFP_KERNEL);
5169         if (!p) {
5170                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5171                 return NULL;
5172         }
5173
5174         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5175         if (!tx) {
5176                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5177                        "tx qdiscs.\n");
5178                 goto free_p;
5179         }
5180
5181         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5182         dev->padded = (char *)dev - (char *)p;
5183
5184         if (dev_addr_init(dev))
5185                 goto free_tx;
5186
5187         dev_unicast_init(dev);
5188
5189         dev_net_set(dev, &init_net);
5190
5191         dev->_tx = tx;
5192         dev->num_tx_queues = queue_count;
5193         dev->real_num_tx_queues = queue_count;
5194
5195         dev->gso_max_size = GSO_MAX_SIZE;
5196
5197         netdev_init_queues(dev);
5198
5199         INIT_LIST_HEAD(&dev->napi_list);
5200         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5201         setup(dev);
5202         strcpy(dev->name, name);
5203         return dev;
5204
5205 free_tx:
5206         kfree(tx);
5207
5208 free_p:
5209         kfree(p);
5210         return NULL;
5211 }
5212 EXPORT_SYMBOL(alloc_netdev_mq);
5213
5214 /**
5215  *      free_netdev - free network device
5216  *      @dev: device
5217  *
5218  *      This function does the last stage of destroying an allocated device
5219  *      interface. The reference to the device object is released.
5220  *      If this is the last reference then it will be freed.
5221  */
5222 void free_netdev(struct net_device *dev)
5223 {
5224         struct napi_struct *p, *n;
5225
5226         release_net(dev_net(dev));
5227
5228         kfree(dev->_tx);
5229
5230         /* Flush device addresses */
5231         dev_addr_flush(dev);
5232
5233         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5234                 netif_napi_del(p);
5235
5236         /*  Compatibility with error handling in drivers */
5237         if (dev->reg_state == NETREG_UNINITIALIZED) {
5238                 kfree((char *)dev - dev->padded);
5239                 return;
5240         }
5241
5242         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5243         dev->reg_state = NETREG_RELEASED;
5244
5245         /* will free via device release */
5246         put_device(&dev->dev);
5247 }
5248 EXPORT_SYMBOL(free_netdev);
5249
/**
 *	synchronize_net - wait for in-flight packet receive processing
 *
 *	Blocks until an RCU grace period has elapsed (synchronize_rcu()), so
 *	receive processing that was already running has completed.
 *	Processing of packets that arrive afterwards is not held back.
 *	May sleep, hence the might_sleep() annotation.
 */
void synchronize_net(void)
{
	might_sleep();		/* flag callers in atomic context */
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5262
/**
 *	unregister_netdevice - take a device out of the kernel
 *	@dev: device to remove
 *
 *	Shuts the interface down and removes it from the kernel tables by
 *	rolling back its registration, then puts it on the todo list so the
 *	remainder of the unregister work runs once the lock is dropped.
 *
 *	The rtnl semaphore must be held by the caller (checked with
 *	ASSERT_RTNL()).  Most users want unregister_netdev(), which takes
 *	the semaphore itself.
 */
void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* Deferred: completion of the unregister happens after unlock. */
	net_set_todo(dev);
}
EXPORT_SYMBOL(unregister_netdevice);
5283
/**
 *	unregister_netdev - take a device out of the kernel, locking included
 *	@dev: device to remove
 *
 *	Shuts the interface down and removes it from the kernel tables.
 *
 *	Convenience wrapper: acquires the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  Prefer
 *	this over unregister_netdevice() unless the semaphore is already
 *	held.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5302
5303 /**
5304  *      dev_change_net_namespace - move device to different nethost namespace
5305  *      @dev: device
5306  *      @net: network namespace
5307  *      @pat: If not NULL name pattern to try if the current device name
5308  *            is already taken in the destination network namespace.
5309  *
5310  *      This function shuts down a device interface and moves it
5311  *      to a new network namespace. On success 0 is returned, on
5312  *      a failure a netagive errno code is returned.
5313  *
5314  *      Callers must hold the rtnl semaphore.
5315  */
5316
5317 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5318 {
5319         char buf[IFNAMSIZ];
5320         const char *destname;
5321         int err;
5322
5323         ASSERT_RTNL();
5324
5325         /* Don't allow namespace local devices to be moved. */
5326         err = -EINVAL;
5327         if (dev->features & NETIF_F_NETNS_LOCAL)
5328                 goto out;
5329
5330 #ifdef CONFIG_SYSFS
5331         /* Don't allow real devices to be moved when sysfs
5332          * is enabled.
5333          */
5334         err = -EINVAL;
5335         if (dev->dev.parent)
5336                 goto out;
5337 #endif
5338
5339         /* Ensure the device has been registrered */
5340         err = -EINVAL;
5341         if (dev->reg_state != NETREG_REGISTERED)
5342                 goto out;
5343
5344         /* Get out if there is nothing todo */
5345         err = 0;
5346         if (net_eq(dev_net(dev), net))
5347                 goto out;
5348
5349         /* Pick the destination device name, and ensure
5350          * we can use it in the destination network namespace.
5351          */
5352         err = -EEXIST;
5353         destname = dev->name;
5354         if (__dev_get_by_name(net, destname)) {
5355                 /* We get here if we can't use the current device name */
5356                 if (!pat)
5357                         goto out;
5358                 if (!dev_valid_name(pat))
5359                         goto out;
5360                 if (strchr(pat, '%')) {
5361                         if (__dev_alloc_name(net, pat, buf) < 0)
5362                                 goto out;
5363                         destname = buf;
5364                 } else
5365                         destname = pat;
5366                 if (__dev_get_by_name(net, destname))
5367                         goto out;
5368         }
5369
5370         /*
5371          * And now a mini version of register_netdevice unregister_netdevice.
5372          */
5373
5374         /* If device is running close it first. */
5375         dev_close(dev);
5376
5377         /* And unlink it from device chain */
5378         err = -ENODEV;
5379         unlist_netdevice(dev);
5380
5381         synchronize_net();
5382
5383         /* Shutdown queueing discipline. */
5384         dev_shutdown(dev);
5385
5386         /* Notify protocols, that we are about to destroy
5387            this device. They should clean all the things.
5388         */
5389         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5390
5391         /*
5392          *      Flush the unicast and multicast chains
5393          */
5394         dev_unicast_flush(dev);
5395         dev_addr_discard(dev);
5396
5397         netdev_unregister_kobject(dev);
5398
5399         /* Actually switch the network namespace */
5400         dev_net_set(dev, net);
5401
5402         /* Assign the new device name */
5403         if (destname != dev->name)
5404                 strcpy(dev->name, destname);
5405
5406         /* If there is an ifindex conflict assign a new one */
5407         if (__dev_get_by_index(net, dev->ifindex)) {
5408                 int iflink = (dev->iflink == dev->ifindex);
5409                 dev->ifindex = dev_new_index(net);
5410                 if (iflink)
5411                         dev->iflink = dev->ifindex;
5412         }
5413
5414         /* Fixup kobjects */
5415         err = netdev_register_kobject(dev);
5416         WARN_ON(err);
5417
5418         /* Add the device back in the hashes */
5419         list_netdevice(dev);
5420
5421         /* Notify protocols, that a new device appeared. */
5422         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5423
5424         /*
5425          *      Prevent userspace races by waiting until the network
5426          *      device is fully setup before sending notifications.
5427          */
5428         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5429
5430         synchronize_net();
5431         err = 0;
5432 out:
5433         return err;
5434 }
5435 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5436
5437 static int dev_cpu_callback(struct notifier_block *nfb,
5438                             unsigned long action,
5439                             void *ocpu)
5440 {
5441         struct sk_buff **list_skb;
5442         struct Qdisc **list_net;
5443         struct sk_buff *skb;
5444         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5445         struct softnet_data *sd, *oldsd;
5446
5447         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5448                 return NOTIFY_OK;
5449
5450         local_irq_disable();
5451         cpu = smp_processor_id();
5452         sd = &per_cpu(softnet_data, cpu);
5453         oldsd = &per_cpu(softnet_data, oldcpu);
5454
5455         /* Find end of our completion_queue. */
5456         list_skb = &sd->completion_queue;
5457         while (*list_skb)
5458                 list_skb = &(*list_skb)->next;
5459         /* Append completion queue from offline CPU. */
5460         *list_skb = oldsd->completion_queue;
5461         oldsd->completion_queue = NULL;
5462
5463         /* Find end of our output_queue. */
5464         list_net = &sd->output_queue;
5465         while (*list_net)
5466                 list_net = &(*list_net)->next_sched;
5467         /* Append output queue from offline CPU. */
5468         *list_net = oldsd->output_queue;
5469         oldsd->output_queue = NULL;
5470
5471         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5472         local_irq_enable();
5473
5474         /* Process offline CPU's input_pkt_queue */
5475         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5476                 netif_rx(skb);
5477
5478         return NOTIFY_OK;
5479 }
5480
5481
5482 /**
5483  *      netdev_increment_features - increment feature set by one
5484  *      @all: current feature set
5485  *      @one: new feature set
5486  *      @mask: mask feature set
5487  *
5488  *      Computes a new feature set after adding a device with feature set
5489  *      @one to the master device with current feature set @all.  Will not
5490  *      enable anything that is off in @mask. Returns the new feature set.
5491  */
5492 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5493                                         unsigned long mask)
5494 {
5495         /* If device needs checksumming, downgrade to it. */
5496         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5497                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5498         else if (mask & NETIF_F_ALL_CSUM) {
5499                 /* If one device supports v4/v6 checksumming, set for all. */
5500                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5501                     !(all & NETIF_F_GEN_CSUM)) {
5502                         all &= ~NETIF_F_ALL_CSUM;
5503                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5504                 }
5505
5506                 /* If one device supports hw checksumming, set for all. */
5507                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5508                         all &= ~NETIF_F_ALL_CSUM;
5509                         all |= NETIF_F_HW_CSUM;
5510                 }
5511         }
5512
5513         one |= NETIF_F_ALL_CSUM;
5514
5515         one |= all & NETIF_F_ONE_FOR_ALL;
5516         all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5517         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5518
5519         return all;
5520 }
5521 EXPORT_SYMBOL(netdev_increment_features);
5522
5523 static struct hlist_head *netdev_create_hash(void)
5524 {
5525         int i;
5526         struct hlist_head *hash;
5527
5528         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5529         if (hash != NULL)
5530                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5531                         INIT_HLIST_HEAD(&hash[i]);
5532
5533         return hash;
5534 }
5535
5536 /* Initialize per network namespace state */
5537 static int __net_init netdev_init(struct net *net)
5538 {
5539         INIT_LIST_HEAD(&net->dev_base_head);
5540
5541         net->dev_name_head = netdev_create_hash();
5542         if (net->dev_name_head == NULL)
5543                 goto err_name;
5544
5545         net->dev_index_head = netdev_create_hash();
5546         if (net->dev_index_head == NULL)
5547                 goto err_idx;
5548
5549         return 0;
5550
5551 err_idx:
5552         kfree(net->dev_name_head);
5553 err_name:
5554         return -ENOMEM;
5555 }
5556
5557 /**
5558  *      netdev_drivername - network driver for the device
5559  *      @dev: network device
5560  *      @buffer: buffer for resulting name
5561  *      @len: size of buffer
5562  *
5563  *      Determine network driver for device.
5564  */
5565 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5566 {
5567         const struct device_driver *driver;
5568         const struct device *parent;
5569
5570         if (len <= 0 || !buffer)
5571                 return buffer;
5572         buffer[0] = 0;
5573
5574         parent = dev->dev.parent;
5575
5576         if (!parent)
5577                 return buffer;
5578
5579         driver = parent->driver;
5580         if (driver && driver->name)
5581                 strlcpy(buffer, driver->name, len);
5582         return buffer;
5583 }
5584
5585 static void __net_exit netdev_exit(struct net *net)
5586 {
5587         kfree(net->dev_name_head);
5588         kfree(net->dev_index_head);
5589 }
5590
5591 static struct pernet_operations __net_initdata netdev_net_ops = {
5592         .init = netdev_init,
5593         .exit = netdev_exit,
5594 };
5595
5596 static void __net_exit default_device_exit(struct net *net)
5597 {
5598         struct net_device *dev;
5599         /*
5600          * Push all migratable of the network devices back to the
5601          * initial network namespace
5602          */
5603         rtnl_lock();
5604 restart:
5605         for_each_netdev(net, dev) {
5606                 int err;
5607                 char fb_name[IFNAMSIZ];
5608
5609                 /* Ignore unmoveable devices (i.e. loopback) */
5610                 if (dev->features & NETIF_F_NETNS_LOCAL)
5611                         continue;
5612
5613                 /* Delete virtual devices */
5614                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5615                         dev->rtnl_link_ops->dellink(dev);
5616                         goto restart;
5617                 }
5618
5619                 /* Push remaing network devices to init_net */
5620                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5621                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5622                 if (err) {
5623                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5624                                 __func__, dev->name, err);
5625                         BUG();
5626                 }
5627                 goto restart;
5628         }
5629         rtnl_unlock();
5630 }
5631
5632 static struct pernet_operations __net_initdata default_device_ops = {
5633         .exit = default_device_exit,
5634 };
5635
5636 /*
5637  *      Initialize the DEV module. At boot time this walks the device list and
5638  *      unhooks any devices that fail to initialise (normally hardware not
5639  *      present) and leaves us with a valid list of present and active devices.
5640  *
5641  */
5642
5643 /*
5644  *       This is called single threaded during boot, so no need
5645  *       to take the rtnl semaphore.
5646  */
5647 static int __init net_dev_init(void)
5648 {
5649         int i, rc = -ENOMEM;
5650
5651         BUG_ON(!dev_boot_phase);
5652
5653         if (dev_proc_init())
5654                 goto out;
5655
5656         if (netdev_kobject_init())
5657                 goto out;
5658
5659         INIT_LIST_HEAD(&ptype_all);
5660         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5661                 INIT_LIST_HEAD(&ptype_base[i]);
5662
5663         if (register_pernet_subsys(&netdev_net_ops))
5664                 goto out;
5665
5666         /*
5667          *      Initialise the packet receive queues.
5668          */
5669
5670         for_each_possible_cpu(i) {
5671                 struct softnet_data *queue;
5672
5673                 queue = &per_cpu(softnet_data, i);
5674                 skb_queue_head_init(&queue->input_pkt_queue);
5675                 queue->completion_queue = NULL;
5676                 INIT_LIST_HEAD(&queue->poll_list);
5677
5678                 queue->backlog.poll = process_backlog;
5679                 queue->backlog.weight = weight_p;
5680                 queue->backlog.gro_list = NULL;
5681                 queue->backlog.gro_count = 0;
5682         }
5683
5684         dev_boot_phase = 0;
5685
5686         /* The loopback device is special if any other network devices
5687          * is present in a network namespace the loopback device must
5688          * be present. Since we now dynamically allocate and free the
5689          * loopback device ensure this invariant is maintained by
5690          * keeping the loopback device as the first device on the
5691          * list of network devices.  Ensuring the loopback devices
5692          * is the first device that appears and the last network device
5693          * that disappears.
5694          */
5695         if (register_pernet_device(&loopback_net_ops))
5696                 goto out;
5697
5698         if (register_pernet_device(&default_device_ops))
5699                 goto out;
5700
5701         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5702         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5703
5704         hotcpu_notifier(dev_cpu_callback, 0);
5705         dst_init();
5706         dev_mcast_init();
5707         rc = 0;
5708 out:
5709         return rc;
5710 }
5711
5712 subsys_initcall(net_dev_init);
5713
5714 static int __init initialize_hashrnd(void)
5715 {
5716         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5717         return 0;
5718 }
5719
5720 late_initcall_sync(initialize_hashrnd);
5721