net: use a work queue to defer net_disable_timestamp() work
[firefly-linux-kernel-4.4.55.git] / net / core / dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/dst_metadata.h>
103 #include <net/pkt_sched.h>
104 #include <net/checksum.h>
105 #include <net/xfrm.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/module.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/iw_handler.h>
113 #include <asm/current.h>
114 #include <linux/audit.h>
115 #include <linux/dmaengine.h>
116 #include <linux/err.h>
117 #include <linux/ctype.h>
118 #include <linux/if_arp.h>
119 #include <linux/if_vlan.h>
120 #include <linux/ip.h>
121 #include <net/ip.h>
122 #include <net/mpls.h>
123 #include <linux/ipv6.h>
124 #include <linux/in.h>
125 #include <linux/jhash.h>
126 #include <linux/random.h>
127 #include <trace/events/napi.h>
128 #include <trace/events/net.h>
129 #include <trace/events/skb.h>
130 #include <linux/pci.h>
131 #include <linux/inetdevice.h>
132 #include <linux/cpu_rmap.h>
133 #include <linux/static_key.h>
134 #include <linux/hashtable.h>
135 #include <linux/vmalloc.h>
136 #include <linux/if_macvlan.h>
137 #include <linux/errqueue.h>
138 #include <linux/hrtimer.h>
139 #include <linux/netfilter_ingress.h>
140
141 #include "net-sysfs.h"
142
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148
149 static DEFINE_SPINLOCK(ptype_lock);
150 static DEFINE_SPINLOCK(offload_lock);
151 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
152 struct list_head ptype_all __read_mostly;       /* Taps */
153 static struct list_head offload_base __read_mostly;
154
155 static int netif_rx_internal(struct sk_buff *skb);
156 static int call_netdevice_notifiers_info(unsigned long val,
157                                          struct net_device *dev,
158                                          struct netdev_notifier_info *info);
159
160 /*
161  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
162  * semaphore.
163  *
164  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
165  *
166  * Writers must hold the rtnl semaphore while they loop through the
167  * dev_base_head list, and hold dev_base_lock for writing when they do the
168  * actual updates.  This allows pure readers to access the list even
169  * while a writer is preparing to update it.
170  *
171  * To put it another way, dev_base_lock is held for writing only to
172  * protect against pure readers; the rtnl semaphore provides the
173  * protection against other writers.
174  *
175  * See, for example usages, register_netdevice() and
176  * unregister_netdevice(), which must be called with the rtnl
177  * semaphore held.
178  */
179 DEFINE_RWLOCK(dev_base_lock);
180 EXPORT_SYMBOL(dev_base_lock);
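
/*
 * Example (illustrative sketch, not part of the original file): a pure
 * reader walking the device list under dev_base_lock, per the locking
 * rules described above.  rcu_read_lock() would work equally well here;
 * my_dump_devices is a hypothetical helper.
 */
static void my_dump_devices(struct net *net)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		pr_info("%-16s ifindex %d\n", dev->name, dev->ifindex);
	read_unlock(&dev_base_lock);
}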
181
182 /* protects napi_hash addition/deletion and napi_gen_id */
183 static DEFINE_SPINLOCK(napi_hash_lock);
184
185 static unsigned int napi_gen_id;
186 static DEFINE_HASHTABLE(napi_hash, 8);
187
188 static seqcount_t devnet_rename_seq;
189
190 static inline void dev_base_seq_inc(struct net *net)
191 {
192         while (++net->dev_base_seq == 0);
193 }
194
195 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
196 {
197         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
198
199         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
200 }
201
202 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
203 {
204         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
205 }
206
207 static inline void rps_lock(struct softnet_data *sd)
208 {
209 #ifdef CONFIG_RPS
210         spin_lock(&sd->input_pkt_queue.lock);
211 #endif
212 }
213
214 static inline void rps_unlock(struct softnet_data *sd)
215 {
216 #ifdef CONFIG_RPS
217         spin_unlock(&sd->input_pkt_queue.lock);
218 #endif
219 }
220
221 /* Device list insertion */
222 static void list_netdevice(struct net_device *dev)
223 {
224         struct net *net = dev_net(dev);
225
226         ASSERT_RTNL();
227
228         write_lock_bh(&dev_base_lock);
229         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
230         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
231         hlist_add_head_rcu(&dev->index_hlist,
232                            dev_index_hash(net, dev->ifindex));
233         write_unlock_bh(&dev_base_lock);
234
235         dev_base_seq_inc(net);
236 }
237
238 /* Device list removal
239  * caller must respect a RCU grace period before freeing/reusing dev
240  */
241 static void unlist_netdevice(struct net_device *dev)
242 {
243         ASSERT_RTNL();
244
245         /* Unlink dev from the device chain */
246         write_lock_bh(&dev_base_lock);
247         list_del_rcu(&dev->dev_list);
248         hlist_del_rcu(&dev->name_hlist);
249         hlist_del_rcu(&dev->index_hlist);
250         write_unlock_bh(&dev_base_lock);
251
252         dev_base_seq_inc(dev_net(dev));
253 }
254
255 /*
256  *      Our notifier list
257  */
258
259 static RAW_NOTIFIER_HEAD(netdev_chain);
260
261 /*
262  *      Device drivers call our routines to queue packets here. We empty the
263  *      queue in the local softnet handler.
264  */
265
266 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
267 EXPORT_PER_CPU_SYMBOL(softnet_data);
268
269 #ifdef CONFIG_LOCKDEP
270 /*
271  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
272  * according to dev->type
273  */
274 static const unsigned short netdev_lock_type[] =
275         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
276          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
277          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
278          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
279          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
280          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
281          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
282          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
283          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
284          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
285          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
286          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
287          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
288          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
289          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
290
291 static const char *const netdev_lock_name[] =
292         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
293          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
294          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
295          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
296          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
297          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
298          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
299          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
300          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
301          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
302          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
303          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
304          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
305          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
306          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
307
308 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
309 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
310
311 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
312 {
313         int i;
314
315         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
316                 if (netdev_lock_type[i] == dev_type)
317                         return i;
318         /* the last key is used by default */
319         return ARRAY_SIZE(netdev_lock_type) - 1;
320 }
321
322 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
323                                                  unsigned short dev_type)
324 {
325         int i;
326
327         i = netdev_lock_pos(dev_type);
328         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
329                                    netdev_lock_name[i]);
330 }
331
332 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
333 {
334         int i;
335
336         i = netdev_lock_pos(dev->type);
337         lockdep_set_class_and_name(&dev->addr_list_lock,
338                                    &netdev_addr_lock_key[i],
339                                    netdev_lock_name[i]);
340 }
341 #else
342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
343                                                  unsigned short dev_type)
344 {
345 }
346 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
347 {
348 }
349 #endif
350
351 /*******************************************************************************
352
353                 Protocol management and registration routines
354
355 *******************************************************************************/
356
357 /*
358  *      Add a protocol ID to the list. Now that the input handler is
359  *      smarter we can dispense with all the messy stuff that used to be
360  *      here.
361  *
362  *      BEWARE!!! Protocol handlers, mangling input packets,
363  *      MUST BE last in hash buckets and checking protocol handlers
364  *      MUST start from promiscuous ptype_all chain in net_bh.
365  *      It is true now, do not change it.
366  *      Explanation follows: if protocol handler, mangling packet, will
367  *      be the first on list, it is not able to sense, that packet
368  *      is cloned and should be copied-on-write, so that it will
369  *      change it and subsequent readers will get broken packet.
370  *                                                      --ANK (980803)
371  */
372
373 static inline struct list_head *ptype_head(const struct packet_type *pt)
374 {
375         if (pt->type == htons(ETH_P_ALL))
376                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
377         else
378                 return pt->dev ? &pt->dev->ptype_specific :
379                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
380 }
381
382 /**
383  *      dev_add_pack - add packet handler
384  *      @pt: packet type declaration
385  *
386  *      Add a protocol handler to the networking stack. The passed &packet_type
387  *      is linked into kernel lists and may not be freed until it has been
388  *      removed from the kernel lists.
389  *
390  *      This call does not sleep, therefore it cannot
391  *      guarantee that all CPUs that are in the middle of receiving packets
392  *      will see the new packet type (until the next received packet).
393  */
394
395 void dev_add_pack(struct packet_type *pt)
396 {
397         struct list_head *head = ptype_head(pt);
398
399         spin_lock(&ptype_lock);
400         list_add_rcu(&pt->list, head);
401         spin_unlock(&ptype_lock);
402 }
403 EXPORT_SYMBOL(dev_add_pack);
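
/*
 * Example (illustrative sketch, not part of the original file): a tap on
 * every received frame, in the style of af_packet.  The identifiers
 * my_tap_rcv and my_tap_pt are hypothetical.
 */
static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* The core hands us a reference; drop it when we are done. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_tap_pt __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* see ptype_head() above */
	.func	= my_tap_rcv,
	/* .dev may be set to tap a single device only */
};

/* dev_add_pack(&my_tap_pt) to start; dev_remove_pack(&my_tap_pt) to stop. */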
404
405 /**
406  *      __dev_remove_pack        - remove packet handler
407  *      @pt: packet type declaration
408  *
409  *      Remove a protocol handler that was previously added to the kernel
410  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
411  *      from the kernel lists and can be freed or reused once this function
412  *      returns.
413  *
414  *      The packet type might still be in use by receivers
415  *      and must not be freed until after all the CPUs have gone
416  *      through a quiescent state.
417  */
418 void __dev_remove_pack(struct packet_type *pt)
419 {
420         struct list_head *head = ptype_head(pt);
421         struct packet_type *pt1;
422
423         spin_lock(&ptype_lock);
424
425         list_for_each_entry(pt1, head, list) {
426                 if (pt == pt1) {
427                         list_del_rcu(&pt->list);
428                         goto out;
429                 }
430         }
431
432         pr_warn("dev_remove_pack: %p not found\n", pt);
433 out:
434         spin_unlock(&ptype_lock);
435 }
436 EXPORT_SYMBOL(__dev_remove_pack);
437
438 /**
439  *      dev_remove_pack  - remove packet handler
440  *      @pt: packet type declaration
441  *
442  *      Remove a protocol handler that was previously added to the kernel
443  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
444  *      from the kernel lists and can be freed or reused once this function
445  *      returns.
446  *
447  *      This call sleeps to guarantee that no CPU is looking at the packet
448  *      type after return.
449  */
450 void dev_remove_pack(struct packet_type *pt)
451 {
452         __dev_remove_pack(pt);
453
454         synchronize_net();
455 }
456 EXPORT_SYMBOL(dev_remove_pack);
457
458
459 /**
460  *      dev_add_offload - register offload handlers
461  *      @po: protocol offload declaration
462  *
463  *      Add protocol offload handlers to the networking stack. The passed
464  *      &proto_offload is linked into kernel lists and may not be freed until
465  *      it has been removed from the kernel lists.
466  *
467  *      This call does not sleep, therefore it cannot
468  *      guarantee that all CPUs that are in the middle of receiving packets
469  *      will see the new offload handlers (until the next received packet).
470  */
471 void dev_add_offload(struct packet_offload *po)
472 {
473         struct packet_offload *elem;
474
475         spin_lock(&offload_lock);
476         list_for_each_entry(elem, &offload_base, list) {
477                 if (po->priority < elem->priority)
478                         break;
479         }
480         list_add_rcu(&po->list, elem->list.prev);
481         spin_unlock(&offload_lock);
482 }
483 EXPORT_SYMBOL(dev_add_offload);
484
485 /**
486  *      __dev_remove_offload     - remove offload handler
487  *      @po: packet offload declaration
488  *
489  *      Remove a protocol offload handler that was previously added to the
490  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
491  *      is removed from the kernel lists and can be freed or reused once this
492  *      function returns.
493  *
494  *      The packet type might still be in use by receivers
495  *      and must not be freed until after all the CPUs have gone
496  *      through a quiescent state.
497  */
498 static void __dev_remove_offload(struct packet_offload *po)
499 {
500         struct list_head *head = &offload_base;
501         struct packet_offload *po1;
502
503         spin_lock(&offload_lock);
504
505         list_for_each_entry(po1, head, list) {
506                 if (po == po1) {
507                         list_del_rcu(&po->list);
508                         goto out;
509                 }
510         }
511
512         pr_warn("dev_remove_offload: %p not found\n", po);
513 out:
514         spin_unlock(&offload_lock);
515 }
516
517 /**
518  *      dev_remove_offload       - remove packet offload handler
519  *      @po: packet offload declaration
520  *
521  *      Remove a packet offload handler that was previously added to the kernel
522  *      offload handlers by dev_add_offload(). The passed &offload_type is
523  *      removed from the kernel lists and can be freed or reused once this
524  *      function returns.
525  *
526  *      This call sleeps to guarantee that no CPU is looking at the packet
527  *      type after return.
528  */
529 void dev_remove_offload(struct packet_offload *po)
530 {
531         __dev_remove_offload(po);
532
533         synchronize_net();
534 }
535 EXPORT_SYMBOL(dev_remove_offload);
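
/*
 * Example (illustrative sketch, assumptions marked): registering a
 * packet_offload with a stub GSO callback.  my_gso_segment, my_offload
 * and the ETH_P_IP choice are hypothetical; a real user would also
 * provide GRO callbacks.
 */
static struct sk_buff *my_gso_segment(struct sk_buff *skb,
				      netdev_features_t features)
{
	return ERR_PTR(-EPROTONOSUPPORT);	/* stub: no segmentation */
}

static struct packet_offload my_offload __read_mostly = {
	.type		= cpu_to_be16(ETH_P_IP),
	.priority	= 10,	/* lower value sorts earlier in offload_base */
	.callbacks	= {
		.gso_segment	= my_gso_segment,
	},
};

/* dev_add_offload(&my_offload); later dev_remove_offload(&my_offload); */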
536
537 /******************************************************************************
538
539                       Device Boot-time Settings Routines
540
541 *******************************************************************************/
542
543 /* Boot time configuration table */
544 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
545
546 /**
547  *      netdev_boot_setup_add   - add new setup entry
548  *      @name: name of the device
549  *      @map: configured settings for the device
550  *
551  *      Adds a new setup entry to the dev_boot_setup list.  The function
552  *      returns 0 on error and 1 on success.  This is a generic routine for
553  *      all netdevices.
554  */
555 static int netdev_boot_setup_add(char *name, struct ifmap *map)
556 {
557         struct netdev_boot_setup *s;
558         int i;
559
560         s = dev_boot_setup;
561         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
562                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
563                         memset(s[i].name, 0, sizeof(s[i].name));
564                         strlcpy(s[i].name, name, IFNAMSIZ);
565                         memcpy(&s[i].map, map, sizeof(s[i].map));
566                         break;
567                 }
568         }
569
570         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
571 }
572
573 /**
574  *      netdev_boot_setup_check - check boot time settings
575  *      @dev: the netdevice
576  *
577  *      Check the boot time settings for the device.
578  *      Any settings found are applied to the device so they can be
579  *      used later during device probing.
580  *      Returns 0 if no settings are found, 1 if they are.
581  */
582 int netdev_boot_setup_check(struct net_device *dev)
583 {
584         struct netdev_boot_setup *s = dev_boot_setup;
585         int i;
586
587         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
588                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
589                     !strcmp(dev->name, s[i].name)) {
590                         dev->irq        = s[i].map.irq;
591                         dev->base_addr  = s[i].map.base_addr;
592                         dev->mem_start  = s[i].map.mem_start;
593                         dev->mem_end    = s[i].map.mem_end;
594                         return 1;
595                 }
596         }
597         return 0;
598 }
599 EXPORT_SYMBOL(netdev_boot_setup_check);
600
601
602 /**
603  *      netdev_boot_base        - get address from boot time settings
604  *      @prefix: prefix for network device
605  *      @unit: id for network device
606  *
607  *      Check the boot time settings for the base address of the device.
608  *      Returns 1 if the device is already registered (so it should not
609  *      be probed), the configured base address if a matching entry is
610  *      found, and 0 if no settings are found.
611  */
612 unsigned long netdev_boot_base(const char *prefix, int unit)
613 {
614         const struct netdev_boot_setup *s = dev_boot_setup;
615         char name[IFNAMSIZ];
616         int i;
617
618         sprintf(name, "%s%d", prefix, unit);
619
620         /*
621          * If device already registered then return base of 1
622          * to indicate not to probe for this interface
623          */
624         if (__dev_get_by_name(&init_net, name))
625                 return 1;
626
627         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
628                 if (!strcmp(name, s[i].name))
629                         return s[i].map.base_addr;
630         return 0;
631 }
632
633 /*
634  * Saves at boot time configured settings for any netdevice.
635  */
636 int __init netdev_boot_setup(char *str)
637 {
638         int ints[5];
639         struct ifmap map;
640
641         str = get_options(str, ARRAY_SIZE(ints), ints);
642         if (!str || !*str)
643                 return 0;
644
645         /* Save settings */
646         memset(&map, 0, sizeof(map));
647         if (ints[0] > 0)
648                 map.irq = ints[1];
649         if (ints[0] > 1)
650                 map.base_addr = ints[2];
651         if (ints[0] > 2)
652                 map.mem_start = ints[3];
653         if (ints[0] > 3)
654                 map.mem_end = ints[4];
655
656         /* Add new entry to the list */
657         return netdev_boot_setup_add(str, &map);
658 }
659
660 __setup("netdev=", netdev_boot_setup);
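
/*
 * Example (illustrative, matching the parsing in netdev_boot_setup()
 * above): a kernel command line entry of the form
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records irq 9, I/O base 0x300, no shared memory range, and the
 * interface name "eth0" for later use by netdev_boot_setup_check().
 */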
661
662 /*******************************************************************************
663
664                             Device Interface Subroutines
665
666 *******************************************************************************/
667
668 /**
669  *      dev_get_iflink  - get 'iflink' value of an interface
670  *      @dev: targeted interface
671  *
672  *      Indicates the ifindex the interface is linked to.
673  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
674  */
675
676 int dev_get_iflink(const struct net_device *dev)
677 {
678         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
679                 return dev->netdev_ops->ndo_get_iflink(dev);
680
681         return dev->ifindex;
682 }
683 EXPORT_SYMBOL(dev_get_iflink);
684
685 /**
686  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
687  *      @dev: targeted interface
688  *      @skb: The packet.
689  *
690  *      For better visibility of tunnel traffic, OVS needs to retrieve
691  *      the egress tunnel information for a packet. The following API
692  *      allows the caller to get this info.
693  */
694 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
695 {
696         struct ip_tunnel_info *info;
697
698         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
699                 return -EINVAL;
700
701         info = skb_tunnel_info_unclone(skb);
702         if (!info)
703                 return -ENOMEM;
704         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
705                 return -EINVAL;
706
707         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
708 }
709 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
710
711 /**
712  *      __dev_get_by_name       - find a device by its name
713  *      @net: the applicable net namespace
714  *      @name: name to find
715  *
716  *      Find an interface by name. Must be called under RTNL semaphore
717  *      or @dev_base_lock. If the name is found a pointer to the device
718  *      is returned. If the name is not found then %NULL is returned. The
719  *      reference counters are not incremented so the caller must be
720  *      careful with locks.
721  */
722
723 struct net_device *__dev_get_by_name(struct net *net, const char *name)
724 {
725         struct net_device *dev;
726         struct hlist_head *head = dev_name_hash(net, name);
727
728         hlist_for_each_entry(dev, head, name_hlist)
729                 if (!strncmp(dev->name, name, IFNAMSIZ))
730                         return dev;
731
732         return NULL;
733 }
734 EXPORT_SYMBOL(__dev_get_by_name);
735
736 /**
737  *      dev_get_by_name_rcu     - find a device by its name
738  *      @net: the applicable net namespace
739  *      @name: name to find
740  *
741  *      Find an interface by name.
742  *      If the name is found a pointer to the device is returned.
743  *      If the name is not found then %NULL is returned.
744  *      The reference counters are not incremented so the caller must be
745  *      careful with locks. The caller must hold RCU lock.
746  */
747
748 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
749 {
750         struct net_device *dev;
751         struct hlist_head *head = dev_name_hash(net, name);
752
753         hlist_for_each_entry_rcu(dev, head, name_hlist)
754                 if (!strncmp(dev->name, name, IFNAMSIZ))
755                         return dev;
756
757         return NULL;
758 }
759 EXPORT_SYMBOL(dev_get_by_name_rcu);
760
761 /**
762  *      dev_get_by_name         - find a device by its name
763  *      @net: the applicable net namespace
764  *      @name: name to find
765  *
766  *      Find an interface by name. This can be called from any
767  *      context and does its own locking. The returned handle has
768  *      the usage count incremented and the caller must use dev_put() to
769  *      release it when it is no longer needed. %NULL is returned if no
770  *      matching device is found.
771  */
772
773 struct net_device *dev_get_by_name(struct net *net, const char *name)
774 {
775         struct net_device *dev;
776
777         rcu_read_lock();
778         dev = dev_get_by_name_rcu(net, name);
779         if (dev)
780                 dev_hold(dev);
781         rcu_read_unlock();
782         return dev;
783 }
784 EXPORT_SYMBOL(dev_get_by_name);
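
/*
 * Example (illustrative sketch, not part of the original file): looking a
 * device up by name from process context.  dev_get_by_name() takes a
 * reference that must be dropped with dev_put(); "eth0" and my_show_mtu
 * are hypothetical.
 */
static int my_show_mtu(struct net *net)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, "eth0");
	if (!dev)
		return -ENODEV;
	pr_info("%s: mtu %u\n", dev->name, dev->mtu);
	dev_put(dev);		/* release the reference taken above */
	return 0;
}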
785
786 /**
787  *      __dev_get_by_index - find a device by its ifindex
788  *      @net: the applicable net namespace
789  *      @ifindex: index of device
790  *
791  *      Search for an interface by index. Returns a pointer to the device,
792  *      or %NULL if the device is not found. The device has not
793  *      had its reference counter increased so the caller must be careful
794  *      about locking. The caller must hold either the RTNL semaphore
795  *      or @dev_base_lock.
796  */
797
798 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
799 {
800         struct net_device *dev;
801         struct hlist_head *head = dev_index_hash(net, ifindex);
802
803         hlist_for_each_entry(dev, head, index_hlist)
804                 if (dev->ifindex == ifindex)
805                         return dev;
806
807         return NULL;
808 }
809 EXPORT_SYMBOL(__dev_get_by_index);
810
811 /**
812  *      dev_get_by_index_rcu - find a device by its ifindex
813  *      @net: the applicable net namespace
814  *      @ifindex: index of device
815  *
816  *      Search for an interface by index. Returns a pointer to the device,
817  *      or %NULL if the device is not found. The device has not
818  *      had its reference counter increased so the caller must be careful
819  *      about locking. The caller must hold RCU lock.
820  */
821
822 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
823 {
824         struct net_device *dev;
825         struct hlist_head *head = dev_index_hash(net, ifindex);
826
827         hlist_for_each_entry_rcu(dev, head, index_hlist)
828                 if (dev->ifindex == ifindex)
829                         return dev;
830
831         return NULL;
832 }
833 EXPORT_SYMBOL(dev_get_by_index_rcu);
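
/*
 * Example (illustrative sketch, not part of the original file): resolving
 * an ifindex to a name under RCU.  No reference is taken, so the name is
 * copied while still inside the read-side critical section; buf is assumed
 * to hold at least IFNAMSIZ bytes and my_ifindex_to_name is hypothetical.
 */
static int my_ifindex_to_name(struct net *net, int ifindex, char *buf)
{
	struct net_device *dev;
	int err = 0;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		strlcpy(buf, dev->name, IFNAMSIZ);
	else
		err = -ENODEV;
	rcu_read_unlock();
	return err;
}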
834
835
836 /**
837  *      dev_get_by_index - find a device by its ifindex
838  *      @net: the applicable net namespace
839  *      @ifindex: index of device
840  *
841  *      Search for an interface by index. Returns a pointer to the device,
842  *      or NULL if the device is not found. The device returned has
843  *      had a reference added and the pointer is safe until the user calls
844  *      dev_put to indicate they have finished with it.
845  */
846
847 struct net_device *dev_get_by_index(struct net *net, int ifindex)
848 {
849         struct net_device *dev;
850
851         rcu_read_lock();
852         dev = dev_get_by_index_rcu(net, ifindex);
853         if (dev)
854                 dev_hold(dev);
855         rcu_read_unlock();
856         return dev;
857 }
858 EXPORT_SYMBOL(dev_get_by_index);
859
860 /**
861  *      netdev_get_name - get a netdevice name, knowing its ifindex.
862  *      @net: network namespace
863  *      @name: a pointer to the buffer where the name will be stored.
864  *      @ifindex: the ifindex of the interface to get the name from.
865  *
866  *      The use of raw_seqcount_begin() and cond_resched() before
867  *      retrying is required as we want to give the writers a chance
868  *      to complete when CONFIG_PREEMPT is not set.
869  */
870 int netdev_get_name(struct net *net, char *name, int ifindex)
871 {
872         struct net_device *dev;
873         unsigned int seq;
874
875 retry:
876         seq = raw_seqcount_begin(&devnet_rename_seq);
877         rcu_read_lock();
878         dev = dev_get_by_index_rcu(net, ifindex);
879         if (!dev) {
880                 rcu_read_unlock();
881                 return -ENODEV;
882         }
883
884         strcpy(name, dev->name);
885         rcu_read_unlock();
886         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
887                 cond_resched();
888                 goto retry;
889         }
890
891         return 0;
892 }
893
894 /**
895  *      dev_getbyhwaddr_rcu - find a device by its hardware address
896  *      @net: the applicable net namespace
897  *      @type: media type of device
898  *      @ha: hardware address
899  *
900  *      Search for an interface by MAC address. Returns a pointer to the
901  *      device, or NULL if the device is not found.
902  *      The caller must hold RCU or RTNL.
903  *      The returned device has not had its ref count increased
904  *      and the caller must therefore be careful about locking
905  *
906  */
907
908 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
909                                        const char *ha)
910 {
911         struct net_device *dev;
912
913         for_each_netdev_rcu(net, dev)
914                 if (dev->type == type &&
915                     !memcmp(dev->dev_addr, ha, dev->addr_len))
916                         return dev;
917
918         return NULL;
919 }
920 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
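
/*
 * Example (illustrative sketch, not part of the original file): checking
 * whether a MAC address is already in use by an Ethernet device, under
 * rcu_read_lock() as required above.  my_mac_in_use is hypothetical.
 */
static bool my_mac_in_use(struct net *net, const char *mac)
{
	bool in_use;

	rcu_read_lock();
	in_use = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac) != NULL;
	rcu_read_unlock();
	return in_use;
}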
921
922 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
923 {
924         struct net_device *dev;
925
926         ASSERT_RTNL();
927         for_each_netdev(net, dev)
928                 if (dev->type == type)
929                         return dev;
930
931         return NULL;
932 }
933 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
934
935 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
936 {
937         struct net_device *dev, *ret = NULL;
938
939         rcu_read_lock();
940         for_each_netdev_rcu(net, dev)
941                 if (dev->type == type) {
942                         dev_hold(dev);
943                         ret = dev;
944                         break;
945                 }
946         rcu_read_unlock();
947         return ret;
948 }
949 EXPORT_SYMBOL(dev_getfirstbyhwtype);
950
951 /**
952  *      __dev_get_by_flags - find any device with given flags
953  *      @net: the applicable net namespace
954  *      @if_flags: IFF_* values
955  *      @mask: bitmask of bits in if_flags to check
956  *
957  *      Search for any interface with the given flags. Returns a pointer to
958  *      the first matching device, or NULL if none is found. Must be called
959  *      inside rtnl_lock(), and the result's refcount is unchanged.
960  */
961
962 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
963                                       unsigned short mask)
964 {
965         struct net_device *dev, *ret;
966
967         ASSERT_RTNL();
968
969         ret = NULL;
970         for_each_netdev(net, dev) {
971                 if (((dev->flags ^ if_flags) & mask) == 0) {
972                         ret = dev;
973                         break;
974                 }
975         }
976         return ret;
977 }
978 EXPORT_SYMBOL(__dev_get_by_flags);
979
980 /**
981  *      dev_valid_name - check if name is okay for network device
982  *      @name: name string
983  *
984  *      Network device names need to be valid file names
985  *      to allow sysfs to work.  We also disallow any kind of
986  *      whitespace.
987  */
988 bool dev_valid_name(const char *name)
989 {
990         if (*name == '\0')
991                 return false;
992         if (strlen(name) >= IFNAMSIZ)
993                 return false;
994         if (!strcmp(name, ".") || !strcmp(name, ".."))
995                 return false;
996
997         while (*name) {
998                 if (*name == '/' || *name == ':' || isspace(*name))
999                         return false;
1000                 name++;
1001         }
1002         return true;
1003 }
1004 EXPORT_SYMBOL(dev_valid_name);
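
/*
 * For illustration: dev_valid_name("eth0") and dev_valid_name("br-lan")
 * return true; "", ".", "..", "a b", "x/y", "a:b" and any name of
 * IFNAMSIZ characters or more return false.
 */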
1005
1006 /**
1007  *      __dev_alloc_name - allocate a name for a device
1008  *      @net: network namespace to allocate the device name in
1009  *      @name: name format string
1010  *      @buf:  scratch buffer and result name string
1011  *
1012  *      Passed a format string - eg "lt%d" it will try and find a suitable
1013  *      id. It scans list of devices to build up a free map, then chooses
1014  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1015  *      while allocating the name and adding the device in order to avoid
1016  *      duplicates.
1017  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1018  *      Returns the number of the unit assigned or a negative errno code.
1019  */
1020
1021 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1022 {
1023         int i = 0;
1024         const char *p;
1025         const int max_netdevices = 8*PAGE_SIZE;
1026         unsigned long *inuse;
1027         struct net_device *d;
1028
1029         p = strnchr(name, IFNAMSIZ-1, '%');
1030         if (p) {
1031                 /*
1032                  * Verify the string as this thing may have come from
1033                  * the user.  There must be either one "%d" and no other "%"
1034                  * characters.
1035                  */
1036                 if (p[1] != 'd' || strchr(p + 2, '%'))
1037                         return -EINVAL;
1038
1039                 /* Use one page as a bit array of possible slots */
1040                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1041                 if (!inuse)
1042                         return -ENOMEM;
1043
1044                 for_each_netdev(net, d) {
1045                         if (!sscanf(d->name, name, &i))
1046                                 continue;
1047                         if (i < 0 || i >= max_netdevices)
1048                                 continue;
1049
1050                         /*  avoid cases where sscanf is not exact inverse of printf */
1051                         snprintf(buf, IFNAMSIZ, name, i);
1052                         if (!strncmp(buf, d->name, IFNAMSIZ))
1053                                 set_bit(i, inuse);
1054                 }
1055
1056                 i = find_first_zero_bit(inuse, max_netdevices);
1057                 free_page((unsigned long) inuse);
1058         }
1059
1060         if (buf != name)
1061                 snprintf(buf, IFNAMSIZ, name, i);
1062         if (!__dev_get_by_name(net, buf))
1063                 return i;
1064
1065         /* It is possible to run out of possible slots
1066          * when the name is long and there isn't enough space left
1067          * for the digits, or if all bits are used.
1068          */
1069         return -ENFILE;
1070 }
1071
1072 /**
1073  *      dev_alloc_name - allocate a name for a device
1074  *      @dev: device
1075  *      @name: name format string
1076  *
1077  *      Passed a format string - eg "lt%d" it will try and find a suitable
1078  *      id. It scans list of devices to build up a free map, then chooses
1079  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1080  *      while allocating the name and adding the device in order to avoid
1081  *      duplicates.
1082  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1083  *      Returns the number of the unit assigned or a negative errno code.
1084  */
1085
1086 int dev_alloc_name(struct net_device *dev, const char *name)
1087 {
1088         char buf[IFNAMSIZ];
1089         struct net *net;
1090         int ret;
1091
1092         BUG_ON(!dev_net(dev));
1093         net = dev_net(dev);
1094         ret = __dev_alloc_name(net, name, buf);
1095         if (ret >= 0)
1096                 strlcpy(dev->name, buf, IFNAMSIZ);
1097         return ret;
1098 }
1099 EXPORT_SYMBOL(dev_alloc_name);
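
/*
 * Example (illustrative sketch, not part of the original file): letting
 * the core pick the next free "dummy%d" slot before register_netdevice().
 * The pattern and helper name are hypothetical; per the comment above this
 * must run with RTNL (or the dev_base lock) held.
 */
static int my_pick_name(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();
	unit = dev_alloc_name(dev, "dummy%d");
	if (unit < 0)
		return unit;		/* e.g. -EINVAL or -ENFILE */
	pr_info("assigned %s (unit %d)\n", dev->name, unit);
	return 0;
}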
1100
1101 static int dev_alloc_name_ns(struct net *net,
1102                              struct net_device *dev,
1103                              const char *name)
1104 {
1105         char buf[IFNAMSIZ];
1106         int ret;
1107
1108         ret = __dev_alloc_name(net, name, buf);
1109         if (ret >= 0)
1110                 strlcpy(dev->name, buf, IFNAMSIZ);
1111         return ret;
1112 }
1113
1114 static int dev_get_valid_name(struct net *net,
1115                               struct net_device *dev,
1116                               const char *name)
1117 {
1118         BUG_ON(!net);
1119
1120         if (!dev_valid_name(name))
1121                 return -EINVAL;
1122
1123         if (strchr(name, '%'))
1124                 return dev_alloc_name_ns(net, dev, name);
1125         else if (__dev_get_by_name(net, name))
1126                 return -EEXIST;
1127         else if (dev->name != name)
1128                 strlcpy(dev->name, name, IFNAMSIZ);
1129
1130         return 0;
1131 }
1132
1133 /**
1134  *      dev_change_name - change name of a device
1135  *      @dev: device
1136  *      @newname: name (or format string) must be at least IFNAMSIZ
1137  *
1138  *      Change name of a device, can pass format strings "eth%d".
1139  *      Change the name of a device; a format string such as "eth%d"
1140  *      can be passed for wildcarding.
1141 int dev_change_name(struct net_device *dev, const char *newname)
1142 {
1143         unsigned char old_assign_type;
1144         char oldname[IFNAMSIZ];
1145         int err = 0;
1146         int ret;
1147         struct net *net;
1148
1149         ASSERT_RTNL();
1150         BUG_ON(!dev_net(dev));
1151
1152         net = dev_net(dev);
1153         if (dev->flags & IFF_UP)
1154                 return -EBUSY;
1155
1156         write_seqcount_begin(&devnet_rename_seq);
1157
1158         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1159                 write_seqcount_end(&devnet_rename_seq);
1160                 return 0;
1161         }
1162
1163         memcpy(oldname, dev->name, IFNAMSIZ);
1164
1165         err = dev_get_valid_name(net, dev, newname);
1166         if (err < 0) {
1167                 write_seqcount_end(&devnet_rename_seq);
1168                 return err;
1169         }
1170
1171         if (oldname[0] && !strchr(oldname, '%'))
1172                 netdev_info(dev, "renamed from %s\n", oldname);
1173
1174         old_assign_type = dev->name_assign_type;
1175         dev->name_assign_type = NET_NAME_RENAMED;
1176
1177 rollback:
1178         ret = device_rename(&dev->dev, dev->name);
1179         if (ret) {
1180                 memcpy(dev->name, oldname, IFNAMSIZ);
1181                 dev->name_assign_type = old_assign_type;
1182                 write_seqcount_end(&devnet_rename_seq);
1183                 return ret;
1184         }
1185
1186         write_seqcount_end(&devnet_rename_seq);
1187
1188         netdev_adjacent_rename_links(dev, oldname);
1189
1190         write_lock_bh(&dev_base_lock);
1191         hlist_del_rcu(&dev->name_hlist);
1192         write_unlock_bh(&dev_base_lock);
1193
1194         synchronize_rcu();
1195
1196         write_lock_bh(&dev_base_lock);
1197         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1198         write_unlock_bh(&dev_base_lock);
1199
1200         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1201         ret = notifier_to_errno(ret);
1202
1203         if (ret) {
1204                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1205                 if (err >= 0) {
1206                         err = ret;
1207                         write_seqcount_begin(&devnet_rename_seq);
1208                         memcpy(dev->name, oldname, IFNAMSIZ);
1209                         memcpy(oldname, newname, IFNAMSIZ);
1210                         dev->name_assign_type = old_assign_type;
1211                         old_assign_type = NET_NAME_RENAMED;
1212                         goto rollback;
1213                 } else {
1214                         pr_err("%s: name change rollback failed: %d\n",
1215                                dev->name, ret);
1216                 }
1217         }
1218
1219         return err;
1220 }
1221
1222 /**
1223  *      dev_set_alias - change ifalias of a device
1224  *      @dev: device
1225  *      @alias: name up to IFALIASZ
1226  *      @len: limit of bytes to copy from @alias
1227  *
1228  *      Set the ifalias for a device.
1229  */
1230 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1231 {
1232         char *new_ifalias;
1233
1234         ASSERT_RTNL();
1235
1236         if (len >= IFALIASZ)
1237                 return -EINVAL;
1238
1239         if (!len) {
1240                 kfree(dev->ifalias);
1241                 dev->ifalias = NULL;
1242                 return 0;
1243         }
1244
1245         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1246         if (!new_ifalias)
1247                 return -ENOMEM;
1248         dev->ifalias = new_ifalias;
1249
1250         strlcpy(dev->ifalias, alias, len+1);
1251         return len;
1252 }
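
/*
 * Example (illustrative sketch, not part of the original file): attaching
 * a human-readable label to a device from process context.  dev_set_alias()
 * requires RTNL and returns the stored length on success; the label text
 * and my_label_device are hypothetical.
 */
static int my_label_device(struct net_device *dev)
{
	static const char label[] = "uplink to core switch";
	int ret;

	rtnl_lock();
	ret = dev_set_alias(dev, label, strlen(label));
	rtnl_unlock();
	return ret < 0 ? ret : 0;
}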
1253
1254
1255 /**
1256  *      netdev_features_change - device changes features
1257  *      @dev: device to cause notification
1258  *
1259  *      Called to indicate a device has changed features.
1260  */
1261 void netdev_features_change(struct net_device *dev)
1262 {
1263         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1264 }
1265 EXPORT_SYMBOL(netdev_features_change);
1266
1267 /**
1268  *      netdev_state_change - device changes state
1269  *      @dev: device to cause notification
1270  *
1271  *      Called to indicate a device has changed state. This function calls
1272  *      the notifier chains for netdev_chain and sends a NEWLINK message
1273  *      to the routing socket.
1274  */
1275 void netdev_state_change(struct net_device *dev)
1276 {
1277         if (dev->flags & IFF_UP) {
1278                 struct netdev_notifier_change_info change_info;
1279
1280                 change_info.flags_changed = 0;
1281                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1282                                               &change_info.info);
1283                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1284         }
1285 }
1286 EXPORT_SYMBOL(netdev_state_change);
1287
1288 /**
1289  *      netdev_notify_peers - notify network peers about existence of @dev
1290  *      @dev: network device
1291  *
1292  * Generate traffic such that interested network peers are aware of
1293  * @dev, such as by generating a gratuitous ARP. This may be used when
1294  * a device wants to inform the rest of the network about some sort of
1295  * reconfiguration such as a failover event or virtual machine
1296  * migration.
1297  */
1298 void netdev_notify_peers(struct net_device *dev)
1299 {
1300         rtnl_lock();
1301         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1302         rtnl_unlock();
1303 }
1304 EXPORT_SYMBOL(netdev_notify_peers);
1305
1306 static int __dev_open(struct net_device *dev)
1307 {
1308         const struct net_device_ops *ops = dev->netdev_ops;
1309         int ret;
1310
1311         ASSERT_RTNL();
1312
1313         if (!netif_device_present(dev))
1314                 return -ENODEV;
1315
1316         /* Block netpoll from trying to do any rx path servicing.
1317          * If we don't do this there is a chance ndo_poll_controller
1318          * or ndo_poll may be running while we open the device
1319          */
1320         netpoll_poll_disable(dev);
1321
1322         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1323         ret = notifier_to_errno(ret);
1324         if (ret)
1325                 return ret;
1326
1327         set_bit(__LINK_STATE_START, &dev->state);
1328
1329         if (ops->ndo_validate_addr)
1330                 ret = ops->ndo_validate_addr(dev);
1331
1332         if (!ret && ops->ndo_open)
1333                 ret = ops->ndo_open(dev);
1334
1335         netpoll_poll_enable(dev);
1336
1337         if (ret)
1338                 clear_bit(__LINK_STATE_START, &dev->state);
1339         else {
1340                 dev->flags |= IFF_UP;
1341                 dev_set_rx_mode(dev);
1342                 dev_activate(dev);
1343                 add_device_randomness(dev->dev_addr, dev->addr_len);
1344         }
1345
1346         return ret;
1347 }
1348
1349 /**
1350  *      dev_open        - prepare an interface for use.
1351  *      @dev:   device to open
1352  *
1353  *      Takes a device from down to up state. The device's private open
1354  *      function is invoked and then the multicast lists are loaded. Finally
1355  *      the device is moved into the up state and a %NETDEV_UP message is
1356  *      sent to the netdev notifier chain.
1357  *
1358  *      Calling this function on an active interface is a nop. On a failure
1359  *      a negative errno code is returned.
1360  */
1361 int dev_open(struct net_device *dev)
1362 {
1363         int ret;
1364
1365         if (dev->flags & IFF_UP)
1366                 return 0;
1367
1368         ret = __dev_open(dev);
1369         if (ret < 0)
1370                 return ret;
1371
1372         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1373         call_netdevice_notifiers(NETDEV_UP, dev);
1374
1375         return ret;
1376 }
1377 EXPORT_SYMBOL(dev_open);
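
/*
 * Example (illustrative sketch, not part of the original file): bringing
 * an interface up by name from kernel code.  dev_open() must be called
 * under RTNL, so the lookup uses __dev_get_by_name(), which needs no extra
 * reference while RTNL is held.  my_bring_up is a hypothetical helper.
 */
static int my_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev)
		err = dev_open(dev);
	rtnl_unlock();
	return err;
}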
1378
1379 static int __dev_close_many(struct list_head *head)
1380 {
1381         struct net_device *dev;
1382
1383         ASSERT_RTNL();
1384         might_sleep();
1385
1386         list_for_each_entry(dev, head, close_list) {
1387                 /* Temporarily disable netpoll until the interface is down */
1388                 netpoll_poll_disable(dev);
1389
1390                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1391
1392                 clear_bit(__LINK_STATE_START, &dev->state);
1393
1394                 /* Synchronize to scheduled poll. We cannot touch the poll list, it
1395                  * can even be on a different cpu. So just clear netif_running().
1396                  *
1397                  * dev->stop() will invoke napi_disable() on all of its
1398                  * napi_struct instances on this device.
1399                  */
1400                 smp_mb__after_atomic(); /* Commit netif_running(). */
1401         }
1402
1403         dev_deactivate_many(head);
1404
1405         list_for_each_entry(dev, head, close_list) {
1406                 const struct net_device_ops *ops = dev->netdev_ops;
1407
1408                 /*
1409                  *      Call the device-specific close. This cannot fail and is
1410                  *      only done if the device is UP.
1411                  *
1412                  *      We allow it to be called even after a DETACH hot-plug
1413                  *      event.
1414                  */
1415                 if (ops->ndo_stop)
1416                         ops->ndo_stop(dev);
1417
1418                 dev->flags &= ~IFF_UP;
1419                 netpoll_poll_enable(dev);
1420         }
1421
1422         return 0;
1423 }
1424
1425 static int __dev_close(struct net_device *dev)
1426 {
1427         int retval;
1428         LIST_HEAD(single);
1429
1430         list_add(&dev->close_list, &single);
1431         retval = __dev_close_many(&single);
1432         list_del(&single);
1433
1434         return retval;
1435 }
1436
1437 int dev_close_many(struct list_head *head, bool unlink)
1438 {
1439         struct net_device *dev, *tmp;
1440
1441         /* Remove the devices that don't need to be closed */
1442         list_for_each_entry_safe(dev, tmp, head, close_list)
1443                 if (!(dev->flags & IFF_UP))
1444                         list_del_init(&dev->close_list);
1445
1446         __dev_close_many(head);
1447
1448         list_for_each_entry_safe(dev, tmp, head, close_list) {
1449                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1450                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1451                 if (unlink)
1452                         list_del_init(&dev->close_list);
1453         }
1454
1455         return 0;
1456 }
1457 EXPORT_SYMBOL(dev_close_many);
1458
1459 /**
1460  *      dev_close - shutdown an interface.
1461  *      @dev: device to shutdown
1462  *
1463  *      This function moves an active device into down state. A
1464  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1465  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1466  *      chain.
1467  */
1468 int dev_close(struct net_device *dev)
1469 {
1470         if (dev->flags & IFF_UP) {
1471                 LIST_HEAD(single);
1472
1473                 list_add(&dev->close_list, &single);
1474                 dev_close_many(&single, true);
1475                 list_del(&single);
1476         }
1477         return 0;
1478 }
1479 EXPORT_SYMBOL(dev_close);
1480
1481
1482 /**
1483  *      dev_disable_lro - disable Large Receive Offload on a device
1484  *      @dev: device
1485  *
1486  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1487  *      called under RTNL.  This is needed if received packets may be
1488  *      forwarded to another interface.
1489  */
1490 void dev_disable_lro(struct net_device *dev)
1491 {
1492         struct net_device *lower_dev;
1493         struct list_head *iter;
1494
1495         dev->wanted_features &= ~NETIF_F_LRO;
1496         netdev_update_features(dev);
1497
1498         if (unlikely(dev->features & NETIF_F_LRO))
1499                 netdev_WARN(dev, "failed to disable LRO!\n");
1500
1501         netdev_for_each_lower_dev(dev, lower_dev, iter)
1502                 dev_disable_lro(lower_dev);
1503 }
1504 EXPORT_SYMBOL(dev_disable_lro);
1505
1506 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1507                                    struct net_device *dev)
1508 {
1509         struct netdev_notifier_info info;
1510
1511         netdev_notifier_info_init(&info, dev);
1512         return nb->notifier_call(nb, val, &info);
1513 }
1514
1515 static int dev_boot_phase = 1;
1516
1517 /**
1518  *      register_netdevice_notifier - register a network notifier block
1519  *      @nb: notifier
1520  *
1521  *      Register a notifier to be called when network device events occur.
1522  *      The notifier passed is linked into the kernel structures and must
1523  *      not be reused until it has been unregistered. A negative errno code
1524  *      is returned on a failure.
1525  *
1526  *      When registered, all registration and up events are replayed
1527  *      to the new notifier to allow the caller to have a race-free
1528  *      view of the network device list.
1529  */
1530
1531 int register_netdevice_notifier(struct notifier_block *nb)
1532 {
1533         struct net_device *dev;
1534         struct net_device *last;
1535         struct net *net;
1536         int err;
1537
1538         rtnl_lock();
1539         err = raw_notifier_chain_register(&netdev_chain, nb);
1540         if (err)
1541                 goto unlock;
1542         if (dev_boot_phase)
1543                 goto unlock;
1544         for_each_net(net) {
1545                 for_each_netdev(net, dev) {
1546                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1547                         err = notifier_to_errno(err);
1548                         if (err)
1549                                 goto rollback;
1550
1551                         if (!(dev->flags & IFF_UP))
1552                                 continue;
1553
1554                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1555                 }
1556         }
1557
1558 unlock:
1559         rtnl_unlock();
1560         return err;
1561
1562 rollback:
1563         last = dev;
1564         for_each_net(net) {
1565                 for_each_netdev(net, dev) {
1566                         if (dev == last)
1567                                 goto outroll;
1568
1569                         if (dev->flags & IFF_UP) {
1570                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1571                                                         dev);
1572                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1573                         }
1574                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1575                 }
1576         }
1577
1578 outroll:
1579         raw_notifier_chain_unregister(&netdev_chain, nb);
1580         goto unlock;
1581 }
1582 EXPORT_SYMBOL(register_netdevice_notifier);
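/*
 * Illustrative sketch, not part of the original file: a minimal netdevice
 * notifier.  The names example_netdev_event/example_netdev_nb are
 * hypothetical; a real user would pair register_netdevice_notifier() with
 * unregister_netdevice_notifier() on teardown.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb __maybe_unused = {
	.notifier_call = example_netdev_event,
};
/* register_netdevice_notifier(&example_netdev_nb) would replay
 * NETDEV_REGISTER/NETDEV_UP for already existing devices, as described above.
 */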
1583
1584 /**
1585  *      unregister_netdevice_notifier - unregister a network notifier block
1586  *      @nb: notifier
1587  *
1588  *      Unregister a notifier previously registered by
1589  *      register_netdevice_notifier(). The notifier is unlinked from the
1590  *      kernel structures and may then be reused. A negative errno code
1591  *      is returned on a failure.
1592  *
1593  *      After unregistering, unregister and down device events are synthesized
1594  *      for all devices on the device list and delivered to the removed
1595  *      notifier, removing the need for special-case cleanup code.
1596  */
1597
1598 int unregister_netdevice_notifier(struct notifier_block *nb)
1599 {
1600         struct net_device *dev;
1601         struct net *net;
1602         int err;
1603
1604         rtnl_lock();
1605         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1606         if (err)
1607                 goto unlock;
1608
1609         for_each_net(net) {
1610                 for_each_netdev(net, dev) {
1611                         if (dev->flags & IFF_UP) {
1612                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1613                                                         dev);
1614                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1615                         }
1616                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1617                 }
1618         }
1619 unlock:
1620         rtnl_unlock();
1621         return err;
1622 }
1623 EXPORT_SYMBOL(unregister_netdevice_notifier);
1624
1625 /**
1626  *      call_netdevice_notifiers_info - call all network notifier blocks
1627  *      @val: value passed unmodified to notifier function
1628  *      @dev: net_device pointer passed unmodified to notifier function
1629  *      @info: notifier information data
1630  *
1631  *      Call all network notifier blocks.  Parameters and return value
1632  *      are as for raw_notifier_call_chain().
1633  */
1634
1635 static int call_netdevice_notifiers_info(unsigned long val,
1636                                          struct net_device *dev,
1637                                          struct netdev_notifier_info *info)
1638 {
1639         ASSERT_RTNL();
1640         netdev_notifier_info_init(info, dev);
1641         return raw_notifier_call_chain(&netdev_chain, val, info);
1642 }
1643
1644 /**
1645  *      call_netdevice_notifiers - call all network notifier blocks
1646  *      @val: value passed unmodified to notifier function
1647  *      @dev: net_device pointer passed unmodified to notifier function
1648  *
1649  *      Call all network notifier blocks.  Parameters and return value
1650  *      are as for raw_notifier_call_chain().
1651  */
1652
1653 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1654 {
1655         struct netdev_notifier_info info;
1656
1657         return call_netdevice_notifiers_info(val, dev, &info);
1658 }
1659 EXPORT_SYMBOL(call_netdevice_notifiers);
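/*
 * Illustrative sketch, not part of the original file: how core code typically
 * signals a device change to all registered notifiers.  The helper name is
 * hypothetical; the RTNL requirement comes from call_netdevice_notifiers_info().
 */
static inline void example_notify_device_change(struct net_device *dev)
{
	ASSERT_RTNL();
	call_netdevice_notifiers(NETDEV_CHANGE, dev);
}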
1660
1661 #ifdef CONFIG_NET_INGRESS
1662 static struct static_key ingress_needed __read_mostly;
1663
1664 void net_inc_ingress_queue(void)
1665 {
1666         static_key_slow_inc(&ingress_needed);
1667 }
1668 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1669
1670 void net_dec_ingress_queue(void)
1671 {
1672         static_key_slow_dec(&ingress_needed);
1673 }
1674 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1675 #endif
1676
1677 static struct static_key netstamp_needed __read_mostly;
1678 #ifdef HAVE_JUMP_LABEL
1679 static atomic_t netstamp_needed_deferred;
1680 static void netstamp_clear(struct work_struct *work)
1681 {
1682         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1683
1684         while (deferred--)
1685                 static_key_slow_dec(&netstamp_needed);
1686 }
1687 static DECLARE_WORK(netstamp_work, netstamp_clear);
1688 #endif
1689
1690 void net_enable_timestamp(void)
1691 {
1692         static_key_slow_inc(&netstamp_needed);
1693 }
1694 EXPORT_SYMBOL(net_enable_timestamp);
1695
1696 void net_disable_timestamp(void)
1697 {
1698 #ifdef HAVE_JUMP_LABEL
1699         /* net_disable_timestamp() can be called from non-process context */
1700         atomic_inc(&netstamp_needed_deferred);
1701         schedule_work(&netstamp_work);
1702 #else
1703         static_key_slow_dec(&netstamp_needed);
1704 #endif
1705 }
1706 EXPORT_SYMBOL(net_disable_timestamp);
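/*
 * Illustrative sketch, not part of the original file: the enable/disable
 * calls are reference counted through the static key, so every user must
 * pair them.  The helper names below are hypothetical.
 */
static inline void example_timestamp_user_start(void)
{
	/* may sleep with jump labels enabled, so call from process context */
	net_enable_timestamp();
}

static inline void example_timestamp_user_stop(void)
{
	/* with HAVE_JUMP_LABEL the decrement is deferred to a work queue,
	 * so this is safe from atomic context as well
	 */
	net_disable_timestamp();
}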
1707
1708 static inline void net_timestamp_set(struct sk_buff *skb)
1709 {
1710         skb->tstamp.tv64 = 0;
1711         if (static_key_false(&netstamp_needed))
1712                 __net_timestamp(skb);
1713 }
1714
1715 #define net_timestamp_check(COND, SKB)                  \
1716         if (static_key_false(&netstamp_needed)) {               \
1717                 if ((COND) && !(SKB)->tstamp.tv64)      \
1718                         __net_timestamp(SKB);           \
1719         }                                               \
1720
1721 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1722 {
1723         unsigned int len;
1724
1725         if (!(dev->flags & IFF_UP))
1726                 return false;
1727
1728         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1729         if (skb->len <= len)
1730                 return true;
1731
1732         /* if TSO is enabled, we don't care about the length as the packet
1733          * could be forwarded without being segmented first
1734          */
1735         if (skb_is_gso(skb))
1736                 return true;
1737
1738         return false;
1739 }
1740 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1741
1742 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1743 {
1744         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1745             unlikely(!is_skb_forwardable(dev, skb))) {
1746                 atomic_long_inc(&dev->rx_dropped);
1747                 kfree_skb(skb);
1748                 return NET_RX_DROP;
1749         }
1750
1751         skb_scrub_packet(skb, true);
1752         skb->priority = 0;
1753         skb->protocol = eth_type_trans(skb, dev);
1754         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1755
1756         return 0;
1757 }
1758 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1759
1760 /**
1761  * dev_forward_skb - loopback an skb to another netif
1762  *
1763  * @dev: destination network device
1764  * @skb: buffer to forward
1765  *
1766  * return values:
1767  *      NET_RX_SUCCESS  (no congestion)
1768  *      NET_RX_DROP     (packet was dropped, but freed)
1769  *
1770  * dev_forward_skb can be used for injecting an skb from the
1771  * start_xmit function of one device into the receive queue
1772  * of another device.
1773  *
1774  * The receiving device may be in another namespace, so
1775  * we have to clear all information in the skb that could
1776  * impact namespace isolation.
1777  */
1778 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1779 {
1780         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1781 }
1782 EXPORT_SYMBOL_GPL(dev_forward_skb);
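/*
 * Illustrative sketch, not part of the original file: a virtual driver's
 * transmit path handing a frame to its peer device, in the spirit of veth.
 * The function name is hypothetical.
 */
static inline netdev_tx_t example_xmit_to_peer(struct sk_buff *skb,
					       struct net_device *peer)
{
	/* dev_forward_skb() consumes the skb whether it is accepted or
	 * dropped, so no kfree_skb() is needed in either case.
	 */
	dev_forward_skb(peer, skb);
	return NETDEV_TX_OK;
}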
1783
1784 static inline int deliver_skb(struct sk_buff *skb,
1785                               struct packet_type *pt_prev,
1786                               struct net_device *orig_dev)
1787 {
1788         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1789                 return -ENOMEM;
1790         atomic_inc(&skb->users);
1791         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1792 }
1793
1794 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1795                                           struct packet_type **pt,
1796                                           struct net_device *orig_dev,
1797                                           __be16 type,
1798                                           struct list_head *ptype_list)
1799 {
1800         struct packet_type *ptype, *pt_prev = *pt;
1801
1802         list_for_each_entry_rcu(ptype, ptype_list, list) {
1803                 if (ptype->type != type)
1804                         continue;
1805                 if (pt_prev)
1806                         deliver_skb(skb, pt_prev, orig_dev);
1807                 pt_prev = ptype;
1808         }
1809         *pt = pt_prev;
1810 }
1811
1812 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1813 {
1814         if (!ptype->af_packet_priv || !skb->sk)
1815                 return false;
1816
1817         if (ptype->id_match)
1818                 return ptype->id_match(ptype, skb->sk);
1819         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1820                 return true;
1821
1822         return false;
1823 }
1824
1825 /*
1826  *      Support routine. Sends outgoing frames to any network
1827  *      taps currently in use.
1828  */
1829
1830 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1831 {
1832         struct packet_type *ptype;
1833         struct sk_buff *skb2 = NULL;
1834         struct packet_type *pt_prev = NULL;
1835         struct list_head *ptype_list = &ptype_all;
1836
1837         rcu_read_lock();
1838 again:
1839         list_for_each_entry_rcu(ptype, ptype_list, list) {
1840                 /* Never send packets back to the socket
1841                  * they originated from - MvS (miquels@drinkel.ow.org)
1842                  */
1843                 if (skb_loop_sk(ptype, skb))
1844                         continue;
1845
1846                 if (pt_prev) {
1847                         deliver_skb(skb2, pt_prev, skb->dev);
1848                         pt_prev = ptype;
1849                         continue;
1850                 }
1851
1852                 /* need to clone skb, done only once */
1853                 skb2 = skb_clone(skb, GFP_ATOMIC);
1854                 if (!skb2)
1855                         goto out_unlock;
1856
1857                 net_timestamp_set(skb2);
1858
1859                 /* The network header (skb->nh) should be correctly
1860                  * set by the sender, so the check below is
1861                  * just protection against buggy protocols.
1862                  */
1863                 skb_reset_mac_header(skb2);
1864
1865                 if (skb_network_header(skb2) < skb2->data ||
1866                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1867                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1868                                              ntohs(skb2->protocol),
1869                                              dev->name);
1870                         skb_reset_network_header(skb2);
1871                 }
1872
1873                 skb2->transport_header = skb2->network_header;
1874                 skb2->pkt_type = PACKET_OUTGOING;
1875                 pt_prev = ptype;
1876         }
1877
1878         if (ptype_list == &ptype_all) {
1879                 ptype_list = &dev->ptype_all;
1880                 goto again;
1881         }
1882 out_unlock:
1883         if (pt_prev)
1884                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1885         rcu_read_unlock();
1886 }
1887
1888 /**
1889  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1890  * @dev: Network device
1891  * @txq: number of queues available
1892  *
1893  * If real_num_tx_queues is changed the tc mappings may no longer be
1894  * valid. To resolve this verify the tc mapping remains valid and if
1895  * not, reset the mapping to zero. With no priorities mapping to this
1896  * offset/count pair it will no longer be used. In the worst case, if TC0
1897  * is invalid nothing can be done, so priority mappings are disabled. It is
1898  * expected that drivers will fix this mapping if they can before
1899  * calling netif_set_real_num_tx_queues.
1900  */
1901 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1902 {
1903         int i;
1904         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1905
1906         /* If TC0 is invalidated disable TC mapping */
1907         if (tc->offset + tc->count > txq) {
1908                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1909                 dev->num_tc = 0;
1910                 return;
1911         }
1912
1913         /* Invalidated prio to tc mappings set to TC0 */
1914         for (i = 1; i < TC_BITMASK + 1; i++) {
1915                 int q = netdev_get_prio_tc_map(dev, i);
1916
1917                 tc = &dev->tc_to_txq[q];
1918                 if (tc->offset + tc->count > txq) {
1919                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1920                                 i, q);
1921                         netdev_set_prio_tc_map(dev, i, 0);
1922                 }
1923         }
1924 }
1925
1926 #ifdef CONFIG_XPS
1927 static DEFINE_MUTEX(xps_map_mutex);
1928 #define xmap_dereference(P)             \
1929         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1930
1931 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1932                                         int cpu, u16 index)
1933 {
1934         struct xps_map *map = NULL;
1935         int pos;
1936
1937         if (dev_maps)
1938                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1939
1940         for (pos = 0; map && pos < map->len; pos++) {
1941                 if (map->queues[pos] == index) {
1942                         if (map->len > 1) {
1943                                 map->queues[pos] = map->queues[--map->len];
1944                         } else {
1945                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1946                                 kfree_rcu(map, rcu);
1947                                 map = NULL;
1948                         }
1949                         break;
1950                 }
1951         }
1952
1953         return map;
1954 }
1955
1956 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1957 {
1958         struct xps_dev_maps *dev_maps;
1959         int cpu, i;
1960         bool active = false;
1961
1962         mutex_lock(&xps_map_mutex);
1963         dev_maps = xmap_dereference(dev->xps_maps);
1964
1965         if (!dev_maps)
1966                 goto out_no_maps;
1967
1968         for_each_possible_cpu(cpu) {
1969                 for (i = index; i < dev->num_tx_queues; i++) {
1970                         if (!remove_xps_queue(dev_maps, cpu, i))
1971                                 break;
1972                 }
1973                 if (i == dev->num_tx_queues)
1974                         active = true;
1975         }
1976
1977         if (!active) {
1978                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1979                 kfree_rcu(dev_maps, rcu);
1980         }
1981
1982         for (i = index; i < dev->num_tx_queues; i++)
1983                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1984                                              NUMA_NO_NODE);
1985
1986 out_no_maps:
1987         mutex_unlock(&xps_map_mutex);
1988 }
1989
1990 static struct xps_map *expand_xps_map(struct xps_map *map,
1991                                       int cpu, u16 index)
1992 {
1993         struct xps_map *new_map;
1994         int alloc_len = XPS_MIN_MAP_ALLOC;
1995         int i, pos;
1996
1997         for (pos = 0; map && pos < map->len; pos++) {
1998                 if (map->queues[pos] != index)
1999                         continue;
2000                 return map;
2001         }
2002
2003         /* Need to add queue to this CPU's existing map */
2004         if (map) {
2005                 if (pos < map->alloc_len)
2006                         return map;
2007
2008                 alloc_len = map->alloc_len * 2;
2009         }
2010
2011         /* Need to allocate a new map to store the queue for this CPU */
2012         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2013                                cpu_to_node(cpu));
2014         if (!new_map)
2015                 return NULL;
2016
2017         for (i = 0; i < pos; i++)
2018                 new_map->queues[i] = map->queues[i];
2019         new_map->alloc_len = alloc_len;
2020         new_map->len = pos;
2021
2022         return new_map;
2023 }
2024
2025 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2026                         u16 index)
2027 {
2028         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2029         struct xps_map *map, *new_map;
2030         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2031         int cpu, numa_node_id = -2;
2032         bool active = false;
2033
2034         mutex_lock(&xps_map_mutex);
2035
2036         dev_maps = xmap_dereference(dev->xps_maps);
2037
2038         /* allocate memory for queue storage */
2039         for_each_online_cpu(cpu) {
2040                 if (!cpumask_test_cpu(cpu, mask))
2041                         continue;
2042
2043                 if (!new_dev_maps)
2044                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2045                 if (!new_dev_maps) {
2046                         mutex_unlock(&xps_map_mutex);
2047                         return -ENOMEM;
2048                 }
2049
2050                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2051                                  NULL;
2052
2053                 map = expand_xps_map(map, cpu, index);
2054                 if (!map)
2055                         goto error;
2056
2057                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2058         }
2059
2060         if (!new_dev_maps)
2061                 goto out_no_new_maps;
2062
2063         for_each_possible_cpu(cpu) {
2064                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2065                         /* add queue to CPU maps */
2066                         int pos = 0;
2067
2068                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2069                         while ((pos < map->len) && (map->queues[pos] != index))
2070                                 pos++;
2071
2072                         if (pos == map->len)
2073                                 map->queues[map->len++] = index;
2074 #ifdef CONFIG_NUMA
2075                         if (numa_node_id == -2)
2076                                 numa_node_id = cpu_to_node(cpu);
2077                         else if (numa_node_id != cpu_to_node(cpu))
2078                                 numa_node_id = -1;
2079 #endif
2080                 } else if (dev_maps) {
2081                         /* fill in the new device map from the old device map */
2082                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2083                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2084                 }
2085
2086         }
2087
2088         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2089
2090         /* Cleanup old maps */
2091         if (dev_maps) {
2092                 for_each_possible_cpu(cpu) {
2093                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2094                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2095                         if (map && map != new_map)
2096                                 kfree_rcu(map, rcu);
2097                 }
2098
2099                 kfree_rcu(dev_maps, rcu);
2100         }
2101
2102         dev_maps = new_dev_maps;
2103         active = true;
2104
2105 out_no_new_maps:
2106         /* update Tx queue numa node */
2107         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2108                                      (numa_node_id >= 0) ? numa_node_id :
2109                                      NUMA_NO_NODE);
2110
2111         if (!dev_maps)
2112                 goto out_no_maps;
2113
2114         /* removes queue from unused CPUs */
2115         for_each_possible_cpu(cpu) {
2116                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2117                         continue;
2118
2119                 if (remove_xps_queue(dev_maps, cpu, index))
2120                         active = true;
2121         }
2122
2123         /* free map if not active */
2124         if (!active) {
2125                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2126                 kfree_rcu(dev_maps, rcu);
2127         }
2128
2129 out_no_maps:
2130         mutex_unlock(&xps_map_mutex);
2131
2132         return 0;
2133 error:
2134         /* remove any maps that we added */
2135         for_each_possible_cpu(cpu) {
2136                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2137                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2138                                  NULL;
2139                 if (new_map && new_map != map)
2140                         kfree(new_map);
2141         }
2142
2143         mutex_unlock(&xps_map_mutex);
2144
2145         kfree(new_dev_maps);
2146         return -ENOMEM;
2147 }
2148 EXPORT_SYMBOL(netif_set_xps_queue);
2149
2150 #endif
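/*
 * Illustrative sketch, not part of the original file: a driver pinning each
 * transmit queue to one online CPU.  The helper name and the one-CPU-per-queue
 * policy are assumptions; when CONFIG_XPS is off, netif_set_xps_queue() is a
 * no-op stub, so this compiles either way.
 */
static inline void example_setup_xps(struct net_device *dev)
{
	int cpu;
	u16 queue = 0;

	for_each_online_cpu(cpu) {
		if (queue >= dev->real_num_tx_queues)
			break;
		/* return value ignored for brevity in this sketch */
		netif_set_xps_queue(dev, cpumask_of(cpu), queue++);
	}
}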
2151 /*
2152  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2153  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2154  */
2155 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2156 {
2157         int rc;
2158
2159         if (txq < 1 || txq > dev->num_tx_queues)
2160                 return -EINVAL;
2161
2162         if (dev->reg_state == NETREG_REGISTERED ||
2163             dev->reg_state == NETREG_UNREGISTERING) {
2164                 ASSERT_RTNL();
2165
2166                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2167                                                   txq);
2168                 if (rc)
2169                         return rc;
2170
2171                 if (dev->num_tc)
2172                         netif_setup_tc(dev, txq);
2173
2174                 if (txq < dev->real_num_tx_queues) {
2175                         qdisc_reset_all_tx_gt(dev, txq);
2176 #ifdef CONFIG_XPS
2177                         netif_reset_xps_queues_gt(dev, txq);
2178 #endif
2179                 }
2180         }
2181
2182         dev->real_num_tx_queues = txq;
2183         return 0;
2184 }
2185 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
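/*
 * Illustrative sketch, not part of the original file: resizing the active
 * TX queue set after a hardware reconfiguration.  The helper name is
 * hypothetical; the RTNL must be held once the device is registered.
 */
static inline int example_resize_tx_queues(struct net_device *dev,
					   unsigned int txq)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, txq);
	rtnl_unlock();
	return err;
}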
2186
2187 #ifdef CONFIG_SYSFS
2188 /**
2189  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2190  *      @dev: Network device
2191  *      @rxq: Actual number of RX queues
2192  *
2193  *      This must be called either with the rtnl_lock held or before
2194  *      registration of the net device.  Returns 0 on success, or a
2195  *      negative error code.  If called before registration, it always
2196  *      succeeds.
2197  */
2198 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2199 {
2200         int rc;
2201
2202         if (rxq < 1 || rxq > dev->num_rx_queues)
2203                 return -EINVAL;
2204
2205         if (dev->reg_state == NETREG_REGISTERED) {
2206                 ASSERT_RTNL();
2207
2208                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2209                                                   rxq);
2210                 if (rc)
2211                         return rc;
2212         }
2213
2214         dev->real_num_rx_queues = rxq;
2215         return 0;
2216 }
2217 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2218 #endif
2219
2220 /**
2221  * netif_get_num_default_rss_queues - default number of RSS queues
2222  *
2223  * This routine should set an upper limit on the number of RSS queues
2224  * used by default by multiqueue devices.
2225  */
2226 int netif_get_num_default_rss_queues(void)
2227 {
2228         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2229 }
2230 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2231
2232 static inline void __netif_reschedule(struct Qdisc *q)
2233 {
2234         struct softnet_data *sd;
2235         unsigned long flags;
2236
2237         local_irq_save(flags);
2238         sd = this_cpu_ptr(&softnet_data);
2239         q->next_sched = NULL;
2240         *sd->output_queue_tailp = q;
2241         sd->output_queue_tailp = &q->next_sched;
2242         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2243         local_irq_restore(flags);
2244 }
2245
2246 void __netif_schedule(struct Qdisc *q)
2247 {
2248         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2249                 __netif_reschedule(q);
2250 }
2251 EXPORT_SYMBOL(__netif_schedule);
2252
2253 struct dev_kfree_skb_cb {
2254         enum skb_free_reason reason;
2255 };
2256
2257 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2258 {
2259         return (struct dev_kfree_skb_cb *)skb->cb;
2260 }
2261
2262 void netif_schedule_queue(struct netdev_queue *txq)
2263 {
2264         rcu_read_lock();
2265         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2266                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2267
2268                 __netif_schedule(q);
2269         }
2270         rcu_read_unlock();
2271 }
2272 EXPORT_SYMBOL(netif_schedule_queue);
2273
2274 /**
2275  *      netif_wake_subqueue - allow sending packets on subqueue
2276  *      @dev: network device
2277  *      @queue_index: sub queue index
2278  *
2279  * Resume individual transmit queue of a device with multiple transmit queues.
2280  */
2281 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2282 {
2283         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2284
2285         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2286                 struct Qdisc *q;
2287
2288                 rcu_read_lock();
2289                 q = rcu_dereference(txq->qdisc);
2290                 __netif_schedule(q);
2291                 rcu_read_unlock();
2292         }
2293 }
2294 EXPORT_SYMBOL(netif_wake_subqueue);
2295
2296 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2297 {
2298         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2299                 struct Qdisc *q;
2300
2301                 rcu_read_lock();
2302                 q = rcu_dereference(dev_queue->qdisc);
2303                 __netif_schedule(q);
2304                 rcu_read_unlock();
2305         }
2306 }
2307 EXPORT_SYMBOL(netif_tx_wake_queue);
2308
2309 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2310 {
2311         unsigned long flags;
2312
2313         if (likely(atomic_read(&skb->users) == 1)) {
2314                 smp_rmb();
2315                 atomic_set(&skb->users, 0);
2316         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2317                 return;
2318         }
2319         get_kfree_skb_cb(skb)->reason = reason;
2320         local_irq_save(flags);
2321         skb->next = __this_cpu_read(softnet_data.completion_queue);
2322         __this_cpu_write(softnet_data.completion_queue, skb);
2323         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2324         local_irq_restore(flags);
2325 }
2326 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2327
2328 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2329 {
2330         if (in_irq() || irqs_disabled())
2331                 __dev_kfree_skb_irq(skb, reason);
2332         else
2333                 dev_kfree_skb(skb);
2334 }
2335 EXPORT_SYMBOL(__dev_kfree_skb_any);
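/*
 * Illustrative sketch, not part of the original file: freeing an skb from a
 * TX completion handler that may run in hard-IRQ context.  The helper name
 * is hypothetical.
 */
static inline void example_tx_complete(struct sk_buff *skb, bool sent_ok)
{
	if (sent_ok)
		dev_consume_skb_any(skb);	/* successful transmit, not a drop */
	else
		dev_kfree_skb_any(skb);		/* counted as a drop for tracing */
}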
2336
2337
2338 /**
2339  * netif_device_detach - mark device as removed
2340  * @dev: network device
2341  *
2342  * Mark device as removed from system and therefore no longer available.
2343  */
2344 void netif_device_detach(struct net_device *dev)
2345 {
2346         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2347             netif_running(dev)) {
2348                 netif_tx_stop_all_queues(dev);
2349         }
2350 }
2351 EXPORT_SYMBOL(netif_device_detach);
2352
2353 /**
2354  * netif_device_attach - mark device as attached
2355  * @dev: network device
2356  *
2357  * Mark device as attached to the system and restart it if needed.
2358  */
2359 void netif_device_attach(struct net_device *dev)
2360 {
2361         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2362             netif_running(dev)) {
2363                 netif_tx_wake_all_queues(dev);
2364                 __netdev_watchdog_up(dev);
2365         }
2366 }
2367 EXPORT_SYMBOL(netif_device_attach);
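/*
 * Illustrative sketch, not part of the original file: typical suspend/resume
 * pairing in a driver.  The helper names and the "driver-specific" steps are
 * placeholders.
 */
static inline void example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops the TX queues if running */
	/* ... driver-specific: put the hardware to sleep ... */
}

static inline void example_resume(struct net_device *dev)
{
	/* ... driver-specific: wake the hardware up ... */
	netif_device_attach(dev);	/* restarts queues and the watchdog */
}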
2368
2369 /*
2370  * Returns a Tx hash based on the given packet descriptor, using the number
2371  * of Tx queues as the distribution range.
2372  */
2373 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2374                   unsigned int num_tx_queues)
2375 {
2376         u32 hash;
2377         u16 qoffset = 0;
2378         u16 qcount = num_tx_queues;
2379
2380         if (skb_rx_queue_recorded(skb)) {
2381                 hash = skb_get_rx_queue(skb);
2382                 while (unlikely(hash >= num_tx_queues))
2383                         hash -= num_tx_queues;
2384                 return hash;
2385         }
2386
2387         if (dev->num_tc) {
2388                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2389                 qoffset = dev->tc_to_txq[tc].offset;
2390                 qcount = dev->tc_to_txq[tc].count;
2391         }
2392
2393         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2394 }
2395 EXPORT_SYMBOL(__skb_tx_hash);
2396
2397 static void skb_warn_bad_offload(const struct sk_buff *skb)
2398 {
2399         static const netdev_features_t null_features = 0;
2400         struct net_device *dev = skb->dev;
2401         const char *name = "";
2402
2403         if (!net_ratelimit())
2404                 return;
2405
2406         if (dev) {
2407                 if (dev->dev.parent)
2408                         name = dev_driver_string(dev->dev.parent);
2409                 else
2410                         name = netdev_name(dev);
2411         }
2412         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2413              "gso_type=%d ip_summed=%d\n",
2414              name, dev ? &dev->features : &null_features,
2415              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2416              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2417              skb_shinfo(skb)->gso_type, skb->ip_summed);
2418 }
2419
2420 /*
2421  * Invalidate hardware checksum when packet is to be mangled, and
2422  * complete checksum manually on outgoing path.
2423  */
2424 int skb_checksum_help(struct sk_buff *skb)
2425 {
2426         __wsum csum;
2427         int ret = 0, offset;
2428
2429         if (skb->ip_summed == CHECKSUM_COMPLETE)
2430                 goto out_set_summed;
2431
2432         if (unlikely(skb_shinfo(skb)->gso_size)) {
2433                 skb_warn_bad_offload(skb);
2434                 return -EINVAL;
2435         }
2436
2437         /* Before computing a checksum, we should make sure no frag could
2438          * be modified by an external entity; otherwise the checksum could be wrong.
2439          */
2440         if (skb_has_shared_frag(skb)) {
2441                 ret = __skb_linearize(skb);
2442                 if (ret)
2443                         goto out;
2444         }
2445
2446         offset = skb_checksum_start_offset(skb);
2447         BUG_ON(offset >= skb_headlen(skb));
2448         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2449
2450         offset += skb->csum_offset;
2451         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2452
2453         if (skb_cloned(skb) &&
2454             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2455                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2456                 if (ret)
2457                         goto out;
2458         }
2459
2460         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2461 out_set_summed:
2462         skb->ip_summed = CHECKSUM_NONE;
2463 out:
2464         return ret;
2465 }
2466 EXPORT_SYMBOL(skb_checksum_help);
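/*
 * Illustrative sketch, not part of the original file: falling back to a
 * software checksum when the device cannot offload it, mirroring the logic
 * in validate_xmit_skb() further below.  The helper name is hypothetical.
 */
static inline int example_tx_checksum_fixup(struct sk_buff *skb,
					    netdev_features_t features)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !(features & NETIF_F_ALL_CSUM))
		return skb_checksum_help(skb);	/* complete the checksum in software */
	return 0;
}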
2467
2468 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2469 {
2470         __be16 type = skb->protocol;
2471
2472         /* Tunnel gso handlers can set protocol to ethernet. */
2473         if (type == htons(ETH_P_TEB)) {
2474                 struct ethhdr *eth;
2475
2476                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2477                         return 0;
2478
2479                 eth = (struct ethhdr *)skb_mac_header(skb);
2480                 type = eth->h_proto;
2481         }
2482
2483         return __vlan_get_protocol(skb, type, depth);
2484 }
2485
2486 /**
2487  *      skb_mac_gso_segment - mac layer segmentation handler.
2488  *      @skb: buffer to segment
2489  *      @features: features for the output path (see dev->features)
2490  */
2491 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2492                                     netdev_features_t features)
2493 {
2494         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2495         struct packet_offload *ptype;
2496         int vlan_depth = skb->mac_len;
2497         __be16 type = skb_network_protocol(skb, &vlan_depth);
2498
2499         if (unlikely(!type))
2500                 return ERR_PTR(-EINVAL);
2501
2502         __skb_pull(skb, vlan_depth);
2503
2504         rcu_read_lock();
2505         list_for_each_entry_rcu(ptype, &offload_base, list) {
2506                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2507                         segs = ptype->callbacks.gso_segment(skb, features);
2508                         break;
2509                 }
2510         }
2511         rcu_read_unlock();
2512
2513         __skb_push(skb, skb->data - skb_mac_header(skb));
2514
2515         return segs;
2516 }
2517 EXPORT_SYMBOL(skb_mac_gso_segment);
2518
2519
2520 /* openvswitch calls this on rx path, so we need a different check.
2521  */
2522 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2523 {
2524         if (tx_path)
2525                 return skb->ip_summed != CHECKSUM_PARTIAL;
2526         else
2527                 return skb->ip_summed == CHECKSUM_NONE;
2528 }
2529
2530 /**
2531  *      __skb_gso_segment - Perform segmentation on skb.
2532  *      @skb: buffer to segment
2533  *      @features: features for the output path (see dev->features)
2534  *      @tx_path: whether it is called in TX path
2535  *
2536  *      This function segments the given skb and returns a list of segments.
2537  *
2538  *      It may return NULL if the skb requires no segmentation.  This is
2539  *      only possible when GSO is used for verifying header integrity.
2540  *
2541  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2542  */
2543 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2544                                   netdev_features_t features, bool tx_path)
2545 {
2546         if (unlikely(skb_needs_check(skb, tx_path))) {
2547                 int err;
2548
2549                 skb_warn_bad_offload(skb);
2550
2551                 err = skb_cow_head(skb, 0);
2552                 if (err < 0)
2553                         return ERR_PTR(err);
2554         }
2555
2556         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2557                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2558
2559         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2560         SKB_GSO_CB(skb)->encap_level = 0;
2561
2562         skb_reset_mac_header(skb);
2563         skb_reset_mac_len(skb);
2564
2565         return skb_mac_gso_segment(skb, features);
2566 }
2567 EXPORT_SYMBOL(__skb_gso_segment);
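/*
 * Illustrative sketch, not part of the original file: segmenting a GSO skb
 * for a software transmit path, as validate_xmit_skb() does further below.
 * The helper name is hypothetical; skb_gso_segment() is the tx_path wrapper
 * around __skb_gso_segment().
 */
static inline struct sk_buff *example_soft_segment(struct sk_buff *skb,
						   netdev_features_t features)
{
	struct sk_buff *segs = skb_gso_segment(skb, features);

	if (IS_ERR(segs))
		return NULL;		/* caller should drop the original skb */
	if (segs) {
		consume_skb(skb);	/* original replaced by the segment list */
		return segs;
	}
	return skb;			/* no segmentation was required */
}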
2568
2569 /* Take action when hardware reception checksum errors are detected. */
2570 #ifdef CONFIG_BUG
2571 void netdev_rx_csum_fault(struct net_device *dev)
2572 {
2573         if (net_ratelimit()) {
2574                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2575                 dump_stack();
2576         }
2577 }
2578 EXPORT_SYMBOL(netdev_rx_csum_fault);
2579 #endif
2580
2581 /* Actually, we should eliminate this check as soon as we know that:
2582  * 1. An IOMMU is present and allows mapping all the memory.
2583  * 2. No high memory really exists on this machine.
2584  */
2585
2586 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2587 {
2588 #ifdef CONFIG_HIGHMEM
2589         int i;
2590         if (!(dev->features & NETIF_F_HIGHDMA)) {
2591                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2592                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2593                         if (PageHighMem(skb_frag_page(frag)))
2594                                 return 1;
2595                 }
2596         }
2597
2598         if (PCI_DMA_BUS_IS_PHYS) {
2599                 struct device *pdev = dev->dev.parent;
2600
2601                 if (!pdev)
2602                         return 0;
2603                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2604                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2605                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2606                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2607                                 return 1;
2608                 }
2609         }
2610 #endif
2611         return 0;
2612 }
2613
2614 /* For an MPLS offload request, verify that we are testing hardware MPLS
2615  * features instead of the standard features for the netdev.
2616  */
2617 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2618 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2619                                            netdev_features_t features,
2620                                            __be16 type)
2621 {
2622         if (eth_p_mpls(type))
2623                 features &= skb->dev->mpls_features;
2624
2625         return features;
2626 }
2627 #else
2628 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2629                                            netdev_features_t features,
2630                                            __be16 type)
2631 {
2632         return features;
2633 }
2634 #endif
2635
2636 static netdev_features_t harmonize_features(struct sk_buff *skb,
2637         netdev_features_t features)
2638 {
2639         int tmp;
2640         __be16 type;
2641
2642         type = skb_network_protocol(skb, &tmp);
2643         features = net_mpls_features(skb, features, type);
2644
2645         if (skb->ip_summed != CHECKSUM_NONE &&
2646             !can_checksum_protocol(features, type)) {
2647                 features &= ~NETIF_F_ALL_CSUM;
2648         }
2649         if (illegal_highdma(skb->dev, skb))
2650                 features &= ~NETIF_F_SG;
2651
2652         return features;
2653 }
2654
2655 netdev_features_t passthru_features_check(struct sk_buff *skb,
2656                                           struct net_device *dev,
2657                                           netdev_features_t features)
2658 {
2659         return features;
2660 }
2661 EXPORT_SYMBOL(passthru_features_check);
2662
2663 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2664                                              struct net_device *dev,
2665                                              netdev_features_t features)
2666 {
2667         return vlan_features_check(skb, features);
2668 }
2669
2670 netdev_features_t netif_skb_features(struct sk_buff *skb)
2671 {
2672         struct net_device *dev = skb->dev;
2673         netdev_features_t features = dev->features;
2674         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2675
2676         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2677                 features &= ~NETIF_F_GSO_MASK;
2678
2679         /* For an encapsulation offload request, verify that we are
2680          * testing hardware encapsulation features instead of the
2681          * standard features for the netdev
2682          */
2683         if (skb->encapsulation)
2684                 features &= dev->hw_enc_features;
2685
2686         if (skb_vlan_tagged(skb))
2687                 features = netdev_intersect_features(features,
2688                                                      dev->vlan_features |
2689                                                      NETIF_F_HW_VLAN_CTAG_TX |
2690                                                      NETIF_F_HW_VLAN_STAG_TX);
2691
2692         if (dev->netdev_ops->ndo_features_check)
2693                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2694                                                                 features);
2695         else
2696                 features &= dflt_features_check(skb, dev, features);
2697
2698         return harmonize_features(skb, features);
2699 }
2700 EXPORT_SYMBOL(netif_skb_features);
2701
2702 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2703                     struct netdev_queue *txq, bool more)
2704 {
2705         unsigned int len;
2706         int rc;
2707
2708         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2709                 dev_queue_xmit_nit(skb, dev);
2710
2711         len = skb->len;
2712         trace_net_dev_start_xmit(skb, dev);
2713         rc = netdev_start_xmit(skb, dev, txq, more);
2714         trace_net_dev_xmit(skb, rc, dev, len);
2715
2716         return rc;
2717 }
2718
2719 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2720                                     struct netdev_queue *txq, int *ret)
2721 {
2722         struct sk_buff *skb = first;
2723         int rc = NETDEV_TX_OK;
2724
2725         while (skb) {
2726                 struct sk_buff *next = skb->next;
2727
2728                 skb->next = NULL;
2729                 rc = xmit_one(skb, dev, txq, next != NULL);
2730                 if (unlikely(!dev_xmit_complete(rc))) {
2731                         skb->next = next;
2732                         goto out;
2733                 }
2734
2735                 skb = next;
2736                 if (netif_xmit_stopped(txq) && skb) {
2737                         rc = NETDEV_TX_BUSY;
2738                         break;
2739                 }
2740         }
2741
2742 out:
2743         *ret = rc;
2744         return skb;
2745 }
2746
2747 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2748                                           netdev_features_t features)
2749 {
2750         if (skb_vlan_tag_present(skb) &&
2751             !vlan_hw_offload_capable(features, skb->vlan_proto))
2752                 skb = __vlan_hwaccel_push_inside(skb);
2753         return skb;
2754 }
2755
2756 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2757 {
2758         netdev_features_t features;
2759
2760         if (skb->next)
2761                 return skb;
2762
2763         features = netif_skb_features(skb);
2764         skb = validate_xmit_vlan(skb, features);
2765         if (unlikely(!skb))
2766                 goto out_null;
2767
2768         if (netif_needs_gso(skb, features)) {
2769                 struct sk_buff *segs;
2770
2771                 segs = skb_gso_segment(skb, features);
2772                 if (IS_ERR(segs)) {
2773                         goto out_kfree_skb;
2774                 } else if (segs) {
2775                         consume_skb(skb);
2776                         skb = segs;
2777                 }
2778         } else {
2779                 if (skb_needs_linearize(skb, features) &&
2780                     __skb_linearize(skb))
2781                         goto out_kfree_skb;
2782
2783                 /* If packet is not checksummed and device does not
2784                  * support checksumming for this protocol, complete
2785                  * checksumming here.
2786                  */
2787                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2788                         if (skb->encapsulation)
2789                                 skb_set_inner_transport_header(skb,
2790                                                                skb_checksum_start_offset(skb));
2791                         else
2792                                 skb_set_transport_header(skb,
2793                                                          skb_checksum_start_offset(skb));
2794                         if (!(features & NETIF_F_ALL_CSUM) &&
2795                             skb_checksum_help(skb))
2796                                 goto out_kfree_skb;
2797                 }
2798         }
2799
2800         return skb;
2801
2802 out_kfree_skb:
2803         kfree_skb(skb);
2804 out_null:
2805         return NULL;
2806 }
2807
2808 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2809 {
2810         struct sk_buff *next, *head = NULL, *tail;
2811
2812         for (; skb != NULL; skb = next) {
2813                 next = skb->next;
2814                 skb->next = NULL;
2815
2816                 /* in case the skb won't be segmented, point it to itself */
2817                 skb->prev = skb;
2818
2819                 skb = validate_xmit_skb(skb, dev);
2820                 if (!skb)
2821                         continue;
2822
2823                 if (!head)
2824                         head = skb;
2825                 else
2826                         tail->next = skb;
2827                 /* If skb was segmented, skb->prev points to
2828                  * the last segment. If not, it still contains skb.
2829                  */
2830                 tail = skb->prev;
2831         }
2832         return head;
2833 }
2834 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2835
2836 static void qdisc_pkt_len_init(struct sk_buff *skb)
2837 {
2838         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2839
2840         qdisc_skb_cb(skb)->pkt_len = skb->len;
2841
2842         /* To get a more precise estimate of the bytes sent on the wire,
2843          * we add the header size of all segments to pkt_len.
2844          */
2845         if (shinfo->gso_size)  {
2846                 unsigned int hdr_len;
2847                 u16 gso_segs = shinfo->gso_segs;
2848
2849                 /* mac layer + network layer */
2850                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2851
2852                 /* + transport layer */
2853                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2854                         hdr_len += tcp_hdrlen(skb);
2855                 else
2856                         hdr_len += sizeof(struct udphdr);
2857
2858                 if (shinfo->gso_type & SKB_GSO_DODGY)
2859                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2860                                                 shinfo->gso_size);
2861
2862                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2863         }
2864 }
2865
2866 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2867                                  struct net_device *dev,
2868                                  struct netdev_queue *txq)
2869 {
2870         spinlock_t *root_lock = qdisc_lock(q);
2871         bool contended;
2872         int rc;
2873
2874         qdisc_pkt_len_init(skb);
2875         qdisc_calculate_pkt_len(skb, q);
2876         /*
2877          * Heuristic to force contended enqueues to serialize on a
2878          * separate lock before trying to get qdisc main lock.
2879          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2880          * often and dequeue packets faster.
2881          */
2882         contended = qdisc_is_running(q);
2883         if (unlikely(contended))
2884                 spin_lock(&q->busylock);
2885
2886         spin_lock(root_lock);
2887         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2888                 kfree_skb(skb);
2889                 rc = NET_XMIT_DROP;
2890         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2891                    qdisc_run_begin(q)) {
2892                 /*
2893                  * This is a work-conserving queue; there are no old skbs
2894                  * waiting to be sent out; and the qdisc is not running -
2895                  * xmit the skb directly.
2896                  */
2897
2898                 qdisc_bstats_update(q, skb);
2899
2900                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2901                         if (unlikely(contended)) {
2902                                 spin_unlock(&q->busylock);
2903                                 contended = false;
2904                         }
2905                         __qdisc_run(q);
2906                 } else
2907                         qdisc_run_end(q);
2908
2909                 rc = NET_XMIT_SUCCESS;
2910         } else {
2911                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2912                 if (qdisc_run_begin(q)) {
2913                         if (unlikely(contended)) {
2914                                 spin_unlock(&q->busylock);
2915                                 contended = false;
2916                         }
2917                         __qdisc_run(q);
2918                 }
2919         }
2920         spin_unlock(root_lock);
2921         if (unlikely(contended))
2922                 spin_unlock(&q->busylock);
2923         return rc;
2924 }
2925
2926 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2927 static void skb_update_prio(struct sk_buff *skb)
2928 {
2929         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2930
2931         if (!skb->priority && skb->sk && map) {
2932                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2933
2934                 if (prioidx < map->priomap_len)
2935                         skb->priority = map->priomap[prioidx];
2936         }
2937 }
2938 #else
2939 #define skb_update_prio(skb)
2940 #endif
2941
2942 DEFINE_PER_CPU(int, xmit_recursion);
2943 EXPORT_SYMBOL(xmit_recursion);
2944
2945 #define RECURSION_LIMIT 10
2946
2947 /**
2948  *      dev_loopback_xmit - loop back @skb
2949  *      @net: network namespace this loopback is happening in
2950  *      @sk:  sk needed to be a netfilter okfn
2951  *      @skb: buffer to transmit
2952  */
2953 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
2954 {
2955         skb_reset_mac_header(skb);
2956         __skb_pull(skb, skb_network_offset(skb));
2957         skb->pkt_type = PACKET_LOOPBACK;
2958         skb->ip_summed = CHECKSUM_UNNECESSARY;
2959         WARN_ON(!skb_dst(skb));
2960         skb_dst_force(skb);
2961         netif_rx_ni(skb);
2962         return 0;
2963 }
2964 EXPORT_SYMBOL(dev_loopback_xmit);
2965
2966 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2967 {
2968 #ifdef CONFIG_XPS
2969         struct xps_dev_maps *dev_maps;
2970         struct xps_map *map;
2971         int queue_index = -1;
2972
2973         rcu_read_lock();
2974         dev_maps = rcu_dereference(dev->xps_maps);
2975         if (dev_maps) {
2976                 map = rcu_dereference(
2977                     dev_maps->cpu_map[skb->sender_cpu - 1]);
2978                 if (map) {
2979                         if (map->len == 1)
2980                                 queue_index = map->queues[0];
2981                         else
2982                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2983                                                                            map->len)];
2984                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2985                                 queue_index = -1;
2986                 }
2987         }
2988         rcu_read_unlock();
2989
2990         return queue_index;
2991 #else
2992         return -1;
2993 #endif
2994 }
2995
2996 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2997 {
2998         struct sock *sk = skb->sk;
2999         int queue_index = sk_tx_queue_get(sk);
3000
3001         if (queue_index < 0 || skb->ooo_okay ||
3002             queue_index >= dev->real_num_tx_queues) {
3003                 int new_index = get_xps_queue(dev, skb);
3004                 if (new_index < 0)
3005                         new_index = skb_tx_hash(dev, skb);
3006
3007                 if (queue_index != new_index && sk &&
3008                     sk_fullsock(sk) &&
3009                     rcu_access_pointer(sk->sk_dst_cache))
3010                         sk_tx_queue_set(sk, new_index);
3011
3012                 queue_index = new_index;
3013         }
3014
3015         return queue_index;
3016 }
3017
3018 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3019                                     struct sk_buff *skb,
3020                                     void *accel_priv)
3021 {
3022         int queue_index = 0;
3023
3024 #ifdef CONFIG_XPS
3025         if (skb->sender_cpu == 0)
3026                 skb->sender_cpu = raw_smp_processor_id() + 1;
3027 #endif
3028
3029         if (dev->real_num_tx_queues != 1) {
3030                 const struct net_device_ops *ops = dev->netdev_ops;
3031                 if (ops->ndo_select_queue)
3032                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3033                                                             __netdev_pick_tx);
3034                 else
3035                         queue_index = __netdev_pick_tx(dev, skb);
3036
3037                 if (!accel_priv)
3038                         queue_index = netdev_cap_txqueue(dev, queue_index);
3039         }
3040
3041         skb_set_queue_mapping(skb, queue_index);
3042         return netdev_get_tx_queue(dev, queue_index);
3043 }
3044
3045 /**
3046  *      __dev_queue_xmit - transmit a buffer
3047  *      @skb: buffer to transmit
3048  *      @accel_priv: private data used for L2 forwarding offload
3049  *
3050  *      Queue a buffer for transmission to a network device. The caller must
3051  *      have set the device and priority and built the buffer before calling
3052  *      this function. The function can be called from an interrupt.
3053  *
3054  *      A negative errno code is returned on a failure. A success does not
3055  *      guarantee the frame will be transmitted as it may be dropped due
3056  *      to congestion or traffic shaping.
3057  *
3058  * -----------------------------------------------------------------------------------
3059  *      I notice this method can also return errors from the queue disciplines,
3060  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3061  *      be positive.
3062  *
3063  *      Regardless of the return value, the skb is consumed, so it is currently
3064  *      difficult to retry a send to this method.  (You can bump the ref count
3065  *      before sending to hold a reference for retry if you are careful.)
3066  *
3067  *      When calling this method, interrupts MUST be enabled.  This is because
3068  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3069  *          --BLG
3070  */
3071 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3072 {
3073         struct net_device *dev = skb->dev;
3074         struct netdev_queue *txq;
3075         struct Qdisc *q;
3076         int rc = -ENOMEM;
3077
3078         skb_reset_mac_header(skb);
3079
3080         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3081                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3082
3083         /* Disable soft irqs for various locks below. Also
3084          * stops preemption for RCU.
3085          */
3086         rcu_read_lock_bh();
3087
3088         skb_update_prio(skb);
3089
3090         /* If the device/qdisc doesn't need skb->dst, release it right now
3091          * while it's still hot in this CPU's cache.
3092          */
3093         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3094                 skb_dst_drop(skb);
3095         else
3096                 skb_dst_force(skb);
3097
3098 #ifdef CONFIG_NET_SWITCHDEV
3099         /* Don't forward if offload device already forwarded */
3100         if (skb->offload_fwd_mark &&
3101             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3102                 consume_skb(skb);
3103                 rc = NET_XMIT_SUCCESS;
3104                 goto out;
3105         }
3106 #endif
3107
3108         txq = netdev_pick_tx(dev, skb, accel_priv);
3109         q = rcu_dereference_bh(txq->qdisc);
3110
3111 #ifdef CONFIG_NET_CLS_ACT
3112         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3113 #endif
3114         trace_net_dev_queue(skb);
3115         if (q->enqueue) {
3116                 rc = __dev_xmit_skb(skb, q, dev, txq);
3117                 goto out;
3118         }
3119
3120         /* The device has no queue. This is the common case for software
3121            devices: loopback, all sorts of tunnels...
3122
3123            Really, it is unlikely that netif_tx_lock protection is necessary
3124            here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3125            counters.)
3126            However, it is possible that they rely on the protection
3127            we provide here.
3128
3129            Check this and take the lock anyway; it is not prone to deadlocks.
3130            Shooting the noqueue qdisc instead would be even simpler 8)
3131          */
3132         if (dev->flags & IFF_UP) {
3133                 int cpu = smp_processor_id(); /* ok because BHs are off */
3134
3135                 if (txq->xmit_lock_owner != cpu) {
3136
3137                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3138                                 goto recursion_alert;
3139
3140                         skb = validate_xmit_skb(skb, dev);
3141                         if (!skb)
3142                                 goto drop;
3143
3144                         HARD_TX_LOCK(dev, txq, cpu);
3145
3146                         if (!netif_xmit_stopped(txq)) {
3147                                 __this_cpu_inc(xmit_recursion);
3148                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3149                                 __this_cpu_dec(xmit_recursion);
3150                                 if (dev_xmit_complete(rc)) {
3151                                         HARD_TX_UNLOCK(dev, txq);
3152                                         goto out;
3153                                 }
3154                         }
3155                         HARD_TX_UNLOCK(dev, txq);
3156                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3157                                              dev->name);
3158                 } else {
3159                         /* Recursion is detected! It is possible,
3160                          * unfortunately
3161                          */
3162 recursion_alert:
3163                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3164                                              dev->name);
3165                 }
3166         }
3167
3168         rc = -ENETDOWN;
3169 drop:
3170         rcu_read_unlock_bh();
3171
3172         atomic_long_inc(&dev->tx_dropped);
3173         kfree_skb_list(skb);
3174         return rc;
3175 out:
3176         rcu_read_unlock_bh();
3177         return rc;
3178 }
3179
3180 int dev_queue_xmit(struct sk_buff *skb)
3181 {
3182         return __dev_queue_xmit(skb, NULL);
3183 }
3184 EXPORT_SYMBOL(dev_queue_xmit);
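
/* Example (a minimal sketch, not part of this file): transmitting a raw
 * Ethernet frame through dev_queue_xmit().  The payload, addresses and the
 * "my_proto" ethertype are hypothetical, and error handling is reduced to
 * the essentials.  Note that dev_queue_xmit() consumes the skb even when it
 * returns an error, as the comment above explains.
 *
 *	static int my_xmit_frame(struct net_device *dev, const void *payload,
 *				 unsigned int len, __be16 my_proto)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
 *		if (!skb)
 *			return -ENOMEM;
 *		skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *		memcpy(skb_put(skb, len), payload, len);
 *		skb->dev = dev;
 *		skb->protocol = my_proto;
 *		if (dev_hard_header(skb, dev, ntohs(my_proto),
 *				    dev->broadcast, dev->dev_addr, len) < 0) {
 *			kfree_skb(skb);
 *			return -EINVAL;
 *		}
 *		return dev_queue_xmit(skb);
 *	}
 */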
3185
3186 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3187 {
3188         return __dev_queue_xmit(skb, accel_priv);
3189 }
3190 EXPORT_SYMBOL(dev_queue_xmit_accel);
3191
3192
3193 /*=======================================================================
3194                         Receiver routines
3195   =======================================================================*/
3196
3197 int netdev_max_backlog __read_mostly = 1000;
3198 EXPORT_SYMBOL(netdev_max_backlog);
3199
3200 int netdev_tstamp_prequeue __read_mostly = 1;
3201 int netdev_budget __read_mostly = 300;
3202 int weight_p __read_mostly = 64;            /* old backlog weight */
3203
3204 /* Called with irq disabled */
3205 static inline void ____napi_schedule(struct softnet_data *sd,
3206                                      struct napi_struct *napi)
3207 {
3208         list_add_tail(&napi->poll_list, &sd->poll_list);
3209         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3210 }
3211
3212 #ifdef CONFIG_RPS
3213
3214 /* One global table that all flow-based protocols share. */
3215 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3216 EXPORT_SYMBOL(rps_sock_flow_table);
3217 u32 rps_cpu_mask __read_mostly;
3218 EXPORT_SYMBOL(rps_cpu_mask);
3219
3220 struct static_key rps_needed __read_mostly;
3221
3222 static struct rps_dev_flow *
3223 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3224             struct rps_dev_flow *rflow, u16 next_cpu)
3225 {
3226         if (next_cpu < nr_cpu_ids) {
3227 #ifdef CONFIG_RFS_ACCEL
3228                 struct netdev_rx_queue *rxqueue;
3229                 struct rps_dev_flow_table *flow_table;
3230                 struct rps_dev_flow *old_rflow;
3231                 u32 flow_id;
3232                 u16 rxq_index;
3233                 int rc;
3234
3235                 /* Should we steer this flow to a different hardware queue? */
3236                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3237                     !(dev->features & NETIF_F_NTUPLE))
3238                         goto out;
3239                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3240                 if (rxq_index == skb_get_rx_queue(skb))
3241                         goto out;
3242
3243                 rxqueue = dev->_rx + rxq_index;
3244                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3245                 if (!flow_table)
3246                         goto out;
3247                 flow_id = skb_get_hash(skb) & flow_table->mask;
3248                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3249                                                         rxq_index, flow_id);
3250                 if (rc < 0)
3251                         goto out;
3252                 old_rflow = rflow;
3253                 rflow = &flow_table->flows[flow_id];
3254                 rflow->filter = rc;
3255                 if (old_rflow->filter == rflow->filter)
3256                         old_rflow->filter = RPS_NO_FILTER;
3257         out:
3258 #endif
3259                 rflow->last_qtail =
3260                         per_cpu(softnet_data, next_cpu).input_queue_head;
3261         }
3262
3263         rflow->cpu = next_cpu;
3264         return rflow;
3265 }
3266
3267 /*
3268  * get_rps_cpu is called from netif_receive_skb and returns the target
3269  * CPU from the RPS map of the receiving queue for a given skb.
3270  * rcu_read_lock must be held on entry.
3271  */
3272 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3273                        struct rps_dev_flow **rflowp)
3274 {
3275         const struct rps_sock_flow_table *sock_flow_table;
3276         struct netdev_rx_queue *rxqueue = dev->_rx;
3277         struct rps_dev_flow_table *flow_table;
3278         struct rps_map *map;
3279         int cpu = -1;
3280         u32 tcpu;
3281         u32 hash;
3282
3283         if (skb_rx_queue_recorded(skb)) {
3284                 u16 index = skb_get_rx_queue(skb);
3285
3286                 if (unlikely(index >= dev->real_num_rx_queues)) {
3287                         WARN_ONCE(dev->real_num_rx_queues > 1,
3288                                   "%s received packet on queue %u, but number "
3289                                   "of RX queues is %u\n",
3290                                   dev->name, index, dev->real_num_rx_queues);
3291                         goto done;
3292                 }
3293                 rxqueue += index;
3294         }
3295
3296         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3297
3298         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3299         map = rcu_dereference(rxqueue->rps_map);
3300         if (!flow_table && !map)
3301                 goto done;
3302
3303         skb_reset_network_header(skb);
3304         hash = skb_get_hash(skb);
3305         if (!hash)
3306                 goto done;
3307
3308         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3309         if (flow_table && sock_flow_table) {
3310                 struct rps_dev_flow *rflow;
3311                 u32 next_cpu;
3312                 u32 ident;
3313
3314                 /* First, check the global flow table for a match */
3315                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3316                 if ((ident ^ hash) & ~rps_cpu_mask)
3317                         goto try_rps;
3318
3319                 next_cpu = ident & rps_cpu_mask;
3320
3321                 /* OK, now we know there is a match;
3322                  * we can look at the local (per-receive-queue) flow table.
3323                  */
3324                 rflow = &flow_table->flows[hash & flow_table->mask];
3325                 tcpu = rflow->cpu;
3326
3327                 /*
3328                  * If the desired CPU (where last recvmsg was done) is
3329                  * different from current CPU (one in the rx-queue flow
3330                  * table entry), switch if one of the following holds:
3331                  *   - Current CPU is unset (>= nr_cpu_ids).
3332                  *   - Current CPU is offline.
3333                  *   - The current CPU's queue tail has advanced beyond the
3334                  *     last packet that was enqueued using this table entry.
3335                  *     This guarantees that all previous packets for the flow
3336                  *     have been dequeued, thus preserving in order delivery.
3337                  */
3338                 if (unlikely(tcpu != next_cpu) &&
3339                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3340                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3341                       rflow->last_qtail)) >= 0)) {
3342                         tcpu = next_cpu;
3343                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3344                 }
3345
3346                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3347                         *rflowp = rflow;
3348                         cpu = tcpu;
3349                         goto done;
3350                 }
3351         }
3352
3353 try_rps:
3354
3355         if (map) {
3356                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3357                 if (cpu_online(tcpu)) {
3358                         cpu = tcpu;
3359                         goto done;
3360                 }
3361         }
3362
3363 done:
3364         return cpu;
3365 }
3366
3367 #ifdef CONFIG_RFS_ACCEL
3368
3369 /**
3370  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3371  * @dev: Device on which the filter was set
3372  * @rxq_index: RX queue index
3373  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3374  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3375  *
3376  * Drivers that implement ndo_rx_flow_steer() should periodically call
3377  * this function for each installed filter and remove the filters for
3378  * which it returns %true.
3379  */
3380 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3381                          u32 flow_id, u16 filter_id)
3382 {
3383         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3384         struct rps_dev_flow_table *flow_table;
3385         struct rps_dev_flow *rflow;
3386         bool expire = true;
3387         unsigned int cpu;
3388
3389         rcu_read_lock();
3390         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3391         if (flow_table && flow_id <= flow_table->mask) {
3392                 rflow = &flow_table->flows[flow_id];
3393                 cpu = ACCESS_ONCE(rflow->cpu);
3394                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3395                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3396                            rflow->last_qtail) <
3397                      (int)(10 * flow_table->mask)))
3398                         expire = false;
3399         }
3400         rcu_read_unlock();
3401         return expire;
3402 }
3403 EXPORT_SYMBOL(rps_may_expire_flow);
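
/* Example (a minimal sketch, not part of this file): a driver that installed
 * hardware flow-steering filters from ndo_rx_flow_steer() can periodically
 * scan them and remove the ones the stack no longer needs.  The filter list,
 * its locking and the "my_hw_del_filter" helper are hypothetical.
 *
 *	static void my_expire_rfs_filters(struct my_adapter *adap)
 *	{
 *		struct my_rfs_filter *f, *tmp;
 *
 *		spin_lock_bh(&adap->rfs_lock);
 *		list_for_each_entry_safe(f, tmp, &adap->rfs_filters, list) {
 *			if (!rps_may_expire_flow(adap->netdev, f->rxq_index,
 *						 f->flow_id, f->filter_id))
 *				continue;
 *			my_hw_del_filter(adap, f->filter_id);
 *			list_del(&f->list);
 *			kfree(f);
 *		}
 *		spin_unlock_bh(&adap->rfs_lock);
 *	}
 */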
3404
3405 #endif /* CONFIG_RFS_ACCEL */
3406
3407 /* Called from hardirq (IPI) context */
3408 static void rps_trigger_softirq(void *data)
3409 {
3410         struct softnet_data *sd = data;
3411
3412         ____napi_schedule(sd, &sd->backlog);
3413         sd->received_rps++;
3414 }
3415
3416 #endif /* CONFIG_RPS */
3417
3418 /*
3419  * Check whether this softnet_data structure belongs to another CPU.
3420  * If so, queue it on our IPI list and return 1.
3421  * If not, return 0.
3422  */
3423 static int rps_ipi_queued(struct softnet_data *sd)
3424 {
3425 #ifdef CONFIG_RPS
3426         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3427
3428         if (sd != mysd) {
3429                 sd->rps_ipi_next = mysd->rps_ipi_list;
3430                 mysd->rps_ipi_list = sd;
3431
3432                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3433                 return 1;
3434         }
3435 #endif /* CONFIG_RPS */
3436         return 0;
3437 }
3438
3439 #ifdef CONFIG_NET_FLOW_LIMIT
3440 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3441 #endif
3442
3443 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3444 {
3445 #ifdef CONFIG_NET_FLOW_LIMIT
3446         struct sd_flow_limit *fl;
3447         struct softnet_data *sd;
3448         unsigned int old_flow, new_flow;
3449
3450         if (qlen < (netdev_max_backlog >> 1))
3451                 return false;
3452
3453         sd = this_cpu_ptr(&softnet_data);
3454
3455         rcu_read_lock();
3456         fl = rcu_dereference(sd->flow_limit);
3457         if (fl) {
3458                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3459                 old_flow = fl->history[fl->history_head];
3460                 fl->history[fl->history_head] = new_flow;
3461
3462                 fl->history_head++;
3463                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3464
3465                 if (likely(fl->buckets[old_flow]))
3466                         fl->buckets[old_flow]--;
3467
3468                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3469                         fl->count++;
3470                         rcu_read_unlock();
3471                         return true;
3472                 }
3473         }
3474         rcu_read_unlock();
3475 #endif
3476         return false;
3477 }
3478
3479 /*
3480  * enqueue_to_backlog() is called to queue an skb on a per-CPU backlog
3481  * queue (which may be a remote CPU's queue).
3482  */
3483 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3484                               unsigned int *qtail)
3485 {
3486         struct softnet_data *sd;
3487         unsigned long flags;
3488         unsigned int qlen;
3489
3490         sd = &per_cpu(softnet_data, cpu);
3491
3492         local_irq_save(flags);
3493
3494         rps_lock(sd);
3495         if (!netif_running(skb->dev))
3496                 goto drop;
3497         qlen = skb_queue_len(&sd->input_pkt_queue);
3498         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3499                 if (qlen) {
3500 enqueue:
3501                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3502                         input_queue_tail_incr_save(sd, qtail);
3503                         rps_unlock(sd);
3504                         local_irq_restore(flags);
3505                         return NET_RX_SUCCESS;
3506                 }
3507
3508                 /* Schedule NAPI for the backlog device.
3509                  * We can use a non-atomic operation since we own the queue lock.
3510                  */
3511                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3512                         if (!rps_ipi_queued(sd))
3513                                 ____napi_schedule(sd, &sd->backlog);
3514                 }
3515                 goto enqueue;
3516         }
3517
3518 drop:
3519         sd->dropped++;
3520         rps_unlock(sd);
3521
3522         local_irq_restore(flags);
3523
3524         atomic_long_inc(&skb->dev->rx_dropped);
3525         kfree_skb(skb);
3526         return NET_RX_DROP;
3527 }
3528
3529 static int netif_rx_internal(struct sk_buff *skb)
3530 {
3531         int ret;
3532
3533         net_timestamp_check(netdev_tstamp_prequeue, skb);
3534
3535         trace_netif_rx(skb);
3536 #ifdef CONFIG_RPS
3537         if (static_key_false(&rps_needed)) {
3538                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3539                 int cpu;
3540
3541                 preempt_disable();
3542                 rcu_read_lock();
3543
3544                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3545                 if (cpu < 0)
3546                         cpu = smp_processor_id();
3547
3548                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3549
3550                 rcu_read_unlock();
3551                 preempt_enable();
3552         } else
3553 #endif
3554         {
3555                 unsigned int qtail;
3556                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3557                 put_cpu();
3558         }
3559         return ret;
3560 }
3561
3562 /**
3563  *      netif_rx        -       post buffer to the network code
3564  *      @skb: buffer to post
3565  *
3566  *      This function receives a packet from a device driver and queues it for
3567  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3568  *      may be dropped during processing for congestion control or by the
3569  *      protocol layers.
3570  *
3571  *      return values:
3572  *      NET_RX_SUCCESS  (no congestion)
3573  *      NET_RX_DROP     (packet was dropped)
3574  *
3575  */
3576
3577 int netif_rx(struct sk_buff *skb)
3578 {
3579         trace_netif_rx_entry(skb);
3580
3581         return netif_rx_internal(skb);
3582 }
3583 EXPORT_SYMBOL(netif_rx);
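
/* Example (a minimal sketch, not part of this file): a simple non-NAPI
 * driver's receive path copies a frame into an skb and hands it to
 * netif_rx().  "my_read_frame" and the length handling are hypothetical;
 * code running in process context would use netif_rx_ni() instead.
 *
 *	static void my_rx_one(struct net_device *dev, unsigned int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = netdev_alloc_skb_ip_align(dev, len);
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		my_read_frame(dev, skb_put(skb, len), len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *		dev->stats.rx_packets++;
 *		dev->stats.rx_bytes += len;
 *	}
 */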
3584
3585 int netif_rx_ni(struct sk_buff *skb)
3586 {
3587         int err;
3588
3589         trace_netif_rx_ni_entry(skb);
3590
3591         preempt_disable();
3592         err = netif_rx_internal(skb);
3593         if (local_softirq_pending())
3594                 do_softirq();
3595         preempt_enable();
3596
3597         return err;
3598 }
3599 EXPORT_SYMBOL(netif_rx_ni);
3600
3601 static void net_tx_action(struct softirq_action *h)
3602 {
3603         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3604
3605         if (sd->completion_queue) {
3606                 struct sk_buff *clist;
3607
3608                 local_irq_disable();
3609                 clist = sd->completion_queue;
3610                 sd->completion_queue = NULL;
3611                 local_irq_enable();
3612
3613                 while (clist) {
3614                         struct sk_buff *skb = clist;
3615                         clist = clist->next;
3616
3617                         WARN_ON(atomic_read(&skb->users));
3618                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3619                                 trace_consume_skb(skb);
3620                         else
3621                                 trace_kfree_skb(skb, net_tx_action);
3622                         __kfree_skb(skb);
3623                 }
3624         }
3625
3626         if (sd->output_queue) {
3627                 struct Qdisc *head;
3628
3629                 local_irq_disable();
3630                 head = sd->output_queue;
3631                 sd->output_queue = NULL;
3632                 sd->output_queue_tailp = &sd->output_queue;
3633                 local_irq_enable();
3634
3635                 while (head) {
3636                         struct Qdisc *q = head;
3637                         spinlock_t *root_lock;
3638
3639                         head = head->next_sched;
3640
3641                         root_lock = qdisc_lock(q);
3642                         if (spin_trylock(root_lock)) {
3643                                 smp_mb__before_atomic();
3644                                 clear_bit(__QDISC_STATE_SCHED,
3645                                           &q->state);
3646                                 qdisc_run(q);
3647                                 spin_unlock(root_lock);
3648                         } else {
3649                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3650                                               &q->state)) {
3651                                         __netif_reschedule(q);
3652                                 } else {
3653                                         smp_mb__before_atomic();
3654                                         clear_bit(__QDISC_STATE_SCHED,
3655                                                   &q->state);
3656                                 }
3657                         }
3658                 }
3659         }
3660 }
3661
3662 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3663     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3664 /* This hook is defined here for ATM LANE */
3665 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3666                              unsigned char *addr) __read_mostly;
3667 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3668 #endif
3669
3670 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3671                                          struct packet_type **pt_prev,
3672                                          int *ret, struct net_device *orig_dev)
3673 {
3674 #ifdef CONFIG_NET_CLS_ACT
3675         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3676         struct tcf_result cl_res;
3677
3678         /* If there's at least one ingress present somewhere (so
3679          * we get here via enabled static key), remaining devices
3680          * that are not configured with an ingress qdisc will bail
3681          * out here.
3682          */
3683         if (!cl)
3684                 return skb;
3685         if (*pt_prev) {
3686                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3687                 *pt_prev = NULL;
3688         }
3689
3690         qdisc_skb_cb(skb)->pkt_len = skb->len;
3691         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3692         qdisc_bstats_cpu_update(cl->q, skb);
3693
3694         switch (tc_classify(skb, cl, &cl_res, false)) {
3695         case TC_ACT_OK:
3696         case TC_ACT_RECLASSIFY:
3697                 skb->tc_index = TC_H_MIN(cl_res.classid);
3698                 break;
3699         case TC_ACT_SHOT:
3700                 qdisc_qstats_cpu_drop(cl->q);
3701         case TC_ACT_STOLEN:
3702         case TC_ACT_QUEUED:
3703                 kfree_skb(skb);
3704                 return NULL;
3705         case TC_ACT_REDIRECT:
3706                 /* skb_mac_header check was done by cls/act_bpf, so
3707                  * we can safely push the L2 header back before
3708                  * redirecting to another netdev
3709                  */
3710                 __skb_push(skb, skb->mac_len);
3711                 skb_do_redirect(skb);
3712                 return NULL;
3713         default:
3714                 break;
3715         }
3716 #endif /* CONFIG_NET_CLS_ACT */
3717         return skb;
3718 }
3719
3720 /**
3721  *      netdev_is_rx_handler_busy - check if receive handler is registered
3722  *      @dev: device to check
3723  *
3724  *      Check if a receive handler is already registered for a given device.
3725  *      Return true if there is one.
3726  *
3727  *      The caller must hold the rtnl_mutex.
3728  */
3729 bool netdev_is_rx_handler_busy(struct net_device *dev)
3730 {
3731         ASSERT_RTNL();
3732         return dev && rtnl_dereference(dev->rx_handler);
3733 }
3734 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3735
3736 /**
3737  *      netdev_rx_handler_register - register receive handler
3738  *      @dev: device to register a handler for
3739  *      @rx_handler: receive handler to register
3740  *      @rx_handler_data: data pointer that is used by rx handler
3741  *
3742  *      Register a receive handler for a device. This handler will then be
3743  *      called from __netif_receive_skb. A negative errno code is returned
3744  *      on a failure.
3745  *
3746  *      The caller must hold the rtnl_mutex.
3747  *
3748  *      For a general description of rx_handler, see enum rx_handler_result.
3749  */
3750 int netdev_rx_handler_register(struct net_device *dev,
3751                                rx_handler_func_t *rx_handler,
3752                                void *rx_handler_data)
3753 {
3754         ASSERT_RTNL();
3755
3756         if (dev->rx_handler)
3757                 return -EBUSY;
3758
3759         /* Note: rx_handler_data must be set before rx_handler */
3760         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3761         rcu_assign_pointer(dev->rx_handler, rx_handler);
3762
3763         return 0;
3764 }
3765 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
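
/* Example (a minimal sketch, not part of this file): how an upper device,
 * in the spirit of bridge/macvlan/bonding, claims a port's receive path.
 * "my_port" is a hypothetical per-port structure; the handler simply steers
 * every frame to the upper device and asks the core to re-run the receive
 * path via RX_HANDLER_ANOTHER.  A real handler would also deal with shared
 * skbs and link-local frames.
 *
 *	static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct my_port *port;
 *
 *		port = rcu_dereference(skb->dev->rx_handler_data);
 *		skb->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;
 *	}
 *
 *	int my_port_attach(struct net_device *port_dev, struct my_port *port)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = netdev_rx_handler_register(port_dev, my_handle_frame, port);
 *		rtnl_unlock();
 *		return err;
 *	}
 */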
3766
3767 /**
3768  *      netdev_rx_handler_unregister - unregister receive handler
3769  *      @dev: device to unregister a handler from
3770  *
3771  *      Unregister a receive handler from a device.
3772  *
3773  *      The caller must hold the rtnl_mutex.
3774  */
3775 void netdev_rx_handler_unregister(struct net_device *dev)
3776 {
3777
3778         ASSERT_RTNL();
3779         RCU_INIT_POINTER(dev->rx_handler, NULL);
3780         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3781          * section is guaranteed to see a non-NULL rx_handler_data
3782          * as well.
3783          */
3784         synchronize_net();
3785         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3786 }
3787 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3788
3789 /*
3790  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3791  * the special handling of PFMEMALLOC skbs.
3792  */
3793 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3794 {
3795         switch (skb->protocol) {
3796         case htons(ETH_P_ARP):
3797         case htons(ETH_P_IP):
3798         case htons(ETH_P_IPV6):
3799         case htons(ETH_P_8021Q):
3800         case htons(ETH_P_8021AD):
3801                 return true;
3802         default:
3803                 return false;
3804         }
3805 }
3806
3807 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3808                              int *ret, struct net_device *orig_dev)
3809 {
3810 #ifdef CONFIG_NETFILTER_INGRESS
3811         if (nf_hook_ingress_active(skb)) {
3812                 if (*pt_prev) {
3813                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3814                         *pt_prev = NULL;
3815                 }
3816
3817                 return nf_hook_ingress(skb);
3818         }
3819 #endif /* CONFIG_NETFILTER_INGRESS */
3820         return 0;
3821 }
3822
3823 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3824 {
3825         struct packet_type *ptype, *pt_prev;
3826         rx_handler_func_t *rx_handler;
3827         struct net_device *orig_dev;
3828         bool deliver_exact = false;
3829         int ret = NET_RX_DROP;
3830         __be16 type;
3831
3832         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3833
3834         trace_netif_receive_skb(skb);
3835
3836         orig_dev = skb->dev;
3837
3838         skb_reset_network_header(skb);
3839         if (!skb_transport_header_was_set(skb))
3840                 skb_reset_transport_header(skb);
3841         skb_reset_mac_len(skb);
3842
3843         pt_prev = NULL;
3844
3845 another_round:
3846         skb->skb_iif = skb->dev->ifindex;
3847
3848         __this_cpu_inc(softnet_data.processed);
3849
3850         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3851             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3852                 skb = skb_vlan_untag(skb);
3853                 if (unlikely(!skb))
3854                         goto out;
3855         }
3856
3857 #ifdef CONFIG_NET_CLS_ACT
3858         if (skb->tc_verd & TC_NCLS) {
3859                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3860                 goto ncls;
3861         }
3862 #endif
3863
3864         if (pfmemalloc)
3865                 goto skip_taps;
3866
3867         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3868                 if (pt_prev)
3869                         ret = deliver_skb(skb, pt_prev, orig_dev);
3870                 pt_prev = ptype;
3871         }
3872
3873         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3874                 if (pt_prev)
3875                         ret = deliver_skb(skb, pt_prev, orig_dev);
3876                 pt_prev = ptype;
3877         }
3878
3879 skip_taps:
3880 #ifdef CONFIG_NET_INGRESS
3881         if (static_key_false(&ingress_needed)) {
3882                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3883                 if (!skb)
3884                         goto out;
3885
3886                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3887                         goto out;
3888         }
3889 #endif
3890 #ifdef CONFIG_NET_CLS_ACT
3891         skb->tc_verd = 0;
3892 ncls:
3893 #endif
3894         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3895                 goto drop;
3896
3897         if (skb_vlan_tag_present(skb)) {
3898                 if (pt_prev) {
3899                         ret = deliver_skb(skb, pt_prev, orig_dev);
3900                         pt_prev = NULL;
3901                 }
3902                 if (vlan_do_receive(&skb))
3903                         goto another_round;
3904                 else if (unlikely(!skb))
3905                         goto out;
3906         }
3907
3908         rx_handler = rcu_dereference(skb->dev->rx_handler);
3909         if (rx_handler) {
3910                 if (pt_prev) {
3911                         ret = deliver_skb(skb, pt_prev, orig_dev);
3912                         pt_prev = NULL;
3913                 }
3914                 switch (rx_handler(&skb)) {
3915                 case RX_HANDLER_CONSUMED:
3916                         ret = NET_RX_SUCCESS;
3917                         goto out;
3918                 case RX_HANDLER_ANOTHER:
3919                         goto another_round;
3920                 case RX_HANDLER_EXACT:
3921                         deliver_exact = true;
3922                 case RX_HANDLER_PASS:
3923                         break;
3924                 default:
3925                         BUG();
3926                 }
3927         }
3928
3929         if (unlikely(skb_vlan_tag_present(skb))) {
3930                 if (skb_vlan_tag_get_id(skb))
3931                         skb->pkt_type = PACKET_OTHERHOST;
3932                 /* Note: we might in the future use the prio bits
3933                  * and set skb->priority as in vlan_do_receive().
3934                  * For the time being, just ignore the Priority Code Point.
3935                  */
3936                 skb->vlan_tci = 0;
3937         }
3938
3939         type = skb->protocol;
3940
3941         /* deliver only exact match when indicated */
3942         if (likely(!deliver_exact)) {
3943                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3944                                        &ptype_base[ntohs(type) &
3945                                                    PTYPE_HASH_MASK]);
3946         }
3947
3948         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3949                                &orig_dev->ptype_specific);
3950
3951         if (unlikely(skb->dev != orig_dev)) {
3952                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3953                                        &skb->dev->ptype_specific);
3954         }
3955
3956         if (pt_prev) {
3957                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3958                         goto drop;
3959                 else
3960                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3961         } else {
3962 drop:
3963                 atomic_long_inc(&skb->dev->rx_dropped);
3964                 kfree_skb(skb);
3965                 /* Jamal, now you will not be able to escape explaining
3966                  * to me how you were going to use this. :-)
3967                  */
3968                 ret = NET_RX_DROP;
3969         }
3970
3971 out:
3972         return ret;
3973 }
3974
3975 static int __netif_receive_skb(struct sk_buff *skb)
3976 {
3977         int ret;
3978
3979         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3980                 unsigned long pflags = current->flags;
3981
3982                 /*
3983                  * PFMEMALLOC skbs are special, they should
3984                  * - be delivered to SOCK_MEMALLOC sockets only
3985                  * - stay away from userspace
3986                  * - have bounded memory usage
3987                  *
3988                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3989                  * context down to all allocation sites.
3990                  */
3991                 current->flags |= PF_MEMALLOC;
3992                 ret = __netif_receive_skb_core(skb, true);
3993                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3994         } else
3995                 ret = __netif_receive_skb_core(skb, false);
3996
3997         return ret;
3998 }
3999
4000 static int netif_receive_skb_internal(struct sk_buff *skb)
4001 {
4002         int ret;
4003
4004         net_timestamp_check(netdev_tstamp_prequeue, skb);
4005
4006         if (skb_defer_rx_timestamp(skb))
4007                 return NET_RX_SUCCESS;
4008
4009         rcu_read_lock();
4010
4011 #ifdef CONFIG_RPS
4012         if (static_key_false(&rps_needed)) {
4013                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4014                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4015
4016                 if (cpu >= 0) {
4017                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4018                         rcu_read_unlock();
4019                         return ret;
4020                 }
4021         }
4022 #endif
4023         ret = __netif_receive_skb(skb);
4024         rcu_read_unlock();
4025         return ret;
4026 }
4027
4028 /**
4029  *      netif_receive_skb - process receive buffer from network
4030  *      @skb: buffer to process
4031  *
4032  *      netif_receive_skb() is the main receive data processing function.
4033  *      It always succeeds. The buffer may be dropped during processing
4034  *      for congestion control or by the protocol layers.
4035  *
4036  *      This function may only be called from softirq context and interrupts
4037  *      should be enabled.
4038  *
4039  *      Return values (usually ignored):
4040  *      NET_RX_SUCCESS: no congestion
4041  *      NET_RX_DROP: packet was dropped
4042  */
4043 int netif_receive_skb(struct sk_buff *skb)
4044 {
4045         trace_netif_receive_skb_entry(skb);
4046
4047         return netif_receive_skb_internal(skb);
4048 }
4049 EXPORT_SYMBOL(netif_receive_skb);
4050
4051 /* The network device is going away; flush any packets still pending.
4052  * Called with irqs disabled.
4053  */
4054 static void flush_backlog(void *arg)
4055 {
4056         struct net_device *dev = arg;
4057         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4058         struct sk_buff *skb, *tmp;
4059
4060         rps_lock(sd);
4061         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4062                 if (skb->dev == dev) {
4063                         __skb_unlink(skb, &sd->input_pkt_queue);
4064                         kfree_skb(skb);
4065                         input_queue_head_incr(sd);
4066                 }
4067         }
4068         rps_unlock(sd);
4069
4070         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4071                 if (skb->dev == dev) {
4072                         __skb_unlink(skb, &sd->process_queue);
4073                         kfree_skb(skb);
4074                         input_queue_head_incr(sd);
4075                 }
4076         }
4077 }
4078
4079 static int napi_gro_complete(struct sk_buff *skb)
4080 {
4081         struct packet_offload *ptype;
4082         __be16 type = skb->protocol;
4083         struct list_head *head = &offload_base;
4084         int err = -ENOENT;
4085
4086         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4087
4088         if (NAPI_GRO_CB(skb)->count == 1) {
4089                 skb_shinfo(skb)->gso_size = 0;
4090                 goto out;
4091         }
4092
4093         rcu_read_lock();
4094         list_for_each_entry_rcu(ptype, head, list) {
4095                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4096                         continue;
4097
4098                 err = ptype->callbacks.gro_complete(skb, 0);
4099                 break;
4100         }
4101         rcu_read_unlock();
4102
4103         if (err) {
4104                 WARN_ON(&ptype->list == head);
4105                 kfree_skb(skb);
4106                 return NET_RX_SUCCESS;
4107         }
4108
4109 out:
4110         return netif_receive_skb_internal(skb);
4111 }
4112
4113 /* napi->gro_list contains packets ordered by age, with the
4114  * youngest packets at the head.
4115  * Complete skbs in reverse order (oldest first) to reduce latencies.
4116  */
4117 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4118 {
4119         struct sk_buff *skb, *prev = NULL;
4120
4121         /* scan list and build reverse chain */
4122         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4123                 skb->prev = prev;
4124                 prev = skb;
4125         }
4126
4127         for (skb = prev; skb; skb = prev) {
4128                 skb->next = NULL;
4129
4130                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4131                         return;
4132
4133                 prev = skb->prev;
4134                 napi_gro_complete(skb);
4135                 napi->gro_count--;
4136         }
4137
4138         napi->gro_list = NULL;
4139 }
4140 EXPORT_SYMBOL(napi_gro_flush);
4141
4142 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4143 {
4144         struct sk_buff *p;
4145         unsigned int maclen = skb->dev->hard_header_len;
4146         u32 hash = skb_get_hash_raw(skb);
4147
4148         for (p = napi->gro_list; p; p = p->next) {
4149                 unsigned long diffs;
4150
4151                 NAPI_GRO_CB(p)->flush = 0;
4152
4153                 if (hash != skb_get_hash_raw(p)) {
4154                         NAPI_GRO_CB(p)->same_flow = 0;
4155                         continue;
4156                 }
4157
4158                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4159                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4160                 diffs |= skb_metadata_dst_cmp(p, skb);
4161                 if (maclen == ETH_HLEN)
4162                         diffs |= compare_ether_header(skb_mac_header(p),
4163                                                       skb_mac_header(skb));
4164                 else if (!diffs)
4165                         diffs = memcmp(skb_mac_header(p),
4166                                        skb_mac_header(skb),
4167                                        maclen);
4168                 NAPI_GRO_CB(p)->same_flow = !diffs;
4169         }
4170 }
4171
4172 static void skb_gro_reset_offset(struct sk_buff *skb)
4173 {
4174         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4175         const skb_frag_t *frag0 = &pinfo->frags[0];
4176
4177         NAPI_GRO_CB(skb)->data_offset = 0;
4178         NAPI_GRO_CB(skb)->frag0 = NULL;
4179         NAPI_GRO_CB(skb)->frag0_len = 0;
4180
4181         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4182             pinfo->nr_frags &&
4183             !PageHighMem(skb_frag_page(frag0))) {
4184                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4185                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4186                                                     skb_frag_size(frag0),
4187                                                     skb->end - skb->tail);
4188         }
4189 }
4190
4191 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4192 {
4193         struct skb_shared_info *pinfo = skb_shinfo(skb);
4194
4195         BUG_ON(skb->end - skb->tail < grow);
4196
4197         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4198
4199         skb->data_len -= grow;
4200         skb->tail += grow;
4201
4202         pinfo->frags[0].page_offset += grow;
4203         skb_frag_size_sub(&pinfo->frags[0], grow);
4204
4205         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4206                 skb_frag_unref(skb, 0);
4207                 memmove(pinfo->frags, pinfo->frags + 1,
4208                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4209         }
4210 }
4211
4212 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4213 {
4214         struct sk_buff **pp = NULL;
4215         struct packet_offload *ptype;
4216         __be16 type = skb->protocol;
4217         struct list_head *head = &offload_base;
4218         int same_flow;
4219         enum gro_result ret;
4220         int grow;
4221
4222         if (!(skb->dev->features & NETIF_F_GRO))
4223                 goto normal;
4224
4225         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4226                 goto normal;
4227
4228         gro_list_prepare(napi, skb);
4229
4230         rcu_read_lock();
4231         list_for_each_entry_rcu(ptype, head, list) {
4232                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4233                         continue;
4234
4235                 skb_set_network_header(skb, skb_gro_offset(skb));
4236                 skb_reset_mac_len(skb);
4237                 NAPI_GRO_CB(skb)->same_flow = 0;
4238                 NAPI_GRO_CB(skb)->flush = 0;
4239                 NAPI_GRO_CB(skb)->free = 0;
4240                 NAPI_GRO_CB(skb)->encap_mark = 0;
4241                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4242                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4243
4244                 /* Setup for GRO checksum validation */
4245                 switch (skb->ip_summed) {
4246                 case CHECKSUM_COMPLETE:
4247                         NAPI_GRO_CB(skb)->csum = skb->csum;
4248                         NAPI_GRO_CB(skb)->csum_valid = 1;
4249                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4250                         break;
4251                 case CHECKSUM_UNNECESSARY:
4252                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4253                         NAPI_GRO_CB(skb)->csum_valid = 0;
4254                         break;
4255                 default:
4256                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4257                         NAPI_GRO_CB(skb)->csum_valid = 0;
4258                 }
4259
4260                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4261                 break;
4262         }
4263         rcu_read_unlock();
4264
4265         if (&ptype->list == head)
4266                 goto normal;
4267
4268         same_flow = NAPI_GRO_CB(skb)->same_flow;
4269         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4270
4271         if (pp) {
4272                 struct sk_buff *nskb = *pp;
4273
4274                 *pp = nskb->next;
4275                 nskb->next = NULL;
4276                 napi_gro_complete(nskb);
4277                 napi->gro_count--;
4278         }
4279
4280         if (same_flow)
4281                 goto ok;
4282
4283         if (NAPI_GRO_CB(skb)->flush)
4284                 goto normal;
4285
4286         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4287                 struct sk_buff *nskb = napi->gro_list;
4288
4289                 /* locate the end of the list to select the 'oldest' flow */
4290                 while (nskb->next) {
4291                         pp = &nskb->next;
4292                         nskb = *pp;
4293                 }
4294                 *pp = NULL;
4295                 nskb->next = NULL;
4296                 napi_gro_complete(nskb);
4297         } else {
4298                 napi->gro_count++;
4299         }
4300         NAPI_GRO_CB(skb)->count = 1;
4301         NAPI_GRO_CB(skb)->age = jiffies;
4302         NAPI_GRO_CB(skb)->last = skb;
4303         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4304         skb->next = napi->gro_list;
4305         napi->gro_list = skb;
4306         ret = GRO_HELD;
4307
4308 pull:
4309         grow = skb_gro_offset(skb) - skb_headlen(skb);
4310         if (grow > 0)
4311                 gro_pull_from_frag0(skb, grow);
4312 ok:
4313         return ret;
4314
4315 normal:
4316         ret = GRO_NORMAL;
4317         goto pull;
4318 }
4319
4320 struct packet_offload *gro_find_receive_by_type(__be16 type)
4321 {
4322         struct list_head *offload_head = &offload_base;
4323         struct packet_offload *ptype;
4324
4325         list_for_each_entry_rcu(ptype, offload_head, list) {
4326                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4327                         continue;
4328                 return ptype;
4329         }
4330         return NULL;
4331 }
4332 EXPORT_SYMBOL(gro_find_receive_by_type);
4333
4334 struct packet_offload *gro_find_complete_by_type(__be16 type)
4335 {
4336         struct list_head *offload_head = &offload_base;
4337         struct packet_offload *ptype;
4338
4339         list_for_each_entry_rcu(ptype, offload_head, list) {
4340                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4341                         continue;
4342                 return ptype;
4343         }
4344         return NULL;
4345 }
4346 EXPORT_SYMBOL(gro_find_complete_by_type);
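
/* Example (a minimal sketch, not part of this file): an encapsulation
 * offload can use the lookups above to delegate GRO to the inner protocol,
 * much like the vxlan/geneve offloads do.  "my_hdr" and "my_hdrlen" stand in
 * for the real tunnel header parsing, and the flush/same_flow bookkeeping a
 * complete implementation needs is elided.
 *
 *	static struct sk_buff **my_gro_receive(struct sk_buff **head,
 *					       struct sk_buff *skb)
 *	{
 *		struct sk_buff **pp = NULL;
 *		struct packet_offload *ptype;
 *		__be16 inner_type = my_hdr(skb)->proto;
 *
 *		rcu_read_lock();
 *		ptype = gro_find_receive_by_type(inner_type);
 *		if (ptype) {
 *			skb_gro_pull(skb, my_hdrlen);
 *			pp = ptype->callbacks.gro_receive(head, skb);
 *		}
 *		rcu_read_unlock();
 *		return pp;
 *	}
 */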
4347
4348 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4349 {
4350         switch (ret) {
4351         case GRO_NORMAL:
4352                 if (netif_receive_skb_internal(skb))
4353                         ret = GRO_DROP;
4354                 break;
4355
4356         case GRO_DROP:
4357                 kfree_skb(skb);
4358                 break;
4359
4360         case GRO_MERGED_FREE:
4361                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4362                         skb_dst_drop(skb);
4363                         kmem_cache_free(skbuff_head_cache, skb);
4364                 } else {
4365                         __kfree_skb(skb);
4366                 }
4367                 break;
4368
4369         case GRO_HELD:
4370         case GRO_MERGED:
4371                 break;
4372         }
4373
4374         return ret;
4375 }
4376
4377 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4378 {
4379         trace_napi_gro_receive_entry(skb);
4380
4381         skb_gro_reset_offset(skb);
4382
4383         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4384 }
4385 EXPORT_SYMBOL(napi_gro_receive);
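
/* Example (a minimal sketch, not part of this file): the receive half of a
 * NAPI poll loop feeding completed buffers into GRO.  "my_rx_ring_next" is a
 * hypothetical helper that unmaps a descriptor and returns a ready skb, or
 * NULL when the ring is empty; see the sketch after napi_complete_done() for
 * the completion side of the contract.
 *
 *	static int my_poll_rx(struct my_ring *ring, int budget)
 *	{
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = my_rx_ring_next(ring);
 *
 *			if (!skb)
 *				break;
 *			skb->protocol = eth_type_trans(skb, ring->netdev);
 *			napi_gro_receive(&ring->napi, skb);
 *			work++;
 *		}
 *		return work;
 *	}
 */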
4386
4387 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4388 {
4389         if (unlikely(skb->pfmemalloc)) {
4390                 consume_skb(skb);
4391                 return;
4392         }
4393         __skb_pull(skb, skb_headlen(skb));
4394         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4395         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4396         skb->vlan_tci = 0;
4397         skb->dev = napi->dev;
4398         skb->skb_iif = 0;
4399         skb->encapsulation = 0;
4400         skb_shinfo(skb)->gso_type = 0;
4401         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4402
4403         napi->skb = skb;
4404 }
4405
4406 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4407 {
4408         struct sk_buff *skb = napi->skb;
4409
4410         if (!skb) {
4411                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4412                 napi->skb = skb;
4413         }
4414         return skb;
4415 }
4416 EXPORT_SYMBOL(napi_get_frags);
4417
4418 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4419                                       struct sk_buff *skb,
4420                                       gro_result_t ret)
4421 {
4422         switch (ret) {
4423         case GRO_NORMAL:
4424         case GRO_HELD:
4425                 __skb_push(skb, ETH_HLEN);
4426                 skb->protocol = eth_type_trans(skb, skb->dev);
4427                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4428                         ret = GRO_DROP;
4429                 break;
4430
4431         case GRO_DROP:
4432         case GRO_MERGED_FREE:
4433                 napi_reuse_skb(napi, skb);
4434                 break;
4435
4436         case GRO_MERGED:
4437                 break;
4438         }
4439
4440         return ret;
4441 }
4442
4443 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4444  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4445  * we copy the ethernet header into skb->data to have a common layout.
4446  */
4447 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4448 {
4449         struct sk_buff *skb = napi->skb;
4450         const struct ethhdr *eth;
4451         unsigned int hlen = sizeof(*eth);
4452
4453         napi->skb = NULL;
4454
4455         skb_reset_mac_header(skb);
4456         skb_gro_reset_offset(skb);
4457
4458         eth = skb_gro_header_fast(skb, 0);
4459         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4460                 eth = skb_gro_header_slow(skb, hlen, 0);
4461                 if (unlikely(!eth)) {
4462                         napi_reuse_skb(napi, skb);
4463                         return NULL;
4464                 }
4465         } else {
4466                 gro_pull_from_frag0(skb, hlen);
4467                 NAPI_GRO_CB(skb)->frag0 += hlen;
4468                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4469         }
4470         __skb_pull(skb, hlen);
4471
4472         /*
4473          * This works because the only protocols we care about don't require
4474          * special handling.
4475          * We'll fix it up properly in napi_frags_finish()
4476          */
4477         skb->protocol = eth->h_proto;
4478
4479         return skb;
4480 }
4481
4482 gro_result_t napi_gro_frags(struct napi_struct *napi)
4483 {
4484         struct sk_buff *skb = napi_frags_skb(napi);
4485
4486         if (!skb)
4487                 return GRO_DROP;
4488
4489         trace_napi_gro_frags_entry(skb);
4490
4491         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4492 }
4493 EXPORT_SYMBOL(napi_gro_frags);
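
/* Example (a minimal sketch, not part of this file): the page-based GRO API.
 * A driver that receives directly into pages can attach them to the skb that
 * napi_get_frags() provides and hand the result to napi_gro_frags(), which
 * recovers the Ethernet header from frag0 as shown above.  The page, offset
 * and length come from a hypothetical descriptor "desc".
 *
 *	static void my_rx_frag(struct napi_struct *napi, struct my_rx_desc *desc)
 *	{
 *		struct sk_buff *skb = napi_get_frags(napi);
 *
 *		if (!skb)
 *			return;		// allocation failed; a real driver
 *					// would recycle the page and count it
 *		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, desc->page,
 *				desc->offset, desc->len, PAGE_SIZE);
 *		napi_gro_frags(napi);
 *	}
 */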
4494
4495 /* Compute the checksum from gro_offset and return the folded value
4496  * after adding in any pseudo checksum.
4497  */
4498 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4499 {
4500         __wsum wsum;
4501         __sum16 sum;
4502
4503         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4504
4505         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4506         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4507         if (likely(!sum)) {
4508                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4509                     !skb->csum_complete_sw)
4510                         netdev_rx_csum_fault(skb->dev);
4511         }
4512
4513         NAPI_GRO_CB(skb)->csum = wsum;
4514         NAPI_GRO_CB(skb)->csum_valid = 1;
4515
4516         return sum;
4517 }
4518 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4519
4520 /*
4521  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4522  * Note: called with local irqs disabled, but exits with local irqs enabled.
4523  */
4524 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4525 {
4526 #ifdef CONFIG_RPS
4527         struct softnet_data *remsd = sd->rps_ipi_list;
4528
4529         if (remsd) {
4530                 sd->rps_ipi_list = NULL;
4531
4532                 local_irq_enable();
4533
4534                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4535                 while (remsd) {
4536                         struct softnet_data *next = remsd->rps_ipi_next;
4537
4538                         if (cpu_online(remsd->cpu))
4539                                 smp_call_function_single_async(remsd->cpu,
4540                                                            &remsd->csd);
4541                         remsd = next;
4542                 }
4543         } else
4544 #endif
4545                 local_irq_enable();
4546 }
4547
4548 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4549 {
4550 #ifdef CONFIG_RPS
4551         return sd->rps_ipi_list != NULL;
4552 #else
4553         return false;
4554 #endif
4555 }
4556
4557 static int process_backlog(struct napi_struct *napi, int quota)
4558 {
4559         int work = 0;
4560         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4561
4562         /* If we have pending IPIs, it is better to send them now
4563          * rather than wait for net_rx_action() to end.
4564          */
4565         if (sd_has_rps_ipi_waiting(sd)) {
4566                 local_irq_disable();
4567                 net_rps_action_and_irq_enable(sd);
4568         }
4569
4570         napi->weight = weight_p;
4571         local_irq_disable();
4572         while (1) {
4573                 struct sk_buff *skb;
4574
4575                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4576                         rcu_read_lock();
4577                         local_irq_enable();
4578                         __netif_receive_skb(skb);
4579                         rcu_read_unlock();
4580                         local_irq_disable();
4581                         input_queue_head_incr(sd);
4582                         if (++work >= quota) {
4583                                 local_irq_enable();
4584                                 return work;
4585                         }
4586                 }
4587
4588                 rps_lock(sd);
4589                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4590                         /*
4591                          * Inline a custom version of __napi_complete().
4592                          * Only the current CPU owns and manipulates this napi,
4593                          * and NAPI_STATE_SCHED is the only possible flag set
4594                          * on backlog.
4595                          * We can use a plain write instead of clear_bit(),
4596                          * and we don't need an smp_mb() memory barrier.
4597                          */
4598                         napi->state = 0;
4599                         rps_unlock(sd);
4600
4601                         break;
4602                 }
4603
4604                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4605                                            &sd->process_queue);
4606                 rps_unlock(sd);
4607         }
4608         local_irq_enable();
4609
4610         return work;
4611 }
4612
4613 /**
4614  * __napi_schedule - schedule for receive
4615  * @n: entry to schedule
4616  *
4617  * The entry's receive function will be scheduled to run.
4618  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4619  */
4620 void __napi_schedule(struct napi_struct *n)
4621 {
4622         unsigned long flags;
4623
4624         local_irq_save(flags);
4625         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4626         local_irq_restore(flags);
4627 }
4628 EXPORT_SYMBOL(__napi_schedule);
4629
4630 /**
4631  * __napi_schedule_irqoff - schedule for receive
4632  * @n: entry to schedule
4633  *
4634  * Variant of __napi_schedule() assuming hard irqs are masked
4635  */
4636 void __napi_schedule_irqoff(struct napi_struct *n)
4637 {
4638         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4639 }
4640 EXPORT_SYMBOL(__napi_schedule_irqoff);
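
/* Example (illustrative sketch; the foo_* names and private struct are
 * hypothetical, not taken from this file): a typical driver interrupt
 * handler masks device interrupts and defers the rest of the work to NAPI.
 * Because hard irqs are already masked there, the _irqoff variant applies:
 *
 *        static irqreturn_t foo_isr(int irq, void *data)
 *        {
 *                struct foo_priv *priv = data;
 *
 *                foo_mask_rx_irq(priv);                  // device specific
 *                napi_schedule_irqoff(&priv->napi);      // ends in ____napi_schedule()
 *                return IRQ_HANDLED;
 *        }
 */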
4641
4642 void __napi_complete(struct napi_struct *n)
4643 {
4644         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4645
4646         list_del_init(&n->poll_list);
4647         smp_mb__before_atomic();
4648         clear_bit(NAPI_STATE_SCHED, &n->state);
4649 }
4650 EXPORT_SYMBOL(__napi_complete);
4651
4652 void napi_complete_done(struct napi_struct *n, int work_done)
4653 {
4654         unsigned long flags;
4655
4656         /*
4657          * don't let napi dequeue from the cpu poll list
4658          * just in case it's running on a different cpu
4659          */
4660         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4661                 return;
4662
4663         if (n->gro_list) {
4664                 unsigned long timeout = 0;
4665
4666                 if (work_done)
4667                         timeout = n->dev->gro_flush_timeout;
4668
4669                 if (timeout)
4670                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4671                                       HRTIMER_MODE_REL_PINNED);
4672                 else
4673                         napi_gro_flush(n, false);
4674         }
4675         if (likely(list_empty(&n->poll_list))) {
4676                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4677         } else {
4678                 /* If n->poll_list is not empty, we need to mask irqs */
4679                 local_irq_save(flags);
4680                 __napi_complete(n);
4681                 local_irq_restore(flags);
4682         }
4683 }
4684 EXPORT_SYMBOL(napi_complete_done);
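
/* Example (illustrative sketch; foo_clean_rx() and foo_priv are hypothetical):
 * a driver poll routine consumes at most @budget packets and completes NAPI
 * only when it did less work than allowed, which is what lets
 * napi_complete_done() arm the optional gro_flush_timeout timer above:
 *
 *        static int foo_poll(struct napi_struct *napi, int budget)
 *        {
 *                struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
 *                int work_done = foo_clean_rx(priv, budget);
 *
 *                if (work_done < budget) {
 *                        napi_complete_done(napi, work_done);
 *                        foo_unmask_rx_irq(priv);        // device specific
 *                }
 *                return work_done;
 *        }
 */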
4685
4686 /* must be called under rcu_read_lock(), as we don't take a reference */
4687 struct napi_struct *napi_by_id(unsigned int napi_id)
4688 {
4689         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4690         struct napi_struct *napi;
4691
4692         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4693                 if (napi->napi_id == napi_id)
4694                         return napi;
4695
4696         return NULL;
4697 }
4698 EXPORT_SYMBOL_GPL(napi_by_id);
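
/* Example (illustrative sketch): because no reference is taken, the lookup
 * and every use of the result must stay inside one RCU read-side critical
 * section:
 *
 *        rcu_read_lock();
 *        napi = napi_by_id(napi_id);
 *        if (napi)
 *                ...use napi...
 *        rcu_read_unlock();
 */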
4699
4700 void napi_hash_add(struct napi_struct *napi)
4701 {
4702         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4703
4704                 spin_lock(&napi_hash_lock);
4705
4706                 /* 0 is not a valid id; we also skip an id that is already taken.
4707                  * We expect both events to be extremely rare.
4708                  */
4709                 napi->napi_id = 0;
4710                 while (!napi->napi_id) {
4711                         napi->napi_id = ++napi_gen_id;
4712                         if (napi_by_id(napi->napi_id))
4713                                 napi->napi_id = 0;
4714                 }
4715
4716                 hlist_add_head_rcu(&napi->napi_hash_node,
4717                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4718
4719                 spin_unlock(&napi_hash_lock);
4720         }
4721 }
4722 EXPORT_SYMBOL_GPL(napi_hash_add);
4723
4724 /* Warning: the caller is responsible for making sure an rcu grace period
4725  * has elapsed before freeing the memory containing @napi.
4726  */
4727 void napi_hash_del(struct napi_struct *napi)
4728 {
4729         spin_lock(&napi_hash_lock);
4730
4731         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4732                 hlist_del_rcu(&napi->napi_hash_node);
4733
4734         spin_unlock(&napi_hash_lock);
4735 }
4736 EXPORT_SYMBOL_GPL(napi_hash_del);
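
/* Example (illustrative sketch; priv is a hypothetical container of the
 * napi_struct): an RCU grace period must separate unhashing from freeing,
 * so that concurrent napi_by_id() users cannot touch freed memory:
 *
 *        napi_hash_del(&priv->napi);
 *        synchronize_net();
 *        kfree(priv);
 */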
4737
4738 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4739 {
4740         struct napi_struct *napi;
4741
4742         napi = container_of(timer, struct napi_struct, timer);
4743         if (napi->gro_list)
4744                 napi_schedule(napi);
4745
4746         return HRTIMER_NORESTART;
4747 }
4748
4749 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4750                     int (*poll)(struct napi_struct *, int), int weight)
4751 {
4752         INIT_LIST_HEAD(&napi->poll_list);
4753         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4754         napi->timer.function = napi_watchdog;
4755         napi->gro_count = 0;
4756         napi->gro_list = NULL;
4757         napi->skb = NULL;
4758         napi->poll = poll;
4759         if (weight > NAPI_POLL_WEIGHT)
4760                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4761                             weight, dev->name);
4762         napi->weight = weight;
4763         list_add(&napi->dev_list, &dev->napi_list);
4764         napi->dev = dev;
4765 #ifdef CONFIG_NETPOLL
4766         spin_lock_init(&napi->poll_lock);
4767         napi->poll_owner = -1;
4768 #endif
4769         set_bit(NAPI_STATE_SCHED, &napi->state);
4770 }
4771 EXPORT_SYMBOL(netif_napi_add);
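
/* Example (illustrative sketch; foo_poll() and priv are hypothetical): drivers
 * usually register their poll routine at probe time with the default weight
 * and enable it when the device is brought up:
 *
 *        netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 *        ...
 *        napi_enable(&priv->napi);       // clears the NAPI_STATE_SCHED bit set above
 */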
4772
4773 void napi_disable(struct napi_struct *n)
4774 {
4775         might_sleep();
4776         set_bit(NAPI_STATE_DISABLE, &n->state);
4777
4778         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4779                 msleep(1);
4780         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4781                 msleep(1);
4782
4783         hrtimer_cancel(&n->timer);
4784
4785         clear_bit(NAPI_STATE_DISABLE, &n->state);
4786 }
4787 EXPORT_SYMBOL(napi_disable);
4788
4789 void netif_napi_del(struct napi_struct *napi)
4790 {
4791         list_del_init(&napi->dev_list);
4792         napi_free_frags(napi);
4793
4794         kfree_skb_list(napi->gro_list);
4795         napi->gro_list = NULL;
4796         napi->gro_count = 0;
4797 }
4798 EXPORT_SYMBOL(netif_napi_del);
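
/* Example (illustrative sketch): teardown must disable NAPI before deleting
 * it, so no poll can still run while the GRO lists are being freed:
 *
 *        napi_disable(&priv->napi);
 *        netif_napi_del(&priv->napi);
 */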
4799
4800 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4801 {
4802         void *have;
4803         int work, weight;
4804
4805         list_del_init(&n->poll_list);
4806
4807         have = netpoll_poll_lock(n);
4808
4809         weight = n->weight;
4810
4811         /* This NAPI_STATE_SCHED test is for avoiding a race
4812          * with netpoll's poll_napi().  Only the entity which
4813          * obtains the lock and sees NAPI_STATE_SCHED set will
4814          * actually make the ->poll() call.  Therefore we avoid
4815          * accidentally calling ->poll() when NAPI is not scheduled.
4816          */
4817         work = 0;
4818         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4819                 work = n->poll(n, weight);
4820                 trace_napi_poll(n);
4821         }
4822
4823         WARN_ON_ONCE(work > weight);
4824
4825         if (likely(work < weight))
4826                 goto out_unlock;
4827
4828         /* Drivers must not modify the NAPI state if they
4829          * consume the entire weight.  In such cases this code
4830          * still "owns" the NAPI instance and therefore can
4831          * move the instance around on the list at-will.
4832          */
4833         if (unlikely(napi_disable_pending(n))) {
4834                 napi_complete(n);
4835                 goto out_unlock;
4836         }
4837
4838         if (n->gro_list) {
4839                 /* flush packets that are too old.
4840                  * If HZ < 1000, flush all packets.
4841                  */
4842                 napi_gro_flush(n, HZ >= 1000);
4843         }
4844
4845         /* Some drivers may have called napi_schedule
4846          * prior to exhausting their budget.
4847          */
4848         if (unlikely(!list_empty(&n->poll_list))) {
4849                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4850                              n->dev ? n->dev->name : "backlog");
4851                 goto out_unlock;
4852         }
4853
4854         list_add_tail(&n->poll_list, repoll);
4855
4856 out_unlock:
4857         netpoll_poll_unlock(have);
4858
4859         return work;
4860 }
4861
4862 static void net_rx_action(struct softirq_action *h)
4863 {
4864         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4865         unsigned long time_limit = jiffies + 2;
4866         int budget = netdev_budget;
4867         LIST_HEAD(list);
4868         LIST_HEAD(repoll);
4869
4870         local_irq_disable();
4871         list_splice_init(&sd->poll_list, &list);
4872         local_irq_enable();
4873
4874         for (;;) {
4875                 struct napi_struct *n;
4876
4877                 if (list_empty(&list)) {
4878                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4879                                 return;
4880                         break;
4881                 }
4882
4883                 n = list_first_entry(&list, struct napi_struct, poll_list);
4884                 budget -= napi_poll(n, &repoll);
4885
4886                 /* If the softirq window is exhausted then punt.
4887                  * Allow this to run for 2 jiffies, which allows
4888                  * an average latency of 1.5/HZ.
4889                  */
4890                 if (unlikely(budget <= 0 ||
4891                              time_after_eq(jiffies, time_limit))) {
4892                         sd->time_squeeze++;
4893                         break;
4894                 }
4895         }
4896
4897         local_irq_disable();
4898
4899         list_splice_tail_init(&sd->poll_list, &list);
4900         list_splice_tail(&repoll, &list);
4901         list_splice(&list, &sd->poll_list);
4902         if (!list_empty(&sd->poll_list))
4903                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4904
4905         net_rps_action_and_irq_enable(sd);
4906 }
4907
4908 struct netdev_adjacent {
4909         struct net_device *dev;
4910
4911         /* upper master flag, there can only be one master device per list */
4912         bool master;
4913
4914         /* counter for the number of times this device was added to us */
4915         u16 ref_nr;
4916
4917         /* private field for the users */
4918         void *private;
4919
4920         struct list_head list;
4921         struct rcu_head rcu;
4922 };
4923
4924 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
4925                                                  struct list_head *adj_list)
4926 {
4927         struct netdev_adjacent *adj;
4928
4929         list_for_each_entry(adj, adj_list, list) {
4930                 if (adj->dev == adj_dev)
4931                         return adj;
4932         }
4933         return NULL;
4934 }
4935
4936 /**
4937  * netdev_has_upper_dev - Check if device is linked to an upper device
4938  * @dev: device
4939  * @upper_dev: upper device to check
4940  *
4941  * Find out if a device is linked to specified upper device and return true
4942  * in case it is. Note that this checks only immediate upper device,
4943  * not through a complete stack of devices. The caller must hold the RTNL lock.
4944  */
4945 bool netdev_has_upper_dev(struct net_device *dev,
4946                           struct net_device *upper_dev)
4947 {
4948         ASSERT_RTNL();
4949
4950         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
4951 }
4952 EXPORT_SYMBOL(netdev_has_upper_dev);
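
/* Example (illustrative sketch): a driver that refuses to stack on top of a
 * given device can check the relationship while holding RTNL:
 *
 *        ASSERT_RTNL();
 *        if (netdev_has_upper_dev(dev, upper_dev))
 *                return -EBUSY;
 */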
4953
4954 /**
4955  * netdev_has_any_upper_dev - Check if device is linked to some device
4956  * @dev: device
4957  *
4958  * Find out if a device is linked to an upper device and return true in case
4959  * it is. The caller must hold the RTNL lock.
4960  */
4961 static bool netdev_has_any_upper_dev(struct net_device *dev)
4962 {
4963         ASSERT_RTNL();
4964
4965         return !list_empty(&dev->all_adj_list.upper);
4966 }
4967
4968 /**
4969  * netdev_master_upper_dev_get - Get master upper device
4970  * @dev: device
4971  *
4972  * Find a master upper device and return pointer to it or NULL in case
4973  * it's not there. The caller must hold the RTNL lock.
4974  */
4975 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4976 {
4977         struct netdev_adjacent *upper;
4978
4979         ASSERT_RTNL();
4980
4981         if (list_empty(&dev->adj_list.upper))
4982                 return NULL;
4983
4984         upper = list_first_entry(&dev->adj_list.upper,
4985                                  struct netdev_adjacent, list);
4986         if (likely(upper->master))
4987                 return upper->dev;
4988         return NULL;
4989 }
4990 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4991
4992 void *netdev_adjacent_get_private(struct list_head *adj_list)
4993 {
4994         struct netdev_adjacent *adj;
4995
4996         adj = list_entry(adj_list, struct netdev_adjacent, list);
4997
4998         return adj->private;
4999 }
5000 EXPORT_SYMBOL(netdev_adjacent_get_private);
5001
5002 /**
5003  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5004  * @dev: device
5005  * @iter: list_head ** of the current position
5006  *
5007  * Gets the next device from the dev's upper list, starting from iter
5008  * position. The caller must hold RCU read lock.
5009  */
5010 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5011                                                  struct list_head **iter)
5012 {
5013         struct netdev_adjacent *upper;
5014
5015         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5016
5017         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5018
5019         if (&upper->list == &dev->adj_list.upper)
5020                 return NULL;
5021
5022         *iter = &upper->list;
5023
5024         return upper->dev;
5025 }
5026 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
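
/* Example (illustrative sketch): walking the immediate upper devices under
 * RCU, starting the iterator at the list head:
 *
 *        struct list_head *iter = &dev->adj_list.upper;
 *        struct net_device *upper;
 *
 *        rcu_read_lock();
 *        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
 *                pr_debug("upper of %s: %s\n", dev->name, upper->name);
 *        rcu_read_unlock();
 */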
5027
5028 /**
5029  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5030  * @dev: device
5031  * @iter: list_head ** of the current position
5032  *
5033  * Gets the next device from the dev's upper list, starting from iter
5034  * position. The caller must hold RCU read lock.
5035  */
5036 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5037                                                      struct list_head **iter)
5038 {
5039         struct netdev_adjacent *upper;
5040
5041         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5042
5043         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5044
5045         if (&upper->list == &dev->all_adj_list.upper)
5046                 return NULL;
5047
5048         *iter = &upper->list;
5049
5050         return upper->dev;
5051 }
5052 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5053
5054 /**
5055  * netdev_lower_get_next_private - Get the next ->private from the
5056  *                                 lower neighbour list
5057  * @dev: device
5058  * @iter: list_head ** of the current position
5059  *
5060  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5061  * list, starting from iter position. The caller must either hold the
5062  * RTNL lock or use its own locking that guarantees that the lower
5063  * neighbour list will remain unchanged.
5064  */
5065 void *netdev_lower_get_next_private(struct net_device *dev,
5066                                     struct list_head **iter)
5067 {
5068         struct netdev_adjacent *lower;
5069
5070         lower = list_entry(*iter, struct netdev_adjacent, list);
5071
5072         if (&lower->list == &dev->adj_list.lower)
5073                 return NULL;
5074
5075         *iter = lower->list.next;
5076
5077         return lower->private;
5078 }
5079 EXPORT_SYMBOL(netdev_lower_get_next_private);
5080
5081 /**
5082  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5083  *                                     lower neighbour list, RCU
5084  *                                     variant
5085  * @dev: device
5086  * @iter: list_head ** of the current position
5087  *
5088  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5089  * list, starting from iter position. The caller must hold RCU read lock.
5090  */
5091 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5092                                         struct list_head **iter)
5093 {
5094         struct netdev_adjacent *lower;
5095
5096         WARN_ON_ONCE(!rcu_read_lock_held());
5097
5098         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5099
5100         if (&lower->list == &dev->adj_list.lower)
5101                 return NULL;
5102
5103         *iter = &lower->list;
5104
5105         return lower->private;
5106 }
5107 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5108
5109 /**
5110  * netdev_lower_get_next - Get the next device from the lower neighbour
5111  *                         list
5112  * @dev: device
5113  * @iter: list_head ** of the current position
5114  *
5115  * Gets the next netdev_adjacent from the dev's lower neighbour
5116  * list, starting from iter position. The caller must hold the RTNL lock or
5117  * use its own locking that guarantees that the lower neighbour
5118  * list will remain unchanged.
5119  */
5120 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5121 {
5122         struct netdev_adjacent *lower;
5123
5124         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5125
5126         if (&lower->list == &dev->adj_list.lower)
5127                 return NULL;
5128
5129         *iter = &lower->list;
5130
5131         return lower->dev;
5132 }
5133 EXPORT_SYMBOL(netdev_lower_get_next);
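
/* Example (illustrative sketch): the netdev_for_each_lower_dev() helper used
 * later in this file (see dev_get_nest_level()) is built on this accessor:
 *
 *        struct net_device *lower;
 *        struct list_head *iter;
 *
 *        netdev_for_each_lower_dev(dev, lower, iter)
 *                pr_debug("lower of %s: %s\n", dev->name, lower->name);
 */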
5134
5135 /**
5136  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5137  *                                     lower neighbour list, RCU
5138  *                                     variant
5139  * @dev: device
5140  *
5141  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5142  * list. The caller must hold RCU read lock.
5143  */
5144 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5145 {
5146         struct netdev_adjacent *lower;
5147
5148         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5149                         struct netdev_adjacent, list);
5150         if (lower)
5151                 return lower->private;
5152         return NULL;
5153 }
5154 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5155
5156 /**
5157  * netdev_master_upper_dev_get_rcu - Get master upper device
5158  * @dev: device
5159  *
5160  * Find a master upper device and return pointer to it or NULL in case
5161  * it's not there. The caller must hold the RCU read lock.
5162  */
5163 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5164 {
5165         struct netdev_adjacent *upper;
5166
5167         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5168                                        struct netdev_adjacent, list);
5169         if (upper && likely(upper->master))
5170                 return upper->dev;
5171         return NULL;
5172 }
5173 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5174
5175 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5176                               struct net_device *adj_dev,
5177                               struct list_head *dev_list)
5178 {
5179         char linkname[IFNAMSIZ+7];
5180         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5181                 "upper_%s" : "lower_%s", adj_dev->name);
5182         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5183                                  linkname);
5184 }
5185 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5186                                char *name,
5187                                struct list_head *dev_list)
5188 {
5189         char linkname[IFNAMSIZ+7];
5190         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5191                 "upper_%s" : "lower_%s", name);
5192         sysfs_remove_link(&(dev->dev.kobj), linkname);
5193 }
5194
5195 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5196                                                  struct net_device *adj_dev,
5197                                                  struct list_head *dev_list)
5198 {
5199         return (dev_list == &dev->adj_list.upper ||
5200                 dev_list == &dev->adj_list.lower) &&
5201                 net_eq(dev_net(dev), dev_net(adj_dev));
5202 }
5203
5204 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5205                                         struct net_device *adj_dev,
5206                                         u16 ref_nr,
5207                                         struct list_head *dev_list,
5208                                         void *private, bool master)
5209 {
5210         struct netdev_adjacent *adj;
5211         int ret;
5212
5213         adj = __netdev_find_adj(adj_dev, dev_list);
5214
5215         if (adj) {
5216                 adj->ref_nr += ref_nr;
5217                 return 0;
5218         }
5219
5220         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5221         if (!adj)
5222                 return -ENOMEM;
5223
5224         adj->dev = adj_dev;
5225         adj->master = master;
5226         adj->ref_nr = ref_nr;
5227         adj->private = private;
5228         dev_hold(adj_dev);
5229
5230         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5231                  adj_dev->name, dev->name, adj_dev->name);
5232
5233         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5234                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5235                 if (ret)
5236                         goto free_adj;
5237         }
5238
5239         /* Ensure that master link is always the first item in list. */
5240         if (master) {
5241                 ret = sysfs_create_link(&(dev->dev.kobj),
5242                                         &(adj_dev->dev.kobj), "master");
5243                 if (ret)
5244                         goto remove_symlinks;
5245
5246                 list_add_rcu(&adj->list, dev_list);
5247         } else {
5248                 list_add_tail_rcu(&adj->list, dev_list);
5249         }
5250
5251         return 0;
5252
5253 remove_symlinks:
5254         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5255                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5256 free_adj:
5257         kfree(adj);
5258         dev_put(adj_dev);
5259
5260         return ret;
5261 }
5262
5263 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5264                                          struct net_device *adj_dev,
5265                                          u16 ref_nr,
5266                                          struct list_head *dev_list)
5267 {
5268         struct netdev_adjacent *adj;
5269
5270         adj = __netdev_find_adj(adj_dev, dev_list);
5271
5272         if (!adj) {
5273                 pr_err("tried to remove device %s from %s\n",
5274                        dev->name, adj_dev->name);
5275                 BUG();
5276         }
5277
5278         if (adj->ref_nr > ref_nr) {
5279                 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5280                          ref_nr, adj->ref_nr-ref_nr);
5281                 adj->ref_nr -= ref_nr;
5282                 return;
5283         }
5284
5285         if (adj->master)
5286                 sysfs_remove_link(&(dev->dev.kobj), "master");
5287
5288         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5289                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5290
5291         list_del_rcu(&adj->list);
5292         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5293                  adj_dev->name, dev->name, adj_dev->name);
5294         dev_put(adj_dev);
5295         kfree_rcu(adj, rcu);
5296 }
5297
5298 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5299                                             struct net_device *upper_dev,
5300                                             u16 ref_nr,
5301                                             struct list_head *up_list,
5302                                             struct list_head *down_list,
5303                                             void *private, bool master)
5304 {
5305         int ret;
5306
5307         ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5308                                            private, master);
5309         if (ret)
5310                 return ret;
5311
5312         ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5313                                            private, false);
5314         if (ret) {
5315                 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5316                 return ret;
5317         }
5318
5319         return 0;
5320 }
5321
5322 static int __netdev_adjacent_dev_link(struct net_device *dev,
5323                                       struct net_device *upper_dev,
5324                                       u16 ref_nr)
5325 {
5326         return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5327                                                 &dev->all_adj_list.upper,
5328                                                 &upper_dev->all_adj_list.lower,
5329                                                 NULL, false);
5330 }
5331
5332 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5333                                                struct net_device *upper_dev,
5334                                                u16 ref_nr,
5335                                                struct list_head *up_list,
5336                                                struct list_head *down_list)
5337 {
5338         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5339         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5340 }
5341
5342 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5343                                          struct net_device *upper_dev,
5344                                          u16 ref_nr)
5345 {
5346         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5347                                            &dev->all_adj_list.upper,
5348                                            &upper_dev->all_adj_list.lower);
5349 }
5350
5351 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5352                                                 struct net_device *upper_dev,
5353                                                 void *private, bool master)
5354 {
5355         int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5356
5357         if (ret)
5358                 return ret;
5359
5360         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5361                                                &dev->adj_list.upper,
5362                                                &upper_dev->adj_list.lower,
5363                                                private, master);
5364         if (ret) {
5365                 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5366                 return ret;
5367         }
5368
5369         return 0;
5370 }
5371
5372 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5373                                                    struct net_device *upper_dev)
5374 {
5375         __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5376         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5377                                            &dev->adj_list.upper,
5378                                            &upper_dev->adj_list.lower);
5379 }
5380
5381 static int __netdev_upper_dev_link(struct net_device *dev,
5382                                    struct net_device *upper_dev, bool master,
5383                                    void *private)
5384 {
5385         struct netdev_notifier_changeupper_info changeupper_info;
5386         struct netdev_adjacent *i, *j, *to_i, *to_j;
5387         int ret = 0;
5388
5389         ASSERT_RTNL();
5390
5391         if (dev == upper_dev)
5392                 return -EBUSY;
5393
5394         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5395         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5396                 return -EBUSY;
5397
5398         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5399                 return -EEXIST;
5400
5401         if (master && netdev_master_upper_dev_get(dev))
5402                 return -EBUSY;
5403
5404         changeupper_info.upper_dev = upper_dev;
5405         changeupper_info.master = master;
5406         changeupper_info.linking = true;
5407
5408         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5409                                             &changeupper_info.info);
5410         ret = notifier_to_errno(ret);
5411         if (ret)
5412                 return ret;
5413
5414         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5415                                                    master);
5416         if (ret)
5417                 return ret;
5418
5419         /* Now that we linked these devs, make all the upper_dev's
5420          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5421          * vice versa, and don't forget the devices themselves. All of these
5422          * links are non-neighbours.
5423          */
5424         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5425                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5426                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5427                                  i->dev->name, j->dev->name);
5428                         ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5429                         if (ret)
5430                                 goto rollback_mesh;
5431                 }
5432         }
5433
5434         /* add dev to every upper_dev's upper device */
5435         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5436                 pr_debug("linking %s's upper device %s with %s\n",
5437                          upper_dev->name, i->dev->name, dev->name);
5438                 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5439                 if (ret)
5440                         goto rollback_upper_mesh;
5441         }
5442
5443         /* add upper_dev to every dev's lower device */
5444         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5445                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5446                          i->dev->name, upper_dev->name);
5447                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5448                 if (ret)
5449                         goto rollback_lower_mesh;
5450         }
5451
5452         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5453                                       &changeupper_info.info);
5454         return 0;
5455
5456 rollback_lower_mesh:
5457         to_i = i;
5458         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5459                 if (i == to_i)
5460                         break;
5461                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5462         }
5463
5464         i = NULL;
5465
5466 rollback_upper_mesh:
5467         to_i = i;
5468         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5469                 if (i == to_i)
5470                         break;
5471                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5472         }
5473
5474         i = j = NULL;
5475
5476 rollback_mesh:
5477         to_i = i;
5478         to_j = j;
5479         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5480                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5481                         if (i == to_i && j == to_j)
5482                                 break;
5483                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5484                 }
5485                 if (i == to_i)
5486                         break;
5487         }
5488
5489         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5490
5491         return ret;
5492 }
5493
5494 /**
5495  * netdev_upper_dev_link - Add a link to the upper device
5496  * @dev: device
5497  * @upper_dev: new upper device
5498  *
5499  * Adds a link to a device which is upper to this one. The caller must hold
5500  * the RTNL lock. On a failure a negative errno code is returned.
5501  * On success the reference counts are adjusted and the function
5502  * returns zero.
5503  */
5504 int netdev_upper_dev_link(struct net_device *dev,
5505                           struct net_device *upper_dev)
5506 {
5507         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5508 }
5509 EXPORT_SYMBOL(netdev_upper_dev_link);
5510
5511 /**
5512  * netdev_master_upper_dev_link - Add a master link to the upper device
5513  * @dev: device
5514  * @upper_dev: new upper device
5515  *
5516  * Adds a link to a device which is upper to this one. In this case, only
5517  * one master upper device can be linked, although other non-master devices
5518  * might be linked as well. The caller must hold the RTNL lock.
5519  * On a failure a negative errno code is returned. On success the reference
5520  * counts are adjusted and the function returns zero.
5521  */
5522 int netdev_master_upper_dev_link(struct net_device *dev,
5523                                  struct net_device *upper_dev)
5524 {
5525         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5526 }
5527 EXPORT_SYMBOL(netdev_master_upper_dev_link);
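
/* Example (illustrative sketch; bond_dev/slave_dev are hypothetical): a
 * bonding-style driver enslaving a device makes itself the single master
 * upper of that device:
 *
 *        ASSERT_RTNL();
 *        err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *        if (err)
 *                return err;     // e.g. -EBUSY if slave_dev already has a master
 */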
5528
5529 int netdev_master_upper_dev_link_private(struct net_device *dev,
5530                                          struct net_device *upper_dev,
5531                                          void *private)
5532 {
5533         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5534 }
5535 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5536
5537 /**
5538  * netdev_upper_dev_unlink - Removes a link to upper device
5539  * @dev: device
5540  * @upper_dev: upper device to unlink
5541  *
5542  * Removes a link to a device which is upper to this one. The caller must hold
5543  * the RTNL lock.
5544  */
5545 void netdev_upper_dev_unlink(struct net_device *dev,
5546                              struct net_device *upper_dev)
5547 {
5548         struct netdev_notifier_changeupper_info changeupper_info;
5549         struct netdev_adjacent *i, *j;
5550         ASSERT_RTNL();
5551
5552         changeupper_info.upper_dev = upper_dev;
5553         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5554         changeupper_info.linking = false;
5555
5556         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5557                                       &changeupper_info.info);
5558
5559         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5560
5561         /* Here is the tricky part. We must remove all dev's lower
5562          * devices from all upper_dev's upper devices and vice
5563          * versa, to maintain the graph relationship.
5564          */
5565         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5566                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5567                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5568
5569         /* also remove the devices themselves from each other's
5570          * lower/upper device lists
5571          */
5572         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5573                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5574
5575         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5576                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5577
5578         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5579                                       &changeupper_info.info);
5580 }
5581 EXPORT_SYMBOL(netdev_upper_dev_unlink);
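
/* Example (illustrative sketch): the release path balances the earlier link;
 * the same PRECHANGEUPPER/CHANGEUPPER notifiers are sent, this time with
 * changeupper_info.linking set to false:
 *
 *        ASSERT_RTNL();
 *        netdev_upper_dev_unlink(slave_dev, bond_dev);
 */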
5582
5583 /**
5584  * netdev_bonding_info_change - Dispatch event about slave change
5585  * @dev: device
5586  * @bonding_info: info to dispatch
5587  *
5588  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5589  * The caller must hold the RTNL lock.
5590  */
5591 void netdev_bonding_info_change(struct net_device *dev,
5592                                 struct netdev_bonding_info *bonding_info)
5593 {
5594         struct netdev_notifier_bonding_info     info;
5595
5596         memcpy(&info.bonding_info, bonding_info,
5597                sizeof(struct netdev_bonding_info));
5598         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5599                                       &info.info);
5600 }
5601 EXPORT_SYMBOL(netdev_bonding_info_change);
5602
5603 static void netdev_adjacent_add_links(struct net_device *dev)
5604 {
5605         struct netdev_adjacent *iter;
5606
5607         struct net *net = dev_net(dev);
5608
5609         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5610                 if (!net_eq(net, dev_net(iter->dev)))
5611                         continue;
5612                 netdev_adjacent_sysfs_add(iter->dev, dev,
5613                                           &iter->dev->adj_list.lower);
5614                 netdev_adjacent_sysfs_add(dev, iter->dev,
5615                                           &dev->adj_list.upper);
5616         }
5617
5618         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5619                 if (!net_eq(net, dev_net(iter->dev)))
5620                         continue;
5621                 netdev_adjacent_sysfs_add(iter->dev, dev,
5622                                           &iter->dev->adj_list.upper);
5623                 netdev_adjacent_sysfs_add(dev, iter->dev,
5624                                           &dev->adj_list.lower);
5625         }
5626 }
5627
5628 static void netdev_adjacent_del_links(struct net_device *dev)
5629 {
5630         struct netdev_adjacent *iter;
5631
5632         struct net *net = dev_net(dev);
5633
5634         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5635                 if (!net_eq(net, dev_net(iter->dev)))
5636                         continue;
5637                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5638                                           &iter->dev->adj_list.lower);
5639                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5640                                           &dev->adj_list.upper);
5641         }
5642
5643         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5644                 if (!net_eq(net, dev_net(iter->dev)))
5645                         continue;
5646                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5647                                           &iter->dev->adj_list.upper);
5648                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5649                                           &dev->adj_list.lower);
5650         }
5651 }
5652
5653 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5654 {
5655         struct netdev_adjacent *iter;
5656
5657         struct net *net = dev_net(dev);
5658
5659         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5660                 if (!net_eq(net, dev_net(iter->dev)))
5661                         continue;
5662                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5663                                           &iter->dev->adj_list.lower);
5664                 netdev_adjacent_sysfs_add(iter->dev, dev,
5665                                           &iter->dev->adj_list.lower);
5666         }
5667
5668         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5669                 if (!net_eq(net, dev_net(iter->dev)))
5670                         continue;
5671                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5672                                           &iter->dev->adj_list.upper);
5673                 netdev_adjacent_sysfs_add(iter->dev, dev,
5674                                           &iter->dev->adj_list.upper);
5675         }
5676 }
5677
5678 void *netdev_lower_dev_get_private(struct net_device *dev,
5679                                    struct net_device *lower_dev)
5680 {
5681         struct netdev_adjacent *lower;
5682
5683         if (!lower_dev)
5684                 return NULL;
5685         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5686         if (!lower)
5687                 return NULL;
5688
5689         return lower->private;
5690 }
5691 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5692
5693
5694 int dev_get_nest_level(struct net_device *dev,
5695                        bool (*type_check)(struct net_device *dev))
5696 {
5697         struct net_device *lower = NULL;
5698         struct list_head *iter;
5699         int max_nest = -1;
5700         int nest;
5701
5702         ASSERT_RTNL();
5703
5704         netdev_for_each_lower_dev(dev, lower, iter) {
5705                 nest = dev_get_nest_level(lower, type_check);
5706                 if (max_nest < nest)
5707                         max_nest = nest;
5708         }
5709
5710         if (type_check(dev))
5711                 max_nest++;
5712
5713         return max_nest;
5714 }
5715 EXPORT_SYMBOL(dev_get_nest_level);
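
/* Example (illustrative sketch; is_foo_dev() and foo_netdev_ops are
 * hypothetical): stacked drivers pass a predicate for their own device type
 * and can use the resulting depth, e.g. as a lockdep subclass:
 *
 *        static bool is_foo_dev(struct net_device *dev)
 *        {
 *                return dev->netdev_ops == &foo_netdev_ops;
 *        }
 *
 *        nest_level = dev_get_nest_level(dev, is_foo_dev);
 */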
5716
5717 static void dev_change_rx_flags(struct net_device *dev, int flags)
5718 {
5719         const struct net_device_ops *ops = dev->netdev_ops;
5720
5721         if (ops->ndo_change_rx_flags)
5722                 ops->ndo_change_rx_flags(dev, flags);
5723 }
5724
5725 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5726 {
5727         unsigned int old_flags = dev->flags;
5728         kuid_t uid;
5729         kgid_t gid;
5730
5731         ASSERT_RTNL();
5732
5733         dev->flags |= IFF_PROMISC;
5734         dev->promiscuity += inc;
5735         if (dev->promiscuity == 0) {
5736                 /*
5737                  * Avoid overflow.
5738                  * If inc causes overflow, untouch promisc and return error.
5739                  */
5740                 if (inc < 0)
5741                         dev->flags &= ~IFF_PROMISC;
5742                 else {
5743                         dev->promiscuity -= inc;
5744                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5745                                 dev->name);
5746                         return -EOVERFLOW;
5747                 }
5748         }
5749         if (dev->flags != old_flags) {
5750                 pr_info("device %s %s promiscuous mode\n",
5751                         dev->name,
5752                         dev->flags & IFF_PROMISC ? "entered" : "left");
5753                 if (audit_enabled) {
5754                         current_uid_gid(&uid, &gid);
5755                         audit_log(current->audit_context, GFP_ATOMIC,
5756                                 AUDIT_ANOM_PROMISCUOUS,
5757                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5758                                 dev->name, (dev->flags & IFF_PROMISC),
5759                                 (old_flags & IFF_PROMISC),
5760                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5761                                 from_kuid(&init_user_ns, uid),
5762                                 from_kgid(&init_user_ns, gid),
5763                                 audit_get_sessionid(current));
5764                 }
5765
5766                 dev_change_rx_flags(dev, IFF_PROMISC);
5767         }
5768         if (notify)
5769                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5770         return 0;
5771 }
5772
5773 /**
5774  *      dev_set_promiscuity     - update promiscuity count on a device
5775  *      @dev: device
5776  *      @inc: modifier
5777  *
5778  *      Add or remove promiscuity from a device. While the count in the device
5779  *      remains above zero the interface remains promiscuous. Once it hits zero
5780  *      the device reverts back to normal filtering operation. A negative inc
5781  *      value is used to drop promiscuity on the device.
5782  *      Return 0 if successful or a negative errno code on error.
5783  */
5784 int dev_set_promiscuity(struct net_device *dev, int inc)
5785 {
5786         unsigned int old_flags = dev->flags;
5787         int err;
5788
5789         err = __dev_set_promiscuity(dev, inc, true);
5790         if (err < 0)
5791                 return err;
5792         if (dev->flags != old_flags)
5793                 dev_set_rx_mode(dev);
5794         return err;
5795 }
5796 EXPORT_SYMBOL(dev_set_promiscuity);
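
/* Example (illustrative sketch): a capture-like user takes and later drops a
 * single promiscuity reference under RTNL; the device only leaves promiscuous
 * mode once the count returns to zero:
 *
 *        rtnl_lock();
 *        err = dev_set_promiscuity(dev, 1);
 *        rtnl_unlock();
 *        ...
 *        rtnl_lock();
 *        dev_set_promiscuity(dev, -1);
 *        rtnl_unlock();
 */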
5797
5798 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5799 {
5800         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5801
5802         ASSERT_RTNL();
5803
5804         dev->flags |= IFF_ALLMULTI;
5805         dev->allmulti += inc;
5806         if (dev->allmulti == 0) {
5807                 /*
5808                  * Avoid overflow.
5809                  * If inc causes overflow, untouch allmulti and return error.
5810                  */
5811                 if (inc < 0)
5812                         dev->flags &= ~IFF_ALLMULTI;
5813                 else {
5814                         dev->allmulti -= inc;
5815                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5816                                 dev->name);
5817                         return -EOVERFLOW;
5818                 }
5819         }
5820         if (dev->flags ^ old_flags) {
5821                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5822                 dev_set_rx_mode(dev);
5823                 if (notify)
5824                         __dev_notify_flags(dev, old_flags,
5825                                            dev->gflags ^ old_gflags);
5826         }
5827         return 0;
5828 }
5829
5830 /**
5831  *      dev_set_allmulti        - update allmulti count on a device
5832  *      @dev: device
5833  *      @inc: modifier
5834  *
5835  *      Add or remove reception of all multicast frames to a device. While the
5836  *      count in the device remains above zero the interface remains listening
5837  *      to all multicast frames. Once it hits zero the device reverts back to normal
5838  *      filtering operation. A negative @inc value is used to drop the counter
5839  *      when releasing a resource needing all multicasts.
5840  *      Return 0 if successful or a negative errno code on error.
5841  */
5842
5843 int dev_set_allmulti(struct net_device *dev, int inc)
5844 {
5845         return __dev_set_allmulti(dev, inc, true);
5846 }
5847 EXPORT_SYMBOL(dev_set_allmulti);
5848
5849 /*
5850  *      Upload unicast and multicast address lists to device and
5851  *      configure RX filtering. When the device doesn't support unicast
5852  *      filtering it is put in promiscuous mode while unicast addresses
5853  *      are present.
5854  */
5855 void __dev_set_rx_mode(struct net_device *dev)
5856 {
5857         const struct net_device_ops *ops = dev->netdev_ops;
5858
5859         /* dev_open will call this function so the list will stay sane. */
5860         if (!(dev->flags&IFF_UP))
5861                 return;
5862
5863         if (!netif_device_present(dev))
5864                 return;
5865
5866         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5867                 /* Unicast address changes may only happen under the rtnl,
5868                  * therefore calling __dev_set_promiscuity here is safe.
5869                  */
5870                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5871                         __dev_set_promiscuity(dev, 1, false);
5872                         dev->uc_promisc = true;
5873                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5874                         __dev_set_promiscuity(dev, -1, false);
5875                         dev->uc_promisc = false;
5876                 }
5877         }
5878
5879         if (ops->ndo_set_rx_mode)
5880                 ops->ndo_set_rx_mode(dev);
5881 }
5882
5883 void dev_set_rx_mode(struct net_device *dev)
5884 {
5885         netif_addr_lock_bh(dev);
5886         __dev_set_rx_mode(dev);
5887         netif_addr_unlock_bh(dev);
5888 }
5889
5890 /**
5891  *      dev_get_flags - get flags reported to userspace
5892  *      @dev: device
5893  *
5894  *      Get the combination of flag bits exported through APIs to userspace.
5895  */
5896 unsigned int dev_get_flags(const struct net_device *dev)
5897 {
5898         unsigned int flags;
5899
5900         flags = (dev->flags & ~(IFF_PROMISC |
5901                                 IFF_ALLMULTI |
5902                                 IFF_RUNNING |
5903                                 IFF_LOWER_UP |
5904                                 IFF_DORMANT)) |
5905                 (dev->gflags & (IFF_PROMISC |
5906                                 IFF_ALLMULTI));
5907
5908         if (netif_running(dev)) {
5909                 if (netif_oper_up(dev))
5910                         flags |= IFF_RUNNING;
5911                 if (netif_carrier_ok(dev))
5912                         flags |= IFF_LOWER_UP;
5913                 if (netif_dormant(dev))
5914                         flags |= IFF_DORMANT;
5915         }
5916
5917         return flags;
5918 }
5919 EXPORT_SYMBOL(dev_get_flags);
5920
5921 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5922 {
5923         unsigned int old_flags = dev->flags;
5924         int ret;
5925
5926         ASSERT_RTNL();
5927
5928         /*
5929          *      Set the flags on our device.
5930          */
5931
5932         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5933                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5934                                IFF_AUTOMEDIA)) |
5935                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5936                                     IFF_ALLMULTI));
5937
5938         /*
5939          *      Load in the correct multicast list now the flags have changed.
5940          */
5941
5942         if ((old_flags ^ flags) & IFF_MULTICAST)
5943                 dev_change_rx_flags(dev, IFF_MULTICAST);
5944
5945         dev_set_rx_mode(dev);
5946
5947         /*
5948          *      Have we downed the interface? We handle IFF_UP ourselves
5949          *      according to user attempts to set it, rather than blindly
5950          *      setting it.
5951          */
5952
5953         ret = 0;
5954         if ((old_flags ^ flags) & IFF_UP)
5955                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5956
5957         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5958                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5959                 unsigned int old_flags = dev->flags;
5960
5961                 dev->gflags ^= IFF_PROMISC;
5962
5963                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5964                         if (dev->flags != old_flags)
5965                                 dev_set_rx_mode(dev);
5966         }
5967
5968         /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5969            is important. Some (broken) drivers set IFF_PROMISC when
5970            IFF_ALLMULTI is requested, without asking us and without reporting.
5971          */
5972         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5973                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5974
5975                 dev->gflags ^= IFF_ALLMULTI;
5976                 __dev_set_allmulti(dev, inc, false);
5977         }
5978
5979         return ret;
5980 }
5981
5982 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5983                         unsigned int gchanges)
5984 {
5985         unsigned int changes = dev->flags ^ old_flags;
5986
5987         if (gchanges)
5988                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5989
5990         if (changes & IFF_UP) {
5991                 if (dev->flags & IFF_UP)
5992                         call_netdevice_notifiers(NETDEV_UP, dev);
5993                 else
5994                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5995         }
5996
5997         if (dev->flags & IFF_UP &&
5998             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5999                 struct netdev_notifier_change_info change_info;
6000
6001                 change_info.flags_changed = changes;
6002                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6003                                               &change_info.info);
6004         }
6005 }
6006
6007 /**
6008  *      dev_change_flags - change device settings
6009  *      @dev: device
6010  *      @flags: device state flags
6011  *
6012  *      Change settings on a device based on state flags. The flags are
6013  *      in the userspace exported format.
6014  */
6015 int dev_change_flags(struct net_device *dev, unsigned int flags)
6016 {
6017         int ret;
6018         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6019
6020         ret = __dev_change_flags(dev, flags);
6021         if (ret < 0)
6022                 return ret;
6023
6024         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6025         __dev_notify_flags(dev, old_flags, changes);
6026         return ret;
6027 }
6028 EXPORT_SYMBOL(dev_change_flags);
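
/* Example (illustrative sketch): bringing an interface administratively up,
 * similar to the SIOCSIFFLAGS ioctl path, by feeding back the flags read via
 * dev_get_flags():
 *
 *        rtnl_lock();
 *        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *        rtnl_unlock();
 */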
6029
6030 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6031 {
6032         const struct net_device_ops *ops = dev->netdev_ops;
6033
6034         if (ops->ndo_change_mtu)
6035                 return ops->ndo_change_mtu(dev, new_mtu);
6036
6037         dev->mtu = new_mtu;
6038         return 0;
6039 }
6040
6041 /**
6042  *      dev_set_mtu - Change maximum transfer unit
6043  *      @dev: device
6044  *      @new_mtu: new transfer unit
6045  *
6046  *      Change the maximum transfer size of the network device.
6047  */
6048 int dev_set_mtu(struct net_device *dev, int new_mtu)
6049 {
6050         int err, orig_mtu;
6051
6052         if (new_mtu == dev->mtu)
6053                 return 0;
6054
6055         /*      MTU must not be negative.        */
6056         if (new_mtu < 0)
6057                 return -EINVAL;
6058
6059         if (!netif_device_present(dev))
6060                 return -ENODEV;
6061
6062         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6063         err = notifier_to_errno(err);
6064         if (err)
6065                 return err;
6066
6067         orig_mtu = dev->mtu;
6068         err = __dev_set_mtu(dev, new_mtu);
6069
6070         if (!err) {
6071                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6072                 err = notifier_to_errno(err);
6073                 if (err) {
6074                         /* setting mtu back and notifying everyone again,
6075                          * so that they have a chance to revert changes.
6076                          */
6077                         __dev_set_mtu(dev, orig_mtu);
6078                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6079                 }
6080         }
6081         return err;
6082 }
6083 EXPORT_SYMBOL(dev_set_mtu);
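
/* Example (illustrative sketch): callers change the MTU under RTNL; if a
 * NETDEV_CHANGEMTU notifier objects, the original MTU is restored before the
 * error is returned:
 *
 *        rtnl_lock();
 *        err = dev_set_mtu(dev, 9000);
 *        rtnl_unlock();
 */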
6084
6085 /**
6086  *      dev_set_group - Change group this device belongs to
6087  *      @dev: device
6088  *      @new_group: group this device should belong to
6089  */
6090 void dev_set_group(struct net_device *dev, int new_group)
6091 {
6092         dev->group = new_group;
6093 }
6094 EXPORT_SYMBOL(dev_set_group);
6095
6096 /**
6097  *      dev_set_mac_address - Change Media Access Control Address
6098  *      @dev: device
6099  *      @sa: new address
6100  *
6101  *      Change the hardware (MAC) address of the device
6102  */
6103 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6104 {
6105         const struct net_device_ops *ops = dev->netdev_ops;
6106         int err;
6107
6108         if (!ops->ndo_set_mac_address)
6109                 return -EOPNOTSUPP;
6110         if (sa->sa_family != dev->type)
6111                 return -EINVAL;
6112         if (!netif_device_present(dev))
6113                 return -ENODEV;
6114         err = ops->ndo_set_mac_address(dev, sa);
6115         if (err)
6116                 return err;
6117         dev->addr_assign_type = NET_ADDR_SET;
6118         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6119         add_device_randomness(dev->dev_addr, dev->addr_len);
6120         return 0;
6121 }
6122 EXPORT_SYMBOL(dev_set_mac_address);
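
/*
 * Illustrative sketch (not part of this file): setting a new MAC address
 * under RTNL.  The helper name is hypothetical; sa_family must match
 * dev->type (ARPHRD_ETHER for Ethernet devices).
 *
 *	static int example_set_mac(struct net_device *dev, const u8 *mac)
 *	{
 *		struct sockaddr sa;
 *		int err;
 *
 *		sa.sa_family = dev->type;
 *		memcpy(sa.sa_data, mac, dev->addr_len);
 *		rtnl_lock();
 *		err = dev_set_mac_address(dev, &sa);
 *		rtnl_unlock();
 *		return err;
 *	}
 */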
6123
6124 /**
6125  *      dev_change_carrier - Change device carrier
6126  *      @dev: device
6127  *      @new_carrier: new value
6128  *
6129  *      Change device carrier
6130  */
6131 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6132 {
6133         const struct net_device_ops *ops = dev->netdev_ops;
6134
6135         if (!ops->ndo_change_carrier)
6136                 return -EOPNOTSUPP;
6137         if (!netif_device_present(dev))
6138                 return -ENODEV;
6139         return ops->ndo_change_carrier(dev, new_carrier);
6140 }
6141 EXPORT_SYMBOL(dev_change_carrier);
6142
6143 /**
6144  *      dev_get_phys_port_id - Get device physical port ID
6145  *      @dev: device
6146  *      @ppid: port ID
6147  *
6148  *      Get device physical port ID
6149  */
6150 int dev_get_phys_port_id(struct net_device *dev,
6151                          struct netdev_phys_item_id *ppid)
6152 {
6153         const struct net_device_ops *ops = dev->netdev_ops;
6154
6155         if (!ops->ndo_get_phys_port_id)
6156                 return -EOPNOTSUPP;
6157         return ops->ndo_get_phys_port_id(dev, ppid);
6158 }
6159 EXPORT_SYMBOL(dev_get_phys_port_id);
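
/*
 * Illustrative sketch (not part of this file): a hypothetical driver
 * implementing ndo_get_phys_port_id so the helper above can report a stable
 * per-port identifier.  The priv layout and port_serial field are made up;
 * id_len must not exceed MAX_PHYS_ITEM_ID_LEN.
 *
 *	static int example_get_phys_port_id(struct net_device *dev,
 *					    struct netdev_phys_item_id *ppid)
 *	{
 *		struct example_priv *priv = netdev_priv(dev);
 *
 *		ppid->id_len = sizeof(priv->port_serial);
 *		memcpy(ppid->id, &priv->port_serial, ppid->id_len);
 *		return 0;
 *	}
 */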
6160
6161 /**
6162  *      dev_get_phys_port_name - Get device physical port name
6163  *      @dev: device
6164  *      @name: port name
6165  *
6166  *      Get device physical port name
6167  */
6168 int dev_get_phys_port_name(struct net_device *dev,
6169                            char *name, size_t len)
6170 {
6171         const struct net_device_ops *ops = dev->netdev_ops;
6172
6173         if (!ops->ndo_get_phys_port_name)
6174                 return -EOPNOTSUPP;
6175         return ops->ndo_get_phys_port_name(dev, name, len);
6176 }
6177 EXPORT_SYMBOL(dev_get_phys_port_name);
6178
6179 /**
6180  *      dev_change_proto_down - update protocol port state information
6181  *      @dev: device
6182  *      @proto_down: new value
6183  *
6184  *      This info can be used by switch drivers to set the physical state of
6185  *      the port.
6186  */
6187 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6188 {
6189         const struct net_device_ops *ops = dev->netdev_ops;
6190
6191         if (!ops->ndo_change_proto_down)
6192                 return -EOPNOTSUPP;
6193         if (!netif_device_present(dev))
6194                 return -ENODEV;
6195         return ops->ndo_change_proto_down(dev, proto_down);
6196 }
6197 EXPORT_SYMBOL(dev_change_proto_down);
6198
6199 /**
6200  *      dev_new_index   -       allocate an ifindex
6201  *      @net: the applicable net namespace
6202  *
6203  *      Returns a suitable unique value for a new device interface
6204  *      number.  The caller must hold the rtnl semaphore or the
6205  *      dev_base_lock to be sure it remains unique.
6206  */
6207 static int dev_new_index(struct net *net)
6208 {
6209         int ifindex = net->ifindex;
6210         for (;;) {
6211                 if (++ifindex <= 0)
6212                         ifindex = 1;
6213                 if (!__dev_get_by_index(net, ifindex))
6214                         return net->ifindex = ifindex;
6215         }
6216 }
6217
6218 /* Delayed registration/unregistration */
6219 static LIST_HEAD(net_todo_list);
6220 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6221
6222 static void net_set_todo(struct net_device *dev)
6223 {
6224         list_add_tail(&dev->todo_list, &net_todo_list);
6225         dev_net(dev)->dev_unreg_count++;
6226 }
6227
6228 static void rollback_registered_many(struct list_head *head)
6229 {
6230         struct net_device *dev, *tmp;
6231         LIST_HEAD(close_head);
6232
6233         BUG_ON(dev_boot_phase);
6234         ASSERT_RTNL();
6235
6236         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6237                 /* Some devices call this without ever having
6238                  * registered, to unwind a failed initialization.
6239                  * Remove those devices and proceed with the rest.
6240                  */
6241                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6242                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6243                                  dev->name, dev);
6244
6245                         WARN_ON(1);
6246                         list_del(&dev->unreg_list);
6247                         continue;
6248                 }
6249                 dev->dismantle = true;
6250                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6251         }
6252
6253         /* If device is running, close it first. */
6254         list_for_each_entry(dev, head, unreg_list)
6255                 list_add_tail(&dev->close_list, &close_head);
6256         dev_close_many(&close_head, true);
6257
6258         list_for_each_entry(dev, head, unreg_list) {
6259                 /* And unlink it from device chain. */
6260                 unlist_netdevice(dev);
6261
6262                 dev->reg_state = NETREG_UNREGISTERING;
6263                 on_each_cpu(flush_backlog, dev, 1);
6264         }
6265
6266         synchronize_net();
6267
6268         list_for_each_entry(dev, head, unreg_list) {
6269                 struct sk_buff *skb = NULL;
6270
6271                 /* Shutdown queueing discipline. */
6272                 dev_shutdown(dev);
6273
6274
6275                 /* Notify protocols that we are about to destroy
6276                  * this device. They should clean up all of their state.
6277                  */
6278                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6279
6280                 if (!dev->rtnl_link_ops ||
6281                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6282                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6283                                                      GFP_KERNEL);
6284
6285                 /*
6286                  *      Flush the unicast and multicast chains
6287                  */
6288                 dev_uc_flush(dev);
6289                 dev_mc_flush(dev);
6290
6291                 if (dev->netdev_ops->ndo_uninit)
6292                         dev->netdev_ops->ndo_uninit(dev);
6293
6294                 if (skb)
6295                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6296
6297                 /* Notifier chain MUST detach us all upper devices. */
6298                 WARN_ON(netdev_has_any_upper_dev(dev));
6299
6300                 /* Remove entries from kobject tree */
6301                 netdev_unregister_kobject(dev);
6302 #ifdef CONFIG_XPS
6303                 /* Remove XPS queueing entries */
6304                 netif_reset_xps_queues_gt(dev, 0);
6305 #endif
6306         }
6307
6308         synchronize_net();
6309
6310         list_for_each_entry(dev, head, unreg_list)
6311                 dev_put(dev);
6312 }
6313
6314 static void rollback_registered(struct net_device *dev)
6315 {
6316         LIST_HEAD(single);
6317
6318         list_add(&dev->unreg_list, &single);
6319         rollback_registered_many(&single);
6320         list_del(&single);
6321 }
6322
6323 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6324         struct net_device *upper, netdev_features_t features)
6325 {
6326         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6327         netdev_features_t feature;
6328         int feature_bit;
6329
6330         for_each_netdev_feature(&upper_disables, feature_bit) {
6331                 feature = __NETIF_F_BIT(feature_bit);
6332                 if (!(upper->wanted_features & feature)
6333                     && (features & feature)) {
6334                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6335                                    &feature, upper->name);
6336                         features &= ~feature;
6337                 }
6338         }
6339
6340         return features;
6341 }
6342
6343 static void netdev_sync_lower_features(struct net_device *upper,
6344         struct net_device *lower, netdev_features_t features)
6345 {
6346         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6347         netdev_features_t feature;
6348         int feature_bit;
6349
6350         for_each_netdev_feature(&upper_disables, feature_bit) {
6351                 feature = __NETIF_F_BIT(feature_bit);
6352                 if (!(features & feature) && (lower->features & feature)) {
6353                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6354                                    &feature, lower->name);
6355                         lower->wanted_features &= ~feature;
6356                         netdev_update_features(lower);
6357
6358                         if (unlikely(lower->features & feature))
6359                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6360                                             &feature, lower->name);
6361                 }
6362         }
6363 }
6364
6365 static netdev_features_t netdev_fix_features(struct net_device *dev,
6366         netdev_features_t features)
6367 {
6368         /* Fix illegal checksum combinations */
6369         if ((features & NETIF_F_HW_CSUM) &&
6370             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6371                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6372                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6373         }
6374
6375         /* TSO requires that SG is present as well. */
6376         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6377                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6378                 features &= ~NETIF_F_ALL_TSO;
6379         }
6380
6381         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6382                                         !(features & NETIF_F_IP_CSUM)) {
6383                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6384                 features &= ~NETIF_F_TSO;
6385                 features &= ~NETIF_F_TSO_ECN;
6386         }
6387
6388         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6389                                          !(features & NETIF_F_IPV6_CSUM)) {
6390                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6391                 features &= ~NETIF_F_TSO6;
6392         }
6393
6394         /* TSO ECN requires that TSO is present as well. */
6395         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6396                 features &= ~NETIF_F_TSO_ECN;
6397
6398         /* Software GSO depends on SG. */
6399         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6400                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6401                 features &= ~NETIF_F_GSO;
6402         }
6403
6404         /* UFO needs SG and checksumming */
6405         if (features & NETIF_F_UFO) {
6406                 /* maybe split UFO into V4 and V6? */
6407                 if (!((features & NETIF_F_GEN_CSUM) ||
6408                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6409                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6410                         netdev_dbg(dev,
6411                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6412                         features &= ~NETIF_F_UFO;
6413                 }
6414
6415                 if (!(features & NETIF_F_SG)) {
6416                         netdev_dbg(dev,
6417                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6418                         features &= ~NETIF_F_UFO;
6419                 }
6420         }
6421
6422 #ifdef CONFIG_NET_RX_BUSY_POLL
6423         if (dev->netdev_ops->ndo_busy_poll)
6424                 features |= NETIF_F_BUSY_POLL;
6425         else
6426 #endif
6427                 features &= ~NETIF_F_BUSY_POLL;
6428
6429         return features;
6430 }
6431
6432 int __netdev_update_features(struct net_device *dev)
6433 {
6434         struct net_device *upper, *lower;
6435         netdev_features_t features;
6436         struct list_head *iter;
6437         int err = -1;
6438
6439         ASSERT_RTNL();
6440
6441         features = netdev_get_wanted_features(dev);
6442
6443         if (dev->netdev_ops->ndo_fix_features)
6444                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6445
6446         /* driver might be less strict about feature dependencies */
6447         features = netdev_fix_features(dev, features);
6448
6449         /* some features can't be enabled if they're off on an upper device */
6450         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6451                 features = netdev_sync_upper_features(dev, upper, features);
6452
6453         if (dev->features == features)
6454                 goto sync_lower;
6455
6456         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6457                 &dev->features, &features);
6458
6459         if (dev->netdev_ops->ndo_set_features)
6460                 err = dev->netdev_ops->ndo_set_features(dev, features);
6461         else
6462                 err = 0;
6463
6464         if (unlikely(err < 0)) {
6465                 netdev_err(dev,
6466                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6467                         err, &features, &dev->features);
6468                 /* return non-0 since some features might have changed and
6469                  * it's better to fire a spurious notification than miss it
6470                  */
6471                 return -1;
6472         }
6473
6474 sync_lower:
6475         /* some features must be disabled on lower devices when disabled
6476          * on an upper device (think: bonding master or bridge)
6477          */
6478         netdev_for_each_lower_dev(dev, lower, iter)
6479                 netdev_sync_lower_features(dev, lower, features);
6480
6481         if (!err)
6482                 dev->features = features;
6483
6484         return err < 0 ? 0 : 1;
6485 }
6486
6487 /**
6488  *      netdev_update_features - recalculate device features
6489  *      @dev: the device to check
6490  *
6491  *      Recalculate the dev->features set and send notifications if it
6492  *      has changed. Should be called after driver or hardware dependent
6493  *      conditions that influence the features might have changed.
6494  */
6495 void netdev_update_features(struct net_device *dev)
6496 {
6497         if (__netdev_update_features(dev))
6498                 netdev_features_change(dev);
6499 }
6500 EXPORT_SYMBOL(netdev_update_features);
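
/*
 * Illustrative sketch (not part of this file): a hypothetical driver
 * re-evaluating its features after a hardware reconfiguration.  The caller
 * must hold RTNL, as __netdev_update_features() asserts.
 *
 *	static void example_reconfig_done(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		netdev_update_features(dev);
 *	}
 */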
6501
6502 /**
6503  *      netdev_change_features - recalculate device features
6504  *      @dev: the device to check
6505  *
6506  *      Recalculate the dev->features set and send notifications even
6507  *      if they have not changed. Should be called instead of
6508  *      netdev_update_features() if dev->vlan_features might also
6509  *      have changed, to allow the changes to be propagated to stacked
6510  *      VLAN devices.
6511  */
6512 void netdev_change_features(struct net_device *dev)
6513 {
6514         __netdev_update_features(dev);
6515         netdev_features_change(dev);
6516 }
6517 EXPORT_SYMBOL(netdev_change_features);
6518
6519 /**
6520  *      netif_stacked_transfer_operstate -      transfer operstate
6521  *      @rootdev: the root or lower level device to transfer state from
6522  *      @dev: the device to transfer operstate to
6523  *
6524  *      Transfer operational state from root to device. This is normally
6525  *      called when a stacking relationship exists between the root
6526  *      device and the device (a leaf device).
6527  */
6528 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6529                                         struct net_device *dev)
6530 {
6531         if (rootdev->operstate == IF_OPER_DORMANT)
6532                 netif_dormant_on(dev);
6533         else
6534                 netif_dormant_off(dev);
6535
6536         if (netif_carrier_ok(rootdev)) {
6537                 if (!netif_carrier_ok(dev))
6538                         netif_carrier_on(dev);
6539         } else {
6540                 if (netif_carrier_ok(dev))
6541                         netif_carrier_off(dev);
6542         }
6543 }
6544 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
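
/*
 * Illustrative sketch (not part of this file): a stacked (e.g. VLAN-like)
 * driver mirroring its lower device's state, typically from a NETDEV_CHANGE
 * notifier.  How the upper device is looked up is left abstract.
 *
 *	static void example_lower_changed(struct net_device *lower,
 *					  struct net_device *upper)
 *	{
 *		netif_stacked_transfer_operstate(lower, upper);
 *	}
 */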
6545
6546 #ifdef CONFIG_SYSFS
6547 static int netif_alloc_rx_queues(struct net_device *dev)
6548 {
6549         unsigned int i, count = dev->num_rx_queues;
6550         struct netdev_rx_queue *rx;
6551         size_t sz = count * sizeof(*rx);
6552
6553         BUG_ON(count < 1);
6554
6555         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6556         if (!rx) {
6557                 rx = vzalloc(sz);
6558                 if (!rx)
6559                         return -ENOMEM;
6560         }
6561         dev->_rx = rx;
6562
6563         for (i = 0; i < count; i++)
6564                 rx[i].dev = dev;
6565         return 0;
6566 }
6567 #endif
6568
6569 static void netdev_init_one_queue(struct net_device *dev,
6570                                   struct netdev_queue *queue, void *_unused)
6571 {
6572         /* Initialize queue lock */
6573         spin_lock_init(&queue->_xmit_lock);
6574         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6575         queue->xmit_lock_owner = -1;
6576         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6577         queue->dev = dev;
6578 #ifdef CONFIG_BQL
6579         dql_init(&queue->dql, HZ);
6580 #endif
6581 }
6582
6583 static void netif_free_tx_queues(struct net_device *dev)
6584 {
6585         kvfree(dev->_tx);
6586 }
6587
6588 static int netif_alloc_netdev_queues(struct net_device *dev)
6589 {
6590         unsigned int count = dev->num_tx_queues;
6591         struct netdev_queue *tx;
6592         size_t sz = count * sizeof(*tx);
6593
6594         if (count < 1 || count > 0xffff)
6595                 return -EINVAL;
6596
6597         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6598         if (!tx) {
6599                 tx = vzalloc(sz);
6600                 if (!tx)
6601                         return -ENOMEM;
6602         }
6603         dev->_tx = tx;
6604
6605         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6606         spin_lock_init(&dev->tx_global_lock);
6607
6608         return 0;
6609 }
6610
6611 void netif_tx_stop_all_queues(struct net_device *dev)
6612 {
6613         unsigned int i;
6614
6615         for (i = 0; i < dev->num_tx_queues; i++) {
6616                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6617                 netif_tx_stop_queue(txq);
6618         }
6619 }
6620 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6621
6622 /**
6623  *      register_netdevice      - register a network device
6624  *      @dev: device to register
6625  *
6626  *      Take a completed network device structure and add it to the kernel
6627  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6628  *      chain. 0 is returned on success. A negative errno code is returned
6629  *      on a failure to set up the device, or if the name is a duplicate.
6630  *
6631  *      Callers must hold the rtnl semaphore. You may want
6632  *      register_netdev() instead of this.
6633  *
6634  *      BUGS:
6635  *      The locking appears insufficient to guarantee two parallel registers
6636  *      will not get the same name.
6637  */
6638
6639 int register_netdevice(struct net_device *dev)
6640 {
6641         int ret;
6642         struct net *net = dev_net(dev);
6643
6644         BUG_ON(dev_boot_phase);
6645         ASSERT_RTNL();
6646
6647         might_sleep();
6648
6649         /* When net_devices are persistent, this will be fatal. */
6650         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6651         BUG_ON(!net);
6652
6653         spin_lock_init(&dev->addr_list_lock);
6654         netdev_set_addr_lockdep_class(dev);
6655
6656         ret = dev_get_valid_name(net, dev, dev->name);
6657         if (ret < 0)
6658                 goto out;
6659
6660         /* Init, if this function is available */
6661         if (dev->netdev_ops->ndo_init) {
6662                 ret = dev->netdev_ops->ndo_init(dev);
6663                 if (ret) {
6664                         if (ret > 0)
6665                                 ret = -EIO;
6666                         goto out;
6667                 }
6668         }
6669
6670         if (((dev->hw_features | dev->features) &
6671              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6672             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6673              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6674                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6675                 ret = -EINVAL;
6676                 goto err_uninit;
6677         }
6678
6679         ret = -EBUSY;
6680         if (!dev->ifindex)
6681                 dev->ifindex = dev_new_index(net);
6682         else if (__dev_get_by_index(net, dev->ifindex))
6683                 goto err_uninit;
6684
6685         /* Transfer changeable features to wanted_features and enable
6686          * software offloads (GSO and GRO).
6687          */
6688         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6689         dev->features |= NETIF_F_SOFT_FEATURES;
6690         dev->wanted_features = dev->features & dev->hw_features;
6691
6692         if (!(dev->flags & IFF_LOOPBACK)) {
6693                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6694         }
6695
6696         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6697          */
6698         dev->vlan_features |= NETIF_F_HIGHDMA;
6699
6700         /* Make NETIF_F_SG inheritable to tunnel devices.
6701          */
6702         dev->hw_enc_features |= NETIF_F_SG;
6703
6704         /* Make NETIF_F_SG inheritable to MPLS.
6705          */
6706         dev->mpls_features |= NETIF_F_SG;
6707
6708         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6709         ret = notifier_to_errno(ret);
6710         if (ret)
6711                 goto err_uninit;
6712
6713         ret = netdev_register_kobject(dev);
6714         if (ret)
6715                 goto err_uninit;
6716         dev->reg_state = NETREG_REGISTERED;
6717
6718         __netdev_update_features(dev);
6719
6720         /*
6721          *      Default initial state at registration is that the
6722          *      device is present.
6723          */
6724
6725         set_bit(__LINK_STATE_PRESENT, &dev->state);
6726
6727         linkwatch_init_dev(dev);
6728
6729         dev_init_scheduler(dev);
6730         dev_hold(dev);
6731         list_netdevice(dev);
6732         add_device_randomness(dev->dev_addr, dev->addr_len);
6733
6734         /* If the device has a permanent hardware address, the driver
6735          * should set dev_addr, and addr_assign_type should be set to
6736          * NET_ADDR_PERM (the default value).
6737          */
6738         if (dev->addr_assign_type == NET_ADDR_PERM)
6739                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6740
6741         /* Notify protocols, that a new device appeared. */
6742         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6743         ret = notifier_to_errno(ret);
6744         if (ret) {
6745                 rollback_registered(dev);
6746                 dev->reg_state = NETREG_UNREGISTERED;
6747         }
6748         /*
6749          *      Prevent userspace races by waiting until the network
6750          *      device is fully setup before sending notifications.
6751          */
6752         if (!dev->rtnl_link_ops ||
6753             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6754                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6755
6756 out:
6757         return ret;
6758
6759 err_uninit:
6760         if (dev->netdev_ops->ndo_uninit)
6761                 dev->netdev_ops->ndo_uninit(dev);
6762         goto out;
6763 }
6764 EXPORT_SYMBOL(register_netdevice);
6765
6766 /**
6767  *      init_dummy_netdev       - init a dummy network device for NAPI
6768  *      @dev: device to init
6769  *
6770  *      This takes a network device structure and initializes the minimum
6771  *      number of fields so it can be used to schedule NAPI polls without
6772  *      registering a full-blown interface. This is to be used by drivers
6773  *      that need to tie several hardware interfaces to a single NAPI
6774  *      poll scheduler due to HW limitations.
6775  */
6776 int init_dummy_netdev(struct net_device *dev)
6777 {
6778         /* Clear everything. Note we don't initialize spinlocks,
6779          * as they aren't supposed to be taken by any of the
6780          * NAPI code, and this dummy netdev is supposed to be
6781          * only ever used for NAPI polls.
6782          */
6783         memset(dev, 0, sizeof(struct net_device));
6784
6785         /* make sure we BUG if trying to hit standard
6786          * register/unregister code path
6787          */
6788         dev->reg_state = NETREG_DUMMY;
6789
6790         /* NAPI wants this */
6791         INIT_LIST_HEAD(&dev->napi_list);
6792
6793         /* a dummy interface is started by default */
6794         set_bit(__LINK_STATE_PRESENT, &dev->state);
6795         set_bit(__LINK_STATE_START, &dev->state);
6796
6797         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6798          * because users of this 'device' don't need to change
6799          * its refcount.
6800          */
6801
6802         return 0;
6803 }
6804 EXPORT_SYMBOL_GPL(init_dummy_netdev);
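
/*
 * Illustrative sketch (not part of this file): a hypothetical driver that
 * owns hardware queues but no real net_device of its own embeds a dummy
 * netdev purely to host its NAPI context.
 *
 *	struct example_adapter {
 *		struct net_device dummy_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	static void example_setup_napi(struct example_adapter *ad,
 *				       int (*poll)(struct napi_struct *, int))
 *	{
 *		init_dummy_netdev(&ad->dummy_dev);
 *		netif_napi_add(&ad->dummy_dev, &ad->napi, poll, 64);
 *	}
 */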
6805
6806
6807 /**
6808  *      register_netdev - register a network device
6809  *      @dev: device to register
6810  *
6811  *      Take a completed network device structure and add it to the kernel
6812  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6813  *      chain. 0 is returned on success. A negative errno code is returned
6814  *      on a failure to set up the device, or if the name is a duplicate.
6815  *
6816  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6817  *      and expands the device name if you passed a format string to
6818  *      alloc_netdev.
6819  */
6820 int register_netdev(struct net_device *dev)
6821 {
6822         int err;
6823
6824         rtnl_lock();
6825         err = register_netdevice(dev);
6826         rtnl_unlock();
6827         return err;
6828 }
6829 EXPORT_SYMBOL(register_netdev);
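
/*
 * Illustrative sketch (not part of this file): the usual driver probe
 * pattern of allocating an Ethernet device, filling it in, and registering
 * it.  The ops structure and priv type are hypothetical.
 *
 *	static int example_probe(void)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct example_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &example_netdev_ops;
 *		err = register_netdev(dev);
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 */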
6830
6831 int netdev_refcnt_read(const struct net_device *dev)
6832 {
6833         int i, refcnt = 0;
6834
6835         for_each_possible_cpu(i)
6836                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6837         return refcnt;
6838 }
6839 EXPORT_SYMBOL(netdev_refcnt_read);
6840
6841 /**
6842  * netdev_wait_allrefs - wait until all references are gone.
6843  * @dev: target net_device
6844  *
6845  * This is called when unregistering network devices.
6846  *
6847  * Any protocol or device that holds a reference should register
6848  * for netdevice notification, and clean up and put back the
6849  * reference if it receives an UNREGISTER event.
6850  * We can get stuck here if buggy protocols don't correctly
6851  * call dev_put.
6852  */
6853 static void netdev_wait_allrefs(struct net_device *dev)
6854 {
6855         unsigned long rebroadcast_time, warning_time;
6856         int refcnt;
6857
6858         linkwatch_forget_dev(dev);
6859
6860         rebroadcast_time = warning_time = jiffies;
6861         refcnt = netdev_refcnt_read(dev);
6862
6863         while (refcnt != 0) {
6864                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6865                         rtnl_lock();
6866
6867                         /* Rebroadcast unregister notification */
6868                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6869
6870                         __rtnl_unlock();
6871                         rcu_barrier();
6872                         rtnl_lock();
6873
6874                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6875                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6876                                      &dev->state)) {
6877                                 /* We must not have linkwatch events
6878                                  * pending on unregister. If this
6879                                  * happens, we simply run the queue
6880                                  * unscheduled, resulting in a noop
6881                                  * for this device.
6882                                  */
6883                                 linkwatch_run_queue();
6884                         }
6885
6886                         __rtnl_unlock();
6887
6888                         rebroadcast_time = jiffies;
6889                 }
6890
6891                 msleep(250);
6892
6893                 refcnt = netdev_refcnt_read(dev);
6894
6895                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6896                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6897                                  dev->name, refcnt);
6898                         warning_time = jiffies;
6899                 }
6900         }
6901 }
6902
6903 /* The sequence is:
6904  *
6905  *      rtnl_lock();
6906  *      ...
6907  *      register_netdevice(x1);
6908  *      register_netdevice(x2);
6909  *      ...
6910  *      unregister_netdevice(y1);
6911  *      unregister_netdevice(y2);
6912  *      ...
6913  *      rtnl_unlock();
6914  *      free_netdev(y1);
6915  *      free_netdev(y2);
6916  *
6917  * We are invoked by rtnl_unlock().
6918  * This allows us to deal with problems:
6919  * 1) We can delete sysfs objects which invoke hotplug
6920  *    without deadlocking with linkwatch via keventd.
6921  * 2) Since we run with the RTNL semaphore not held, we can sleep
6922  *    safely in order to wait for the netdev refcnt to drop to zero.
6923  *
6924  * We must not return until all unregister events added during
6925  * the interval the lock was held have been completed.
6926  */
6927 void netdev_run_todo(void)
6928 {
6929         struct list_head list;
6930
6931         /* Snapshot list, allow later requests */
6932         list_replace_init(&net_todo_list, &list);
6933
6934         __rtnl_unlock();
6935
6936
6937         /* Wait for rcu callbacks to finish before next phase */
6938         if (!list_empty(&list))
6939                 rcu_barrier();
6940
6941         while (!list_empty(&list)) {
6942                 struct net_device *dev
6943                         = list_first_entry(&list, struct net_device, todo_list);
6944                 list_del(&dev->todo_list);
6945
6946                 rtnl_lock();
6947                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6948                 __rtnl_unlock();
6949
6950                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6951                         pr_err("network todo '%s' but state %d\n",
6952                                dev->name, dev->reg_state);
6953                         dump_stack();
6954                         continue;
6955                 }
6956
6957                 dev->reg_state = NETREG_UNREGISTERED;
6958
6959                 netdev_wait_allrefs(dev);
6960
6961                 /* paranoia */
6962                 BUG_ON(netdev_refcnt_read(dev));
6963                 BUG_ON(!list_empty(&dev->ptype_all));
6964                 BUG_ON(!list_empty(&dev->ptype_specific));
6965                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6966                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6967                 WARN_ON(dev->dn_ptr);
6968
6969                 if (dev->destructor)
6970                         dev->destructor(dev);
6971
6972                 /* Report a network device has been unregistered */
6973                 rtnl_lock();
6974                 dev_net(dev)->dev_unreg_count--;
6975                 __rtnl_unlock();
6976                 wake_up(&netdev_unregistering_wq);
6977
6978                 /* Free network device */
6979                 kobject_put(&dev->dev.kobj);
6980         }
6981 }
6982
6983 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6984  * fields in the same order, with only the type differing.
6985  */
6986 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6987                              const struct net_device_stats *netdev_stats)
6988 {
6989 #if BITS_PER_LONG == 64
6990         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6991         memcpy(stats64, netdev_stats, sizeof(*stats64));
6992 #else
6993         size_t i, n = sizeof(*stats64) / sizeof(u64);
6994         const unsigned long *src = (const unsigned long *)netdev_stats;
6995         u64 *dst = (u64 *)stats64;
6996
6997         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6998                      sizeof(*stats64) / sizeof(u64));
6999         for (i = 0; i < n; i++)
7000                 dst[i] = src[i];
7001 #endif
7002 }
7003 EXPORT_SYMBOL(netdev_stats_to_stats64);
7004
7005 /**
7006  *      dev_get_stats   - get network device statistics
7007  *      @dev: device to get statistics from
7008  *      @storage: place to store stats
7009  *
7010  *      Get network statistics from device. Return @storage.
7011  *      The device driver may provide its own method by setting
7012  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7013  *      otherwise the internal statistics structure is used.
7014  */
7015 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7016                                         struct rtnl_link_stats64 *storage)
7017 {
7018         const struct net_device_ops *ops = dev->netdev_ops;
7019
7020         if (ops->ndo_get_stats64) {
7021                 memset(storage, 0, sizeof(*storage));
7022                 ops->ndo_get_stats64(dev, storage);
7023         } else if (ops->ndo_get_stats) {
7024                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7025         } else {
7026                 netdev_stats_to_stats64(storage, &dev->stats);
7027         }
7028         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7029         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7030         return storage;
7031 }
7032 EXPORT_SYMBOL(dev_get_stats);
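
/*
 * Illustrative sketch (not part of this file): reading aggregate statistics
 * into a caller-provided rtnl_link_stats64, which is how rtnetlink and
 * procfs consume this helper.  The message is purely for illustration.
 *
 *	static void example_dump_stats(struct net_device *dev)
 *	{
 *		struct rtnl_link_stats64 stats;
 *
 *		dev_get_stats(dev, &stats);
 *		netdev_info(dev, "rx %llu tx %llu packets\n",
 *			    stats.rx_packets, stats.tx_packets);
 *	}
 */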
7033
7034 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7035 {
7036         struct netdev_queue *queue = dev_ingress_queue(dev);
7037
7038 #ifdef CONFIG_NET_CLS_ACT
7039         if (queue)
7040                 return queue;
7041         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7042         if (!queue)
7043                 return NULL;
7044         netdev_init_one_queue(dev, queue, NULL);
7045         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7046         queue->qdisc_sleeping = &noop_qdisc;
7047         rcu_assign_pointer(dev->ingress_queue, queue);
7048 #endif
7049         return queue;
7050 }
7051
7052 static const struct ethtool_ops default_ethtool_ops;
7053
7054 void netdev_set_default_ethtool_ops(struct net_device *dev,
7055                                     const struct ethtool_ops *ops)
7056 {
7057         if (dev->ethtool_ops == &default_ethtool_ops)
7058                 dev->ethtool_ops = ops;
7059 }
7060 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7061
7062 void netdev_freemem(struct net_device *dev)
7063 {
7064         char *addr = (char *)dev - dev->padded;
7065
7066         kvfree(addr);
7067 }
7068
7069 /**
7070  *      alloc_netdev_mqs - allocate network device
7071  *      @sizeof_priv:           size of private data to allocate space for
7072  *      @name:                  device name format string
7073  *      @name_assign_type:      origin of device name
7074  *      @setup:                 callback to initialize device
7075  *      @txqs:                  the number of TX subqueues to allocate
7076  *      @rxqs:                  the number of RX subqueues to allocate
7077  *
7078  *      Allocates a struct net_device with private data area for driver use
7079  *      and performs basic initialization.  Also allocates subqueue structs
7080  *      for each queue on the device.
7081  */
7082 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7083                 unsigned char name_assign_type,
7084                 void (*setup)(struct net_device *),
7085                 unsigned int txqs, unsigned int rxqs)
7086 {
7087         struct net_device *dev;
7088         size_t alloc_size;
7089         struct net_device *p;
7090
7091         BUG_ON(strlen(name) >= sizeof(dev->name));
7092
7093         if (txqs < 1) {
7094                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7095                 return NULL;
7096         }
7097
7098 #ifdef CONFIG_SYSFS
7099         if (rxqs < 1) {
7100                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7101                 return NULL;
7102         }
7103 #endif
7104
7105         alloc_size = sizeof(struct net_device);
7106         if (sizeof_priv) {
7107                 /* ensure 32-byte alignment of private area */
7108                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7109                 alloc_size += sizeof_priv;
7110         }
7111         /* ensure 32-byte alignment of whole construct */
7112         alloc_size += NETDEV_ALIGN - 1;
7113
7114         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7115         if (!p)
7116                 p = vzalloc(alloc_size);
7117         if (!p)
7118                 return NULL;
7119
7120         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7121         dev->padded = (char *)dev - (char *)p;
7122
7123         dev->pcpu_refcnt = alloc_percpu(int);
7124         if (!dev->pcpu_refcnt)
7125                 goto free_dev;
7126
7127         if (dev_addr_init(dev))
7128                 goto free_pcpu;
7129
7130         dev_mc_init(dev);
7131         dev_uc_init(dev);
7132
7133         dev_net_set(dev, &init_net);
7134
7135         dev->gso_max_size = GSO_MAX_SIZE;
7136         dev->gso_max_segs = GSO_MAX_SEGS;
7137         dev->gso_min_segs = 0;
7138
7139         INIT_LIST_HEAD(&dev->napi_list);
7140         INIT_LIST_HEAD(&dev->unreg_list);
7141         INIT_LIST_HEAD(&dev->close_list);
7142         INIT_LIST_HEAD(&dev->link_watch_list);
7143         INIT_LIST_HEAD(&dev->adj_list.upper);
7144         INIT_LIST_HEAD(&dev->adj_list.lower);
7145         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7146         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7147         INIT_LIST_HEAD(&dev->ptype_all);
7148         INIT_LIST_HEAD(&dev->ptype_specific);
7149         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7150         setup(dev);
7151
7152         if (!dev->tx_queue_len) {
7153                 dev->priv_flags |= IFF_NO_QUEUE;
7154                 dev->tx_queue_len = 1;
7155         }
7156
7157         dev->num_tx_queues = txqs;
7158         dev->real_num_tx_queues = txqs;
7159         if (netif_alloc_netdev_queues(dev))
7160                 goto free_all;
7161
7162 #ifdef CONFIG_SYSFS
7163         dev->num_rx_queues = rxqs;
7164         dev->real_num_rx_queues = rxqs;
7165         if (netif_alloc_rx_queues(dev))
7166                 goto free_all;
7167 #endif
7168
7169         strcpy(dev->name, name);
7170         dev->name_assign_type = name_assign_type;
7171         dev->group = INIT_NETDEV_GROUP;
7172         if (!dev->ethtool_ops)
7173                 dev->ethtool_ops = &default_ethtool_ops;
7174
7175         nf_hook_ingress_init(dev);
7176
7177         return dev;
7178
7179 free_all:
7180         free_netdev(dev);
7181         return NULL;
7182
7183 free_pcpu:
7184         free_percpu(dev->pcpu_refcnt);
7185 free_dev:
7186         netdev_freemem(dev);
7187         return NULL;
7188 }
7189 EXPORT_SYMBOL(alloc_netdev_mqs);
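
/*
 * Illustrative sketch (not part of this file): allocating a multiqueue
 * Ethernet-style device with 4 TX and 4 RX queues.  NET_NAME_UNKNOWN and
 * ether_setup() are common choices; the "%d" in the name is expanded at
 * registration time.  The priv type is hypothetical.
 *
 *	static struct net_device *example_alloc(void)
 *	{
 *		return alloc_netdev_mqs(sizeof(struct example_priv),
 *					"example%d", NET_NAME_UNKNOWN,
 *					ether_setup, 4, 4);
 *	}
 */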
7190
7191 /**
7192  *      free_netdev - free network device
7193  *      @dev: device
7194  *
7195  *      This function does the last stage of destroying an allocated device
7196  *      interface. The reference to the device object is released.
7197  *      If this is the last reference then it will be freed.
7198  */
7199 void free_netdev(struct net_device *dev)
7200 {
7201         struct napi_struct *p, *n;
7202
7203         netif_free_tx_queues(dev);
7204 #ifdef CONFIG_SYSFS
7205         kvfree(dev->_rx);
7206 #endif
7207
7208         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7209
7210         /* Flush device addresses */
7211         dev_addr_flush(dev);
7212
7213         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7214                 netif_napi_del(p);
7215
7216         free_percpu(dev->pcpu_refcnt);
7217         dev->pcpu_refcnt = NULL;
7218
7219         /*  Compatibility with error handling in drivers */
7220         if (dev->reg_state == NETREG_UNINITIALIZED) {
7221                 netdev_freemem(dev);
7222                 return;
7223         }
7224
7225         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7226         dev->reg_state = NETREG_RELEASED;
7227
7228         /* will free via device release */
7229         put_device(&dev->dev);
7230 }
7231 EXPORT_SYMBOL(free_netdev);
7232
7233 /**
7234  *      synchronize_net -  Synchronize with packet receive processing
7235  *
7236  *      Wait for packets currently being received to be done.
7237  *      Does not block later packets from starting.
7238  */
7239 void synchronize_net(void)
7240 {
7241         might_sleep();
7242         if (rtnl_is_locked())
7243                 synchronize_rcu_expedited();
7244         else
7245                 synchronize_rcu();
7246 }
7247 EXPORT_SYMBOL(synchronize_net);
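
/*
 * Illustrative sketch (not part of this file): unhooking a receive path and
 * waiting for in-flight receivers before freeing associated state.  The
 * packet_type instance and state pointer are hypothetical.
 *
 *	static void example_teardown(struct packet_type *pt, void *state)
 *	{
 *		dev_remove_pack(pt);
 *		synchronize_net();
 *		kfree(state);
 *	}
 */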
7248
7249 /**
7250  *      unregister_netdevice_queue - remove device from the kernel
7251  *      @dev: device
7252  *      @head: list
7253  *
7254  *      This function shuts down a device interface and removes it
7255  *      from the kernel tables.
7256  *      If head is not NULL, the device is queued to be unregistered later.
7257  *
7258  *      Callers must hold the rtnl semaphore.  You may want
7259  *      unregister_netdev() instead of this.
7260  */
7261
7262 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7263 {
7264         ASSERT_RTNL();
7265
7266         if (head) {
7267                 list_move_tail(&dev->unreg_list, head);
7268         } else {
7269                 rollback_registered(dev);
7270                 /* Finish processing unregister after unlock */
7271                 net_set_todo(dev);
7272         }
7273 }
7274 EXPORT_SYMBOL(unregister_netdevice_queue);
7275
7276 /**
7277  *      unregister_netdevice_many - unregister many devices
7278  *      @head: list of devices
7279  *
7280  *  Note: As most callers use a stack-allocated list_head,
7281  *  we force a list_del() to make sure the stack won't be corrupted later.
7282  */
7283 void unregister_netdevice_many(struct list_head *head)
7284 {
7285         struct net_device *dev;
7286
7287         if (!list_empty(head)) {
7288                 rollback_registered_many(head);
7289                 list_for_each_entry(dev, head, unreg_list)
7290                         net_set_todo(dev);
7291                 list_del(head);
7292         }
7293 }
7294 EXPORT_SYMBOL(unregister_netdevice_many);
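
/*
 * Illustrative sketch (not part of this file): batching the teardown of
 * several devices under one RTNL section, as rtnl_link ->dellink handlers
 * commonly do.  The device array is hypothetical.
 *
 *	static void example_destroy_all(struct net_device **devs, int n)
 *	{
 *		LIST_HEAD(list);
 *		int i;
 *
 *		rtnl_lock();
 *		for (i = 0; i < n; i++)
 *			unregister_netdevice_queue(devs[i], &list);
 *		unregister_netdevice_many(&list);
 *		rtnl_unlock();
 *	}
 */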
7295
7296 /**
7297  *      unregister_netdev - remove device from the kernel
7298  *      @dev: device
7299  *
7300  *      This function shuts down a device interface and removes it
7301  *      from the kernel tables.
7302  *
7303  *      This is just a wrapper for unregister_netdevice that takes
7304  *      the rtnl semaphore.  In general you want to use this and not
7305  *      unregister_netdevice.
7306  */
7307 void unregister_netdev(struct net_device *dev)
7308 {
7309         rtnl_lock();
7310         unregister_netdevice(dev);
7311         rtnl_unlock();
7312 }
7313 EXPORT_SYMBOL(unregister_netdev);
7314
7315 /**
7316  *      dev_change_net_namespace - move device to a different network namespace
7317  *      @dev: device
7318  *      @net: network namespace
7319  *      @pat: If not NULL name pattern to try if the current device name
7320  *            is already taken in the destination network namespace.
7321  *
7322  *      This function shuts down a device interface and moves it
7323  *      to a new network namespace. On success 0 is returned, on
7324  *      a failure a negative errno code is returned.
7325  *
7326  *      Callers must hold the rtnl semaphore.
7327  */
7328
7329 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7330 {
7331         int err;
7332
7333         ASSERT_RTNL();
7334
7335         /* Don't allow namespace local devices to be moved. */
7336         err = -EINVAL;
7337         if (dev->features & NETIF_F_NETNS_LOCAL)
7338                 goto out;
7339
7340         /* Ensure the device has been registered */
7341         if (dev->reg_state != NETREG_REGISTERED)
7342                 goto out;
7343
7344         /* Get out if there is nothing to do */
7345         err = 0;
7346         if (net_eq(dev_net(dev), net))
7347                 goto out;
7348
7349         /* Pick the destination device name, and ensure
7350          * we can use it in the destination network namespace.
7351          */
7352         err = -EEXIST;
7353         if (__dev_get_by_name(net, dev->name)) {
7354                 /* We get here if we can't use the current device name */
7355                 if (!pat)
7356                         goto out;
7357                 if (dev_get_valid_name(net, dev, pat) < 0)
7358                         goto out;
7359         }
7360
7361         /*
7362          * And now a mini version of register_netdevice and unregister_netdevice.
7363          */
7364
7365         /* If device is running close it first. */
7366         dev_close(dev);
7367
7368         /* And unlink it from device chain */
7369         err = -ENODEV;
7370         unlist_netdevice(dev);
7371
7372         synchronize_net();
7373
7374         /* Shutdown queueing discipline. */
7375         dev_shutdown(dev);
7376
7377         /* Notify protocols that we are about to destroy
7378          * this device. They should clean up all of their state.
7379          *
7380          * Note that dev->reg_state stays at NETREG_REGISTERED.
7381          * This is wanted because this way 8021q and macvlan know
7382          * the device is just moving and can keep their slaves up.
7383          */
7384         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7385         rcu_barrier();
7386         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7387         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7388
7389         /*
7390          *      Flush the unicast and multicast chains
7391          */
7392         dev_uc_flush(dev);
7393         dev_mc_flush(dev);
7394
7395         /* Send a netdev-removed uevent to the old namespace */
7396         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7397         netdev_adjacent_del_links(dev);
7398
7399         /* Actually switch the network namespace */
7400         dev_net_set(dev, net);
7401
7402         /* If there is an ifindex conflict assign a new one */
7403         if (__dev_get_by_index(net, dev->ifindex))
7404                 dev->ifindex = dev_new_index(net);
7405
7406         /* Send a netdev-add uevent to the new namespace */
7407         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7408         netdev_adjacent_add_links(dev);
7409
7410         /* Fixup kobjects */
7411         err = device_rename(&dev->dev, dev->name);
7412         WARN_ON(err);
7413
7414         /* Add the device back in the hashes */
7415         list_netdevice(dev);
7416
7417         /* Notify protocols, that a new device appeared. */
7418         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7419
7420         /*
7421          *      Prevent userspace races by waiting until the network
7422          *      device is fully setup before sending notifications.
7423          */
7424         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7425
7426         synchronize_net();
7427         err = 0;
7428 out:
7429         return err;
7430 }
7431 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
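
/*
 * Illustrative sketch (not part of this file): moving a device into another
 * namespace while falling back to a "dev%d" style name on collision, which
 * mirrors what default_device_exit() below does when pushing devices back
 * to init_net.  The helper name is hypothetical.
 *
 *	static int example_move(struct net_device *dev, struct net *net)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_change_net_namespace(dev, net, "dev%d");
 *		rtnl_unlock();
 *		return err;
 *	}
 */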
7432
7433 static int dev_cpu_callback(struct notifier_block *nfb,
7434                             unsigned long action,
7435                             void *ocpu)
7436 {
7437         struct sk_buff **list_skb;
7438         struct sk_buff *skb;
7439         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7440         struct softnet_data *sd, *oldsd;
7441
7442         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7443                 return NOTIFY_OK;
7444
7445         local_irq_disable();
7446         cpu = smp_processor_id();
7447         sd = &per_cpu(softnet_data, cpu);
7448         oldsd = &per_cpu(softnet_data, oldcpu);
7449
7450         /* Find end of our completion_queue. */
7451         list_skb = &sd->completion_queue;
7452         while (*list_skb)
7453                 list_skb = &(*list_skb)->next;
7454         /* Append completion queue from offline CPU. */
7455         *list_skb = oldsd->completion_queue;
7456         oldsd->completion_queue = NULL;
7457
7458         /* Append output queue from offline CPU. */
7459         if (oldsd->output_queue) {
7460                 *sd->output_queue_tailp = oldsd->output_queue;
7461                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7462                 oldsd->output_queue = NULL;
7463                 oldsd->output_queue_tailp = &oldsd->output_queue;
7464         }
7465         /* Append NAPI poll list from offline CPU, with one exception :
7466          * process_backlog() must be called by cpu owning percpu backlog.
7467          * We properly handle process_queue & input_pkt_queue later.
7468          */
7469         while (!list_empty(&oldsd->poll_list)) {
7470                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7471                                                             struct napi_struct,
7472                                                             poll_list);
7473
7474                 list_del_init(&napi->poll_list);
7475                 if (napi->poll == process_backlog)
7476                         napi->state = 0;
7477                 else
7478                         ____napi_schedule(sd, napi);
7479         }
7480
7481         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7482         local_irq_enable();
7483
7484         /* Process offline CPU's input_pkt_queue */
7485         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7486                 netif_rx_ni(skb);
7487                 input_queue_head_incr(oldsd);
7488         }
7489         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7490                 netif_rx_ni(skb);
7491                 input_queue_head_incr(oldsd);
7492         }
7493
7494         return NOTIFY_OK;
7495 }
7496
7497
7498 /**
7499  *      netdev_increment_features - increment feature set by one
7500  *      @all: current feature set
7501  *      @one: new feature set
7502  *      @mask: mask feature set
7503  *
7504  *      Computes a new feature set after adding a device with feature set
7505  *      @one to the master device with current feature set @all.  Will not
7506  *      enable anything that is off in @mask. Returns the new feature set.
7507  */
7508 netdev_features_t netdev_increment_features(netdev_features_t all,
7509         netdev_features_t one, netdev_features_t mask)
7510 {
7511         if (mask & NETIF_F_GEN_CSUM)
7512                 mask |= NETIF_F_ALL_CSUM;
7513         mask |= NETIF_F_VLAN_CHALLENGED;
7514
7515         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7516         all &= one | ~NETIF_F_ALL_FOR_ALL;
7517
7518         /* If one device supports hw checksumming, set for all. */
7519         if (all & NETIF_F_GEN_CSUM)
7520                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7521
7522         return all;
7523 }
7524 EXPORT_SYMBOL(netdev_increment_features);
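
/*
 * Illustrative sketch (not part of this file): how a bonding/team style
 * master might fold each slave's feature set into its own, loosely in the
 * spirit of bond_compute_features().  The slave array and starting value
 * are simplifications, not the real bonding logic.
 *
 *	static netdev_features_t example_master_features(netdev_features_t mask,
 *							 struct net_device *slaves[],
 *							 int n)
 *	{
 *		netdev_features_t all = mask;
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			all = netdev_increment_features(all,
 *							slaves[i]->features,
 *							mask);
 *		return all;
 *	}
 */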
7525
7526 static struct hlist_head * __net_init netdev_create_hash(void)
7527 {
7528         int i;
7529         struct hlist_head *hash;
7530
7531         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7532         if (hash != NULL)
7533                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7534                         INIT_HLIST_HEAD(&hash[i]);
7535
7536         return hash;
7537 }
7538
7539 /* Initialize per network namespace state */
7540 static int __net_init netdev_init(struct net *net)
7541 {
7542         if (net != &init_net)
7543                 INIT_LIST_HEAD(&net->dev_base_head);
7544
7545         net->dev_name_head = netdev_create_hash();
7546         if (net->dev_name_head == NULL)
7547                 goto err_name;
7548
7549         net->dev_index_head = netdev_create_hash();
7550         if (net->dev_index_head == NULL)
7551                 goto err_idx;
7552
7553         return 0;
7554
7555 err_idx:
7556         kfree(net->dev_name_head);
7557 err_name:
7558         return -ENOMEM;
7559 }
7560
7561 /**
7562  *      netdev_drivername - network driver for the device
7563  *      @dev: network device
7564  *
7565  *      Determine network driver for device.
7566  */
7567 const char *netdev_drivername(const struct net_device *dev)
7568 {
7569         const struct device_driver *driver;
7570         const struct device *parent;
7571         const char *empty = "";
7572
7573         parent = dev->dev.parent;
7574         if (!parent)
7575                 return empty;
7576
7577         driver = parent->driver;
7578         if (driver && driver->name)
7579                 return driver->name;
7580         return empty;
7581 }
7582
7583 static void __netdev_printk(const char *level, const struct net_device *dev,
7584                             struct va_format *vaf)
7585 {
7586         if (dev && dev->dev.parent) {
7587                 dev_printk_emit(level[1] - '0',
7588                                 dev->dev.parent,
7589                                 "%s %s %s%s: %pV",
7590                                 dev_driver_string(dev->dev.parent),
7591                                 dev_name(dev->dev.parent),
7592                                 netdev_name(dev), netdev_reg_state(dev),
7593                                 vaf);
7594         } else if (dev) {
7595                 printk("%s%s%s: %pV",
7596                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7597         } else {
7598                 printk("%s(NULL net_device): %pV", level, vaf);
7599         }
7600 }
7601
7602 void netdev_printk(const char *level, const struct net_device *dev,
7603                    const char *format, ...)
7604 {
7605         struct va_format vaf;
7606         va_list args;
7607
7608         va_start(args, format);
7609
7610         vaf.fmt = format;
7611         vaf.va = &args;
7612
7613         __netdev_printk(level, dev, &vaf);
7614
7615         va_end(args);
7616 }
7617 EXPORT_SYMBOL(netdev_printk);
7618
7619 #define define_netdev_printk_level(func, level)                 \
7620 void func(const struct net_device *dev, const char *fmt, ...)   \
7621 {                                                               \
7622         struct va_format vaf;                                   \
7623         va_list args;                                           \
7624                                                                 \
7625         va_start(args, fmt);                                    \
7626                                                                 \
7627         vaf.fmt = fmt;                                          \
7628         vaf.va = &args;                                         \
7629                                                                 \
7630         __netdev_printk(level, dev, &vaf);                      \
7631                                                                 \
7632         va_end(args);                                           \
7633 }                                                               \
7634 EXPORT_SYMBOL(func);
7635
7636 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7637 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7638 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7639 define_netdev_printk_level(netdev_err, KERN_ERR);
7640 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7641 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7642 define_netdev_printk_level(netdev_info, KERN_INFO);
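
/*
 * The macro above generates one exported wrapper per log level; a typical
 * (illustrative) call site would be
 *
 *	netdev_err(dev, "failed to map DMA buffer\n");
 *
 * which goes through __netdev_printk() and so gets the same
 * driver/device/interface prefix as netdev_printk().
 */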
7643
7644 static void __net_exit netdev_exit(struct net *net)
7645 {
7646         kfree(net->dev_name_head);
7647         kfree(net->dev_index_head);
7648 }
7649
7650 static struct pernet_operations __net_initdata netdev_net_ops = {
7651         .init = netdev_init,
7652         .exit = netdev_exit,
7653 };
7654
7655 static void __net_exit default_device_exit(struct net *net)
7656 {
7657         struct net_device *dev, *aux;
7658         /*
7659          * Push all migratable network devices back to the
7660          * initial network namespace.
7661          */
7662         rtnl_lock();
7663         for_each_netdev_safe(net, dev, aux) {
7664                 int err;
7665                 char fb_name[IFNAMSIZ];
7666
7667                 /* Ignore unmovable devices (e.g. the loopback device) */
7668                 if (dev->features & NETIF_F_NETNS_LOCAL)
7669                         continue;
7670
7671                 /* Leave virtual devices for the generic cleanup */
7672                 if (dev->rtnl_link_ops)
7673                         continue;
7674
7675                 /* Push remaining network devices to init_net */
7676                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7677                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7678                 if (err) {
7679                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7680                                  __func__, dev->name, err);
7681                         BUG();
7682                 }
7683         }
7684         rtnl_unlock();
7685 }
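
/*
 * Worked example (editorial): when a namespace holding a physical NIC
 * with ifindex 4 exits, the device is handed back to init_net; the
 * fallback name built above means it shows up there as "dev4" if its
 * original name is already taken in init_net.
 */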
7686
7687 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7688 {
7689         /* Return with the rtnl_lock held when there are no network
7690          * devices unregistering in any network namespace in net_list.
7691          */
7692         struct net *net;
7693         bool unregistering;
7694         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7695
7696         add_wait_queue(&netdev_unregistering_wq, &wait);
7697         for (;;) {
7698                 unregistering = false;
7699                 rtnl_lock();
7700                 list_for_each_entry(net, net_list, exit_list) {
7701                         if (net->dev_unreg_count > 0) {
7702                                 unregistering = true;
7703                                 break;
7704                         }
7705                 }
7706                 if (!unregistering)
7707                         break;
7708                 __rtnl_unlock();
7709
7710                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7711         }
7712         remove_wait_queue(&netdev_unregistering_wq, &wait);
7713 }
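
/*
 * Editorial note: the loop above drops the RTNL lock (__rtnl_unlock())
 * before sleeping.  Pending unregistrations need the RTNL lock to make
 * progress, so sleeping while still holding it would prevent
 * dev_unreg_count from ever reaching zero.
 */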
7714
7715 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7716 {
7717         /* At exit, all network devices must be removed from their network
7718          * namespace.  Do this in the reverse order of registration.
7719          * Do this across as many network namespaces as possible to
7720          * improve batching efficiency.
7721          */
7722         struct net_device *dev;
7723         struct net *net;
7724         LIST_HEAD(dev_kill_list);
7725
7726         /* To prevent network device cleanup code from dereferencing
7727          * loopback devices or network devices that have been freed,
7728          * wait here for all pending unregistrations to complete
7729          * before unregistering the loopback device and allowing the
7730          * network namespace to be freed.
7731          *
7732          * The netdev todo list containing all network device
7733          * unregistrations that happen in default_device_exit_batch
7734          * will run in the rtnl_unlock() at the end of
7735          * default_device_exit_batch.
7736          */
7737         rtnl_lock_unregistering(net_list);
7738         list_for_each_entry(net, net_list, exit_list) {
7739                 for_each_netdev_reverse(net, dev) {
7740                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7741                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7742                         else
7743                                 unregister_netdevice_queue(dev, &dev_kill_list);
7744                 }
7745         }
7746         unregister_netdevice_many(&dev_kill_list);
7747         rtnl_unlock();
7748 }
7749
7750 static struct pernet_operations __net_initdata default_device_ops = {
7751         .exit = default_device_exit,
7752         .exit_batch = default_device_exit_batch,
7753 };
7754
7755 /*
7756  *      Initialize the DEV module. At boot time this walks the device list
7757  *      and unhooks any devices that fail to initialize (normally hardware
7758  *      not present), leaving us with a valid list of present and active
7759  *      devices.
7760  */
7761
7762 /*
7763  *       This is called single-threaded during boot, so there is no need
7764  *       to take the rtnl semaphore.
7765  */
7766 static int __init net_dev_init(void)
7767 {
7768         int i, rc = -ENOMEM;
7769
7770         BUG_ON(!dev_boot_phase);
7771
7772         if (dev_proc_init())
7773                 goto out;
7774
7775         if (netdev_kobject_init())
7776                 goto out;
7777
7778         INIT_LIST_HEAD(&ptype_all);
7779         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7780                 INIT_LIST_HEAD(&ptype_base[i]);
7781
7782         INIT_LIST_HEAD(&offload_base);
7783
7784         if (register_pernet_subsys(&netdev_net_ops))
7785                 goto out;
7786
7787         /*
7788          *      Initialise the packet receive queues.
7789          */
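        /*
         * Editorial note: softnet_data is the per-CPU state shared by the
         * RX/TX softirqs.  The backlog NAPI instance set up below
         * (poll = process_backlog) is what netif_rx() feeds for drivers
         * that do not implement NAPI themselves.
         */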
7790
7791         for_each_possible_cpu(i) {
7792                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7793
7794                 skb_queue_head_init(&sd->input_pkt_queue);
7795                 skb_queue_head_init(&sd->process_queue);
7796                 INIT_LIST_HEAD(&sd->poll_list);
7797                 sd->output_queue_tailp = &sd->output_queue;
7798 #ifdef CONFIG_RPS
7799                 sd->csd.func = rps_trigger_softirq;
7800                 sd->csd.info = sd;
7801                 sd->cpu = i;
7802 #endif
7803
7804                 sd->backlog.poll = process_backlog;
7805                 sd->backlog.weight = weight_p;
7806         }
7807
7808         dev_boot_phase = 0;
7809
7810         /* The loopback device is special: if any other network device
7811          * is present in a network namespace, the loopback device must
7812          * be present as well. Since we now dynamically allocate and
7813          * free the loopback device, ensure this invariant is maintained
7814          * by keeping the loopback device as the first device on the
7815          * list of network devices, so that the loopback device is the
7816          * first device that appears and the last network device that
7817          * disappears.
7818          */
7819         if (register_pernet_device(&loopback_net_ops))
7820                 goto out;
7821
7822         if (register_pernet_device(&default_device_ops))
7823                 goto out;
7824
7825         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7826         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7827
7828         hotcpu_notifier(dev_cpu_callback, 0);
7829         dst_subsys_init();
7830         rc = 0;
7831 out:
7832         return rc;
7833 }
7834
7835 subsys_initcall(net_dev_init);
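
/*
 * Editorial note: subsys_initcall() places net_dev_init() at initcall
 * level 4, ahead of device_initcall() (level 6), so the core networking
 * state set up here exists before ordinary network drivers start
 * registering devices.
 */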