9d9b78e97b71e7406fbdb31b3bd840f9133fbad3
[firefly-linux-kernel-4.4.55.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if device has no dev->hard_header routine, it adds and removes ll header
90      inside itself. In this case ll header is invisible outside of device,
91      but higher levels still should reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the
93      header will not fit in the reserved space (tunnels); others are
94      not so clever (PPP).
95    - packet socket receives packets with pulled ll header,
96      so that SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely that it points to the ll
111                  header.  PPP does this, which is wrong, because it introduces
112                  asymmetry between the rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Resume
120   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position,
135    packet classifier depends on it.
136  */
137
138 /* Private packet socket structures. */
139
/* One multicast/promiscuous membership held by a packet socket, kept in
 * a singly linked list hanging off packet_sock->mclist. */
struct packet_mclist {
	struct packet_mclist	*next;		/* next membership in the socket's list */
	int			ifindex;	/* device this membership applies to */
	int			count;		/* reference count for duplicate adds */
	unsigned short		type;		/* membership type (presumably PACKET_MR_*; confirm against setsockopt path) */
	unsigned short		alen;		/* number of valid bytes in addr[] */
	unsigned char		addr[MAX_ADDR_LEN];	/* hardware address for the membership */
};
/* identical to struct packet_mreq except it has
 * a longer address field, allowing hardware addresses
 * up to MAX_ADDR_LEN bytes.
 */
struct packet_mreq_max {
	int		mr_ifindex;	/* interface index */
	unsigned short	mr_type;	/* membership type */
	unsigned short	mr_alen;	/* valid length of mr_address[] */
	unsigned char	mr_address[MAX_ADDR_LEN];	/* hardware address */
};
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
160                 int closing, int tx_ring);
161
/* Descriptor for one mmap()ed RX or TX ring (CONFIG_PACKET_MMAP).
 * The ring is a vector of page blocks, each block holding a whole
 * number of fixed-size frames. */
struct packet_ring_buffer {
	char			**pg_vec;	/* array of page-block virtual addresses */
	unsigned int		head;		/* index of the next frame slot to use */
	unsigned int		frames_per_block;	/* frames in each pg_vec block */
	unsigned int		frame_size;	/* bytes per frame (tpacket header + data) */
	unsigned int		frame_max;	/* highest valid frame index (count - 1) */

	unsigned int		pg_vec_order;	/* page allocation order of each block */
	unsigned int		pg_vec_pages;	/* pages per block */
	unsigned int		pg_vec_len;	/* number of blocks in pg_vec */

	atomic_t		pending;	/* TX frames handed to the stack, not yet destructed */
};
175
176 struct packet_sock;
177 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
178 #endif
179
180 static void packet_flush_mclist(struct sock *sk);
181
/* Protocol-private state of a PF_PACKET socket. */
struct packet_sock {
	/* struct sock has to be the first member of packet_sock
	 * so that pkt_sk() can simply cast. */
	struct sock		sk;
	struct tpacket_stats	stats;		/* tp_packets/tp_drops, guarded by sk_receive_queue.lock */
#ifdef CONFIG_PACKET_MMAP
	struct packet_ring_buffer	rx_ring;	/* mmap()ed receive ring */
	struct packet_ring_buffer	tx_ring;	/* mmap()ed transmit ring */
	int			copy_thresh;	/* if set, also queue a full copy of packets truncated in the ring */
#endif
	struct packet_type	prot_hook;	/* receive hook registered with the stack */
	spinlock_t		bind_lock;	/* protects binding / prot_hook changes */
	struct mutex		pg_vec_lock;	/* serializes ring setup and the tx path */
	unsigned int		running:1,	/* prot_hook is attached*/
				auxdata:1,	/* PACKET_AUXDATA requested */
				origdev:1;	/* report orig_dev->ifindex instead of dev->ifindex */
	int			ifindex;	/* bound device		*/
	__be16			num;		/* bound protocol number (network byte order) */
	struct packet_mclist	*mclist;	/* list of multicast memberships */
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;		/* ring is currently mmap()ed into user space */
	enum tpacket_versions	tp_version;	/* TPACKET_V1 or TPACKET_V2 header layout */
	unsigned int		tp_hdrlen;	/* size of the per-frame tpacket header */
	unsigned int		tp_reserve;	/* user-requested extra headroom per frame */
	unsigned int		tp_loss:1;	/* PACKET_LOSS setting (used by the tx path; semantics not visible here) */
#endif
};
208
/* Scratch data stored in skb->cb while an skb sits on a packet
 * socket's receive queue. */
struct packet_skb_cb {
	unsigned int origlen;		/* packet length before pskb_trim() to snaplen */
	union {				/* address returned to recvmsg(), by socket flavor */
		struct sockaddr_pkt pkt;	/* SOCK_PACKET sockets */
		struct sockaddr_ll ll;		/* AF_PACKET sockets */
	} sa;
};

/* Access the packet cb area of an skb; the sockaddr_ll variant may
 * overflow cb[] by up to MAX_ADDR_LEN - 8 bytes (see BUILD_BUG_ON in
 * packet_rcv). */
#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
218
219 #ifdef CONFIG_PACKET_MMAP
220
221 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
222 {
223         union {
224                 struct tpacket_hdr *h1;
225                 struct tpacket2_hdr *h2;
226                 void *raw;
227         } h;
228
229         h.raw = frame;
230         switch (po->tp_version) {
231         case TPACKET_V1:
232                 h.h1->tp_status = status;
233                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
234                 break;
235         case TPACKET_V2:
236                 h.h2->tp_status = status;
237                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
238                 break;
239         default:
240                 pr_err("TPACKET version not supported\n");
241                 BUG();
242         }
243
244         smp_wmb();
245 }
246
247 static int __packet_get_status(struct packet_sock *po, void *frame)
248 {
249         union {
250                 struct tpacket_hdr *h1;
251                 struct tpacket2_hdr *h2;
252                 void *raw;
253         } h;
254
255         smp_rmb();
256
257         h.raw = frame;
258         switch (po->tp_version) {
259         case TPACKET_V1:
260                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
261                 return h.h1->tp_status;
262         case TPACKET_V2:
263                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
264                 return h.h2->tp_status;
265         default:
266                 pr_err("TPACKET version not supported\n");
267                 BUG();
268                 return 0;
269         }
270 }
271
272 static void *packet_lookup_frame(struct packet_sock *po,
273                 struct packet_ring_buffer *rb,
274                 unsigned int position,
275                 int status)
276 {
277         unsigned int pg_vec_pos, frame_offset;
278         union {
279                 struct tpacket_hdr *h1;
280                 struct tpacket2_hdr *h2;
281                 void *raw;
282         } h;
283
284         pg_vec_pos = position / rb->frames_per_block;
285         frame_offset = position % rb->frames_per_block;
286
287         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
288
289         if (status != __packet_get_status(po, h.raw))
290                 return NULL;
291
292         return h.raw;
293 }
294
/* Frame at the ring head, or NULL if its status does not match. */
static inline void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
301
/* Frame just before the ring head (wrapping to the last slot,
 * frame_max, when the head is at slot 0), or NULL if its status does
 * not match. */
static inline void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}
309
/* Advance the ring head by one slot, wrapping to 0 after frame_max. */
static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
314
315 #endif
316
/* Downcast a struct sock to its containing packet_sock; valid because
 * struct sock is the first member of struct packet_sock. */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
321
/*
 * sk->sk_destruct hook: sanity-check that the socket is dead and has
 * no receive/send memory still accounted to it before it is freed.
 */
static void packet_sock_destruct(struct sock *sk)
{
	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Refuse to tear down a socket that is still alive. */
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}
334
335
336 static const struct proto_ops packet_ops;
337
338 static const struct proto_ops packet_ops_spkt;
339
/*
 * Receive handler for SOCK_PACKET sockets: record the originating
 * device's identity in the skb's cb area, restore the link-layer
 * header, and queue the packet on the socket's receive queue.
 * Always returns 0; packets that cannot be delivered are freed.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	/* Only deliver within the socket's network namespace. */
	if (dev_net(dev) != sock_net(sk))
		goto out;

	/* We are about to mangle cb/data below; get a private copy if
	 * the skb is shared (NULL on allocation failure). */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	/* Restore the link-layer header in front of skb->data. */
	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
405
406
407 /*
408  *      Output a raw packet to a device layer. This bypasses all the other
409  *      protocol layers and you must therefore supply it with a complete frame
410  */
411
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413                                struct msghdr *msg, size_t len)
414 {
415         struct sock *sk = sock->sk;
416         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
417         struct sk_buff *skb;
418         struct net_device *dev;
419         __be16 proto = 0;
420         int err;
421
422         /*
423          *      Get and verify the address.
424          */
425
426         if (saddr) {
427                 if (msg->msg_namelen < sizeof(struct sockaddr))
428                         return -EINVAL;
429                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430                         proto = saddr->spkt_protocol;
431         } else
432                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
433
434         /*
435          *      Find the device first to size check it
436          */
437
438         saddr->spkt_device[13] = 0;
439         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
440         err = -ENODEV;
441         if (dev == NULL)
442                 goto out_unlock;
443
444         err = -ENETDOWN;
445         if (!(dev->flags & IFF_UP))
446                 goto out_unlock;
447
448         /*
449          * You may not queue a frame bigger than the mtu. This is the lowest level
450          * raw protocol and you must do your own fragmentation at this level.
451          */
452
453         err = -EMSGSIZE;
454         if (len > dev->mtu + dev->hard_header_len)
455                 goto out_unlock;
456
457         err = -ENOBUFS;
458         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
459
460         /*
461          * If the write buffer is full, then tough. At this level the user
462          * gets to deal with the problem - do your own algorithmic backoffs.
463          * That's far more flexible.
464          */
465
466         if (skb == NULL)
467                 goto out_unlock;
468
469         /*
470          *      Fill it in
471          */
472
473         /* FIXME: Save some space for broken drivers that write a
474          * hard header at transmission time by themselves. PPP is the
475          * notable one here. This should really be fixed at the driver level.
476          */
477         skb_reserve(skb, LL_RESERVED_SPACE(dev));
478         skb_reset_network_header(skb);
479
480         /* Try to align data part correctly */
481         if (dev->header_ops) {
482                 skb->data -= dev->hard_header_len;
483                 skb->tail -= dev->hard_header_len;
484                 if (len < dev->hard_header_len)
485                         skb_reset_network_header(skb);
486         }
487
488         /* Returns -EFAULT on error */
489         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
490         skb->protocol = proto;
491         skb->dev = dev;
492         skb->priority = sk->sk_priority;
493         if (err)
494                 goto out_free;
495
496         /*
497          *      Now send it
498          */
499
500         dev_queue_xmit(skb);
501         dev_put(dev);
502         return len;
503
504 out_free:
505         kfree_skb(skb);
506 out_unlock:
507         if (dev)
508                 dev_put(dev);
509         return err;
510 }
511
/*
 * Run the socket's attached BPF filter (if any) over @skb under
 * rcu_read_lock_bh() and return the resulting snap length; @res is
 * returned unchanged when no filter is attached (0 means "drop").
 */
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}
525
526 /*
527    This function makes lazy skb cloning in hope that most of packets
528    are discarded by BPF.
529
530    Note tricky part: we DO mangle shared skb! skb->data, skb->len
531    and skb->cb are mangled. It works because (and until) packets
532    falling here are owned by current CPU. Output packets are cloned
533    by dev_queue_xmit_nit(), input packets are processed by net_bh
534    sequencially, so that if we return skb to original state on exit,
535    we will not harm anyone.
536  */
537
538 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
539                       struct packet_type *pt, struct net_device *orig_dev)
540 {
541         struct sock *sk;
542         struct sockaddr_ll *sll;
543         struct packet_sock *po;
544         u8 *skb_head = skb->data;
545         int skb_len = skb->len;
546         unsigned int snaplen, res;
547
548         if (skb->pkt_type == PACKET_LOOPBACK)
549                 goto drop;
550
551         sk = pt->af_packet_priv;
552         po = pkt_sk(sk);
553
554         if (dev_net(dev) != sock_net(sk))
555                 goto drop;
556
557         skb->dev = dev;
558
559         if (dev->header_ops) {
560                 /* The device has an explicit notion of ll header,
561                    exported to higher levels.
562
563                    Otherwise, the device hides datails of it frame
564                    structure, so that corresponding packet head
565                    never delivered to user.
566                  */
567                 if (sk->sk_type != SOCK_DGRAM)
568                         skb_push(skb, skb->data - skb_mac_header(skb));
569                 else if (skb->pkt_type == PACKET_OUTGOING) {
570                         /* Special case: outgoing packets have ll header at head */
571                         skb_pull(skb, skb_network_offset(skb));
572                 }
573         }
574
575         snaplen = skb->len;
576
577         res = run_filter(skb, sk, snaplen);
578         if (!res)
579                 goto drop_n_restore;
580         if (snaplen > res)
581                 snaplen = res;
582
583         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
584             (unsigned)sk->sk_rcvbuf)
585                 goto drop_n_acct;
586
587         if (skb_shared(skb)) {
588                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
589                 if (nskb == NULL)
590                         goto drop_n_acct;
591
592                 if (skb_head != skb->data) {
593                         skb->data = skb_head;
594                         skb->len = skb_len;
595                 }
596                 kfree_skb(skb);
597                 skb = nskb;
598         }
599
600         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
601                      sizeof(skb->cb));
602
603         sll = &PACKET_SKB_CB(skb)->sa.ll;
604         sll->sll_family = AF_PACKET;
605         sll->sll_hatype = dev->type;
606         sll->sll_protocol = skb->protocol;
607         sll->sll_pkttype = skb->pkt_type;
608         if (unlikely(po->origdev))
609                 sll->sll_ifindex = orig_dev->ifindex;
610         else
611                 sll->sll_ifindex = dev->ifindex;
612
613         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
614
615         PACKET_SKB_CB(skb)->origlen = skb->len;
616
617         if (pskb_trim(skb, snaplen))
618                 goto drop_n_acct;
619
620         skb_set_owner_r(skb, sk);
621         skb->dev = NULL;
622         skb_dst_drop(skb);
623
624         /* drop conntrack reference */
625         nf_reset(skb);
626
627         spin_lock(&sk->sk_receive_queue.lock);
628         po->stats.tp_packets++;
629         __skb_queue_tail(&sk->sk_receive_queue, skb);
630         spin_unlock(&sk->sk_receive_queue.lock);
631         sk->sk_data_ready(sk, skb->len);
632         return 0;
633
634 drop_n_acct:
635         spin_lock(&sk->sk_receive_queue.lock);
636         po->stats.tp_drops++;
637         spin_unlock(&sk->sk_receive_queue.lock);
638
639 drop_n_restore:
640         if (skb_head != skb->data && skb_shared(skb)) {
641                 skb->data = skb_head;
642                 skb->len = skb_len;
643         }
644 drop:
645         consume_skb(skb);
646         return 0;
647 }
648
649 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive handler for mmap()ed AF_PACKET sockets (TPACKET_V1/V2):
 * copy each packet that passes the filter into the next free frame of
 * the RX ring and publish it as TP_STATUS_USER so user space can read
 * it without a syscall.  Always returns 0; packets that do not fit
 * are accounted in tp_drops.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;	/* original data/len, restored for shared skbs */
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	/* Only deliver within the socket's network namespace. */
	if (dev_net(dev) != sock_net(sk))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	/* Tell user space the checksum has not been finalized yet. */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	/* Compute where the MAC and network headers land inside the
	 * ring frame, honouring the user's tp_reserve headroom. */
	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}

	/* Packet does not fit in one frame: truncate it, and (when
	 * copy_thresh is set and rcvbuf allows) also queue a full copy
	 * on the receive queue, flagged TP_STATUS_COPY. */
	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	/* The receive-queue lock also guards the ring head and stats. */
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	/* Fill in the version-specific tpacket frame header. */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		/* Prefer the skb's hardware/stack timestamp if present. */
		if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = skb->vlan_tci;
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	/* Link-layer address info follows the frame header. */
	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	/* Publish the frame to user space (sets tp_status + barrier). */
	__packet_set_status(po, h.raw, status);
	smp_mb();
	{
		/* Flush every page the frame touches for non-coherent
		 * caches, so user space sees the copied data. */
		struct page *p_start, *p_end;
		u8 *h_end = h.raw + macoff + snaplen - 1;

		p_start = virt_to_page(h.raw);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	/* Wake the reader anyway so it can drain the full ring. */
	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
820
/*
 * Destructor for skbs built from the TX ring: once the stack is done
 * with the skb, hand the ring frame back to user space by setting
 * TP_STATUS_AVAILABLE and drop the ring's pending (in-flight) count,
 * then release the send-buffer accounting via sock_wfree().
 */
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		/* The originating ring frame was stashed in the skb. */
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	sock_wfree(skb);
}
838
/*
 * Build an skb whose payload is the user data of one TX-ring frame.
 * Only the link-layer header is copied into the skb's linear area;
 * the remaining payload pages of the ring frame are attached as
 * referenced fragments (no copy).  Returns the frame's tp_len on
 * success or a negative errno.
 */
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	/* Remember the ring frame so tpacket_destruct_skb() can mark it
	 * available again once the skb is freed. */
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	/* User data starts after the tpacket header area, minus the
	 * sockaddr_ll slot that precedes it in the frame layout. */
	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		/* Let the device build the ll header from @addr. */
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		/* Copy the user-supplied ll header into the linear area. */
		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	err = -EFAULT;
	page = virt_to_page(data);
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	/* Account the fragment bytes against the socket's send buffer. */
	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	/* Attach the payload pages as skb fragments, one page at a time. */
	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		flush_dcache_page(page);
		get_page(page);	/* fragment holds its own page reference */
		skb_fill_page_desc(skb,
				nr_frags,
				page++, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}
936
/*
 * Transmit every frame in the TX ring that user space has marked
 * TP_STATUS_SEND_REQUEST.  Frames move SEND_REQUEST -> SENDING here;
 * tpacket_destruct_skb() is expected to make them AVAILABLE once the
 * skb is released.  Returns total bytes queued, or a negative errno.
 * Serialized against ring reconfiguration by po->pg_vec_lock.
 */
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct socket *sock;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int ifindex, err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = 0;

	sock = po->sk.sk_socket;

	mutex_lock(&po->pg_vec_lock);

	err = -EBUSY;
	if (saddr == NULL) {
		/* No explicit destination: use the socket's binding. */
		ifindex = po->ifindex;
		proto   = po->num;
		addr    = NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		/* The variable-length hardware address must fit too. */
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		ifindex = saddr->sll_ifindex;
		proto   = saddr->sll_protocol;
		addr    = saddr->sll_addr;
	}

	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;

	reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	/* Largest payload a ring frame can carry, capped at MTU + header. */
	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
				TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			/* No frame ready: yield the CPU and poll again. */
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
				LL_ALLOCATED_SPACE(dev)
				+ sizeof(struct sockaddr_ll),
				0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
				addr);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				/* PACKET_LOSS set: drop the bad frame and go on. */
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				/* Otherwise report the malformed frame and stop. */
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
		/*
		 * Loop while frames remain, or (blocking mode) while
		 * already-queued frames are still pending completion.
		 */
	} while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
					&& (atomic_read(&po->tx_ring.pending))))
	      );

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
1063 #endif
1064
/*
 * Ordinary (non-mmap) transmit path for packet sockets: copy the
 * payload from user space into a freshly allocated skb and queue it
 * on the chosen device.  Returns the byte count sent, or -errno.
 */
static int packet_snd(struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		/* No explicit destination: fall back to the bound device. */
		ifindex = po->ifindex;
		proto   = po->num;
		addr    = NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex = saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(sock_net(sk), ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	/* SOCK_RAW callers supply the link-layer header themselves. */
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	err = -EMSGSIZE;
	if (len > dev->mtu+reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	err = -EINVAL;
	/* For SOCK_DGRAM the kernel builds the link-layer header. */
	if (sock->type == SOCK_DGRAM &&
	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	/*
	 *	Now send it
	 */

	/* NOTE: dev_queue_xmit() takes over the skb; no free on this path. */
	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
1155
1156 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1157                 struct msghdr *msg, size_t len)
1158 {
1159 #ifdef CONFIG_PACKET_MMAP
1160         struct sock *sk = sock->sk;
1161         struct packet_sock *po = pkt_sk(sk);
1162         if (po->tx_ring.pg_vec)
1163                 return tpacket_snd(po, msg);
1164         else
1165 #endif
1166                 return packet_snd(sock, msg, len);
1167 }
1168
1169 /*
1170  *      Close a PACKET socket. This is fairly simple. We immediately go
1171  *      to 'closed' state and remove our protocol entry in the device list.
1172  */
1173
/*
 * Release a packet socket: unlink it from the per-namespace list,
 * detach the receive hook, drop memberships, tear down any mapped
 * rings, then orphan and put the sock.  Always returns 0.
 */
static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
#ifdef CONFIG_PACKET_MMAP
	struct tpacket_req req;
#endif

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	/* Unlink from the per-namespace packet socket list. */
	write_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	write_unlock_bh(&net->packet.sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);		/* drop the reference the hook held */
	}

	packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
	/* A zeroed request tells packet_set_ring() to free the ring. */
	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}
1235
1236 /*
1237  *      Attach a packet hook.
1238  */
1239
/*
 * Attach the socket's protocol hook to @dev (or all devices when
 * @dev is NULL) for @protocol.  Any existing hook is detached first.
 * A protocol of 0 leaves the socket unbound.  Always returns 0.
 */
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		/* bind_lock is dropped across dev_remove_pack() */
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	/* Protocol 0 means "unbound": leave the hook detached. */
	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	} else {
		/* Device is down: report ENETDOWN asynchronously. */
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
1283
1284 /*
1285  *      Bind a packet socket to a device
1286  */
1287
1288 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1289                             int addr_len)
1290 {
1291         struct sock *sk = sock->sk;
1292         char name[15];
1293         struct net_device *dev;
1294         int err = -ENODEV;
1295
1296         /*
1297          *      Check legality
1298          */
1299
1300         if (addr_len != sizeof(struct sockaddr))
1301                 return -EINVAL;
1302         strlcpy(name, uaddr->sa_data, sizeof(name));
1303
1304         dev = dev_get_by_name(sock_net(sk), name);
1305         if (dev) {
1306                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1307                 dev_put(dev);
1308         }
1309         return err;
1310 }
1311
1312 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1313 {
1314         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1315         struct sock *sk = sock->sk;
1316         struct net_device *dev = NULL;
1317         int err;
1318
1319
1320         /*
1321          *      Check legality
1322          */
1323
1324         if (addr_len < sizeof(struct sockaddr_ll))
1325                 return -EINVAL;
1326         if (sll->sll_family != AF_PACKET)
1327                 return -EINVAL;
1328
1329         if (sll->sll_ifindex) {
1330                 err = -ENODEV;
1331                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1332                 if (dev == NULL)
1333                         goto out;
1334         }
1335         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1336         if (dev)
1337                 dev_put(dev);
1338
1339 out:
1340         return err;
1341 }
1342
/* Protocol object backing every PF_PACKET sock allocation. */
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
1348
1349 /*
1350  *      Create a packet of type SOCK_PACKET.
1351  */
1352
/*
 * Create a PF_PACKET socket.  SOCK_RAW/SOCK_DGRAM use packet_ops and
 * packet_rcv; the legacy SOCK_PACKET type uses the *_spkt variants.
 * Requires CAP_NET_RAW.  A non-zero protocol registers the receive
 * hook immediately.  Returns 0 or a negative errno.
 */
static int packet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		/* Start receiving right away; the hook holds a sock ref. */
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	/* Make the new socket visible on the per-namespace list. */
	write_lock_bh(&net->packet.sklist_lock);
	sk_add_node(sk, &net->packet.sklist);
	sock_prot_inuse_add(net, &packet_proto, 1);
	write_unlock_bh(&net->packet.sklist_lock);
	return 0;
out:
	return err;
}
1414
1415 /*
1416  *      Pull a packet from our receive queue and hand it to the user.
1417  *      If necessary we block.
1418  */
1419
/*
 * Pull one packet from the receive queue and copy it to user space.
 * Fills in msg_name with the stored link-layer source address and,
 * when PACKET_AUXDATA is enabled, attaches a tpacket_auxdata cmsg.
 * Returns the number of bytes delivered (or the full packet length
 * under MSG_TRUNC), or a negative errno.
 */
static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		/* Mirror the metadata a TPACKET ring frame would carry. */
		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		aux.tp_vlan_tci = skb->vlan_tci;

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
1516
1517 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1518                                int *uaddr_len, int peer)
1519 {
1520         struct net_device *dev;
1521         struct sock *sk = sock->sk;
1522
1523         if (peer)
1524                 return -EOPNOTSUPP;
1525
1526         uaddr->sa_family = AF_PACKET;
1527         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1528         if (dev) {
1529                 strncpy(uaddr->sa_data, dev->name, 14);
1530                 dev_put(dev);
1531         } else
1532                 memset(uaddr->sa_data, 0, 14);
1533         *uaddr_len = sizeof(*uaddr);
1534
1535         return 0;
1536 }
1537
1538 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1539                           int *uaddr_len, int peer)
1540 {
1541         struct net_device *dev;
1542         struct sock *sk = sock->sk;
1543         struct packet_sock *po = pkt_sk(sk);
1544         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1545
1546         if (peer)
1547                 return -EOPNOTSUPP;
1548
1549         sll->sll_family = AF_PACKET;
1550         sll->sll_ifindex = po->ifindex;
1551         sll->sll_protocol = po->num;
1552         sll->sll_pkttype = 0;
1553         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1554         if (dev) {
1555                 sll->sll_hatype = dev->type;
1556                 sll->sll_halen = dev->addr_len;
1557                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1558                 dev_put(dev);
1559         } else {
1560                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1561                 sll->sll_halen = 0;
1562         }
1563         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1564
1565         return 0;
1566 }
1567
1568 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1569                          int what)
1570 {
1571         switch (i->type) {
1572         case PACKET_MR_MULTICAST:
1573                 if (what > 0)
1574                         return dev_mc_add(dev, i->addr, i->alen, 0);
1575                 else
1576                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1577                 break;
1578         case PACKET_MR_PROMISC:
1579                 return dev_set_promiscuity(dev, what);
1580                 break;
1581         case PACKET_MR_ALLMULTI:
1582                 return dev_set_allmulti(dev, what);
1583                 break;
1584         case PACKET_MR_UNICAST:
1585                 if (what > 0)
1586                         return dev_unicast_add(dev, i->addr);
1587                 else
1588                         return dev_unicast_delete(dev, i->addr);
1589                 break;
1590         default:
1591                 break;
1592         }
1593         return 0;
1594 }
1595
1596 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1597 {
1598         for ( ; i; i = i->next) {
1599                 if (i->ifindex == dev->ifindex)
1600                         packet_dev_mc(dev, i, what);
1601         }
1602 }
1603
/*
 * PACKET_ADD_MEMBERSHIP: record a membership request on the socket's
 * list and apply it to the device.  A duplicate request only bumps
 * the existing entry's refcount.  Runs under the RTNL.  Returns 0 or
 * a negative errno.
 */
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	/* Allocate up front so the duplicate scan can't fail late. */
	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			/* Already present: just take another reference. */
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		/* Device rejected it: unlink and free the new entry. */
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}
1657
/*
 * PACKET_DROP_MEMBERSHIP: drop one reference on the matching list
 * entry; when the count reaches zero, unlink it and revert the
 * device state.  Returns 0 on match, -EADDRNOTAVAIL otherwise.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	/* mlp tracks the link pointing at ml so unlinking is O(1). */
	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
1686
1687 static void packet_flush_mclist(struct sock *sk)
1688 {
1689         struct packet_sock *po = pkt_sk(sk);
1690         struct packet_mclist *ml;
1691
1692         if (!po->mclist)
1693                 return;
1694
1695         rtnl_lock();
1696         while ((ml = po->mclist) != NULL) {
1697                 struct net_device *dev;
1698
1699                 po->mclist = ml->next;
1700                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1701                 if (dev != NULL) {
1702                         packet_dev_mc(dev, ml, -1);
1703                         dev_put(dev);
1704                 }
1705                 kfree(ml);
1706         }
1707         rtnl_unlock();
1708 }
1709
/*
 * Handle SOL_PACKET setsockopt(): memberships, mmap ring setup and
 * tuning (version/reserve/loss/copy-threshold), and the auxdata and
 * origdev flags.  Returns 0 or a negative errno.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		/* The claimed hardware address must fit in what was copied. */
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		/* The frame format cannot change while a ring exists. */
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
#endif
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
1840
1841 static int packet_getsockopt(struct socket *sock, int level, int optname,
1842                              char __user *optval, int __user *optlen)
1843 {
1844         int len;
1845         int val;
1846         struct sock *sk = sock->sk;
1847         struct packet_sock *po = pkt_sk(sk);
1848         void *data;
1849         struct tpacket_stats st;
1850
1851         if (level != SOL_PACKET)
1852                 return -ENOPROTOOPT;
1853
1854         if (get_user(len, optlen))
1855                 return -EFAULT;
1856
1857         if (len < 0)
1858                 return -EINVAL;
1859
1860         switch (optname) {
1861         case PACKET_STATISTICS:
1862                 if (len > sizeof(struct tpacket_stats))
1863                         len = sizeof(struct tpacket_stats);
1864                 spin_lock_bh(&sk->sk_receive_queue.lock);
1865                 st = po->stats;
1866                 memset(&po->stats, 0, sizeof(st));
1867                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1868                 st.tp_packets += st.tp_drops;
1869
1870                 data = &st;
1871                 break;
1872         case PACKET_AUXDATA:
1873                 if (len > sizeof(int))
1874                         len = sizeof(int);
1875                 val = po->auxdata;
1876
1877                 data = &val;
1878                 break;
1879         case PACKET_ORIGDEV:
1880                 if (len > sizeof(int))
1881                         len = sizeof(int);
1882                 val = po->origdev;
1883
1884                 data = &val;
1885                 break;
1886 #ifdef CONFIG_PACKET_MMAP
1887         case PACKET_VERSION:
1888                 if (len > sizeof(int))
1889                         len = sizeof(int);
1890                 val = po->tp_version;
1891                 data = &val;
1892                 break;
1893         case PACKET_HDRLEN:
1894                 if (len > sizeof(int))
1895                         len = sizeof(int);
1896                 if (copy_from_user(&val, optval, len))
1897                         return -EFAULT;
1898                 switch (val) {
1899                 case TPACKET_V1:
1900                         val = sizeof(struct tpacket_hdr);
1901                         break;
1902                 case TPACKET_V2:
1903                         val = sizeof(struct tpacket2_hdr);
1904                         break;
1905                 default:
1906                         return -EINVAL;
1907                 }
1908                 data = &val;
1909                 break;
1910         case PACKET_RESERVE:
1911                 if (len > sizeof(unsigned int))
1912                         len = sizeof(unsigned int);
1913                 val = po->tp_reserve;
1914                 data = &val;
1915                 break;
1916         case PACKET_LOSS:
1917                 if (len > sizeof(unsigned int))
1918                         len = sizeof(unsigned int);
1919                 val = po->tp_loss;
1920                 data = &val;
1921                 break;
1922 #endif
1923         default:
1924                 return -ENOPROTOOPT;
1925         }
1926
1927         if (put_user(len, optlen))
1928                 return -EFAULT;
1929         if (copy_to_user(optval, data, len))
1930                 return -EFAULT;
1931         return 0;
1932 }
1933
1934
/*
 * Netdevice notifier: keep every packet socket in this netns in sync with
 * device state changes (unregister / down / up).
 */
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	read_lock(&net->packet.sklist_lock);
	sk_for_each(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			/* Drop multicast memberships held on the vanishing
			 * device. */
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					/* Detach the protocol hook, drop the
					 * reference it held, and report
					 * ENETDOWN to the socket. */
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					/* Device is gone for good: unbind. */
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				/* Re-attach a socket that stayed bound to
				 * this device while it was down. */
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&net->packet.sklist_lock);
	return NOTIFY_DONE;
}
1985
1986
/*
 * ioctl() handler: queue occupancy queries, timestamp retrieval, and
 * (with CONFIG_INET) pass-through of common interface ioctls to inet.
 */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		/* Bytes currently queued for transmission. */
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		/* Length of the next queued packet, 0 if none. */
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		/* These interface ioctls are only honoured in init_net. */
		if (!net_eq(sock_net(sk), &init_net))
			return -ENOIOCTLCMD;
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;	/* NOTE(review): unreachable - every case returns. */
}
2041
2042 #ifndef CONFIG_PACKET_MMAP
2043 #define packet_mmap sock_no_mmap
2044 #define packet_poll datagram_poll
2045 #else
2046
/*
 * poll() for mmap'ed packet sockets: in addition to the generic datagram
 * readiness, report readiness based on ring frame status.
 */
static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	/* RX ring: readable when the previous frame is no longer owned by
	 * the kernel (i.e. there is data for userspace). */
	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	/* TX ring: writable when the current frame slot is available. */
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
2068
2069
2070 /* Dirty? Well, I still did not learn better way to account
2071  * for user mmaps.
2072  */
2073
2074 static void packet_mm_open(struct vm_area_struct *vma)
2075 {
2076         struct file *file = vma->vm_file;
2077         struct socket *sock = file->private_data;
2078         struct sock *sk = sock->sk;
2079
2080         if (sk)
2081                 atomic_inc(&pkt_sk(sk)->mapped);
2082 }
2083
2084 static void packet_mm_close(struct vm_area_struct *vma)
2085 {
2086         struct file *file = vma->vm_file;
2087         struct socket *sock = file->private_data;
2088         struct sock *sk = sock->sk;
2089
2090         if (sk)
2091                 atomic_dec(&pkt_sk(sk)->mapped);
2092 }
2093
/* vm_operations for the mmap'ed ring: only mapping refcounting is needed. */
static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};
2098
/* Free every allocated block of a ring page vector, then the vector. */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int i;

	for (i = 0; i < len; i++)
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);

	kfree(pg_vec);
}
2109
2110 static inline char *alloc_one_pg_vec_page(unsigned long order)
2111 {
2112         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2113
2114         return (char *) __get_free_pages(gfp_flags, order);
2115 }
2116
2117 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2118 {
2119         unsigned int block_nr = req->tp_block_nr;
2120         char **pg_vec;
2121         int i;
2122
2123         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2124         if (unlikely(!pg_vec))
2125                 goto out;
2126
2127         for (i = 0; i < block_nr; i++) {
2128                 pg_vec[i] = alloc_one_pg_vec_page(order);
2129                 if (unlikely(!pg_vec[i]))
2130                         goto out_free_pgvec;
2131         }
2132
2133 out:
2134         return pg_vec;
2135
2136 out_free_pgvec:
2137         free_pg_vec(pg_vec, order, block_nr);
2138         pg_vec = NULL;
2139         goto out;
2140 }
2141
/*
 * Install (or, when req->tp_block_nr == 0, tear down) the mmap'ed RX or
 * TX ring of this socket.  Called from setsockopt(PACKET_RX_RING/TX_RING)
 * and from socket release with closing != 0.
 *
 * Returns 0 on success or a negative errno.
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		/* Refuse to resize while userspace has the ring mmap'ed or
		 * TX frames are still pending. */
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		err = -EINVAL;
		/* Blocks must be positive page multiples, frames must fit
		 * a header plus reserve and be TPACKET_ALIGNMENT aligned,
		 * and blocks * frames_per_block must equal tp_frame_nr. */
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
	}
	/* Done */
	else {
		/* Tear-down request must not carry a frame count. */
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	/* Wait until no receive path can still be using the old ring. */
	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		/* XC(a, b): store b into a, yielding a's old value. */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
		spin_lock_bh(&rb_queue->lock);
		pg_vec = XC(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		/* Keep the old order/length so the old vector can be freed
		 * correctly below. */
		order = XC(rb->pg_vec_order, order);
		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		/* Deliver via the ring iff an RX ring is now installed. */
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
#undef XC
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	/* Re-attach the protocol hook if the socket was running before. */
	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	/* Free whichever vector is now unused (the replaced one, or the
	 * freshly allocated one on the busy path). */
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
2268
/*
 * Map the RX and TX rings (in that order, back to back) into the calling
 * process.  The vma must start at offset 0 and cover the combined ring
 * size exactly.
 */
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	/* Total byte size of all currently allocated rings. */
	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	/* Insert every page of every block of both rings, in order. */
	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page = virt_to_page(rb->pg_vec[i]);
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages;
					pg_num++, page++) {
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
			}
		}
	}

	/* Count this mapping; packet_mm_open/close track later clones. */
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
2328 #endif
2329
2330
/* proto_ops for legacy SOCK_PACKET sockets (no mmap, no sockopts). */
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
2351
/* proto_ops for SOCK_RAW/SOCK_DGRAM packet sockets (full feature set). */
static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
2372
/* Address-family registration: socket(PF_PACKET, ...) -> packet_create(). */
static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};
2378
/* Notifier block hooking packet_notifier() into netdevice events. */
static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};
2382
2383 #ifdef CONFIG_PROC_FS
2384 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2385 {
2386         struct sock *s;
2387         struct hlist_node *node;
2388
2389         sk_for_each(s, node, &net->packet.sklist) {
2390                 if (!off--)
2391                         return s;
2392         }
2393         return NULL;
2394 }
2395
/* seq_file start: take the sklist read lock for the whole iteration;
 * released in packet_seq_stop(). */
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(seq_file_net(seq)->packet.sklist_lock)
{
	struct net *net = seq_file_net(seq);
	read_lock(&net->packet.sklist_lock);
	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
}
2403
2404 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2405 {
2406         struct net *net = seq_file_net(seq);
2407         ++*pos;
2408         return  (v == SEQ_START_TOKEN)
2409                 ? sk_head(&net->packet.sklist)
2410                 : sk_next((struct sock *)v) ;
2411 }
2412
/* seq_file stop: drop the sklist lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(seq_file_net(seq)->packet.sklist_lock)
{
	struct net *net = seq_file_net(seq);
	read_unlock(&net->packet.sklist_lock);
}
2419
/* Emit one /proc/net/packet line per socket (plus the header line).
 * The column format is ABI-visible; do not change it. */
static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}
2443
/* seq_file iterator over the per-netns packet socket list. */
static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
2450
/* open() for /proc/net/packet: netns-aware seq_file setup. */
static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}
2456
/* file_operations backing /proc/net/packet. */
static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
2464
2465 #endif
2466
2467 static int packet_net_init(struct net *net)
2468 {
2469         rwlock_init(&net->packet.sklist_lock);
2470         INIT_HLIST_HEAD(&net->packet.sklist);
2471
2472         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2473                 return -ENOMEM;
2474
2475         return 0;
2476 }
2477
/* Per-netns teardown: remove the /proc/net/packet entry. */
static void packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}
2482
/* Per-network-namespace init/exit hooks. */
static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};
2487
2488
/* Module unload: tear everything down in reverse order of packet_init(). */
static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}
2496
2497 static int __init packet_init(void)
2498 {
2499         int rc = proto_register(&packet_proto, 0);
2500
2501         if (rc != 0)
2502                 goto out;
2503
2504         sock_register(&packet_family_ops);
2505         register_pernet_subsys(&packet_net_ops);
2506         register_netdevice_notifier(&packet_netdev_notifier);
2507 out:
2508         return rc;
2509 }
2510
/* Standard module plumbing and metadata. */
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);