net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int __ip6_local_out(struct sk_buff *skb)
  60 {
  61         int len;
  62
  63         len = skb->len - sizeof(struct ipv6hdr);
  64         if (len > IPV6_MAXPLEN)
  65                 len = 0;
  66         ipv6_hdr(skb)->payload_len = htons(len);
  67
  68         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  69                        skb_dst(skb)->dev, dst_output);
  70 }
  71
  72 int ip6_local_out(struct sk_buff *skb)
  73 {
  74         int err;
  75
  76         err = __ip6_local_out(skb);
  77         if (likely(err == 1))
  78                 err = dst_output(skb);
  79
  80         return err;
  81 }
  82 EXPORT_SYMBOL_GPL(ip6_local_out);
  83
  84 static int ip6_finish_output2(struct sk_buff *skb)
  85 {
  86         struct dst_entry *dst = skb_dst(skb);
  87         struct net_device *dev = dst->dev;
  88         struct neighbour *neigh;
  89         struct in6_addr *nexthop;
  90         int ret;
  91
  92         skb->protocol = htons(ETH_P_IPV6);
  93         skb->dev = dev;
  94
  95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
  99                     ((mroute6_socket(dev_net(dev), skb) &&
 100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102                                          &ipv6_hdr(skb)->saddr))) {
 103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105                         /* Do not check for IFF_ALLMULTI; multicast routing
 106                            is not supported in any case.
 107                          */
 108                         if (newskb)
 109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110                                         newskb, NULL, newskb->dev,
 111                                         dev_loopback_xmit);
 112
 113                         if (ipv6_hdr(skb)->hop_limit == 0) {
 114                                 IP6_INC_STATS(dev_net(dev), idev,
 115                                               IPSTATS_MIB_OUTDISCARDS);
 116                                 kfree_skb(skb);
 117                                 return 0;
 118                         }
 119                 }
 120
 121                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 122                                 skb->len);
 123
 124                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 125                     IPV6_ADDR_SCOPE_NODELOCAL &&
 126                     !(dev->flags & IFF_LOOPBACK)) {
 127                         kfree_skb(skb);
 128                         return 0;
 129                 }
 130         }
 131
 132         rcu_read_lock_bh();
 133         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 134         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 135         if (unlikely(!neigh))
 136                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 137         if (!IS_ERR(neigh)) {
 138                 ret = dst_neigh_output(dst, neigh, skb);
 139                 rcu_read_unlock_bh();
 140                 return ret;
 141         }
 142         rcu_read_unlock_bh();
 143
 144         IP6_INC_STATS_BH(dev_net(dst->dev),
 145                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 146         kfree_skb(skb);
 147         return -EINVAL;
 148 }
 149
 150 static int ip6_finish_output(struct sk_buff *skb)
 151 {
 152         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 153             dst_allfrag(skb_dst(skb)))
 154                 return ip6_fragment(skb, ip6_finish_output2);
 155         else
 156                 return ip6_finish_output2(skb);
 157 }
 158
 159 int ip6_output(struct sk_buff *skb)
 160 {
 161         struct net_device *dev = skb_dst(skb)->dev;
 162         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 163         if (unlikely(idev->cnf.disable_ipv6)) {
 164                 IP6_INC_STATS(dev_net(dev), idev,
 165                               IPSTATS_MIB_OUTDISCARDS);
 166                 kfree_skb(skb);
 167                 return 0;
 168         }
 169
 170         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 171                             ip6_finish_output,
 172                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 173 }
 174
 175 /*
 176  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 177  */
 178
 179 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 180              struct ipv6_txoptions *opt, int tclass)
 181 {
 182         struct net *net = sock_net(sk);
 183         struct ipv6_pinfo *np = inet6_sk(sk);
 184         struct in6_addr *first_hop = &fl6->daddr;
 185         struct dst_entry *dst = skb_dst(skb);
 186         struct ipv6hdr *hdr;
 187         u8  proto = fl6->flowi6_proto;
 188         int seg_len = skb->len;
 189         int hlimit = -1;
 190         u32 mtu;
 191
 192         if (opt) {
 193                 unsigned int head_room;
 194
 195                 /* First: exthdrs may take lots of space (~8K for now)
 196                    MAX_HEADER is not enough.
 197                  */
 198                 head_room = opt->opt_nflen + opt->opt_flen;
 199                 seg_len += head_room;
 200                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 201
 202                 if (skb_headroom(skb) < head_room) {
 203                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 204                         if (skb2 == NULL) {
 205                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 206                                               IPSTATS_MIB_OUTDISCARDS);
 207                                 kfree_skb(skb);
 208                                 return -ENOBUFS;
 209                         }
 210                         consume_skb(skb);
 211                         skb = skb2;
 212                         skb_set_owner_w(skb, sk);
 213                 }
 214                 if (opt->opt_flen)
 215                         ipv6_push_frag_opts(skb, opt, &proto);
 216                 if (opt->opt_nflen)
 217                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
 233
 234         hdr->payload_len = htons(seg_len);
 235         hdr->nexthdr = proto;
 236         hdr->hop_limit = hlimit;
 237
 238         hdr->saddr = fl6->saddr;
 239         hdr->daddr = *first_hop;
 240
 241         skb->priority = sk->sk_priority;
 242         skb->mark = sk->sk_mark;
 243
 244         mtu = dst_mtu(dst);
 245         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 246                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 247                               IPSTATS_MIB_OUT, skb->len);
 248                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 249                                dst->dev, dst_output);
 250         }
 251
 252         skb->dev = dst->dev;
 253         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
 254         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 255         kfree_skb(skb);
 256         return -EMSGSIZE;
 257 }
 258
 259 EXPORT_SYMBOL(ip6_xmit);
 260
 261 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 262 {
 263         struct ip6_ra_chain *ra;
 264         struct sock *last = NULL;
 265
 266         read_lock(&ip6_ra_lock);
 267         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 268                 struct sock *sk = ra->sk;
 269                 if (sk && ra->sel == sel &&
 270                     (!sk->sk_bound_dev_if ||
 271                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 272                         if (last) {
 273                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 274                                 if (skb2)
 275                                         rawv6_rcv(last, skb2);
 276                         }
 277                         last = sk;
 278                 }
 279         }
 280
 281         if (last) {
 282                 rawv6_rcv(last, skb);
 283                 read_unlock(&ip6_ra_lock);
 284                 return 1;
 285         }
 286         read_unlock(&ip6_ra_lock);
 287         return 0;
 288 }
 289
 290 static int ip6_forward_proxy_check(struct sk_buff *skb)
 291 {
 292         struct ipv6hdr *hdr = ipv6_hdr(skb);
 293         u8 nexthdr = hdr->nexthdr;
 294         __be16 frag_off;
 295         int offset;
 296
 297         if (ipv6_ext_hdr(nexthdr)) {
 298                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 299                 if (offset < 0)
 300                         return 0;
 301         } else
 302                 offset = sizeof(struct ipv6hdr);
 303
 304         if (nexthdr == IPPROTO_ICMPV6) {
 305                 struct icmp6hdr *icmp6;
 306
 307                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 308                                          offset + 1 - skb->data)))
 309                         return 0;
 310
 311                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 312
 313                 switch (icmp6->icmp6_type) {
 314                 case NDISC_ROUTER_SOLICITATION:
 315                 case NDISC_ROUTER_ADVERTISEMENT:
 316                 case NDISC_NEIGHBOUR_SOLICITATION:
 317                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 318                 case NDISC_REDIRECT:
 319                         /* For reaction involving unicast neighbor discovery
 320                          * message destined to the proxied address, pass it to
 321                          * input function.
 322                          */
 323                         return 1;
 324                 default:
 325                         break;
 326                 }
 327         }
 328
 329         /*
 330          * The proxying router can't forward traffic sent to a link-local
 331          * address, so signal the sender and discard the packet. This
 332          * behavior is clarified by the MIPv6 specification.
 333          */
 334         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 335                 dst_link_failure(skb);
 336                 return -1;
 337         }
 338
 339         return 0;
 340 }
 341
 342 static inline int ip6_forward_finish(struct sk_buff *skb)
 343 {
 344         return dst_output(skb);
 345 }
 346
 347 int ip6_forward(struct sk_buff *skb)
 348 {
 349         struct dst_entry *dst = skb_dst(skb);
 350         struct ipv6hdr *hdr = ipv6_hdr(skb);
 351         struct inet6_skb_parm *opt = IP6CB(skb);
 352         struct net *net = dev_net(dst->dev);
 353         u32 mtu;
 354
 355         if (net->ipv6.devconf_all->forwarding == 0)
 356                 goto error;
 357
 358         if (skb_warn_if_lro(skb))
 359                 goto drop;
 360
 361         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 362                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 363                 goto drop;
 364         }
 365
 366         if (skb->pkt_type != PACKET_HOST)
 367                 goto drop;
 368
 369         skb_forward_csum(skb);
 370
 371         /*
 372          *      We DO NOT make any processing on
 373          *      RA packets, pushing them to user level AS IS
 374          *      without ane WARRANTY that application will be able
 375          *      to interpret them. The reason is that we
 376          *      cannot make anything clever here.
 377          *
 378          *      We are not end-node, so that if packet contains
 379          *      AH/ESP, we cannot make anything.
 380          *      Defragmentation also would be mistake, RA packets
 381          *      cannot be fragmented, because there is no warranty
 382          *      that different fragments will go along one path. --ANK
 383          */
 384         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 385                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 386                         return 0;
 387         }
 388
 389         /*
 390          *      check and decrement ttl
 391          */
 392         if (hdr->hop_limit <= 1) {
 393                 /* Force OUTPUT device used as source address */
 394                 skb->dev = dst->dev;
 395                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 396                 IP6_INC_STATS_BH(net,
 397                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 398
 399                 kfree_skb(skb);
 400                 return -ETIMEDOUT;
 401         }
 402
 403         /* XXX: idev->cnf.proxy_ndp? */
 404         if (net->ipv6.devconf_all->proxy_ndp &&
 405             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 406                 int proxied = ip6_forward_proxy_check(skb);
 407                 if (proxied > 0)
 408                         return ip6_input(skb);
 409                 else if (proxied < 0) {
 410                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 411                                       IPSTATS_MIB_INDISCARDS);
 412                         goto drop;
 413                 }
 414         }
 415
 416         if (!xfrm6_route_forward(skb)) {
 417                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 418                 goto drop;
 419         }
 420         dst = skb_dst(skb);
 421
 422         /* IPv6 specs say nothing about it, but it is clear that we cannot
 423            send redirects to source routed frames.
 424            We don't send redirects to frames decapsulated from IPsec.
 425          */
 426         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 427                 struct in6_addr *target = NULL;
 428                 struct inet_peer *peer;
 429                 struct rt6_info *rt;
 430
 431                 /*
 432                  *      incoming and outgoing devices are the same
 433                  *      send a redirect.
 434                  */
 435
 436                 rt = (struct rt6_info *) dst;
 437                 if (rt->rt6i_flags & RTF_GATEWAY)
 438                         target = &rt->rt6i_gateway;
 439                 else
 440                         target = &hdr->daddr;
 441
 442                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 443
 444                 /* Limit redirects both by destination (here)
 445                    and by source (inside ndisc_send_redirect)
 446                  */
 447                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 448                         ndisc_send_redirect(skb, target);
 449                 if (peer)
 450                         inet_putpeer(peer);
 451         } else {
 452                 int addrtype = ipv6_addr_type(&hdr->saddr);
 453
 454                 /* This check is security critical. */
 455                 if (addrtype == IPV6_ADDR_ANY ||
 456                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 457                         goto error;
 458                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 459                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 460                                     ICMPV6_NOT_NEIGHBOUR, 0);
 461                         goto error;
 462                 }
 463         }
 464
 465         mtu = dst_mtu(dst);
 466         if (mtu < IPV6_MIN_MTU)
 467                 mtu = IPV6_MIN_MTU;
 468
 469         if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
 470             (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 471                 /* Again, force OUTPUT device used as source address */
 472                 skb->dev = dst->dev;
 473                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 474                 IP6_INC_STATS_BH(net,
 475                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 476                 IP6_INC_STATS_BH(net,
 477                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 478                 kfree_skb(skb);
 479                 return -EMSGSIZE;
 480         }
 481
 482         if (skb_cow(skb, dst->dev->hard_header_len)) {
 483                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 484                 goto drop;
 485         }
 486
 487         hdr = ipv6_hdr(skb);
 488
 489         /* Mangling hops number delayed to point after skb COW */
 490
 491         hdr->hop_limit--;
 492
 493         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 494         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 495         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 496                        ip6_forward_finish);
 497
 498 error:
 499         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 500 drop:
 501         kfree_skb(skb);
 502         return -EINVAL;
 503 }
 504
 505 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 506 {
 507         to->pkt_type = from->pkt_type;
 508         to->priority = from->priority;
 509         to->protocol = from->protocol;
 510         skb_dst_drop(to);
 511         skb_dst_set(to, dst_clone(skb_dst(from)));
 512         to->dev = from->dev;
 513         to->mark = from->mark;
 514
 515 #ifdef CONFIG_NET_SCHED
 516         to->tc_index = from->tc_index;
 517 #endif
 518         nf_copy(to, from);
 519 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 520         to->nf_trace = from->nf_trace;
 521 #endif
 522         skb_copy_secmark(to, from);
 523 }
 524
 525 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 526 {
 527         struct sk_buff *frag;
 528         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 529         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 530         struct ipv6hdr *tmp_hdr;
 531         struct frag_hdr *fh;
 532         unsigned int mtu, hlen, left, len;
 533         int hroom, troom;
 534         __be32 frag_id = 0;
 535         int ptr, offset = 0, err=0;
 536         u8 *prevhdr, nexthdr = 0;
 537         struct net *net = dev_net(skb_dst(skb)->dev);
 538
 539         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 540         nexthdr = *prevhdr;
 541
 542         mtu = ip6_skb_dst_mtu(skb);
 543
 544         /* We must not fragment if the socket is set to force MTU discovery
 545          * or if the skb it not generated by a local socket.
 546          */
 547         if (unlikely(!skb->local_df && skb->len > mtu) ||
 548                      (IP6CB(skb)->frag_max_size &&
 549                       IP6CB(skb)->frag_max_size > mtu)) {
 550                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 551                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 552
 553                 skb->dev = skb_dst(skb)->dev;
 554                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 555                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 556                               IPSTATS_MIB_FRAGFAILS);
 557                 kfree_skb(skb);
 558                 return -EMSGSIZE;
 559         }
 560
 561         if (np && np->frag_size < mtu) {
 562                 if (np->frag_size)
 563                         mtu = np->frag_size;
 564         }
 565         mtu -= hlen + sizeof(struct frag_hdr);
 566
 567         if (skb_has_frag_list(skb)) {
 568                 int first_len = skb_pagelen(skb);
 569                 struct sk_buff *frag2;
 570
 571                 if (first_len - hlen > mtu ||
 572                     ((first_len - hlen) & 7) ||
 573                     skb_cloned(skb))
 574                         goto slow_path;
 575
 576                 skb_walk_frags(skb, frag) {
 577                         /* Correct geometry. */
 578                         if (frag->len > mtu ||
 579                             ((frag->len & 7) && frag->next) ||
 580                             skb_headroom(frag) < hlen)
 581                                 goto slow_path_clean;
 582
 583                         /* Partially cloned skb? */
 584                         if (skb_shared(frag))
 585                                 goto slow_path_clean;
 586
 587                         BUG_ON(frag->sk);
 588                         if (skb->sk) {
 589                                 frag->sk = skb->sk;
 590                                 frag->destructor = sock_wfree;
 591                         }
 592                         skb->truesize -= frag->truesize;
 593                 }
 594
 595                 err = 0;
 596                 offset = 0;
 597                 frag = skb_shinfo(skb)->frag_list;
 598                 skb_frag_list_init(skb);
 599                 /* BUILD HEADER */
 600
 601                 *prevhdr = NEXTHDR_FRAGMENT;
 602                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 603                 if (!tmp_hdr) {
 604                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 605                                       IPSTATS_MIB_FRAGFAILS);
 606                         return -ENOMEM;
 607                 }
 608
 609                 __skb_pull(skb, hlen);
 610                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 611                 __skb_push(skb, hlen);
 612                 skb_reset_network_header(skb);
 613                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 614
 615                 ipv6_select_ident(fh, rt);
 616                 fh->nexthdr = nexthdr;
 617                 fh->reserved = 0;
 618                 fh->frag_off = htons(IP6_MF);
 619                 frag_id = fh->identification;
 620
 621                 first_len = skb_pagelen(skb);
 622                 skb->data_len = first_len - skb_headlen(skb);
 623                 skb->len = first_len;
 624                 ipv6_hdr(skb)->payload_len = htons(first_len -
 625                                                    sizeof(struct ipv6hdr));
 626
 627                 dst_hold(&rt->dst);
 628
 629                 for (;;) {
 630                         /* Prepare header of the next frame,
 631                          * before previous one went down. */
 632                         if (frag) {
 633                                 frag->ip_summed = CHECKSUM_NONE;
 634                                 skb_reset_transport_header(frag);
 635                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 636                                 __skb_push(frag, hlen);
 637                                 skb_reset_network_header(frag);
 638                                 memcpy(skb_network_header(frag), tmp_hdr,
 639                                        hlen);
 640                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 641                                 fh->nexthdr = nexthdr;
 642                                 fh->reserved = 0;
 643                                 fh->frag_off = htons(offset);
 644                                 if (frag->next != NULL)
 645                                         fh->frag_off |= htons(IP6_MF);
 646                                 fh->identification = frag_id;
 647                                 ipv6_hdr(frag)->payload_len =
 648                                                 htons(frag->len -
 649                                                       sizeof(struct ipv6hdr));
 650                                 ip6_copy_metadata(frag, skb);
 651                         }
 652
 653                         err = output(skb);
 654                         if(!err)
 655                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 656                                               IPSTATS_MIB_FRAGCREATES);
 657
 658                         if (err || !frag)
 659                                 break;
 660
 661                         skb = frag;
 662                         frag = skb->next;
 663                         skb->next = NULL;
 664                 }
 665
 666                 kfree(tmp_hdr);
 667
 668                 if (err == 0) {
 669                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 670                                       IPSTATS_MIB_FRAGOKS);
 671                         ip6_rt_put(rt);
 672                         return 0;
 673                 }
 674
 675                 while (frag) {
 676                         skb = frag->next;
 677                         kfree_skb(frag);
 678                         frag = skb;
 679                 }
 680
 681                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 682                               IPSTATS_MIB_FRAGFAILS);
 683                 ip6_rt_put(rt);
 684                 return err;
 685
 686 slow_path_clean:
 687                 skb_walk_frags(skb, frag2) {
 688                         if (frag2 == frag)
 689                                 break;
 690                         frag2->sk = NULL;
 691                         frag2->destructor = NULL;
 692                         skb->truesize += frag2->truesize;
 693                 }
 694         }
 695
 696 slow_path:
 697         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 698             skb_checksum_help(skb))
 699                 goto fail;
 700
 701         left = skb->len - hlen;         /* Space per frame */
 702         ptr = hlen;                     /* Where to start from */
 703
 704         /*
 705          *      Fragment the datagram.
 706          */
 707
 708         *prevhdr = NEXTHDR_FRAGMENT;
 709         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 710         troom = rt->dst.dev->needed_tailroom;
 711
 712         /*
 713          *      Keep copying data until we run out.
 714          */
 715         while(left > 0) {
 716                 len = left;
 717                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 718                 if (len > mtu)
 719                         len = mtu;
 720                 /* IF: we are not sending up to and including the packet end
 721                    then align the next start on an eight byte boundary */
 722                 if (len < left) {
 723                         len &= ~7;
 724                 }
 725                 /*
 726                  *      Allocate buffer.
 727                  */
 728
 729                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 730                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 731                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 732                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 733                                       IPSTATS_MIB_FRAGFAILS);
 734                         err = -ENOMEM;
 735                         goto fail;
 736                 }
 737
 738                 /*
 739                  *      Set up data on packet
 740                  */
 741
 742                 ip6_copy_metadata(frag, skb);
 743                 skb_reserve(frag, hroom);
 744                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 745                 skb_reset_network_header(frag);
 746                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 747                 frag->transport_header = (frag->network_header + hlen +
 748                                           sizeof(struct frag_hdr));
 749
 750                 /*
 751                  *      Charge the memory for the fragment to any owner
 752                  *      it might possess
 753                  */
 754                 if (skb->sk)
 755                         skb_set_owner_w(frag, skb->sk);
 756
 757                 /*
 758                  *      Copy the packet header into the new buffer.
 759                  */
 760                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 761
 762                 /*
 763                  *      Build fragment header.
 764                  */
 765                 fh->nexthdr = nexthdr;
 766                 fh->reserved = 0;
 767                 if (!frag_id) {
 768                         ipv6_select_ident(fh, rt);
 769                         frag_id = fh->identification;
 770                 } else
 771                         fh->identification = frag_id;
 772
 773                 /*
 774                  *      Copy a block of the IP datagram.
 775                  */
 776                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 777                         BUG();
 778                 left -= len;
 779
 780                 fh->frag_off = htons(offset);
 781                 if (left > 0)
 782                         fh->frag_off |= htons(IP6_MF);
 783                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 784                                                     sizeof(struct ipv6hdr));
 785
 786                 ptr += len;
 787                 offset += len;
 788
 789                 /*
 790                  *      Put this fragment into the sending queue.
 791                  */
 792                 err = output(frag);
 793                 if (err)
 794                         goto fail;
 795
 796                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 797                               IPSTATS_MIB_FRAGCREATES);
 798         }
 799         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 800                       IPSTATS_MIB_FRAGOKS);
 801         consume_skb(skb);
 802         return err;
 803
 804 fail:
 805         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 806                       IPSTATS_MIB_FRAGFAILS);
 807         kfree_skb(skb);
 808         return err;
 809 }
 810
 811 static inline int ip6_rt_check(const struct rt6key *rt_key,
 812                                const struct in6_addr *fl_addr,
 813                                const struct in6_addr *addr_cache)
 814 {
 815         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 816                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 817 }
 818
 819 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 820                                           struct dst_entry *dst,
 821                                           const struct flowi6 *fl6)
 822 {
 823         struct ipv6_pinfo *np = inet6_sk(sk);
 824         struct rt6_info *rt = (struct rt6_info *)dst;
 825
 826         if (!dst)
 827                 goto out;
 828
 829         /* Yes, checking route validity in not connected
 830          * case is not very simple. Take into account,
 831          * that we do not support routing by source, TOS,
 832          * and MSG_DONTROUTE            --ANK (980726)
 833          *
 834          * 1. ip6_rt_check(): If route was host route,
 835          *    check that cached destination is current.
 836          *    If it is network route, we still may
 837          *    check its validity using saved pointer
 838          *    to the last used address: daddr_cache.
 839          *    We do not want to save whole address now,
 840          *    (because main consumer of this service
 841          *    is tcp, which has not this problem),
 842          *    so that the last trick works only on connected
 843          *    sockets.
 844          * 2. oif also should be the same.
 845          */
 846         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 847 #ifdef CONFIG_IPV6_SUBTREES
 848             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 849 #endif
 850             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 851                 dst_release(dst);
 852                 dst = NULL;
 853         }
 854
 855 out:
 856         return dst;
 857 }
 858
 859 static int ip6_dst_lookup_tail(struct sock *sk,
 860                                struct dst_entry **dst, struct flowi6 *fl6)
 861 {
 862         struct net *net = sock_net(sk);
 863 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 864         struct neighbour *n;
 865         struct rt6_info *rt;
 866 #endif
 867         int err;
 868
 869         if (*dst == NULL)
 870                 *dst = ip6_route_output(net, sk, fl6);
 871
 872         if ((err = (*dst)->error))
 873                 goto out_err_release;
 874
 875         if (ipv6_addr_any(&fl6->saddr)) {
 876                 struct rt6_info *rt = (struct rt6_info *) *dst;
 877                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 878                                           sk ? inet6_sk(sk)->srcprefs : 0,
 879                                           &fl6->saddr);
 880                 if (err)
 881                         goto out_err_release;
 882         }
 883
 884 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 885         /*
 886          * Here if the dst entry we've looked up
 887          * has a neighbour entry that is in the INCOMPLETE
 888          * state and the src address from the flow is
 889          * marked as OPTIMISTIC, we release the found
 890          * dst entry and replace it instead with the
 891          * dst entry of the nexthop router
 892          */
 893         rt = (struct rt6_info *) *dst;
 894         rcu_read_lock_bh();
 895         n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
 896         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 897         rcu_read_unlock_bh();
 898
 899         if (err) {
 900                 struct inet6_ifaddr *ifp;
 901                 struct flowi6 fl_gw6;
 902                 int redirect;
 903
 904                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 905                                       (*dst)->dev, 1);
 906
 907                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 908                 if (ifp)
 909                         in6_ifa_put(ifp);
 910
 911                 if (redirect) {
 912                         /*
 913                          * We need to get the dst entry for the
 914                          * default router instead
 915                          */
 916                         dst_release(*dst);
 917                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 918                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 919                         *dst = ip6_route_output(net, sk, &fl_gw6);
 920                         if ((err = (*dst)->error))
 921                                 goto out_err_release;
 922                 }
 923         }
 924 #endif
 925
 926         return 0;
 927
 928 out_err_release:
 929         if (err == -ENETUNREACH)
 930                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 931         dst_release(*dst);
 932         *dst = NULL;
 933         return err;
 934 }
 935
 936 /**
 937  *      ip6_dst_lookup - perform route lookup on flow
 938  *      @sk: socket which provides route info
 939  *      @dst: pointer to dst_entry * for result
 940  *      @fl6: flow to lookup
 941  *
 942  *      This function performs a route lookup on the given flow.
 943  *
 944  *      It returns zero on success, or a standard errno code on error.
 945  */
 946 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
 947 {
 948         *dst = NULL;
 949         return ip6_dst_lookup_tail(sk, dst, fl6);
 950 }
 951 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 952
 953 /**
 954  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 955  *      @sk: socket which provides route info
 956  *      @fl6: flow to lookup
 957  *      @final_dst: final destination address for ipsec lookup
 958  *      @can_sleep: we are in a sleepable context
 959  *
 960  *      This function performs a route lookup on the given flow.
 961  *
 962  *      It returns a valid dst pointer on success, or a pointer encoded
 963  *      error code.
 964  */
 965 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 966                                       const struct in6_addr *final_dst,
 967                                       bool can_sleep)
 968 {
 969         struct dst_entry *dst = NULL;
 970         int err;
 971
 972         err = ip6_dst_lookup_tail(sk, &dst, fl6);
 973         if (err)
 974                 return ERR_PTR(err);
 975         if (final_dst)
 976                 fl6->daddr = *final_dst;
 977         if (can_sleep)
 978                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 979
 980         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 981 }
 982 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
 983
 984 /**
 985  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 986  *      @sk: socket which provides the dst cache and route info
 987  *      @fl6: flow to lookup
 988  *      @final_dst: final destination address for ipsec lookup
 989  *      @can_sleep: we are in a sleepable context
 990  *
 991  *      This function performs a route lookup on the given flow with the
 992  *      possibility of using the cached route in the socket if it is valid.
 993  *      It will take the socket dst lock when operating on the dst cache.
 994  *      As a result, this function can only be used in process context.
 995  *
 996  *      It returns a valid dst pointer on success, or a pointer encoded
 997  *      error code.
 998  */
 999 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1000                                          const struct in6_addr *final_dst,
1001                                          bool can_sleep)
1002 {
1003         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1004         int err;
1005
1006         dst = ip6_sk_dst_check(sk, dst, fl6);
1007
1008         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1009         if (err)
1010                 return ERR_PTR(err);
1011         if (final_dst)
1012                 fl6->daddr = *final_dst;
1013         if (can_sleep)
1014                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1015
1016         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1017 }
1018 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1019
1020 static inline int ip6_ufo_append_data(struct sock *sk,
1021                         int getfrag(void *from, char *to, int offset, int len,
1022                         int odd, struct sk_buff *skb),
1023                         void *from, int length, int hh_len, int fragheaderlen,
1024                         int transhdrlen, int mtu,unsigned int flags,
1025                         struct rt6_info *rt)
1026
1027 {
1028         struct sk_buff *skb;
1029         int err;
1030
1031         /* There is support for UDP large send offload by network
1032          * device, so create one single skb packet containing complete
1033          * udp datagram
1034          */
1035         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1036                 skb = sock_alloc_send_skb(sk,
1037                         hh_len + fragheaderlen + transhdrlen + 20,
1038                         (flags & MSG_DONTWAIT), &err);
1039                 if (skb == NULL)
1040                         return err;
1041
1042                 /* reserve space for Hardware header */
1043                 skb_reserve(skb, hh_len);
1044
1045                 /* create space for UDP/IP header */
1046                 skb_put(skb,fragheaderlen + transhdrlen);
1047
1048                 /* initialize network header pointer */
1049                 skb_reset_network_header(skb);
1050
1051                 /* initialize protocol header pointer */
1052                 skb->transport_header = skb->network_header + fragheaderlen;
1053
1054                 skb->ip_summed = CHECKSUM_PARTIAL;
1055                 skb->csum = 0;
1056         }
1057
1058         err = skb_append_datato_frags(sk,skb, getfrag, from,
1059                                       (length - transhdrlen));
1060         if (!err) {
1061                 struct frag_hdr fhdr;
1062
1063                 /* Specify the length of each IPv6 datagram fragment.
1064                  * It has to be a multiple of 8.
1065                  */
1066                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1067                                              sizeof(struct frag_hdr)) & ~7;
1068                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1069                 ipv6_select_ident(&fhdr, rt);
1070                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1071                 __skb_queue_tail(&sk->sk_write_queue, skb);
1072
1073                 return 0;
1074         }
1075         /* There is not enough support do UPD LSO,
1076          * so follow normal path
1077          */
1078         kfree_skb(skb);
1079
1080         return err;
1081 }
1082
1083 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1084                                                gfp_t gfp)
1085 {
1086         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1087 }
1088
1089 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1090                                                 gfp_t gfp)
1091 {
1092         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094
1095 static void ip6_append_data_mtu(int *mtu,
1096                                 int *maxfraglen,
1097                                 unsigned int fragheaderlen,
1098                                 struct sk_buff *skb,
1099                                 struct rt6_info *rt)
1100 {
1101         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1102                 if (skb == NULL) {
1103                         /* first fragment, reserve header_len */
1104                         *mtu = *mtu - rt->dst.header_len;
1105
1106                 } else {
1107                         /*
1108                          * this fragment is not first, the headers
1109                          * space is regarded as data space.
1110                          */
1111                         *mtu = dst_mtu(rt->dst.path);
1112                 }
1113                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1114                               + fragheaderlen - sizeof(struct frag_hdr);
1115         }
1116 }
1117
     /*
      *      ip6_append_data - queue payload on a socket's pending write queue
      *
      *      @sk:          socket whose sk_write_queue collects the skbs
      *      @getfrag:     callback that copies payload from @from into a buffer;
      *                    a negative return value is reported as -EFAULT
      *      @from:        opaque payload source handed to @getfrag
      *      @length:      number of bytes to append
      *      @transhdrlen: transport header length; only honoured on the call
      *                    that finds the write queue empty
      *      @hlimit, @tclass, @opt, @fl6, @rt:
      *                    recorded in the cork state on the call that finds
      *                    the write queue empty; on later calls the values
      *                    saved in the cork are used instead
      *      @flags:       MSG_* flags; MSG_PROBE returns 0 at once,
      *                    MSG_DONTWAIT and MSG_MORE influence skb allocation
      *      @dontfrag:    when set, a UDP or RAW message longer than the mtu
      *                    is rejected with -EMSGSIZE
      *
      *      Data is packed into the skb at the tail of the queue while it fits;
      *      a new skb is allocated each time the current one is full, moving any
      *      bytes beyond maxfraglen from the old skb into the new one.
      *
      *      Returns 0 on success or a negative errno.  Failures that leave
      *      through the "error" label take the unsent remainder back off
      *      cork->length and bump IPSTATS_MIB_OUTDISCARDS; the early returns
      *      do neither.
      */
1118 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1119         int offset, int len, int odd, struct sk_buff *skb),
1120         void *from, int length, int transhdrlen,
1121         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1122         struct rt6_info *rt, unsigned int flags, int dontfrag)
1123 {
1124         struct inet_sock *inet = inet_sk(sk);
1125         struct ipv6_pinfo *np = inet6_sk(sk);
1126         struct inet_cork *cork;
1127         struct sk_buff *skb, *skb_prev = NULL;
1128         unsigned int maxfraglen, fragheaderlen;
1129         int exthdrlen;
1130         int dst_exthdrlen;
1131         int hh_len;
1132         int mtu;
1133         int copy;
1134         int err;
1135         int offset = 0;
1136         __u8 tx_flags = 0;
1137
1138         if (flags&MSG_PROBE)
1139                 return 0;
1140         cork = &inet->cork.base;
1141         if (skb_queue_empty(&sk->sk_write_queue)) {
1142                 /*
1143                  * Setup for corking: nothing is queued yet, so capture
1144                  * the options, route, flow and limits for this message.
1145                  */
1145                 if (opt) {
1146                         if (WARN_ON(np->cork.opt))
1147                                 return -EINVAL;
1148
1149                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1150                         if (unlikely(np->cork.opt == NULL))
1151                                 return -ENOBUFS;
1152
1153                         np->cork.opt->tot_len = opt->tot_len;
1154                         np->cork.opt->opt_flen = opt->opt_flen;
1155                         np->cork.opt->opt_nflen = opt->opt_nflen;
1156
                             /*
                              * NOTE(review): the -ENOBUFS returns below leave
                              * np->cork.opt partially populated; presumably the
                              * caller flushes the pending frames, which would
                              * release it - confirm against callers.
                              */
1157                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1158                                                             sk->sk_allocation);
1159                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1160                                 return -ENOBUFS;
1161
1162                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1163                                                             sk->sk_allocation);
1164                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1165                                 return -ENOBUFS;
1166
1167                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1168                                                            sk->sk_allocation);
1169                         if (opt->hopopt && !np->cork.opt->hopopt)
1170                                 return -ENOBUFS;
1171
1172                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1173                                                             sk->sk_allocation);
1174                         if (opt->srcrt && !np->cork.opt->srcrt)
1175                                 return -ENOBUFS;
1176
1177                         /* need source address above miyazawa*/
1178                 }
1179                 dst_hold(&rt->dst);
1180                 cork->dst = &rt->dst;
1181                 inet->cork.fl.u.ip6 = *fl6;
1182                 np->cork.hop_limit = hlimit;
1183                 np->cork.tclass = tclass;
                     /* IPV6_PMTUDISC_PROBE uses the device mtu instead of the route mtu */
1184                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1185                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1186                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1187                 else
1188                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1189                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                     /* a non-zero np->frag_size below the mtu caps the fragment size */
1190                 if (np->frag_size < mtu) {
1191                         if (np->frag_size)
1192                                 mtu = np->frag_size;
1193                 }
1194                 cork->fragsize = mtu;
1195                 if (dst_allfrag(rt->dst.path))
1196                         cork->flags |= IPCORK_ALLFRAG;
1197                 cork->length = 0;
1198                 exthdrlen = (opt ? opt->opt_flen : 0);
1199                 length += exthdrlen;
1200                 transhdrlen += exthdrlen;
1201                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1202         } else {
                     /* Already corked: continue with the state saved by the first call. */
1203                 rt = (struct rt6_info *)cork->dst;
1204                 fl6 = &inet->cork.fl.u.ip6;
1205                 opt = np->cork.opt;
1206                 transhdrlen = 0;
1207                 exthdrlen = 0;
1208                 dst_exthdrlen = 0;
1209                 mtu = cork->fragsize;
1210         }
1211
1212         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1213
1214         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1215                         (opt ? opt->opt_nflen : 0);
1216         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1217
             /* Refuse a message whose accumulated length would exceed IPV6_MAXPLEN. */
1218         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1219                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1220                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1221                         return -EMSGSIZE;
1222                 }
1223         }
1224
1225         /* For UDP, check if TX timestamp is enabled */
1226         if (sk->sk_type == SOCK_DGRAM)
1227                 sock_tx_timestamp(sk, &tx_flags);
1228
1229         /*
1230          * Let's try using as much space as possible.
1231          * Use MTU if total length of the message fits into the MTU.
1232          * Otherwise, we need to reserve fragment header and
1233          * fragment alignment (= 8-15 octets, in total).
1234          *
1235          * Note that we may need to "move" the data from the tail
1236          * of the buffer to the new fragment when we split
1237          * the message.
1238          *
1239          * FIXME: It may be fragmented into multiple chunks
1240          *        at once if non-fragmentable extension headers
1241          *        are too large.
1242          * --yoshfuji
1243          */
1244
1245         cork->length += length;
1246         if (length > mtu) {
1247                 int proto = sk->sk_protocol;
1248                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1249                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1250                         return -EMSGSIZE;
1251                 }
1252
                     /* UDP on a device advertising NETIF_F_UFO takes the UFO path. */
1253                 if (proto == IPPROTO_UDP &&
1254                     (rt->dst.dev->features & NETIF_F_UFO)) {
1255
1256                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1257                                                   hh_len, fragheaderlen,
1258                                                   transhdrlen, mtu, flags, rt);
1259                         if (err)
1260                                 goto error;
1261                         return 0;
1262                 }
1263         }
1264
             /* Empty queue: jump straight into the allocation branch of the loop. */
1265         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1266                 goto alloc_new_skb;
1267
1268         while (length > 0) {
1269                 /* Check if the remaining data fits into current packet. */
1270                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1271                 if (copy < length)
1272                         copy = maxfraglen - skb->len;
1273
1274                 if (copy <= 0) {
1275                         char *data;
1276                         unsigned int datalen;
1277                         unsigned int fraglen;
1278                         unsigned int fraggap;
1279                         unsigned int alloclen;
1280 alloc_new_skb:
1281                         /*
                              * There's no room in the current skb.  fraggap is
                              * the number of bytes by which it exceeds
                              * maxfraglen; they are moved into the new skb below.
                              */
1282                         if (skb)
1283                                 fraggap = skb->len - maxfraglen;
1284                         else
1285                                 fraggap = 0;
1286                         /* update mtu and maxfraglen if necessary */
1287                         if (skb == NULL || skb_prev == NULL)
1288                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1289                                                     fragheaderlen, skb, rt);
1290
1291                         skb_prev = skb;
1292
1293                         /*
1294                          * If remaining data exceeds the mtu,
1295                          * we know we need more fragment(s).
1296                          */
1297                         datalen = length + fraggap;
1298
1299                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1300                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1301                         if ((flags & MSG_MORE) &&
1302                             !(rt->dst.dev->features&NETIF_F_SG))
1303                                 alloclen = mtu;
1304                         else
1305                                 alloclen = datalen + fragheaderlen;
1306
1307                         alloclen += dst_exthdrlen;
1308
1309                         if (datalen != length + fraggap) {
1310                                 /*
1311                                  * this is not the last fragment, the trailer
1312                                  * space is regarded as data space.
1313                                  */
1314                                 datalen += rt->dst.trailer_len;
1315                         }
1316
1317                         alloclen += rt->dst.trailer_len;
1318                         fraglen = datalen + fragheaderlen;
1319
1320                         /*
1321                          * We just reserve space for fragment header.
1322                          * Note: this may be overallocation if the message
1323                          * (without MSG_MORE) fits into the MTU.
1324                          */
1325                         alloclen += sizeof(struct frag_hdr);
1326
                             /*
                              * transhdrlen is non-zero only for the first skb of
                              * a message; that one is allocated with
                              * sock_alloc_send_skb(), later ones with
                              * sock_wmalloc() as long as sk_wmem_alloc does not
                              * exceed twice sk_sndbuf.
                              */
1327                         if (transhdrlen) {
1328                                 skb = sock_alloc_send_skb(sk,
1329                                                 alloclen + hh_len,
1330                                                 (flags & MSG_DONTWAIT), &err);
1331                         } else {
1332                                 skb = NULL;
1333                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1334                                     2 * sk->sk_sndbuf)
1335                                         skb = sock_wmalloc(sk,
1336                                                            alloclen + hh_len, 1,
1337                                                            sk->sk_allocation);
1338                                 if (unlikely(skb == NULL))
1339                                         err = -ENOBUFS;
1340                                 else {
1341                                         /* Only the initial fragment
1342                                          * is time stamped.
1343                                          */
1344                                         tx_flags = 0;
1345                                 }
1346                         }
1347                         if (skb == NULL)
1348                                 goto error;
1349                         /*
1350                          *      Fill in the control structures
1351                          */
1352                         skb->ip_summed = CHECKSUM_NONE;
1353                         skb->csum = 0;
1354                         /* reserve for fragmentation and ipsec header */
1355                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1356                                     dst_exthdrlen);
1357
1358                         if (sk->sk_type == SOCK_DGRAM)
1359                                 skb_shinfo(skb)->tx_flags = tx_flags;
1360
1361                         /*
1362                          *      Find where to start putting bytes
1363                          */
1364                         data = skb_put(skb, fraglen);
1365                         skb_set_network_header(skb, exthdrlen);
1366                         data += fragheaderlen;
1367                         skb->transport_header = (skb->network_header +
1368                                                  fragheaderlen);
                             /*
                              * Move the overhanging fraggap bytes from the
                              * previous skb into this one, shifting their
                              * checksum contribution along, then trim the
                              * previous skb back to maxfraglen.
                              */
1369                         if (fraggap) {
1370                                 skb->csum = skb_copy_and_csum_bits(
1371                                         skb_prev, maxfraglen,
1372                                         data + transhdrlen, fraggap, 0);
1373                                 skb_prev->csum = csum_sub(skb_prev->csum,
1374                                                           skb->csum);
1375                                 data += fraggap;
1376                                 pskb_trim_unique(skb_prev, maxfraglen);
1377                         }
1378                         copy = datalen - transhdrlen - fraggap;
1379
1380                         if (copy < 0) {
1381                                 err = -EINVAL;
1382                                 kfree_skb(skb);
1383                                 goto error;
1384                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1385                                 err = -EFAULT;
1386                                 kfree_skb(skb);
1387                                 goto error;
1388                         }
1389
1390                         offset += copy;
1391                         length -= datalen - fraggap;
                             /* these header lengths only apply to the first skb */
1392                         transhdrlen = 0;
1393                         exthdrlen = 0;
1394                         dst_exthdrlen = 0;
1395
1396                         /*
1397                          * Put the packet on the pending queue
1398                          */
1399                         __skb_queue_tail(&sk->sk_write_queue, skb);
1400                         continue;
1401                 }
1402
1403                 if (copy > length)
1404                         copy = length;
1405
                     /*
                      * Room is left in the current skb.  Without NETIF_F_SG the
                      * data is copied into the linear area; with it, the data
                      * goes into the socket's page fragment, which is attached
                      * to the skb as a frag.
                      */
1406                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1407                         unsigned int off;
1408
1409                         off = skb->len;
1410                         if (getfrag(from, skb_put(skb, copy),
1411                                                 offset, copy, off, skb) < 0) {
1412                                 __skb_trim(skb, off);
1413                                 err = -EFAULT;
1414                                 goto error;
1415                         }
1416                 } else {
1417                         int i = skb_shinfo(skb)->nr_frags;
1418                         struct page_frag *pfrag = sk_page_frag(sk);
1419
1420                         err = -ENOMEM;
1421                         if (!sk_page_frag_refill(sk, pfrag))
1422                                 goto error;
1423
1424                         if (!skb_can_coalesce(skb, i, pfrag->page,
1425                                               pfrag->offset)) {
1426                                 err = -EMSGSIZE;
1427                                 if (i == MAX_SKB_FRAGS)
1428                                         goto error;
1429
1430                                 __skb_fill_page_desc(skb, i, pfrag->page,
1431                                                      pfrag->offset, 0);
1432                                 skb_shinfo(skb)->nr_frags = ++i;
1433                                 get_page(pfrag->page);
1434                         }
1435                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1436                         if (getfrag(from,
1437                                     page_address(pfrag->page) + pfrag->offset,
1438                                     offset, copy, skb->len, skb) < 0)
1439                                 goto error_efault;
1440
                             /* account the copied bytes on the frag, the skb and the socket */
1441                         pfrag->offset += copy;
1442                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1443                         skb->len += copy;
1444                         skb->data_len += copy;
1445                         skb->truesize += copy;
1446                         atomic_add(copy, &sk->sk_wmem_alloc);
1447                 }
1448                 offset += copy;
1449                 length -= copy;
1450         }
1451
1452         return 0;
1453
1454 error_efault:
1455         err = -EFAULT;
1456 error:
             /* take the part that was not queued back off the corked length */
1457         cork->length -= length;
1458         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1459         return err;
1460 }
1461 EXPORT_SYMBOL_GPL(ip6_append_data);
1462
1463 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1464 {
1465         if (np->cork.opt) {
1466                 kfree(np->cork.opt->dst0opt);
1467                 kfree(np->cork.opt->dst1opt);
1468                 kfree(np->cork.opt->hopopt);
1469                 kfree(np->cork.opt->srcrt);
1470                 kfree(np->cork.opt);
1471                 np->cork.opt = NULL;
1472         }
1473
1474         if (inet->cork.base.dst) {
1475                 dst_release(inet->cork.base.dst);
1476                 inet->cork.base.dst = NULL;
1477                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1478         }
1479         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1480 }
1481
1482 int ip6_push_pending_frames(struct sock *sk)
1483 {
1484         struct sk_buff *skb, *tmp_skb;
1485         struct sk_buff **tail_skb;
1486         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1487         struct inet_sock *inet = inet_sk(sk);
1488         struct ipv6_pinfo *np = inet6_sk(sk);
1489         struct net *net = sock_net(sk);
1490         struct ipv6hdr *hdr;
1491         struct ipv6_txoptions *opt = np->cork.opt;
1492         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1493         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1494         unsigned char proto = fl6->flowi6_proto;
1495         int err = 0;
1496
1497         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1498                 goto out;
1499         tail_skb = &(skb_shinfo(skb)->frag_list);
1500
1501         /* move skb->data to ip header from ext header */
1502         if (skb->data < skb_network_header(skb))
1503                 __skb_pull(skb, skb_network_offset(skb));
1504         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1505                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1506                 *tail_skb = tmp_skb;
1507                 tail_skb = &(tmp_skb->next);
1508                 skb->len += tmp_skb->len;
1509                 skb->data_len += tmp_skb->len;
1510                 skb->truesize += tmp_skb->truesize;
1511                 tmp_skb->destructor = NULL;
1512                 tmp_skb->sk = NULL;
1513         }
1514
1515         /* Allow local fragmentation. */
1516         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1517                 skb->local_df = 1;
1518
1519         *final_dst = fl6->daddr;
1520         __skb_pull(skb, skb_network_header_len(skb));
1521         if (opt && opt->opt_flen)
1522                 ipv6_push_frag_opts(skb, opt, &proto);
1523         if (opt && opt->opt_nflen)
1524                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1525
1526         skb_push(skb, sizeof(struct ipv6hdr));
1527         skb_reset_network_header(skb);
1528         hdr = ipv6_hdr(skb);
1529
1530         ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1531         hdr->hop_limit = np->cork.hop_limit;
1532         hdr->nexthdr = proto;
1533         hdr->saddr = fl6->saddr;
1534         hdr->daddr = *final_dst;
1535
1536         skb->priority = sk->sk_priority;
1537         skb->mark = sk->sk_mark;
1538
1539         skb_dst_set(skb, dst_clone(&rt->dst));
1540         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1541         if (proto == IPPROTO_ICMPV6) {
1542                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1543
1544                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1545                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1546         }
1547
1548         err = ip6_local_out(skb);
1549         if (err) {
1550                 if (err > 0)
1551                         err = net_xmit_errno(err);
1552                 if (err)
1553                         goto error;
1554         }
1555
1556 out:
1557         ip6_cork_release(inet, np);
1558         return err;
1559 error:
1560         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561         goto out;
1562 }
1563 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1564
1565 void ip6_flush_pending_frames(struct sock *sk)
1566 {
1567         struct sk_buff *skb;
1568
1569         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1570                 if (skb_dst(skb))
1571                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1572                                       IPSTATS_MIB_OUTDISCARDS);
1573                 kfree_skb(skb);
1574         }
1575
1576         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1577 }
1578 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);