net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      is been reached. This won't break
  37  *                                      old apps and it will avoid huge amount
  38  *                                      of socks hashed (this for unix_gc()
  39  *                                      performances reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85 #include <linux/module.h>
  86 #include <linux/kernel.h>
  87 #include <linux/signal.h>
  88 #include <linux/sched.h>
  89 #include <linux/errno.h>
  90 #include <linux/string.h>
  91 #include <linux/stat.h>
  92 #include <linux/dcache.h>
  93 #include <linux/namei.h>
  94 #include <linux/socket.h>
  95 #include <linux/un.h>
  96 #include <linux/fcntl.h>
  97 #include <linux/termios.h>
  98 #include <linux/sockios.h>
  99 #include <linux/net.h>
 100 #include <linux/in.h>
 101 #include <linux/fs.h>
 102 #include <linux/slab.h>
 103 #include <asm/uaccess.h>
 104 #include <linux/skbuff.h>
 105 #include <linux/netdevice.h>
 106 #include <net/net_namespace.h>
 107 #include <net/sock.h>
 108 #include <net/tcp_states.h>
 109 #include <net/af_unix.h>
 110 #include <linux/proc_fs.h>
 111 #include <linux/seq_file.h>
 112 #include <net/scm.h>
 113 #include <linux/init.h>
 114 #include <linux/poll.h>
 115 #include <linux/rtnetlink.h>
 116 #include <linux/mount.h>
 117 #include <net/checksum.h>
 118 #include <linux/security.h>
 119 #include <linux/freezer.h>
 120
 121 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 122 EXPORT_SYMBOL_GPL(unix_socket_table);
 123 DEFINE_SPINLOCK(unix_table_lock);
 124 EXPORT_SYMBOL_GPL(unix_table_lock);
 125 static atomic_long_t unix_nr_socks;
 126
 127
 128 static struct hlist_head *unix_sockets_unbound(void *addr)
 129 {
 130         unsigned long hash = (unsigned long)addr;
 131
 132         hash ^= hash >> 16;
 133         hash ^= hash >> 8;
 134         hash %= UNIX_HASH_SIZE;
 135         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 136 }
 137
 138 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 139
 140 #ifdef CONFIG_SECURITY_NETWORK
 141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 142 {
 143         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 144 }
 145
 146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 147 {
 148         scm->secid = *UNIXSID(skb);
 149 }
 150 #else
 151 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 152 { }
 153
 154 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 155 { }
 156 #endif /* CONFIG_SECURITY_NETWORK */
 157
 158 /*
 159  *  SMP locking strategy:
 160  *    hash table is protected with spinlock unix_table_lock
 161  *    each socket state is protected by separate spin lock.
 162  */
 163
 164 static inline unsigned int unix_hash_fold(__wsum n)
 165 {
 166         unsigned int hash = (__force unsigned int)csum_fold(n);
 167
 168         hash ^= hash>>8;
 169         return hash&(UNIX_HASH_SIZE-1);
 170 }
 171
 172 #define unix_peer(sk) (unix_sk(sk)->peer)
 173
 174 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 175 {
 176         return unix_peer(osk) == sk;
 177 }
 178
 179 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 180 {
 181         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 182 }
 183
 184 static inline int unix_recvq_full(struct sock const *sk)
 185 {
 186         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 187 }
 188
 189 struct sock *unix_peer_get(struct sock *s)
 190 {
 191         struct sock *peer;
 192
 193         unix_state_lock(s);
 194         peer = unix_peer(s);
 195         if (peer)
 196                 sock_hold(peer);
 197         unix_state_unlock(s);
 198         return peer;
 199 }
 200 EXPORT_SYMBOL_GPL(unix_peer_get);
 201
 202 static inline void unix_release_addr(struct unix_address *addr)
 203 {
 204         if (atomic_dec_and_test(&addr->refcnt))
 205                 kfree(addr);
 206 }
 207
 208 /*
 209  *      Check unix socket name:
 210  *              - should be not zero length.
 211  *              - if started by not zero, should be NULL terminated (FS object)
 212  *              - if started by zero, it is abstract name.
 213  */
 214
 215 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 216 {
 217         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 218                 return -EINVAL;
 219         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 220                 return -EINVAL;
 221         if (sunaddr->sun_path[0]) {
 222                 /*
 223                  * This may look like an off by one error but it is a bit more
 224                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 225                  * sun_path[108] doesn't as such exist.  However in kernel space
 226                  * we are guaranteed that it is a valid memory location in our
 227                  * kernel address buffer.
 228                  */
 229                 ((char *)sunaddr)[len] = 0;
 230                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 231                 return len;
 232         }
 233
 234         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 235         return len;
 236 }
 237
 238 static void __unix_remove_socket(struct sock *sk)
 239 {
 240         sk_del_node_init(sk);
 241 }
 242
 243 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 244 {
 245         WARN_ON(!sk_unhashed(sk));
 246         sk_add_node(sk, list);
 247 }
 248
 249 static inline void unix_remove_socket(struct sock *sk)
 250 {
 251         spin_lock(&unix_table_lock);
 252         __unix_remove_socket(sk);
 253         spin_unlock(&unix_table_lock);
 254 }
 255
 256 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 257 {
 258         spin_lock(&unix_table_lock);
 259         __unix_insert_socket(list, sk);
 260         spin_unlock(&unix_table_lock);
 261 }
 262
 263 static struct sock *__unix_find_socket_byname(struct net *net,
 264                                               struct sockaddr_un *sunname,
 265                                               int len, int type, unsigned int hash)
 266 {
 267         struct sock *s;
 268
 269         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 270                 struct unix_sock *u = unix_sk(s);
 271
 272                 if (!net_eq(sock_net(s), net))
 273                         continue;
 274
 275                 if (u->addr->len == len &&
 276                     !memcmp(u->addr->name, sunname, len))
 277                         goto found;
 278         }
 279         s = NULL;
 280 found:
 281         return s;
 282 }
 283
 284 static inline struct sock *unix_find_socket_byname(struct net *net,
 285                                                    struct sockaddr_un *sunname,
 286                                                    int len, int type,
 287                                                    unsigned int hash)
 288 {
 289         struct sock *s;
 290
 291         spin_lock(&unix_table_lock);
 292         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 293         if (s)
 294                 sock_hold(s);
 295         spin_unlock(&unix_table_lock);
 296         return s;
 297 }
 298
 299 static struct sock *unix_find_socket_byinode(struct inode *i)
 300 {
 301         struct sock *s;
 302
 303         spin_lock(&unix_table_lock);
 304         sk_for_each(s,
 305                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 306                 struct dentry *dentry = unix_sk(s)->path.dentry;
 307
 308                 if (dentry && d_backing_inode(dentry) == i) {
 309                         sock_hold(s);
 310                         goto found;
 311                 }
 312         }
 313         s = NULL;
 314 found:
 315         spin_unlock(&unix_table_lock);
 316         return s;
 317 }
 318
 319 static inline int unix_writable(struct sock *sk)
 320 {
 321         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 322 }
 323
 324 static void unix_write_space(struct sock *sk)
 325 {
 326         struct socket_wq *wq;
 327
 328         rcu_read_lock();
 329         if (unix_writable(sk)) {
 330                 wq = rcu_dereference(sk->sk_wq);
 331                 if (wq_has_sleeper(wq))
 332                         wake_up_interruptible_sync_poll(&wq->wait,
 333                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 334                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 335         }
 336         rcu_read_unlock();
 337 }
 338
 339 /* When dgram socket disconnects (or changes its peer), we clear its receive
 340  * queue of packets arrived from previous peer. First, it allows to do
 341  * flow control based only on wmem_alloc; second, sk connected to peer
 342  * may receive messages only from that peer. */
 343 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 344 {
 345         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 346                 skb_queue_purge(&sk->sk_receive_queue);
 347                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 348
 349                 /* If one link of bidirectional dgram pipe is disconnected,
 350                  * we signal error. Messages are lost. Do not make this,
 351                  * when peer was not connected to us.
 352                  */
 353                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 354                         other->sk_err = ECONNRESET;
 355                         other->sk_error_report(other);
 356                 }
 357         }
 358 }
 359
 360 static void unix_sock_destructor(struct sock *sk)
 361 {
 362         struct unix_sock *u = unix_sk(sk);
 363
 364         skb_queue_purge(&sk->sk_receive_queue);
 365
 366         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 367         WARN_ON(!sk_unhashed(sk));
 368         WARN_ON(sk->sk_socket);
 369         if (!sock_flag(sk, SOCK_DEAD)) {
 370                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 371                 return;
 372         }
 373
 374         if (u->addr)
 375                 unix_release_addr(u->addr);
 376
 377         atomic_long_dec(&unix_nr_socks);
 378         local_bh_disable();
 379         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 380         local_bh_enable();
 381 #ifdef UNIX_REFCNT_DEBUG
 382         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 383                 atomic_long_read(&unix_nr_socks));
 384 #endif
 385 }
 386
 387 static void unix_release_sock(struct sock *sk, int embrion)
 388 {
 389         struct unix_sock *u = unix_sk(sk);
 390         struct path path;
 391         struct sock *skpair;
 392         struct sk_buff *skb;
 393         int state;
 394
 395         unix_remove_socket(sk);
 396
 397         /* Clear state */
 398         unix_state_lock(sk);
 399         sock_orphan(sk);
 400         sk->sk_shutdown = SHUTDOWN_MASK;
 401         path         = u->path;
 402         u->path.dentry = NULL;
 403         u->path.mnt = NULL;
 404         state = sk->sk_state;
 405         sk->sk_state = TCP_CLOSE;
 406         unix_state_unlock(sk);
 407
 408         wake_up_interruptible_all(&u->peer_wait);
 409
 410         skpair = unix_peer(sk);
 411
 412         if (skpair != NULL) {
 413                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 414                         unix_state_lock(skpair);
 415                         /* No more writes */
 416                         skpair->sk_shutdown = SHUTDOWN_MASK;
 417                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 418                                 skpair->sk_err = ECONNRESET;
 419                         unix_state_unlock(skpair);
 420                         skpair->sk_state_change(skpair);
 421                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 422                 }
 423                 sock_put(skpair); /* It may now die */
 424                 unix_peer(sk) = NULL;
 425         }
 426
 427         /* Try to flush out this socket. Throw out buffers at least */
 428
 429         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 430                 if (state == TCP_LISTEN)
 431                         unix_release_sock(skb->sk, 1);
 432                 /* passed fds are erased in the kfree_skb hook        */
 433                 kfree_skb(skb);
 434         }
 435
 436         if (path.dentry)
 437                 path_put(&path);
 438
 439         sock_put(sk);
 440
 441         /* ---- Socket is dead now and most probably destroyed ---- */
 442
 443         /*
 444          * Fixme: BSD difference: In BSD all sockets connected to us get
 445          *        ECONNRESET and we die on the spot. In Linux we behave
 446          *        like files and pipes do and wait for the last
 447          *        dereference.
 448          *
 449          * Can't we simply set sock->err?
 450          *
 451          *        What the above comment does talk about? --ANK(980817)
 452          */
 453
 454         if (unix_tot_inflight)
 455                 unix_gc();              /* Garbage collect fds */
 456 }
 457
 458 static void init_peercred(struct sock *sk)
 459 {
 460         put_pid(sk->sk_peer_pid);
 461         if (sk->sk_peer_cred)
 462                 put_cred(sk->sk_peer_cred);
 463         sk->sk_peer_pid  = get_pid(task_tgid(current));
 464         sk->sk_peer_cred = get_current_cred();
 465 }
 466
 467 static void copy_peercred(struct sock *sk, struct sock *peersk)
 468 {
 469         put_pid(sk->sk_peer_pid);
 470         if (sk->sk_peer_cred)
 471                 put_cred(sk->sk_peer_cred);
 472         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 473         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 474 }
 475
 476 static int unix_listen(struct socket *sock, int backlog)
 477 {
 478         int err;
 479         struct sock *sk = sock->sk;
 480         struct unix_sock *u = unix_sk(sk);
 481         struct pid *old_pid = NULL;
 482
 483         err = -EOPNOTSUPP;
 484         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 485                 goto out;       /* Only stream/seqpacket sockets accept */
 486         err = -EINVAL;
 487         if (!u->addr)
 488                 goto out;       /* No listens on an unbound socket */
 489         unix_state_lock(sk);
 490         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 491                 goto out_unlock;
 492         if (backlog > sk->sk_max_ack_backlog)
 493                 wake_up_interruptible_all(&u->peer_wait);
 494         sk->sk_max_ack_backlog  = backlog;
 495         sk->sk_state            = TCP_LISTEN;
 496         /* set credentials so connect can copy them */
 497         init_peercred(sk);
 498         err = 0;
 499
 500 out_unlock:
 501         unix_state_unlock(sk);
 502         put_pid(old_pid);
 503 out:
 504         return err;
 505 }
 506
 507 static int unix_release(struct socket *);
 508 static int unix_bind(struct socket *, struct sockaddr *, int);
 509 static int unix_stream_connect(struct socket *, struct sockaddr *,
 510                                int addr_len, int flags);
 511 static int unix_socketpair(struct socket *, struct socket *);
 512 static int unix_accept(struct socket *, struct socket *, int);
 513 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 514 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 515 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 516                                     poll_table *);
 517 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 518 static int unix_shutdown(struct socket *, int);
 519 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 520 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 521 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 522                                     size_t size, int flags);
 523 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 524                                        struct pipe_inode_info *, size_t size,
 525                                        unsigned int flags);
 526 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 527 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 528 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 529                               int, int);
 530 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 531 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 532                                   int);
 533
 534 static int unix_set_peek_off(struct sock *sk, int val)
 535 {
 536         struct unix_sock *u = unix_sk(sk);
 537
 538         if (mutex_lock_interruptible(&u->readlock))
 539                 return -EINTR;
 540
 541         sk->sk_peek_off = val;
 542         mutex_unlock(&u->readlock);
 543
 544         return 0;
 545 }
 546
 547
 548 static const struct proto_ops unix_stream_ops = {
 549         .family =       PF_UNIX,
 550         .owner =        THIS_MODULE,
 551         .release =      unix_release,
 552         .bind =         unix_bind,
 553         .connect =      unix_stream_connect,
 554         .socketpair =   unix_socketpair,
 555         .accept =       unix_accept,
 556         .getname =      unix_getname,
 557         .poll =         unix_poll,
 558         .ioctl =        unix_ioctl,
 559         .listen =       unix_listen,
 560         .shutdown =     unix_shutdown,
 561         .setsockopt =   sock_no_setsockopt,
 562         .getsockopt =   sock_no_getsockopt,
 563         .sendmsg =      unix_stream_sendmsg,
 564         .recvmsg =      unix_stream_recvmsg,
 565         .mmap =         sock_no_mmap,
 566         .sendpage =     unix_stream_sendpage,
 567         .splice_read =  unix_stream_splice_read,
 568         .set_peek_off = unix_set_peek_off,
 569 };
 570
 571 static const struct proto_ops unix_dgram_ops = {
 572         .family =       PF_UNIX,
 573         .owner =        THIS_MODULE,
 574         .release =      unix_release,
 575         .bind =         unix_bind,
 576         .connect =      unix_dgram_connect,
 577         .socketpair =   unix_socketpair,
 578         .accept =       sock_no_accept,
 579         .getname =      unix_getname,
 580         .poll =         unix_dgram_poll,
 581         .ioctl =        unix_ioctl,
 582         .listen =       sock_no_listen,
 583         .shutdown =     unix_shutdown,
 584         .setsockopt =   sock_no_setsockopt,
 585         .getsockopt =   sock_no_getsockopt,
 586         .sendmsg =      unix_dgram_sendmsg,
 587         .recvmsg =      unix_dgram_recvmsg,
 588         .mmap =         sock_no_mmap,
 589         .sendpage =     sock_no_sendpage,
 590         .set_peek_off = unix_set_peek_off,
 591 };
 592
 593 static const struct proto_ops unix_seqpacket_ops = {
 594         .family =       PF_UNIX,
 595         .owner =        THIS_MODULE,
 596         .release =      unix_release,
 597         .bind =         unix_bind,
 598         .connect =      unix_stream_connect,
 599         .socketpair =   unix_socketpair,
 600         .accept =       unix_accept,
 601         .getname =      unix_getname,
 602         .poll =         unix_dgram_poll,
 603         .ioctl =        unix_ioctl,
 604         .listen =       unix_listen,
 605         .shutdown =     unix_shutdown,
 606         .setsockopt =   sock_no_setsockopt,
 607         .getsockopt =   sock_no_getsockopt,
 608         .sendmsg =      unix_seqpacket_sendmsg,
 609         .recvmsg =      unix_seqpacket_recvmsg,
 610         .mmap =         sock_no_mmap,
 611         .sendpage =     sock_no_sendpage,
 612         .set_peek_off = unix_set_peek_off,
 613 };
 614
 615 static struct proto unix_proto = {
 616         .name                   = "UNIX",
 617         .owner                  = THIS_MODULE,
 618         .obj_size               = sizeof(struct unix_sock),
 619 };
 620
 621 /*
 622  * AF_UNIX sockets do not interact with hardware, hence they
 623  * dont trigger interrupts - so it's safe for them to have
 624  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 625  * this special lock-class by reinitializing the spinlock key:
 626  */
 627 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 628
 629 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 630 {
 631         struct sock *sk = NULL;
 632         struct unix_sock *u;
 633
 634         atomic_long_inc(&unix_nr_socks);
 635         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 636                 goto out;
 637
 638         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 639         if (!sk)
 640                 goto out;
 641
 642         sock_init_data(sock, sk);
 643         lockdep_set_class(&sk->sk_receive_queue.lock,
 644                                 &af_unix_sk_receive_queue_lock_key);
 645
 646         sk->sk_write_space      = unix_write_space;
 647         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 648         sk->sk_destruct         = unix_sock_destructor;
 649         u         = unix_sk(sk);
 650         u->path.dentry = NULL;
 651         u->path.mnt = NULL;
 652         spin_lock_init(&u->lock);
 653         atomic_long_set(&u->inflight, 0);
 654         INIT_LIST_HEAD(&u->link);
 655         mutex_init(&u->readlock); /* single task reading lock */
 656         init_waitqueue_head(&u->peer_wait);
 657         unix_insert_socket(unix_sockets_unbound(sk), sk);
 658 out:
 659         if (sk == NULL)
 660                 atomic_long_dec(&unix_nr_socks);
 661         else {
 662                 local_bh_disable();
 663                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 664                 local_bh_enable();
 665         }
 666         return sk;
 667 }
 668
 669 static int unix_create(struct net *net, struct socket *sock, int protocol,
 670                        int kern)
 671 {
 672         if (protocol && protocol != PF_UNIX)
 673                 return -EPROTONOSUPPORT;
 674
 675         sock->state = SS_UNCONNECTED;
 676
 677         switch (sock->type) {
 678         case SOCK_STREAM:
 679                 sock->ops = &unix_stream_ops;
 680                 break;
 681                 /*
 682                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 683                  *      nothing uses it.
 684                  */
 685         case SOCK_RAW:
 686                 sock->type = SOCK_DGRAM;
 687         case SOCK_DGRAM:
 688                 sock->ops = &unix_dgram_ops;
 689                 break;
 690         case SOCK_SEQPACKET:
 691                 sock->ops = &unix_seqpacket_ops;
 692                 break;
 693         default:
 694                 return -ESOCKTNOSUPPORT;
 695         }
 696
 697         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 698 }
 699
 700 static int unix_release(struct socket *sock)
 701 {
 702         struct sock *sk = sock->sk;
 703
 704         if (!sk)
 705                 return 0;
 706
 707         unix_release_sock(sk, 0);
 708         sock->sk = NULL;
 709
 710         return 0;
 711 }
 712
 713 static int unix_autobind(struct socket *sock)
 714 {
 715         struct sock *sk = sock->sk;
 716         struct net *net = sock_net(sk);
 717         struct unix_sock *u = unix_sk(sk);
 718         static u32 ordernum = 1;
 719         struct unix_address *addr;
 720         int err;
 721         unsigned int retries = 0;
 722
 723         err = mutex_lock_interruptible(&u->readlock);
 724         if (err)
 725                 return err;
 726
 727         err = 0;
 728         if (u->addr)
 729                 goto out;
 730
 731         err = -ENOMEM;
 732         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 733         if (!addr)
 734                 goto out;
 735
 736         addr->name->sun_family = AF_UNIX;
 737         atomic_set(&addr->refcnt, 1);
 738
 739 retry:
 740         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 741         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 742
 743         spin_lock(&unix_table_lock);
 744         ordernum = (ordernum+1)&0xFFFFF;
 745
 746         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 747                                       addr->hash)) {
 748                 spin_unlock(&unix_table_lock);
 749                 /*
 750                  * __unix_find_socket_byname() may take long time if many names
 751                  * are already in use.
 752                  */
 753                 cond_resched();
 754                 /* Give up if all names seems to be in use. */
 755                 if (retries++ == 0xFFFFF) {
 756                         err = -ENOSPC;
 757                         kfree(addr);
 758                         goto out;
 759                 }
 760                 goto retry;
 761         }
 762         addr->hash ^= sk->sk_type;
 763
 764         __unix_remove_socket(sk);
 765         u->addr = addr;
 766         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 767         spin_unlock(&unix_table_lock);
 768         err = 0;
 769
 770 out:    mutex_unlock(&u->readlock);
 771         return err;
 772 }
 773
 774 static struct sock *unix_find_other(struct net *net,
 775                                     struct sockaddr_un *sunname, int len,
 776                                     int type, unsigned int hash, int *error)
 777 {
 778         struct sock *u;
 779         struct path path;
 780         int err = 0;
 781
 782         if (sunname->sun_path[0]) {
 783                 struct inode *inode;
 784                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 785                 if (err)
 786                         goto fail;
 787                 inode = d_backing_inode(path.dentry);
 788                 err = inode_permission(inode, MAY_WRITE);
 789                 if (err)
 790                         goto put_fail;
 791
 792                 err = -ECONNREFUSED;
 793                 if (!S_ISSOCK(inode->i_mode))
 794                         goto put_fail;
 795                 u = unix_find_socket_byinode(inode);
 796                 if (!u)
 797                         goto put_fail;
 798
 799                 if (u->sk_type == type)
 800                         touch_atime(&path);
 801
 802                 path_put(&path);
 803
 804                 err = -EPROTOTYPE;
 805                 if (u->sk_type != type) {
 806                         sock_put(u);
 807                         goto fail;
 808                 }
 809         } else {
 810                 err = -ECONNREFUSED;
 811                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 812                 if (u) {
 813                         struct dentry *dentry;
 814                         dentry = unix_sk(u)->path.dentry;
 815                         if (dentry)
 816                                 touch_atime(&unix_sk(u)->path);
 817                 } else
 818                         goto fail;
 819         }
 820         return u;
 821
 822 put_fail:
 823         path_put(&path);
 824 fail:
 825         *error = err;
 826         return NULL;
 827 }
 828
 829 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 830 {
 831         struct dentry *dentry;
 832         struct path path;
 833         int err = 0;
 834         /*
 835          * Get the parent directory, calculate the hash for last
 836          * component.
 837          */
 838         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 839         err = PTR_ERR(dentry);
 840         if (IS_ERR(dentry))
 841                 return err;
 842
 843         /*
 844          * All right, let's create it.
 845          */
 846         err = security_path_mknod(&path, dentry, mode, 0);
 847         if (!err) {
 848                 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 849                 if (!err) {
 850                         res->mnt = mntget(path.mnt);
 851                         res->dentry = dget(dentry);
 852                 }
 853         }
 854         done_path_create(&path, dentry);
 855         return err;
 856 }
 857
 858 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 859 {
 860         struct sock *sk = sock->sk;
 861         struct net *net = sock_net(sk);
 862         struct unix_sock *u = unix_sk(sk);
 863         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 864         char *sun_path = sunaddr->sun_path;
 865         int err;
 866         unsigned int hash;
 867         struct unix_address *addr;
 868         struct hlist_head *list;
 869
 870         err = -EINVAL;
 871         if (sunaddr->sun_family != AF_UNIX)
 872                 goto out;
 873
 874         if (addr_len == sizeof(short)) {
 875                 err = unix_autobind(sock);
 876                 goto out;
 877         }
 878
 879         err = unix_mkname(sunaddr, addr_len, &hash);
 880         if (err < 0)
 881                 goto out;
 882         addr_len = err;
 883
 884         err = mutex_lock_interruptible(&u->readlock);
 885         if (err)
 886                 goto out;
 887
 888         err = -EINVAL;
 889         if (u->addr)
 890                 goto out_up;
 891
 892         err = -ENOMEM;
 893         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 894         if (!addr)
 895                 goto out_up;
 896
 897         memcpy(addr->name, sunaddr, addr_len);
 898         addr->len = addr_len;
 899         addr->hash = hash ^ sk->sk_type;
 900         atomic_set(&addr->refcnt, 1);
 901
 902         if (sun_path[0]) {
 903                 struct path path;
 904                 umode_t mode = S_IFSOCK |
 905                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 906                 err = unix_mknod(sun_path, mode, &path);
 907                 if (err) {
 908                         if (err == -EEXIST)
 909                                 err = -EADDRINUSE;
 910                         unix_release_addr(addr);
 911                         goto out_up;
 912                 }
 913                 addr->hash = UNIX_HASH_SIZE;
 914                 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
 915                 spin_lock(&unix_table_lock);
 916                 u->path = path;
 917                 list = &unix_socket_table[hash];
 918         } else {
 919                 spin_lock(&unix_table_lock);
 920                 err = -EADDRINUSE;
 921                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 922                                               sk->sk_type, hash)) {
 923                         unix_release_addr(addr);
 924                         goto out_unlock;
 925                 }
 926
 927                 list = &unix_socket_table[addr->hash];
 928         }
 929
 930         err = 0;
 931         __unix_remove_socket(sk);
 932         u->addr = addr;
 933         __unix_insert_socket(list, sk);
 934
 935 out_unlock:
 936         spin_unlock(&unix_table_lock);
 937 out_up:
 938         mutex_unlock(&u->readlock);
 939 out:
 940         return err;
 941 }
 942
 943 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 944 {
 945         if (unlikely(sk1 == sk2) || !sk2) {
 946                 unix_state_lock(sk1);
 947                 return;
 948         }
 949         if (sk1 < sk2) {
 950                 unix_state_lock(sk1);
 951                 unix_state_lock_nested(sk2);
 952         } else {
 953                 unix_state_lock(sk2);
 954                 unix_state_lock_nested(sk1);
 955         }
 956 }
 957
 958 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 959 {
 960         if (unlikely(sk1 == sk2) || !sk2) {
 961                 unix_state_unlock(sk1);
 962                 return;
 963         }
 964         unix_state_unlock(sk1);
 965         unix_state_unlock(sk2);
 966 }
 967
 968 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 969                               int alen, int flags)
 970 {
 971         struct sock *sk = sock->sk;
 972         struct net *net = sock_net(sk);
 973         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 974         struct sock *other;
 975         unsigned int hash;
 976         int err;
 977
 978         if (addr->sa_family != AF_UNSPEC) {
 979                 err = unix_mkname(sunaddr, alen, &hash);
 980                 if (err < 0)
 981                         goto out;
 982                 alen = err;
 983
 984                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 985                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 986                         goto out;
 987
 988 restart:
 989                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 990                 if (!other)
 991                         goto out;
 992
 993                 unix_state_double_lock(sk, other);
 994
 995                 /* Apparently VFS overslept socket death. Retry. */
 996                 if (sock_flag(other, SOCK_DEAD)) {
 997                         unix_state_double_unlock(sk, other);
 998                         sock_put(other);
 999                         goto restart;
1000                 }
1001
1002                 err = -EPERM;
1003                 if (!unix_may_send(sk, other))
1004                         goto out_unlock;
1005
1006                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1007                 if (err)
1008                         goto out_unlock;
1009
1010         } else {
1011                 /*
1012                  *      1003.1g breaking connected state with AF_UNSPEC
1013                  */
1014                 other = NULL;
1015                 unix_state_double_lock(sk, other);
1016         }
1017
1018         /*
1019          * If it was connected, reconnect.
1020          */
1021         if (unix_peer(sk)) {
1022                 struct sock *old_peer = unix_peer(sk);
1023                 unix_peer(sk) = other;
1024                 unix_state_double_unlock(sk, other);
1025
1026                 if (other != old_peer)
1027                         unix_dgram_disconnected(sk, old_peer);
1028                 sock_put(old_peer);
1029         } else {
1030                 unix_peer(sk) = other;
1031                 unix_state_double_unlock(sk, other);
1032         }
1033         return 0;
1034
1035 out_unlock:
1036         unix_state_double_unlock(sk, other);
1037         sock_put(other);
1038 out:
1039         return err;
1040 }
1041
1042 static long unix_wait_for_peer(struct sock *other, long timeo)
1043 {
1044         struct unix_sock *u = unix_sk(other);
1045         int sched;
1046         DEFINE_WAIT(wait);
1047
1048         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1049
1050         sched = !sock_flag(other, SOCK_DEAD) &&
1051                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1052                 unix_recvq_full(other);
1053
1054         unix_state_unlock(other);
1055
1056         if (sched)
1057                 timeo = schedule_timeout(timeo);
1058
1059         finish_wait(&u->peer_wait, &wait);
1060         return timeo;
1061 }
1062
1063 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1064                                int addr_len, int flags)
1065 {
1066         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1067         struct sock *sk = sock->sk;
1068         struct net *net = sock_net(sk);
1069         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1070         struct sock *newsk = NULL;
1071         struct sock *other = NULL;
1072         struct sk_buff *skb = NULL;
1073         unsigned int hash;
1074         int st;
1075         int err;
1076         long timeo;
1077
1078         err = unix_mkname(sunaddr, addr_len, &hash);
1079         if (err < 0)
1080                 goto out;
1081         addr_len = err;
1082
1083         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1084             (err = unix_autobind(sock)) != 0)
1085                 goto out;
1086
1087         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1088
1089         /* First of all allocate resources.
1090            If we will make it after state is locked,
1091            we will have to recheck all again in any case.
1092          */
1093
1094         err = -ENOMEM;
1095
1096         /* create new sock for complete connection */
1097         newsk = unix_create1(sock_net(sk), NULL, 0);
1098         if (newsk == NULL)
1099                 goto out;
1100
1101         /* Allocate skb for sending to listening sock */
1102         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1103         if (skb == NULL)
1104                 goto out;
1105
1106 restart:
1107         /*  Find listening sock. */
1108         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1109         if (!other)
1110                 goto out;
1111
1112         /* Latch state of peer */
1113         unix_state_lock(other);
1114
1115         /* Apparently VFS overslept socket death. Retry. */
1116         if (sock_flag(other, SOCK_DEAD)) {
1117                 unix_state_unlock(other);
1118                 sock_put(other);
1119                 goto restart;
1120         }
1121
1122         err = -ECONNREFUSED;
1123         if (other->sk_state != TCP_LISTEN)
1124                 goto out_unlock;
1125         if (other->sk_shutdown & RCV_SHUTDOWN)
1126                 goto out_unlock;
1127
1128         if (unix_recvq_full(other)) {
1129                 err = -EAGAIN;
1130                 if (!timeo)
1131                         goto out_unlock;
1132
1133                 timeo = unix_wait_for_peer(other, timeo);
1134
1135                 err = sock_intr_errno(timeo);
1136                 if (signal_pending(current))
1137                         goto out;
1138                 sock_put(other);
1139                 goto restart;
1140         }
1141
1142         /* Latch our state.
1143
1144            It is tricky place. We need to grab our state lock and cannot
1145            drop lock on peer. It is dangerous because deadlock is
1146            possible. Connect to self case and simultaneous
1147            attempt to connect are eliminated by checking socket
1148            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1149            check this before attempt to grab lock.
1150
1151            Well, and we have to recheck the state after socket locked.
1152          */
1153         st = sk->sk_state;
1154
1155         switch (st) {
1156         case TCP_CLOSE:
1157                 /* This is ok... continue with connect */
1158                 break;
1159         case TCP_ESTABLISHED:
1160                 /* Socket is already connected */
1161                 err = -EISCONN;
1162                 goto out_unlock;
1163         default:
1164                 err = -EINVAL;
1165                 goto out_unlock;
1166         }
1167
1168         unix_state_lock_nested(sk);
1169
1170         if (sk->sk_state != st) {
1171                 unix_state_unlock(sk);
1172                 unix_state_unlock(other);
1173                 sock_put(other);
1174                 goto restart;
1175         }
1176
1177         err = security_unix_stream_connect(sk, other, newsk);
1178         if (err) {
1179                 unix_state_unlock(sk);
1180                 goto out_unlock;
1181         }
1182
1183         /* The way is open! Fastly set all the necessary fields... */
1184
1185         sock_hold(sk);
1186         unix_peer(newsk)        = sk;
1187         newsk->sk_state         = TCP_ESTABLISHED;
1188         newsk->sk_type          = sk->sk_type;
1189         init_peercred(newsk);
1190         newu = unix_sk(newsk);
1191         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1192         otheru = unix_sk(other);
1193
1194         /* copy address information from listening to new sock*/
1195         if (otheru->addr) {
1196                 atomic_inc(&otheru->addr->refcnt);
1197                 newu->addr = otheru->addr;
1198         }
1199         if (otheru->path.dentry) {
1200                 path_get(&otheru->path);
1201                 newu->path = otheru->path;
1202         }
1203
1204         /* Set credentials */
1205         copy_peercred(sk, other);
1206
1207         sock->state     = SS_CONNECTED;
1208         sk->sk_state    = TCP_ESTABLISHED;
1209         sock_hold(newsk);
1210
1211         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1212         unix_peer(sk)   = newsk;
1213
1214         unix_state_unlock(sk);
1215
1216         /* take ten and and send info to listening sock */
1217         spin_lock(&other->sk_receive_queue.lock);
1218         __skb_queue_tail(&other->sk_receive_queue, skb);
1219         spin_unlock(&other->sk_receive_queue.lock);
1220         unix_state_unlock(other);
1221         other->sk_data_ready(other);
1222         sock_put(other);
1223         return 0;
1224
1225 out_unlock:
1226         if (other)
1227                 unix_state_unlock(other);
1228
1229 out:
1230         kfree_skb(skb);
1231         if (newsk)
1232                 unix_release_sock(newsk, 0);
1233         if (other)
1234                 sock_put(other);
1235         return err;
1236 }
1237
1238 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1239 {
1240         struct sock *ska = socka->sk, *skb = sockb->sk;
1241
1242         /* Join our sockets back to back */
1243         sock_hold(ska);
1244         sock_hold(skb);
1245         unix_peer(ska) = skb;
1246         unix_peer(skb) = ska;
1247         init_peercred(ska);
1248         init_peercred(skb);
1249
1250         if (ska->sk_type != SOCK_DGRAM) {
1251                 ska->sk_state = TCP_ESTABLISHED;
1252                 skb->sk_state = TCP_ESTABLISHED;
1253                 socka->state  = SS_CONNECTED;
1254                 sockb->state  = SS_CONNECTED;
1255         }
1256         return 0;
1257 }
1258
1259 static void unix_sock_inherit_flags(const struct socket *old,
1260                                     struct socket *new)
1261 {
1262         if (test_bit(SOCK_PASSCRED, &old->flags))
1263                 set_bit(SOCK_PASSCRED, &new->flags);
1264         if (test_bit(SOCK_PASSSEC, &old->flags))
1265                 set_bit(SOCK_PASSSEC, &new->flags);
1266 }
1267
1268 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1269 {
1270         struct sock *sk = sock->sk;
1271         struct sock *tsk;
1272         struct sk_buff *skb;
1273         int err;
1274
1275         err = -EOPNOTSUPP;
1276         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1277                 goto out;
1278
1279         err = -EINVAL;
1280         if (sk->sk_state != TCP_LISTEN)
1281                 goto out;
1282
1283         /* If socket state is TCP_LISTEN it cannot change (for now...),
1284          * so that no locks are necessary.
1285          */
1286
1287         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1288         if (!skb) {
1289                 /* This means receive shutdown. */
1290                 if (err == 0)
1291                         err = -EINVAL;
1292                 goto out;
1293         }
1294
1295         tsk = skb->sk;
1296         skb_free_datagram(sk, skb);
1297         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1298
1299         /* attach accepted sock to socket */
1300         unix_state_lock(tsk);
1301         newsock->state = SS_CONNECTED;
1302         unix_sock_inherit_flags(sock, newsock);
1303         sock_graft(tsk, newsock);
1304         unix_state_unlock(tsk);
1305         return 0;
1306
1307 out:
1308         return err;
1309 }
1310
1311
1312 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1313 {
1314         struct sock *sk = sock->sk;
1315         struct unix_sock *u;
1316         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1317         int err = 0;
1318
1319         if (peer) {
1320                 sk = unix_peer_get(sk);
1321
1322                 err = -ENOTCONN;
1323                 if (!sk)
1324                         goto out;
1325                 err = 0;
1326         } else {
1327                 sock_hold(sk);
1328         }
1329
1330         u = unix_sk(sk);
1331         unix_state_lock(sk);
1332         if (!u->addr) {
1333                 sunaddr->sun_family = AF_UNIX;
1334                 sunaddr->sun_path[0] = 0;
1335                 *uaddr_len = sizeof(short);
1336         } else {
1337                 struct unix_address *addr = u->addr;
1338
1339                 *uaddr_len = addr->len;
1340                 memcpy(sunaddr, addr->name, *uaddr_len);
1341         }
1342         unix_state_unlock(sk);
1343         sock_put(sk);
1344 out:
1345         return err;
1346 }
1347
1348 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1349 {
1350         int i;
1351
1352         scm->fp = UNIXCB(skb).fp;
1353         UNIXCB(skb).fp = NULL;
1354
1355         for (i = scm->fp->count-1; i >= 0; i--)
1356                 unix_notinflight(scm->fp->fp[i]);
1357 }
1358
1359 static void unix_destruct_scm(struct sk_buff *skb)
1360 {
1361         struct scm_cookie scm;
1362         memset(&scm, 0, sizeof(scm));
1363         scm.pid  = UNIXCB(skb).pid;
1364         if (UNIXCB(skb).fp)
1365                 unix_detach_fds(&scm, skb);
1366
1367         /* Alas, it calls VFS */
1368         /* So fscking what? fput() had been SMP-safe since the last Summer */
1369         scm_destroy(&scm);
1370         sock_wfree(skb);
1371 }
1372
1373 #define MAX_RECURSION_LEVEL 4
1374
1375 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1376 {
1377         int i;
1378         unsigned char max_level = 0;
1379         int unix_sock_count = 0;
1380
1381         for (i = scm->fp->count - 1; i >= 0; i--) {
1382                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1383
1384                 if (sk) {
1385                         unix_sock_count++;
1386                         max_level = max(max_level,
1387                                         unix_sk(sk)->recursion_level);
1388                 }
1389         }
1390         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1391                 return -ETOOMANYREFS;
1392
1393         /*
1394          * Need to duplicate file references for the sake of garbage
1395          * collection.  Otherwise a socket in the fps might become a
1396          * candidate for GC while the skb is not yet queued.
1397          */
1398         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1399         if (!UNIXCB(skb).fp)
1400                 return -ENOMEM;
1401
1402         if (unix_sock_count) {
1403                 for (i = scm->fp->count - 1; i >= 0; i--)
1404                         unix_inflight(scm->fp->fp[i]);
1405         }
1406         return max_level;
1407 }
1408
1409 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1410 {
1411         int err = 0;
1412
1413         UNIXCB(skb).pid  = get_pid(scm->pid);
1414         UNIXCB(skb).uid = scm->creds.uid;
1415         UNIXCB(skb).gid = scm->creds.gid;
1416         UNIXCB(skb).fp = NULL;
1417         if (scm->fp && send_fds)
1418                 err = unix_attach_fds(scm, skb);
1419
1420         skb->destructor = unix_destruct_scm;
1421         return err;
1422 }
1423
1424 /*
1425  * Some apps rely on write() giving SCM_CREDENTIALS
1426  * We include credentials if source or destination socket
1427  * asserted SOCK_PASSCRED.
1428  */
1429 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1430                             const struct sock *other)
1431 {
1432         if (UNIXCB(skb).pid)
1433                 return;
1434         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1435             !other->sk_socket ||
1436             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1437                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1438                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1439         }
1440 }
1441
1442 /*
1443  *      Send AF_UNIX data.
1444  */
1445
1446 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1447                               size_t len)
1448 {
1449         struct sock *sk = sock->sk;
1450         struct net *net = sock_net(sk);
1451         struct unix_sock *u = unix_sk(sk);
1452         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1453         struct sock *other = NULL;
1454         int namelen = 0; /* fake GCC */
1455         int err;
1456         unsigned int hash;
1457         struct sk_buff *skb;
1458         long timeo;
1459         struct scm_cookie scm;
1460         int max_level;
1461         int data_len = 0;
1462
1463         wait_for_unix_gc();
1464         err = scm_send(sock, msg, &scm, false);
1465         if (err < 0)
1466                 return err;
1467
1468         err = -EOPNOTSUPP;
1469         if (msg->msg_flags&MSG_OOB)
1470                 goto out;
1471
1472         if (msg->msg_namelen) {
1473                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1474                 if (err < 0)
1475                         goto out;
1476                 namelen = err;
1477         } else {
1478                 sunaddr = NULL;
1479                 err = -ENOTCONN;
1480                 other = unix_peer_get(sk);
1481                 if (!other)
1482                         goto out;
1483         }
1484
1485         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1486             && (err = unix_autobind(sock)) != 0)
1487                 goto out;
1488
1489         err = -EMSGSIZE;
1490         if (len > sk->sk_sndbuf - 32)
1491                 goto out;
1492
1493         if (len > SKB_MAX_ALLOC) {
1494                 data_len = min_t(size_t,
1495                                  len - SKB_MAX_ALLOC,
1496                                  MAX_SKB_FRAGS * PAGE_SIZE);
1497                 data_len = PAGE_ALIGN(data_len);
1498
1499                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1500         }
1501
1502         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1503                                    msg->msg_flags & MSG_DONTWAIT, &err,
1504                                    PAGE_ALLOC_COSTLY_ORDER);
1505         if (skb == NULL)
1506                 goto out;
1507
1508         err = unix_scm_to_skb(&scm, skb, true);
1509         if (err < 0)
1510                 goto out_free;
1511         max_level = err + 1;
1512         unix_get_secdata(&scm, skb);
1513
1514         skb_put(skb, len - data_len);
1515         skb->data_len = data_len;
1516         skb->len = len;
1517         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1518         if (err)
1519                 goto out_free;
1520
1521         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1522
1523 restart:
1524         if (!other) {
1525                 err = -ECONNRESET;
1526                 if (sunaddr == NULL)
1527                         goto out_free;
1528
1529                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1530                                         hash, &err);
1531                 if (other == NULL)
1532                         goto out_free;
1533         }
1534
1535         if (sk_filter(other, skb) < 0) {
1536                 /* Toss the packet but do not return any error to the sender */
1537                 err = len;
1538                 goto out_free;
1539         }
1540
1541         unix_state_lock(other);
1542         err = -EPERM;
1543         if (!unix_may_send(sk, other))
1544                 goto out_unlock;
1545
1546         if (sock_flag(other, SOCK_DEAD)) {
1547                 /*
1548                  *      Check with 1003.1g - what should
1549                  *      datagram error
1550                  */
1551                 unix_state_unlock(other);
1552                 sock_put(other);
1553
1554                 err = 0;
1555                 unix_state_lock(sk);
1556                 if (unix_peer(sk) == other) {
1557                         unix_peer(sk) = NULL;
1558                         unix_state_unlock(sk);
1559
1560                         unix_dgram_disconnected(sk, other);
1561                         sock_put(other);
1562                         err = -ECONNREFUSED;
1563                 } else {
1564                         unix_state_unlock(sk);
1565                 }
1566
1567                 other = NULL;
1568                 if (err)
1569                         goto out_free;
1570                 goto restart;
1571         }
1572
1573         err = -EPIPE;
1574         if (other->sk_shutdown & RCV_SHUTDOWN)
1575                 goto out_unlock;
1576
1577         if (sk->sk_type != SOCK_SEQPACKET) {
1578                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1579                 if (err)
1580                         goto out_unlock;
1581         }
1582
1583         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1584                 if (!timeo) {
1585                         err = -EAGAIN;
1586                         goto out_unlock;
1587                 }
1588
1589                 timeo = unix_wait_for_peer(other, timeo);
1590
1591                 err = sock_intr_errno(timeo);
1592                 if (signal_pending(current))
1593                         goto out_free;
1594
1595                 goto restart;
1596         }
1597
1598         if (sock_flag(other, SOCK_RCVTSTAMP))
1599                 __net_timestamp(skb);
1600         maybe_add_creds(skb, sock, other);
1601         skb_queue_tail(&other->sk_receive_queue, skb);
1602         if (max_level > unix_sk(other)->recursion_level)
1603                 unix_sk(other)->recursion_level = max_level;
1604         unix_state_unlock(other);
1605         other->sk_data_ready(other);
1606         sock_put(other);
1607         scm_destroy(&scm);
1608         return len;
1609
1610 out_unlock:
1611         unix_state_unlock(other);
1612 out_free:
1613         kfree_skb(skb);
1614 out:
1615         if (other)
1616                 sock_put(other);
1617         scm_destroy(&scm);
1618         return err;
1619 }
1620
1621 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1622  * bytes, and a minimun of a full page.
1623  */
1624 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1625
1626 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1627                                size_t len)
1628 {
1629         struct sock *sk = sock->sk;
1630         struct sock *other = NULL;
1631         int err, size;
1632         struct sk_buff *skb;
1633         int sent = 0;
1634         struct scm_cookie scm;
1635         bool fds_sent = false;
1636         int max_level;
1637         int data_len;
1638
1639         wait_for_unix_gc();
1640         err = scm_send(sock, msg, &scm, false);
1641         if (err < 0)
1642                 return err;
1643
1644         err = -EOPNOTSUPP;
1645         if (msg->msg_flags&MSG_OOB)
1646                 goto out_err;
1647
1648         if (msg->msg_namelen) {
1649                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1650                 goto out_err;
1651         } else {
1652                 err = -ENOTCONN;
1653                 other = unix_peer(sk);
1654                 if (!other)
1655                         goto out_err;
1656         }
1657
1658         if (sk->sk_shutdown & SEND_SHUTDOWN)
1659                 goto pipe_err;
1660
1661         while (sent < len) {
1662                 size = len - sent;
1663
1664                 /* Keep two messages in the pipe so it schedules better */
1665                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1666
1667                 /* allow fallback to order-0 allocations */
1668                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1669
1670                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1671
1672                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1673
1674                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1675                                            msg->msg_flags & MSG_DONTWAIT, &err,
1676                                            get_order(UNIX_SKB_FRAGS_SZ));
1677                 if (!skb)
1678                         goto out_err;
1679
1680                 /* Only send the fds in the first buffer */
1681                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1682                 if (err < 0) {
1683                         kfree_skb(skb);
1684                         goto out_err;
1685                 }
1686                 max_level = err + 1;
1687                 fds_sent = true;
1688
1689                 skb_put(skb, size - data_len);
1690                 skb->data_len = data_len;
1691                 skb->len = size;
1692                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1693                 if (err) {
1694                         kfree_skb(skb);
1695                         goto out_err;
1696                 }
1697
1698                 unix_state_lock(other);
1699
1700                 if (sock_flag(other, SOCK_DEAD) ||
1701                     (other->sk_shutdown & RCV_SHUTDOWN))
1702                         goto pipe_err_free;
1703
1704                 maybe_add_creds(skb, sock, other);
1705                 skb_queue_tail(&other->sk_receive_queue, skb);
1706                 if (max_level > unix_sk(other)->recursion_level)
1707                         unix_sk(other)->recursion_level = max_level;
1708                 unix_state_unlock(other);
1709                 other->sk_data_ready(other);
1710                 sent += size;
1711         }
1712
1713         scm_destroy(&scm);
1714
1715         return sent;
1716
1717 pipe_err_free:
1718         unix_state_unlock(other);
1719         kfree_skb(skb);
1720 pipe_err:
1721         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1722                 send_sig(SIGPIPE, current, 0);
1723         err = -EPIPE;
1724 out_err:
1725         scm_destroy(&scm);
1726         return sent ? : err;
1727 }
1728
1729 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1730                                     int offset, size_t size, int flags)
1731 {
1732         int err = 0;
1733         bool send_sigpipe = true;
1734         struct sock *other, *sk = socket->sk;
1735         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1736
1737         if (flags & MSG_OOB)
1738                 return -EOPNOTSUPP;
1739
1740         other = unix_peer(sk);
1741         if (!other || sk->sk_state != TCP_ESTABLISHED)
1742                 return -ENOTCONN;
1743
1744         if (false) {
1745 alloc_skb:
1746                 unix_state_unlock(other);
1747                 mutex_unlock(&unix_sk(other)->readlock);
1748                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1749                                               &err, 0);
1750                 if (!newskb)
1751                         return err;
1752         }
1753
1754         /* we must acquire readlock as we modify already present
1755          * skbs in the sk_receive_queue and mess with skb->len
1756          */
1757         err = mutex_lock_interruptible(&unix_sk(other)->readlock);
1758         if (err) {
1759                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1760                 send_sigpipe = false;
1761                 goto err;
1762         }
1763
1764         if (sk->sk_shutdown & SEND_SHUTDOWN) {
1765                 err = -EPIPE;
1766                 goto err_unlock;
1767         }
1768
1769         unix_state_lock(other);
1770
1771         if (sock_flag(other, SOCK_DEAD) ||
1772             other->sk_shutdown & RCV_SHUTDOWN) {
1773                 err = -EPIPE;
1774                 goto err_state_unlock;
1775         }
1776
1777         skb = skb_peek_tail(&other->sk_receive_queue);
1778         if (tail && tail == skb) {
1779                 skb = newskb;
1780         } else if (!skb) {
1781                 if (newskb)
1782                         skb = newskb;
1783                 else
1784                         goto alloc_skb;
1785         } else if (newskb) {
1786                 /* this is fast path, we don't necessarily need to
1787                  * call to kfree_skb even though with newskb == NULL
1788                  * this - does no harm
1789                  */
1790                 consume_skb(newskb);
1791         }
1792
1793         if (skb_append_pagefrags(skb, page, offset, size)) {
1794                 tail = skb;
1795                 goto alloc_skb;
1796         }
1797
1798         skb->len += size;
1799         skb->data_len += size;
1800         skb->truesize += size;
1801         atomic_add(size, &sk->sk_wmem_alloc);
1802
1803         if (newskb)
1804                 __skb_queue_tail(&other->sk_receive_queue, newskb);
1805
1806         unix_state_unlock(other);
1807         mutex_unlock(&unix_sk(other)->readlock);
1808
1809         other->sk_data_ready(other);
1810
1811         return size;
1812
1813 err_state_unlock:
1814         unix_state_unlock(other);
1815 err_unlock:
1816         mutex_unlock(&unix_sk(other)->readlock);
1817 err:
1818         kfree_skb(newskb);
1819         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
1820                 send_sig(SIGPIPE, current, 0);
1821         return err;
1822 }
1823
1824 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1825                                   size_t len)
1826 {
1827         int err;
1828         struct sock *sk = sock->sk;
1829
1830         err = sock_error(sk);
1831         if (err)
1832                 return err;
1833
1834         if (sk->sk_state != TCP_ESTABLISHED)
1835                 return -ENOTCONN;
1836
1837         if (msg->msg_namelen)
1838                 msg->msg_namelen = 0;
1839
1840         return unix_dgram_sendmsg(sock, msg, len);
1841 }
1842
1843 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1844                                   size_t size, int flags)
1845 {
1846         struct sock *sk = sock->sk;
1847
1848         if (sk->sk_state != TCP_ESTABLISHED)
1849                 return -ENOTCONN;
1850
1851         return unix_dgram_recvmsg(sock, msg, size, flags);
1852 }
1853
1854 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1855 {
1856         struct unix_sock *u = unix_sk(sk);
1857
1858         if (u->addr) {
1859                 msg->msg_namelen = u->addr->len;
1860                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1861         }
1862 }
1863
1864 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1865                               size_t size, int flags)
1866 {
1867         struct scm_cookie scm;
1868         struct sock *sk = sock->sk;
1869         struct unix_sock *u = unix_sk(sk);
1870         int noblock = flags & MSG_DONTWAIT;
1871         struct sk_buff *skb;
1872         int err;
1873         int peeked, skip;
1874
1875         err = -EOPNOTSUPP;
1876         if (flags&MSG_OOB)
1877                 goto out;
1878
1879         err = mutex_lock_interruptible(&u->readlock);
1880         if (unlikely(err)) {
1881                 /* recvmsg() in non blocking mode is supposed to return -EAGAIN
1882                  * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1883                  */
1884                 err = noblock ? -EAGAIN : -ERESTARTSYS;
1885                 goto out;
1886         }
1887
1888         skip = sk_peek_offset(sk, flags);
1889
1890         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1891         if (!skb) {
1892                 unix_state_lock(sk);
1893                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1894                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1895                     (sk->sk_shutdown & RCV_SHUTDOWN))
1896                         err = 0;
1897                 unix_state_unlock(sk);
1898                 goto out_unlock;
1899         }
1900
1901         wake_up_interruptible_sync_poll(&u->peer_wait,
1902                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1903
1904         if (msg->msg_name)
1905                 unix_copy_addr(msg, skb->sk);
1906
1907         if (size > skb->len - skip)
1908                 size = skb->len - skip;
1909         else if (size < skb->len - skip)
1910                 msg->msg_flags |= MSG_TRUNC;
1911
1912         err = skb_copy_datagram_msg(skb, skip, msg, size);
1913         if (err)
1914                 goto out_free;
1915
1916         if (sock_flag(sk, SOCK_RCVTSTAMP))
1917                 __sock_recv_timestamp(msg, sk, skb);
1918
1919         memset(&scm, 0, sizeof(scm));
1920
1921         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1922         unix_set_secdata(&scm, skb);
1923
1924         if (!(flags & MSG_PEEK)) {
1925                 if (UNIXCB(skb).fp)
1926                         unix_detach_fds(&scm, skb);
1927
1928                 sk_peek_offset_bwd(sk, skb->len);
1929         } else {
1930                 /* It is questionable: on PEEK we could:
1931                    - do not return fds - good, but too simple 8)
1932                    - return fds, and do not return them on read (old strategy,
1933                      apparently wrong)
1934                    - clone fds (I chose it for now, it is the most universal
1935                      solution)
1936
1937                    POSIX 1003.1g does not actually define this clearly
1938                    at all. POSIX 1003.1g doesn't define a lot of things
1939                    clearly however!
1940
1941                 */
1942
1943                 sk_peek_offset_fwd(sk, size);
1944
1945                 if (UNIXCB(skb).fp)
1946                         scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1947         }
1948         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1949
1950         scm_recv(sock, msg, &scm, flags);
1951
1952 out_free:
1953         skb_free_datagram(sk, skb);
1954 out_unlock:
1955         mutex_unlock(&u->readlock);
1956 out:
1957         return err;
1958 }
1959
1960 /*
1961  *      Sleep until more data has arrived. But check for races..
1962  */
1963 static long unix_stream_data_wait(struct sock *sk, long timeo,
1964                                   struct sk_buff *last, unsigned int last_len)
1965 {
1966         struct sk_buff *tail;
1967         DEFINE_WAIT(wait);
1968
1969         unix_state_lock(sk);
1970
1971         for (;;) {
1972                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1973
1974                 tail = skb_peek_tail(&sk->sk_receive_queue);
1975                 if (tail != last ||
1976                     (tail && tail->len != last_len) ||
1977                     sk->sk_err ||
1978                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1979                     signal_pending(current) ||
1980                     !timeo)
1981                         break;
1982
1983                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1984                 unix_state_unlock(sk);
1985                 timeo = freezable_schedule_timeout(timeo);
1986                 unix_state_lock(sk);
1987                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1988         }
1989
1990         finish_wait(sk_sleep(sk), &wait);
1991         unix_state_unlock(sk);
1992         return timeo;
1993 }
1994
1995 static unsigned int unix_skb_len(const struct sk_buff *skb)
1996 {
1997         return skb->len - UNIXCB(skb).consumed;
1998 }
1999
2000 struct unix_stream_read_state {
2001         int (*recv_actor)(struct sk_buff *, int, int,
2002                           struct unix_stream_read_state *);
2003         struct socket *socket;
2004         struct msghdr *msg;
2005         struct pipe_inode_info *pipe;
2006         size_t size;
2007         int flags;
2008         unsigned int splice_flags;
2009 };
2010
2011 static int unix_stream_read_generic(struct unix_stream_read_state *state)
2012 {
2013         struct scm_cookie scm;
2014         struct socket *sock = state->socket;
2015         struct sock *sk = sock->sk;
2016         struct unix_sock *u = unix_sk(sk);
2017         int copied = 0;
2018         int flags = state->flags;
2019         int noblock = flags & MSG_DONTWAIT;
2020         bool check_creds = false;
2021         int target;
2022         int err = 0;
2023         long timeo;
2024         int skip;
2025         size_t size = state->size;
2026         unsigned int last_len;
2027
2028         err = -EINVAL;
2029         if (sk->sk_state != TCP_ESTABLISHED)
2030                 goto out;
2031
2032         err = -EOPNOTSUPP;
2033         if (flags & MSG_OOB)
2034                 goto out;
2035
2036         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2037         timeo = sock_rcvtimeo(sk, noblock);
2038
2039         memset(&scm, 0, sizeof(scm));
2040
2041         /* Lock the socket to prevent queue disordering
2042          * while sleeps in memcpy_tomsg
2043          */
2044         err = mutex_lock_interruptible(&u->readlock);
2045         if (unlikely(err)) {
2046                 /* recvmsg() in non blocking mode is supposed to return -EAGAIN
2047                  * sk_rcvtimeo is not honored by mutex_lock_interruptible()
2048                  */
2049                 err = noblock ? -EAGAIN : -ERESTARTSYS;
2050                 goto out;
2051         }
2052
2053         do {
2054                 int chunk;
2055                 struct sk_buff *skb, *last;
2056
2057                 unix_state_lock(sk);
2058                 last = skb = skb_peek(&sk->sk_receive_queue);
2059                 last_len = last ? last->len : 0;
2060 again:
2061                 if (skb == NULL) {
2062                         unix_sk(sk)->recursion_level = 0;
2063                         if (copied >= target)
2064                                 goto unlock;
2065
2066                         /*
2067                          *      POSIX 1003.1g mandates this order.
2068                          */
2069
2070                         err = sock_error(sk);
2071                         if (err)
2072                                 goto unlock;
2073                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2074                                 goto unlock;
2075
2076                         unix_state_unlock(sk);
2077                         err = -EAGAIN;
2078                         if (!timeo)
2079                                 break;
2080                         mutex_unlock(&u->readlock);
2081
2082                         timeo = unix_stream_data_wait(sk, timeo, last,
2083                                                       last_len);
2084
2085                         if (signal_pending(current) ||
2086                             mutex_lock_interruptible(&u->readlock)) {
2087                                 err = sock_intr_errno(timeo);
2088                                 goto out;
2089                         }
2090
2091                         continue;
2092 unlock:
2093                         unix_state_unlock(sk);
2094                         break;
2095                 }
2096
2097                 skip = sk_peek_offset(sk, flags);
2098                 while (skip >= unix_skb_len(skb)) {
2099                         skip -= unix_skb_len(skb);
2100                         last = skb;
2101                         last_len = skb->len;
2102                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2103                         if (!skb)
2104                                 goto again;
2105                 }
2106
2107                 unix_state_unlock(sk);
2108
2109                 if (check_creds) {
2110                         /* Never glue messages from different writers */
2111                         if ((UNIXCB(skb).pid  != scm.pid) ||
2112                             !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2113                             !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
2114                                 break;
2115                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2116                         /* Copy credentials */
2117                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2118                         check_creds = true;
2119                 }
2120
2121                 /* Copy address just once */
2122                 if (state->msg && state->msg->msg_name) {
2123                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2124                                          state->msg->msg_name);
2125                         unix_copy_addr(state->msg, skb->sk);
2126                         sunaddr = NULL;
2127                 }
2128
2129                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2130                 chunk = state->recv_actor(skb, skip, chunk, state);
2131                 if (chunk < 0) {
2132                         if (copied == 0)
2133                                 copied = -EFAULT;
2134                         break;
2135                 }
2136                 copied += chunk;
2137                 size -= chunk;
2138
2139                 /* Mark read part of skb as used */
2140                 if (!(flags & MSG_PEEK)) {
2141                         UNIXCB(skb).consumed += chunk;
2142
2143                         sk_peek_offset_bwd(sk, chunk);
2144
2145                         if (UNIXCB(skb).fp)
2146                                 unix_detach_fds(&scm, skb);
2147
2148                         if (unix_skb_len(skb))
2149                                 break;
2150
2151                         skb_unlink(skb, &sk->sk_receive_queue);
2152                         consume_skb(skb);
2153
2154                         if (scm.fp)
2155                                 break;
2156                 } else {
2157                         /* It is questionable, see note in unix_dgram_recvmsg.
2158                          */
2159                         if (UNIXCB(skb).fp)
2160                                 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2161
2162                         sk_peek_offset_fwd(sk, chunk);
2163
2164                         break;
2165                 }
2166         } while (size);
2167
2168         mutex_unlock(&u->readlock);
2169         if (state->msg)
2170                 scm_recv(sock, state->msg, &scm, flags);
2171         else
2172                 scm_destroy(&scm);
2173 out:
2174         return copied ? : err;
2175 }
2176
2177 static int unix_stream_read_actor(struct sk_buff *skb,
2178                                   int skip, int chunk,
2179                                   struct unix_stream_read_state *state)
2180 {
2181         int ret;
2182
2183         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2184                                     state->msg, chunk);
2185         return ret ?: chunk;
2186 }
2187
2188 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2189                                size_t size, int flags)
2190 {
2191         struct unix_stream_read_state state = {
2192                 .recv_actor = unix_stream_read_actor,
2193                 .socket = sock,
2194                 .msg = msg,
2195                 .size = size,
2196                 .flags = flags
2197         };
2198
2199         return unix_stream_read_generic(&state);
2200 }
2201
2202 static ssize_t skb_unix_socket_splice(struct sock *sk,
2203                                       struct pipe_inode_info *pipe,
2204                                       struct splice_pipe_desc *spd)
2205 {
2206         int ret;
2207         struct unix_sock *u = unix_sk(sk);
2208
2209         mutex_unlock(&u->readlock);
2210         ret = splice_to_pipe(pipe, spd);
2211         mutex_lock(&u->readlock);
2212
2213         return ret;
2214 }
2215
2216 static int unix_stream_splice_actor(struct sk_buff *skb,
2217                                     int skip, int chunk,
2218                                     struct unix_stream_read_state *state)
2219 {
2220         return skb_splice_bits(skb, state->socket->sk,
2221                                UNIXCB(skb).consumed + skip,
2222                                state->pipe, chunk, state->splice_flags,
2223                                skb_unix_socket_splice);
2224 }
2225
2226 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2227                                        struct pipe_inode_info *pipe,
2228                                        size_t size, unsigned int flags)
2229 {
2230         struct unix_stream_read_state state = {
2231                 .recv_actor = unix_stream_splice_actor,
2232                 .socket = sock,
2233                 .pipe = pipe,
2234                 .size = size,
2235                 .splice_flags = flags,
2236         };
2237
2238         if (unlikely(*ppos))
2239                 return -ESPIPE;
2240
2241         if (sock->file->f_flags & O_NONBLOCK ||
2242             flags & SPLICE_F_NONBLOCK)
2243                 state.flags = MSG_DONTWAIT;
2244
2245         return unix_stream_read_generic(&state);
2246 }
2247
2248 static int unix_shutdown(struct socket *sock, int mode)
2249 {
2250         struct sock *sk = sock->sk;
2251         struct sock *other;
2252
2253         if (mode < SHUT_RD || mode > SHUT_RDWR)
2254                 return -EINVAL;
2255         /* This maps:
2256          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2257          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2258          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2259          */
2260         ++mode;
2261
2262         unix_state_lock(sk);
2263         sk->sk_shutdown |= mode;
2264         other = unix_peer(sk);
2265         if (other)
2266                 sock_hold(other);
2267         unix_state_unlock(sk);
2268         sk->sk_state_change(sk);
2269
2270         if (other &&
2271                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2272
2273                 int peer_mode = 0;
2274
2275                 if (mode&RCV_SHUTDOWN)
2276                         peer_mode |= SEND_SHUTDOWN;
2277                 if (mode&SEND_SHUTDOWN)
2278                         peer_mode |= RCV_SHUTDOWN;
2279                 unix_state_lock(other);
2280                 other->sk_shutdown |= peer_mode;
2281                 unix_state_unlock(other);
2282                 other->sk_state_change(other);
2283                 if (peer_mode == SHUTDOWN_MASK)
2284                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2285                 else if (peer_mode & RCV_SHUTDOWN)
2286                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2287         }
2288         if (other)
2289                 sock_put(other);
2290
2291         return 0;
2292 }
2293
2294 long unix_inq_len(struct sock *sk)
2295 {
2296         struct sk_buff *skb;
2297         long amount = 0;
2298
2299         if (sk->sk_state == TCP_LISTEN)
2300                 return -EINVAL;
2301
2302         spin_lock(&sk->sk_receive_queue.lock);
2303         if (sk->sk_type == SOCK_STREAM ||
2304             sk->sk_type == SOCK_SEQPACKET) {
2305                 skb_queue_walk(&sk->sk_receive_queue, skb)
2306                         amount += unix_skb_len(skb);
2307         } else {
2308                 skb = skb_peek(&sk->sk_receive_queue);
2309                 if (skb)
2310                         amount = skb->len;
2311         }
2312         spin_unlock(&sk->sk_receive_queue.lock);
2313
2314         return amount;
2315 }
2316 EXPORT_SYMBOL_GPL(unix_inq_len);
2317
2318 long unix_outq_len(struct sock *sk)
2319 {
2320         return sk_wmem_alloc_get(sk);
2321 }
2322 EXPORT_SYMBOL_GPL(unix_outq_len);
2323
2324 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2325 {
2326         struct sock *sk = sock->sk;
2327         long amount = 0;
2328         int err;
2329
2330         switch (cmd) {
2331         case SIOCOUTQ:
2332                 amount = unix_outq_len(sk);
2333                 err = put_user(amount, (int __user *)arg);
2334                 break;
2335         case SIOCINQ:
2336                 amount = unix_inq_len(sk);
2337                 if (amount < 0)
2338                         err = amount;
2339                 else
2340                         err = put_user(amount, (int __user *)arg);
2341                 break;
2342         default:
2343                 err = -ENOIOCTLCMD;
2344                 break;
2345         }
2346         return err;
2347 }
2348
2349 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2350 {
2351         struct sock *sk = sock->sk;
2352         unsigned int mask;
2353
2354         sock_poll_wait(file, sk_sleep(sk), wait);
2355         mask = 0;
2356
2357         /* exceptional events? */
2358         if (sk->sk_err)
2359                 mask |= POLLERR;
2360         if (sk->sk_shutdown == SHUTDOWN_MASK)
2361                 mask |= POLLHUP;
2362         if (sk->sk_shutdown & RCV_SHUTDOWN)
2363                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2364
2365         /* readable? */
2366         if (!skb_queue_empty(&sk->sk_receive_queue))
2367                 mask |= POLLIN | POLLRDNORM;
2368
2369         /* Connection-based need to check for termination and startup */
2370         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2371             sk->sk_state == TCP_CLOSE)
2372                 mask |= POLLHUP;
2373
2374         /*
2375          * we set writable also when the other side has shut down the
2376          * connection. This prevents stuck sockets.
2377          */
2378         if (unix_writable(sk))
2379                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2380
2381         return mask;
2382 }
2383
2384 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2385                                     poll_table *wait)
2386 {
2387         struct sock *sk = sock->sk, *other;
2388         unsigned int mask, writable;
2389
2390         sock_poll_wait(file, sk_sleep(sk), wait);
2391         mask = 0;
2392
2393         /* exceptional events? */
2394         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2395                 mask |= POLLERR |
2396                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2397
2398         if (sk->sk_shutdown & RCV_SHUTDOWN)
2399                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2400         if (sk->sk_shutdown == SHUTDOWN_MASK)
2401                 mask |= POLLHUP;
2402
2403         /* readable? */
2404         if (!skb_queue_empty(&sk->sk_receive_queue))
2405                 mask |= POLLIN | POLLRDNORM;
2406
2407         /* Connection-based need to check for termination and startup */
2408         if (sk->sk_type == SOCK_SEQPACKET) {
2409                 if (sk->sk_state == TCP_CLOSE)
2410                         mask |= POLLHUP;
2411                 /* connection hasn't started yet? */
2412                 if (sk->sk_state == TCP_SYN_SENT)
2413                         return mask;
2414         }
2415
2416         /* No write status requested, avoid expensive OUT tests. */
2417         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2418                 return mask;
2419
2420         writable = unix_writable(sk);
2421         other = unix_peer_get(sk);
2422         if (other) {
2423                 if (unix_peer(other) != sk) {
2424                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2425                         if (unix_recvq_full(other))
2426                                 writable = 0;
2427                 }
2428                 sock_put(other);
2429         }
2430
2431         if (writable)
2432                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2433         else
2434                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2435
2436         return mask;
2437 }
2438
2439 #ifdef CONFIG_PROC_FS
2440
2441 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2442
2443 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2444 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2445 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2446
2447 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2448 {
2449         unsigned long offset = get_offset(*pos);
2450         unsigned long bucket = get_bucket(*pos);
2451         struct sock *sk;
2452         unsigned long count = 0;
2453
2454         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2455                 if (sock_net(sk) != seq_file_net(seq))
2456                         continue;
2457                 if (++count == offset)
2458                         break;
2459         }
2460
2461         return sk;
2462 }
2463
2464 static struct sock *unix_next_socket(struct seq_file *seq,
2465                                      struct sock *sk,
2466                                      loff_t *pos)
2467 {
2468         unsigned long bucket;
2469
2470         while (sk > (struct sock *)SEQ_START_TOKEN) {
2471                 sk = sk_next(sk);
2472                 if (!sk)
2473                         goto next_bucket;
2474                 if (sock_net(sk) == seq_file_net(seq))
2475                         return sk;
2476         }
2477
2478         do {
2479                 sk = unix_from_bucket(seq, pos);
2480                 if (sk)
2481                         return sk;
2482
2483 next_bucket:
2484                 bucket = get_bucket(*pos) + 1;
2485                 *pos = set_bucket_offset(bucket, 1);
2486         } while (bucket < ARRAY_SIZE(unix_socket_table));
2487
2488         return NULL;
2489 }
2490
2491 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2492         __acquires(unix_table_lock)
2493 {
2494         spin_lock(&unix_table_lock);
2495
2496         if (!*pos)
2497                 return SEQ_START_TOKEN;
2498
2499         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2500                 return NULL;
2501
2502         return unix_next_socket(seq, NULL, pos);
2503 }
2504
2505 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2506 {
2507         ++*pos;
2508         return unix_next_socket(seq, v, pos);
2509 }
2510
2511 static void unix_seq_stop(struct seq_file *seq, void *v)
2512         __releases(unix_table_lock)
2513 {
2514         spin_unlock(&unix_table_lock);
2515 }
2516
2517 static int unix_seq_show(struct seq_file *seq, void *v)
2518 {
2519
2520         if (v == SEQ_START_TOKEN)
2521                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2522                          "Inode Path\n");
2523         else {
2524                 struct sock *s = v;
2525                 struct unix_sock *u = unix_sk(s);
2526                 unix_state_lock(s);
2527
2528                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2529                         s,
2530                         atomic_read(&s->sk_refcnt),
2531                         0,
2532                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2533                         s->sk_type,
2534                         s->sk_socket ?
2535                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2536                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2537                         sock_i_ino(s));
2538
2539                 if (u->addr) {
2540                         int i, len;
2541                         seq_putc(seq, ' ');
2542
2543                         i = 0;
2544                         len = u->addr->len - sizeof(short);
2545                         if (!UNIX_ABSTRACT(s))
2546                                 len--;
2547                         else {
2548                                 seq_putc(seq, '@');
2549                                 i++;
2550                         }
2551                         for ( ; i < len; i++)
2552                                 seq_putc(seq, u->addr->name->sun_path[i]);
2553                 }
2554                 unix_state_unlock(s);
2555                 seq_putc(seq, '\n');
2556         }
2557
2558         return 0;
2559 }
2560
2561 static const struct seq_operations unix_seq_ops = {
2562         .start  = unix_seq_start,
2563         .next   = unix_seq_next,
2564         .stop   = unix_seq_stop,
2565         .show   = unix_seq_show,
2566 };
2567
2568 static int unix_seq_open(struct inode *inode, struct file *file)
2569 {
2570         return seq_open_net(inode, file, &unix_seq_ops,
2571                             sizeof(struct seq_net_private));
2572 }
2573
2574 static const struct file_operations unix_seq_fops = {
2575         .owner          = THIS_MODULE,
2576         .open           = unix_seq_open,
2577         .read           = seq_read,
2578         .llseek         = seq_lseek,
2579         .release        = seq_release_net,
2580 };
2581
2582 #endif
2583
2584 static const struct net_proto_family unix_family_ops = {
2585         .family = PF_UNIX,
2586         .create = unix_create,
2587         .owner  = THIS_MODULE,
2588 };
2589
2590
2591 static int __net_init unix_net_init(struct net *net)
2592 {
2593         int error = -ENOMEM;
2594
2595         net->unx.sysctl_max_dgram_qlen = 10;
2596         if (unix_sysctl_register(net))
2597                 goto out;
2598
2599 #ifdef CONFIG_PROC_FS
2600         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2601                 unix_sysctl_unregister(net);
2602                 goto out;
2603         }
2604 #endif
2605         error = 0;
2606 out:
2607         return error;
2608 }
2609
2610 static void __net_exit unix_net_exit(struct net *net)
2611 {
2612         unix_sysctl_unregister(net);
2613         remove_proc_entry("unix", net->proc_net);
2614 }
2615
2616 static struct pernet_operations unix_net_ops = {
2617         .init = unix_net_init,
2618         .exit = unix_net_exit,
2619 };
2620
2621 static int __init af_unix_init(void)
2622 {
2623         int rc = -1;
2624
2625         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2626
2627         rc = proto_register(&unix_proto, 1);
2628         if (rc != 0) {
2629                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2630                 goto out;
2631         }
2632
2633         sock_register(&unix_family_ops);
2634         register_pernet_subsys(&unix_net_ops);
2635 out:
2636         return rc;
2637 }
2638
2639 static void __exit af_unix_exit(void)
2640 {
2641         sock_unregister(PF_UNIX);
2642         proto_unregister(&unix_proto);
2643         unregister_pernet_subsys(&unix_net_ops);
2644 }
2645
2646 /* Earlier than device_initcall() so that other drivers invoking
2647    request_module() don't end up in a loop when modprobe tries
2648    to use a UNIX socket. But later than subsys_initcall() because
2649    we depend on stuff initialised there */
2650 fs_initcall(af_unix_init);
2651 module_exit(af_unix_exit);
2652
2653 MODULE_LICENSE("GPL");
2654 MODULE_ALIAS_NETPROTO(PF_UNIX);