Merge tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 10 Jun 2014 22:02:42 +0000 (15:02 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 10 Jun 2014 22:02:42 +0000 (15:02 -0700)
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

   - massive cleanup of the NFS read/write code by Anna and Dros
   - support multiple NFS read/write requests per page in order to deal
     with non-page aligned pNFS striping.  Also cleans up the r/wsize <
     page size code nicely.
   - stable fix for ensuring inode is declared uptodate only after all
     the attributes have been checked.
   - stable fix for a kernel Oops when remounting
   - NFS over RDMA client fixes
   - move the pNFS files layout driver into its own subdirectory"

* tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits)
  NFS: populate ->net in mount data when remounting
  pnfs: fix lockup caused by pnfs_generic_pg_test
  NFSv4.1: Fix typo in dprintk
  NFSv4.1: Comment is now wrong and redundant to code
  NFS: Use raw_write_seqcount_begin/end int nfs4_reclaim_open_state
  xprtrdma: Disconnect on registration failure
  xprtrdma: Remove BUG_ON() call sites
  xprtrdma: Avoid deadlock when credit window is reset
  SUNRPC: Move congestion window constants to header file
  xprtrdma: Reset connection timeout after successful reconnect
  xprtrdma: Use macros for reconnection timeout constants
  xprtrdma: Allocate missing pagelist
  xprtrdma: Remove Tavor MTU setting
  xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
  xprtrdma: Reduce the number of hardway buffer allocations
  xprtrdma: Limit work done by completion handler
  xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
  xprtrmda: Reduce lock contention in completion handlers
  xprtrdma: Split the completion queue
  xprtrdma: Make rpcrdma_ep_destroy() return void
  ...

1  2 
fs/nfs/filelayout/filelayoutdev.c
fs/nfs/inode.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/write.c
include/linux/sunrpc/xprt.h
net/sunrpc/xprt.c

index 0000000000000000000000000000000000000000,7c85390a46479daeee2f86d350149e52ab8a69d6..44bf0140a4c77e2d787f8ee154aa0436f6d5d615
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,843 +1,843 @@@
 -      smp_mb__before_clear_bit();
+ /*
+  *  Device operations for the pnfs nfs4 file layout driver.
+  *
+  *  Copyright (c) 2002
+  *  The Regents of the University of Michigan
+  *  All Rights Reserved
+  *
+  *  Dean Hildebrand <dhildebz@umich.edu>
+  *  Garth Goodson   <Garth.Goodson@netapp.com>
+  *
+  *  Permission is granted to use, copy, create derivative works, and
+  *  redistribute this software and such derivative works for any purpose,
+  *  so long as the name of the University of Michigan is not used in
+  *  any advertising or publicity pertaining to the use or distribution
+  *  of this software without specific, written prior authorization. If
+  *  the above copyright notice or any other identification of the
+  *  University of Michigan is included in any copy of any portion of
+  *  this software, then the disclaimer below must also be included.
+  *
+  *  This software is provided as is, without representation or warranty
+  *  of any kind either express or implied, including without limitation
+  *  the implied warranties of merchantability, fitness for a particular
+  *  purpose, or noninfringement.  The Regents of the University of
+  *  Michigan shall not be liable for any damages, including special,
+  *  indirect, incidental, or consequential damages, with respect to any
+  *  claim arising out of or in connection with the use of the software,
+  *  even if it has been or is hereafter advised of the possibility of
+  *  such damages.
+  */
+ #include <linux/nfs_fs.h>
+ #include <linux/vmalloc.h>
+ #include <linux/module.h>
+ #include <linux/sunrpc/addr.h>
+ #include "../internal.h"
+ #include "../nfs4session.h"
+ #include "filelayout.h"
+ #define NFSDBG_FACILITY               NFSDBG_PNFS_LD
+ /*
+  * Data server connection tunables.  Both are exported as module parameters
+  * at the end of this file and are handed to nfs4_set_ds_client() by
+  * nfs4_ds_connect().
+  */
+ static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+ static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+ /*
+  * Data server cache
+  *
+  * Data servers can be mapped to different device ids.
+  * nfs4_pnfs_ds reference counting
+  *   - set to 1 on allocation
+  *   - incremented when a device id maps a data server already in the cache.
+  *   - decremented when deviceid is removed from the cache.
+  */
+ /* nfs4_ds_cache_lock is taken around every lookup/insert/removal below */
+ static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+ /* list of nfs4_pnfs_ds, linked through ds_node */
+ static LIST_HEAD(nfs4_data_server_cache);
+ /*
+  * Debug routines
+  *
+  * print_ds - dump a data server's address string, reference count,
+  * client pointer and (when a client is attached) its exchange flags.
+  * A NULL @ds is reported and otherwise ignored.
+  */
+ void
+ print_ds(struct nfs4_pnfs_ds *ds)
+ {
+       struct nfs_client *clp;
+       unsigned int exchange_flags = 0;
+       if (!ds) {
+               printk("%s NULL device\n", __func__);
+               return;
+       }
+       clp = ds->ds_clp;
+       if (clp)
+               exchange_flags = clp->cl_exchange_flags;
+       printk("        ds %s\n"
+               "        ref count %d\n"
+               "        client %p\n"
+               "        cl_exchange_flags %x\n",
+               ds->ds_remotestr,
+               atomic_read(&ds->ds_count),
+               clp,
+               exchange_flags);
+ }
+ /*
+  * Compare two socket addresses.
+  *
+  * Only AF_INET and AF_INET6 are understood; address and port must both
+  * match.  When @addr1 is an IPv6 link-local address the scope ids must
+  * match as well.  Differing or unhandled address families never match.
+  */
+ static bool
+ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
+ {
+       if (addr1->sa_family != addr2->sa_family)
+               return false;
+       switch (addr1->sa_family) {
+       case AF_INET: {
+               struct sockaddr_in *lhs = (struct sockaddr_in *)addr1;
+               struct sockaddr_in *rhs = (struct sockaddr_in *)addr2;
+               return lhs->sin_addr.s_addr == rhs->sin_addr.s_addr &&
+                      lhs->sin_port == rhs->sin_port;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *lhs6 = (struct sockaddr_in6 *)addr1;
+               struct sockaddr_in6 *rhs6 = (struct sockaddr_in6 *)addr2;
+               /* LINKLOCAL addresses must have matching scope_id */
+               if (ipv6_addr_src_scope(&lhs6->sin6_addr) ==
+                   IPV6_ADDR_SCOPE_LINKLOCAL &&
+                   lhs6->sin6_scope_id != rhs6->sin6_scope_id)
+                       return false;
+               return ipv6_addr_equal(&lhs6->sin6_addr, &rhs6->sin6_addr) &&
+                      lhs6->sin6_port == rhs6->sin6_port;
+       }
+       default:
+               dprintk("%s: unhandled address family: %u\n",
+                       __func__, addr1->sa_family);
+               return false;
+       }
+ }
+ /*
+  * Compare two data server address lists entry by entry, in order.
+  *
+  * Returns true only when both lists have the same number of entries and
+  * every pair of corresponding entries satisfies same_sockaddr().
+  *
+  * The lists are circular, so list_entry() never yields NULL: the walk
+  * must stop when a cursor comes back round to its own list head.
+  * Testing the entries against NULL would run past the head and treat it
+  * as an nfs4_pnfs_ds_addr.
+  */
+ static bool
+ _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+                              const struct list_head *dsaddrs2)
+ {
+       const struct list_head *pos1 = dsaddrs1->next;
+       const struct list_head *pos2 = dsaddrs2->next;
+       /* step through both lists, comparing as we go */
+       while (pos1 != dsaddrs1 && pos2 != dsaddrs2) {
+               struct nfs4_pnfs_ds_addr *da1, *da2;
+               da1 = list_entry(pos1, struct nfs4_pnfs_ds_addr, da_node);
+               da2 = list_entry(pos2, struct nfs4_pnfs_ds_addr, da_node);
+               if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
+                                  (struct sockaddr *)&da2->da_addr))
+                       return false;
+               pos1 = pos1->next;
+               pos2 = pos2->next;
+       }
+       /* equal only if both lists ran out at the same time */
+       return pos1 == dsaddrs1 && pos2 == dsaddrs2;
+ }
+ /*
+  * Search the data server cache for an entry whose address list matches
+  * @dsaddrs.  Must be called with nfs4_ds_cache_lock held.
+  *
+  * Returns the cached data server (no reference is taken here) or NULL.
+  */
+ static struct nfs4_pnfs_ds *
+ _data_server_lookup_locked(const struct list_head *dsaddrs)
+ {
+       struct nfs4_pnfs_ds *cached;
+       struct nfs4_pnfs_ds *found = NULL;
+       list_for_each_entry(cached, &nfs4_data_server_cache, ds_node) {
+               if (_same_data_server_addrs_locked(&cached->ds_addrs,
+                                                  dsaddrs)) {
+                       found = cached;
+                       break;
+               }
+       }
+       return found;
+ }
+ /*
+  * Create an rpc connection to the nfs4_pnfs_ds data server
+  * Currently only supports IPv4 and IPv6 addresses
+  *
+  * Each address of @ds is tried in list order until nfs4_set_ds_client()
+  * succeeds; a session is then initialised on the resulting client and the
+  * client is published in ds->ds_clp.
+  *
+  * Returns 0 on success or a negative errno (-EIO if @ds has no addresses).
+  */
+ static int
+ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
+ {
+       struct nfs_client *mds_clp = mds_srv->nfs_client;
+       struct nfs_client *clp = ERR_PTR(-EIO);
+       struct nfs4_pnfs_ds_addr *da;
+       int status;
+       dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
+               mds_clp->cl_rpcclient->cl_auth->au_flavor);
+       list_for_each_entry(da, &ds->ds_addrs, da_node) {
+               dprintk("%s: DS %s: trying address %s\n",
+                       __func__, ds->ds_remotestr, da->da_remotestr);
+               clp = nfs4_set_ds_client(mds_clp,
+                                       (struct sockaddr *)&da->da_addr,
+                                       da->da_addrlen, IPPROTO_TCP,
+                                       dataserver_timeo, dataserver_retrans);
+               if (!IS_ERR(clp))
+                       break;
+       }
+       if (IS_ERR(clp))
+               return PTR_ERR(clp);
+       status = nfs4_init_ds_session(clp, mds_clp->cl_lease_time);
+       if (status) {
+               /* drop the client reference obtained above */
+               nfs_put_client(clp);
+               return status;
+       }
+       /* order the client setup before the pointer becomes visible */
+       smp_wmb();
+       ds->ds_clp = clp;
+       dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+       return 0;
+ }
+ /*
+  * Release everything owned by @ds: the client reference (if any), every
+  * address on ds_addrs together with its remote string, the data server's
+  * own remote string and finally @ds itself.
+  */
+ static void
+ destroy_ds(struct nfs4_pnfs_ds *ds)
+ {
+       struct nfs4_pnfs_ds_addr *addr, *next;
+       dprintk("--> %s\n", __func__);
+       ifdebug(FACILITY)
+               print_ds(ds);
+       if (ds->ds_clp)
+               nfs_put_client(ds->ds_clp);
+       list_for_each_entry_safe(addr, next, &ds->ds_addrs, da_node) {
+               list_del_init(&addr->da_node);
+               kfree(addr->da_remotestr);
+               kfree(addr);
+       }
+       kfree(ds->ds_remotestr);
+       kfree(ds);
+ }
+ /*
+  * Free a file layout device: drop one reference on every data server it
+  * maps, destroying those whose count reaches zero (after unhashing them
+  * from the cache under nfs4_ds_cache_lock), then free the stripe index
+  * table and the device itself.
+  */
+ void
+ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+ {
+       int idx;
+       nfs4_print_deviceid(&dsaddr->id_node.deviceid);
+       for (idx = 0; idx < dsaddr->ds_num; idx++) {
+               struct nfs4_pnfs_ds *ds = dsaddr->ds_list[idx];
+               if (ds == NULL)
+                       continue;
+               /* takes the lock only when the count drops to zero */
+               if (!atomic_dec_and_lock(&ds->ds_count, &nfs4_ds_cache_lock))
+                       continue;
+               list_del_init(&ds->ds_node);
+               spin_unlock(&nfs4_ds_cache_lock);
+               destroy_ds(ds);
+       }
+       kfree(dsaddr->stripe_indices);
+       kfree(dsaddr);
+ }
+ /*
+  * Create a string with a human readable address and port to avoid
+  * complicated setup around many dprintks.
+  *
+  * The result has the form "{addr1,addr2,}" and is built from the
+  * da_remotestr of every entry on @dsaddrs.  decode_ds_addr() deliberately
+  * tolerates a failed da_remotestr allocation, so entries whose string is
+  * NULL are skipped here instead of being passed to strlen().
+  *
+  * Returns a kzalloc'ed string the caller must kfree(), or NULL on
+  * allocation failure.
+  */
+ static char *
+ nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
+ {
+       struct nfs4_pnfs_ds_addr *da;
+       char *remotestr;
+       size_t len;
+       char *p;
+       len = 3;        /* '{', '}' and eol */
+       list_for_each_entry(da, dsaddrs, da_node) {
+               if (!da->da_remotestr)
+                       continue;
+               len += strlen(da->da_remotestr) + 1;    /* string plus comma */
+       }
+       remotestr = kzalloc(len, gfp_flags);
+       if (!remotestr)
+               return NULL;
+       p = remotestr;
+       *(p++) = '{';
+       len--;
+       list_for_each_entry(da, dsaddrs, da_node) {
+               size_t ll;
+               if (!da->da_remotestr)
+                       continue;
+               ll = strlen(da->da_remotestr);
+               /* defensive: never write past the space counted above */
+               if (ll > len)
+                       goto out_err;
+               memcpy(p, da->da_remotestr, ll);
+               p += ll;
+               len -= ll;
+               if (len < 1)
+                       goto out_err;
+               (*p++) = ',';
+               len--;
+       }
+       /* room for the closing brace and the terminator */
+       if (len < 2)
+               goto out_err;
+       *(p++) = '}';
+       *p = '\0';
+       return remotestr;
+ out_err:
+       kfree(remotestr);
+       return NULL;
+ }
+ /*
+  * Find or create the cached data server described by @dsaddrs.
+  *
+  * If no cached data server has the same address list, a new one is
+  * inserted with a reference count of 1 and takes over the entries of
+  * @dsaddrs (which is left empty).  Otherwise the cached entry's count is
+  * incremented and @dsaddrs is left untouched for the caller to free.
+  *
+  * Returns the data server, or NULL if @dsaddrs is empty or the
+  * allocation fails.
+  */
+ static struct nfs4_pnfs_ds *
+ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+ {
+       struct nfs4_pnfs_ds *cached, *ds;
+       char *remotestr;
+       if (list_empty(dsaddrs)) {
+               dprintk("%s: no addresses defined\n", __func__);
+               return NULL;
+       }
+       /* allocate up front so nothing sleeps under the spinlock */
+       ds = kzalloc(sizeof(*ds), gfp_flags);
+       if (!ds)
+               return NULL;
+       /* this is only used for debugging, so it's ok if its NULL */
+       remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
+       spin_lock(&nfs4_ds_cache_lock);
+       cached = _data_server_lookup_locked(dsaddrs);
+       if (cached != NULL) {
+               /* already known: discard our copy and share the cached one */
+               kfree(remotestr);
+               kfree(ds);
+               atomic_inc(&cached->ds_count);
+               dprintk("%s data server %s found, inc'ed ds_count to %d\n",
+                       __func__, cached->ds_remotestr,
+                       atomic_read(&cached->ds_count));
+               ds = cached;
+       } else {
+               INIT_LIST_HEAD(&ds->ds_addrs);
+               list_splice_init(dsaddrs, &ds->ds_addrs);
+               ds->ds_remotestr = remotestr;
+               atomic_set(&ds->ds_count, 1);
+               INIT_LIST_HEAD(&ds->ds_node);
+               ds->ds_clp = NULL;
+               list_add(&ds->ds_node, &nfs4_data_server_cache);
+               dprintk("%s add new data server %s\n", __func__,
+                       ds->ds_remotestr);
+       }
+       spin_unlock(&nfs4_ds_cache_lock);
+       return ds;
+ }
+ /*
+  * Decode one network address (r_netid followed by r_addr) from @streamp
+  * into a newly allocated nfs4_pnfs_ds_addr.
+  *
+  * Currently only supports ipv4, ipv6 and one multi-path address.
+  *
+  * r_addr is a universal address "<ip>.<p1>.<p2>" whose port is
+  * (p1 << 8) | p2 - see RFC 5665.  r_netid must be "tcp" for an IPv4
+  * address and "tcp6" for an IPv6 address.
+  *
+  * Returns the new address, owned by the caller (its da_remotestr may be
+  * NULL if that allocation failed), or NULL on any decode, parse or
+  * allocation failure.
+  */
+ static struct nfs4_pnfs_ds_addr *
+ decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
+ {
+       struct nfs4_pnfs_ds_addr *da = NULL;
+       char *buf, *portstr;
+       __be16 port;
+       int nlen, rlen;
+       int tmp[2];
+       __be32 *p;
+       char *netid, *match_netid;
+       size_t len, match_netid_len;
+       char *startsep = "";
+       char *endsep = "";
+       /* r_netid */
+       p = xdr_inline_decode(streamp, 4);
+       if (unlikely(!p))
+               goto out_err;
+       nlen = be32_to_cpup(p++);
+       p = xdr_inline_decode(streamp, nlen);
+       if (unlikely(!p))
+               goto out_err;
+       netid = kmalloc(nlen+1, gfp_flags);
+       if (unlikely(!netid))
+               goto out_err;
+       netid[nlen] = '\0';
+       memcpy(netid, p, nlen);
+       /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
+       p = xdr_inline_decode(streamp, 4);
+       if (unlikely(!p))
+               goto out_free_netid;
+       rlen = be32_to_cpup(p);
+       p = xdr_inline_decode(streamp, rlen);
+       if (unlikely(!p))
+               goto out_free_netid;
+       /* port is ".ABC.DEF", 8 chars max */
+       if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
+               dprintk("%s: Invalid address, length %d\n", __func__,
+                       rlen);
+               goto out_free_netid;
+       }
+       buf = kmalloc(rlen + 1, gfp_flags);
+       if (!buf) {
+               dprintk("%s: Not enough memory\n", __func__);
+               goto out_free_netid;
+       }
+       buf[rlen] = '\0';
+       memcpy(buf, p, rlen);
+       /* replace port '.' with '-' */
+       portstr = strrchr(buf, '.');
+       if (!portstr) {
+               dprintk("%s: Failed finding expected dot in port\n",
+                       __func__);
+               goto out_free_buf;
+       }
+       *portstr = '-';
+       /* find '.' between address and port */
+       portstr = strrchr(buf, '.');
+       if (!portstr) {
+               dprintk("%s: Failed finding expected dot between address and "
+                       "port\n", __func__);
+               goto out_free_buf;
+       }
+       /* split buf into "<ip>" and "<p1>-<p2>" */
+       *portstr = '\0';
+       da = kzalloc(sizeof(*da), gfp_flags);
+       if (unlikely(!da))
+               goto out_free_buf;
+       INIT_LIST_HEAD(&da->da_node);
+       if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+                     sizeof(da->da_addr))) {
+               dprintk("%s: error parsing address %s\n", __func__, buf);
+               goto out_free_da;
+       }
+       portstr++;
+       /*
+        * Both port octets must be present and within 0..255; otherwise
+        * tmp[] would be used uninitialised or yield a bogus port.
+        */
+       if (sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]) != 2 ||
+           tmp[0] < 0 || tmp[0] > 255 || tmp[1] < 0 || tmp[1] > 255) {
+               dprintk("%s: error parsing port %s\n", __func__, portstr);
+               goto out_free_da;
+       }
+       port = htons((tmp[0] << 8) | (tmp[1]));
+       switch (da->da_addr.ss_family) {
+       case AF_INET:
+               ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
+               da->da_addrlen = sizeof(struct sockaddr_in);
+               match_netid = "tcp";
+               match_netid_len = 3;
+               break;
+       case AF_INET6:
+               ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
+               da->da_addrlen = sizeof(struct sockaddr_in6);
+               match_netid = "tcp6";
+               match_netid_len = 4;
+               startsep = "[";
+               endsep = "]";
+               break;
+       default:
+               dprintk("%s: unsupported address family: %u\n",
+                       __func__, da->da_addr.ss_family);
+               goto out_free_da;
+       }
+       /* the advertised netid must agree with the parsed address family */
+       if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
+               dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
+                       __func__, netid, match_netid);
+               goto out_free_da;
+       }
+       /* save human readable address */
+       len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
+       da->da_remotestr = kzalloc(len, gfp_flags);
+       /* NULL is ok, only used for dprintk */
+       if (da->da_remotestr)
+               snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
+                        buf, endsep, ntohs(port));
+       dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
+       kfree(buf);
+       kfree(netid);
+       return da;
+ out_free_da:
+       kfree(da);
+ out_free_buf:
+       dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
+       kfree(buf);
+ out_free_netid:
+       kfree(netid);
+ out_err:
+       return NULL;
+ }
+ /*
+  * Decode opaque device data and return the result
+  *
+  * The device body in @pdev->pages is parsed as: a stripe count, that many
+  * stripe indices, a data server count, and for each data server a
+  * multipath count followed by that many network addresses.
+  *
+  * Returns a newly allocated nfs4_file_layout_dsaddr whose ds_list[] entries
+  * each hold a reference obtained from nfs4_pnfs_ds_add(), or NULL on any
+  * decode, validation or allocation failure.
+  */
+ static struct nfs4_file_layout_dsaddr*
+ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+ {
+       int i;
+       u32 cnt, num;
+       u8 *indexp;
+       __be32 *p;
+       u8 *stripe_indices;
+       u8 max_stripe_index;
+       struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+       struct xdr_stream stream;
+       struct xdr_buf buf;
+       struct page *scratch;
+       struct list_head dsaddrs;
+       struct nfs4_pnfs_ds_addr *da;
+       /* set up xdr stream */
+       scratch = alloc_page(gfp_flags);
+       if (!scratch)
+               goto out_err;
+       xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+       xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+       /* Get the stripe count (number of stripe index) */
+       p = xdr_inline_decode(&stream, 4);
+       if (unlikely(!p))
+               goto out_err_free_scratch;
+       cnt = be32_to_cpup(p);
+       dprintk("%s stripe count  %d\n", __func__, cnt);
+       if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+               printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
+                      "supported maximum %d\n", __func__,
+                       cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+               goto out_err_free_scratch;
+       }
+       /* read stripe indices */
+       stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
+       if (!stripe_indices)
+               goto out_err_free_scratch;
+       /* each index is a 32-bit XDR word on the wire */
+       p = xdr_inline_decode(&stream, cnt << 2);
+       if (unlikely(!p))
+               goto out_err_free_stripe_indices;
+       indexp = &stripe_indices[0];
+       max_stripe_index = 0;
+       for (i = 0; i < cnt; i++) {
+               /*
+                * NOTE(review): the 32-bit wire value is narrowed to u8
+                * here, before the range check against num below.
+                */
+               *indexp = be32_to_cpup(p++);
+               max_stripe_index = max(max_stripe_index, *indexp);
+               indexp++;
+       }
+       /* Check the multipath list count */
+       p = xdr_inline_decode(&stream, 4);
+       if (unlikely(!p))
+               goto out_err_free_stripe_indices;
+       num = be32_to_cpup(p);
+       dprintk("%s ds_num %u\n", __func__, num);
+       if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+               printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
+                       "supported maximum %d\n", __func__,
+                       num, NFS4_PNFS_MAX_MULTI_CNT);
+               goto out_err_free_stripe_indices;
+       }
+       /* validate stripe indices are all < num */
+       /* (this also rejects num == 0, since max_stripe_index starts at 0) */
+       if (max_stripe_index >= num) {
+               printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
+                       __func__, max_stripe_index, num);
+               goto out_err_free_stripe_indices;
+       }
+       /* one allocation for the device plus its trailing ds_list[] slots */
+       dsaddr = kzalloc(sizeof(*dsaddr) +
+                       (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+                       gfp_flags);
+       if (!dsaddr)
+               goto out_err_free_stripe_indices;
+       dsaddr->stripe_count = cnt;
+       /* ownership of the index table moves to dsaddr from here on */
+       dsaddr->stripe_indices = stripe_indices;
+       stripe_indices = NULL;
+       dsaddr->ds_num = num;
+       nfs4_init_deviceid_node(&dsaddr->id_node,
+                               NFS_SERVER(ino)->pnfs_curr_ld,
+                               NFS_SERVER(ino)->nfs_client,
+                               &pdev->dev_id);
+       INIT_LIST_HEAD(&dsaddrs);
+       for (i = 0; i < dsaddr->ds_num; i++) {
+               int j;
+               u32 mp_count;
+               p = xdr_inline_decode(&stream, 4);
+               if (unlikely(!p))
+                       goto out_err_free_deviceid;
+               mp_count = be32_to_cpup(p); /* multipath count */
+               /* collect every address that decodes; bad ones are skipped */
+               for (j = 0; j < mp_count; j++) {
+                       da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+                                           &stream, gfp_flags);
+                       if (da)
+                               list_add_tail(&da->da_node, &dsaddrs);
+               }
+               if (list_empty(&dsaddrs)) {
+                       dprintk("%s: no suitable DS addresses found\n",
+                               __func__);
+                       goto out_err_free_deviceid;
+               }
+               dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+               if (!dsaddr->ds_list[i])
+                       goto out_err_drain_dsaddrs;
+               /* If DS was already in cache, free ds addrs */
+               /* (a newly added DS took the list over, leaving it empty) */
+               while (!list_empty(&dsaddrs)) {
+                       da = list_first_entry(&dsaddrs,
+                                             struct nfs4_pnfs_ds_addr,
+                                             da_node);
+                       list_del_init(&da->da_node);
+                       kfree(da->da_remotestr);
+                       kfree(da);
+               }
+       }
+       __free_page(scratch);
+       return dsaddr;
+ out_err_drain_dsaddrs:
+       /* free the addresses that never made it into a data server */
+       while (!list_empty(&dsaddrs)) {
+               da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+                                     da_node);
+               list_del_init(&da->da_node);
+               kfree(da->da_remotestr);
+               kfree(da);
+       }
+ out_err_free_deviceid:
+       /* drops the references taken so far and frees dsaddr */
+       nfs4_fl_free_deviceid(dsaddr);
+       /* stripe_indices was part of dsaddr */
+       goto out_err_free_scratch;
+ out_err_free_stripe_indices:
+       kfree(stripe_indices);
+ out_err_free_scratch:
+       __free_page(scratch);
+ out_err:
+       dprintk("%s ERROR: returning NULL\n", __func__);
+       return NULL;
+ }
+ /*
+  * Decode the opaque device specified in 'dev' and add it to the cache of
+  * available devices.
+  *
+  * If nfs4_insert_deviceid_node() hands back a different node than the one
+  * just decoded, the fresh copy is freed and the node it returned is used
+  * instead.  Returns NULL if decoding fails.
+  */
+ static struct nfs4_file_layout_dsaddr *
+ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
+ {
+       struct nfs4_file_layout_dsaddr *decoded, *cached;
+       struct nfs4_deviceid_node *node;
+       decoded = decode_device(inode, dev, gfp_flags);
+       if (decoded == NULL) {
+               printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
+                       __func__);
+               return NULL;
+       }
+       node = nfs4_insert_deviceid_node(&decoded->id_node);
+       cached = container_of(node, struct nfs4_file_layout_dsaddr, id_node);
+       if (cached == decoded)
+               return decoded;
+       /* the cache returned another node: ours is redundant */
+       nfs4_fl_free_deviceid(decoded);
+       return cached;
+ }
+ /*
+  * Retrieve the information for dev_id, add it to the list
+  * of available devices, and return it.
+  *
+  * A page array sized from the session's maximum response size is
+  * allocated to receive the GETDEVICEINFO reply, which is then decoded and
+  * inserted into the device cache.  All temporary pages are released
+  * before returning.
+  *
+  * Returns the device, or NULL on allocation, RPC or decode failure.
+  */
+ struct nfs4_file_layout_dsaddr *
+ filelayout_get_device_info(struct inode *inode,
+               struct nfs4_deviceid *dev_id,
+               struct rpc_cred *cred,
+               gfp_t gfp_flags)
+ {
+       struct pnfs_device *pdev = NULL;
+       u32 max_resp_sz;
+       int max_pages;
+       struct page **pages = NULL;
+       struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+       int rc, i;
+       struct nfs_server *server = NFS_SERVER(inode);
+       /*
+        * Use the session max response size as the basis for setting
+        * GETDEVICEINFO's maxcount
+        */
+       max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+       max_pages = nfs_page_array_len(0, max_resp_sz);
+       dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
+               __func__, inode, max_resp_sz, max_pages);
+       pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
+       if (pdev == NULL)
+               return NULL;
+       /* kcalloc zeroes the array and checks the size multiplication */
+       pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+       if (pages == NULL) {
+               kfree(pdev);
+               return NULL;
+       }
+       for (i = 0; i < max_pages; i++) {
+               pages[i] = alloc_page(gfp_flags);
+               if (!pages[i])
+                       goto out_free;
+       }
+       memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+       pdev->layout_type = LAYOUT_NFSV4_1_FILES;
+       pdev->pages = pages;
+       pdev->pgbase = 0;
+       pdev->pglen = max_resp_sz;
+       pdev->mincount = 0;
+       pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+       rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+       dprintk("%s getdevice info returns %d\n", __func__, rc);
+       if (rc)
+               goto out_free;
+       /*
+        * Found new device, need to decode it and then add it to the
+        * list of known devices for this mountpoint.
+        */
+       dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
+ out_free:
+       /*
+        * A failed alloc_page() above leaves the remaining slots NULL, and
+        * __free_page() must not be handed a NULL page, so skip them.
+        */
+       for (i = 0; i < max_pages; i++) {
+               if (pages[i])
+                       __free_page(pages[i]);
+       }
+       kfree(pages);
+       kfree(pdev);
+       dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
+       return dsaddr;
+ }
+ /*
+  * Drop a reference on the device id node embedded in @dsaddr by handing
+  * it to nfs4_put_deviceid_node().
+  */
+ void
+ nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+ {
+       nfs4_put_deviceid_node(&dsaddr->id_node);
+ }
+ /*
+  * Map a file offset to its position in the stripe index table.
+  *
+  * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+  * Then: ((res + fsi) % dsaddr->stripe_count)
+  *
+  * do_div() divides its first argument in place and returns the remainder,
+  * which is why the second call yields the modulo directly.
+  */
+ u32
+ nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+ {
+       struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+       u64 stripe_no = offset - flseg->pattern_offset;
+       u32 j;
+       do_div(stripe_no, flseg->stripe_unit);
+       stripe_no += flseg->first_stripe_index;
+       j = do_div(stripe_no, flseg->dsaddr->stripe_count);
+       return j;
+ }
+ /*
+  * Look up entry @j of the stripe index table of the device attached to
+  * @lseg.  No bounds check is performed on @j here.
+  */
+ u32
+ nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+ {
+       return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+ }
+ /*
+  * Pick the filehandle to use for stripe index @j.
+  *
+  * For STRIPE_SPARSE layouts: with a single filehandle it is always used,
+  * with none NULL is returned, otherwise the filehandle is selected by the
+  * data server index for @j.  For any other stripe type @j indexes
+  * fh_array directly.
+  */
+ struct nfs_fh *
+ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+ {
+       struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+       u32 fh_idx = j;
+       if (flseg->stripe_type == STRIPE_SPARSE) {
+               switch (flseg->num_fh) {
+               case 0:
+                       /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+                       return NULL;
+               case 1:
+                       fh_idx = 0;
+                       break;
+               default:
+                       fh_idx = nfs4_fl_calc_ds_index(lseg, j);
+                       break;
+               }
+       }
+       return flseg->fh_array[fh_idx];
+ }
+ /*
+  * Sleep (killable) until the NFS4DS_CONNECTING bit of @ds is cleared.
+  * The result of wait_on_bit() is discarded, so the caller cannot tell a
+  * completed connect from an interrupted wait.
+  */
+ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+ {
+       might_sleep();
+       wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
+                       nfs_wait_bit_killable, TASK_KILLABLE);
+ }
+ /*
+  * Clear NFS4DS_CONNECTING on @ds and wake anyone sleeping in
+  * nfs4_wait_ds_connect().  The clear_bit() is bracketed by barriers.
+  */
+ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+ {
 -      smp_mb__after_clear_bit();
++      smp_mb__before_atomic();
+       clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
++      smp_mb__after_atomic();
+       wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+ }
+ /*
+  * Return the data server for index @ds_idx of the device behind @lseg,
+  * connecting to it first if it has no client yet.
+  *
+  * Returns NULL when there is no data server at that index (the device id
+  * is then marked invalid) or when the device id tests as unavailable.
+  */
+ struct nfs4_pnfs_ds *
+ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+ {
+       struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+       struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+       struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+       struct nfs4_pnfs_ds *ret = ds;
+       if (ds == NULL) {
+               printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+                       __func__, ds_idx);
+               filelayout_mark_devid_invalid(devid);
+               goto out;
+       }
+       /* presumably pairs with the smp_wmb() in nfs4_ds_connect() */
+       smp_rmb();
+       if (ds->ds_clp)
+               goto out_test_devid;
+       /* whoever sets the bit first connects; everyone else waits */
+       if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
+               struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
+               int err;
+               err = nfs4_ds_connect(s, ds);
+               if (err)
+                       nfs4_mark_deviceid_unavailable(devid);
+               nfs4_clear_ds_conn_bit(ds);
+       } else {
+               /* Either ds is connected, or ds is NULL */
+               nfs4_wait_ds_connect(ds);
+       }
+ out_test_devid:
+       /* a failed connect above is reported through the device id state */
+       if (filelayout_test_devid_unavailable(devid))
+               ret = NULL;
+ out:
+       return ret;
+ }
+ /*
+  * Module parameters for the data server connection tunables
+  * (mode 0644; both are unsigned integers).
+  */
+ module_param(dataserver_retrans, uint, 0644);
+ MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
+                       "retries a request before it attempts further "
+                       " recovery  action.");
+ module_param(dataserver_timeo, uint, 0644);
+ MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+                       "NFSv4.1  client  waits for a response from a "
+                       " data server before it retries an NFS request.");
diff --cc fs/nfs/inode.c
Simple merge
Simple merge
Simple merge
index 03ed984ab4d8016e3e062a7fc95c2b3268e6287c,a8759825ac764343afdc172d0fb6800a2d43fd46..b6ee3a6ee96dd2b06df61a022fadc0841da8d0b4
@@@ -133,6 -138,151 +138,151 @@@ nfs_iocounter_wait(struct nfs_io_counte
        return __nfs_iocounter_wait(c);
  }
  
 -      smp_mb__before_clear_bit();
+ /*
+  * Wait action passed to wait_on_bit_lock() by nfs_page_group_lock():
+  * yields via io_schedule() and always returns 0.  @word is unused.
+  */
+ static int nfs_wait_bit_uninterruptible(void *word)
+ {
+       io_schedule();
+       return 0;
+ }
+ /*
+  * nfs_page_group_lock - lock the head of the page group
+  * @req - request in group that is to be locked
+  *
+  * this lock must be held if modifying the page group list
+  *
+  * The lock is the PG_HEADLOCK bit in the head request's wb_flags; the
+  * caller sleeps uninterruptibly until the bit has been acquired.
+  */
+ void
+ nfs_page_group_lock(struct nfs_page *req)
+ {
+       struct nfs_page *head = req->wb_head;
+       /* a head request is expected to point at itself */
+       WARN_ON_ONCE(head != head->wb_head);
+       wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+                       nfs_wait_bit_uninterruptible,
+                       TASK_UNINTERRUPTIBLE);
+ }
+ /*
+  * nfs_page_group_unlock - unlock the head of the page group
+  * @req - request in group that is to be unlocked
+  *
+  * Clears PG_HEADLOCK on the head request (bracketed by barriers) and
+  * wakes any task waiting for that bit.
+  */
+ void
+ nfs_page_group_unlock(struct nfs_page *req)
+ {
+       struct nfs_page *head = req->wb_head;
+       /* a head request is expected to point at itself */
+       WARN_ON_ONCE(head != head->wb_head);
 -      smp_mb__after_clear_bit();
++      smp_mb__before_atomic();
+       clear_bit(PG_HEADLOCK, &head->wb_flags);
++      smp_mb__after_atomic();
+       wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+ }
+ /*
+  * nfs_page_group_sync_on_bit_locked
+  *
+  * must be called with page group lock held
+  *
+  * Sets @bit on @req, then walks the rest of the group ring.  If any other
+  * member does not have @bit set, returns false and leaves the bits as
+  * they are.  If every member has it set, clears @bit on all of them and
+  * returns true.
+  */
+ static bool
+ nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+ {
+       struct nfs_page *head = req->wb_head;
+       struct nfs_page *pos;
+       WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+       /* warns if @req had already set the bit */
+       WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+       for (pos = req->wb_this_page; pos != req; pos = pos->wb_this_page) {
+               if (!test_bit(bit, &pos->wb_flags))
+                       return false;
+       }
+       /* true! reset all bits */
+       clear_bit(bit, &req->wb_flags);
+       for (pos = req->wb_this_page; pos != req; pos = pos->wb_this_page)
+               clear_bit(bit, &pos->wb_flags);
+       return true;
+ }
+ /*
+  * nfs_page_group_sync_on_bit - set bit on current request, but only
+  *   return true if the bit is set for all requests in page group
+  * @req - request in page group
+  * @bit - PG_* bit that is used to sync page group
+  *
+  * Locking wrapper: takes the page group lock around
+  * nfs_page_group_sync_on_bit_locked().
+  */
+ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+ {
+       bool all_set;
+       nfs_page_group_lock(req);
+       all_set = nfs_page_group_sync_on_bit_locked(req, bit);
+       nfs_page_group_unlock(req);
+       return all_set;
+ }
+ /*
+  * nfs_page_group_init - Initialize the page group linkage for @req
+  * @req - a new nfs request
+  * @prev - the previous request in page group, or NULL if @req is the first
+  *         or only request in the group (the head).
+  *
+  * Without @prev, @req becomes a one-element ring and its own head.
+  * Otherwise @req is linked into the ring directly after @prev and shares
+  * @prev's head.
+  */
+ static inline void
+ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+ {
+       struct nfs_page *head;
+       WARN_ON_ONCE(prev == req);
+       if (!prev) {
+               req->wb_head = req;
+               req->wb_this_page = req;
+               return;
+       }
+       head = prev->wb_head;
+       /* @prev is expected to be the last member, with the group locked */
+       WARN_ON_ONCE(prev->wb_this_page != head);
+       WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+       req->wb_head = head;
+       req->wb_this_page = prev->wb_this_page;
+       prev->wb_this_page = req;
+       /* grab extra ref if head request has extra ref from
+        * the write/commit path to handle handoff between write
+        * and commit lists */
+       if (test_bit(PG_INODE_REF, &head->wb_flags))
+               kref_get(&req->wb_kref);
+ }
+ /*
+  * nfs_page_group_destroy - sync the destruction of page groups
+  * @kref - the wb_kref embedded in the request that no longer needs the
+  *         page group
+  *
+  * releases the page group reference from each member once all
+  * members have called this function.
+  *
+  * Each call sets PG_TEARDOWN on its request; only the call that finds
+  * the bit set on every member goes on to unlink and free the whole group.
+  */
+ static void
+ nfs_page_group_destroy(struct kref *kref)
+ {
+       struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+       struct nfs_page *tmp, *next;
+       if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+               return;
+       tmp = req;
+       do {
+               /* fetch the successor before this member is freed */
+               next = tmp->wb_this_page;
+               /* unlink and free */
+               tmp->wb_this_page = tmp;
+               tmp->wb_head = tmp;
+               nfs_free_request(tmp);
+               tmp = next;
+       } while (tmp != req);
+ }
  /**
   * nfs_create_request - Create an NFS read/write request.
   * @ctx: open context to use
diff --cc fs/nfs/pnfs.c
Simple merge
diff --cc fs/nfs/pnfs.h
Simple merge
diff --cc fs/nfs/write.c
Simple merge
Simple merge
Simple merge