Merge tag 'nfs-for-3.17-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author    Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 14 Aug 2014 00:13:19 +0000 (18:13 -0600)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 14 Aug 2014 00:13:19 +0000 (18:13 -0600)
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

   - stable fix for a bug in nfs3_list_one_acl()
   - speed up NFS path walks by supporting LOOKUP_RCU (see the sketch after this list)
   - more read/write code cleanups
   - pNFS fixes for layout return on close
   - fixes for the RCU handling in the rpcsec_gss code
   - more NFS/RDMA fixes"
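
For context, the LOOKUP_RCU work referenced above follows one recurring shape: under RCU-walk the VFS holds no references and no sleeping locks, so the filesystem must avoid dget_parent(), refcounting and anything that can block, returning -ECHILD to make the VFS retry in ref-walk mode. A minimal sketch of that pattern (not the kernel's exact code; needs_blocking_revalidation() is a made-up placeholder for "any step that would have to sleep or issue an RPC"):

#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/errno.h>

static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct dentry *parent;
        struct inode *dir;

        if (flags & LOOKUP_RCU) {
                /* Lockless walk: take no references, grab no locks. */
                parent = ACCESS_ONCE(dentry->d_parent);
                dir = ACCESS_ONCE(parent->d_inode);
                if (!dir)
                        return -ECHILD;
        } else {
                parent = dget_parent(dentry);
                dir = parent->d_inode;
        }

        if (needs_blocking_revalidation(dir, dentry)) {
                if (flags & LOOKUP_RCU)
                        return -ECHILD; /* punt; VFS retries in ref-walk */
                /* ... slow path: may sleep, may talk to the server ... */
        }

        if (!(flags & LOOKUP_RCU))
                dput(parent);
        else if (parent != ACCESS_ONCE(dentry->d_parent))
                return -ECHILD; /* parent changed under us */
        return 1;
}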

* tag 'nfs-for-3.17-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits)
  nfs: reject changes to resvport and sharecache during remount
  NFS: Avoid infinite loop when RELEASE_LOCKOWNER getting expired error
  SUNRPC: remove all refcounting of groupinfo from rpcauth_lookupcred
  NFS: fix two problems in lookup_revalidate in RCU-walk
  NFS: allow lockless access to access_cache
  NFS: teach nfs_lookup_verify_inode to handle LOOKUP_RCU
  NFS: teach nfs_neg_need_reval to understand LOOKUP_RCU
  NFS: support RCU_WALK in nfs_permission()
  sunrpc/auth: allow lockless (rcu) lookup of credential cache.
  NFS: prepare for RCU-walk support by pushing tests later in code.
  NFS: nfs4_lookup_revalidate: only evaluate parent if it will be used.
  NFS: add checks for returned value of try_module_get()
  nfs: clear_request_commit while holding i_lock
  pnfs: add pnfs_put_lseg_async
  pnfs: find swapped pages on pnfs commit lists too
  nfs: fix comment and add warn_on for PG_INODE_REF
  nfs: check wait_on_bit_lock err in page_group_lock
  sunrpc: remove "ec" argument from encrypt_v2 operation
  sunrpc: clean up sparse endianness warnings in gss_krb5_wrap.c
  sunrpc: clean up sparse endianness warnings in gss_krb5_seal.c
  ...

54 files changed:
fs/nfs/blocklayout/blocklayout.c
fs/nfs/callback.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/filelayout/filelayout.c
fs/nfs/filelayout/filelayoutdev.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs3acl.c
fs/nfs/nfs3proc.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/objlayout/objio_osd.c
fs/nfs/objlayout/objlayout.c
fs/nfs/objlayout/objlayout.h
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/proc.c
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfs_common/nfsacl.c
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/auth_gss.h
include/linux/sunrpc/gss_krb5.h
include/linux/sunrpc/xprtrdma.h
net/sunrpc/addr.c
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/gss_krb5_crypto.c
net/sunrpc/auth_gss/gss_krb5_seal.c
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/auth_null.c
net/sunrpc/clnt.c
net/sunrpc/rpc_pipe.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c

index 9b431f44fad9d19a31d6c61ca3986453b137f4d4..cbb1797149d5731a77f22b97fc4a3ba2cdfbacfd 100644 (file)
@@ -210,8 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err)
                        SetPageUptodate(bvec->bv_page);
 
        if (err) {
-               struct nfs_pgio_data *rdata = par->data;
-               struct nfs_pgio_header *header = rdata->header;
+               struct nfs_pgio_header *header = par->data;
 
                if (!header->pnfs_error)
                        header->pnfs_error = -EIO;
@@ -224,43 +223,44 @@ static void bl_end_io_read(struct bio *bio, int err)
 static void bl_read_cleanup(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *rdata;
+       struct nfs_pgio_header *hdr;
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       rdata = container_of(task, struct nfs_pgio_data, task);
-       pnfs_ld_read_done(rdata);
+       hdr = container_of(task, struct nfs_pgio_header, task);
+       pnfs_ld_read_done(hdr);
 }
 
 static void
 bl_end_par_io_read(void *data, int unused)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       rdata->task.tk_status = rdata->header->pnfs_error;
-       INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
-       schedule_work(&rdata->task.u.tk_work);
+       hdr->task.tk_status = hdr->pnfs_error;
+       INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+       schedule_work(&hdr->task.u.tk_work);
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_data *rdata)
+bl_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *header = rdata->header;
+       struct nfs_pgio_header *header = hdr;
        int i, hole;
        struct bio *bio = NULL;
        struct pnfs_block_extent *be = NULL, *cow_read = NULL;
        sector_t isect, extent_length = 0;
        struct parallel_io *par;
-       loff_t f_offset = rdata->args.offset;
-       size_t bytes_left = rdata->args.count;
+       loff_t f_offset = hdr->args.offset;
+       size_t bytes_left = hdr->args.count;
        unsigned int pg_offset, pg_len;
-       struct page **pages = rdata->args.pages;
-       int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+       struct page **pages = hdr->args.pages;
+       int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
        const bool is_dio = (header->dreq != NULL);
 
        dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-              rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
+               hdr->page_array.npages, f_offset,
+               (unsigned int)hdr->args.count);
 
-       par = alloc_parallel(rdata);
+       par = alloc_parallel(hdr);
        if (!par)
                goto use_mds;
        par->pnfs_callback = bl_end_par_io_read;
@@ -268,7 +268,7 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 
        isect = (sector_t) (f_offset >> SECTOR_SHIFT);
        /* Code assumes extents are page-aligned */
-       for (i = pg_index; i < rdata->pages.npages; i++) {
+       for (i = pg_index; i < hdr->page_array.npages; i++) {
                if (!extent_length) {
                        /* We've used up the previous extent */
                        bl_put_extent(be);
@@ -317,7 +317,8 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
                        struct pnfs_block_extent *be_read;
 
                        be_read = (hole && cow_read) ? cow_read : be;
-                       bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
+                       bio = do_add_page_to_bio(bio,
+                                                hdr->page_array.npages - i,
                                                 READ,
                                                 isect, pages[i], be_read,
                                                 bl_end_io_read, par,
@@ -332,10 +333,10 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
                extent_length -= PAGE_CACHE_SECTORS;
        }
        if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-               rdata->res.eof = 1;
-               rdata->res.count = header->inode->i_size - rdata->args.offset;
+               hdr->res.eof = 1;
+               hdr->res.count = header->inode->i_size - hdr->args.offset;
        } else {
-               rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
+               hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
        }
 out:
        bl_put_extent(be);
@@ -390,8 +391,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
        }
 
        if (unlikely(err)) {
-               struct nfs_pgio_data *data = par->data;
-               struct nfs_pgio_header *header = data->header;
+               struct nfs_pgio_header *header = par->data;
 
                if (!header->pnfs_error)
                        header->pnfs_error = -EIO;
@@ -405,8 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
 {
        struct parallel_io *par = bio->bi_private;
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct nfs_pgio_data *data = par->data;
-       struct nfs_pgio_header *header = data->header;
+       struct nfs_pgio_header *header = par->data;
 
        if (!uptodate) {
                if (!header->pnfs_error)
@@ -423,32 +422,32 @@ static void bl_end_io_write(struct bio *bio, int err)
 static void bl_write_cleanup(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *wdata;
+       struct nfs_pgio_header *hdr;
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       wdata = container_of(task, struct nfs_pgio_data, task);
-       if (likely(!wdata->header->pnfs_error)) {
+       hdr = container_of(task, struct nfs_pgio_header, task);
+       if (likely(!hdr->pnfs_error)) {
                /* Marks for LAYOUTCOMMIT */
-               mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
-                                    wdata->args.offset, wdata->args.count);
+               mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
+                                    hdr->args.offset, hdr->args.count);
        }
-       pnfs_ld_write_done(wdata);
+       pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
 static void bl_end_par_io_write(void *data, int num_se)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (unlikely(wdata->header->pnfs_error)) {
-               bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
+       if (unlikely(hdr->pnfs_error)) {
+               bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
                                        num_se);
        }
 
-       wdata->task.tk_status = wdata->header->pnfs_error;
-       wdata->verf.committed = NFS_FILE_SYNC;
-       INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
-       schedule_work(&wdata->task.u.tk_work);
+       hdr->task.tk_status = hdr->pnfs_error;
+       hdr->verf.committed = NFS_FILE_SYNC;
+       INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+       schedule_work(&hdr->task.u.tk_work);
 }
 
 /* FIXME STUB - mark intersection of layout and page as bad, so is not
@@ -673,18 +672,17 @@ check_page:
 }
 
 static enum pnfs_try_status
-bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-       struct nfs_pgio_header *header = wdata->header;
        int i, ret, npg_zero, pg_index, last = 0;
        struct bio *bio = NULL;
        struct pnfs_block_extent *be = NULL, *cow_read = NULL;
        sector_t isect, last_isect = 0, extent_length = 0;
        struct parallel_io *par = NULL;
-       loff_t offset = wdata->args.offset;
-       size_t count = wdata->args.count;
+       loff_t offset = header->args.offset;
+       size_t count = header->args.count;
        unsigned int pg_offset, pg_len, saved_len;
-       struct page **pages = wdata->args.pages;
+       struct page **pages = header->args.pages;
        struct page *page;
        pgoff_t index;
        u64 temp;
@@ -699,11 +697,11 @@ bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
                dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
                goto out_mds;
        }
-       /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+       /* At this point, header->page_array is a (sequential) list of nfs_pages.
         * We want to write each, and if there is an error set pnfs_error
         * to have it redone using nfs.
         */
-       par = alloc_parallel(wdata);
+       par = alloc_parallel(header);
        if (!par)
                goto out_mds;
        par->pnfs_callback = bl_end_par_io_write;
@@ -790,8 +788,8 @@ next_page:
        bio = bl_submit_bio(WRITE, bio);
 
        /* Middle pages */
-       pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
-       for (i = pg_index; i < wdata->pages.npages; i++) {
+       pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+       for (i = pg_index; i < header->page_array.npages; i++) {
                if (!extent_length) {
                        /* We've used up the previous extent */
                        bl_put_extent(be);
@@ -862,7 +860,8 @@ next_page:
                }
 
 
-               bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+               bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+                                        WRITE,
                                         isect, pages[i], be,
                                         bl_end_io_write, par,
                                         pg_offset, pg_len);
@@ -890,7 +889,7 @@ next_page:
        }
 
 write_done:
-       wdata->res.count = wdata->args.count;
+       header->res.count = header->args.count;
 out:
        bl_put_extent(be);
        bl_put_extent(cow_read);
@@ -1063,7 +1062,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
                return ERR_PTR(-ENOMEM);
        }
 
-       pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+       pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
        if (pages == NULL) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
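
The kzalloc-to-kcalloc conversion above is the standard overflow-safe idiom for array allocations; in isolation (a sketch using only what the hunk shows):

#include <linux/slab.h>

/* kcalloc(n, size, flags) returns NULL if n * size would overflow,
 * where kzalloc(n * size, flags) would silently wrap on 32-bit and
 * hand back a too-small buffer. */
struct page **pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
        return ERR_PTR(-ENOMEM);
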
index 073b4cf67ed9d39690626add5517708debeab021..54de482143cc0708638e12bde9fbbfbd66d59224 100644 (file)
@@ -428,6 +428,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
        if (p == NULL)
                return 0;
 
+       /*
+        * Did we get the acceptor from userland during the SETCLIENTID
+        * negotiation?
+        */
+       if (clp->cl_acceptor)
+               return !strcmp(p, clp->cl_acceptor);
+
+       /*
+        * Otherwise try to verify it using the cl_hostname. Note that this
+        * doesn't work if a non-canonical hostname was used in the devname.
+        */
+
        /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
        if (memcmp(p, "nfs@", 4) != 0)
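
With the new cl_acceptor field, check_gss_callback_principal() reduces to roughly the following (a hedged sketch: the hostname fallback approximates the elided tail of the function and is not verbatim):

#include <linux/string.h>

/* Prefer the acceptor name userland returned at SETCLIENTID time;
 * otherwise fall back to expecting a GSS_C_NT_HOSTBASED_NAME of the
 * form "nfs@<cl_hostname>". */
static int example_check_principal(struct nfs_client *clp, const char *p)
{
        if (clp->cl_acceptor)
                return strcmp(p, clp->cl_acceptor) == 0;
        if (memcmp(p, "nfs@", 4) != 0)
                return 0;
        return strcmp(p + 4, clp->cl_hostname) == 0;
}
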
index 180d1ec9c32ed511f3ce33f72088902ad050fffc..1c5ff6d5838585c4b6b0806aa64b579e58a64bc0 100644 (file)
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
                mutex_unlock(&nfs_version_mutex);
        }
 
-       if (!IS_ERR(nfs))
-               try_module_get(nfs->owner);
+       if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+               return ERR_PTR(-EAGAIN);
        return nfs;
 }
 
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
                goto error_0;
 
        clp->cl_nfs_mod = cl_init->nfs_mod;
-       try_module_get(clp->cl_nfs_mod->owner);
+       if (!try_module_get(clp->cl_nfs_mod->owner))
+               goto error_dealloc;
 
        clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
 
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 
 error_cleanup:
        put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
        kfree(clp);
 error_0:
        return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
        put_net(clp->cl_net);
        put_nfs_version(clp->cl_nfs_mod);
        kfree(clp->cl_hostname);
+       kfree(clp->cl_acceptor);
        kfree(clp);
 
        dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
        struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
        const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
 
+       if (cl_init->hostname == NULL) {
+               WARN_ON(1);
+               return NULL;
+       }
+
        dprintk("--> nfs_get_client(%s,v%u)\n",
-               cl_init->hostname ?: "", rpc_ops->version);
+               cl_init->hostname, rpc_ops->version);
 
        /* see if the client already exists */
        do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
        } while (!IS_ERR(new));
 
        dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
-               cl_init->hostname ?: "", PTR_ERR(new));
+               cl_init->hostname, PTR_ERR(new));
        return new;
 }
 EXPORT_SYMBOL_GPL(nfs_get_client);
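
Both hunks above fix the same anti-pattern: calling try_module_get() and ignoring its result. A minimal sketch of the corrected shape (find_nfs_version() is a made-up stand-in for the mutex-protected lookup in get_nfs_version()):

#include <linux/module.h>
#include <linux/err.h>

static struct nfs_subversion *example_get_version(unsigned int version)
{
        struct nfs_subversion *nfs = find_nfs_version(version);

        /* try_module_get() fails while the module is mid-unload;
         * ignoring that failure lets callers keep using rpc_ops
         * that are about to disappear. */
        if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
                return ERR_PTR(-EAGAIN);
        return nfs;
}
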
index 5d8ccecf5f5caada2de94bf30689ecd9e725a15a..5853f53db73246df670ce9daedb73e10d62d2da3 100644 (file)
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
        set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
        struct nfs_delegation *delegation;
        int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
        delegation = rcu_dereference(NFS_I(inode)->delegation);
        if (delegation != NULL && (delegation->type & flags) == flags &&
            !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
-               nfs_mark_delegation_referenced(delegation);
+               if (mark)
+                       nfs_mark_delegation_referenced(delegation);
                ret = 1;
        }
        rcu_read_unlock();
        return ret;
 }
+/**
+ * nfs4_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+       return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+       return nfs4_do_check_delegation(inode, flags, false);
+}
 
 static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
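
Splitting the check into nfs4_do_check_delegation() with a "mark" flag gives callers a way to test for a delegation without touching its LRU state. A hypothetical caller (sketch, assuming only the signatures added above):

/* Probe for a read delegation without promoting it in the
 * "referenced" LRU, so idle delegations can still be returned. */
if (nfs4_check_delegation(inode, FMODE_READ)) {
        /* Delegation present; unlike nfs4_have_delegation(), this
         * probe did not set NFS_DELEGATION_REFERENCED. */
}
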
index 9a79c7a99d6d6dd64b03f58481dc8d0fb4b32818..5c1cce39297f68fb178b4a125a83001aeac4afde 100644 (file)
@@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 
 #endif
 
index 4a3d4ef76127bc716028d3d9df25791d91ff76ce..36d921f0c6026c27170b565f46eb4e26999ea812 100644 (file)
@@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
  * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
  */
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+                             int rcu_walk)
 {
+       int ret;
+
        if (IS_ROOT(dentry))
                return 1;
        if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
        if (!nfs_verify_change_attribute(dir, dentry->d_time))
                return 0;
        /* Revalidate nfsi->cache_change_attribute before we declare a match */
-       if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+       if (rcu_walk)
+               ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+       else
+               ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+       if (ret < 0)
                return 0;
        if (!nfs_verify_change_attribute(dir, dentry->d_time))
                return 0;
@@ -1042,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
 out:
        return (inode->i_nlink == 0) ? -ENOENT : 0;
 out_force:
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
        ret = __nfs_revalidate_inode(server, inode);
        if (ret != 0)
                return ret;
@@ -1054,6 +1064,9 @@ out_force:
  *
  * If parent mtime has changed, we revalidate, else we wait for a
  * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
  */
 static inline
 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1077,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
                return 0;
        if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
                return 1;
-       return !nfs_check_verifier(dir, dentry);
+       return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
 /*
@@ -1088,21 +1101,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
        struct nfs4_label *label = NULL;
        int error;
 
-       if (flags & LOOKUP_RCU)
-               return -ECHILD;
-
-       parent = dget_parent(dentry);
-       dir = parent->d_inode;
+       if (flags & LOOKUP_RCU) {
+               parent = ACCESS_ONCE(dentry->d_parent);
+               dir = ACCESS_ONCE(parent->d_inode);
+               if (!dir)
+                       return -ECHILD;
+       } else {
+               parent = dget_parent(dentry);
+               dir = parent->d_inode;
+       }
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
        inode = dentry->d_inode;
 
        if (!inode) {
-               if (nfs_neg_need_reval(dir, dentry, flags))
+               if (nfs_neg_need_reval(dir, dentry, flags)) {
+                       if (flags & LOOKUP_RCU)
+                               return -ECHILD;
                        goto out_bad;
+               }
                goto out_valid_noent;
        }
 
        if (is_bad_inode(inode)) {
+               if (flags & LOOKUP_RCU)
+                       return -ECHILD;
                dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
                                __func__, dentry);
                goto out_bad;
@@ -1112,12 +1134,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
                goto out_set_verifier;
 
        /* Force a full look up iff the parent directory has changed */
-       if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
-               if (nfs_lookup_verify_inode(inode, flags))
+       if (!nfs_is_exclusive_create(dir, flags) &&
+           nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+               if (nfs_lookup_verify_inode(inode, flags)) {
+                       if (flags & LOOKUP_RCU)
+                               return -ECHILD;
                        goto out_zap_parent;
+               }
                goto out_valid;
        }
 
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
+
        if (NFS_STALE(inode))
                goto out_bad;
 
@@ -1153,13 +1183,18 @@ out_set_verifier:
        /* Success: notify readdir to use READDIRPLUS */
        nfs_advise_use_readdirplus(dir);
  out_valid_noent:
-       dput(parent);
+       if (flags & LOOKUP_RCU) {
+               if (parent != ACCESS_ONCE(dentry->d_parent))
+                       return -ECHILD;
+       } else
+               dput(parent);
        dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
                        __func__, dentry);
        return 1;
 out_zap_parent:
        nfs_zap_caches(dir);
  out_bad:
+       WARN_ON(flags & LOOKUP_RCU);
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
        nfs4_label_free(label);
@@ -1185,6 +1220,7 @@ out_zap_parent:
                        __func__, dentry);
        return 0;
 out_error:
+       WARN_ON(flags & LOOKUP_RCU);
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
        nfs4_label_free(label);
@@ -1529,14 +1565,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
 
 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 {
-       struct dentry *parent = NULL;
        struct inode *inode;
-       struct inode *dir;
        int ret = 0;
 
-       if (flags & LOOKUP_RCU)
-               return -ECHILD;
-
        if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
                goto no_open;
        if (d_mountpoint(dentry))
@@ -1545,34 +1576,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
                goto no_open;
 
        inode = dentry->d_inode;
-       parent = dget_parent(dentry);
-       dir = parent->d_inode;
 
        /* We can't create new files in nfs_open_revalidate(), so we
         * optimize away revalidation of negative dentries.
         */
        if (inode == NULL) {
+               struct dentry *parent;
+               struct inode *dir;
+
+               if (flags & LOOKUP_RCU) {
+                       parent = ACCESS_ONCE(dentry->d_parent);
+                       dir = ACCESS_ONCE(parent->d_inode);
+                       if (!dir)
+                               return -ECHILD;
+               } else {
+                       parent = dget_parent(dentry);
+                       dir = parent->d_inode;
+               }
                if (!nfs_neg_need_reval(dir, dentry, flags))
                        ret = 1;
+               else if (flags & LOOKUP_RCU)
+                       ret = -ECHILD;
+               if (!(flags & LOOKUP_RCU))
+                       dput(parent);
+               else if (parent != ACCESS_ONCE(dentry->d_parent))
+                       return -ECHILD;
                goto out;
        }
 
        /* NFS only supports OPEN on regular files */
        if (!S_ISREG(inode->i_mode))
-               goto no_open_dput;
+               goto no_open;
        /* We cannot do exclusive creation on a positive dentry */
        if (flags & LOOKUP_EXCL)
-               goto no_open_dput;
+               goto no_open;
 
        /* Let f_op->open() actually open (and revalidate) the file */
        ret = 1;
 
 out:
-       dput(parent);
        return ret;
 
-no_open_dput:
-       dput(parent);
 no_open:
        return nfs_lookup_revalidate(dentry, flags);
 }
@@ -2028,10 +2072,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
 static LIST_HEAD(nfs_access_lru_list);
 static atomic_long_t nfs_access_nr_entries;
 
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
 static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
        put_rpccred(entry->cred);
-       kfree(entry);
+       kfree_rcu(entry, rcu_head);
        smp_mb__before_atomic();
        atomic_long_dec(&nfs_access_nr_entries);
        smp_mb__after_atomic();
@@ -2048,19 +2096,14 @@ static void nfs_access_free_list(struct list_head *head)
        }
 }
 
-unsigned long
-nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
 {
        LIST_HEAD(head);
        struct nfs_inode *nfsi, *next;
        struct nfs_access_entry *cache;
-       int nr_to_scan = sc->nr_to_scan;
-       gfp_t gfp_mask = sc->gfp_mask;
        long freed = 0;
 
-       if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-               return SHRINK_STOP;
-
        spin_lock(&nfs_access_lru_lock);
        list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
                struct inode *inode;
@@ -2093,12 +2136,40 @@ remove_lru_entry:
        return freed;
 }
 
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+       int nr_to_scan = sc->nr_to_scan;
+       gfp_t gfp_mask = sc->gfp_mask;
+
+       if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+               return SHRINK_STOP;
+       return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
 unsigned long
 nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
        return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
 }
 
+static void
+nfs_access_cache_enforce_limit(void)
+{
+       long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+       unsigned long diff;
+       unsigned int nr_to_scan;
+
+       if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+               return;
+       nr_to_scan = 100;
+       diff = nr_entries - nfs_access_max_cachesize;
+       if (diff < nr_to_scan)
+               nr_to_scan = diff;
+       nfs_do_access_cache_scan(nr_to_scan);
+}
+
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
 {
        struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2257,38 @@ out_zap:
        return -ENOENT;
 }
 
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+       /* Only check the most recently returned cache entry,
+        * but do it without locking.
+        */
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_access_entry *cache;
+       int err = -ECHILD;
+       struct list_head *lh;
+
+       rcu_read_lock();
+       if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+               goto out;
+       lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+       cache = list_entry(lh, struct nfs_access_entry, lru);
+       if (lh == &nfsi->access_cache_entry_lru ||
+           cred != cache->cred)
+               cache = NULL;
+       if (cache == NULL)
+               goto out;
+       if (!nfs_have_delegated_attributes(inode) &&
+           !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+               goto out;
+       res->jiffies = cache->jiffies;
+       res->cred = cache->cred;
+       res->mask = cache->mask;
+       err = 0;
+out:
+       rcu_read_unlock();
+       return err;
+}
+
 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2332,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
        cache->cred = get_rpccred(set->cred);
        cache->mask = set->mask;
 
+       /* The above field assignments must be visible
+        * before this item appears on the lru.  We cannot easily
+        * use rcu_assign_pointer, so just force the memory barrier.
+        */
+       smp_wmb();
        nfs_access_add_rbtree(inode, cache);
 
        /* Update accounting */
@@ -2244,6 +2352,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
                                        &nfs_access_lru_list);
                spin_unlock(&nfs_access_lru_lock);
        }
+       nfs_access_cache_enforce_limit();
 }
 EXPORT_SYMBOL_GPL(nfs_access_add_cache);
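
The smp_wmb() above pairs with the lockless lookup added earlier in this file; together with kfree_rcu() in nfs_access_free_entry(), it makes the LRU safe to read under rcu_read_lock(). The ordering contract, as a sketch (not verbatim code):

/* Ordering between nfs_access_add_cache() and the lockless reader
 * nfs_access_get_cached_rcu():
 *
 *   writer (add)                        reader (lookup)
 *   ------------                        ---------------
 *   cache->cred = get_rpccred(cred);    rcu_read_lock();
 *   cache->mask = set->mask;            lh = rcu_dereference(lru.prev);
 *   smp_wmb();         <- publish ->    cache = list_entry(lh, ...);
 *   list_add_tail(&cache->lru, ...);    res->mask = cache->mask;
 *                                       rcu_read_unlock();
 *
 * kfree_rcu() guarantees that an entry seen inside the read-side
 * critical section stays valid until the section ends.
 */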
 
@@ -2267,10 +2376,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 
        trace_nfs_access_enter(inode);
 
-       status = nfs_access_get_cached(inode, cred, &cache);
+       status = nfs_access_get_cached_rcu(inode, cred, &cache);
+       if (status != 0)
+               status = nfs_access_get_cached(inode, cred, &cache);
        if (status == 0)
                goto out_cached;
 
+       status = -ECHILD;
+       if (mask & MAY_NOT_BLOCK)
+               goto out;
+
        /* Be clever: ask server to check for all possible rights */
        cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
        cache.cred = cred;
@@ -2321,9 +2436,6 @@ int nfs_permission(struct inode *inode, int mask)
        struct rpc_cred *cred;
        int res = 0;
 
-       if (mask & MAY_NOT_BLOCK)
-               return -ECHILD;
-
        nfs_inc_stats(inode, NFSIOS_VFSACCESS);
 
        if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2462,23 @@ force_lookup:
        if (!NFS_PROTO(inode)->access)
                goto out_notsup;
 
-       cred = rpc_lookup_cred();
-       if (!IS_ERR(cred)) {
-               res = nfs_do_access(inode, cred, mask);
-               put_rpccred(cred);
-       } else
+       /* Always try fast lookups first */
+       rcu_read_lock();
+       cred = rpc_lookup_cred_nonblock();
+       if (!IS_ERR(cred))
+               res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+       else
                res = PTR_ERR(cred);
+       rcu_read_unlock();
+       if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+               /* Fast lookup failed, try the slow way */
+               cred = rpc_lookup_cred();
+               if (!IS_ERR(cred)) {
+                       res = nfs_do_access(inode, cred, mask);
+                       put_rpccred(cred);
+               } else
+                       res = PTR_ERR(cred);
+       }
 out:
        if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
                res = -EACCES;
@@ -2364,6 +2487,9 @@ out:
                inode->i_sb->s_id, inode->i_ino, mask, res);
        return res;
 out_notsup:
+       if (mask & MAY_NOT_BLOCK)
+               return -ECHILD;
+
        res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
        if (res == 0)
                res = generic_permission(inode, mask);
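
One detail worth spelling out: rpc_lookup_cred_nonblock() returns a credential without taking a reference, so the lookup and the access check must both sit inside a single rcu_read_lock() section; only callers permitted to block retry with the reference-counted rpc_lookup_cred(). Distilled (a sketch of the pattern above, not a drop-in):

struct rpc_cred *cred;
int res;

rcu_read_lock();
cred = rpc_lookup_cred_nonblock();      /* no reference taken */
if (!IS_ERR(cred))
        res = nfs_do_access(inode, cred, mask | MAY_NOT_BLOCK);
else
        res = PTR_ERR(cred);
rcu_read_unlock();

if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
        cred = rpc_lookup_cred();       /* may sleep; takes a reference */
        if (!IS_ERR(cred)) {
                res = nfs_do_access(inode, cred, mask);
                put_rpccred(cred);
        } else
                res = PTR_ERR(cred);
}
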
index f11b9eed0de109d057cd86ef42c577400698992c..65ef6e00deee428a601e5534e2d4db6489199bf9 100644 (file)
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 {
        struct nfs_writeverf *verfp;
 
-       verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-                                     hdr->data->ds_idx);
+       verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+                                     hdr->ds_idx);
        WARN_ON_ONCE(verfp->committed >= 0);
        memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
        WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 {
        struct nfs_writeverf *verfp;
 
-       verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-                                        hdr->data->ds_idx);
+       verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+                                        hdr->ds_idx);
        if (verfp->committed < 0) {
                nfs_direct_set_hdr_verf(dreq, hdr);
                return 0;
@@ -715,7 +715,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
        struct nfs_direct_req *dreq = hdr->dreq;
        struct nfs_commit_info cinfo;
-       int bit = -1;
+       bool request_commit = false;
        struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +729,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
                dreq->flags = 0;
                dreq->error = hdr->error;
        }
-       if (dreq->error != 0)
-               bit = NFS_IOHDR_ERROR;
-       else {
+       if (dreq->error == 0) {
                dreq->count += hdr->good_bytes;
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-                       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-                       bit = NFS_IOHDR_NEED_RESCHED;
-               } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+               if (nfs_write_need_commit(hdr)) {
                        if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
-                               bit = NFS_IOHDR_NEED_RESCHED;
+                               request_commit = true;
                        else if (dreq->flags == 0) {
                                nfs_direct_set_hdr_verf(dreq, hdr);
-                               bit = NFS_IOHDR_NEED_COMMIT;
+                               request_commit = true;
                                dreq->flags = NFS_ODIRECT_DO_COMMIT;
                        } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
-                               if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+                               request_commit = true;
+                               if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
                                        dreq->flags =
                                                NFS_ODIRECT_RESCHED_WRITES;
-                                       bit = NFS_IOHDR_NEED_RESCHED;
-                               } else
-                                       bit = NFS_IOHDR_NEED_COMMIT;
                        }
                }
        }
@@ -759,9 +752,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 
                req = nfs_list_entry(hdr->pages.next);
                nfs_list_remove_request(req);
-               switch (bit) {
-               case NFS_IOHDR_NEED_RESCHED:
-               case NFS_IOHDR_NEED_COMMIT:
+               if (request_commit) {
                        kref_get(&req->wb_kref);
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
                }
index d2eba1c13b7eeab12f5eb5e53d3acb878f4f6220..1359c4a27393a6723fc3b22c244dd6422da4a9f3 100644 (file)
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
        BUG();
 }
 
-static void filelayout_reset_write(struct nfs_pgio_data *data)
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       struct rpc_task *task = &data->task;
+       struct rpc_task *task = &hdr->task;
 
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-                       data->task.tk_pid,
+                       hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
-                       data->args.count,
-                       (unsigned long long)data->args.offset);
+                       hdr->args.count,
+                       (unsigned long long)hdr->args.offset);
 
-               task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               task->tk_status = pnfs_write_done_resend_to_mds(hdr);
        }
 }
 
-static void filelayout_reset_read(struct nfs_pgio_data *data)
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       struct rpc_task *task = &data->task;
+       struct rpc_task *task = &hdr->task;
 
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-                       data->task.tk_pid,
+                       hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
-                       data->args.count,
-                       (unsigned long long)data->args.offset);
+                       hdr->args.count,
+                       (unsigned long long)hdr->args.offset);
 
-               task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               task->tk_status = pnfs_read_done_resend_to_mds(hdr);
        }
 }
 
@@ -243,18 +235,17 @@ wait_on_recovery:
 /* NFS_PROTO call done callback routines */
 
 static int filelayout_read_done_cb(struct rpc_task *task,
-                               struct nfs_pgio_data *data)
+                               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
        int err;
 
-       trace_nfs4_pnfs_read(data, task->tk_status);
-       err = filelayout_async_handle_error(task, data->args.context->state,
-                                           data->ds_clp, hdr->lseg);
+       trace_nfs4_pnfs_read(hdr, task->tk_status);
+       err = filelayout_async_handle_error(task, hdr->args.context->state,
+                                           hdr->ds_clp, hdr->lseg);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_MDS:
-               filelayout_reset_read(data);
+               filelayout_reset_read(hdr);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
@@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
  * rfc5661 is not clear about which credential should be used.
  */
 static void
-filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
 
        if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-           wdata->res.verf->committed == NFS_FILE_SYNC)
+           hdr->res.verf->committed == NFS_FILE_SYNC)
                return;
 
-       pnfs_set_layoutcommit(wdata);
+       pnfs_set_layoutcommit(hdr);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
                (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
  */
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+       if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return;
        }
-       if (filelayout_reset_to_mds(rdata->header->lseg)) {
+       if (filelayout_reset_to_mds(hdr->lseg)) {
                dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-               filelayout_reset_read(rdata);
+               filelayout_reset_read(hdr);
                rpc_exit(task, 0);
                return;
        }
-       rdata->pgio_done_cb = filelayout_read_done_cb;
+       hdr->pgio_done_cb = filelayout_read_done_cb;
 
-       if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
-                       &rdata->args.seq_args,
-                       &rdata->res.seq_res,
+       if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+                       &hdr->args.seq_args,
+                       &hdr->res.seq_res,
                        task))
                return;
-       if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
-                       rdata->args.lock_context, FMODE_READ) == -EIO)
+       if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+                       hdr->args.lock_context, FMODE_READ) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-       if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+       if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
-               nfs41_sequence_done(task, &rdata->res.seq_res);
+               nfs41_sequence_done(task, &hdr->res.seq_res);
                return;
        }
 
        /* Note this may cause RPC to be resent */
-       rdata->header->mds_ops->rpc_call_done(task, data);
+       hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *rdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
+       rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
-       struct nfs_pgio_data *rdata = data;
-       struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
+       struct nfs_pgio_header *hdr = data;
+       struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
        filelayout_fenceme(lo->plh_inode, lo);
-       nfs_put_client(rdata->ds_clp);
-       rdata->header->mds_ops->rpc_release(data);
+       nfs_put_client(hdr->ds_clp);
+       hdr->mds_ops->rpc_release(data);
 }
 
 static int filelayout_write_done_cb(struct rpc_task *task,
-                               struct nfs_pgio_data *data)
+                               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
        int err;
 
-       trace_nfs4_pnfs_write(data, task->tk_status);
-       err = filelayout_async_handle_error(task, data->args.context->state,
-                                           data->ds_clp, hdr->lseg);
+       trace_nfs4_pnfs_write(hdr, task->tk_status);
+       err = filelayout_async_handle_error(task, hdr->args.context->state,
+                                           hdr->ds_clp, hdr->lseg);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_MDS:
-               filelayout_reset_write(data);
+               filelayout_reset_write(hdr);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
 
-       filelayout_set_layoutcommit(data);
+       filelayout_set_layoutcommit(hdr);
        return 0;
 }
 
@@ -419,57 +408,57 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+       if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return;
        }
-       if (filelayout_reset_to_mds(wdata->header->lseg)) {
+       if (filelayout_reset_to_mds(hdr->lseg)) {
                dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-               filelayout_reset_write(wdata);
+               filelayout_reset_write(hdr);
                rpc_exit(task, 0);
                return;
        }
-       if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-                       &wdata->args.seq_args,
-                       &wdata->res.seq_res,
+       if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+                       &hdr->args.seq_args,
+                       &hdr->res.seq_res,
                        task))
                return;
-       if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
-                       wdata->args.lock_context, FMODE_WRITE) == -EIO)
+       if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+                       hdr->args.lock_context, FMODE_WRITE) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+       if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
-               nfs41_sequence_done(task, &wdata->res.seq_res);
+               nfs41_sequence_done(task, &hdr->res.seq_res);
                return;
        }
 
        /* Note this may cause RPC to be resent */
-       wdata->header->mds_ops->rpc_call_done(task, data);
+       hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
-       struct nfs_pgio_data *wdata = data;
+       struct nfs_pgio_header *hdr = data;
 
-       rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
+       rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
-       struct nfs_pgio_data *wdata = data;
-       struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
+       struct nfs_pgio_header *hdr = data;
+       struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
        filelayout_fenceme(lo->plh_inode, lo);
-       nfs_put_client(wdata->ds_clp);
-       wdata->header->mds_ops->rpc_release(data);
+       nfs_put_client(hdr->ds_clp);
+       hdr->mds_ops->rpc_release(data);
 }
 
 static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
 };
 
 static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_pgio_data *data)
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
-       loff_t offset = data->args.offset;
+       loff_t offset = hdr->args.offset;
        u32 j, idx;
        struct nfs_fh *fh;
 
        dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
                __func__, hdr->inode->i_ino,
-               data->args.pgbase, (size_t)data->args.count, offset);
+               hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
        /* Retrieve the correct rpc_client for the byte range */
        j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
 
        /* No multipath support. Use first DS */
        atomic_inc(&ds->ds_clp->cl_count);
-       data->ds_clp = ds->ds_clp;
-       data->ds_idx = idx;
+       hdr->ds_clp = ds->ds_clp;
+       hdr->ds_idx = idx;
        fh = nfs4_fl_select_ds_fh(lseg, j);
        if (fh)
-               data->args.fh = fh;
+               hdr->args.fh = fh;
 
-       data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-       data->mds_offset = offset;
+       hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+       hdr->mds_offset = offset;
 
        /* Perform an asynchronous read to ds */
-       nfs_initiate_pgio(ds_clnt, data,
+       nfs_initiate_pgio(ds_clnt, hdr,
                            &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
        return PNFS_ATTEMPTED;
 }
 
 /* Perform async writes. */
 static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 {
-       struct nfs_pgio_header *hdr = data->header;
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
-       loff_t offset = data->args.offset;
+       loff_t offset = hdr->args.offset;
        u32 j, idx;
        struct nfs_fh *fh;
 
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
                return PNFS_NOT_ATTEMPTED;
 
        dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
-               __func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+               __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
                offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
-       data->pgio_done_cb = filelayout_write_done_cb;
+       hdr->pgio_done_cb = filelayout_write_done_cb;
        atomic_inc(&ds->ds_clp->cl_count);
-       data->ds_clp = ds->ds_clp;
-       data->ds_idx = idx;
+       hdr->ds_clp = ds->ds_clp;
+       hdr->ds_idx = idx;
        fh = nfs4_fl_select_ds_fh(lseg, j);
        if (fh)
-               data->args.fh = fh;
-
-       data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+               hdr->args.fh = fh;
+       hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
        /* Perform an asynchronous write */
-       nfs_initiate_pgio(ds_clnt, data,
+       nfs_initiate_pgio(ds_clnt, hdr,
                                    &filelayout_write_call_ops, sync,
                                    RPC_TASK_SOFTCONN);
        return PNFS_ATTEMPTED;
@@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called while holding the inode (/cinfo) lock
  */
 static void
 filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
 {
        struct pnfs_layout_segment *freeme = NULL;
 
-       spin_lock(cinfo->lock);
        if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
                goto out;
        cinfo->ds->nwritten--;
@@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
        }
 out:
        nfs_request_remove_commit_list(req, cinfo);
-       spin_unlock(cinfo->lock);
-       pnfs_put_lseg(freeme);
+       pnfs_put_lseg_async(freeme);
 }
 
-static struct list_head *
-filelayout_choose_commit_list(struct nfs_page *req,
-                             struct pnfs_layout_segment *lseg,
-                             struct nfs_commit_info *cinfo)
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+                              struct pnfs_layout_segment *lseg,
+                              struct nfs_commit_info *cinfo)
+
 {
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
        u32 i, j;
        struct list_head *list;
        struct pnfs_commit_bucket *buckets;
 
-       if (fl->commit_through_mds)
-               return &cinfo->mds->list;
+       if (fl->commit_through_mds) {
+               list = &cinfo->mds->list;
+               spin_lock(cinfo->lock);
+               goto mds_commit;
+       }
 
        /* Note that we are calling nfs4_fl_calc_j_index on each page
         * that ends up being committed to a data server.  An attractive
@@ -1084,19 +1073,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
        }
        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
        cinfo->ds->nwritten++;
-       spin_unlock(cinfo->lock);
-       return list;
-}
 
-static void
-filelayout_mark_request_commit(struct nfs_page *req,
-                              struct pnfs_layout_segment *lseg,
-                              struct nfs_commit_info *cinfo)
-{
-       struct list_head *list;
-
-       list = filelayout_choose_commit_list(req, lseg, cinfo);
-       nfs_request_add_commit_list(req, list, cinfo);
+mds_commit:
+       /* Open-coded nfs_request_add_commit_list(): add req to the
+        * list without dropping the cinfo lock.
+        */
+       set_bit(PG_CLEAN, &(req)->wb_flags);
+       nfs_list_add_request(req, list);
+       cinfo->mds->ncommit++;
+       spin_unlock(cinfo->lock);
+       if (!cinfo->dreq) {
+               inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+               inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+                            BDI_RECLAIMABLE);
+               __mark_inode_dirty(req->wb_context->dentry->d_inode,
+                                  I_DIRTY_DATASYNC);
+       }
 }
 
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -1244,15 +1236,63 @@ restart:
        spin_unlock(cinfo->lock);
 }
 
+/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
+ *                                for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns the head request if one is found, otherwise NULL.
+ */
+static struct nfs_page *
+filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+       struct nfs_page *freq, *t;
+       struct pnfs_commit_bucket *b;
+       int i;
+
+       /* Linearly search the commit lists for each bucket until a matching
+        * request is found */
+       for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+               list_for_each_entry_safe(freq, t, &b->written, wb_list) {
+                       if (freq->wb_page == page)
+                               return freq->wb_head;
+               }
+               list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
+                       if (freq->wb_page == page)
+                               return freq->wb_head;
+               }
+       }
+
+       return NULL;
+}
+
+static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+       struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
+       struct pnfs_layout_segment *freeme;
+       int i;
+
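+       /* Put the requests in each non-empty bucket from @idx onwards back
+        * on a commit list via nfs_retry_commit() and drop the bucket's
+        * layout segment reference.
+        */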
+       for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
+               if (list_empty(&bucket->committing))
+                       continue;
+               nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
+               spin_lock(cinfo->lock);
+               freeme = bucket->clseg;
+               bucket->clseg = NULL;
+               spin_unlock(cinfo->lock);
+               pnfs_put_lseg(freeme);
+       }
+}
+
 static unsigned int
 alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 {
        struct pnfs_ds_commit_info *fl_cinfo;
        struct pnfs_commit_bucket *bucket;
        struct nfs_commit_data *data;
-       int i, j;
+       int i;
        unsigned int nreq = 0;
-       struct pnfs_layout_segment *freeme;
 
        fl_cinfo = cinfo->ds;
        bucket = fl_cinfo->buckets;
@@ -1272,16 +1312,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
        }
 
        /* Clean up on error */
-       for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
-               if (list_empty(&bucket->committing))
-                       continue;
-               nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
-               spin_lock(cinfo->lock);
-               freeme = bucket->clseg;
-               bucket->clseg = NULL;
-               spin_unlock(cinfo->lock);
-               pnfs_put_lseg(freeme);
-       }
+       filelayout_retry_commit(cinfo, i);
        /* Caller will clean up entries put on list */
        return nreq;
 }
@@ -1301,8 +1332,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                        data->lseg = NULL;
                        list_add(&data->pages, &list);
                        nreq++;
-               } else
+               } else {
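+                       /* Allocation failed: put the MDS pages and all DS
+                        * buckets back on their commit lists before
+                        * erroring out.
+                        */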
                        nfs_retry_commit(mds_pages, NULL, cinfo);
+                       filelayout_retry_commit(cinfo, 0);
+                       cinfo->completion_ops->error_cleanup(NFS_I(inode));
+                       return -ENOMEM;
+               }
        }
 
        nreq += alloc_ds_commits(cinfo, &list);
@@ -1380,6 +1415,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .clear_request_commit   = filelayout_clear_request_commit,
        .scan_commit_lists      = filelayout_scan_commit_lists,
        .recover_commit_reqs    = filelayout_recover_commit_reqs,
+       .search_commit_reqs     = filelayout_search_commit_reqs,
        .commit_pagelist        = filelayout_commit_pagelist,
        .read_pagelist          = filelayout_read_pagelist,
        .write_pagelist         = filelayout_write_pagelist,
index e2a0361e24c680165a93597ca3226b0fde5fd373..8540516f4d719bff7d80c98068ee77a634485ec9 100644 (file)
@@ -695,7 +695,7 @@ filelayout_get_device_info(struct inode *inode,
        if (pdev == NULL)
                return NULL;
 
-       pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
+       pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
        if (pages == NULL) {
                kfree(pdev);
                return NULL;
index 68921b01b792634be15cb8c213319b0ae1ecd02d..577a36f0a510b27cefe15429523750b6d21ca45a 100644 (file)
@@ -1002,6 +1002,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
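+/*
+ * Lockless variant for RCU-walk: returns 0 (or -ESTALE for a stale
+ * inode) if the cached attributes are still usable, and -ECHILD if a
+ * sleeping revalidation would be needed, so the caller can drop out of
+ * RCU-walk mode and retry.
+ */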
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+       if (!(NFS_I(inode)->cache_validity &
+                       (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+                       && !nfs_attribute_cache_expired(inode))
+               return NFS_STALE(inode) ? -ESTALE : 0;
+       return -ECHILD;
+}
+
 static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
index e2a45ae5014e0d4dc21c4bed16d1602f02fd88be..9056622d223005c087caac590ed8139743665277 100644 (file)
@@ -247,11 +247,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
 int nfs_iocounter_wait(struct nfs_io_counter *c);
 
 extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
-void nfs_rw_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_release(struct nfs_pgio_data *);
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
                      const struct rpc_call_ops *, int, int);
 void nfs_free_request(struct nfs_page *req);
 
@@ -451,6 +451,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 void nfs_mark_request_commit(struct nfs_page *req,
                             struct pnfs_layout_segment *lseg,
                             struct nfs_commit_info *cinfo);
+int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
                            int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
@@ -491,7 +492,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct rpc_timeout *timeparms,
                            const char *ip_addr);
index 8f854dde4150e1f3dc2ace238d2ddda44e580d63..d0fec260132add4ce0d8917cd1bb47c15408be90 100644 (file)
@@ -256,7 +256,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
        char *p = data + *result;
 
        acl = get_acl(inode, type);
-       if (!acl)
+       if (IS_ERR_OR_NULL(acl))
                return 0;
 
        posix_acl_release(acl);
index f0afa291fd5883278783f846e6b2770ef69232d8..809670eba52a7b2111c159f2a42bd2e1fd86b213 100644 (file)
@@ -795,41 +795,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return status;
 }
 
-static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        if (nfs3_async_handle_jukebox(task, inode))
                return -EAGAIN;
 
        nfs_invalidate_atime(inode);
-       nfs_refresh_inode(inode, &data->fattr);
+       nfs_refresh_inode(inode, &hdr->fattr);
        return 0;
 }
 
-static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+                                struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
 }
 
-static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+                                     struct nfs_pgio_header *hdr)
 {
        rpc_call_start(task);
        return 0;
 }
 
-static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        if (nfs3_async_handle_jukebox(task, inode))
                return -EAGAIN;
        if (task->tk_status >= 0)
-               nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
        return 0;
 }
 
-static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+                                 struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
 }
index ba2affa51941bc5ff304fce2288fe50b9c84b1e5..92193eddb41dc315868af5f437f083a3a4c0302a 100644 (file)
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
                        const nfs4_stateid *);
        int     (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
                        struct nfs_fsinfo *);
-       int     (*free_lock_state)(struct nfs_server *,
+       void    (*free_lock_state)(struct nfs_server *,
                        struct nfs4_lock_state *);
        const struct rpc_call_ops *call_sync_ops;
        const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,27 +129,17 @@ enum {
  * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
  */
 
-struct nfs4_lock_owner {
-       unsigned int lo_type;
-#define NFS4_ANY_LOCK_TYPE     (0U)
-#define NFS4_FLOCK_LOCK_TYPE   (1U << 0)
-#define NFS4_POSIX_LOCK_TYPE   (1U << 1)
-       union {
-               fl_owner_t posix_owner;
-               pid_t flock_owner;
-       } lo_u;
-};
-
 struct nfs4_lock_state {
-       struct list_head        ls_locks;       /* Other lock stateids */
-       struct nfs4_state *     ls_state;       /* Pointer to open state */
+       struct list_head                ls_locks;   /* Other lock stateids */
+       struct nfs4_state *             ls_state;   /* Pointer to open state */
 #define NFS_LOCK_INITIALIZED 0
 #define NFS_LOCK_LOST        1
-       unsigned long           ls_flags;
+       unsigned long                   ls_flags;
        struct nfs_seqid_counter        ls_seqid;
-       nfs4_stateid            ls_stateid;
-       atomic_t                ls_count;
-       struct nfs4_lock_owner  ls_owner;
+       nfs4_stateid                    ls_stateid;
+       atomic_t                        ls_count;
+       fl_owner_t                      ls_owner;
+       struct work_struct              ls_release;
 };
 
 /* bits for nfs4_state->flags */
@@ -337,11 +327,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
  */
 static inline void
 nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
-                        struct rpc_message *msg, struct nfs_pgio_data *wdata)
+                        struct rpc_message *msg, struct nfs_pgio_header *hdr)
 {
        if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
            !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
-               wdata->args.stable = NFS_FILE_SYNC;
+               hdr->args.stable = NFS_FILE_SYNC;
 }
 #else /* CONFIG_NFS_v4_1 */
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +359,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
 
 static inline void
 nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
-                        struct rpc_message *msg, struct nfs_pgio_data *wdata)
+                        struct rpc_message *msg, struct nfs_pgio_header *hdr)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
index aa9ef4876046aa17cc43c201d33ba6dbe49f1b8f..53e435a952602aa5037cc8a9605b399f646edc81 100644 (file)
@@ -855,6 +855,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
        };
        struct rpc_timeout ds_timeout;
        struct nfs_client *clp;
+       char buf[INET6_ADDRSTRLEN + 1];
+
+       if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+               return ERR_PTR(-EINVAL);
+       cl_init.hostname = buf;
 
        /*
         * Set an authflavor equal to the MDS value. Use the MDS nfs_client
index 4bf3d97cc5a094da598789ad58f05d92e10c3929..75ae8d22f067d55b7edfe77bbb44d2b0067880f8 100644 (file)
@@ -1952,6 +1952,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
        return status;
 }
 
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
 static int nfs4_opendata_access(struct rpc_cred *cred,
                                struct nfs4_opendata *opendata,
                                struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1974,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
                return 0;
 
        mask = 0;
-       /* don't check MAY_WRITE - a newly created file may not have
-        * write mode bits, but POSIX allows the creating process to write.
-        * use openflags to check for exec, because fmode won't
-        * always have FMODE_EXEC set when file open for exec. */
+       /*
+        * Use openflags to check for exec, because fmode won't
+        * always have FMODE_EXEC set when file open for exec.
+        */
        if (openflags & __FMODE_EXEC) {
                /* ONLY check for exec rights */
                mask = MAY_EXEC;
-       } else if (fmode & FMODE_READ)
+       } else if ((fmode & FMODE_READ) && !opendata->file_created)
                mask = MAY_READ;
 
        cache.cred = cred;
@@ -2216,8 +2224,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
        seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 
        ret = _nfs4_proc_open(opendata);
-       if (ret != 0)
+       if (ret != 0) {
+               if (ret == -ENOENT) {
+                       d_drop(opendata->dentry);
+                       d_add(opendata->dentry, NULL);
+                       nfs_set_verifier(opendata->dentry,
+                                        nfs_save_change_attribute(opendata->dir->d_inode));
+               }
                goto out;
+       }
 
        state = nfs4_opendata_to_nfs4_state(opendata);
        ret = PTR_ERR(state);
@@ -2647,6 +2662,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
        .rpc_release = nfs4_free_closedata,
 };
 
+static bool nfs4_state_has_opener(struct nfs4_state *state)
+{
+       /* first check existing openers */
+       if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
+           state->n_rdonly != 0)
+               return true;
+
+       if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
+           state->n_wronly != 0)
+               return true;
+
+       if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
+           state->n_rdwr != 0)
+               return true;
+
+       return false;
+}
+
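+/*
+ * Return-on-close is only worth attempting when nothing still holds
+ * the state open and we hold no read delegation; otherwise the layout
+ * must stay in place for the remaining users.
+ */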
+static bool nfs4_roc(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_open_context *ctx;
+       struct nfs4_state *state;
+
+       spin_lock(&inode->i_lock);
+       list_for_each_entry(ctx, &nfsi->open_files, list) {
+               state = ctx->state;
+               if (state == NULL)
+                       continue;
+               if (nfs4_state_has_opener(state)) {
+                       spin_unlock(&inode->i_lock);
+                       return false;
+               }
+       }
+       spin_unlock(&inode->i_lock);
+
+       if (nfs4_check_delegation(inode, FMODE_READ))
+               return false;
+
+       return pnfs_roc(inode);
+}
+
 /* 
  * It is possible for data to be read/written from a mem-mapped file 
  * after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2754,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
-       calldata->roc = pnfs_roc(state->inode);
+       calldata->roc = nfs4_roc(state->inode);
        nfs_sb_active(calldata->inode->i_sb);
 
        msg.rpc_argp = &calldata->arg;
@@ -4033,24 +4090,25 @@ static bool nfs4_error_stateid_expired(int err)
        return false;
 }
 
-void __nfs4_read_done_cb(struct nfs_pgio_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
 {
-       nfs_invalidate_atime(data->header->inode);
+       nfs_invalidate_atime(hdr->inode);
 }
 
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct nfs_server *server = NFS_SERVER(data->header->inode);
+       struct nfs_server *server = NFS_SERVER(hdr->inode);
 
-       trace_nfs4_read(data, task->tk_status);
-       if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+       trace_nfs4_read(hdr, task->tk_status);
+       if (nfs4_async_handle_error(task, server,
+                                   hdr->args.context->state) == -EAGAIN) {
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
 
-       __nfs4_read_done_cb(data);
+       __nfs4_read_done_cb(hdr);
        if (task->tk_status > 0)
-               renew_lease(server, data->timestamp);
+               renew_lease(server, hdr->timestamp);
        return 0;
 }
 
@@ -4068,54 +4126,59 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
        return true;
 }
 
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 
        dprintk("--> %s\n", __func__);
 
-       if (!nfs4_sequence_done(task, &data->res.seq_res))
+       if (!nfs4_sequence_done(task, &hdr->res.seq_res))
                return -EAGAIN;
-       if (nfs4_read_stateid_changed(task, &data->args))
+       if (nfs4_read_stateid_changed(task, &hdr->args))
                return -EAGAIN;
-       return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
-                                   nfs4_read_done_cb(task, data);
+       return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+                                   nfs4_read_done_cb(task, hdr);
 }
 
-static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+                                struct rpc_message *msg)
 {
-       data->timestamp   = jiffies;
-       data->pgio_done_cb = nfs4_read_done_cb;
+       hdr->timestamp   = jiffies;
+       hdr->pgio_done_cb = nfs4_read_done_cb;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
-       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
 }
 
-static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+                                     struct nfs_pgio_header *hdr)
 {
-       if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
-                       &data->args.seq_args,
-                       &data->res.seq_res,
+       if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
+                       &hdr->args.seq_args,
+                       &hdr->res.seq_res,
                        task))
                return 0;
-       if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
-                               data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
+       if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+                               hdr->args.lock_context,
+                               hdr->rw_ops->rw_mode) == -EIO)
                return -EIO;
-       if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+       if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
                return -EIO;
        return 0;
 }
 
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task,
+                             struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        
-       trace_nfs4_write(data, task->tk_status);
-       if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+       trace_nfs4_write(hdr, task->tk_status);
+       if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+                                   hdr->args.context->state) == -EAGAIN) {
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }
        if (task->tk_status >= 0) {
-               renew_lease(NFS_SERVER(inode), data->timestamp);
-               nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+               renew_lease(NFS_SERVER(inode), hdr->timestamp);
+               nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
        }
        return 0;
 }
@@ -4134,23 +4197,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
        return true;
 }
 
-static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       if (!nfs4_sequence_done(task, &data->res.seq_res))
+       if (!nfs4_sequence_done(task, &hdr->res.seq_res))
                return -EAGAIN;
-       if (nfs4_write_stateid_changed(task, &data->args))
+       if (nfs4_write_stateid_changed(task, &hdr->args))
                return -EAGAIN;
-       return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
-               nfs4_write_done_cb(task, data);
+       return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+               nfs4_write_done_cb(task, hdr);
 }
 
 static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
 {
-       const struct nfs_pgio_header *hdr = data->header;
-
        /* Don't request attributes for pNFS or O_DIRECT writes */
-       if (data->ds_clp != NULL || hdr->dreq != NULL)
+       if (hdr->ds_clp != NULL || hdr->dreq != NULL)
                return false;
        /* Otherwise, request attributes if and only if we don't hold
         * a delegation
@@ -4158,23 +4219,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
        return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
 
-static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+                                 struct rpc_message *msg)
 {
-       struct nfs_server *server = NFS_SERVER(data->header->inode);
+       struct nfs_server *server = NFS_SERVER(hdr->inode);
 
-       if (!nfs4_write_need_cache_consistency_data(data)) {
-               data->args.bitmask = NULL;
-               data->res.fattr = NULL;
+       if (!nfs4_write_need_cache_consistency_data(hdr)) {
+               hdr->args.bitmask = NULL;
+               hdr->res.fattr = NULL;
        } else
-               data->args.bitmask = server->cache_consistency_bitmask;
+               hdr->args.bitmask = server->cache_consistency_bitmask;
 
-       if (!data->pgio_done_cb)
-               data->pgio_done_cb = nfs4_write_done_cb;
-       data->res.server = server;
-       data->timestamp   = jiffies;
+       if (!hdr->pgio_done_cb)
+               hdr->pgio_done_cb = nfs4_write_done_cb;
+       hdr->res.server = server;
+       hdr->timestamp   = jiffies;
 
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
-       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+       nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
 }
 
 static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4881,6 +4943,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
                return scnprintf(buf, len, "tcp");
 }
 
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_setclientid *sc = calldata;
+
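+       /* Save a reference to the request's credential so that the caller
+        * can derive the GSS acceptor name once the task has completed.
+        */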
+       if (task->tk_status == 0)
+               sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+       .rpc_call_done = nfs4_setclientid_done,
+};
+
 /**
  * nfs4_proc_setclientid - Negotiate client ID
  * @clp: state data structure
@@ -4907,6 +4981,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                .rpc_resp = res,
                .rpc_cred = cred,
        };
+       struct rpc_task *task;
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clp->cl_rpcclient,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_setclientid_ops,
+               .callback_data = &setclientid,
+               .flags = RPC_TASK_TIMEOUT,
+       };
        int status;
 
        /* nfs_client_id4 */
@@ -4933,7 +5015,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
                setclientid.sc_name_len, setclientid.sc_name);
-       status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task)) {
+               status = PTR_ERR(task);
+               goto out;
+       }
+       status = task->tk_status;
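+       /* sc_cred is set by nfs4_setclientid_done() on success */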
+       if (setclientid.sc_cred) {
+               clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+               put_rpccred(setclientid.sc_cred);
+       }
+       rpc_put_task(task);
+out:
        trace_nfs4_setclientid(clp, status);
        dprintk("NFS reply setclientid: %d\n", status);
        return status;
@@ -4975,6 +5068,9 @@ struct nfs4_delegreturndata {
        unsigned long timestamp;
        struct nfs_fattr fattr;
        int rpc_status;
+       struct inode *inode;
+       bool roc;
+       u32 roc_barrier;
 };
 
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5084,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case 0:
                renew_lease(data->res.server, data->timestamp);
-               break;
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_DELEG_REVOKED:
        case -NFS4ERR_BAD_STATEID:
@@ -4996,6 +5091,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
        case -NFS4ERR_STALE_STATEID:
        case -NFS4ERR_EXPIRED:
                task->tk_status = 0;
+               if (data->roc)
+                       pnfs_roc_set_barrier(data->inode, data->roc_barrier);
                break;
        default:
                if (nfs4_async_handle_error(task, data->res.server, NULL) ==
@@ -5009,6 +5106,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 
 static void nfs4_delegreturn_release(void *calldata)
 {
+       struct nfs4_delegreturndata *data = calldata;
+
+       if (data->roc)
+               pnfs_roc_release(data->inode);
        kfree(calldata);
 }
 
@@ -5018,6 +5119,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
        d_data = (struct nfs4_delegreturndata *)data;
 
+       if (d_data->roc &&
+           pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+               return;
+
        nfs4_setup_sequence(d_data->res.server,
                        &d_data->args.seq_args,
                        &d_data->res.seq_res,
@@ -5061,6 +5166,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
+       data->inode = inode;
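+       /* Only attempt a return-on-close if no files remain open */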
+       data->roc = list_empty(&NFS_I(inode)->open_files) ?
+                   pnfs_roc(inode) : false;
 
        task_setup_data.callback_data = data;
        msg.rpc_argp = &data->args;
@@ -5834,8 +5942,10 @@ struct nfs_release_lockowner_data {
 static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs_release_lockowner_data *data = calldata;
-       nfs40_setup_sequence(data->server,
-                               &data->args.seq_args, &data->res.seq_res, task);
+       struct nfs_server *server = data->server;
+       nfs40_setup_sequence(server, &data->args.seq_args,
+                               &data->res.seq_res, task);
+       data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
        data->timestamp = jiffies;
 }
 
@@ -5852,6 +5962,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
                break;
        case -NFS4ERR_STALE_CLIENTID:
        case -NFS4ERR_EXPIRED:
+               nfs4_schedule_lease_recovery(server->nfs_client);
+               break;
        case -NFS4ERR_LEASE_MOVED:
        case -NFS4ERR_DELAY:
                if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
@@ -5872,7 +5984,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
        .rpc_release = nfs4_release_lockowner_release,
 };
 
-static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
        struct nfs_release_lockowner_data *data;
        struct rpc_message msg = {
@@ -5880,11 +5993,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
        };
 
        if (server->nfs_client->cl_mvops->minor_version != 0)
-               return -EINVAL;
+               return;
 
        data = kmalloc(sizeof(*data), GFP_NOFS);
        if (!data)
-               return -ENOMEM;
+               return;
        data->lsp = lsp;
        data->server = server;
        data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6008,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
        msg.rpc_resp = &data->res;
        nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
        rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
-       return 0;
 }
 
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -8182,7 +8294,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
        return ret;
 }
 
-static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
        struct rpc_task *task;
        struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8303,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
        task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
        nfs4_free_lock_state(server, lsp);
        if (IS_ERR(task))
-               return PTR_ERR(task);
+               return;
        rpc_put_task(task);
-       return 0;
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
index 42f12118216700d2cde7c6093b8470b1f20ab674..a043f618cd5a30ef35a8ec63d54ff12034a2387f 100644 (file)
@@ -787,33 +787,36 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
        struct nfs4_lock_state *pos;
        list_for_each_entry(pos, &state->lock_states, ls_locks) {
-               if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+               if (pos->ls_owner != fl_owner)
                        continue;
-               switch (pos->ls_owner.lo_type) {
-               case NFS4_POSIX_LOCK_TYPE:
-                       if (pos->ls_owner.lo_u.posix_owner != fl_owner)
-                               continue;
-                       break;
-               case NFS4_FLOCK_LOCK_TYPE:
-                       if (pos->ls_owner.lo_u.flock_owner != fl_pid)
-                               continue;
-               }
                atomic_inc(&pos->ls_count);
                return pos;
        }
        return NULL;
 }
 
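+/*
+ * Workqueue callback that sends the RPC to free a lock state.  Deferring
+ * this to nfsiod means the final nfs4_put_lock_state() does not have to
+ * issue the RPC itself (see queue_work() in nfs4_put_lock_state()).
+ */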
+static void
+free_lock_state_work(struct work_struct *work)
+{
+       struct nfs4_lock_state *lsp = container_of(work,
+                                       struct nfs4_lock_state, ls_release);
+       struct nfs4_state *state = lsp->ls_state;
+       struct nfs_server *server = state->owner->so_server;
+       struct nfs_client *clp = server->nfs_client;
+
+       clp->cl_mvops->free_lock_state(server, lsp);
+}
+
 /*
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
        struct nfs4_lock_state *lsp;
        struct nfs_server *server = state->owner->so_server;
@@ -824,21 +827,12 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
        nfs4_init_seqid_counter(&lsp->ls_seqid);
        atomic_set(&lsp->ls_count, 1);
        lsp->ls_state = state;
-       lsp->ls_owner.lo_type = type;
-       switch (lsp->ls_owner.lo_type) {
-       case NFS4_FLOCK_LOCK_TYPE:
-               lsp->ls_owner.lo_u.flock_owner = fl_pid;
-               break;
-       case NFS4_POSIX_LOCK_TYPE:
-               lsp->ls_owner.lo_u.posix_owner = fl_owner;
-               break;
-       default:
-               goto out_free;
-       }
+       lsp->ls_owner = fl_owner;
        lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
        if (lsp->ls_seqid.owner_id < 0)
                goto out_free;
        INIT_LIST_HEAD(&lsp->ls_locks);
+       INIT_WORK(&lsp->ls_release, free_lock_state_work);
        return lsp;
 out_free:
        kfree(lsp);
@@ -857,13 +851,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
        struct nfs4_lock_state *lsp, *new = NULL;
        
        for(;;) {
                spin_lock(&state->state_lock);
-               lsp = __nfs4_find_lock_state(state, owner, pid, type);
+               lsp = __nfs4_find_lock_state(state, owner);
                if (lsp != NULL)
                        break;
                if (new != NULL) {
@@ -874,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
                        break;
                }
                spin_unlock(&state->state_lock);
-               new = nfs4_alloc_lock_state(state, owner, pid, type);
+               new = nfs4_alloc_lock_state(state, owner);
                if (new == NULL)
                        return NULL;
        }
@@ -902,13 +896,12 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
        if (list_empty(&state->lock_states))
                clear_bit(LK_STATE_IN_USE, &state->flags);
        spin_unlock(&state->state_lock);
-       server = state->owner->so_server;
-       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-               struct nfs_client *clp = server->nfs_client;
-
-               clp->cl_mvops->free_lock_state(server, lsp);
-       } else
+       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
+               queue_work(nfsiod_workqueue, &lsp->ls_release);
+       else {
+               server = state->owner->so_server;
                nfs4_free_lock_state(server, lsp);
+       }
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -935,13 +928,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
        if (fl->fl_ops != NULL)
                return 0;
-       if (fl->fl_flags & FL_POSIX)
-               lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
-       else if (fl->fl_flags & FL_FLOCK)
-               lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
-                               NFS4_FLOCK_LOCK_TYPE);
-       else
-               return -EINVAL;
+       lsp = nfs4_get_lock_state(state, fl->fl_owner);
        if (lsp == NULL)
                return -ENOMEM;
        fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +942,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 {
        struct nfs4_lock_state *lsp;
        fl_owner_t fl_owner;
-       pid_t fl_pid;
        int ret = -ENOENT;
 
 
@@ -966,9 +952,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
                goto out;
 
        fl_owner = lockowner->l_owner;
-       fl_pid = lockowner->l_pid;
        spin_lock(&state->state_lock);
-       lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+       lsp = __nfs4_find_lock_state(state, fl_owner);
        if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
                ret = -EIO;
        else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
index 0a744f3a86f6f592c9913f60a4b99c5d715d6324..1c32adbe728df548bcc92ee175b4d6061b3b4489 100644 (file)
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
 
 DECLARE_EVENT_CLASS(nfs4_read_event,
                TP_PROTO(
-                       const struct nfs_pgio_data *data,
+                       const struct nfs_pgio_header *hdr,
                        int error
                ),
 
-               TP_ARGS(data, error),
+               TP_ARGS(hdr, error),
 
                TP_STRUCT__entry(
                        __field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
                ),
 
                TP_fast_assign(
-                       const struct inode *inode = data->header->inode;
+                       const struct inode *inode = hdr->inode;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-                       __entry->offset = data->args.offset;
-                       __entry->count = data->args.count;
+                       __entry->offset = hdr->args.offset;
+                       __entry->count = hdr->args.count;
                        __entry->error = error;
                ),
 
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
 #define DEFINE_NFS4_READ_EVENT(name) \
        DEFINE_EVENT(nfs4_read_event, name, \
                        TP_PROTO( \
-                               const struct nfs_pgio_data *data, \
+                               const struct nfs_pgio_header *hdr, \
                                int error \
                        ), \
-                       TP_ARGS(data, error))
+                       TP_ARGS(hdr, error))
 DEFINE_NFS4_READ_EVENT(nfs4_read);
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
 
 DECLARE_EVENT_CLASS(nfs4_write_event,
                TP_PROTO(
-                       const struct nfs_pgio_data *data,
+                       const struct nfs_pgio_header *hdr,
                        int error
                ),
 
-               TP_ARGS(data, error),
+               TP_ARGS(hdr, error),
 
                TP_STRUCT__entry(
                        __field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
                ),
 
                TP_fast_assign(
-                       const struct inode *inode = data->header->inode;
+                       const struct inode *inode = hdr->inode;
                        __entry->dev = inode->i_sb->s_dev;
                        __entry->fileid = NFS_FILEID(inode);
                        __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-                       __entry->offset = data->args.offset;
-                       __entry->count = data->args.count;
+                       __entry->offset = hdr->args.offset;
+                       __entry->count = hdr->args.count;
                        __entry->error = error;
                ),
 
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
 #define DEFINE_NFS4_WRITE_EVENT(name) \
        DEFINE_EVENT(nfs4_write_event, name, \
                        TP_PROTO( \
-                               const struct nfs_pgio_data *data, \
+                               const struct nfs_pgio_header *hdr, \
                                int error \
                        ), \
-                       TP_ARGS(data, error))
+                       TP_ARGS(hdr, error))
 DEFINE_NFS4_WRITE_EVENT(nfs4_write);
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
index 939ae606cfa4c96d4d607afd8886f5d45ed85896..e13b59d8d9aa1374990c5eee9acec623c1193537 100644 (file)
@@ -7092,7 +7092,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
        if (!status)
                status = decode_sequence(xdr, &res->seq_res, rqstp);
        if (!status)
-               status = decode_reclaim_complete(xdr, (void *)NULL);
+               status = decode_reclaim_complete(xdr, NULL);
        return status;
 }
 
index 611320753db2117469765bbdee79f74b1d721424..ae05278b3761df60ed622195d011df5a1059ab8a 100644 (file)
@@ -439,22 +439,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
        objlayout_read_done(&objios->oir, status, objios->sync);
 }
 
-int objio_read_pagelist(struct nfs_pgio_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = rdata->header;
        struct objio_state *objios;
        int ret;
 
        ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
-                       hdr->lseg, rdata->args.pages, rdata->args.pgbase,
-                       rdata->args.offset, rdata->args.count, rdata,
+                       hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+                       hdr->args.offset, hdr->args.count, hdr,
                        GFP_KERNEL, &objios);
        if (unlikely(ret))
                return ret;
 
        objios->ios->done = _read_done;
        dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
-               rdata->args.offset, rdata->args.count);
+               hdr->args.offset, hdr->args.count);
        ret = ore_read(objios->ios);
        if (unlikely(ret))
                objio_free_result(&objios->oir);
@@ -487,11 +486,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
 static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 {
        struct objio_state *objios = priv;
-       struct nfs_pgio_data *wdata = objios->oir.rpcdata;
-       struct address_space *mapping = wdata->header->inode->i_mapping;
+       struct nfs_pgio_header *hdr = objios->oir.rpcdata;
+       struct address_space *mapping = hdr->inode->i_mapping;
        pgoff_t index = offset / PAGE_SIZE;
        struct page *page;
-       loff_t i_size = i_size_read(wdata->header->inode);
+       loff_t i_size = i_size_read(hdr->inode);
 
        if (offset >= i_size) {
                *uptodate = true;
@@ -531,15 +530,14 @@ static const struct _ore_r4w_op _r4w_op = {
        .put_page = &__r4w_put_page,
 };
 
-int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        struct objio_state *objios;
        int ret;
 
        ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
-                       hdr->lseg, wdata->args.pages, wdata->args.pgbase,
-                       wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+                       hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+                       hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
                        &objios);
        if (unlikely(ret))
                return ret;
@@ -551,7 +549,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
                objios->ios->done = _write_done;
 
        dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
-               wdata->args.offset, wdata->args.count);
+               hdr->args.offset, hdr->args.count);
        ret = ore_write(objios->ios);
        if (unlikely(ret)) {
                objio_free_result(&objios->oir);
index 765d3f54e9860b18404dea42a757f899966ccda8..697a16d11fac3204b0574c084965662988626f88 100644 (file)
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
 static void _rpc_read_complete(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *rdata;
+       struct nfs_pgio_header *hdr;
 
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       rdata = container_of(task, struct nfs_pgio_data, task);
+       hdr = container_of(task, struct nfs_pgio_header, task);
 
-       pnfs_ld_read_done(rdata);
+       pnfs_ld_read_done(hdr);
 }
 
 void
 objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 {
-       struct nfs_pgio_data *rdata = oir->rpcdata;
+       struct nfs_pgio_header *hdr = oir->rpcdata;
 
-       oir->status = rdata->task.tk_status = status;
+       oir->status = hdr->task.tk_status = status;
        if (status >= 0)
-               rdata->res.count = status;
+               hdr->res.count = status;
        else
-               rdata->header->pnfs_error = status;
+               hdr->pnfs_error = status;
        objlayout_iodone(oir);
        /* must not use oir after this point */
 
        dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
-               status, rdata->res.eof, sync);
+               status, hdr->res.eof, sync);
 
        if (sync)
-               pnfs_ld_read_done(rdata);
+               pnfs_ld_read_done(hdr);
        else {
-               INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
-               schedule_work(&rdata->task.u.tk_work);
+               INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
+               schedule_work(&hdr->task.u.tk_work);
        }
 }
 
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
  * Perform sync or async reads.
  */
 enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = rdata->header;
        struct inode *inode = hdr->inode;
-       loff_t offset = rdata->args.offset;
-       size_t count = rdata->args.count;
+       loff_t offset = hdr->args.offset;
+       size_t count = hdr->args.count;
        int err;
        loff_t eof;
 
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
        if (unlikely(offset + count > eof)) {
                if (offset >= eof) {
                        err = 0;
-                       rdata->res.count = 0;
-                       rdata->res.eof = 1;
+                       hdr->res.count = 0;
+                       hdr->res.eof = 1;
                        /*FIXME: do we need to call pnfs_ld_read_done() */
                        goto out;
                }
                count = eof - offset;
        }
 
-       rdata->res.eof = (offset + count) >= eof;
-       _fix_verify_io_params(hdr->lseg, &rdata->args.pages,
-                             &rdata->args.pgbase,
-                             rdata->args.offset, rdata->args.count);
+       hdr->res.eof = (offset + count) >= eof;
+       _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+                             &hdr->args.pgbase,
+                             hdr->args.offset, hdr->args.count);
 
        dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
-               __func__, inode->i_ino, offset, count, rdata->res.eof);
+               __func__, inode->i_ino, offset, count, hdr->res.eof);
 
-       err = objio_read_pagelist(rdata);
+       err = objio_read_pagelist(hdr);
  out:
        if (unlikely(err)) {
                hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
 static void _rpc_write_complete(struct work_struct *work)
 {
        struct rpc_task *task;
-       struct nfs_pgio_data *wdata;
+       struct nfs_pgio_header *hdr;
 
        dprintk("%s enter\n", __func__);
        task = container_of(work, struct rpc_task, u.tk_work);
-       wdata = container_of(task, struct nfs_pgio_data, task);
+       hdr = container_of(task, struct nfs_pgio_header, task);
 
-       pnfs_ld_write_done(wdata);
+       pnfs_ld_write_done(hdr);
 }
 
 void
 objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 {
-       struct nfs_pgio_data *wdata = oir->rpcdata;
+       struct nfs_pgio_header *hdr = oir->rpcdata;
 
-       oir->status = wdata->task.tk_status = status;
+       oir->status = hdr->task.tk_status = status;
        if (status >= 0) {
-               wdata->res.count = status;
-               wdata->verf.committed = oir->committed;
+               hdr->res.count = status;
+               hdr->verf.committed = oir->committed;
        } else {
-               wdata->header->pnfs_error = status;
+               hdr->pnfs_error = status;
        }
        objlayout_iodone(oir);
        /* must not use oir after this point */
 
        dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
-               status, wdata->verf.committed, sync);
+               status, hdr->verf.committed, sync);
 
        if (sync)
-               pnfs_ld_write_done(wdata);
+               pnfs_ld_write_done(hdr);
        else {
-               INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
-               schedule_work(&wdata->task.u.tk_work);
+               INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
+               schedule_work(&hdr->task.u.tk_work);
        }
 }
 
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
  * Perform sync or async writes.
  */
 enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_data *wdata,
-                        int how)
+objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        int err;
 
-       _fix_verify_io_params(hdr->lseg, &wdata->args.pages,
-                             &wdata->args.pgbase,
-                             wdata->args.offset, wdata->args.count);
+       _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+                             &hdr->args.pgbase,
+                             hdr->args.offset, hdr->args.count);
 
-       err = objio_write_pagelist(wdata, how);
+       err = objio_write_pagelist(hdr, how);
        if (unlikely(err)) {
                hdr->pnfs_error = err;
                dprintk("%s: Returned Error %d\n", __func__, err);
index 01e041029a6ca6ab5be86062dbb54775c3ae87f4..fd13f1d2f136d6c35dd29b56b405a2815f17bef8 100644 (file)
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
  */
 extern void objio_free_result(struct objlayout_io_res *oir);
 
-extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
 
 /*
  * callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
 extern void objlayout_free_lseg(struct pnfs_layout_segment *);
 
 extern enum pnfs_try_status objlayout_read_pagelist(
-       struct nfs_pgio_data *);
+       struct nfs_pgio_header *);
 
 extern enum pnfs_try_status objlayout_write_pagelist(
-       struct nfs_pgio_data *,
+       struct nfs_pgio_header *,
        int how);
 
 extern void objlayout_encode_layoutcommit(
index 0be5050638f7c026f14eb8eb728530a6238160d0..ba491926df5f7df2db1e224c96e7e070bd36dcec 100644 (file)
@@ -141,16 +141,24 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
  * @req - request in group that is to be locked
  *
  * this lock must be held if modifying the page group list
+ *
+ * Returns the result of wait_on_bit_lock(): 0 on success, < 0 on error.
  */
-void
-nfs_page_group_lock(struct nfs_page *req)
+int
+nfs_page_group_lock(struct nfs_page *req, bool wait)
 {
        struct nfs_page *head = req->wb_head;
+       int ret;
 
        WARN_ON_ONCE(head != head->wb_head);
 
-       wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
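+       /* when @wait is true, retry until the lock has been obtained */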
+       do {
+               ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
                        TASK_UNINTERRUPTIBLE);
+       } while (wait && ret != 0);
+
+       WARN_ON_ONCE(ret > 0);
+       return ret;
 }
 
 /*
@@ -211,7 +219,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
 {
        bool ret;
 
-       nfs_page_group_lock(req);
+       nfs_page_group_lock(req, true);
        ret = nfs_page_group_sync_on_bit_locked(req, bit);
        nfs_page_group_unlock(req);
 
@@ -454,123 +462,72 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 
-static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
-{
-       return container_of(hdr, struct nfs_rw_header, header);
-}
-
-/**
- * nfs_rw_header_alloc - Allocate a header for a read or write
- * @ops: Read or write function vector
- */
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
 {
-       struct nfs_rw_header *header = ops->rw_alloc_header();
-
-       if (header) {
-               struct nfs_pgio_header *hdr = &header->header;
+       struct nfs_pgio_header *hdr = ops->rw_alloc_header();
 
+       if (hdr) {
                INIT_LIST_HEAD(&hdr->pages);
                spin_lock_init(&hdr->lock);
-               atomic_set(&hdr->refcnt, 0);
                hdr->rw_ops = ops;
        }
-       return header;
+       return hdr;
 }
-EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
 
 /*
- * nfs_rw_header_free - Free a read or write header
+ * nfs_pgio_header_free - Free a read or write header
  * @hdr: The header to free
  */
-void nfs_rw_header_free(struct nfs_pgio_header *hdr)
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
 {
-       hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
+       hdr->rw_ops->rw_free_header(hdr);
 }
-EXPORT_SYMBOL_GPL(nfs_rw_header_free);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
 
 /**
- * nfs_pgio_data_alloc - Allocate pageio data
- * @hdr: The header making a request
- * @pagecount: Number of pages to create
- */
-static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
-                                                unsigned int pagecount)
-{
-       struct nfs_pgio_data *data, *prealloc;
-
-       prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
-       if (prealloc->header == NULL)
-               data = prealloc;
-       else
-               data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               goto out;
-
-       if (nfs_pgarray_set(&data->pages, pagecount)) {
-               data->header = hdr;
-               atomic_inc(&hdr->refcnt);
-       } else {
-               if (data != prealloc)
-                       kfree(data);
-               data = NULL;
-       }
-out:
-       return data;
-}
-
-/**
- * nfs_pgio_data_release - Properly free pageio data
- * @data: The data to release
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs taken by nfs_generic_pgio(), so that
+ * nfs_generic_pgio() may be called on @hdr again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
  */
-void nfs_pgio_data_release(struct nfs_pgio_data *data)
+void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
-
-       put_nfs_open_context(data->args.context);
-       if (data->pages.pagevec != data->pages.page_array)
-               kfree(data->pages.pagevec);
-       if (data == &pageio_header->rpc_data) {
-               data->header = NULL;
-               data = NULL;
-       }
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
-       /* Note: we only free the rpc_task after callbacks are done.
-        * See the comment in rpc_free_task() for why
-        */
-       kfree(data);
+       put_nfs_open_context(hdr->args.context);
+       if (hdr->page_array.pagevec != hdr->page_array.page_array)
+               kfree(hdr->page_array.pagevec);
 }
-EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
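The conditional kfree() above mirrors how the page vector is stored: nfs_pgarray_set() points pagevec at a small array embedded in the header when the I/O fits, and only falls back to a heap allocation for larger requests, so only the heap case may be freed. A hedged userspace model of the inline-or-heap pattern (the capacity and struct layout here are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

#define INLINE_PAGES 8                  /* illustrative inline capacity */

struct page_array {
        void **pagevec;                 /* points at inline or heap storage */
        void *inline_pages[INLINE_PAGES];
};

static int pgarray_set(struct page_array *p, unsigned int count)
{
        if (count <= INLINE_PAGES) {
                p->pagevec = p->inline_pages;
                return 1;
        }
        p->pagevec = calloc(count, sizeof(void *));
        return p->pagevec != NULL;
}

static void pgarray_destroy(struct page_array *p)
{
        /* free only when the heap fallback was used */
        if (p->pagevec != p->inline_pages)
                free(p->pagevec);
}

int main(void)
{
        struct page_array small, large;

        printf("small ok: %d\n", pgarray_set(&small, 4));
        printf("large ok: %d\n", pgarray_set(&large, 64));
        pgarray_destroy(&small);
        pgarray_destroy(&large);
        return 0;
}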
 
 /**
  * nfs_pgio_rpcsetup - Set up arguments for a pageio call
- * @data: The pageio data
+ * @hdr: The pageio header
  * @count: Number of bytes to read
  * @offset: Initial offset
  * @how: How to commit data (writes only)
  * @cinfo: Commit information for the call (writes only)
  */
-static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
                              unsigned int count, unsigned int offset,
                              int how, struct nfs_commit_info *cinfo)
 {
-       struct nfs_page *req = data->header->req;
+       struct nfs_page *req = hdr->req;
 
        /* Set up the RPC argument and reply structs
-        * NB: take care not to mess about with data->commit et al. */
+        * NB: take care not to mess about with hdr->commit et al. */
 
-       data->args.fh     = NFS_FH(data->header->inode);
-       data->args.offset = req_offset(req) + offset;
+       hdr->args.fh     = NFS_FH(hdr->inode);
+       hdr->args.offset = req_offset(req) + offset;
        /* pnfs_set_layoutcommit needs this */
-       data->mds_offset = data->args.offset;
-       data->args.pgbase = req->wb_pgbase + offset;
-       data->args.pages  = data->pages.pagevec;
-       data->args.count  = count;
-       data->args.context = get_nfs_open_context(req->wb_context);
-       data->args.lock_context = req->wb_lock_context;
-       data->args.stable  = NFS_UNSTABLE;
+       hdr->mds_offset = hdr->args.offset;
+       hdr->args.pgbase = req->wb_pgbase + offset;
+       hdr->args.pages  = hdr->page_array.pagevec;
+       hdr->args.count  = count;
+       hdr->args.context = get_nfs_open_context(req->wb_context);
+       hdr->args.lock_context = req->wb_lock_context;
+       hdr->args.stable  = NFS_UNSTABLE;
        switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
        case 0:
                break;
@@ -578,59 +535,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
                if (nfs_reqs_to_commit(cinfo))
                        break;
        default:
-               data->args.stable = NFS_FILE_SYNC;
+               hdr->args.stable = NFS_FILE_SYNC;
        }
 
-       data->res.fattr   = &data->fattr;
-       data->res.count   = count;
-       data->res.eof     = 0;
-       data->res.verf    = &data->verf;
-       nfs_fattr_init(&data->fattr);
+       hdr->res.fattr   = &hdr->fattr;
+       hdr->res.count   = count;
+       hdr->res.eof     = 0;
+       hdr->res.verf    = &hdr->verf;
+       nfs_fattr_init(&hdr->fattr);
 }
 
 /**
- * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
  * @task: The current task
- * @calldata: pageio data to prepare
+ * @calldata: pageio header to prepare
  */
 static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
 {
-       struct nfs_pgio_data *data = calldata;
+       struct nfs_pgio_header *hdr = calldata;
        int err;
-       err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+       err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
        if (err)
                rpc_exit(task, err);
 }
 
-int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
                      const struct rpc_call_ops *call_ops, int how, int flags)
 {
        struct rpc_task *task;
        struct rpc_message msg = {
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->header->cred,
+               .rpc_argp = &hdr->args,
+               .rpc_resp = &hdr->res,
+               .rpc_cred = hdr->cred,
        };
        struct rpc_task_setup task_setup_data = {
                .rpc_client = clnt,
-               .task = &data->task,
+               .task = &hdr->task,
                .rpc_message = &msg,
                .callback_ops = call_ops,
-               .callback_data = data,
+               .callback_data = hdr,
                .workqueue = nfsiod_workqueue,
                .flags = RPC_TASK_ASYNC | flags,
        };
        int ret = 0;
 
-       data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+       hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
 
        dprintk("NFS: %5u initiated pgio call "
                "(req %s/%llu, %u bytes @ offset %llu)\n",
-               data->task.tk_pid,
-               data->header->inode->i_sb->s_id,
-               (unsigned long long)NFS_FILEID(data->header->inode),
-               data->args.count,
-               (unsigned long long)data->args.offset);
+               hdr->task.tk_pid,
+               hdr->inode->i_sb->s_id,
+               (unsigned long long)NFS_FILEID(hdr->inode),
+               hdr->args.count,
+               (unsigned long long)hdr->args.offset);
 
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task)) {
@@ -657,22 +614,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
                          struct nfs_pgio_header *hdr)
 {
        set_bit(NFS_IOHDR_REDO, &hdr->flags);
-       nfs_pgio_data_release(hdr->data);
-       hdr->data = NULL;
+       nfs_pgio_data_destroy(hdr);
+       hdr->completion_ops->completion(hdr);
        desc->pg_completion_ops->error_cleanup(&desc->pg_list);
        return -ENOMEM;
 }
 
 /**
  * nfs_pgio_release - Release pageio data
- * @calldata: The pageio data to release
+ * @calldata: The pageio header to release
  */
 static void nfs_pgio_release(void *calldata)
 {
-       struct nfs_pgio_data *data = calldata;
-       if (data->header->rw_ops->rw_release)
-               data->header->rw_ops->rw_release(data);
-       nfs_pgio_data_release(data);
+       struct nfs_pgio_header *hdr = calldata;
+       if (hdr->rw_ops->rw_release)
+               hdr->rw_ops->rw_release(hdr);
+       nfs_pgio_data_destroy(hdr);
+       hdr->completion_ops->completion(hdr);
 }
 
 /**
@@ -713,22 +671,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
 /**
  * nfs_pgio_result - Basic pageio error handling
  * @task: The task that ran
- * @calldata: Pageio data to check
+ * @calldata: Pageio header to check
  */
 static void nfs_pgio_result(struct rpc_task *task, void *calldata)
 {
-       struct nfs_pgio_data *data = calldata;
-       struct inode *inode = data->header->inode;
+       struct nfs_pgio_header *hdr = calldata;
+       struct inode *inode = hdr->inode;
 
        dprintk("NFS: %s: %5u, (status %d)\n", __func__,
                task->tk_pid, task->tk_status);
 
-       if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+       if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
                return;
        if (task->tk_status < 0)
-               nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+               nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
        else
-               data->header->rw_ops->rw_result(task, data);
+               hdr->rw_ops->rw_result(task, hdr);
 }
 
 /*
@@ -744,17 +702,16 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 {
        struct nfs_page         *req;
        struct page             **pages;
-       struct nfs_pgio_data    *data;
        struct list_head *head = &desc->pg_list;
        struct nfs_commit_info cinfo;
+       unsigned int pagecount;
 
-       data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
-                                                          desc->pg_count));
-       if (!data)
+       pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
+       if (!nfs_pgarray_set(&hdr->page_array, pagecount))
                return nfs_pgio_error(desc, hdr);
 
        nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-       pages = data->pages.pagevec;
+       pages = hdr->page_array.pagevec;
        while (!list_empty(head)) {
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
@@ -767,8 +724,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
                desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
        /* Set up the argument struct */
-       nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
-       hdr->data = data;
+       nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
        desc->pg_rpc_callops = &nfs_pgio_common_ops;
        return 0;
 }
@@ -776,25 +732,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
 
 static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_rw_header *rw_hdr;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-       if (!rw_hdr) {
+       hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+       if (!hdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                return -ENOMEM;
        }
-       hdr = &rw_hdr->header;
-       nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
-       atomic_inc(&hdr->refcnt);
+       nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
        ret = nfs_generic_pgio(desc, hdr);
        if (ret == 0)
                ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
-                                       hdr->data, desc->pg_rpc_callops,
+                                       hdr, desc->pg_rpc_callops,
                                        desc->pg_ioflags, 0);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
        return ret;
 }
 
@@ -907,8 +858,13 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        struct nfs_page *subreq;
        unsigned int bytes_left = 0;
        unsigned int offset, pgbase;
+       int ret;
 
-       nfs_page_group_lock(req);
+       ret = nfs_page_group_lock(req, false);
+       if (ret < 0) {
+               desc->pg_error = ret;
+               return 0;
+       }
 
        subreq = req;
        bytes_left = subreq->wb_bytes;
@@ -930,7 +886,11 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
                        if (desc->pg_recoalesce)
                                return 0;
                        /* retry add_request for this subreq */
-                       nfs_page_group_lock(req);
+                       ret = nfs_page_group_lock(req, false);
+                       if (ret < 0) {
+                               desc->pg_error = ret;
+                               return 0;
+                       }
                        continue;
                }
 
@@ -1005,7 +965,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        } while (ret);
        return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_add_request);
+
+/**
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @desc: the pageio descriptor to add requests to
+ * @hdr: the pgio header to move requests from
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc, then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+                     struct nfs_pgio_header *hdr)
+{
+       LIST_HEAD(failed);
+
+       desc->pg_dreq = hdr->dreq;
+       while (!list_empty(&hdr->pages)) {
+               struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+               nfs_list_remove_request(req);
+               if (!nfs_pageio_add_request(desc, req))
+                       nfs_list_add_request(req, &failed);
+       }
+       nfs_pageio_complete(desc);
+       if (!list_empty(&failed)) {
+               list_move(&failed, &hdr->pages);
+               return -EIO;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
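nfs_pageio_resend() centralizes the move-and-resend loop that the pnfs read and write paths previously open-coded (see the deletions in pnfs.c below): requests that cannot be re-queued are parked on a local list, handed back on the header, and the resend as a whole reports -EIO. A userspace model of the collect-failures idiom, with a toy singly linked list standing in for the kernel list API:

#include <stdio.h>

struct req { int id; struct req *next; };

static struct req *pop(struct req **list)
{
        struct req *r = *list;

        if (r)
                *list = r->next;
        return r;
}

static void push(struct req **list, struct req *r)
{
        r->next = *list;
        *list = r;
}

static int add_request(struct req *r)
{
        return (r->id % 2) == 0;        /* pretend odd ids fail to queue */
}

static int resend(struct req **pages)
{
        struct req *failed = NULL, *r;

        while ((r = pop(pages)) != NULL)
                if (!add_request(r))
                        push(&failed, r);
        if (failed) {
                *pages = failed;        /* hand the failures back */
                return -5;              /* -EIO */
        }
        return 0;
}

int main(void)
{
        struct req r3 = { 3, NULL }, r2 = { 2, &r3 }, r1 = { 1, &r2 };
        struct req *pages = &r1;

        printf("resend -> %d\n", resend(&pages));
        for (struct req *r = pages; r; r = r->next)
                printf("still pending: %d\n", r->id);
        return 0;
}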
 
 /**
  * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1012,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
                        break;
        }
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_complete);
 
 /**
  * nfs_pageio_cond_complete - Conditional I/O completion
index a8914b3356174a5063369452bc73fab7c0b8db2e..a3851debf8a2f481435b3750f5cb1bec678ab4fe 100644 (file)
@@ -361,6 +361,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
+static void pnfs_put_lseg_async_work(struct work_struct *work)
+{
+       struct pnfs_layout_segment *lseg;
+
+       lseg = container_of(work, struct pnfs_layout_segment, pls_work);
+
+       pnfs_put_lseg(lseg);
+}
+
+void
+pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
+{
+       INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work);
+       schedule_work(&lseg->pls_work);
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg_async);
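pnfs_put_lseg_async() serves callers that must drop a segment reference from a context where running pnfs_put_lseg() directly would be unsafe, for instance while holding locks the put itself would need; the work item embedded in the segment moves the put to process context. A loose userspace analogy, with a worker thread standing in for the workqueue (illustrative only, not the kernel mechanism):

#include <pthread.h>
#include <stdio.h>

struct segment { int refcount; };

static void *deferred_put(void *arg)
{
        struct segment *seg = arg;

        seg->refcount--;        /* the real put may take locks or free */
        printf("refcount now %d\n", seg->refcount);
        return NULL;
}

int main(void)
{
        struct segment seg = { .refcount = 1 };
        pthread_t worker;

        /* analogous to INIT_WORK() + schedule_work() in the hunk above */
        pthread_create(&worker, NULL, deferred_put, &seg);
        pthread_join(&worker, NULL);
        return 0;
}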
+
 static u64
 end_offset(u64 start, u64 len)
 {
@@ -1470,41 +1487,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-int pnfs_write_done_resend_to_mds(struct inode *inode,
-                               struct list_head *head,
-                               const struct nfs_pgio_completion_ops *compl_ops,
-                               struct nfs_direct_req *dreq)
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
 {
        struct nfs_pageio_descriptor pgio;
-       LIST_HEAD(failed);
 
        /* Resend all requests through the MDS */
-       nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
-       pgio.pg_dreq = dreq;
-       while (!list_empty(head)) {
-               struct nfs_page *req = nfs_list_entry(head->next);
-
-               nfs_list_remove_request(req);
-               if (!nfs_pageio_add_request(&pgio, req))
-                       nfs_list_add_request(req, &failed);
-       }
-       nfs_pageio_complete(&pgio);
-
-       if (!list_empty(&failed)) {
-               /* For some reason our attempt to resend pages. Mark the
-                * overall send request as having failed, and let
-                * nfs_writeback_release_full deal with the error.
-                */
-               list_move(&failed, head);
-               return -EIO;
-       }
-       return 0;
+       nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+                             hdr->completion_ops);
+       return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
 
-static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
 
        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1507,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
                pnfs_return_layout(hdr->inode);
        }
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
-               data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
 }
 
 /*
  * Called by non rpc-based layout drivers
  */
-void pnfs_ld_write_done(struct nfs_pgio_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
-       trace_nfs4_pnfs_write(data, hdr->pnfs_error);
+       trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
        if (!hdr->pnfs_error) {
-               pnfs_set_layoutcommit(data);
-               hdr->mds_ops->rpc_call_done(&data->task, data);
+               pnfs_set_layoutcommit(hdr);
+               hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
        } else
-               pnfs_ld_handle_write_error(data);
-       hdr->mds_ops->rpc_release(data);
+               pnfs_ld_handle_write_error(hdr);
+       hdr->mds_ops->rpc_release(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
 static void
 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
-               struct nfs_pgio_data *data)
+               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                list_splice_tail_init(&hdr->pages, &desc->pg_list);
                nfs_pageio_reset_write_mds(desc);
                desc->pg_recoalesce = 1;
        }
-       nfs_pgio_data_release(data);
+       nfs_pgio_data_destroy(hdr);
 }
 
 static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
                        const struct rpc_call_ops *call_ops,
                        struct pnfs_layout_segment *lseg,
                        int how)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        struct inode *inode = hdr->inode;
        enum pnfs_try_status trypnfs;
        struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1550,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
        hdr->mds_ops = call_ops;
 
        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
-               inode->i_ino, wdata->args.count, wdata->args.offset, how);
-       trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+               inode->i_ino, hdr->args.count, hdr->args.offset, how);
+       trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
        if (trypnfs != PNFS_NOT_ATTEMPTED)
                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1562,105 @@ static void
 pnfs_do_write(struct nfs_pageio_descriptor *desc,
              struct nfs_pgio_header *hdr, int how)
 {
-       struct nfs_pgio_data *data = hdr->data;
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;
 
        desc->pg_lseg = NULL;
-       trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+       trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
        if (trypnfs == PNFS_NOT_ATTEMPTED)
-               pnfs_write_through_mds(desc, data);
+               pnfs_write_through_mds(desc, hdr);
        pnfs_put_lseg(lseg);
 }
 
 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
        pnfs_put_lseg(hdr->lseg);
-       nfs_rw_header_free(hdr);
+       nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
 
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_rw_header *whdr;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-       if (!whdr) {
+       hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+       if (!hdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
                return -ENOMEM;
        }
-       hdr = &whdr->header;
        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-       atomic_inc(&hdr->refcnt);
        ret = nfs_generic_pgio(desc, hdr);
        if (ret != 0) {
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
        } else
                pnfs_do_write(desc, hdr, desc->pg_ioflags);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
        return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-int pnfs_read_done_resend_to_mds(struct inode *inode,
-                               struct list_head *head,
-                               const struct nfs_pgio_completion_ops *compl_ops,
-                               struct nfs_direct_req *dreq)
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
 {
        struct nfs_pageio_descriptor pgio;
-       LIST_HEAD(failed);
 
        /* Resend all requests through the MDS */
-       nfs_pageio_init_read(&pgio, inode, true, compl_ops);
-       pgio.pg_dreq = dreq;
-       while (!list_empty(head)) {
-               struct nfs_page *req = nfs_list_entry(head->next);
-
-               nfs_list_remove_request(req);
-               if (!nfs_pageio_add_request(&pgio, req))
-                       nfs_list_add_request(req, &failed);
-       }
-       nfs_pageio_complete(&pgio);
-
-       if (!list_empty(&failed)) {
-               list_move(&failed, head);
-               return -EIO;
-       }
-       return 0;
+       nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+       return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
 
-static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
            PNFS_LAYOUTRET_ON_ERROR) {
                pnfs_return_layout(hdr->inode);
        }
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
-               data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-                                                       &hdr->pages,
-                                                       hdr->completion_ops,
-                                                       hdr->dreq);
+               hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
 }
 
 /*
  * Called by non rpc-based layout drivers
  */
-void pnfs_ld_read_done(struct nfs_pgio_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
-       trace_nfs4_pnfs_read(data, hdr->pnfs_error);
+       trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
        if (likely(!hdr->pnfs_error)) {
-               __nfs4_read_done_cb(data);
-               hdr->mds_ops->rpc_call_done(&data->task, data);
+               __nfs4_read_done_cb(hdr);
+               hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
        } else
-               pnfs_ld_handle_read_error(data);
-       hdr->mds_ops->rpc_release(data);
+               pnfs_ld_handle_read_error(hdr);
+       hdr->mds_ops->rpc_release(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
 static void
 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
-               struct nfs_pgio_data *data)
+               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                list_splice_tail_init(&hdr->pages, &desc->pg_list);
                nfs_pageio_reset_read_mds(desc);
                desc->pg_recoalesce = 1;
        }
-       nfs_pgio_data_release(data);
+       nfs_pgio_data_destroy(hdr);
 }
 
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
 static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
                       const struct rpc_call_ops *call_ops,
                       struct pnfs_layout_segment *lseg)
 {
-       struct nfs_pgio_header *hdr = rdata->header;
        struct inode *inode = hdr->inode;
        struct nfs_server *nfss = NFS_SERVER(inode);
        enum pnfs_try_status trypnfs;
@@ -1715,9 +1668,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
        hdr->mds_ops = call_ops;
 
        dprintk("%s: Reading ino:%lu %u@%llu\n",
-               __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+               __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
 
-       trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+       trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
        if (trypnfs != PNFS_NOT_ATTEMPTED)
                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1680,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
 static void
 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_data *data = hdr->data;
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;
 
        desc->pg_lseg = NULL;
-       trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+       trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
        if (trypnfs == PNFS_NOT_ATTEMPTED)
-               pnfs_read_through_mds(desc, data);
+               pnfs_read_through_mds(desc, hdr);
        pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
        pnfs_put_lseg(hdr->lseg);
-       nfs_rw_header_free(hdr);
+       nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
 
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-       struct nfs_rw_header *rhdr;
        struct nfs_pgio_header *hdr;
        int ret;
 
-       rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-       if (!rhdr) {
+       hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+       if (!hdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                ret = -ENOMEM;
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
                return ret;
        }
-       hdr = &rhdr->header;
        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-       atomic_inc(&hdr->refcnt);
        ret = nfs_generic_pgio(desc, hdr);
        if (ret != 0) {
                pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
        } else
                pnfs_do_read(desc, hdr);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
        return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
@@ -1820,12 +1767,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
 void
-pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = wdata->header;
        struct inode *inode = hdr->inode;
        struct nfs_inode *nfsi = NFS_I(inode);
-       loff_t end_pos = wdata->mds_offset + wdata->res.count;
+       loff_t end_pos = hdr->mds_offset + hdr->res.count;
        bool mark_as_dirty = false;
 
        spin_lock(&inode->i_lock);
index 4fb309a2b4c48e871de3a13a0b60c8ce66e08c7e..aca3dff5dae63e3a5d41f3e43b7055e77372b841 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
+#include <linux/workqueue.h>
 
 enum {
        NFS_LSEG_VALID = 0,     /* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
        atomic_t pls_refcount;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
+       struct work_struct pls_work;
 };
 
 enum pnfs_try_status {
@@ -104,6 +106,8 @@ struct pnfs_layoutdriver_type {
                                  int max);
        void (*recover_commit_reqs) (struct list_head *list,
                                     struct nfs_commit_info *cinfo);
+       struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+                                               struct page *page);
        int (*commit_pagelist)(struct inode *inode,
                               struct list_head *mds_pages,
                               int how,
@@ -113,8 +117,8 @@ struct pnfs_layoutdriver_type {
         * Return PNFS_ATTEMPTED to indicate the layout code has attempted
         * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
         */
-       enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
-       enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
+       enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+       enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
        void (*free_deviceid_node) (struct nfs4_deviceid_node *);
 
@@ -179,6 +183,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 /* pnfs.c */
 void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +218,13 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_header *);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_pgio_data *);
-void pnfs_ld_read_done(struct nfs_pgio_data *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               struct nfs_open_context *ctx,
                                               loff_t pos,
@@ -228,12 +233,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               gfp_t gfp_flags);
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
-int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
-                       const struct nfs_pgio_completion_ops *compl_ops,
-                       struct nfs_direct_req *dreq);
-int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
-                       const struct nfs_pgio_completion_ops *compl_ops,
-                       struct nfs_direct_req *dreq);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
@@ -345,6 +346,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
        NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+                       struct page *page)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+       if (ld == NULL || ld->search_commit_reqs == NULL)
+               return NULL;
+       return ld->search_commit_reqs(cinfo, page);
+}
+
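search_commit_reqs is an optional layout-driver hook, so the inline wrapper above checks both for a missing driver and a missing method before dispatching, returning NULL so the caller falls back to scanning the MDS commit list. The same optional-callback shape in standalone C (toy types):

#include <stdio.h>

struct layoutdriver {
        int (*search)(int key);         /* optional; may be NULL */
};

static int search_or_fallback(const struct layoutdriver *ld, int key)
{
        if (ld == NULL || ld->search == NULL)
                return -1;              /* no hook: caller falls back */
        return ld->search(key);
}

static int demo_search(int key)
{
        return key * 2;
}

int main(void)
{
        struct layoutdriver with = { .search = demo_search };
        struct layoutdriver without = { .search = NULL };

        printf("%d %d\n", search_or_fallback(&with, 21),
               search_or_fallback(&without, 21));
        return 0;
}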
 /* Should the pNFS client commit and return the layout upon a setattr */
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -410,6 +422,10 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
 }
 
+static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
+{
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
        return 0;
@@ -496,6 +512,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
 {
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+                       struct page *page)
+{
+       return NULL;
+}
+
 static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
        return 0;
index c171ce1a8a3098f20f53d19c36f7dc35789efbe2..b09cc23d6f433bc5ea8aff6cfe68c3910c4f8319 100644 (file)
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return 0;
 }
 
-static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        nfs_invalidate_atime(inode);
        if (task->tk_status >= 0) {
-               nfs_refresh_inode(inode, data->res.fattr);
+               nfs_refresh_inode(inode, hdr->res.fattr);
                /* Emulate the eof flag, which isn't normally needed in NFSv2
                 * as it is guaranteed to always return the file attributes
                 */
-               if (data->args.offset + data->res.count >= data->res.fattr->size)
-                       data->res.eof = 1;
+               if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+                       hdr->res.eof = 1;
        }
        return 0;
 }
 
-static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+                               struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
-static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+                                    struct nfs_pgio_header *hdr)
 {
        rpc_call_start(task);
        return 0;
 }
 
-static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
 
        if (task->tk_status >= 0)
-               nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+               nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
        return 0;
 }
 
-static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+                                struct rpc_message *msg)
 {
        /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
-       data->args.stable = NFS_FILE_SYNC;
+       hdr->args.stable = NFS_FILE_SYNC;
        msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
index e818a475ca64351f0ae00e2484c2e76a640b6bec..beff2769c5c587f9955ec55fd41444326a36c97e 100644 (file)
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-static struct nfs_rw_header *nfs_readhdr_alloc(void)
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
 {
        return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 }
 
-static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
 {
        kmem_cache_free(nfs_rdata_cachep, rhdr);
 }
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 
                unlock_page(req->wb_page);
        }
-
-       dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
-                       req->wb_context->dentry->d_inode->i_sb->s_id,
-                       (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
-                       req->wb_bytes,
-                       (long long)req_offset(req));
        nfs_release_request(req);
 }
 
@@ -172,14 +166,15 @@ out:
        hdr->release(hdr);
 }
 
-static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+                             struct rpc_message *msg,
                              struct rpc_task_setup *task_setup_data, int how)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 
        task_setup_data->flags |= swap_flags;
-       NFS_PROTO(inode)->read_setup(data, msg);
+       NFS_PROTO(inode)->read_setup(hdr, msg);
 }
 
 static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_readpage_done(struct rpc_task *task,
+                            struct nfs_pgio_header *hdr,
                             struct inode *inode)
 {
-       int status = NFS_PROTO(inode)->read_done(task, data);
+       int status = NFS_PROTO(inode)->read_done(task, hdr);
        if (status != 0)
                return status;
 
-       nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
+       nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
 
        if (task->tk_status == -ESTALE) {
                set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
        return 0;
 }
 
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_retry(struct rpc_task *task,
+                              struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_args *argp = &data->args;
-       struct nfs_pgio_res  *resp = &data->res;
+       struct nfs_pgio_args *argp = &hdr->args;
+       struct nfs_pgio_res  *resp = &hdr->res;
 
        /* This is a short read! */
-       nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
+       nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
        /* Has the server at least made some progress? */
        if (resp->count == 0) {
-               nfs_set_pgio_error(data->header, -EIO, argp->offset);
+               nfs_set_pgio_error(hdr, -EIO, argp->offset);
                return;
        }
-       /* Yes, so retry the read at the end of the data */
-       data->mds_offset += resp->count;
+       /* Yes, so retry the read from where the server left off */
+       hdr->mds_offset += resp->count;
        argp->offset += resp->count;
        argp->pgbase += resp->count;
        argp->count -= resp->count;
        rpc_restart_call_prepare(task);
 }
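The retry path above resumes a short read by sliding the argument window past the bytes the server already returned. The arithmetic in isolation (userspace model; the field names mirror the args/res pairing above):

#include <stdio.h>

struct io_args {
        unsigned long long offset;
        unsigned int pgbase;
        unsigned int count;
};

/* advance the request window past 'got' bytes already received */
static void retry_short_read(struct io_args *argp, unsigned int got)
{
        argp->offset += got;
        argp->pgbase += got;
        argp->count  -= got;
}

int main(void)
{
        struct io_args args = { .offset = 4096, .pgbase = 0, .count = 16384 };

        retry_short_read(&args, 1024);  /* server returned only 1024 bytes */
        printf("offset=%llu pgbase=%u count=%u\n",
               args.offset, args.pgbase, args.count);
        return 0;
}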
 
-static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_result(struct rpc_task *task,
+                               struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-
-       if (data->res.eof) {
+       if (hdr->res.eof) {
                loff_t bound;
 
-               bound = data->args.offset + data->res.count;
+               bound = hdr->args.offset + hdr->res.count;
                spin_lock(&hdr->lock);
                if (bound < hdr->io_start + hdr->good_bytes) {
                        set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
                        hdr->good_bytes = bound - hdr->io_start;
                }
                spin_unlock(&hdr->lock);
-       } else if (data->res.count != data->args.count)
-               nfs_readpage_retry(task, data);
+       } else if (hdr->res.count != hdr->args.count)
+               nfs_readpage_retry(task, hdr);
 }
 
 /*
@@ -404,7 +400,7 @@ out:
 int __init nfs_init_readpagecache(void)
 {
        nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
-                                            sizeof(struct nfs_rw_header),
+                                            sizeof(struct nfs_pgio_header),
                                             0, SLAB_HWCACHE_ALIGN,
                                             NULL);
        if (nfs_rdata_cachep == NULL)
index 084af1060d79e1b289f6989dd251eeab2e59f7f6..e4499d5b51e8f33a05fec43843c8648de834dfd7 100644 (file)
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
                              rpc_authflavor_t flavor)
 {
        unsigned int i;
-       unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
-                                      sizeof(auth_info->flavors[0]));
+       unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
 
        /* make sure this flavor isn't already in the list */
        for (i = 0; i < auth_info->flavor_len; i++) {
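The open-coded sizeof division above is replaced with the kernel's ARRAY_SIZE() macro, which yields the element count of a fixed-size array (the kernel version also rejects plain pointers at compile time). A minimal userspace equivalent without the type check:

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

int main(void)
{
        unsigned int flavors[12];

        printf("%zu entries\n", ARRAY_SIZE(flavors));   /* 12 */
        return 0;
}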
@@ -2180,7 +2179,7 @@ out_no_address:
        return -EINVAL;
 }
 
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
                | NFS_MOUNT_SECURE \
                | NFS_MOUNT_TCP \
                | NFS_MOUNT_VER3 \
@@ -2188,15 +2187,16 @@ out_no_address:
                | NFS_MOUNT_NONLM \
                | NFS_MOUNT_BROKEN_SUID \
                | NFS_MOUNT_STRICTLOCK \
-               | NFS_MOUNT_UNSHARED \
-               | NFS_MOUNT_NORESVPORT \
                | NFS_MOUNT_LEGACY_INTERFACE)
 
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+               ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
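Splitting the mask keeps NFS_MOUNT_UNSHARED and NFS_MOUNT_NORESVPORT in the set compared on remount, so a sharecache or resvport change is noticed and rejected, while superblock-sharing comparisons continue to ignore those bits. A toy demonstration of deriving the stricter mask (the flag values are illustrative stand-ins, not the real ones):

#include <stdio.h>

#define F_INTR       0x01
#define F_UNSHARED   0x02
#define F_NORESVPORT 0x04

#define REMOUNT_CMP_MASK (~F_INTR)
#define MOUNT_CMP_MASK   (REMOUNT_CMP_MASK & ~(F_UNSHARED | F_NORESVPORT))

int main(void)
{
        unsigned int oldf = F_UNSHARED, newf = 0;   /* sharecache toggled */

        /* nonzero: the remount comparison sees the change */
        printf("remount diff: %#x\n", (oldf ^ newf) & REMOUNT_CMP_MASK);
        /* zero: superblock sharing still ignores the bit */
        printf("sharing diff: %#x\n", (oldf ^ newf) & MOUNT_CMP_MASK);
        return 0;
}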
 static int
 nfs_compare_remount_data(struct nfs_server *nfss,
                         struct nfs_parsed_mount_data *data)
 {
-       if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
+       if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
            data->rsize != nfss->rsize ||
            data->wsize != nfss->wsize ||
            data->version != nfss->nfs_client->rpc_ops->version ||
index 962c9ee758be30e57141761bd29898a1df129009..e3b5cf28bdc5c2dbfba5d3c16b06b5724afe6c60 100644 (file)
@@ -47,6 +47,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+                                     struct inode *inode);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -71,18 +73,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-static struct nfs_rw_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-       struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+       struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
        if (p)
                memset(p, 0, sizeof(*p));
        return p;
 }
 
-static void nfs_writehdr_free(struct nfs_rw_header *whdr)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-       mempool_free(whdr, nfs_wdata_mempool);
+       mempool_free(hdr, nfs_wdata_mempool);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -92,6 +94,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
        set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through the commit lists on @nfsi's inode for the head request
+ * covering @page. Must be called while holding the inode (cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+                                               struct page *page)
+{
+       struct nfs_page *freq, *t;
+       struct nfs_commit_info cinfo;
+       struct inode *inode = &nfsi->vfs_inode;
+
+       nfs_init_cinfo_from_inode(&cinfo, inode);
+
+       /* search through pnfs commit lists */
+       freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+       if (freq)
+               return freq->wb_head;
+
+       /* Linearly search the commit list for the correct request */
+       list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+               if (freq->wb_page == page)
+                       return freq->wb_head;
+       }
+
+       return NULL;
+}
+
 /*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
@@ -106,21 +140,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 
        if (PagePrivate(page))
                req = (struct nfs_page *)page_private(page);
-       else if (unlikely(PageSwapCache(page))) {
-               struct nfs_page *freq, *t;
-
-               /* Linearly search the commit list for the correct req */
-               list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
-                       if (freq->wb_page == page) {
-                               req = freq->wb_head;
-                               break;
-                       }
-               }
-       }
+       else if (unlikely(PageSwapCache(page)))
+               req = nfs_page_search_commits_for_head_request_locked(nfsi,
+                       page);
 
        if (req) {
                WARN_ON_ONCE(req->wb_head != req);
-
                kref_get(&req->wb_kref);
        }
 
@@ -216,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
        unsigned int pos = 0;
        unsigned int len = nfs_page_length(req->wb_page);
 
-       nfs_page_group_lock(req);
+       nfs_page_group_lock(req, true);
 
        do {
                tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -379,8 +404,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
                subreq->wb_head = subreq;
                subreq->wb_this_page = subreq;
 
-               nfs_clear_request_commit(subreq);
-
                /* subreq is now totally disconnected from page group or any
                 * write / commit lists. last chance to wake any waiters */
                nfs_unlock_request(subreq);
@@ -456,7 +479,9 @@ try_again:
        }
 
        /* lock each request in the page group */
-       nfs_page_group_lock(head);
+       ret = nfs_page_group_lock(head, false);
+       if (ret < 0)
+               return ERR_PTR(ret);
        subreq = head;
        do {
                /*
@@ -488,7 +513,7 @@ try_again:
         * Commit list removal accounting is done after locks are dropped */
        subreq = head;
        do {
-               nfs_list_remove_request(subreq);
+               nfs_clear_request_commit(subreq);
                subreq = subreq->wb_this_page;
        } while (subreq != head);
 
@@ -518,15 +543,11 @@ try_again:
 
        nfs_page_group_unlock(head);
 
-       /* drop lock to clear_request_commit the head req and clean up
-        * requests on destroy list */
+       /* drop lock to clean up requests on the destroy list */
        spin_unlock(&inode->i_lock);
 
        nfs_destroy_unlinked_subrequests(destroy_list, head);
 
-       /* clean up commit list state */
-       nfs_clear_request_commit(head);
-
        /* still holds ref on head from nfs_page_find_head_request_locked
         * and still has lock on head from lock loop */
        return head;
@@ -705,6 +726,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
        if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
                nfs_release_request(req);
+       else
+               WARN_ON_ONCE(1);
 }
 
 static void
@@ -808,6 +831,7 @@ nfs_clear_page_commit(struct page *page)
        dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
 }
 
+/* Called while holding the inode (i.e. cinfo) lock */
 static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
@@ -817,20 +841,17 @@ nfs_clear_request_commit(struct nfs_page *req)
 
                nfs_init_cinfo_from_inode(&cinfo, inode);
                if (!pnfs_clear_request_commit(req, &cinfo)) {
-                       spin_lock(cinfo.lock);
                        nfs_request_remove_commit_list(req, &cinfo);
-                       spin_unlock(cinfo.lock);
                }
                nfs_clear_page_commit(req->wb_page);
        }
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
-       if (data->verf.committed == NFS_DATA_SYNC)
-               return data->header->lseg == NULL;
-       return data->verf.committed != NFS_FILE_SYNC;
+       if (hdr->verf.committed == NFS_DATA_SYNC)
+               return hdr->lseg == NULL;
+       return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
 #else
@@ -856,8 +877,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 {
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
        return 0;
 }
@@ -883,11 +903,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
                        nfs_context_set_write_error(req->wb_context, hdr->error);
                        goto remove_req;
                }
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-                       nfs_mark_request_dirty(req);
-                       goto next;
-               }
-               if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+               if (nfs_write_need_commit(hdr)) {
                        memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
                        goto next;
@@ -1038,9 +1054,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
        else
                req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
-       spin_unlock(&inode->i_lock);
        if (req)
                nfs_clear_request_commit(req);
+       spin_unlock(&inode->i_lock);
        return req;
 out_flushme:
        spin_unlock(&inode->i_lock);
@@ -1241,17 +1257,18 @@ static int flush_task_priority(int how)
        return RPC_PRIORITY_NORMAL;
 }
 
-static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+                              struct rpc_message *msg,
                               struct rpc_task_setup *task_setup_data, int how)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        int priority = flush_task_priority(how);
 
        task_setup_data->priority = priority;
-       NFS_PROTO(inode)->write_setup(data, msg);
+       NFS_PROTO(inode)->write_setup(hdr, msg);
 
        nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
-                                &task_setup_data->rpc_client, msg, data);
+                                &task_setup_data->rpc_client, msg, hdr);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1330,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
        NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_data *data)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_header *hdr = data->header;
-       int status = data->task.tk_status;
-
-       if ((status >= 0) && nfs_write_need_commit(data)) {
-               spin_lock(&hdr->lock);
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
-                       ; /* Do nothing */
-               else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
-                       memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
-               else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
-                       set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
-               spin_unlock(&hdr->lock);
-       }
+       /* do nothing! */
 }
 
 /*
@@ -1358,7 +1363,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
 /*
  * This function is called when the WRITE call is complete.
  */
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_writeback_done(struct rpc_task *task,
+                             struct nfs_pgio_header *hdr,
                              struct inode *inode)
 {
        int status;
@@ -1370,13 +1376,14 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
         * another writer had changed the file, but some applications
         * depend on tighter cache coherency when writing.
         */
-       status = NFS_PROTO(inode)->write_done(task, data);
+       status = NFS_PROTO(inode)->write_done(task, hdr);
        if (status != 0)
                return status;
-       nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
+       nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-       if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
+       if (hdr->res.verf->committed < hdr->args.stable &&
+           task->tk_status >= 0) {
                /* We tried a write call, but the server did not
                 * commit data to stable storage even though we
                 * requested it.
@@ -1392,7 +1399,7 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
                        dprintk("NFS:       faulty NFS server %s:"
                                " (committed = %d) != (stable = %d)\n",
                                NFS_SERVER(inode)->nfs_client->cl_hostname,
-                               data->res.verf->committed, data->args.stable);
+                               hdr->res.verf->committed, hdr->args.stable);
                        complain = jiffies + 300 * HZ;
                }
        }
@@ -1407,16 +1414,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 /*
  * This function is called when the WRITE call is complete.
  */
-static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_writeback_result(struct rpc_task *task,
+                                struct nfs_pgio_header *hdr)
 {
-       struct nfs_pgio_args    *argp = &data->args;
-       struct nfs_pgio_res     *resp = &data->res;
+       struct nfs_pgio_args    *argp = &hdr->args;
+       struct nfs_pgio_res     *resp = &hdr->res;
 
        if (resp->count < argp->count) {
                static unsigned long    complain;
 
                /* This is a short write! */
-               nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
+               nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
 
                /* Has the server at least made some progress? */
                if (resp->count == 0) {
@@ -1426,14 +1434,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
                                       argp->count);
                                complain = jiffies + 300 * HZ;
                        }
-                       nfs_set_pgio_error(data->header, -EIO, argp->offset);
+                       nfs_set_pgio_error(hdr, -EIO, argp->offset);
                        task->tk_status = -EIO;
                        return;
                }
                /* Was this an NFSv2 write or an NFSv3 stable write? */
                if (resp->verf->committed != NFS_UNSTABLE) {
                        /* Resend from where the server left off */
-                       data->mds_offset += resp->count;
+                       hdr->mds_offset += resp->count;
                        argp->offset += resp->count;
                        argp->pgbase += resp->count;
                        argp->count -= resp->count;
@@ -1884,7 +1892,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 int __init nfs_init_writepagecache(void)
 {
        nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-                                            sizeof(struct nfs_rw_header),
+                                            sizeof(struct nfs_pgio_header),
                                             0, SLAB_HWCACHE_ALIGN,
                                             NULL);
        if (nfs_wdata_cachep == NULL)
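When the server makes partial progress on a stable write, nfs_writeback_result() above resends from where the server left off by advancing the argument offsets. A minimal user-space sketch of that resume arithmetic (the struct only mirrors nfs_pgio_args; it is not the kernel type):

#include <assert.h>
#include <stdio.h>

struct pgio_args {
	unsigned long long offset;	/* byte offset of the write */
	unsigned int pgbase;		/* offset into the page array */
	unsigned int count;		/* bytes still to be written */
};

/* Advance the args past the bytes the server already committed. */
static void resume_after_short_write(struct pgio_args *argp,
				     unsigned int written)
{
	assert(written > 0 && written < argp->count);
	argp->offset += written;
	argp->pgbase += written;
	argp->count  -= written;
}

int main(void)
{
	struct pgio_args args = { .offset = 4096, .pgbase = 0, .count = 8192 };

	resume_after_short_write(&args, 4096);
	printf("resend %u bytes at offset %llu (pgbase %u)\n",
	       args.count, args.offset, args.pgbase);
	return 0;
}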
index ed628f71274c7d22a0cbc1fde941c79f710fc7ab..538f142935ea89b2c14c47e78119f61e56f3bc59 100644 (file)
@@ -30,9 +30,6 @@
 
 MODULE_LICENSE("GPL");
 
-EXPORT_SYMBOL_GPL(nfsacl_encode);
-EXPORT_SYMBOL_GPL(nfsacl_decode);
-
 struct nfsacl_encode_desc {
        struct xdr_array2_desc desc;
        unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
                          nfsacl_desc.desc.array_len;
        return err;
 }
+EXPORT_SYMBOL_GPL(nfsacl_encode);
 
 struct nfsacl_decode_desc {
        struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
        return 8 + nfsacl_desc.desc.elem_size *
                   nfsacl_desc.desc.array_len;
 }
+EXPORT_SYMBOL_GPL(nfsacl_decode);
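Relocating the EXPORT_SYMBOL_GPL() markers follows the usual kernel convention that the export sits directly under the function it exports, rather than in a block at the top of the file. A schematic example with a hypothetical function (not part of this patch):

/* Hypothetical function, shown only to illustrate the placement. */
int nfsacl_frob(int aclcnt)
{
	return aclcnt + 4;
}
EXPORT_SYMBOL_GPL(nfsacl_frob);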
index e30f6059ecd642b44c0cc599344c0421b713958f..5180a7ededecf2797b4cb4a23c1a223f48bbe3cf 100644 (file)
@@ -52,6 +52,7 @@ struct nfs_access_entry {
        unsigned long           jiffies;
        struct rpc_cred *       cred;
        int                     mask;
+       struct rcu_head         rcu_head;
 };
 
 struct nfs_lockowner {
@@ -352,6 +353,7 @@ extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
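The rcu_head added to nfs_access_entry is what allows the access cache to be read locklessly: entries are freed with kfree_rcu(), so a reader under rcu_read_lock() can never see the memory reused mid-walk. A kernel-style sketch of the free side (the helper name is hypothetical):

/* Sketch: retire an access-cache entry while lockless readers may
 * still hold pointers to it; reclamation waits for a grace period
 * via the embedded rcu_head. */
static void access_entry_free(struct nfs_access_entry *entry)
{
	kfree_rcu(entry, rcu_head);
}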
index 1150ea41b626723b67720320a23f0810a2ec2be4..922be2e050f5c938b561daf4b5533d01f1d1d0bc 100644 (file)
@@ -45,6 +45,7 @@ struct nfs_client {
        struct sockaddr_storage cl_addr;        /* server identifier */
        size_t                  cl_addrlen;
        char *                  cl_hostname;    /* hostname of server */
+       char *                  cl_acceptor;    /* GSSAPI acceptor name */
        struct list_head        cl_share_link;  /* link in global client list */
        struct list_head        cl_superblocks; /* List of nfs_server structs */
 
index 7d9096d95d4aa5f2d276f1e05383a596acd6d4c6..6ad2bbcad4050c12105778c3011b5196fcbf4b9e 100644 (file)
@@ -26,7 +26,7 @@ enum {
        PG_MAPPED,              /* page private set for buffered io */
        PG_CLEAN,               /* write succeeded */
        PG_COMMIT_TO_DS,        /* used by pnfs layouts */
-       PG_INODE_REF,           /* extra ref held by inode (head req only) */
+       PG_INODE_REF,           /* extra ref held by inode when in writeback */
        PG_HEADLOCK,            /* page group lock of wb_head */
        PG_TEARDOWN,            /* page group sync for destroy */
        PG_UNLOCKPAGE,          /* page group sync bit in read path */
@@ -62,12 +62,13 @@ struct nfs_pageio_ops {
 
 struct nfs_rw_ops {
        const fmode_t rw_mode;
-       struct nfs_rw_header *(*rw_alloc_header)(void);
-       void (*rw_free_header)(struct nfs_rw_header *);
-       void (*rw_release)(struct nfs_pgio_data *);
-       int  (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *);
-       void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *);
-       void (*rw_initiate)(struct nfs_pgio_data *, struct rpc_message *,
+       struct nfs_pgio_header *(*rw_alloc_header)(void);
+       void (*rw_free_header)(struct nfs_pgio_header *);
+       void (*rw_release)(struct nfs_pgio_header *);
+       int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
+                       struct inode *);
+       void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
+       void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *,
                            struct rpc_task_setup *, int);
 };
 
@@ -111,6 +112,8 @@ extern      void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
                             int how);
 extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
                                   struct nfs_page *);
+extern  int nfs_pageio_resend(struct nfs_pageio_descriptor *,
+                             struct nfs_pgio_header *);
 extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
 extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
 extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
@@ -119,7 +122,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern void nfs_unlock_request(struct nfs_page *req);
 extern void nfs_unlock_and_release_request(struct nfs_page *);
-extern void nfs_page_group_lock(struct nfs_page *);
+extern int nfs_page_group_lock(struct nfs_page *, bool);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
 
index 9a1396e70310f92c40e6a0fe563b86b34f5e6836..0040629894dfa42084161124edb2f494df5be4ea 100644 (file)
@@ -993,6 +993,7 @@ struct nfs4_setclientid {
        unsigned int                    sc_uaddr_len;
        char                            sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
        u32                             sc_cb_ident;
+       struct rpc_cred                 *sc_cred;
 };
 
 struct nfs4_setclientid_res {
@@ -1253,18 +1254,12 @@ enum {
        NFS_IOHDR_ERROR = 0,
        NFS_IOHDR_EOF,
        NFS_IOHDR_REDO,
-       NFS_IOHDR_NEED_COMMIT,
-       NFS_IOHDR_NEED_RESCHED,
 };
 
-struct nfs_pgio_data;
-
 struct nfs_pgio_header {
        struct inode            *inode;
        struct rpc_cred         *cred;
        struct list_head        pages;
-       struct nfs_pgio_data    *data;
-       atomic_t                refcnt;
        struct nfs_page         *req;
        struct nfs_writeverf    verf;           /* Used for writes */
        struct pnfs_layout_segment *lseg;
@@ -1281,28 +1276,22 @@ struct nfs_pgio_header {
        int                     error;          /* merge with pnfs_error */
        unsigned long           good_bytes;     /* boundary of good data */
        unsigned long           flags;
-};
 
-struct nfs_pgio_data {
-       struct nfs_pgio_header  *header;
+       /*
+        * rpc data
+        */
        struct rpc_task         task;
        struct nfs_fattr        fattr;
-       struct nfs_writeverf    verf;           /* Used for writes */
        struct nfs_pgio_args    args;           /* argument struct */
        struct nfs_pgio_res     res;            /* result struct */
        unsigned long           timestamp;      /* For lease renewal */
-       int (*pgio_done_cb) (struct rpc_task *task, struct nfs_pgio_data *data);
+       int (*pgio_done_cb)(struct rpc_task *, struct nfs_pgio_header *);
        __u64                   mds_offset;     /* Filelayout dense stripe */
-       struct nfs_page_array   pages;
+       struct nfs_page_array   page_array;
        struct nfs_client       *ds_clp;        /* pNFS data server */
        int                     ds_idx;         /* ds index if ds_clp is set */
 };
 
-struct nfs_rw_header {
-       struct nfs_pgio_header  header;
-       struct nfs_pgio_data    rpc_data;
-};
-
 struct nfs_mds_commit_info {
        atomic_t rpcs_out;
        unsigned long           ncommit;
@@ -1432,11 +1421,12 @@ struct nfs_rpc_ops {
                             struct nfs_pathconf *);
        int     (*set_capabilities)(struct nfs_server *, struct nfs_fh *);
        int     (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int);
-       int     (*pgio_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *);
-       void    (*read_setup)   (struct nfs_pgio_data *, struct rpc_message *);
-       int     (*read_done)  (struct rpc_task *, struct nfs_pgio_data *);
-       void    (*write_setup)  (struct nfs_pgio_data *, struct rpc_message *);
-       int     (*write_done)  (struct rpc_task *, struct nfs_pgio_data *);
+       int     (*pgio_rpc_prepare)(struct rpc_task *,
+                                   struct nfs_pgio_header *);
+       void    (*read_setup)(struct nfs_pgio_header *, struct rpc_message *);
+       int     (*read_done)(struct rpc_task *, struct nfs_pgio_header *);
+       void    (*write_setup)(struct nfs_pgio_header *, struct rpc_message *);
+       int     (*write_done)(struct rpc_task *, struct nfs_pgio_header *);
        void    (*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
        void    (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
        int     (*commit_done) (struct rpc_task *, struct nfs_commit_data *);
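With nfs_pgio_data and nfs_rw_header folded away, a single nfs_pgio_header now carries both the page-group bookkeeping and the embedded RPC call data, which is why the slab caches above are sized to sizeof(struct nfs_pgio_header). A kernel-style sketch of an allocation against such a cache (the helper and GFP flag are illustrative):

/* One object now serves as both header and rpc data. */
static struct nfs_pgio_header *alloc_pgio_header(struct kmem_cache *cachep)
{
	struct nfs_pgio_header *hdr = kmem_cache_zalloc(cachep, GFP_NOIO);

	if (hdr)
		INIT_LIST_HEAD(&hdr->pages);
	return hdr;
}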
index 790be1472792a3fc49fcf81edd7d7e9c2ab08128..8e030075fe7906bbf46db5296e0abaf12c7c6131 100644 (file)
@@ -103,6 +103,7 @@ struct rpc_auth_create_args {
 
 /* Flags for rpcauth_lookupcred() */
 #define RPCAUTH_LOOKUP_NEW             0x01    /* Accept an uninitialised cred */
+#define RPCAUTH_LOOKUP_RCU             0x02    /* lock-less lookup */
 
 /*
  * Client authentication ops
@@ -140,6 +141,7 @@ struct rpc_credops {
                                                void *, __be32 *, void *);
        int                     (*crkey_timeout)(struct rpc_cred *);
        bool                    (*crkey_to_expire)(struct rpc_cred *);
+       char *                  (*crstringify_acceptor)(struct rpc_cred *);
 };
 
 extern const struct rpc_authops        authunix_ops;
@@ -153,6 +155,7 @@ void                        rpc_destroy_generic_auth(void);
 void                   rpc_destroy_authunix(void);
 
 struct rpc_cred *      rpc_lookup_cred(void);
+struct rpc_cred *      rpc_lookup_cred_nonblock(void);
 struct rpc_cred *      rpc_lookup_machine_cred(const char *service_name);
 int                    rpcauth_register(const struct rpc_authops *);
 int                    rpcauth_unregister(const struct rpc_authops *);
@@ -182,6 +185,7 @@ void                        rpcauth_clear_credcache(struct rpc_cred_cache *);
 int                    rpcauth_key_timeout_notify(struct rpc_auth *,
                                                struct rpc_cred *);
 bool                   rpcauth_cred_key_to_expire(struct rpc_cred *);
+char *                 rpcauth_stringify_acceptor(struct rpc_cred *);
 
 static inline
 struct rpc_cred *      get_rpccred(struct rpc_cred *cred)
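RPCAUTH_LOOKUP_RCU follows the LOOKUP_RCU convention: try the lockless path first and treat -ECHILD as "drop out of RCU mode and retry the blocking path". A kernel-style sketch of that calling convention (the wrapper is hypothetical):

static struct rpc_cred *lookup_cred_maybe_rcu(bool rcu_walk)
{
	if (rcu_walk)
		/* may return ERR_PTR(-ECHILD): caller retries outside RCU */
		return rpc_lookup_cred_nonblock();
	return rpc_lookup_cred();
}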
index f1cfd4c85cd047c4b2fadd367eeb819aabc57d29..36eebc451b416878db871f6ea1577d783d4a6296 100644 (file)
@@ -69,8 +69,9 @@ struct gss_cl_ctx {
        enum rpc_gss_proc       gc_proc;
        u32                     gc_seq;
        spinlock_t              gc_seq_lock;
-       struct gss_ctx __rcu    *gc_gss_ctx;
+       struct gss_ctx          *gc_gss_ctx;
        struct xdr_netobj       gc_wire_ctx;
+       struct xdr_netobj       gc_acceptor;
        u32                     gc_win;
        unsigned long           gc_expiry;
        struct rcu_head         gc_rcu;
index 5af2931cf58d07daf6d16d2a4deb5956f8811794..df02a41884874f68dfb2aded42a2d38658eaff06 100644 (file)
@@ -81,7 +81,7 @@ struct gss_krb5_enctype {
                       struct xdr_netobj *in,
                       struct xdr_netobj *out); /* complete key generation */
        u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset,
-                          struct xdr_buf *buf, int ec,
+                          struct xdr_buf *buf,
                           struct page **pages); /* v2 encryption function */
        u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset,
                           struct xdr_buf *buf, u32 *headskip,
@@ -310,7 +310,7 @@ gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
 
 u32
 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
-                    struct xdr_buf *buf, int ec,
+                    struct xdr_buf *buf,
                     struct page **pages);
 
 u32
index c2f04e1ae15973a4e15e55d0fb9f34d086628ef7..64a0a0a97b2396492352f99fa1a1a04f225ef06a 100644 (file)
@@ -62,8 +62,6 @@
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
 /* memory registration strategies */
-#define RPCRDMA_PERSISTENT_REGISTRATION (1)
-
 enum rpcrdma_memreg {
        RPCRDMA_BOUNCEBUFFERS = 0,
        RPCRDMA_REGISTER,
index a622ad64acd8686a9baa3ad11ffea8ea91bb6731..2e0a6f92e563d7942b3bf7fa17b43e3f44dc9355 100644 (file)
@@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
        len = (buf + buflen) - delim - 1;
        p = kstrndup(delim + 1, len, GFP_KERNEL);
        if (p) {
-               unsigned long scope_id = 0;
+               u32 scope_id = 0;
                struct net_device *dev;
 
                dev = dev_get_by_name(net, p);
@@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
                        scope_id = dev->ifindex;
                        dev_put(dev);
                } else {
-                       if (strict_strtoul(p, 10, &scope_id) == 0) {
+                       if (kstrtou32(p, 10, &scope_id) == 0) {
                                kfree(p);
                                return 0;
                        }
@@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
  * @sap: buffer into which to plant socket address
  * @salen: size of buffer
  *
- * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and
+ * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
  * rpc_pton() require proper string termination to be successful.
  *
  * Returns the size of the socket address if successful; otherwise
@@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
                          const size_t salen)
 {
        char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
-       unsigned long portlo, porthi;
+       u8 portlo, porthi;
        unsigned short port;
 
        if (uaddr_len > RPCBIND_MAXUADDRLEN)
@@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
        c = strrchr(buf, '.');
        if (unlikely(c == NULL))
                return 0;
-       if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0))
-               return 0;
-       if (unlikely(portlo > 255))
+       if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
                return 0;
 
        *c = '\0';
        c = strrchr(buf, '.');
        if (unlikely(c == NULL))
                return 0;
-       if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0))
-               return 0;
-       if (unlikely(porthi > 255))
+       if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
                return 0;
 
        port = (unsigned short)((porthi << 8) | portlo);
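kstrtou8() rejects values above 255 by itself, which is why the explicit range checks could be dropped. A runnable user-space sketch of the same universal-address port parsing, with strtoul() standing in for the kernel-only kstrtou8():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse the trailing "hi.lo" of an RPC universal address. */
static int uaddr_port(const char *uaddr, unsigned short *port)
{
	char buf[64], *c, *end;
	unsigned long hi, lo;

	if (strlen(uaddr) >= sizeof(buf))
		return -1;
	strcpy(buf, uaddr);

	c = strrchr(buf, '.');
	if (!c)
		return -1;
	lo = strtoul(c + 1, &end, 10);
	if (*end || lo > 255)	/* kstrtou8() performs this check itself */
		return -1;
	*c = '\0';

	c = strrchr(buf, '.');
	if (!c)
		return -1;
	hi = strtoul(c + 1, &end, 10);
	if (*end || hi > 255)
		return -1;

	*port = (unsigned short)((hi << 8) | lo);
	return 0;
}

int main(void)
{
	unsigned short port;

	if (uaddr_port("192.168.0.1.3.232", &port) == 0)
		printf("port %u\n", port);	/* 3 * 256 + 232 = 1000 */
	return 0;
}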
index f773667174200cbabb92c7fc02adfb5d57e3e61c..383eb919ac0be3ed1348528d59f0bd637cb65bf5 100644 (file)
@@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
 
        if (!val)
                goto out_inval;
-       ret = strict_strtoul(val, 0, &num);
+       ret = kstrtoul(val, 0, &num);
        if (ret == -EINVAL)
                goto out_inval;
        nbits = fls(num);
@@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = {
 module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
 MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
 
+static unsigned long auth_max_cred_cachesize = ULONG_MAX;
+module_param(auth_max_cred_cachesize, ulong, 0644);
+MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
+
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
        if (flavor > RPC_AUTH_MAXFLAVOR)
@@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred)
 }
 EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
 
+char *
+rpcauth_stringify_acceptor(struct rpc_cred *cred)
+{
+       if (!cred->cr_ops->crstringify_acceptor)
+               return NULL;
+       return cred->cr_ops->crstringify_acceptor(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
+
 /*
  * Destroy a list of credentials
  */
@@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
        return freed;
 }
 
+static unsigned long
+rpcauth_cache_do_shrink(int nr_to_scan)
+{
+       LIST_HEAD(free);
+       unsigned long freed;
+
+       spin_lock(&rpc_credcache_lock);
+       freed = rpcauth_prune_expired(&free, nr_to_scan);
+       spin_unlock(&rpc_credcache_lock);
+       rpcauth_destroy_credlist(&free);
+
+       return freed;
+}
+
 /*
  * Run memory cache shrinker.
  */
@@ -479,9 +506,6 @@ static unsigned long
 rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 
 {
-       LIST_HEAD(free);
-       unsigned long freed;
-
        if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return SHRINK_STOP;
 
@@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        if (list_empty(&cred_unused))
                return SHRINK_STOP;
 
-       spin_lock(&rpc_credcache_lock);
-       freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
-       spin_unlock(&rpc_credcache_lock);
-       rpcauth_destroy_credlist(&free);
-
-       return freed;
+       return rpcauth_cache_do_shrink(sc->nr_to_scan);
 }
 
 static unsigned long
@@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
        return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
+static void
+rpcauth_cache_enforce_limit(void)
+{
+       unsigned long diff;
+       unsigned int nr_to_scan;
+
+       if (number_cred_unused <= auth_max_cred_cachesize)
+               return;
+       diff = number_cred_unused - auth_max_cred_cachesize;
+       nr_to_scan = 100;
+       if (diff < nr_to_scan)
+               nr_to_scan = diff;
+       rpcauth_cache_do_shrink(nr_to_scan);
+}
+
 /*
  * Look up a process' credentials in the authentication cache
  */
@@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
                if (!entry->cr_ops->crmatch(acred, entry, flags))
                        continue;
+               if (flags & RPCAUTH_LOOKUP_RCU) {
+                       if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+                           !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+                               cred = entry;
+                       break;
+               }
                spin_lock(&cache->lock);
                if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
                        spin_unlock(&cache->lock);
@@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        if (cred != NULL)
                goto found;
 
+       if (flags & RPCAUTH_LOOKUP_RCU)
+               return ERR_PTR(-ECHILD);
+
        new = auth->au_ops->crcreate(auth, acred, flags);
        if (IS_ERR(new)) {
                cred = new;
@@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        } else
                list_add_tail(&new->cr_lru, &free);
        spin_unlock(&cache->lock);
+       rpcauth_cache_enforce_limit();
 found:
        if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
            cred->cr_ops->cr_init != NULL &&
@@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
        memset(&acred, 0, sizeof(acred));
        acred.uid = cred->fsuid;
        acred.gid = cred->fsgid;
-       acred.group_info = get_group_info(((struct cred *)cred)->group_info);
-
+       acred.group_info = cred->group_info;
        ret = auth->au_ops->lookup_cred(auth, &acred, flags);
-       put_group_info(acred.group_info);
        return ret;
 }
 EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
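rpcauth_cache_enforce_limit() trims at most 100 unused creds per lookup once the cache exceeds the new auth_max_cred_cachesize parameter. A runnable user-space sketch of just that clamp:

#include <stdio.h>

static unsigned long number_cred_unused = 260;
static unsigned long max_cred_cachesize = 128;

/* Mirror of the clamp above: shrink by min(excess, 100) per call. */
static unsigned int creds_to_scan(void)
{
	unsigned long diff;
	unsigned int nr_to_scan = 100;

	if (number_cred_unused <= max_cred_cachesize)
		return 0;
	diff = number_cred_unused - max_cred_cachesize;
	if (diff < nr_to_scan)
		nr_to_scan = diff;
	return nr_to_scan;
}

int main(void)
{
	printf("scan %u entries\n", creds_to_scan());	/* 100: excess is 132 */
	return 0;
}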
index ed04869b2d4f4f097ea85e8fe899c2ea262a0aca..6f6b829c9e8ee2bab63ba5f30d03d3c39dc5ba52 100644 (file)
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+       return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
 /*
  * Public call interface for looking up machine creds.
  */
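A kernel-style sketch of how a non-blocking caller (such as nfs_permission() under MAY_NOT_BLOCK) is expected to use the new helper; only the error-handling convention is shown, and the wrapper itself is hypothetical:

static int check_access_rcu(struct rpc_cred **credp)
{
	struct rpc_cred *cred = rpc_lookup_cred_nonblock();

	if (IS_ERR(cred))
		return PTR_ERR(cred);	/* -ECHILD: retry on the blocking path */
	*credp = cred;
	return 0;
}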
index b6e440baccc3733f7b8963ed7ab6fddc72fd0c4c..afb292cd797decf08561492925d87d34d09e485b 100644 (file)
@@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred)
        struct gss_cl_ctx *ctx = NULL;
 
        rcu_read_lock();
-       if (gss_cred->gc_ctx)
-               ctx = gss_get_ctx(gss_cred->gc_ctx);
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (ctx)
+               gss_get_ctx(ctx);
        rcu_read_unlock();
        return ctx;
 }
@@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
                p = ERR_PTR(ret);
                goto err;
        }
-       dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u\n",
-               __func__, ctx->gc_expiry, now, timeout);
-       return q;
+
+       /* is there any trailing data? */
+       if (q == end) {
+               p = q;
+               goto done;
+       }
+
+       /* pull in acceptor name (if there is one) */
+       p = simple_get_netobj(q, end, &ctx->gc_acceptor);
+       if (IS_ERR(p))
+               goto err;
+done:
+       dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
+               __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
+               ctx->gc_acceptor.data);
+       return p;
 err:
        dprintk("RPC:       %s returns error %ld\n", __func__, -PTR_ERR(p));
        return p;
@@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred)
 {
        struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
        struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+       struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
        struct rpc_task *task;
 
-       if (gss_cred->gc_ctx == NULL ||
-           test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+       if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
                return 0;
 
-       gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+       ctx->gc_proc = RPC_GSS_PROC_DESTROY;
        cred->cr_ops = &gss_nullops;
 
        /* Take a reference to ensure the cred will be destroyed either
@@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
 
        gss_delete_sec_context(&ctx->gc_gss_ctx);
        kfree(ctx->gc_wire_ctx.data);
+       kfree(ctx->gc_acceptor.data);
        kfree(ctx);
 }
 
@@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
 {
        struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
        struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
-       struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+       struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
 
        RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
        call_rcu(&cred->cr_rcu, gss_free_cred_callback);
@@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
        return err;
 }
 
+static char *
+gss_stringify_acceptor(struct rpc_cred *cred)
+{
+       char *string = NULL;
+       struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+       struct gss_cl_ctx *ctx;
+       struct xdr_netobj *acceptor;
+
+       rcu_read_lock();
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (!ctx)
+               goto out;
+
+       acceptor = &ctx->gc_acceptor;
+
+       /* no point if there's no string */
+       if (!acceptor->len)
+               goto out;
+
+       /* no sleeping allocations allowed under rcu_read_lock() */
+       string = kmalloc(acceptor->len + 1, GFP_ATOMIC);
+       if (!string)
+               goto out;
+
+       memcpy(string, acceptor->data, acceptor->len);
+       string[acceptor->len] = '\0';
+out:
+       rcu_read_unlock();
+       return string;
+}
+
 /*
  * Returns -EACCES if GSS context is NULL or will expire within the
- * timeout (miliseconds)
+ * timeout (milliseconds)
@@ -1340,15 +1385,16 @@ static int
 gss_key_timeout(struct rpc_cred *rc)
 {
        struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+       struct gss_cl_ctx *ctx;
        unsigned long now = jiffies;
        unsigned long expire;
 
-       if (gss_cred->gc_ctx == NULL)
-               return -EACCES;
-
-       expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ);
-
-       if (time_after(now, expire))
+       rcu_read_lock();
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (ctx)
+               expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+       rcu_read_unlock();
+       if (!ctx || time_after(now, expire))
                return -EACCES;
        return 0;
 }
@@ -1357,13 +1403,19 @@ static int
 gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 {
        struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+       struct gss_cl_ctx *ctx;
        int ret;
 
        if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
                goto out;
        /* Don't match with creds that have expired. */
-       if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+       rcu_read_lock();
+       ctx = rcu_dereference(gss_cred->gc_ctx);
+       if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
+               rcu_read_unlock();
                return 0;
+       }
+       rcu_read_unlock();
        if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
                return 0;
 out:
@@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = {
 };
 
 static const struct rpc_credops gss_credops = {
-       .cr_name        = "AUTH_GSS",
-       .crdestroy      = gss_destroy_cred,
-       .cr_init        = gss_cred_init,
-       .crbind         = rpcauth_generic_bind_cred,
-       .crmatch        = gss_match,
-       .crmarshal      = gss_marshal,
-       .crrefresh      = gss_refresh,
-       .crvalidate     = gss_validate,
-       .crwrap_req     = gss_wrap_req,
-       .crunwrap_resp  = gss_unwrap_resp,
-       .crkey_timeout  = gss_key_timeout,
+       .cr_name                = "AUTH_GSS",
+       .crdestroy              = gss_destroy_cred,
+       .cr_init                = gss_cred_init,
+       .crbind                 = rpcauth_generic_bind_cred,
+       .crmatch                = gss_match,
+       .crmarshal              = gss_marshal,
+       .crrefresh              = gss_refresh,
+       .crvalidate             = gss_validate,
+       .crwrap_req             = gss_wrap_req,
+       .crunwrap_resp          = gss_unwrap_resp,
+       .crkey_timeout          = gss_key_timeout,
+       .crstringify_acceptor   = gss_stringify_acceptor,
 };
 
 static const struct rpc_credops gss_nullops = {
-       .cr_name        = "AUTH_GSS",
-       .crdestroy      = gss_destroy_nullcred,
-       .crbind         = rpcauth_generic_bind_cred,
-       .crmatch        = gss_match,
-       .crmarshal      = gss_marshal,
-       .crrefresh      = gss_refresh_null,
-       .crvalidate     = gss_validate,
-       .crwrap_req     = gss_wrap_req,
-       .crunwrap_resp  = gss_unwrap_resp,
+       .cr_name                = "AUTH_GSS",
+       .crdestroy              = gss_destroy_nullcred,
+       .crbind                 = rpcauth_generic_bind_cred,
+       .crmatch                = gss_match,
+       .crmarshal              = gss_marshal,
+       .crrefresh              = gss_refresh_null,
+       .crvalidate             = gss_validate,
+       .crwrap_req             = gss_wrap_req,
+       .crunwrap_resp          = gss_unwrap_resp,
+       .crstringify_acceptor   = gss_stringify_acceptor,
 };
 
 static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
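The gc_ctx handling above uses both RCU accessor flavors: rcu_dereference() under rcu_read_lock() on lookup paths, and rcu_dereference_protected(..., 1) on destroy paths where no other user can exist. A condensed kernel-style sketch of the pattern (the holder struct is hypothetical):

struct ctx_holder {
	struct gss_cl_ctx __rcu *ctx;
};

static struct gss_cl_ctx *holder_get_ctx(struct ctx_holder *h)
{
	struct gss_cl_ctx *ctx;

	rcu_read_lock();
	ctx = rcu_dereference(h->ctx);	/* only valid inside the section */
	if (ctx)
		gss_get_ctx(ctx);	/* pin it before leaving the section */
	rcu_read_unlock();
	return ctx;
}

static void holder_destroy(struct ctx_holder *h)
{
	/* destroy path: we are the last user, so no locking is needed */
	struct gss_cl_ctx *ctx = rcu_dereference_protected(h->ctx, 1);

	RCU_INIT_POINTER(h->ctx, NULL);
	if (ctx)
		kfree_rcu(ctx, gc_rcu);	/* the real code uses call_rcu() so
					 * that sub-objects are freed too */
}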
index 0f43e894bc0a47e913ca5999afc69d392cc6e6ad..f5ed9f6ece0699cbc89208f278554962f9409912 100644 (file)
@@ -641,7 +641,7 @@ out:
 
 u32
 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
-                    struct xdr_buf *buf, int ec, struct page **pages)
+                    struct xdr_buf *buf, struct page **pages)
 {
        u32 err;
        struct xdr_netobj hmac;
@@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
                ecptr = buf->tail[0].iov_base;
        }
 
-       memset(ecptr, 'X', ec);
-       buf->tail[0].iov_len += ec;
-       buf->len += ec;
-
-       /* copy plaintext gss token header after filler (if any) */
+       /* copy plaintext gss token header */
-       memcpy(ecptr + ec, buf->head[0].iov_base + offset,
-                                               GSS_KRB5_TOK_HDR_LEN);
+       memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
        buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
        buf->len += GSS_KRB5_TOK_HDR_LEN;
 
index 62ae3273186cdd94545d26742ae7a2ece246a685..42768e5c3994e3d4570bdea259ab6c7f658f76c0 100644 (file)
 
 DEFINE_SPINLOCK(krb5_seq_lock);
 
-static char *
+static void *
 setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
-       __be16 *ptr, *krb5_hdr;
+       u16 *ptr;
+       void *krb5_hdr;
        int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
 
        token->len = g_token_size(&ctx->mech_used, body_size);
 
-       ptr = (__be16 *)token->data;
+       ptr = (u16 *)token->data;
        g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
 
        /* ptr now at start of header described in rfc 1964, section 1.2.1: */
        krb5_hdr = ptr;
        *ptr++ = KG_TOK_MIC_MSG;
-       *ptr++ = cpu_to_le16(ctx->gk5e->signalg);
+       /*
+        * signalg is stored as if it were converted from LE to host endian, even
+        * though it's an opaque pair of bytes according to the RFC.
+        */
+       *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
        *ptr++ = SEAL_ALG_NONE;
-       *ptr++ = 0xffff;
+       *ptr = 0xffff;
 
-       return (char *)krb5_hdr;
+       return krb5_hdr;
 }
 
 static void *
 setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
-       __be16 *ptr, *krb5_hdr;
+       u16 *ptr;
+       void *krb5_hdr;
        u8 *p, flags = 0x00;
 
        if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
@@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
 
        /* Per rfc 4121, sec 4.2.6.1, there is no header,
         * just start the token */
-       krb5_hdr = ptr = (__be16 *)token->data;
+       krb5_hdr = ptr = (u16 *)token->data;
 
        *ptr++ = KG2_TOK_MIC;
        p = (u8 *)ptr;
        *p++ = flags;
        *p++ = 0xff;
-       ptr = (__be16 *)p;
-       *ptr++ = 0xffff;
+       ptr = (u16 *)p;
        *ptr++ = 0xffff;
+       *ptr = 0xffff;
 
        token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
        return krb5_hdr;
@@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
        spin_lock(&krb5_seq_lock);
        seq_send = ctx->seq_send64++;
        spin_unlock(&krb5_seq_lock);
-       *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+       *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
 
        if (ctx->initiate) {
                cksumkey = ctx->initiator_sign;
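The __force cast above is the standard way to tell sparse that a byte pair defined by layout, not by integer value, is being written deliberately. A condensed kernel-style sketch:

/* Store an opaque two-byte field whose on-the-wire layout happens
 * to match a little-endian encoding of the value. */
static void put_opaque_le16(u16 *ptr, u16 val)
{
	/* __force: intentional reinterpretation, not a conversion bug */
	*ptr = (__force u16)cpu_to_le16(val);
}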
index 42560e55d9789e946f5c02e73a0ed4b6c179409d..4b614c604fe09afd8a7ef03c635662d2117d05db 100644 (file)
@@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 
        msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
 
-       *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
-       memset(ptr + 4, 0xff, 4);
-       *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+       /*
+        * signalg and sealalg are stored as if they were converted from LE
+        * to host endian, even though they're opaque pairs of bytes according
+        * to the RFC.
+        */
+       *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+       *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+       ptr[6] = 0xff;
+       ptr[7] = 0xff;
 
        gss_krb5_make_confounder(msg_start, conflen);
 
@@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
        u8              *ptr, *plainhdr;
        s32             now;
        u8              flags = 0x00;
-       __be16          *be16ptr, ec = 0;
+       __be16          *be16ptr;
        __be64          *be64ptr;
        u32             err;
 
@@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
        be16ptr = (__be16 *)ptr;
 
        blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
-       *be16ptr++ = cpu_to_be16(ec);
+       *be16ptr++ = 0;
        /* "inner" token header always uses 0 for RRC */
-       *be16ptr++ = cpu_to_be16(0);
+       *be16ptr++ = 0;
 
        be64ptr = (__be64 *)be16ptr;
        spin_lock(&krb5_seq_lock);
        *be64ptr = cpu_to_be64(kctx->seq_send64++);
        spin_unlock(&krb5_seq_lock);
 
-       err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages);
+       err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
        if (err)
                return err;
 
index f0ebe07978a236e66744bc2dfe332a92bfb85d05..712c123e04e9ec43464581115b7bfb4cf42e514b 100644 (file)
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
+       if (flags & RPCAUTH_LOOKUP_RCU)
+               return &null_cred;
        return get_rpccred(&null_cred);
 }
 
index 2e6ab10734f6869af45a422bad6253e9de5cd580..488ddeed9363db71e1b5a80df4863a464345cb20 100644 (file)
@@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task)
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ENETUNREACH:
+       case -ENOBUFS:
        case -EPIPE:
                dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
                                task->tk_pid, task->tk_status);
@@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task)
        case -ECONNABORTED:
        case -ENETUNREACH:
        case -EHOSTUNREACH:
+       case -ENOBUFS:
+       case -EPIPE:
                if (RPC_IS_SOFTCONN(task))
                        break;
                /* retry with existing socket, after a delay */
@@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task)
        case -ECONNRESET:
        case -ECONNABORTED:
        case -ENOTCONN:
+       case -ENOBUFS:
        case -EPIPE:
                rpc_task_force_reencode(task);
        }
@@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task)
        case -ECONNRESET:
        case -ECONNABORTED:
                rpc_force_rebind(clnt);
+       case -ENOBUFS:
                rpc_delay(task, 3*HZ);
        case -EPIPE:
        case -ENOTCONN:
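call_status() above leans on deliberate switch fallthrough: a reset connection forces a rebind, then falls into the -ENOBUFS delay, which falls into the plain retry. A runnable user-space sketch of the same escalating triage (handler names are illustrative):

#include <errno.h>
#include <stdio.h>

static void force_rebind(void) { puts("rebind"); }
static void delay_retry(void)  { puts("delay"); }
static void retry(void)        { puts("retry"); }

static void triage(int err)
{
	switch (err) {
	case -ECONNRESET:
	case -ECONNABORTED:
		force_rebind();
		/* fall through: a rebound connection also wants a delay */
	case -ENOBUFS:
		delay_retry();
		/* fall through: and finally a retry */
	case -EPIPE:
	case -ENOTCONN:
		retry();
		break;
	default:
		fprintf(stderr, "unhandled error %d\n", err);
	}
}

int main(void)
{
	triage(-ENOBUFS);	/* prints "delay" then "retry" */
	return 0;
}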
index b185548985622c0c23b1e0dc01acb783ee81246a..2d12b76b5a64f958e4fa8f45ffaeea00a5d857a9 100644 (file)
@@ -195,7 +195,7 @@ static struct inode *
 rpc_alloc_inode(struct super_block *sb)
 {
        struct rpc_inode *rpci;
-       rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+       rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
        if (!rpci)
                return NULL;
        return &rpci->vfs_inode;
index 51c63165073c08044bf94b081e3cd1f698de4e82..56e4e150e80ee8931e4f15e0fe4f5d9527f07b1f 100644 (file)
@@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task)
        case -ECONNABORTED:
        case -ENETUNREACH:
        case -EHOSTUNREACH:
+       case -EPIPE:
        case -EAGAIN:
                dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
                break;
index 693966d3f33ba12c1220538ff58f632ae762562d..6166c985fe24850b94f4cccb56dfceee6bbd575d 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-enum rpcrdma_chunktype {
-       rpcrdma_noch = 0,
-       rpcrdma_readch,
-       rpcrdma_areadch,
-       rpcrdma_writech,
-       rpcrdma_replych
-};
-
 #ifdef RPC_DEBUG
 static const char transfertypes[][12] = {
        "pure inline",  /* no chunks */
@@ -279,12 +271,36 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-       for (pos = 0; nchunks--;)
-               pos += rpcrdma_deregister_external(
-                               &req->rl_segments[pos], r_xprt);
+       if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
+               for (pos = 0; nchunks--;)
+                       pos += rpcrdma_deregister_external(
+                                       &req->rl_segments[pos], r_xprt);
+       }
        return n;
 }
 
+/*
+ * Marshal chunks. This routine returns the header length
+ * consumed by marshaling.
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+ssize_t
+rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
+{
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+
+       if (req->rl_rtype != rpcrdma_noch)
+               result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+                                              headerp, req->rl_rtype);
+       else if (req->rl_wtype != rpcrdma_noch)
+               result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+                                              headerp, req->rl_wtype);
+       return result;
+}
+
 /*
  * Copy write data inline.
  * This function is used for "small" requests. Data which is passed
@@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        char *base;
        size_t rpclen, padlen;
        ssize_t hdrlen;
-       enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
 
        /*
@@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * into pages; otherwise use reply chunks.
         */
        if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-               wtype = rpcrdma_noch;
+               req->rl_wtype = rpcrdma_noch;
        else if (rqst->rq_rcv_buf.page_len == 0)
-               wtype = rpcrdma_replych;
+               req->rl_wtype = rpcrdma_replych;
        else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-               wtype = rpcrdma_writech;
+               req->rl_wtype = rpcrdma_writech;
        else
-               wtype = rpcrdma_replych;
+               req->rl_wtype = rpcrdma_replych;
 
        /*
         * Chunks needed for arguments?
@@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * TBD check NFSv4 setacl
         */
        if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-               rtype = rpcrdma_noch;
+               req->rl_rtype = rpcrdma_noch;
        else if (rqst->rq_snd_buf.page_len == 0)
-               rtype = rpcrdma_areadch;
+               req->rl_rtype = rpcrdma_areadch;
        else
-               rtype = rpcrdma_readch;
+               req->rl_rtype = rpcrdma_readch;
 
        /* The following simplification is not true forever */
-       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
-               wtype = rpcrdma_noch;
-       if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+       if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
+               req->rl_wtype = rpcrdma_noch;
+       if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
                dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
                        __func__);
                return -EIO;
@@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * When padding is in use and applies to the transfer, insert
         * it and change the message type.
         */
-       if (rtype == rpcrdma_noch) {
+       if (req->rl_rtype == rpcrdma_noch) {
 
                padlen = rpcrdma_inline_pullup(rqst,
                                                RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                        headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
                        headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
                        hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-                       if (wtype != rpcrdma_noch) {
+                       if (req->rl_wtype != rpcrdma_noch) {
                                dprintk("RPC:       %s: invalid chunk list\n",
                                        __func__);
                                return -EIO;
@@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                         * on receive. Therefore, we request a reply chunk
                         * for non-writes wherever feasible and efficient.
                         */
-                       if (wtype == rpcrdma_noch)
-                               wtype = rpcrdma_replych;
+                       if (req->rl_wtype == rpcrdma_noch)
+                               req->rl_wtype = rpcrdma_replych;
                }
        }
 
-       /*
-        * Marshal chunks. This routine will return the header length
-        * consumed by marshaling.
-        */
-       if (rtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst,
-                                       &rqst->rq_snd_buf, headerp, rtype);
-               wtype = rtype;  /* simplify dprintk */
-
-       } else if (wtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst,
-                                       &rqst->rq_rcv_buf, headerp, wtype);
-       }
+       hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
        if (hdrlen < 0)
                return hdrlen;
 
        dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
                " headerp 0x%p base 0x%p lkey 0x%x\n",
-               __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+               __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
                headerp, base, req->rl_iov.lkey);
 
        /*
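rpcrdma_marshal_req() picks chunk types from the shape of the XDR buffer: small payloads go inline, page-bearing receive buffers get write chunks, and everything else falls back to a reply chunk. A runnable user-space sketch of the wtype decision tree (threshold and field names are illustrative stand-ins):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum chunktype { noch, readch, areadch, writech, replych };

static enum chunktype pick_wtype(size_t buflen, size_t page_len,
				 bool xdrbuf_read, size_t inline_threshold)
{
	if (buflen <= inline_threshold)
		return noch;	/* small enough to be sent inline */
	if (page_len == 0)
		return replych;	/* no pages: ask for a reply chunk */
	if (xdrbuf_read)
		return writech;	/* pages + direct read: write chunk */
	return replych;
}

int main(void)
{
	/* 8 KiB reply with pages and XDRBUF_READ set: expect writech (3) */
	printf("%d\n", pick_wtype(8192, 4096, true, 1024));
	return 0;
}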
index 66f91f0d071a9bbdec3e440aaa09c94df57ffbe0..2faac49405633000f7aa528085a7ecf7994015fd 100644 (file)
@@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args)
 
        xprt->resvport = 0;             /* privileged port not needed */
        xprt->tsh_size = 0;             /* RPC-RDMA handles framing */
-       xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
        xprt->ops = &xprt_rdma_procs;
 
        /*
@@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args)
        new_ep->rep_xprt = xprt;
 
        xprt_rdma_format_addresses(xprt);
+       xprt->max_payload = rpcrdma_max_payload(new_xprt);
+       dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
+               __func__, xprt->max_payload);
 
        if (!try_module_get(THIS_MODULE))
                goto out4;
@@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
        if (r_xprt->rx_ep.rep_connected > 0)
                xprt->reestablish_timeout = 0;
        xprt_disconnect_done(xprt);
-       (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+       rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 }
 
 static void
@@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task)
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       int rc;
+       int rc = 0;
 
-       if (req->rl_niovs == 0) {
+       if (req->rl_niovs == 0)
                rc = rpcrdma_marshal_req(rqst);
-               if (rc < 0)
-                       goto failed_marshal;
-       }
+       else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+               rc = rpcrdma_marshal_chunks(rqst, 0);
+       if (rc < 0)
+               goto failed_marshal;
 
        if (req->rl_reply == NULL)              /* e.g. reconnection */
                rpcrdma_recv_buffer_get(req);
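Caching rl_rtype/rl_wtype in the request is what allows the send path to re-run only the chunk marshaling on a retransmit, when FRMRs may have been reset, rather than rebuilding the whole header. A condensed kernel-style restatement of the decision above:

static int prepare_rdma_send(struct rpc_rqst *rqst,
			     struct rpcrdma_xprt *r_xprt,
			     struct rpcrdma_req *req)
{
	int rc = 0;

	if (req->rl_niovs == 0)			/* first transmission */
		rc = rpcrdma_marshal_req(rqst);
	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
		rc = rpcrdma_marshal_chunks(rqst, 0);	/* re-register chunks */
	return rc < 0 ? rc : 0;
}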
index 13dbd1c389ff07b02c6fa362ebbbf5fd5b6662d7..61c41298b4ea7b09b727548bb1a00d71b60d50b9 100644 (file)
@@ -61,6 +61,8 @@
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
+static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+
 /*
  * internal functions
  */
@@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data)
 
 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 
-static inline void
-rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
-       list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
-       spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-       tasklet_schedule(&rpcrdma_tasklet_g);
-}
-
 static void
 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
@@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
        if (wc->wr_id == 0ULL)
                return;
        if (wc->status != IB_WC_SUCCESS)
-               return;
-
-       if (wc->opcode == IB_WC_FAST_REG_MR)
-               frmr->r.frmr.state = FRMR_IS_VALID;
-       else if (wc->opcode == IB_WC_LOCAL_INV)
-               frmr->r.frmr.state = FRMR_IS_INVALID;
+               frmr->r.frmr.fr_state = FRMR_IS_STALE;
 }
 
 static int
@@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
 }
 
 static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
 {
        struct rpcrdma_rep *rep =
                        (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc)
        }
 
 out_schedule:
-       rpcrdma_schedule_tasklet(rep);
+       list_add_tail(&rep->rr_list, sched_list);
 }
 
 static int
 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 {
+       struct list_head sched_list;
        struct ib_wc *wcs;
        int budget, count, rc;
+       unsigned long flags;
 
+       INIT_LIST_HEAD(&sched_list);
        budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
        do {
                wcs = ep->rep_recv_wcs;
 
                rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
                if (rc <= 0)
-                       return rc;
+                       goto out_schedule;
 
                count = rc;
                while (count-- > 0)
-                       rpcrdma_recvcq_process_wc(wcs++);
+                       rpcrdma_recvcq_process_wc(wcs++, &sched_list);
        } while (rc == RPCRDMA_POLLSIZE && --budget);
-       return 0;
+       rc = 0;
+
+out_schedule:
+       spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+       list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
+       spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+       tasklet_schedule(&rpcrdma_tasklet_g);
+       return rc;
 }
 
 /*
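Gathering completions onto a private list and splicing them into the global tasklet list under one lock acquisition is a classic batching pattern; the old code took rpcrdma_tk_lock_g once per completion. A kernel-style sketch (handle_one is a hypothetical per-completion handler):

static void handle_one(struct ib_wc *wc, struct list_head *sched_list);

static void process_batch(struct ib_wc *wcs, int count,
			  spinlock_t *lock, struct list_head *global)
{
	LIST_HEAD(local);	/* private, needs no locking */
	unsigned long flags;

	while (count-- > 0)
		handle_one(wcs++, &local);

	spin_lock_irqsave(lock, flags);
	list_splice_tail(&local, global);	/* one round-trip for N items */
	spin_unlock_irqrestore(lock, flags);
}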
@@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
        rpcrdma_recvcq_poll(cq, ep);
 }
 
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+       rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+       rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
 #ifdef RPC_DEBUG
 static const char * const conn[] = {
        "address resolved",
@@ -323,8 +326,16 @@ static const char * const conn[] = {
        "rejected",
        "established",
        "disconnected",
-       "device removal"
+       "device removal",
+       "multicast join",
+       "multicast error",
+       "address change",
+       "timewait exit",
 };
+
+#define CONNECTION_MSG(status)                                         \
+       ((status) < ARRAY_SIZE(conn) ?                                  \
+               conn[(status)] : "unrecognized connection error")
 #endif
 
 static int
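CONNECTION_MSG() is a bounds-checked table lookup, so unexpected CM event codes map to a fallback string instead of indexing past conn[]. A runnable user-space sketch of the same idiom:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char * const conn[] = {
	"address resolved",
	"established",
	"disconnected",
};

#define CONNECTION_MSG(status)				\
	((unsigned int)(status) < ARRAY_SIZE(conn) ?	\
		conn[(status)] : "unrecognized connection error")

int main(void)
{
	printf("%s\n", CONNECTION_MSG(1));	/* "established" */
	printf("%s\n", CONNECTION_MSG(42));	/* fallback string */
	return 0;
}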
@@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                connstate = -ENODEV;
 connected:
-               dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
-                       __func__,
-                       (event->event <= 11) ? conn[event->event] :
-                                               "unknown connection error",
-                       &addr->sin_addr.s_addr,
-                       ntohs(addr->sin_port),
-                       ep, event->event);
                atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
                dprintk("RPC:       %s: %sconnected\n",
                                        __func__, connstate > 0 ? "" : "dis");
                ep->rep_connected = connstate;
                ep->rep_func(ep);
                wake_up_all(&ep->rep_connect_wait);
-               break;
+               /*FALLTHROUGH*/
        default:
-               dprintk("RPC:       %s: unexpected CM event %d\n",
-                       __func__, event->event);
+               dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
+                       __func__, &addr->sin_addr.s_addr,
+                       ntohs(addr->sin_port), ep,
+                       CONNECTION_MSG(event->event));
                break;
        }
 
@@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                if (!ia->ri_id->device->alloc_fmr) {
                        dprintk("RPC:       %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
-#if RPCRDMA_PERSISTENT_REGISTRATION
                        memreg = RPCRDMA_ALLPHYSICAL;
-#else
-                       rc = -ENOMEM;
-                       goto out2;
-#endif
                }
        }
 
@@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        switch (memreg) {
        case RPCRDMA_FRMR:
                break;
-#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
                mem_priv = IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_WRITE |
                                IB_ACCESS_REMOTE_READ;
                goto register_setup;
-#endif
        case RPCRDMA_MTHCAFMR:
                if (ia->ri_have_dma_lkey)
                        break;
                mem_priv = IB_ACCESS_LOCAL_WRITE;
-#if RPCRDMA_PERSISTENT_REGISTRATION
        register_setup:
-#endif
                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
                if (IS_ERR(ia->ri_bind_mem)) {
                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        /* Else will do memory reg/dereg for each chunk */
        ia->ri_memreg_strategy = memreg;
 
+       rwlock_init(&ia->ri_qplock);
        return 0;
 out2:
        rdma_destroy_id(ia->ri_id);
@@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
        cancel_delayed_work_sync(&ep->rep_connect_worker);
 
        if (ia->ri_id->qp) {
-               rc = rpcrdma_ep_disconnect(ep, ia);
-               if (rc)
-                       dprintk("RPC:       %s: rpcrdma_ep_disconnect"
-                               " returned %i\n", __func__, rc);
+               rpcrdma_ep_disconnect(ep, ia);
                rdma_destroy_qp(ia->ri_id);
                ia->ri_id->qp = NULL;
        }
@@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 int
 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-       struct rdma_cm_id *id;
+       struct rdma_cm_id *id, *old;
        int rc = 0;
        int retry_count = 0;
 
@@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                struct rpcrdma_xprt *xprt;
 retry:
                dprintk("RPC:       %s: reconnecting...\n", __func__);
-               rc = rpcrdma_ep_disconnect(ep, ia);
-               if (rc && rc != -ENOTCONN)
-                       dprintk("RPC:       %s: rpcrdma_ep_disconnect"
-                               " status %i\n", __func__, rc);
 
-               rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-               rpcrdma_clean_cq(ep->rep_attr.send_cq);
+               rpcrdma_ep_disconnect(ep, ia);
+               rpcrdma_flush_cqs(ep);
+
+               if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+                       rpcrdma_reset_frmrs(ia);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
@@ -905,9 +899,14 @@ retry:
                        rc = -ENETUNREACH;
                        goto out;
                }
-               rdma_destroy_qp(ia->ri_id);
-               rdma_destroy_id(ia->ri_id);
+
+               write_lock(&ia->ri_qplock);
+               old = ia->ri_id;
                ia->ri_id = id;
+               write_unlock(&ia->ri_qplock);
+
+               rdma_destroy_qp(old);
+               rdma_destroy_id(old);
        } else {
                dprintk("RPC:       %s: connecting...\n", __func__);
                rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -974,13 +973,12 @@ out:
  * This call is not reentrant, and must not be made in parallel
  * on the same endpoint.
  */
-int
+void
 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
 
-       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-       rpcrdma_clean_cq(ep->rep_attr.send_cq);
+       rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
@@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
                ep->rep_connected = rc;
        }
+}
+
+static int
+rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+       int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+       struct ib_fmr_attr fmr_attr = {
+               .max_pages      = RPCRDMA_MAX_DATA_SEGS,
+               .max_maps       = 1,
+               .page_shift     = PAGE_SHIFT
+       };
+       struct rpcrdma_mw *r;
+       int i, rc;
+
+       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+       dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
+
+       while (i--) {
+               r = kzalloc(sizeof(*r), GFP_KERNEL);
+               if (r == NULL)
+                       return -ENOMEM;
+
+               r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
+               if (IS_ERR(r->r.fmr)) {
+                       rc = PTR_ERR(r->r.fmr);
+                       dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
+                               __func__, rc);
+                       goto out_free;
+               }
+
+               list_add(&r->mw_list, &buf->rb_mws);
+               list_add(&r->mw_all, &buf->rb_all);
+       }
+       return 0;
+
+out_free:
+       kfree(r);
+       return rc;
+}
+
+static int
+rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_frmr *f;
+       struct rpcrdma_mw *r;
+       int i, rc;
+
+       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+       dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
+
+       while (i--) {
+               r = kzalloc(sizeof(*r), GFP_KERNEL);
+               if (r == NULL)
+                       return -ENOMEM;
+               f = &r->r.frmr;
+
+               f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+                                               ia->ri_max_frmr_depth);
+               if (IS_ERR(f->fr_mr)) {
+                       rc = PTR_ERR(f->fr_mr);
+                       dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
+                               "failed %i\n", __func__, rc);
+                       goto out_free;
+               }
+
+               f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
+                                                       ia->ri_max_frmr_depth);
+               if (IS_ERR(f->fr_pgl)) {
+                       rc = PTR_ERR(f->fr_pgl);
+                       dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
+                               "failed %i\n", __func__, rc);
+
+                       ib_dereg_mr(f->fr_mr);
+                       goto out_free;
+               }
+
+               list_add(&r->mw_list, &buf->rb_mws);
+               list_add(&r->mw_all, &buf->rb_all);
+       }
+
+       return 0;
+
+out_free:
+       kfree(r);
        return rc;
 }
 
-/*
- * Initialize buffer memory
- */
 int
 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
@@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        char *p;
        size_t len, rlen, wlen;
        int i, rc;
-       struct rpcrdma_mw *r;
 
        buf->rb_max_requests = cdata->max_requests;
        spin_lock_init(&buf->rb_lock);
@@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
         *   2.  arrays of struct rpcrdma_req to fill in pointers
         *   3.  array of struct rpcrdma_rep for replies
         *   4.  padding, if any
-        *   5.  mw's, fmr's or frmr's, if any
         * Send/recv buffers in req/rep need to be registered
         */
-
        len = buf->rb_max_requests *
                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
        len += cdata->padding;
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-               len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
-                               sizeof(struct rpcrdma_mw);
-               break;
-       case RPCRDMA_MTHCAFMR:
-               /* TBD we are perhaps overallocating here */
-               len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
-                               sizeof(struct rpcrdma_mw);
-               break;
-       default:
-               break;
-       }
 
-       /* allocate 1, 4 and 5 in one shot */
        p = kzalloc(len, GFP_KERNEL);
        if (p == NULL) {
                dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        p += cdata->padding;
 
        INIT_LIST_HEAD(&buf->rb_mws);
-       r = (struct rpcrdma_mw *)p;
+       INIT_LIST_HEAD(&buf->rb_all);
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
-               for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
-                       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-                                               ia->ri_max_frmr_depth);
-                       if (IS_ERR(r->r.frmr.fr_mr)) {
-                               rc = PTR_ERR(r->r.frmr.fr_mr);
-                               dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
-                                       " failed %i\n", __func__, rc);
-                               goto out;
-                       }
-                       r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
-                                               ia->ri_id->device,
-                                               ia->ri_max_frmr_depth);
-                       if (IS_ERR(r->r.frmr.fr_pgl)) {
-                               rc = PTR_ERR(r->r.frmr.fr_pgl);
-                               dprintk("RPC:       %s: "
-                                       "ib_alloc_fast_reg_page_list "
-                                       "failed %i\n", __func__, rc);
-
-                               ib_dereg_mr(r->r.frmr.fr_mr);
-                               goto out;
-                       }
-                       list_add(&r->mw_list, &buf->rb_mws);
-                       ++r;
-               }
+               rc = rpcrdma_init_frmrs(ia, buf);
+               if (rc)
+                       goto out;
                break;
        case RPCRDMA_MTHCAFMR:
-               /* TBD we are perhaps overallocating here */
-               for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
-                       static struct ib_fmr_attr fa =
-                               { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
-                       r->r.fmr = ib_alloc_fmr(ia->ri_pd,
-                               IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
-                               &fa);
-                       if (IS_ERR(r->r.fmr)) {
-                               rc = PTR_ERR(r->r.fmr);
-                               dprintk("RPC:       %s: ib_alloc_fmr"
-                                       " failed %i\n", __func__, rc);
-                               goto out;
-                       }
-                       list_add(&r->mw_list, &buf->rb_mws);
-                       ++r;
-               }
+               rc = rpcrdma_init_fmrs(ia, buf);
+               if (rc)
+                       goto out;
                break;
        default:
                break;
@@ -1176,24 +1204,57 @@ out:
        return rc;
 }
 
-/*
- * Unregister and destroy buffer memory. Need to deal with
- * partial initialization, so it's callable from failed create.
- * Must be called before destroying endpoint, as registrations
- * reference it.
- */
+static void
+rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int rc;
+
+       while (!list_empty(&buf->rb_all)) {
+               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+               list_del(&r->mw_all);
+               list_del(&r->mw_list);
+
+               rc = ib_dealloc_fmr(r->r.fmr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+                               __func__, rc);
+
+               kfree(r);
+       }
+}
+
+static void
+rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int rc;
+
+       while (!list_empty(&buf->rb_all)) {
+               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+               list_del(&r->mw_all);
+               list_del(&r->mw_list);
+
+               rc = ib_dereg_mr(r->r.frmr.fr_mr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+                               __func__, rc);
+               ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+               kfree(r);
+       }
+}
+
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-       int rc, i;
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-       struct rpcrdma_mw *r;
+       int i;
 
        /* clean up in reverse order from create
         *   1.  recv mr memory (mr free, then kfree)
         *   2.  send mr memory (mr free, then kfree)
-        *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
-        *   4.  arrays
+        *   3.  MWs
         */
        dprintk("RPC:       %s: entering\n", __func__);
 
@@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                }
        }
 
+       switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               rpcrdma_destroy_frmrs(buf);
+               break;
+       case RPCRDMA_MTHCAFMR:
+               rpcrdma_destroy_fmrs(buf);
+               break;
+       default:
+               break;
+       }
+
+       kfree(buf->rb_pool);
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each.  FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_xprt *r_xprt =
+                               container_of(ia, struct rpcrdma_xprt, rx_ia);
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct list_head *pos;
+       struct rpcrdma_mw *r;
+       int rc;
+
+       list_for_each(pos, &buf->rb_all) {
+               r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+               if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+                       continue;
+
+               rc = ib_dereg_mr(r->r.frmr.fr_mr);
+               if (rc)
+                       dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+                               __func__, rc);
+               ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+               r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+                                       ia->ri_max_frmr_depth);
+               if (IS_ERR(r->r.frmr.fr_mr)) {
+                       rc = PTR_ERR(r->r.frmr.fr_mr);
+                       dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+                               " failed %i\n", __func__, rc);
+                       continue;
+               }
+               r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+                                       ia->ri_id->device,
+                                       ia->ri_max_frmr_depth);
+               if (IS_ERR(r->r.frmr.fr_pgl)) {
+                       rc = PTR_ERR(r->r.frmr.fr_pgl);
+                       dprintk("RPC:       %s: "
+                               "ib_alloc_fast_reg_page_list "
+                               "failed %i\n", __func__, rc);
+
+                       ib_dereg_mr(r->r.frmr.fr_mr);
+                       continue;
+               }
+               r->r.frmr.fr_state = FRMR_IS_INVALID;
+       }
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+       if (*mw) {
+               list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+               *mw = NULL;
+       }
+}
+
+/* Cycle MWs back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_segments;
+       struct rpcrdma_mr_seg *seg1 = seg;
+       int i;
+
+       for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+               rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+       rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       buf->rb_send_bufs[--buf->rb_send_index] = req;
+       req->rl_niovs = 0;
+       if (req->rl_reply) {
+               buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+               req->rl_reply->rr_func = NULL;
+               req->rl_reply = NULL;
+       }
+}
+
+/* The rpcrdma_unmap_one() calls were already done by
+ * rpcrdma_deregister_frmr_external(). Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_xprt *r_xprt =
+                               container_of(ia, struct rpcrdma_xprt, rx_ia);
+       struct ib_send_wr invalidate_wr, *bad_wr;
+       int rc;
+
+       dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
+
+       /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+       r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+       memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+       invalidate_wr.wr_id = (unsigned long)(void *)r;
+       invalidate_wr.opcode = IB_WR_LOCAL_INV;
+       invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+
+       dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
+               __func__, r, r->r.frmr.fr_mr->rkey);
+
+       read_lock(&ia->ri_qplock);
+       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+       read_unlock(&ia->ri_qplock);
+       if (rc) {
+               /* Force rpcrdma_buffer_get() to retry */
+               r->r.frmr.fr_state = FRMR_IS_STALE;
+               dprintk("RPC:       %s: ib_post_send failed, %i\n",
+                       __func__, rc);
+       }
+}
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+                          struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+       struct list_head *pos;
+       struct rpcrdma_mw *r;
+       unsigned long flags;
+
+       list_for_each(pos, stale) {
+               r = list_entry(pos, struct rpcrdma_mw, mw_list);
+               rpcrdma_retry_local_inv(r, ia);
+       }
+
+       spin_lock_irqsave(&buf->rb_lock, flags);
+       list_splice_tail(stale, &buf->rb_mws);
+       spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+                        struct list_head *stale)
+{
+       struct rpcrdma_mw *r;
+       int i;
+
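+       /* Fill rl_segments from the tail of the array down to 0; MWs
+        * marked FRMR_IS_STALE by a flushed completion are parked on
+        * @stale for the caller to recover via LOCAL_INV.
+        */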
+       i = RPCRDMA_MAX_SEGS - 1;
        while (!list_empty(&buf->rb_mws)) {
                r = list_entry(buf->rb_mws.next,
-                       struct rpcrdma_mw, mw_list);
+                              struct rpcrdma_mw, mw_list);
                list_del(&r->mw_list);
-               switch (ia->ri_memreg_strategy) {
-               case RPCRDMA_FRMR:
-                       rc = ib_dereg_mr(r->r.frmr.fr_mr);
-                       if (rc)
-                               dprintk("RPC:       %s:"
-                                       " ib_dereg_mr"
-                                       " failed %i\n",
-                                       __func__, rc);
-                       ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-                       break;
-               case RPCRDMA_MTHCAFMR:
-                       rc = ib_dealloc_fmr(r->r.fmr);
-                       if (rc)
-                               dprintk("RPC:       %s:"
-                                       " ib_dealloc_fmr"
-                                       " failed %i\n",
-                                       __func__, rc);
-                       break;
-               default:
-                       break;
+               if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+                       list_add(&r->mw_list, stale);
+                       continue;
                }
+               req->rl_segments[i].mr_chunk.rl_mw = r;
+               if (unlikely(i-- == 0))
+                       return req;     /* Success */
        }
 
-       kfree(buf->rb_pool);
+       /* Not enough entries on rb_mws for this req */
+       rpcrdma_buffer_put_sendbuf(req, buf);
+       rpcrdma_buffer_put_mrs(req, buf);
+       return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int i;
+
+       i = RPCRDMA_MAX_SEGS - 1;
+       while (!list_empty(&buf->rb_mws)) {
+               r = list_entry(buf->rb_mws.next,
+                              struct rpcrdma_mw, mw_list);
+               list_del(&r->mw_list);
+               req->rl_segments[i].mr_chunk.rl_mw = r;
+               if (unlikely(i-- == 0))
+                       return req;     /* Success */
+       }
+
+       /* Not enough entries on rb_mws for this req */
+       rpcrdma_buffer_put_sendbuf(req, buf);
+       rpcrdma_buffer_put_mrs(req, buf);
+       return NULL;
 }
 
 /*
@@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
+       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+       struct list_head stale;
        struct rpcrdma_req *req;
        unsigned long flags;
-       int i;
-       struct rpcrdma_mw *r;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
        if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
-       if (!list_empty(&buffers->rb_mws)) {
-               i = RPCRDMA_MAX_SEGS - 1;
-               do {
-                       r = list_entry(buffers->rb_mws.next,
-                                       struct rpcrdma_mw, mw_list);
-                       list_del(&r->mw_list);
-                       req->rl_segments[i].mr_chunk.rl_mw = r;
-               } while (--i >= 0);
+
+       INIT_LIST_HEAD(&stale);
+       switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+               break;
+       case RPCRDMA_MTHCAFMR:
+               req = rpcrdma_buffer_get_fmrs(req, buffers);
+               break;
+       default:
+               break;
        }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
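+       /* Post LOCAL_INV for any stale FRMRs outside rb_lock before
+        * they are returned to rb_mws.
+        */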
+       if (!list_empty(&stale))
+               rpcrdma_retry_flushed_linv(&stale, buffers);
        return req;
 }
 
@@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-       int i;
        unsigned long flags;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
-       buffers->rb_send_bufs[--buffers->rb_send_index] = req;
-       req->rl_niovs = 0;
-       if (req->rl_reply) {
-               buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
-               req->rl_reply->rr_func = NULL;
-               req->rl_reply = NULL;
-       }
+       rpcrdma_buffer_put_sendbuf(req, buffers);
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
        case RPCRDMA_MTHCAFMR:
-               /*
-                * Cycle mw's back in reverse order, and "spin" them.
-                * This delays and scrambles reuse as much as possible.
-                */
-               i = 1;
-               do {
-                       struct rpcrdma_mw **mw;
-                       mw = &req->rl_segments[i].mr_chunk.rl_mw;
-                       list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
-                       *mw = NULL;
-               } while (++i < RPCRDMA_MAX_SEGS);
-               list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
-                                       &buffers->rb_mws);
-               req->rl_segments[0].mr_chunk.rl_mw = NULL;
+               rpcrdma_buffer_put_mrs(req, buffers);
                break;
        default:
                break;
@@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
         */
        iov->addr = ib_dma_map_single(ia->ri_id->device,
                        va, len, DMA_BIDIRECTIONAL);
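+       /* The DMA mapping can fail; do not post an invalid address. */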
+       if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+               return -ENOMEM;
+
        iov->length = len;
 
        if (ia->ri_have_dma_lkey) {
@@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
-
+       struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+       struct rpcrdma_frmr *frmr = &mw->r.frmr;
+       struct ib_mr *mr = frmr->fr_mr;
+       struct ib_send_wr fastreg_wr, *bad_wr;
        u8 key;
        int len, pageoff;
        int i, rc;
@@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                rpcrdma_map_one(ia, seg, writing);
                pa = seg->mr_dma;
                for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
-                               page_list[page_no++] = pa;
+                       frmr->fr_pgl->page_list[page_no++] = pa;
                        pa += PAGE_SIZE;
                }
                len += seg->mr_len;
@@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        break;
        }
        dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-               __func__, seg1->mr_chunk.rl_mw, i);
-
-       if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
-               dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
-                       __func__,
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
-               /* Invalidate before using. */
-               memset(&invalidate_wr, 0, sizeof invalidate_wr);
-               invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
-               invalidate_wr.next = &frmr_wr;
-               invalidate_wr.opcode = IB_WR_LOCAL_INV;
-               invalidate_wr.send_flags = IB_SEND_SIGNALED;
-               invalidate_wr.ex.invalidate_rkey =
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
-               DECR_CQCOUNT(&r_xprt->rx_ep);
-               post_wr = &invalidate_wr;
-       } else
-               post_wr = &frmr_wr;
-
-       /* Prepare FRMR WR */
-       memset(&frmr_wr, 0, sizeof frmr_wr);
-       frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
-       frmr_wr.opcode = IB_WR_FAST_REG_MR;
-       frmr_wr.send_flags = IB_SEND_SIGNALED;
-       frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-       frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
-       frmr_wr.wr.fast_reg.page_list_len = page_no;
-       frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-       frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-       if (frmr_wr.wr.fast_reg.length < len) {
-               while (seg1->mr_nsegs--)
-                       rpcrdma_unmap_one(ia, seg++);
-               return -EIO;
+               __func__, mw, i);
+
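+       /* Mark the FRMR valid before posting: if the FAST_REG_MR WR is
+        * flushed, rpcrdma_reset_frmrs() must see a non-INVALID state
+        * so it knows to recover this MR.
+        */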
+       frmr->fr_state = FRMR_IS_VALID;
+
+       memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+       fastreg_wr.wr_id = (unsigned long)(void *)mw;
+       fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+       fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+       fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+       fastreg_wr.wr.fast_reg.page_list_len = page_no;
+       fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+       if (fastreg_wr.wr.fast_reg.length < len) {
+               rc = -EIO;
+               goto out_err;
        }
 
        /* Bump the key */
-       key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
-       ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+       key = (u8)(mr->rkey & 0x000000FF);
+       ib_update_fast_reg_key(mr, ++key);
 
-       frmr_wr.wr.fast_reg.access_flags = (writing ?
+       fastreg_wr.wr.fast_reg.access_flags = (writing ?
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
                                IB_ACCESS_REMOTE_READ);
-       frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+       fastreg_wr.wr.fast_reg.rkey = mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
-       rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
-
+       rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
        if (rc) {
                dprintk("RPC:       %s: failed ib_post_send for register,"
                        " status %i\n", __func__, rc);
-               while (i--)
-                       rpcrdma_unmap_one(ia, --seg);
+               ib_update_fast_reg_key(mr, --key);
+               goto out_err;
        } else {
-               seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+               seg1->mr_rkey = mr->rkey;
                seg1->mr_base = seg1->mr_dma + pageoff;
                seg1->mr_nsegs = i;
                seg1->mr_len = len;
        }
        *nsegs = i;
+       return 0;
+out_err:
+       frmr->fr_state = FRMR_IS_INVALID;
+       while (i--)
+               rpcrdma_unmap_one(ia, --seg);
        return rc;
 }
 
@@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc;
 
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia, seg++);
+       seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
        memset(&invalidate_wr, 0, sizeof invalidate_wr);
        invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.send_flags = IB_SEND_SIGNALED;
        invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
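+       /* Hold ri_qplock so a concurrent reconnect cannot swap out the
+        * QP while it is being used for the unmap and the post.
+        */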
+       read_lock(&ia->ri_qplock);
+       while (seg1->mr_nsegs--)
+               rpcrdma_unmap_one(ia, seg++);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-       if (rc)
+       read_unlock(&ia->ri_qplock);
+       if (rc) {
+               /* Force rpcrdma_buffer_get() to retry */
+               seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
                dprintk("RPC:       %s: failed ib_post_send for invalidate,"
                        " status %i\n", __func__, rc);
+       }
        return rc;
 }
 
@@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
 
        list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
        rc = ib_unmap_fmr(&l);
+       read_lock(&ia->ri_qplock);
        while (seg1->mr_nsegs--)
                rpcrdma_unmap_one(ia, seg++);
+       read_unlock(&ia->ri_qplock);
        if (rc)
                dprintk("RPC:       %s: failed ib_unmap_fmr,"
                        " status %i\n", __func__, rc);
@@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 
        switch (ia->ri_memreg_strategy) {
 
-#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
                rpcrdma_map_one(ia, seg, writing);
                seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                seg->mr_nsegs = 1;
                nsegs = 1;
                break;
-#endif
 
        /* Registration using frmr registration */
        case RPCRDMA_FRMR:
@@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 
        switch (ia->ri_memreg_strategy) {
 
-#if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
+               read_lock(&ia->ri_qplock);
                rpcrdma_unmap_one(ia, seg);
+               read_unlock(&ia->ri_qplock);
                break;
-#endif
 
        case RPCRDMA_FRMR:
                rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
@@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
                        rc);
        return rc;
 }
+
+/* Physical mapping means one Read/Write list entry per page.
+ * All list entries must fit within an inline buffer.
+ *
+ * NB: The server must return a Write list for NFS READ,
+ *     which has the same constraint. Factor in the inline
+ *     rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       unsigned int inline_size, pages;
+
+       inline_size = min_t(unsigned int,
+                           cdata->inline_wsize, cdata->inline_rsize);
+       inline_size -= RPCRDMA_HDRLEN_MIN;
+       pages = inline_size / sizeof(struct rpcrdma_segment);
+       return pages << PAGE_SHIFT;
+}
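+
+/* Illustrative sizing (assumed values, not taken from this patch):
+ * with inline_rsize = inline_wsize = 1024, a 28-byte minimal RPC/RDMA
+ * header, and 16-byte struct rpcrdma_segment entries, (1024 - 28) / 16
+ * = 62 list entries fit, for 62 << PAGE_SHIFT = 248 KiB of payload
+ * with 4 KiB pages.
+ */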
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       size_t result;
+
+       switch (r_xprt->rx_ia.ri_memreg_strategy) {
+       case RPCRDMA_ALLPHYSICAL:
+               result = rpcrdma_physical_max_payload(r_xprt);
+               break;
+       default:
+               result = rpcrdma_mr_max_payload(r_xprt);
+       }
+       return result;
+}
index 89e7cd479705da640519cfe790fae5fed3f43e63..c419498b8f468a3a1c14bdb1a8f7f1200d32160c 100644 (file)
@@ -59,6 +59,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+       rwlock_t                ri_qplock;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
        struct ib_mr            *ri_bind_mem;
@@ -98,6 +99,14 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
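+/* Type of chunk list used in each direction of an RPC; recorded per
+ * request in rl_rtype/rl_wtype so the chunk lists can be rebuilt by
+ * rpcrdma_marshal_chunks() (declared below).
+ */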
+enum rpcrdma_chunktype {
+       rpcrdma_noch = 0,
+       rpcrdma_readch,
+       rpcrdma_areadch,
+       rpcrdma_writech,
+       rpcrdma_replych
+};
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asynchronously. It needs several pieces of
@@ -136,6 +145,40 @@ struct rpcrdma_rep {
        char    rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
 };
 
+/*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (i.e., not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+       FRMR_IS_INVALID,        /* ready to be used */
+       FRMR_IS_VALID,          /* in use */
+       FRMR_IS_STALE,          /* failed completion */
+};
+
+struct rpcrdma_frmr {
+       struct ib_fast_reg_page_list    *fr_pgl;
+       struct ib_mr                    *fr_mr;
+       enum rpcrdma_frmr_state         fr_state;
+};
+
+struct rpcrdma_mw {
+       union {
+               struct ib_fmr           *fmr;
+               struct rpcrdma_frmr     frmr;
+       } r;
+       struct list_head        mw_list;        /* linkage: rb_mws free list */
+       struct list_head        mw_all;         /* linkage: rb_all, every MW */
+};
+
 /*
  * struct rpcrdma_req -- structure central to the request/reply sequence.
  *
@@ -163,17 +206,7 @@ struct rpcrdma_rep {
 struct rpcrdma_mr_seg {                /* chunk descriptors */
        union {                         /* chunk memory handles */
                struct ib_mr    *rl_mr;         /* if registered directly */
-               struct rpcrdma_mw {             /* if registered from region */
-                       union {
-                               struct ib_fmr   *fmr;
-                               struct {
-                                       struct ib_fast_reg_page_list *fr_pgl;
-                                       struct ib_mr *fr_mr;
-                                       enum { FRMR_IS_INVALID, FRMR_IS_VALID  } state;
-                               } frmr;
-                       } r;
-                       struct list_head mw_list;
-               } *rl_mw;
+               struct rpcrdma_mw *rl_mw;       /* if registered from region */
        } mr_chunk;
        u64             mr_base;        /* registration result */
        u32             mr_rkey;        /* registration result */
@@ -191,6 +224,7 @@ struct rpcrdma_req {
        unsigned int    rl_niovs;       /* 0, 2 or 4 */
        unsigned int    rl_nchunks;     /* non-zero if chunks */
        unsigned int    rl_connect_cookie;      /* retry detection */
+       enum rpcrdma_chunktype  rl_rtype, rl_wtype;
        struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
        struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -214,6 +248,7 @@ struct rpcrdma_buffer {
        atomic_t        rb_credits;     /* most recent server credits */
        int             rb_max_requests;/* client max requests */
        struct list_head rb_mws;        /* optional memory windows/fmrs/frmrs */
+       struct list_head rb_all;        /* all allocated MWs, any state */
        int             rb_send_index;
        struct rpcrdma_req      **rb_send_bufs;
        int             rb_recv_index;
@@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
                                struct rpcrdma_create_data_internal *);
 void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
                                struct rpcrdma_req *);
@@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
+ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
+size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;
index be8bbd5d65ec6914f6178816556ce31ee7ef0883..43cd89eacfab2caa79a4db7fd38448f330e12373 100644 (file)
@@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task)
        }
 
        switch (status) {
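+       /* ENOBUFS: no socket buffer space; handle exactly like EAGAIN
+        * and wait in xs_nospace() for the send queue to drain.
+        */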
+       case -ENOBUFS:
        case -EAGAIN:
                status = xs_nospace(task);
                break;
@@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task)
                dprintk("RPC:       sendmsg returned unrecognized error %d\n",
                        -status);
        case -ENETUNREACH:
+       case -ENOBUFS:
        case -EPIPE:
        case -ECONNREFUSED:
                /* When the server has died, an ICMP port unreachable message
@@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
                status = -ENOTCONN;
                /* Should we call xs_close() here? */
                break;
+       case -ENOBUFS:
        case -EAGAIN:
                status = xs_nospace(task);
                break;
@@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
                dprintk("RPC:       xprt %p connected to %s\n",
                                xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
                xprt_set_connected(xprt);
+       case -ENOBUFS:
                break;
        case -ENOENT:
                dprintk("RPC:       xprt %p: socket %s does not exist\n",
@@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
        case -ECONNREFUSED:
        case -ECONNRESET:
        case -ENETUNREACH:
+       case -ENOBUFS:
                /* retry with existing socket, after a delay */
                goto out;
        }
@@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val,
                const struct kernel_param *kp,
                unsigned int min, unsigned int max)
 {
-       unsigned long num;
+       unsigned int num;
        int ret;
 
        if (!val)
                return -EINVAL;
-       ret = strict_strtoul(val, 0, &num);
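+       /* Parse directly into an unsigned int rather than detouring
+        * through an unsigned long and truncating on assignment.
+        */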
+       ret = kstrtouint(val, 0, &num);
        if (ret || num < min || num > max)
                return -EINVAL;
        *((unsigned int *)kp->arg) = num;