static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;
- /* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
- static struct backing_dev_info aio_fs_backing_dev_info = {
- .name = "aiofs",
- .state = 0,
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
- };
-
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct qstr this = QSTR_INIT("[aio]", 5);
inode->i_mapping->a_ops = &aio_ctx_aops;
inode->i_mapping->private_data = ctx;
- inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
inode->i_size = PAGE_SIZE * nr_pages;
path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
- if (bdi_init(&aio_fs_backing_dev_info))
- panic("Failed to init aio fs backing dev info.");
-
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
long ret = 0;
int copy_ret;
+ /*
+ * The mutex can block and wake us up and that will cause
+ * wait_event_interruptible_hrtimeout() to schedule without sleeping
+ * and repeat. This should be rare enough that it doesn't cause
+ * peformance issues. See the comment in read_events() for more detail.
+ */
+ sched_annotate_sleep();
mutex_lock(&ctx->ring_lock);
/* Access to ->ring_pages here is protected by ctx->ring_lock. */
mutex_lock(&inode->i_mutex);
- current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
mutex_unlock(&inode->i_mutex);
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
static int block_device_ejected(struct super_block *sb)
{
struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
return bdi->dev == NULL;
}
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
struct path *path);
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
- int format_id);
static int ext4_quota_off(struct super_block *sb, int type);
-static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
};
-
-static const struct quotactl_ops ext4_qctl_sysfile_operations = {
- .quota_on_meta = ext4_quota_on_sysfile,
- .quota_off = ext4_quota_off_sysfile,
- .quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk
-};
#endif
static const struct super_operations ext4_sops = {
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- sb->s_qcop = &ext4_qctl_sysfile_operations;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
return 0;
}
-/*
- * quota_on function that is used when QUOTA feature is set.
- */
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
- int format_id)
-{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- return -EINVAL;
-
- /*
- * USAGE was enabled at mount time. Only need to enable LIMITS now.
- */
- return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
-}
-
static int ext4_quota_off(struct super_block *sb, int type)
{
struct inode *inode = sb_dqopt(sb)->files[type];
return dquot_quota_off(sb, type);
}
-/*
- * quota_off function that is used when QUOTA feature is set.
- */
-static int ext4_quota_off_sysfile(struct super_block *sb, int type)
-{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- return -EINVAL;
-
- /* Disable only the limits. */
- return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
-
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
* itself serializes the operations (and no one else should touch the files)
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
{
struct inode *inode = req->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
list_del(&req->writepages_entry);
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
old_req->state == FUSE_REQ_PENDING)) {
- struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = fuse_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
spin_unlock(&lru_lock);
}
-static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
+ spin_lock(&lru_lock);
if (!list_empty(&gl->gl_lru)) {
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
clear_bit(GLF_LRU, &gl->gl_flags);
}
-}
-
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
-{
- spin_lock(&lru_lock);
- __gfs2_glock_remove_from_lru(gl);
spin_unlock(&lru_lock);
}
lockref_mark_dead(&gl->gl_lockref);
- spin_lock(&lru_lock);
- __gfs2_glock_remove_from_lru(gl);
- spin_unlock(&lru_lock);
+ gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
spin_lock_bucket(gl->gl_hash);
hlist_bl_del_rcu(&gl->gl_list);
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = s->s_bdi;
mapping->writeback_index = 0;
}
atomic_set(&mapping->i_mmap_writable, 0);
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
- mapping->backing_dev_info = &default_backing_dev_info;
mapping->writeback_index = 0;
-
- /*
- * If the block_device provides a backing_dev_info for client
- * inodes then use that. Otherwise the inode share the bdev's
- * backing_dev_info.
- */
- if (sb->s_bdev) {
- struct backing_dev_info *bdi;
-
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
- mapping->backing_dev_info = bdi;
- }
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
#ifdef CONFIG_FSNOTIFY
inode->i_fsnotify_mask = 0;
#endif
-
+ inode->i_flctx = NULL;
this_cpu_inc(nr_inodes);
return 0;
BUG_ON(inode_has_buffers(inode));
security_inode_free(inode);
fsnotify_inode_delete(inode);
+ locks_free_lock_context(inode->i_flctx);
if (!inode->i_nlink) {
WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
atomic_long_dec(&inode->i_sb->s_remove_count);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT;
- INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
}
EXPORT_SYMBOL(address_space_init_once);
}
}
-static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
-{
- if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- return;
- pnfs_return_layout(inode);
-}
-
static int filelayout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ pnfs_error_mark_layout_for_return(inode, lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
-static void filelayout_read_release(void *data)
-{
- struct nfs_pgio_header *hdr = data;
- struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
-
- filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(hdr->ds_clp);
- hdr->mds_ops->rpc_release(data);
-}
-
static int filelayout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
return 0;
}
-/* Fake up some data that will cause nfs_commit_release to retry the writes. */
-static void prepare_to_resend_writes(struct nfs_commit_data *data)
-{
- struct nfs_page *first = nfs_list_entry(data->pages.next);
-
- data->task.tk_status = 0;
- memcpy(&data->verf.verifier, &first->wb_verf,
- sizeof(data->verf.verifier));
- data->verf.verifier.data[0]++; /* ensure verifier mismatch */
-}
-
static int filelayout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
switch (err) {
case -NFS4ERR_RESET_TO_MDS:
- prepare_to_resend_writes(data);
+ pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
rpc_restart_call_prepare(task);
rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
-static void filelayout_write_release(void *data)
-{
- struct nfs_pgio_header *hdr = data;
- struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
-
- filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(hdr->ds_clp);
- hdr->mds_ops->rpc_release(data);
-}
-
static void filelayout_commit_prepare(struct rpc_task *task, void *data)
{
struct nfs_commit_data *wdata = data;
task);
}
-static void filelayout_write_commit_done(struct rpc_task *task, void *data)
-{
- struct nfs_commit_data *wdata = data;
-
- /* Note this may cause RPC to be resent */
- wdata->mds_ops->rpc_call_done(task, data);
-}
-
static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
}
-static void filelayout_commit_release(void *calldata)
-{
- struct nfs_commit_data *data = calldata;
-
- data->completion_ops->completion(data);
- pnfs_put_lseg(data->lseg);
- nfs_put_client(data->ds_clp);
- nfs_commitdata_release(data);
-}
-
static const struct rpc_call_ops filelayout_read_call_ops = {
.rpc_call_prepare = filelayout_read_prepare,
.rpc_call_done = filelayout_read_call_done,
.rpc_count_stats = filelayout_read_count_stats,
- .rpc_release = filelayout_read_release,
+ .rpc_release = pnfs_generic_rw_release,
};
static const struct rpc_call_ops filelayout_write_call_ops = {
.rpc_call_prepare = filelayout_write_prepare,
.rpc_call_done = filelayout_write_call_done,
.rpc_count_stats = filelayout_write_count_stats,
- .rpc_release = filelayout_write_release,
+ .rpc_release = pnfs_generic_rw_release,
};
static const struct rpc_call_ops filelayout_commit_call_ops = {
.rpc_call_prepare = filelayout_commit_prepare,
- .rpc_call_done = filelayout_write_commit_done,
+ .rpc_call_done = pnfs_generic_write_commit_done,
.rpc_count_stats = filelayout_commit_count_stats,
- .rpc_release = filelayout_commit_release,
+ .rpc_release = pnfs_generic_commit_release,
};
static enum pnfs_try_status
/* No multipath support. Use first DS */
atomic_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_idx = idx;
+ hdr->ds_commit_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
hdr->args.fh = fh;
hdr->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_pgio(ds_clnt, hdr,
- &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
+ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+ NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
+ 0, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
hdr->pgio_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_idx = idx;
+ hdr->ds_commit_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
hdr->args.fh = fh;
hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_pgio(ds_clnt, hdr,
- &filelayout_write_call_ops, sync,
- RPC_TASK_SOFTCONN);
+ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+ NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
+ sync, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
.pg_init = filelayout_pg_init_read,
.pg_test = filelayout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops filelayout_pg_write_ops = {
.pg_init = filelayout_pg_init_write,
.pg_test = filelayout_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
return j;
}
-/* The generic layer is about to remove the req from the commit list.
- * If this will make the bucket empty, it will need to put the lseg reference.
- * Note this is must be called holding the inode (/cinfo) lock
- */
-static void
-filelayout_clear_request_commit(struct nfs_page *req,
- struct nfs_commit_info *cinfo)
-{
- struct pnfs_layout_segment *freeme = NULL;
-
- if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
- goto out;
- cinfo->ds->nwritten--;
- if (list_is_singular(&req->wb_list)) {
- struct pnfs_commit_bucket *bucket;
-
- bucket = list_first_entry(&req->wb_list,
- struct pnfs_commit_bucket,
- written);
- freeme = bucket->wlseg;
- bucket->wlseg = NULL;
- }
-out:
- nfs_request_remove_commit_list(req, cinfo);
- pnfs_put_lseg_locked(freeme);
-}
-
static void
filelayout_mark_request_commit(struct nfs_page *req,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
* is normally transferred to the COMMIT call and released
* there. It could also be released if the last req is pulled
* off due to a rewrite, in which case it will be done in
- * filelayout_clear_request_commit
+ * pnfs_generic_clear_request_commit
*/
buckets[i].wlseg = pnfs_get_lseg(lseg);
}
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
__mark_inode_dirty(req->wb_context->dentry->d_inode,
I_DIRTY_DATASYNC);
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
data->args.fh = fh;
- return nfs_initiate_commit(ds_clnt, data,
+ return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
&filelayout_commit_call_ops, how,
RPC_TASK_SOFTCONN);
out_err:
- prepare_to_resend_writes(data);
- filelayout_commit_release(data);
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
return -EAGAIN;
}
-static int
-transfer_commit_list(struct list_head *src, struct list_head *dst,
- struct nfs_commit_info *cinfo, int max)
-{
- struct nfs_page *req, *tmp;
- int ret = 0;
-
- list_for_each_entry_safe(req, tmp, src, wb_list) {
- if (!nfs_lock_request(req))
- continue;
- kref_get(&req->wb_kref);
- if (cond_resched_lock(cinfo->lock))
- list_safe_reset_next(req, tmp, wb_list);
- nfs_request_remove_commit_list(req, cinfo);
- clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
- nfs_list_add_request(req, dst);
- ret++;
- if ((ret == max) && !cinfo->dreq)
- break;
- }
- return ret;
-}
-
-/* Note called with cinfo->lock held. */
-static int
-filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
- struct nfs_commit_info *cinfo,
- int max)
-{
- struct list_head *src = &bucket->written;
- struct list_head *dst = &bucket->committing;
- int ret;
-
- ret = transfer_commit_list(src, dst, cinfo, max);
- if (ret) {
- cinfo->ds->nwritten -= ret;
- cinfo->ds->ncommitting += ret;
- bucket->clseg = bucket->wlseg;
- if (list_empty(src))
- bucket->wlseg = NULL;
- else
- pnfs_get_lseg(bucket->clseg);
- }
- return ret;
-}
-
-/* Move reqs from written to committing lists, returning count of number moved.
- * Note called with cinfo->lock held.
- */
-static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
- int max)
-{
- int i, rv = 0, cnt;
-
- for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
- cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
- cinfo, max);
- max -= cnt;
- rv += cnt;
- }
- return rv;
-}
-
-/* Pull everything off the committing lists and dump into @dst */
-static void filelayout_recover_commit_reqs(struct list_head *dst,
- struct nfs_commit_info *cinfo)
-{
- struct pnfs_commit_bucket *b;
- struct pnfs_layout_segment *freeme;
- int i;
-
-restart:
- spin_lock(cinfo->lock);
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
- if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
- freeme = b->wlseg;
- b->wlseg = NULL;
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
- goto restart;
- }
- }
- cinfo->ds->nwritten = 0;
- spin_unlock(cinfo->lock);
-}
-
/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
* for @page
* @cinfo - commit info for current inode
return NULL;
}
-static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
-{
- struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- struct pnfs_commit_bucket *bucket;
- struct pnfs_layout_segment *freeme;
- int i;
-
- for (i = idx; i < fl_cinfo->nbuckets; i++) {
- bucket = &fl_cinfo->buckets[i];
- if (list_empty(&bucket->committing))
- continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
- spin_lock(cinfo->lock);
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
- }
-}
-
-static unsigned int
-alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
-{
- struct pnfs_ds_commit_info *fl_cinfo;
- struct pnfs_commit_bucket *bucket;
- struct nfs_commit_data *data;
- int i;
- unsigned int nreq = 0;
-
- fl_cinfo = cinfo->ds;
- bucket = fl_cinfo->buckets;
- for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
- if (list_empty(&bucket->committing))
- continue;
- data = nfs_commitdata_alloc();
- if (!data)
- break;
- data->ds_commit_index = i;
- spin_lock(cinfo->lock);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
- list_add(&data->pages, list);
- nreq++;
- }
-
- /* Clean up on error */
- filelayout_retry_commit(cinfo, i);
- /* Caller will clean up entries put on list */
- return nreq;
-}
-
-/* This follows nfs_commit_list pretty closely */
static int
filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int how, struct nfs_commit_info *cinfo)
{
- struct nfs_commit_data *data, *tmp;
- LIST_HEAD(list);
- unsigned int nreq = 0;
-
- if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc();
- if (data != NULL) {
- data->lseg = NULL;
- list_add(&data->pages, &list);
- nreq++;
- } else {
- nfs_retry_commit(mds_pages, NULL, cinfo);
- filelayout_retry_commit(cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
- return -ENOMEM;
- }
- }
-
- nreq += alloc_ds_commits(cinfo, &list);
-
- if (nreq == 0) {
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
- goto out;
- }
-
- atomic_add(nreq, &cinfo->mds->rpcs_out);
-
- list_for_each_entry_safe(data, tmp, &list, pages) {
- list_del_init(&data->pages);
- if (!data->lseg) {
- nfs_init_commit(data, mds_pages, NULL, cinfo);
- nfs_initiate_commit(NFS_CLIENT(inode), data,
- data->mds_ops, how, 0);
- } else {
- struct pnfs_commit_bucket *buckets;
-
- buckets = cinfo->ds->buckets;
- nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
- filelayout_initiate_commit(data, how);
- }
- }
-out:
- cinfo->ds->ncommitting = 0;
- return PNFS_ATTEMPTED;
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ filelayout_initiate_commit);
}
+
static struct nfs4_deviceid_node *
filelayout_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_flags)
.pg_write_ops = &filelayout_pg_write_ops,
.get_ds_info = &filelayout_get_ds_info,
.mark_request_commit = filelayout_mark_request_commit,
- .clear_request_commit = filelayout_clear_request_commit,
- .scan_commit_lists = filelayout_scan_commit_lists,
- .recover_commit_reqs = filelayout_recover_commit_reqs,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
.search_commit_reqs = filelayout_search_commit_reqs,
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
--- /dev/null
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+/*
+ * Module for pnfs flexfile layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_idmap.h>
+
+#include "flexfilelayout.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "../nfs4trace.h"
+#include "../iostat.h"
+#include "../nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+
+static struct pnfs_layout_hdr *
+ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+ struct nfs4_flexfile_layout *ffl;
+
+ ffl = kzalloc(sizeof(*ffl), gfp_flags);
+ if (ffl) {
+ INIT_LIST_HEAD(&ffl->error_list);
+ return &ffl->generic_hdr;
+ } else
+ return NULL;
+}
+
+static void
+ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct nfs4_ff_layout_ds_err *err, *n;
+
+ list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
+ list) {
+ list_del(&err->list);
+ kfree(err);
+ }
+ kfree(FF_LAYOUT_FROM_HDR(lo));
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+ if (unlikely(p == NULL))
+ return -ENOBUFS;
+ memcpy(stateid, p, NFS4_STATEID_SIZE);
+ dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+ return 0;
+}
+
+static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(devid, p, NFS4_DEVICEID4_SIZE);
+ nfs4_print_deviceid(devid);
+ return 0;
+}
+
+static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ fh->size = be32_to_cpup(p++);
+ if (fh->size > sizeof(struct nfs_fh)) {
+ printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
+ fh->size);
+ return -EOVERFLOW;
+ }
+ /* fh.data */
+ p = xdr_inline_decode(xdr, fh->size);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(&fh->data, p, fh->size);
+ dprintk("%s: fh len %d\n", __func__, fh->size);
+
+ return 0;
+}
+
+/*
+ * Currently only stringified uids and gids are accepted.
+ * I.e., kerberos is not supported to the DSes, so no pricipals.
+ *
+ * That means that one common function will suffice, but when
+ * principals are added, this should be split to accomodate
+ * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
+ */
+static int
+decode_name(struct xdr_stream *xdr, u32 *id)
+{
+ __be32 *p;
+ int len;
+
+ /* opaque_length(4)*/
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ len = be32_to_cpup(p++);
+ if (len < 0)
+ return -EINVAL;
+
+ dprintk("%s: len %u\n", __func__, len);
+
+ /* opaque body */
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -ENOBUFS;
+
+ if (!nfs_map_string_to_numeric((char *)p, len, id))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
+{
+ int i;
+
+ if (fls->mirror_array) {
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ /* normally mirror_ds is freed in
+ * .free_deviceid_node but we still do it here
+ * for .alloc_lseg error path */
+ if (fls->mirror_array[i]) {
+ kfree(fls->mirror_array[i]->fh_versions);
+ nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
+ kfree(fls->mirror_array[i]);
+ }
+ }
+ kfree(fls->mirror_array);
+ fls->mirror_array = NULL;
+ }
+}
+
+static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
+{
+ int ret = 0;
+
+ dprintk("--> %s\n", __func__);
+
+ /* FIXME: remove this check when layout segment support is added */
+ if (lgr->range.offset != 0 ||
+ lgr->range.length != NFS4_MAX_UINT64) {
+ dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+ __func__);
+ ret = -EINVAL;
+ }
+
+ dprintk("--> %s returns %d\n", __func__, ret);
+ return ret;
+}
+
+static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
+{
+ if (fls) {
+ ff_layout_free_mirror_array(fls);
+ kfree(fls);
+ }
+}
+
+static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
+{
+ struct nfs4_ff_layout_mirror *tmp;
+ int i, j;
+
+ for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
+ for (j = i + 1; j < fls->mirror_array_cnt; j++)
+ if (fls->mirror_array[i]->efficiency <
+ fls->mirror_array[j]->efficiency) {
+ tmp = fls->mirror_array[i];
+ fls->mirror_array[i] = fls->mirror_array[j];
+ fls->mirror_array[j] = tmp;
+ }
+ }
+}
+
+static struct pnfs_layout_segment *
+ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_segment *ret;
+ struct nfs4_ff_layout_segment *fls = NULL;
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ u64 stripe_unit;
+ u32 mirror_array_cnt;
+ __be32 *p;
+ int i, rc;
+
+ dprintk("--> %s\n", __func__);
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ return ERR_PTR(-ENOMEM);
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
+ lgr->layoutp->len);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ /* stripe unit and mirror_array_cnt */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 8 + 4);
+ if (!p)
+ goto out_err_free;
+
+ p = xdr_decode_hyper(p, &stripe_unit);
+ mirror_array_cnt = be32_to_cpup(p++);
+ dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
+ stripe_unit, mirror_array_cnt);
+
+ if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
+ mirror_array_cnt == 0)
+ goto out_err_free;
+
+ rc = -ENOMEM;
+ fls = kzalloc(sizeof(*fls), gfp_flags);
+ if (!fls)
+ goto out_err_free;
+
+ fls->mirror_array_cnt = mirror_array_cnt;
+ fls->stripe_unit = stripe_unit;
+ fls->mirror_array = kcalloc(fls->mirror_array_cnt,
+ sizeof(fls->mirror_array[0]), gfp_flags);
+ if (fls->mirror_array == NULL)
+ goto out_err_free;
+
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_deviceid devid;
+ struct nfs4_deviceid_node *idnode;
+ u32 ds_count;
+ u32 fh_count;
+ int j;
+
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ ds_count = be32_to_cpup(p);
+
+ /* FIXME: allow for striping? */
+ if (ds_count != 1)
+ goto out_err_free;
+
+ fls->mirror_array[i] =
+ kzalloc(sizeof(struct nfs4_ff_layout_mirror),
+ gfp_flags);
+ if (fls->mirror_array[i] == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ spin_lock_init(&fls->mirror_array[i]->lock);
+ fls->mirror_array[i]->ds_count = ds_count;
+
+ /* deviceid */
+ rc = decode_deviceid(&stream, &devid);
+ if (rc)
+ goto out_err_free;
+
+ idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
+ &devid, lh->plh_lc_cred,
+ gfp_flags);
+ /*
+ * upon success, mirror_ds is allocated by previous
+ * getdeviceinfo, or newly by .alloc_deviceid_node
+ * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure
+ */
+ if (idnode)
+ fls->mirror_array[i]->mirror_ds =
+ FF_LAYOUT_MIRROR_DS(idnode);
+ else
+ goto out_err_free;
+
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+
+ /* stateid */
+ rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+ if (rc)
+ goto out_err_free;
+
+ /* fh */
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
+
+ fls->mirror_array[i]->fh_versions =
+ kzalloc(fh_count * sizeof(struct nfs_fh),
+ gfp_flags);
+ if (fls->mirror_array[i]->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &fls->mirror_array[i]->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ fls->mirror_array[i]->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+ if (rc)
+ goto out_err_free;
+
+ /* group */
+ rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+ if (rc)
+ goto out_err_free;
+
+ dprintk("%s: uid %d gid %d\n", __func__,
+ fls->mirror_array[i]->uid,
+ fls->mirror_array[i]->gid);
+ }
+
+ ff_layout_sort_mirrors(fls);
+ rc = ff_layout_check_layout(lgr);
+ if (rc)
+ goto out_err_free;
+
+ ret = &fls->generic_hdr;
+ dprintk("<-- %s (success)\n", __func__);
+out_free_page:
+ __free_page(scratch);
+ return ret;
+out_err_free:
+ _ff_layout_free_lseg(fls);
+ ret = ERR_PTR(rc);
+ dprintk("<-- %s (%d)\n", __func__, rc);
+ goto out_free_page;
+}
+
+static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &layout->plh_segs, pls_list)
+ if (lseg->pls_range.iomode == IOMODE_RW)
+ return true;
+
+ return false;
+}
+
+static void
+ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ int i;
+
+ dprintk("--> %s\n", __func__);
+
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ if (fls->mirror_array[i]) {
+ nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
+ fls->mirror_array[i]->mirror_ds = NULL;
+ if (fls->mirror_array[i]->cred) {
+ put_rpccred(fls->mirror_array[i]->cred);
+ fls->mirror_array[i]->cred = NULL;
+ }
+ }
+ }
+
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ struct nfs4_flexfile_layout *ffl;
+ struct inode *inode;
+
+ ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
+ inode = ffl->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
+ ffl->commit_info.nbuckets = 0;
+ kfree(ffl->commit_info.buckets);
+ ffl->commit_info.buckets = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ _ff_layout_free_lseg(fls);
+}
+
+/* Return 1 until we have multiple lsegs support */
+static int
+ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
+{
+ return 1;
+}
+
+static int
+ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct pnfs_commit_bucket *buckets;
+ int size;
+
+ if (cinfo->ds->nbuckets != 0) {
+ /* This assumes there is only one RW lseg per file.
+ * To support multiple lseg per file, we need to
+ * change struct pnfs_commit_bucket to allow dynamic
+ * increasing nbuckets.
+ */
+ return 0;
+ }
+
+ size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
+
+ buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
+ gfp_flags);
+ if (!buckets)
+ return -ENOMEM;
+ else {
+ int i;
+
+ spin_lock(cinfo->lock);
+ if (cinfo->ds->nbuckets != 0)
+ kfree(buckets);
+ else {
+ cinfo->ds->buckets = buckets;
+ cinfo->ds->nbuckets = size;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&buckets[i].written);
+ INIT_LIST_HEAD(&buckets[i].committing);
+ /* mark direct verifier as unset */
+ buckets[i].direct_verf.committed =
+ NFS_INVALID_STABLE_HOW;
+ }
+ }
+ spin_unlock(cinfo->lock);
+ return 0;
+ }
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ int *best_idx)
+{
+ struct nfs4_ff_layout_segment *fls;
+ struct nfs4_pnfs_ds *ds;
+ int idx;
+
+ fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
+ /* mirrors are sorted by efficiency */
+ for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+ if (ds) {
+ *best_idx = idx;
+ return ds;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *pgm;
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_pnfs_ds *ds;
+ int ds_idx;
+
+ /* Use full layout for now */
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_READ,
+ GFP_KERNEL);
+ /* If no lseg, fall back to read through mds */
+ if (pgio->pg_lseg == NULL)
+ goto out_mds;
+
+ ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+ if (!ds)
+ goto out_mds;
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+
+ pgio->pg_mirror_idx = ds_idx;
+
+ /* read always uses only one mirror - idx 0 for pgio layer */
+ pgm = &pgio->pg_mirrors[0];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+
+ return;
+out_mds:
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ nfs_pageio_reset_read_mds(pgio);
+}
+
+static void
+ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs_pgio_mirror *pgm;
+ struct nfs_commit_info cinfo;
+ struct nfs4_pnfs_ds *ds;
+ int i;
+ int status;
+
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_RW,
+ GFP_NOFS);
+ /* If no lseg, fall back to write through mds */
+ if (pgio->pg_lseg == NULL)
+ goto out_mds;
+
+ nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
+ status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
+ if (status < 0)
+ goto out_mds;
+
+ /* Use a direct mapping of ds_idx to pgio mirror_idx */
+ if (WARN_ON_ONCE(pgio->pg_mirror_count !=
+ FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
+ goto out_mds;
+
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+ if (!ds)
+ goto out_mds;
+ pgm = &pgio->pg_mirrors[i];
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ }
+
+ return;
+
+out_mds:
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ nfs_pageio_reset_write_mds(pgio);
+}
+
+static unsigned int
+ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_RW,
+ GFP_NOFS);
+ if (pgio->pg_lseg)
+ return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
+
+ /* no lseg means that pnfs is not in use, so no mirroring here */
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ nfs_pageio_reset_write_mds(pgio);
+ return 1;
+}
+
+static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
+ .pg_init = ff_layout_pg_init_read,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
+ .pg_init = ff_layout_pg_init_write,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+
+ if (retry_pnfs) {
+ dprintk("%s Reset task %5u for i/o through pNFS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ if (!hdr->dreq) {
+ struct nfs_open_context *ctx;
+
+ ctx = nfs_list_entry(hdr->pages.next)->wb_context;
+ set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
+ hdr->completion_ops->error_cleanup(&hdr->pages);
+ } else {
+ nfs_direct_set_resched_writes(hdr->dreq);
+ /* fake unstable write to let common nfs resend pages */
+ hdr->verf.committed = NFS_UNSTABLE;
+ hdr->good_bytes = 0;
+ }
+ return;
+ }
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+ }
+}
+
+static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+ }
+}
+
+static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ int idx)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct inode *inode = lo->plh_inode;
+ struct nfs_server *mds_server = NFS_SERVER(inode);
+
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs_client *mds_client = mds_server->nfs_client;
+ struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+ if (task->tk_status >= 0)
+ return 0;
+
+ switch (task->tk_status) {
+ /* MDS state errors */
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ if (state == NULL)
+ break;
+ nfs_remove_bad_delegation(state->inode);
+ case -NFS4ERR_OPENMODE:
+ if (state == NULL)
+ break;
+ if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+ goto out_bad_stateid;
+ goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ if (state != NULL) {
+ if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+ goto out_bad_stateid;
+ }
+ nfs4_schedule_lease_recovery(mds_client);
+ goto wait_on_recovery;
+ /* DS session errors */
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+ break;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+ break;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ break;
+ /* Invalidate Layout errors */
+ case -NFS4ERR_PNFS_NO_LAYOUT:
+ case -ESTALE: /* mapped NFS4ERR_STALE */
+ case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
+ case -EISDIR: /* mapped NFS4ERR_ISDIR */
+ case -NFS4ERR_FHEXPIRED:
+ case -NFS4ERR_WRONG_TYPE:
+ dprintk("%s Invalid layout error %d\n", __func__,
+ task->tk_status);
+ /*
+ * Destroy layout so new i/o will get a new layout.
+ * Layout will not be destroyed until all current lseg
+ * references are put. Mark layout as invalid to resend failed
+ * i/o and all i/o waiting on the slot table to the MDS until
+ * layout is destroyed and a new valid layout is obtained.
+ */
+ pnfs_destroy_layout(NFS_I(inode));
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ goto reset;
+ /* RPC connection errors */
+ case -ECONNREFUSED:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EIO:
+ case -ETIMEDOUT:
+ case -EPIPE:
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_mark_deviceid_unavailable(devid);
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ /* fall through */
+ default:
+ if (ff_layout_has_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+ }
+out:
+ task->tk_status = 0;
+ return -EAGAIN;
+out_bad_stateid:
+ task->tk_status = -EIO;
+ return 0;
+wait_on_recovery:
+ rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
+ rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
+ goto out;
+}
+
+/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
+static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ struct pnfs_layout_segment *lseg,
+ int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (task->tk_status >= 0)
+ return 0;
+
+ if (task->tk_status != -EJUKEBOX) {
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_mark_deviceid_unavailable(devid);
+ if (ff_layout_has_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+ else
+ return -NFS4ERR_RESET_TO_MDS;
+ }
+
+ if (task->tk_status == -EJUKEBOX)
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ task->tk_status = 0;
+ rpc_restart_call(task);
+ rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+ return -EAGAIN;
+}
+
+static int ff_layout_async_handle_error(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ int idx)
+{
+ int vers = clp->cl_nfs_mod->rpc_vers->number;
+
+ switch (vers) {
+ case 3:
+ return ff_layout_async_handle_error_v3(task, lseg, idx);
+ case 4:
+ return ff_layout_async_handle_error_v4(task, state, clp,
+ lseg, idx);
+ default:
+ /* should never happen */
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+ int idx, u64 offset, u64 length,
+ u32 status, int opnum)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ int err;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ GFP_NOIO);
+ dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int ff_layout_read_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct inode *inode;
+ int err;
+
+ trace_nfs4_pnfs_read(hdr, task->tk_status);
+ if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
+ hdr->res.op_status = NFS4ERR_NXIO;
+ if (task->tk_status < 0 && hdr->res.op_status)
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ hdr->res.op_status, OP_READ);
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &hdr->lseg->pls_layout->plh_flags);
+ pnfs_read_resend_pnfs(hdr);
+ return task->tk_status;
+ case -NFS4ERR_RESET_TO_MDS:
+ inode = hdr->lseg->pls_layout->plh_inode;
+ pnfs_error_mark_layout_for_return(inode, hdr->lseg);
+ ff_layout_reset_read(hdr);
+ return task->tk_status;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ *
+ * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
+ * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
+ * we always send layoutcommit after DS writes.
+ */
+static void
+ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+{
+ pnfs_set_layoutcommit(hdr);
+ dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+ (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+}
+
+static bool
+ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+{
+ /* No mirroring for now */
+ struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ return ff_layout_test_devid_unavailable(node);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return -EIO;
+ }
+ if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
+ dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+ if (ff_layout_has_available_ds(hdr->lseg))
+ pnfs_read_resend_pnfs(hdr);
+ else
+ ff_layout_reset_read(hdr);
+ rpc_exit(task, 0);
+ return -EAGAIN;
+ }
+ hdr->pgio_done_cb = ff_layout_read_done_cb;
+
+ return 0;
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_read_prepare_common(task, hdr))
+ return;
+
+ rpc_call_start(task);
+}
+
+static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
+{
+ if (ds_clp->cl_session)
+ return nfs41_setup_sequence(ds_clp->cl_session,
+ args,
+ res,
+ task);
+ return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
+ args,
+ res,
+ task);
+}
+
+static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_read_prepare_common(task, hdr))
+ return;
+
+ if (ff_layout_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_READ) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void ff_layout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
+}
+
+static int ff_layout_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct inode *inode;
+ int err;
+
+ trace_nfs4_pnfs_write(hdr, task->tk_status);
+ if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
+ hdr->res.op_status = NFS4ERR_NXIO;
+ if (task->tk_status < 0 && hdr->res.op_status)
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ hdr->res.op_status, OP_WRITE);
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ case -NFS4ERR_RESET_TO_MDS:
+ inode = hdr->lseg->pls_layout->plh_inode;
+ pnfs_error_mark_layout_for_return(inode, hdr->lseg);
+ if (err == -NFS4ERR_RESET_TO_PNFS) {
+ pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, true);
+ } else {
+ pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, false);
+ }
+ return task->tk_status;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ if (hdr->res.verf->committed == NFS_FILE_SYNC ||
+ hdr->res.verf->committed == NFS_DATA_SYNC)
+ ff_layout_set_layoutcommit(hdr);
+
+ return 0;
+}
+
+static int ff_layout_commit_done_cb(struct rpc_task *task,
+ struct nfs_commit_data *data)
+{
+ struct inode *inode;
+ int err;
+
+ trace_nfs4_pnfs_commit_ds(data, task->tk_status);
+ if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
+ data->res.op_status = NFS4ERR_NXIO;
+ if (task->tk_status < 0 && data->res.op_status)
+ ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+ data->args.offset, data->args.count,
+ data->res.op_status, OP_COMMIT);
+ err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+ data->lseg, data->ds_commit_index);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ case -NFS4ERR_RESET_TO_MDS:
+ inode = data->lseg->pls_layout->plh_inode;
+ pnfs_error_mark_layout_for_return(inode, data->lseg);
+ if (err == -NFS4ERR_RESET_TO_PNFS)
+ pnfs_set_retry_layoutget(data->lseg->pls_layout);
+ else
+ pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ if (data->verf.committed == NFS_UNSTABLE)
+ pnfs_commit_set_layoutcommit(data);
+
+ return 0;
+}
+
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return -EIO;
+ }
+
+ if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
+ bool retry_pnfs;
+
+ retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
+ dprintk("%s task %u reset io to %s\n", __func__,
+ task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
+ ff_layout_reset_write(hdr, retry_pnfs);
+ rpc_exit(task, 0);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_write_prepare_common(task, hdr))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_write_prepare_common(task, hdr))
+ return;
+
+ if (ff_layout_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_WRITE) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void ff_layout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
+}
+
+static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
+{
+ rpc_call_start(task);
+}
+
+static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ ff_layout_setup_sequence(wdata->ds_clp,
+ &wdata->args.seq_args,
+ &wdata->res.seq_res,
+ task);
+}
+
+static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
+}
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v3,
+ .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = pnfs_generic_commit_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v4,
+ .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = pnfs_generic_commit_release,
+};
+
+static enum pnfs_try_status
+ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct rpc_cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ u32 idx = hdr->pgio_mirror_idx;
+ int vers;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+ __func__, hdr->inode->i_ino,
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+ if (!ds)
+ goto out_failed;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_failed;
+
+ ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ if (IS_ERR(ds_cred))
+ goto out_failed;
+
+ vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+ dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
+ ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
+
+ atomic_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ if (fh)
+ hdr->args.fh = fh;
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ hdr->args.offset = offset;
+ hdr->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_read_call_ops_v3 :
+ &ff_layout_read_call_ops_v4,
+ 0, RPC_TASK_SOFTCONN);
+
+ return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_has_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct rpc_cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ int vers;
+ struct nfs_fh *fh;
+ int idx = hdr->pgio_mirror_idx;
+
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ if (!ds)
+ return PNFS_NOT_ATTEMPTED;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ return PNFS_NOT_ATTEMPTED;
+
+ ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ if (IS_ERR(ds_cred))
+ return PNFS_NOT_ATTEMPTED;
+
+ vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+ dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+ offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
+ vers);
+
+ hdr->pgio_done_cb = ff_layout_write_done_cb;
+ atomic_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_commit_idx = idx;
+ fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ if (fh)
+ hdr->args.fh = fh;
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ hdr->args.offset = offset;
+
+ /* Perform an asynchronous write */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_write_call_ops_v3 :
+ &ff_layout_write_call_ops_v4,
+ sync, RPC_TASK_SOFTCONN);
+ return PNFS_ATTEMPTED;
+}
+
+static void
+ff_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
+{
+ struct list_head *list;
+ struct pnfs_commit_bucket *buckets;
+
+ spin_lock(cinfo->lock);
+ buckets = cinfo->ds->buckets;
+ list = &buckets[ds_commit_idx].written;
+ if (list_empty(list)) {
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_common_clear_request_commit
+ */
+ WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
+ buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
+ }
+ set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ cinfo->ds->nwritten++;
+
+ /* nfs_request_add_commit_list(). We need to add req to list without
+ * dropping cinfo lock.
+ */
+ set_bit(PG_CLEAN, &(req)->wb_flags);
+ nfs_list_add_request(req, list);
+ cinfo->mds->ncommit++;
+ spin_unlock(cinfo->lock);
+ if (!cinfo->dreq) {
+ inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
++ inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
+ BDI_RECLAIMABLE);
+ __mark_inode_dirty(req->wb_context->dentry->d_inode,
+ I_DIRTY_DATASYNC);
+ }
+}
+
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ return i;
+}
+
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+ /* FIXME: Assume that there is only one NFS version available
+ * for the DS.
+ */
+ return &flseg->mirror_array[i]->fh_versions[0];
+}
+
+static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct rpc_cred *ds_cred;
+ u32 idx;
+ int vers;
+ struct nfs_fh *fh;
+
+ idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ if (!ds)
+ goto out_err;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ data->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_err;
+
+ ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+ if (IS_ERR(ds_cred))
+ goto out_err;
+
+ vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+ dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
+ data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
+ vers);
+ data->commit_done_cb = ff_layout_commit_done_cb;
+ data->cred = ds_cred;
+ atomic_inc(&ds->ds_clp->cl_count);
+ data->ds_clp = ds->ds_clp;
+ fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ if (fh)
+ data->args.fh = fh;
+ return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_commit_call_ops_v3 :
+ &ff_layout_commit_call_ops_v4,
+ how, RPC_TASK_SOFTCONN);
+out_err:
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
+ return -EAGAIN;
+}
+
+static int
+ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo)
+{
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ ff_layout_initiate_commit);
+}
+
+static struct pnfs_ds_commit_info *
+ff_layout_get_ds_info(struct inode *inode)
+{
+ struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+ if (layout == NULL)
+ return NULL;
+
+ return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
+}
+
+static void
+ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+ nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
+ id_node));
+}
+
+static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
+ __be32 *start;
+ int count = 0, ret = 0;
+
+ start = xdr_reserve_space(xdr, 4);
+ if (unlikely(!start))
+ return -E2BIG;
+
+ /* This assume we always return _ALL_ layouts */
+ spin_lock(&hdr->plh_inode->i_lock);
+ ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
+ spin_unlock(&hdr->plh_inode->i_lock);
+
+ *start = cpu_to_be32(count);
+
+ return ret;
+}
+
+/* report nothing for now */
+static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (likely(p))
+ *p = cpu_to_be32(0);
+}
+
+static struct nfs4_deviceid_node *
+ff_layout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_ds *dsaddr;
+
+ dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}
+
+static void
+ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+ start = xdr_reserve_space(xdr, 4);
+ BUG_ON(!start);
+
+ if (ff_layout_encode_ioerr(flo, xdr, args))
+ goto out;
+
+ ff_layout_encode_iostats(flo, xdr, args);
+out:
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+ dprintk("%s: Return\n", __func__);
+}
+
+static struct pnfs_layoutdriver_type flexfilelayout_type = {
+ .id = LAYOUT_FLEX_FILES,
+ .name = "LAYOUT_FLEX_FILES",
+ .owner = THIS_MODULE,
+ .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
+ .free_layout_hdr = ff_layout_free_layout_hdr,
+ .alloc_lseg = ff_layout_alloc_lseg,
+ .free_lseg = ff_layout_free_lseg,
+ .pg_read_ops = &ff_layout_pg_read_ops,
+ .pg_write_ops = &ff_layout_pg_write_ops,
+ .get_ds_info = ff_layout_get_ds_info,
+ .free_deviceid_node = ff_layout_free_deveiceid_node,
+ .mark_request_commit = ff_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+ .read_pagelist = ff_layout_read_pagelist,
+ .write_pagelist = ff_layout_write_pagelist,
+ .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
+ .encode_layoutreturn = ff_layout_encode_layoutreturn,
+};
+
+static int __init nfs4flexfilelayout_init(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
+ __func__);
+ return pnfs_register_layoutdriver(&flexfilelayout_type);
+}
+
+static void __exit nfs4flexfilelayout_exit(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
+ __func__);
+ pnfs_unregister_layoutdriver(&flexfilelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-4");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
+
+module_init(nfs4flexfilelayout_init);
+module_exit(nfs4flexfilelayout_exit);
nfs_attr_check_mountpoint(sb, fattr);
- if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
- !nfs_attr_use_mounted_on_fileid(fattr))
+ if (nfs_attr_use_mounted_on_fileid(fattr))
+ fattr->fileid = fattr->mounted_on_fileid;
+ else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
goto out_no_inode;
if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
goto out_no_inode;
if (S_ISREG(inode->i_mode)) {
inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
- inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
+ loff_t i_size;
+
BUG_ON(!S_ISREG(inode->i_mode));
- if (attr->ia_size == i_size_read(inode))
+ i_size = i_size_read(inode);
+ if (attr->ia_size == i_size)
attr->ia_valid &= ~ATTR_SIZE;
+ else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+ return -ETXTBSY;
}
/* Optimization: if the end result is no change, don't RPC */
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/crc32.h>
+#include <linux/nfs_page.h>
#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
(((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
return 0;
-
- fattr->fileid = fattr->mounted_on_fileid;
return 1;
}
const struct sockaddr *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
- unsigned int ds_retrans);
+ unsigned int ds_retrans,
+ u32 minor_version,
+ rpc_authflavor_t au_flavor);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+ const struct sockaddr *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo,
+ unsigned int ds_retrans, rpc_authflavor_t au_flavor);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
void nfs_pgio_header_free(struct nfs_pgio_header *);
void nfs_pgio_data_destroy(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
- const struct rpc_call_ops *, int, int);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct rpc_call_ops *call_ops, int how, int flags);
void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
static inline void nfs_iocounter_init(struct nfs_io_counter *c)
{
atomic_set(&c->io_count, 0);
}
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+ WARN_ON_ONCE(desc->pg_mirror_count < 1);
+ return desc->pg_mirror_count > 1;
+}
+
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
/* namespace.c */
int nfs_show_devname(struct seq_file *, struct dentry *);
int nfs_show_path(struct seq_file *, struct dentry *);
int nfs_show_stats(struct seq_file *, struct dentry *);
- void nfs_put_super(struct super_block *);
int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
/* write.c */
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
const struct rpc_call_ops *call_ops,
int how, int flags);
extern void nfs_init_commit(struct nfs_commit_data *data,
struct nfs_commit_info *cinfo);
void nfs_mark_request_commit(struct nfs_page *req,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo);
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo);
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
void nfs_commitdata_release(struct nfs_commit_data *data);
void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo);
struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
inode_dio_wait(inode);
}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
struct nfs_client **result,
struct rpc_cred *cred);
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+ inode = igrab(inode);
+ if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+ iput(inode);
+ inode = NULL;
+ }
+ return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+ if (inode != NULL) {
+ struct super_block *sb = inode->i_sb;
+
+ iput(inode);
+ nfs_sb_deactive(sb);
+ }
+}
+
/*
* Determine the device name as a string
*/
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs4_write_inode,
.drop_inode = nfs_drop_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.evict_inode = nfs4_evict_inode,
.umount_begin = nfs_umount_begin,
static void __exit exit_nfs_v4(void)
{
+ /* Not called in the _init(), conditionally loaded */
+ nfs4_pnfs_v3_ds_connect_unload();
+
unregister_nfs_version(&nfs_v4);
nfs4_unregister_sysctl();
nfs_idmap_quit();
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
.drop_inode = nfs_drop_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.evict_inode = nfs_evict_inode,
.umount_begin = nfs_umount_begin,
unregister_filesystem(&nfs_fs_type);
}
-void nfs_sb_active(struct super_block *sb)
+bool nfs_sb_active(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);
- if (atomic_inc_return(&server->active) == 1)
- atomic_inc(&sb->s_active);
+ if (!atomic_inc_not_zero(&sb->s_active))
+ return false;
+ if (atomic_inc_return(&server->active) != 1)
+ atomic_dec(&sb->s_active);
+ return true;
}
EXPORT_SYMBOL_GPL(nfs_sb_active);
error = nfs_bdi_register(server);
if (error) {
mntroot = ERR_PTR(error);
- goto error_splat_bdi;
+ goto error_splat_super;
}
server->super = s;
}
dput(mntroot);
mntroot = ERR_PTR(error);
error_splat_super:
- if (server && !s->s_root)
- bdi_unregister(&server->backing_dev_info);
- error_splat_bdi:
deactivate_locked_super(s);
goto out;
}
}
EXPORT_SYMBOL_GPL(nfs_fs_mount);
- /*
- * Ensure that we unregister the bdi before kill_anon_super
- * releases the device name
- */
- void nfs_put_super(struct super_block *s)
- {
- struct nfs_server *server = NFS_SB(s);
-
- bdi_unregister(&server->backing_dev_info);
- }
- EXPORT_SYMBOL_GPL(nfs_put_super);
-
/*
* Destroy an NFS2/3 superblock
*/
void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
+ dev_t dev = s->s_dev;
+
+ generic_shutdown_super(s);
- kill_anon_super(s);
nfs_fscache_release_super_cookie(s);
+
nfs_free_server(server);
+ free_anon_bdev(dev);
}
EXPORT_SYMBOL_GPL(nfs_kill_super);
do {
/*
* Subrequests are always contiguous, non overlapping
- * and in order. If not, it's a programming error.
+ * and in order - but may be repeated (mirrored writes).
*/
- WARN_ON_ONCE(subreq->wb_offset !=
- (head->wb_offset + total_bytes));
-
- /* keep track of how many bytes this group covers */
- total_bytes += subreq->wb_bytes;
+ if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
+ /* keep track of how many bytes this group covers */
+ total_bytes += subreq->wb_bytes;
+ } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
+ ((subreq->wb_offset + subreq->wb_bytes) >
+ (head->wb_offset + total_bytes)))) {
+ nfs_page_group_unlock(head);
+ spin_unlock(&inode->i_lock);
+ return ERR_PTR(-EIO);
+ }
if (!nfs_lock_request(subreq)) {
/* releases page group bit lock and
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
__mark_inode_dirty(req->wb_context->dentry->d_inode,
I_DIRTY_DATASYNC);
*/
void
nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
- if (pnfs_mark_request_commit(req, lseg, cinfo))
+ if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
return;
nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
}
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
}
/* Called holding inode (/cinfo) lock */
}
if (nfs_write_need_commit(hdr)) {
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
- nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+ nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+ hdr->pgio_mirror_idx);
goto next;
}
remove_req:
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_lock_context *l_ctx;
+ struct file_lock_context *flctx = file_inode(file)->i_flctx;
struct nfs_page *req;
int do_flush, status;
/*
do_flush = req->wb_page != page || req->wb_context != ctx;
/* for now, flush if more than 1 request in page_group */
do_flush |= req->wb_this_page != req;
- if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
+ if (l_ctx && flctx &&
+ !(list_empty_careful(&flctx->flc_posix) &&
+ list_empty_careful(&flctx->flc_flock))) {
do_flush |= l_ctx->lockowner.l_owner != current->files
|| l_ctx->lockowner.l_pid != current->tgid;
}
return PageUptodate(page) != 0;
}
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+ return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
+ fl->fl_type == F_WRLCK;
+}
+
/* If we know the page is up to date, and we're not using byte range locks (or
* if we have the whole file locked for writing), it may be more efficient to
* extend the write to cover the entire page in order to avoid fragmentation
*/
static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
{
+ int ret;
+ struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock *fl;
+
if (file->f_flags & O_DSYNC)
return 0;
if (!nfs_write_pageuptodate(page, inode))
return 0;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
return 1;
- if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
- inode->i_flock->fl_end == OFFSET_MAX &&
- inode->i_flock->fl_type != F_RDLCK))
- return 1;
- return 0;
+ if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
+ list_empty_careful(&flctx->flc_posix)))
+ return 0;
+
+ /* Check to see if there are whole file write locks */
+ ret = 0;
+ spin_lock(&flctx->flc_lock);
+ if (!list_empty(&flctx->flc_posix)) {
+ fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+ fl_list);
+ if (is_whole_file_wrlock(fl))
+ ret = 1;
+ } else if (!list_empty(&flctx->flc_flock)) {
+ fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+ fl_list);
+ if (fl->fl_type == F_WRLCK)
+ ret = 1;
+ }
+ spin_unlock(&flctx->flc_lock);
+ return ret;
}
/*
static void nfs_initiate_write(struct nfs_pgio_header *hdr,
struct rpc_message *msg,
+ const struct nfs_rpc_ops *rpc_ops,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = hdr->inode;
int priority = flush_task_priority(how);
task_setup_data->priority = priority;
- NFS_PROTO(inode)->write_setup(hdr, msg);
+ rpc_ops->write_setup(hdr, msg);
- nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
+ nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
&task_setup_data->rpc_client, msg, hdr);
}
void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
{
+ struct nfs_pgio_mirror *mirror;
+
pgio->pg_ops = &nfs_pgio_rw_ops;
- pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+
+ nfs_pageio_stop_mirroring(pgio);
+
+ mirror = &pgio->pg_mirrors[0];
+ mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
EXPORT_SYMBOL_GPL(nfs_commitdata_release);
int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
const struct rpc_call_ops *call_ops,
int how, int flags)
{
.priority = priority,
};
/* Set up the initial task struct. */
- NFS_PROTO(data->inode)->commit_setup(data, &msg);
+ nfs_ops->commit_setup(data, &msg);
dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
void nfs_retry_commit(struct list_head *page_list,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
{
struct nfs_page *req;
while (!list_empty(page_list)) {
req = nfs_list_entry(page_list->next);
nfs_list_remove_request(req);
- nfs_mark_request_commit(req, lseg, cinfo);
+ nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
if (!cinfo->dreq) {
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
BDI_RECLAIMABLE);
}
nfs_unlock_and_release_request(req);
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
atomic_inc(&cinfo->mds->rpcs_out);
- return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
- how, 0);
+ return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
+ data->mds_ops, how, 0);
out_bad:
- nfs_retry_commit(head, NULL, cinfo);
+ nfs_retry_commit(head, NULL, cinfo, 0);
cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
handle_t *handle = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
- enum ocfs2_alloc_restarted why;
+ enum ocfs2_alloc_restarted why = RESTART_NONE;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
int did_quota = 0;
goto out_dio;
}
} else {
- current->backing_dev_info = file->f_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
written = generic_perform_write(file, from, *ppos);
if (likely(written >= 0))
iocb->ki_pos = *ppos + written;
return (-status);
}
+int
+xfs_update_prealloc_flags(
+ struct xfs_inode *ip,
+ enum xfs_prealloc_flags flags)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ if (!(flags & XFS_PREALLOC_INVISIBLE)) {
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ }
+
+ if (flags & XFS_PREALLOC_SET)
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+ if (flags & XFS_PREALLOC_CLEAR)
+ ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ if (flags & XFS_PREALLOC_SYNC)
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp, 0);
+}
+
/*
* Fsync operations on directories are much simpler than on regular files,
* as there is no file data to flush, and thus also no need for explicit
iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_trans *tp;
long error;
+ enum xfs_prealloc_flags flags = 0;
loff_t new_size = 0;
if (!S_ISREG(inode->i_mode))
if (error)
goto out_unlock;
} else {
+ flags |= XFS_PREALLOC_SET;
+
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
goto out_unlock;
}
- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
- error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto out_unlock;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- ip->i_d.di_mode &= ~S_ISUID;
- if (ip->i_d.di_mode & S_IXGRP)
- ip->i_d.di_mode &= ~S_ISGID;
-
- if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
- ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
if (file->f_flags & O_DSYNC)
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ flags |= XFS_PREALLOC_SYNC;
+
+ error = xfs_update_prealloc_flags(ip, flags);
if (error)
goto out_unlock;
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
+ struct backing_dev_info;
struct export_operations;
struct hd_geometry;
struct iovec;
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
- struct backing_dev_info;
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
atomic_t i_mmap_writable;/* count VM_SHARED mappings */
struct rb_root i_mmap; /* tree of private and shared mappings */
- struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
- struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
void *private_data; /* ditto */
*/
static inline int mapping_mapped(struct address_space *mapping)
{
- return !RB_EMPTY_ROOT(&mapping->i_mmap) ||
- !list_empty(&mapping->i_mmap_nonlinear);
+ return !RB_EMPTY_ROOT(&mapping->i_mmap);
}
/*
atomic_t i_readcount; /* struct files open RO */
#endif
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
- struct file_lock *i_flock;
+ struct file_lock_context *i_flctx;
struct address_space i_data;
struct list_head i_devices;
union {
#define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */
#define FL_UNLOCK_PENDING 512 /* Lease is being broken */
#define FL_OFDLCK 1024 /* lock is "owned" by struct file */
+#define FL_LAYOUT 2048 /* outstanding pNFS layout */
/*
* Special return value from posix_lock_file() and vfs_lock_file() for
/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;
+struct file_lock;
+
struct file_lock_operations {
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
void (*fl_release_private)(struct file_lock *);
void (*lm_notify)(struct file_lock *); /* unblock callback */
int (*lm_grant)(struct file_lock *, int);
bool (*lm_break)(struct file_lock *);
- int (*lm_change)(struct file_lock **, int, struct list_head *);
+ int (*lm_change)(struct file_lock *, int, struct list_head *);
void (*lm_setup)(struct file_lock *, void **);
};
* FIXME: should we create a separate "struct lock_request" to help distinguish
* these two uses?
*
- * The i_flock list is ordered by:
+ * The varous i_flctx lists are ordered by:
*
- * 1) lock type -- FL_LEASEs first, then FL_FLOCK, and finally FL_POSIX
- * 2) lock owner
- * 3) lock range start
- * 4) lock range end
+ * 1) lock owner
+ * 2) lock range start
+ * 3) lock range end
*
* Obviously, the last two criteria only matter for POSIX locks.
*/
struct file_lock {
struct file_lock *fl_next; /* singly linked list for this inode */
+ struct list_head fl_list; /* link into file_lock_context */
struct hlist_node fl_link; /* node in global lists */
struct list_head fl_block; /* circular list of blocked processes */
fl_owner_t fl_owner;
} fl_u;
};
+struct file_lock_context {
+ spinlock_t flc_lock;
+ struct list_head flc_flock;
+ struct list_head flc_posix;
+ struct list_head flc_lease;
+ int flc_flock_cnt;
+ int flc_posix_cnt;
+ int flc_lease_cnt;
+};
+
/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1)))
extern int fcntl_getlease(struct file *filp);
/* fs/locks.c */
+void locks_free_lock_context(struct file_lock_context *ctx);
void locks_free_lock(struct file_lock *fl);
extern void locks_init_lock(struct file_lock *);
extern struct file_lock * locks_alloc_lock(void);
extern void lease_get_mtime(struct inode *, struct timespec *time);
extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
-extern int lease_modify(struct file_lock **, int, struct list_head *);
+extern int lease_modify(struct file_lock *, int, struct list_head *);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
struct flock __user *user)
return F_UNLCK;
}
+static inline void
+locks_free_lock_context(struct file_lock_context *ctx)
+{
+}
+
static inline void locks_init_lock(struct file_lock *fl)
{
return;
return -EINVAL;
}
-static inline int lease_modify(struct file_lock **before, int arg,
+static inline int lease_modify(struct file_lock *fl, int arg,
struct list_head *dispose)
{
return -EINVAL;
#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
- extern struct list_head super_blocks;
- extern spinlock_t sb_lock;
/* Possible states of 'frozen' field */
enum {
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1
+ /*
+ * These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
+ *
+ * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE)
+ * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED)
+ * NOMMU_MAP_READ: Can be mapped for reading
+ * NOMMU_MAP_WRITE: Can be mapped for writing
+ * NOMMU_MAP_EXEC: Can be mapped for execution
+ */
+ #define NOMMU_MAP_COPY 0x00000001
+ #define NOMMU_MAP_DIRECT 0x00000008
+ #define NOMMU_MAP_READ VM_MAYREAD
+ #define NOMMU_MAP_WRITE VM_MAYWRITE
+ #define NOMMU_MAP_EXEC VM_MAYEXEC
+
+ #define NOMMU_VMFLAGS \
+ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
+
+
struct iov_iter;
struct file_operations {
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
+ #ifndef CONFIG_MMU
+ unsigned (*mmap_capabilities)(struct file *);
+ #endif
};
struct inode_operations {
struct file *filp,
loff_t size)
{
- if (inode->i_flock && mandatory_lock(inode))
+ if (inode->i_flctx && mandatory_lock(inode))
return locks_mandatory_area(
FLOCK_VERIFY_WRITE, inode, filp,
size < inode->i_size ? size : inode->i_size,
{
/*
* Since this check is lockless, we must ensure that any refcounts
- * taken are done before checking inode->i_flock. Otherwise, we could
- * end up racing with tasks trying to set a new lease on this file.
+ * taken are done before checking i_flctx->flc_lease. Otherwise, we
+ * could end up racing with tasks trying to set a new lease on this
+ * file.
*/
smp_mb();
- if (inode->i_flock)
+ if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
return __break_lease(inode, mode, FL_LEASE);
return 0;
}
{
/*
* Since this check is lockless, we must ensure that any refcounts
- * taken are done before checking inode->i_flock. Otherwise, we could
- * end up racing with tasks trying to set a new lease on this file.
+ * taken are done before checking i_flctx->flc_lease. Otherwise, we
+ * could end up racing with tasks trying to set a new lease on this
+ * file.
*/
smp_mb();
- if (inode->i_flock)
+ if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
return __break_lease(inode, mode, FL_DELEG);
return 0;
}
return ret;
}
+static inline int break_layout(struct inode *inode, bool wait)
+{
+ smp_mb();
+ if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+ return __break_lease(inode,
+ wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
+ FL_LAYOUT);
+ return 0;
+}
+
#else /* !CONFIG_FILE_LOCKING */
static inline int locks_mandatory_locked(struct file *file)
{
return 0;
}
+static inline int break_layout(struct inode *inode, bool wait)
+{
+ return 0;
+}
+
#endif /* CONFIG_FILE_LOCKING */
/* fs/open.c */
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
-extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
- unsigned long size, pgoff_t pgoff);
int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
}
}
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
/* This is used for a general mmap of a disk file */
size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
*/
#include <linux/fs.h>
+ #include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/export.h>
#include <linux/uio.h>
static const struct vm_operations_struct xip_file_vm_ops = {
.fault = xip_file_fault,
.page_mkwrite = filemap_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
count = len;
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
if (ret)
pte = *(orig_pte + ((index - start) / PAGE_SIZE));
pte_unmap_unlock(orig_pte, ptl);
- if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+ if (pte_present(pte) || pte_none(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
struct file *file = vma->vm_file;
#ifdef CONFIG_SWAP
- if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+ if (!file) {
*prev = vma;
- if (!file)
- force_swapin_readahead(vma, start, end);
- else
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ force_swapin_readahead(vma, start, end);
return 0;
}
- #endif
+ if (shmem_mapping(file->f_mapping)) {
+ *prev = vma;
+ force_shm_swapin_readahead(vma, start, end,
+ file->f_mapping);
+ return 0;
+ }
+ #else
if (!file)
return -EBADF;
+ #endif
if (file->f_mapping->a_ops->get_xip_mem) {
/* no bad return value, but ignore advice */
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
- if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
- struct zap_details details = {
- .nonlinear_vma = vma,
- .last_index = ULONG_MAX,
- };
- zap_page_range(vma, start, end - start, &details);
- } else
- zap_page_range(vma, start, end - start, NULL);
+ zap_page_range(vma, start, end - start, NULL);
return 0;
}
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+ if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
return -EINVAL;
f = vma->vm_file;
#endif
void *high_memory;
+EXPORT_SYMBOL(high_memory);
struct page *mem_map;
unsigned long max_mapnr;
unsigned long highest_memmap_pfn;
}
EXPORT_SYMBOL(get_user_pages);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags)
+{
+ long ret;
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, 0);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
return -EOVERFLOW;
if (file) {
- /* validate file mapping requests */
- struct address_space *mapping;
-
/* files must support mmap */
if (!file->f_op->mmap)
return -ENODEV;
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
- mapping = file->f_mapping;
- if (!mapping)
- mapping = file_inode(file)->i_mapping;
-
- capabilities = 0;
- if (mapping && mapping->backing_dev_info)
- capabilities = mapping->backing_dev_info->capabilities;
-
- if (!capabilities) {
+ if (file->f_op->mmap_capabilities) {
+ capabilities = file->f_op->mmap_capabilities(file);
+ } else {
/* no explicit capabilities set, so assume some
* defaults */
switch (file_inode(file)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
break;
case S_IFCHR:
capabilities =
- BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP |
- BDI_CAP_WRITE_MAP;
+ NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE;
break;
default:
/* eliminate any capabilities that we can't support on this
* device */
if (!file->f_op->get_unmapped_area)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (!file->f_op->read)
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
/* The file shall have been opened with read permission. */
if (!(file->f_mode & FMODE_READ))
if (locks_verify_locked(file))
return -EAGAIN;
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
return -ENODEV;
/* we mustn't privatise shared mappings */
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
} else {
/* we're going to read the file into private memory we
* allocate */
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
return -ENODEV;
/* we don't permit a private writable mapping to be
* shared with the backing device */
if (prot & PROT_WRITE)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
- if (capabilities & BDI_CAP_MAP_DIRECT) {
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
+ ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+ ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
) {
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (flags & MAP_SHARED) {
printk(KERN_WARNING
"MAP_SHARED not completely supported on !MMU\n");
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
- if (capabilities & BDI_CAP_EXEC_MAP)
+ if (capabilities & NOMMU_MAP_EXEC)
prot |= PROT_EXEC;
}
} else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
- !(capabilities & BDI_CAP_EXEC_MAP)
+ !(capabilities & NOMMU_MAP_EXEC)
) {
/* backing file is not executable, try to copy */
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
} else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
/* handle PROT_EXEC implication by PROT_READ */
if ((prot & PROT_READ) &&
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
- if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+ if (!(capabilities & NOMMU_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
+ vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
if (flags & MAP_SHARED)
vm_flags |= VM_SHARED;
}
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret == 0) {
/* shouldn't return success if we're not sharing */
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
/* new mapping is not a subset of the region */
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
goto sharing_violation;
continue;
}
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
if (IS_ERR_VALUE(addr)) {
* the mapping so we'll have to attempt to copy
* it */
ret = -ENODEV;
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
goto error_just_free;
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
} else {
vma->vm_start = region->vm_start = addr;
vma->vm_end = region->vm_end = addr + len;
vma->vm_region = region;
/* set up the mapping
- * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+ * - the region is filled in if NOMMU_MAP_DIRECT is still set
*/
if (file && vma->vm_flags & VM_SHARED)
ret = do_mmap_shared_file(vma);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ long free, allowed, reserve;
vm_acct_memory(pages);
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
}
EXPORT_SYMBOL(filemap_map_pages);
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
- unsigned long size, pgoff_t pgoff)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
unsigned long pos_ratio;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
int ratelimit;
int *p;
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
+ __inc_bdi_stat(bdi, BDI_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
if (mapping && mapping_cap_account_dirty(mapping)) {
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
- dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
}
}
EXPORT_SYMBOL(account_page_redirty);
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
+ int ret;
+
wbc->pages_skipped++;
+ ret = __set_page_dirty_nobuffers(page);
account_page_redirty(page);
- return __set_page_dirty_nobuffers(page);
+ return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
return 1;
}
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- unsigned long memcg_flags;
struct mem_cgroup *memcg;
- bool locked;
int ret;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- unsigned long memcg_flags;
struct mem_cgroup *memcg;
- bool locked;
int ret;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg);
return ret;
}
*/
void wait_for_stable_page(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
-
- if (!bdi_cap_stable_pages_required(bdi))
- return;
-
- wait_on_page_writeback(page);
+ if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
- static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
- };
-
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
goto redirty;
/*
- * shmem_backing_dev_info's capabilities prevent regular writeback or
- * sync from ever calling shmem_writepage; but a stacking filesystem
- * might use ->writepage of its underlying filesystem, in which case
- * tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync.
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
*/
if (!wbc->for_reclaim) {
WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage, false);
+ mem_cgroup_migrate(oldpage, newpage, true);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
* truncated or holepunched since swap was confirmed.
* shmem_undo_range() will have done some of the
* unaccounting, now delete_from_swap_cache() will do
- * the rest (including mem_cgroup_uncharge_swapcache).
+ * the rest.
* Reset swap.val? No, leave it so "failed" goes back to
* "repeat": reading a hole and writing should succeed.
*/
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
- inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = get_seconds();
info = SHMEM_I(inode);
bool shmem_mapping(struct address_space *mapping)
{
- return mapping->backing_dev_info == &shmem_backing_dev_info;
+ return mapping->host->i_sb->s_op == &shmem_ops;
}
#ifdef CONFIG_TMPFS
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
- .remap_pages = generic_file_remap_pages,
};
static struct dentry *shmem_mount(struct file_system_type *fs_type,
if (shmem_inode_cachep)
return 0;
- error = bdi_init(&shmem_backing_dev_info);
- if (error)
- goto out4;
-
error = shmem_init_inodecache();
if (error)
goto out3;
out2:
shmem_destroy_inodecache();
out3:
- bdi_destroy(&shmem_backing_dev_info);
- out4:
shm_mnt = ERR_PTR(error);
return error;
}
#ifdef CONFIG_SWAP
int i;
- if (bdi_init(swapper_spaces[0].backing_dev_info))
- panic("Failed to init swap bdi");
- for (i = 0; i < MAX_SWAPFILES; i++) {
+ for (i = 0; i < MAX_SWAPFILES; i++)
spin_lock_init(&swapper_spaces[i].tree_lock);
- INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
- }
#endif
/* Use a smaller cluster for small-memory machines */
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* Can cgroups be reclaimed below their normal consumption range? */
+ unsigned int may_thrash:1;
+
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info, sc))
+ if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(mapping->backing_dev_info)) ||
+ bdi_write_congested(inode_to_bdi(mapping->host))) ||
(writeback && PageReclaim(page)))
nr_congested++;
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && !zone_reclaimable(zone))
- force_scan = true;
+ if (current_is_kswapd()) {
+ if (!zone_reclaimable(zone))
+ force_scan = true;
+ if (!mem_cgroup_lruvec_online(lruvec))
+ force_scan = true;
+ }
if (!global_reclaim(sc))
force_scan = true;
struct lruvec *lruvec;
int swappiness;
+ if (mem_cgroup_low(root, memcg)) {
+ if (!sc->may_thrash)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_LOW, 1);
+ }
+
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
mem_cgroup_iter_break(root, memcg);
break;
}
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
/*
* Shrink the slab caches in the same proportion that
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
+ int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool zones_reclaimable;
-
+retry:
delayacct_freepages_start();
if (global_reclaim(sc))
if (sc->compaction_ready)
return 1;
+ /* Untapped cgroup reserves? Don't OOM, retry. */
+ if (!sc->may_thrash) {
+ sc->priority = initial_priority;
+ sc->may_thrash = 1;
+ goto retry;
+ }
+
/* Any of the zones still reclaimable? Don't OOM. */
if (zones_reclaimable)
return 1;
* should make reasonable progress.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_mask, nodemask) {
+ gfp_zone(gfp_mask), nodemask) {
if (zone_idx(zone) > ZONE_NORMAL)
continue;
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
- wake_up(&pgdat->pfmemalloc_wait);
+ wake_up_all(&pgdat->pfmemalloc_wait);
/*
* Fragmentation may mean that the system cannot be rebalanced