* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
mm: properly reflect task dirty limits in dirty_exceeded logic
writeback: don't busy retry writeback on new/freeing inodes
writeback: scale IO chunk size up to half device bandwidth
writeback: trace global_dirty_state
writeback: introduce max-pause and pass-good dirty limits
writeback: introduce smoothed global dirty limit
writeback: consolidate variable names in balance_dirty_pages()
writeback: show bdi write bandwidth in debugfs
writeback: bdi write bandwidth estimation
writeback: account per-bdi accumulated written pages
writeback: make writeback_control.nr_to_write straight
writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
writeback: trace event writeback_queue_io
writeback: trace event writeback_single_inode
writeback: remove .nonblocking and .encountered_congestion
writeback: remove writeback_control.more_io
writeback: skip balance_dirty_pages() for in-memory fs
writeback: add bdi_dirty_limit() kernel-doc
writeback: avoid extra sync work at enqueue time
writeback: elevate queue_io() into wb_writeback()
...
Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
{
return &BDEV_I(inode)->bdev;
}
-
EXPORT_SYMBOL(I_BDEV);
/*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- spin_lock(&inode_wb_list_lock);
+ struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+ if (unlikely(dst == old)) /* deadlock avoidance */
+ return;
+ bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&old->wb.list_lock);
+ spin_unlock(&dst->wb.list_lock);
}
static sector_t max_block(struct block_device *bdev)
mutex_lock(&bd_inode->i_mutex);
size = i_size_read(bd_inode);
+ retval = -EINVAL;
switch (origin) {
- case 2:
+ case SEEK_END:
offset += size;
break;
- case 1:
+ case SEEK_CUR:
offset += file->f_pos;
+ case SEEK_SET:
+ break;
+ default:
+ goto out;
}
- retval = -EINVAL;
if (offset >= 0 && offset <= size) {
if (offset != file->f_pos) {
file->f_pos = offset;
}
retval = offset;
}
+out:
mutex_unlock(&bd_inode->i_mutex);
return retval;
}
-int blkdev_fsync(struct file *filp, int datasync)
+int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *bd_inode = filp->f_mapping->host;
struct block_device *bdev = I_BDEV(bd_inode);
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- mutex_unlock(&bd_inode->i_mutex);
-
error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
if (error == -EOPNOTSUPP)
error = 0;
- mutex_lock(&bd_inode->i_mutex);
-
return error;
}
EXPORT_SYMBOL(blkdev_fsync);
if (!disk)
return ERR_PTR(-ENXIO);
- whole = bdget_disk(disk, 0);
+ /*
+ * Normally, @bdev should equal what's returned from bdget_disk()
+ * if partno is 0; however, some drivers (floppy) use multiple
+ * bdev's for the same physical device and @bdev may be one of the
+ * aliases. Keep @bdev if partno is 0. This means claimer
+ * tracking is broken for those devices but it has always been that
+ * way.
+ */
+ if (partno)
+ whole = bdget_disk(disk, 0);
+ else
+ whole = bdgrab(bdev);
+
module_put(disk->fops->owner);
put_disk(disk);
if (!whole)
int blkdev_put(struct block_device *bdev, fmode_t mode)
{
+ mutex_lock(&bdev->bd_mutex);
+
if (mode & FMODE_EXCL) {
bool bdev_free;
* are protected with bdev_lock. bd_mutex is to
* synchronize disk_holder unlinking.
*/
- mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
WARN_ON_ONCE(--bdev->bd_holders < 0);
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
- if (bdev_free) {
- if (bdev->bd_write_holder) {
- disk_unblock_events(bdev->bd_disk);
- disk_check_events(bdev->bd_disk);
- bdev->bd_write_holder = false;
- }
+ if (bdev_free && bdev->bd_write_holder) {
+ disk_unblock_events(bdev->bd_disk);
+ bdev->bd_write_holder = false;
}
-
- mutex_unlock(&bdev->bd_mutex);
}
+ /*
+ * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+ * event. This is to ensure detection of media removal commanded
+ * from userland - e.g. eject(1).
+ */
+ disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
+
+ mutex_unlock(&bdev->bd_mutex);
+
return __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
- trace_ext4_writepage(inode, page);
+ trace_ext4_writepage(page);
size = i_size_read(inode);
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
}
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
while (!ret && wbc->nr_to_write > 0) {
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
else {
- ret = blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block, NULL);
+ ret = blockdev_direct_IO(rw, iocb, inode, iov,
+ offset, nr_segs, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
ssize_t size, void *private, int ret,
bool is_async)
{
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
unsigned long flags;
out:
if (is_async)
aio_complete(iocb, ret, 0);
+ inode_dio_done(inode);
return;
}
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
iocb->private = NULL;
+
+ /* XXX: probably should move into the real I/O completion handler */
+ inode_dio_done(inode);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
EXT4_I(inode)->cur_aio_dio = iocb->private;
}
- ret = blockdev_direct_IO(rw, iocb, inode,
+ ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block_write,
- ext4_end_io_dio);
+ ext4_end_io_dio,
+ NULL,
+ DIO_LOCKING | DIO_SKIP_HOLES);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
/*
}
if (attr->ia_valid & ATTR_SIZE) {
+ inode_dio_wait(inode);
+
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct page *page = vmf->page;
loff_t size;
unsigned long len;
- int ret = -EINVAL;
- void *fsdata;
+ int ret;
struct file *file = vma->vm_file;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ get_block_t *get_block;
+ int retries = 0;
/*
- * Get i_alloc_sem to stop truncates messing with the inode. We cannot
- * get i_mutex because we are already holding mmap_sem.
+ * This check is racy but catches the common case. We rely on
+ * __block_page_mkwrite() to do a reliable check.
*/
- down_read(&inode->i_alloc_sem);
- size = i_size_read(inode);
- if (page->mapping != mapping || size <= page_offset(page)
- || !PageUptodate(page)) {
- /* page got truncated from under us? */
- goto out_unlock;
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ do {
+ ret = __block_page_mkwrite(vma, vmf,
+ ext4_da_get_block_prep);
+ } while (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
+ goto out_ret;
}
- ret = 0;
lock_page(page);
- wait_on_page_writeback(page);
- if (PageMappedToDisk(page)) {
- up_read(&inode->i_alloc_sem);
- return VM_FAULT_LOCKED;
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
}
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
-
/*
- * return if we have all the buffers mapped. This avoid
- * the need to call write_begin/write_end which does a
- * journal_start/journal_stop which can block and take
- * long time
+ * Return if we have all the buffers mapped. This avoids the need to do
+ * journal_start/journal_stop which can block and take a long time
*/
if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
ext4_bh_unmapped)) {
- up_read(&inode->i_alloc_sem);
- return VM_FAULT_LOCKED;
+ /* Wait so that we don't change page under IO */
+ wait_on_page_writeback(page);
+ ret = VM_FAULT_LOCKED;
+ goto out;
}
}
unlock_page(page);
- /*
- * OK, we need to fill the hole... Do write_begin write_end
- * to do block allocation/reservation.We are not holding
- * inode.i__mutex here. That allow * parallel write_begin,
- * write_end call. lock_page prevent this from happening
- * on the same page though
- */
- ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
- len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
- if (ret < 0)
- goto out_unlock;
- ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
- len, len, page, fsdata);
- if (ret < 0)
- goto out_unlock;
- ret = 0;
-
- /*
- * write_begin/end might have created a dirty page and someone
- * could wander in and start the IO. Make sure that hasn't
- * happened.
- */
- lock_page(page);
- wait_on_page_writeback(page);
- up_read(&inode->i_alloc_sem);
- return VM_FAULT_LOCKED;
-out_unlock:
- if (ret)
+ /* OK, we need to fill the hole... */
+ if (ext4_should_dioread_nolock(inode))
+ get_block = ext4_get_block_write;
+ else
+ get_block = ext4_get_block;
+retry_alloc:
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
ret = VM_FAULT_SIGBUS;
- up_read(&inode->i_alloc_sem);
+ goto out;
+ }
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ if (!ret && ext4_should_journal_data(inode)) {
+ if (walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ }
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+out_ret:
+ ret = block_page_mkwrite_return(ret);
+out:
return ret;
}
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
+ unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
+ unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
*/
void inode_wb_list_del(struct inode *inode)
{
- spin_lock(&inode_wb_list_lock);
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+ spin_lock(&bdi->wb.list_lock);
list_del_init(&inode->i_wb_list);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
}
-
/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
- static void redirty_tail(struct inode *inode)
+ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
- assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&wb->list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
/*
* requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
- static void requeue_io(struct inode *inode)
+ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
- assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&wb->list_lock);
list_move(&inode->i_wb_list, &wb->b_more_io);
}
{
/*
* Prevent speculative execution through
- * spin_unlock(&inode_wb_list_lock);
+ * spin_unlock(&wb->list_lock);
*/
smp_mb();
/*
* Move expired dirty inodes from @delaying_queue to @dispatch_queue.
*/
- static void move_expired_inodes(struct list_head *delaying_queue,
+ static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
- unsigned long *older_than_this)
+ unsigned long *older_than_this)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
struct inode *inode;
int do_sb_sort = 0;
+ int moved = 0;
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
do_sb_sort = 1;
sb = inode->i_sb;
list_move(&inode->i_wb_list, &tmp);
+ moved++;
}
/* just one sb in list, splice to dispatch_queue and we're done */
if (!do_sb_sort) {
list_splice(&tmp, dispatch_queue);
- return;
+ goto out;
}
/* Move inodes from one superblock together */
list_move(&inode->i_wb_list, dispatch_queue);
}
}
+ out:
+ return moved;
}
/*
*/
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- assert_spin_locked(&inode_wb_list_lock);
+ int moved;
+ assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ trace_writeback_queue_io(wb, older_than_this, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Wait for writeback on an inode to complete.
*/
- static void inode_wait_for_writeback(struct inode *inode)
+ static void inode_wait_for_writeback(struct inode *inode,
+ struct bdi_writeback *wb)
{
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
wait_queue_head_t *wqh;
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
}
}
/*
- * Write out an inode's dirty pages. Called under inode_wb_list_lock and
+ * Write out an inode's dirty pages. Called under wb->list_lock and
* inode->i_lock. Either the caller has an active reference on the inode or
* the inode has I_WILL_FREE set.
*
* livelocks, etc.
*/
static int
- writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+ struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
+ long nr_to_write = wbc->nr_to_write;
unsigned dirty;
int ret;
- assert_spin_locked(&inode_wb_list_lock);
+ assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
if (!atomic_read(&inode->i_count))
* completed a full scan of b_io.
*/
if (wbc->sync_mode != WB_SYNC_ALL) {
- requeue_io(inode);
+ requeue_io(inode, wb);
+ trace_writeback_single_inode_requeue(inode, wbc,
+ nr_to_write);
return 0;
}
/*
* It's a data-integrity sync. We must wait.
*/
- inode_wait_for_writeback(inode);
+ inode_wait_for_writeback(inode, wb);
}
BUG_ON(inode->i_state & I_SYNC);
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
ret = do_writepages(mapping, wbc);
ret = err;
}
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
+ /*
+ * Sync livelock prevention. Each inode is tagged and synced in
+ * one shot. If still dirty, it will be redirty_tail()'ed below.
+ * Update the dirty time to prevent enqueue and sync it again.
+ */
+ if ((inode->i_state & I_DIRTY) &&
+ (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+ inode->dirtied_when = jiffies;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
/*
* slice used up: queue for next turn
*/
- requeue_io(inode);
+ requeue_io(inode, wb);
} else {
/*
* Writeback blocked by something other than
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
* submission or metadata updates after data IO
* completion.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
} else {
/*
* The inode is clean. At this point we either have
}
}
inode_sync_complete(inode);
+ trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
-/*
- * For background writeback the caller does not have the sb pinned
- * before calling writeback. So make sure that we do pin it, so it doesn't
- * go away while we are writing inodes from it.
- */
-static bool pin_sb_for_writeback(struct super_block *sb)
-{
- spin_lock(&sb_lock);
- if (list_empty(&sb->s_instances)) {
- spin_unlock(&sb_lock);
- return false;
- }
-
- sb->s_count++;
- spin_unlock(&sb_lock);
-
- if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root)
- return true;
- up_read(&sb->s_umount);
- }
-
- put_super(sb);
- return false;
-}
-
+ static long writeback_chunk_size(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+ {
+ long pages;
+
+ /*
+ * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+ * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+ * here avoids calling into writeback_inodes_wb() more than once.
+ *
+ * The intended call sequence for WB_SYNC_ALL writeback is:
+ *
+ * wb_writeback()
+ * writeback_sb_inodes() <== called only once
+ * write_cache_pages() <== called once for each inode
+ * (quickly) tag currently dirty pages
+ * (maybe slowly) sync all tagged pages
+ */
+ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+ pages = LONG_MAX;
+ else {
+ pages = min(bdi->avg_write_bandwidth / 2,
+ global_dirty_limit / DIRTY_SCOPE);
+ pages = min(pages, work->nr_pages);
+ pages = round_down(pages + MIN_WRITEBACK_PAGES,
+ MIN_WRITEBACK_PAGES);
+ }
+
+ return pages;
+ }
+
/*
* Write a portion of b_io inodes which belong to @sb.
*
* inodes. Otherwise write only ones which go sequentially
* in reverse order.
*
- * Return 1, if the caller writeback routine should be
- * interrupted. Otherwise return 0.
+ * Return the number of pages and/or inodes written.
*/
- static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
- struct writeback_control *wbc, bool only_this_sb)
+ static long writeback_sb_inodes(struct super_block *sb,
+ struct bdi_writeback *wb,
+ struct wb_writeback_work *work)
{
+ struct writeback_control wbc = {
+ .sync_mode = work->sync_mode,
+ .tagged_writepages = work->tagged_writepages,
+ .for_kupdate = work->for_kupdate,
+ .for_background = work->for_background,
+ .range_cyclic = work->range_cyclic,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ unsigned long start_time = jiffies;
+ long write_chunk;
+ long wrote = 0; /* count both pages and inodes */
+
while (!list_empty(&wb->b_io)) {
- long pages_skipped;
struct inode *inode = wb_inode(wb->b_io.prev);
if (inode->i_sb != sb) {
- if (only_this_sb) {
+ if (work->sb) {
/*
* We only want to write back data for this
* superblock, move all inodes not belonging
* to it back onto the dirty list.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
continue;
}
* Bounce back to the caller to unpin this and
* pin the next superblock.
*/
- return 0;
+ break;
}
/*
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
- requeue_io(inode);
+ redirty_tail(inode, wb);
continue;
}
-
- /*
- * Was this inode dirtied after sync_sb_inodes was called?
- * This keeps sync from extra jobs and livelock.
- */
- if (inode_dirtied_after(inode, wbc->wb_start)) {
- spin_unlock(&inode->i_lock);
- return 1;
- }
-
__iget(inode);
+ write_chunk = writeback_chunk_size(wb->bdi, work);
+ wbc.nr_to_write = write_chunk;
+ wbc.pages_skipped = 0;
- pages_skipped = wbc->pages_skipped;
- writeback_single_inode(inode, wbc);
- if (wbc->pages_skipped != pages_skipped) {
+ writeback_single_inode(inode, wb, &wbc);
+
+ work->nr_pages -= write_chunk - wbc.nr_to_write;
+ wrote += write_chunk - wbc.nr_to_write;
+ if (!(inode->i_state & I_DIRTY))
+ wrote++;
+ if (wbc.pages_skipped) {
/*
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
- redirty_tail(inode);
+ redirty_tail(inode, wb);
}
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
iput(inode);
cond_resched();
- spin_lock(&inode_wb_list_lock);
- if (wbc->nr_to_write <= 0) {
- wbc->more_io = 1;
- return 1;
+ spin_lock(&wb->list_lock);
+ /*
+ * bail out to wb_writeback() often enough to check
+ * background threshold and other termination conditions.
+ */
+ if (wrote) {
+ if (time_is_before_jiffies(start_time + HZ / 10UL))
+ break;
+ if (work->nr_pages <= 0)
+ break;
}
- if (!list_empty(&wb->b_more_io))
- wbc->more_io = 1;
}
- /* b_io is empty */
- return 1;
+ return wrote;
}
- void writeback_inodes_wb(struct bdi_writeback *wb,
- struct writeback_control *wbc)
+ static long __writeback_inodes_wb(struct bdi_writeback *wb,
+ struct wb_writeback_work *work)
{
- int ret = 0;
-
- if (!wbc->wb_start)
- wbc->wb_start = jiffies; /* livelock avoidance */
- spin_lock(&inode_wb_list_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
- queue_io(wb, wbc->older_than_this);
+ unsigned long start_time = jiffies;
+ long wrote = 0;
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
- if (!pin_sb_for_writeback(sb)) {
+ if (!grab_super_passive(sb)) {
- requeue_io(inode);
+ requeue_io(inode, wb);
continue;
}
- ret = writeback_sb_inodes(sb, wb, wbc, false);
+ wrote += writeback_sb_inodes(sb, wb, work);
drop_super(sb);
- if (ret)
- break;
+ /* refer to the same tests at the end of writeback_sb_inodes */
+ if (wrote) {
+ if (time_is_before_jiffies(start_time + HZ / 10UL))
+ break;
+ if (work->nr_pages <= 0)
+ break;
+ }
}
- spin_unlock(&inode_wb_list_lock);
/* Leave any unwritten inodes on b_io */
+ return wrote;
}
- static void __writeback_inodes_sb(struct super_block *sb,
- struct bdi_writeback *wb, struct writeback_control *wbc)
+ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
{
- WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ struct wb_writeback_work work = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .range_cyclic = 1,
+ };
- spin_lock(&inode_wb_list_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
- queue_io(wb, wbc->older_than_this);
- writeback_sb_inodes(sb, wb, wbc, true);
- spin_unlock(&inode_wb_list_lock);
- }
+ spin_lock(&wb->list_lock);
+ if (list_empty(&wb->b_io))
+ queue_io(wb, NULL);
+ __writeback_inodes_wb(wb, &work);
+ spin_unlock(&wb->list_lock);
- /*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
- #define MAX_WRITEBACK_PAGES 1024
+ return nr_pages - work.nr_pages;
+ }
static inline bool over_bground_thresh(void)
{
global_page_state(NR_UNSTABLE_NFS) > background_thresh);
}
+ /*
+ * Called under wb->list_lock. If there are multiple wb per bdi,
+ * only the flusher working on the first wb should do it.
+ */
+ static void wb_update_bandwidth(struct bdi_writeback *wb,
+ unsigned long start_time)
+ {
+ __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
+ }
+
/*
* Explicit flushing or periodic writeback of "old" data.
*
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- struct writeback_control wbc = {
- .sync_mode = work->sync_mode,
- .older_than_this = NULL,
- .for_kupdate = work->for_kupdate,
- .for_background = work->for_background,
- .range_cyclic = work->range_cyclic,
- };
+ unsigned long wb_start = jiffies;
+ long nr_pages = work->nr_pages;
unsigned long oldest_jif;
- long wrote = 0;
- long write_chunk;
struct inode *inode;
+ long progress;
- if (wbc.for_kupdate) {
- wbc.older_than_this = &oldest_jif;
- oldest_jif = jiffies -
- msecs_to_jiffies(dirty_expire_interval * 10);
- }
- if (!wbc.range_cyclic) {
- wbc.range_start = 0;
- wbc.range_end = LLONG_MAX;
- }
+ oldest_jif = jiffies;
+ work->older_than_this = &oldest_jif;
- /*
- * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
- * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
- * here avoids calling into writeback_inodes_wb() more than once.
- *
- * The intended call sequence for WB_SYNC_ALL writeback is:
- *
- * wb_writeback()
- * __writeback_inodes_sb() <== called only once
- * write_cache_pages() <== called once for each inode
- * (quickly) tag currently dirty pages
- * (maybe slowly) sync all tagged pages
- */
- if (wbc.sync_mode == WB_SYNC_NONE)
- write_chunk = MAX_WRITEBACK_PAGES;
- else
- write_chunk = LONG_MAX;
-
- wbc.wb_start = jiffies; /* livelock avoidance */
+ spin_lock(&wb->list_lock);
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
if (work->for_background && !over_bground_thresh())
break;
- wbc.more_io = 0;
- wbc.nr_to_write = write_chunk;
- wbc.pages_skipped = 0;
+ if (work->for_kupdate) {
+ oldest_jif = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
+ work->older_than_this = &oldest_jif;
+ }
- trace_wbc_writeback_start(&wbc, wb->bdi);
+ trace_writeback_start(wb->bdi, work);
+ if (list_empty(&wb->b_io))
+ queue_io(wb, work->older_than_this);
if (work->sb)
- __writeback_inodes_sb(work->sb, wb, &wbc);
+ progress = writeback_sb_inodes(work->sb, wb, work);
else
- writeback_inodes_wb(wb, &wbc);
- trace_wbc_writeback_written(&wbc, wb->bdi);
+ progress = __writeback_inodes_wb(wb, work);
+ trace_writeback_written(wb->bdi, work);
- work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wb_update_bandwidth(wb, wb_start);
/*
- * If we consumed everything, see if we have more
+ * Did we write something? Try for more
+ *
+ * Dirty inodes are moved to b_io for writeback in batches.
+ * The completion of the current batch does not necessarily
+ * mean the overall work is done. So we keep looping as long
+ * as made some progress on cleaning pages or inodes.
*/
- if (wbc.nr_to_write <= 0)
+ if (progress)
continue;
/*
- * Didn't write everything and we don't have more IO, bail
+ * No more inodes for IO, bail
*/
- if (!wbc.more_io)
+ if (list_empty(&wb->b_more_io))
break;
- /*
- * Did we write something? Try for more
- */
- if (wbc.nr_to_write < write_chunk)
- continue;
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
* we'll just busyloop.
*/
- spin_lock(&inode_wb_list_lock);
if (!list_empty(&wb->b_more_io)) {
+ trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
- trace_wbc_writeback_wait(&wbc, wb->bdi);
spin_lock(&inode->i_lock);
- inode_wait_for_writeback(inode);
+ inode_wait_for_writeback(inode, wb);
spin_unlock(&inode->i_lock);
}
- spin_unlock(&inode_wb_list_lock);
}
+ spin_unlock(&wb->list_lock);
- return wrote;
+ return nr_pages - work->nr_pages;
}
/*
}
spin_unlock(&inode->i_lock);
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&bdi->wb.list_lock);
inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
{
DECLARE_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .done = &done,
- .nr_pages = nr,
+ .sb = sb,
+ .sync_mode = WB_SYNC_NONE,
+ .tagged_writepages = 1,
+ .done = &done,
+ .nr_pages = nr,
};
WARN_ON(!rwsem_is_locked(&sb->s_umount));
*/
int write_inode_now(struct inode *inode, int sync)
{
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
int ret;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
wbc.nr_to_write = 0;
might_sleep();
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- ret = writeback_single_inode(inode, &wbc);
+ ret = writeback_single_inode(inode, wb, &wbc);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
if (sync)
inode_sync_wait(inode);
return ret;
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
int ret;
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- ret = writeback_single_inode(inode, wbc);
+ ret = writeback_single_inode(inode, wb, wbc);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
return ret;
}
EXPORT_SYMBOL(sync_inode);
*
* inode->i_lock protects:
* inode->i_state, inode->i_hash, __iget()
- * inode_lru_lock protects:
- * inode_lru, inode->i_lru
+ * inode->i_sb->s_inode_lru_lock protects:
+ * inode->i_sb->s_inode_lru, inode->i_lru
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
- * inode_wb_list_lock protects:
+ * bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
* inode_sb_list_lock
* inode->i_lock
- * inode_lru_lock
+ * inode->i_sb->s_inode_lru_lock
*
- * inode_wb_list_lock
+ * bdi->wb.list_lock
* inode->i_lock
*
* inode_hash_lock
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
-static LIST_HEAD(inode_lru);
-static DEFINE_SPINLOCK(inode_lru_lock);
-
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
-/*
- * iprune_sem provides exclusion between the icache shrinking and the
- * umount path.
- *
- * We don't actually need it to protect anything in the umount path,
- * but only need to cycle through it to make sure any inode that
- * prune_icache took off the LRU list has been fully torn down by the
- * time we are past evict_inodes.
- */
-static DECLARE_RWSEM(iprune_sem);
-
/*
* Empty aops. Can be used for the cases where the user does not
* define any of the address_space operations.
struct inodes_stat_t inodes_stat;
static DEFINE_PER_CPU(unsigned int, nr_inodes);
+static DEFINE_PER_CPU(unsigned int, nr_unused);
static struct kmem_cache *inode_cachep __read_mostly;
static inline int get_nr_inodes_unused(void)
{
- return inodes_stat.nr_unused;
+ int i;
+ int sum = 0;
+ for_each_possible_cpu(i)
+ sum += per_cpu(nr_unused, i);
+ return sum < 0 ? 0 : sum;
}
int get_nr_dirty_inodes(void)
void __user *buffer, size_t *lenp, loff_t *ppos)
{
inodes_stat.nr_inodes = get_nr_inodes();
+ inodes_stat.nr_unused = get_nr_inodes_unused();
return proc_dointvec(table, write, buffer, lenp, ppos);
}
#endif
mutex_init(&inode->i_mutex);
lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
- init_rwsem(&inode->i_alloc_sem);
- lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+ atomic_set(&inode->i_dio_count, 0);
mapping->a_ops = &empty_aops;
mapping->host = inode;
static void inode_lru_list_add(struct inode *inode)
{
- spin_lock(&inode_lru_lock);
+ spin_lock(&inode->i_sb->s_inode_lru_lock);
if (list_empty(&inode->i_lru)) {
- list_add(&inode->i_lru, &inode_lru);
- inodes_stat.nr_unused++;
+ list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
+ inode->i_sb->s_nr_inodes_unused++;
+ this_cpu_inc(nr_unused);
}
- spin_unlock(&inode_lru_lock);
+ spin_unlock(&inode->i_sb->s_inode_lru_lock);
}
static void inode_lru_list_del(struct inode *inode)
{
- spin_lock(&inode_lru_lock);
+ spin_lock(&inode->i_sb->s_inode_lru_lock);
if (!list_empty(&inode->i_lru)) {
list_del_init(&inode->i_lru);
- inodes_stat.nr_unused--;
+ inode->i_sb->s_nr_inodes_unused--;
+ this_cpu_dec(nr_unused);
}
- spin_unlock(&inode_lru_lock);
+ spin_unlock(&inode->i_sb->s_inode_lru_lock);
}
/**
void end_writeback(struct inode *inode)
{
might_sleep();
+ /*
+ * We have to cycle tree_lock here because reclaim can be still in the
+ * process of removing the last page (in __delete_from_page_cache())
+ * and we must not free mapping under it.
+ */
+ spin_lock_irq(&inode->i_data.tree_lock);
BUG_ON(inode->i_data.nrpages);
+ spin_unlock_irq(&inode->i_data.tree_lock);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
spin_unlock(&inode_sb_list_lock);
dispose_list(&dispose);
-
- /*
- * Cycle through iprune_sem to make sure any inode that prune_icache
- * moved off the list before we took the lock has been fully torn
- * down.
- */
- down_write(&iprune_sem);
- up_write(&iprune_sem);
}
/**
}
/*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lru_lock by dispose_list().
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
*
* Any inodes which are pinned purely because of attached pagecache have their
* pagecache removed. If the inode has metadata buffers attached to
* LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order.
*/
-static void prune_icache(int nr_to_scan)
+void prune_icache_sb(struct super_block *sb, int nr_to_scan)
{
LIST_HEAD(freeable);
int nr_scanned;
unsigned long reap = 0;
- down_read(&iprune_sem);
- spin_lock(&inode_lru_lock);
- for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ spin_lock(&sb->s_inode_lru_lock);
+ for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
struct inode *inode;
- if (list_empty(&inode_lru))
+ if (list_empty(&sb->s_inode_lru))
break;
- inode = list_entry(inode_lru.prev, struct inode, i_lru);
+ inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
/*
- * we are inverting the inode_lru_lock/inode->i_lock here,
+ * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
* so use a trylock. If we fail to get the lock, just move the
* inode to the back of the list so we don't spin on it.
*/
if (!spin_trylock(&inode->i_lock)) {
- list_move(&inode->i_lru, &inode_lru);
+ list_move(&inode->i_lru, &sb->s_inode_lru);
continue;
}
(inode->i_state & ~I_REFERENCED)) {
list_del_init(&inode->i_lru);
spin_unlock(&inode->i_lock);
- inodes_stat.nr_unused--;
+ sb->s_nr_inodes_unused--;
+ this_cpu_dec(nr_unused);
continue;
}
/* recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
- list_move(&inode->i_lru, &inode_lru);
+ list_move(&inode->i_lru, &sb->s_inode_lru);
spin_unlock(&inode->i_lock);
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_lru_lock);
+ spin_unlock(&sb->s_inode_lru_lock);
if (remove_inode_buffers(inode))
reap += invalidate_mapping_pages(&inode->i_data,
0, -1);
iput(inode);
- spin_lock(&inode_lru_lock);
+ spin_lock(&sb->s_inode_lru_lock);
- if (inode != list_entry(inode_lru.next,
+ if (inode != list_entry(sb->s_inode_lru.next,
struct inode, i_lru))
continue; /* wrong inode or list_empty */
/* avoid lock inversions with trylock */
spin_unlock(&inode->i_lock);
list_move(&inode->i_lru, &freeable);
- inodes_stat.nr_unused--;
+ sb->s_nr_inodes_unused--;
+ this_cpu_dec(nr_unused);
}
if (current_is_kswapd())
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
- spin_unlock(&inode_lru_lock);
+ spin_unlock(&sb->s_inode_lru_lock);
dispose_list(&freeable);
- up_read(&iprune_sem);
}
-/*
- * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
- * "unused" means that no dentries are referring to the inodes: the files are
- * not open and the dcache references to those inodes have already been
- * reclaimed.
- *
- * This function is passed the number of inodes to scan, and it returns the
- * total number of remaining possibly-reclaimable inodes.
- */
-static int shrink_icache_memory(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- int nr = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
-
- if (nr) {
- /*
- * Nasty deadlock avoidance. We may hold various FS locks,
- * and we don't want to recurse into the FS that called us
- * in clear_inode() and friends..
- */
- if (!(gfp_mask & __GFP_FS))
- return -1;
- prune_icache(nr);
- }
- return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
-}
-
-static struct shrinker icache_shrinker = {
- .shrink = shrink_icache_memory,
- .seeks = DEFAULT_SEEKS,
-};
-
static void __wait_on_freeing_inode(struct inode *inode);
/*
* Called with the inode lock held.
WARN_ON(inode->i_state & I_NEW);
- if (op && op->drop_inode)
+ if (op->drop_inode)
drop = op->drop_inode(inode);
else
drop = generic_drop_inode(inode);
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
init_once);
- register_shrinker(&icache_shrinker);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = req->wb_context->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
BUG_ON (!NFS_WBACK_BUSY(req));
static void
nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = req->wb_context->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
struct pnfs_layout_segment *lseg,
int how)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = req->wb_context->dentry->d_inode;
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
data->req = req;
- data->inode = inode = req->wb_context->path.dentry->d_inode;
+ data->inode = inode = req->wb_context->dentry->d_inode;
data->cred = req->wb_context->cred;
data->lseg = get_lseg(lseg);
data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
+ /* pnfs_set_layoutcommit needs this */
+ data->mds_offset = data->args.offset;
data->args.pgbase = req->wb_pgbase + offset;
data->args.pages = data->pagevec;
data->args.count = count;
dprintk("NFS: %5u write(%s/%lld %d@%lld)",
task->tk_pid,
- data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
+ data->req->wb_context->dentry->d_inode->i_sb->s_id,
(long long)
- NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
+ NFS_FILEID(data->req->wb_context->dentry->d_inode),
data->req->wb_bytes, (long long)req_offset(data->req));
nfs_writeback_done(task, data);
dprintk("NFS: %5u write (%s/%lld %d@%lld)",
data->task.tk_pid,
- req->wb_context->path.dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
+ req->wb_context->dentry->d_inode->i_sb->s_id,
+ (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
req->wb_bytes,
(long long)req_offset(req));
struct pnfs_layout_segment *lseg)
{
struct nfs_page *first = nfs_list_entry(head->next);
- struct inode *inode = first->wb_context->path.dentry->d_inode;
+ struct inode *inode = first->wb_context->dentry->d_inode;
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
nfs_clear_request_commit(req);
dprintk("NFS: commit (%s/%lld %d@%lld)",
- req->wb_context->path.dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
+ req->wb_context->dentry->d_sb->s_id,
+ (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
req->wb_bytes,
(long long)req_offset(req));
if (status < 0) {
int status;
bool sync = true;
- if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
- wbc->for_background)
+ if (wbc->sync_mode == WB_SYNC_NONE)
sync = false;
status = pnfs_layoutcommit_inode(inode, sync);
__field( umode_t, mode )
__field( uid_t, uid )
__field( gid_t, gid )
- __field( blkcnt_t, blocks )
+ __field( __u64, blocks )
),
TP_fast_assign(
TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- __entry->mode, __entry->uid, __entry->gid,
- (unsigned long long) __entry->blocks)
+ (unsigned long) __entry->ino, __entry->mode,
+ __entry->uid, __entry->gid, __entry->blocks)
);
TRACE_EVENT(ext4_request_inode,
TP_printk("dev %d,%d ino %lu new_size %lld",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (long long) __entry->new_size)
+ __entry->new_size)
);
DECLARE_EVENT_CLASS(ext4__write_begin,
__entry->flags = flags;
),
- TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
+ TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->pos, __entry->len, __entry->flags)
__entry->copied = copied;
),
- TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
+ TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->pos, __entry->len, __entry->copied)
TP_ARGS(inode, pos, len, copied)
);
-TRACE_EVENT(ext4_writepage,
- TP_PROTO(struct inode *inode, struct page *page),
-
- TP_ARGS(inode, page),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( ino_t, ino )
- __field( pgoff_t, index )
-
- ),
-
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->index = page->index;
- ),
-
- TP_printk("dev %d,%d ino %lu page_index %lu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino, __entry->index)
-);
-
TRACE_EVENT(ext4_da_writepages,
TP_PROTO(struct inode *inode, struct writeback_control *wbc),
),
TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
- "range_start %llu range_end %llu sync_mode %d"
+ "range_start %lld range_end %lld sync_mode %d"
"for_kupdate %d range_cyclic %d writeback_index %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino, __entry->nr_to_write,
__field( int, pages_written )
__field( long, pages_skipped )
__field( int, sync_mode )
- __field( char, more_io )
__field( pgoff_t, writeback_index )
),
__entry->pages_written = pages_written;
__entry->pages_skipped = wbc->pages_skipped;
__entry->sync_mode = wbc->sync_mode;
- __entry->more_io = wbc->more_io;
__entry->writeback_index = inode->i_mapping->writeback_index;
),
TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
- " more_io %d sync_mode %d writeback_index %lu",
+ "sync_mode %d writeback_index %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino, __entry->ret,
__entry->pages_written, __entry->pages_skipped,
- __entry->more_io, __entry->sync_mode,
+ __entry->sync_mode,
(unsigned long) __entry->writeback_index)
);
TP_printk("dev %d,%d ino %lu page_index %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->index)
+ (unsigned long) __entry->index)
+);
+
+DEFINE_EVENT(ext4__page_op, ext4_writepage,
+
+ TP_PROTO(struct page *page),
+
+ TP_ARGS(page)
);
DEFINE_EVENT(ext4__page_op, ext4_readpage,
TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->index, __entry->offset)
+ (unsigned long) __entry->index, __entry->offset)
);
TRACE_EVENT(ext4_discard_blocks,
);
TRACE_EVENT(ext4_mb_release_inode_pa,
- TP_PROTO(struct super_block *sb,
- struct inode *inode,
- struct ext4_prealloc_space *pa,
+ TP_PROTO(struct ext4_prealloc_space *pa,
unsigned long long block, unsigned int count),
- TP_ARGS(sb, inode, pa, block, count),
+ TP_ARGS(pa, block, count),
TP_STRUCT__entry(
__field( dev_t, dev )
),
TP_fast_assign(
- __entry->dev = sb->s_dev;
- __entry->ino = inode->i_ino;
+ __entry->dev = pa->pa_inode->i_sb->s_dev;
+ __entry->ino = pa->pa_inode->i_ino;
__entry->block = block;
__entry->count = count;
),
);
TRACE_EVENT(ext4_mb_release_group_pa,
- TP_PROTO(struct super_block *sb,
- struct ext4_prealloc_space *pa),
+ TP_PROTO(struct ext4_prealloc_space *pa),
- TP_ARGS(sb, pa),
+ TP_ARGS(pa),
TP_STRUCT__entry(
__field( dev_t, dev )
),
TP_fast_assign(
- __entry->dev = sb->s_dev;
+ __entry->dev = pa->pa_inode->i_sb->s_dev;
__entry->pa_pstart = pa->pa_pstart;
__entry->pa_len = pa->pa_len;
),
__field( ino_t, ino )
__field( unsigned int, flags )
__field( unsigned int, len )
- __field( __u64, logical )
+ __field( __u32, logical )
+ __field( __u32, lleft )
+ __field( __u32, lright )
__field( __u64, goal )
- __field( __u64, lleft )
- __field( __u64, lright )
__field( __u64, pleft )
__field( __u64, pright )
),
__entry->pright = ar->pright;
),
- TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu "
- "lleft %llu lright %llu pleft %llu pright %llu ",
+ TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
+ "lleft %u lright %u pleft %llu pright %llu ",
MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- __entry->flags, __entry->len,
- (unsigned long long) __entry->logical,
- (unsigned long long) __entry->goal,
- (unsigned long long) __entry->lleft,
- (unsigned long long) __entry->lright,
- (unsigned long long) __entry->pleft,
- (unsigned long long) __entry->pright)
+ (unsigned long) __entry->ino, __entry->flags,
+ __entry->len, __entry->logical, __entry->goal,
+ __entry->lleft, __entry->lright, __entry->pleft,
+ __entry->pright)
);
TRACE_EVENT(ext4_allocate_blocks,
__field( __u64, block )
__field( unsigned int, flags )
__field( unsigned int, len )
- __field( __u64, logical )
+ __field( __u32, logical )
+ __field( __u32, lleft )
+ __field( __u32, lright )
__field( __u64, goal )
- __field( __u64, lleft )
- __field( __u64, lright )
__field( __u64, pleft )
__field( __u64, pright )
),
__entry->pright = ar->pright;
),
- TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu "
- "goal %llu lleft %llu lright %llu pleft %llu pright %llu",
+ TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
+ "goal %llu lleft %u lright %u pleft %llu pright %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- __entry->flags, __entry->len, __entry->block,
- (unsigned long long) __entry->logical,
- (unsigned long long) __entry->goal,
- (unsigned long long) __entry->lleft,
- (unsigned long long) __entry->lright,
- (unsigned long long) __entry->pleft,
- (unsigned long long) __entry->pright)
+ (unsigned long) __entry->ino, __entry->flags,
+ __entry->len, __entry->block, __entry->logical,
+ __entry->goal, __entry->lleft, __entry->lright,
+ __entry->pleft, __entry->pright)
);
TRACE_EVENT(ext4_free_blocks,
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
- __field( umode_t, mode )
+ __field( umode_t, mode )
__field( __u64, block )
__field( unsigned long, count )
- __field( int, flags )
+ __field( int, flags )
),
TP_fast_assign(
__entry->parent = dentry->d_parent->d_inode->i_ino;
),
- TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
+ TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
(unsigned long) __entry->parent, __entry->datasync)
__entry->dev = inode->i_sb->s_dev;
),
- TP_printk("dev %d,%d ino %ld ret %d",
+ TP_printk("dev %d,%d ino %lu ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->ret)
__entry->result_len = len;
),
- TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
+ TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->result_group, __entry->result_start,
"allocated_meta_blocks %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->mode, (unsigned long long) __entry->i_blocks,
+ __entry->mode, __entry->i_blocks,
__entry->used_blocks, __entry->reserved_data_blocks,
__entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
);
"reserved_data_blocks %d reserved_meta_blocks %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->mode, (unsigned long long) __entry->i_blocks,
+ __entry->mode, __entry->i_blocks,
__entry->md_needed, __entry->reserved_data_blocks,
__entry->reserved_meta_blocks)
);
"allocated_meta_blocks %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->mode, (unsigned long long) __entry->i_blocks,
+ __entry->mode, __entry->i_blocks,
__entry->freed_blocks, __entry->reserved_data_blocks,
__entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
);
__entry->rw = rw;
),
- TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
+ TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (unsigned long long) __entry->pos, __entry->len, __entry->rw)
+ __entry->pos, __entry->len, __entry->rw)
);
TRACE_EVENT(ext4_direct_IO_exit,
- TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret),
+ TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
+ int rw, int ret),
TP_ARGS(inode, offset, len, rw, ret),
__entry->ret = ret;
),
- TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
+ TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (unsigned long long) __entry->pos, __entry->len,
+ __entry->pos, __entry->len,
__entry->rw, __entry->ret)
);
__entry->mode = mode;
),
- TP_printk("dev %d,%d ino %ld pos %llu len %llu mode %d",
+ TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- (unsigned long long) __entry->pos,
- (unsigned long long) __entry->len, __entry->mode)
+ (unsigned long) __entry->ino, __entry->pos,
+ __entry->len, __entry->mode)
);
TRACE_EVENT(ext4_fallocate_exit,
- TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret),
+ TP_PROTO(struct inode *inode, loff_t offset,
+ unsigned int max_blocks, int ret),
TP_ARGS(inode, offset, max_blocks, ret),
__field( ino_t, ino )
__field( dev_t, dev )
__field( loff_t, pos )
- __field( unsigned, blocks )
+ __field( unsigned int, blocks )
__field( int, ret )
),
__entry->ret = ret;
),
- TP_printk("dev %d,%d ino %ld pos %llu blocks %d ret %d",
+ TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (unsigned long long) __entry->pos, __entry->blocks,
+ __entry->pos, __entry->blocks,
__entry->ret)
);
__entry->dev = dentry->d_inode->i_sb->s_dev;
),
- TP_printk("dev %d,%d ino %ld size %lld parent %ld",
+ TP_printk("dev %d,%d ino %lu size %lld parent %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino, __entry->size,
(unsigned long) __entry->parent)
__entry->ret = ret;
),
- TP_printk("dev %d,%d ino %ld ret %d",
+ TP_printk("dev %d,%d ino %lu ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->ret)
TP_STRUCT__entry(
__field( ino_t, ino )
__field( dev_t, dev )
- __field( blkcnt_t, blocks )
+ __field( __u64, blocks )
),
TP_fast_assign(
__entry->blocks = inode->i_blocks;
),
- TP_printk("dev %d,%d ino %lu blocks %lu",
+ TP_printk("dev %d,%d ino %lu blocks %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
+ (unsigned long) __entry->ino, __entry->blocks)
);
DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
- unsigned len, unsigned flags),
+ unsigned int len, unsigned int flags),
TP_ARGS(inode, lblk, len, flags),
__field( ino_t, ino )
__field( dev_t, dev )
__field( ext4_lblk_t, lblk )
- __field( unsigned, len )
- __field( unsigned, flags )
+ __field( unsigned int, len )
+ __field( unsigned int, flags )
),
TP_fast_assign(
TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (unsigned) __entry->lblk, __entry->len, __entry->flags)
+ __entry->lblk, __entry->len, __entry->flags)
);
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
- ext4_fsblk_t pblk, unsigned len, int ret),
+ ext4_fsblk_t pblk, unsigned int len, int ret),
TP_ARGS(inode, lblk, pblk, len, ret),
__field( dev_t, dev )
__field( ext4_lblk_t, lblk )
__field( ext4_fsblk_t, pblk )
- __field( unsigned, len )
+ __field( unsigned int, len )
__field( int, ret )
),
TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
+ __entry->lblk, __entry->pblk,
__entry->len, __entry->ret)
);
TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- (unsigned) __entry->lblk, (unsigned long long) __entry->pblk)
+ __entry->lblk, __entry->pblk)
);
TRACE_EVENT(ext4_load_inode,
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
+ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+ {
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+ }
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
struct inode *inode;
nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
- "BdiWriteback: %8lu kB\n"
- "BdiReclaimable: %8lu kB\n"
- "BdiDirtyThresh: %8lu kB\n"
- "DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n"
- "b_dirty: %8lu\n"
- "b_io: %8lu\n"
- "b_more_io: %8lu\n"
- "bdi_list: %8u\n"
- "state: %8lx\n",
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
return wb_has_dirty_io(&bdi->wb);
}
- static void bdi_flush_io(struct backing_dev_info *bdi)
- {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .range_cyclic = 1,
- .nr_to_write = 1024,
- };
-
- writeback_inodes_wb(&bdi->wb, &wbc);
- }
-
/*
* kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
- * the bdi from the thread.
+ * the bdi from the thread. Hopefully 1024 is
+ * large enough for efficient IO.
*/
- bdi_flush_io(bdi);
+ writeback_inodes_wb(&bdi->wb, 1024);
} else {
/*
* The spinlock makes sure we do not lose
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
- synchronize_rcu();
+ synchronize_rcu_expedited();
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
del_timer_sync(&bdi->wb.wakeup_timer);
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
+ /*
+ * Initial write bandwidth: 100 MB/s
+ */
+ #define INIT_BW (100 << (20 - PAGE_SHIFT))
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
}
bdi->dirty_exceeded = 0;
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
err = prop_local_init_percpu(&bdi->completions);
if (err) {
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_wb_list_lock);
+ bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
}
bdi_unregister(bdi);
* ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
- * inode_wb_list_lock
- * ->i_mutex
- * ->i_alloc_sem (various)
- *
+ * bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
{
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Same as read_cache_page, but don't wait for page to become unlocked
* after submitting it to the filler.
*/
struct page *read_cache_page_async(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Read into the page cache. If a page already exists, and PageUptodate() is
* not set, try to fill the page then wait for it to become unlocked.
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
error = security_inode_killpriv(dentry);
if (!error && killsuid)
error = __remove_suid(dentry, killsuid);
- if (!error)
+ if (!error && (inode->i_sb->s_flags & MS_NOSEC))
inode->i_flags |= S_NOSEC;
return error;
#include <linux/pagevec.h>
#include <trace/events/writeback.h>
+ /*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+ #define MAX_PAUSE max(HZ/5, 1)
+
+ /*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+ #define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
/* End of sysctl-exported parameters */
+ unsigned long global_dirty_limit;
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*/
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
+ __inc_bdi_stat(bdi, BDI_WRITTEN);
__prop_inc_percpu_max(&vm_completions, &bdi->completions,
bdi->max_prop_frac);
}
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- if (bdi_cap_writeback_dirty(bdi)) {
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ prop_fraction_percpu(&vm_completions, &bdi->completions,
numerator, denominator);
- } else {
- *numerator = 0;
- *denominator = 1;
- }
}
static inline void task_dirties_fraction(struct task_struct *tsk,
* effectively curb the growth of dirty pages. Light dirtiers with high enough
* dirty threshold may never get throttled.
*/
+ #define TASK_LIMIT_FRACTION 8
static unsigned long task_dirty_limit(struct task_struct *tsk,
unsigned long bdi_dirty)
{
long numerator, denominator;
unsigned long dirty = bdi_dirty;
- u64 inv = dirty >> 3;
+ u64 inv = dirty / TASK_LIMIT_FRACTION;
task_dirties_fraction(tsk, &numerator, &denominator);
inv *= numerator;
return max(dirty, bdi_dirty/2);
}
+ /* Minimum limit for any task */
+ static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+ {
+ return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+ }
+
/*
*
*/
return x + 1; /* Ensure that we never return 0 */
}
+ static unsigned long hard_dirty_limit(unsigned long thresh)
+ {
+ return max(thresh, global_dirty_limit);
+ }
+
/*
* global_dirty_limits - background-writeback and dirty-throttling thresholds
*
}
*pbackground = background;
*pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
}
- /*
+ /**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
*
- * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * And the "limit" in the name is not seriously taken as hard limit in
+ * balance_dirty_pages().
+ *
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
return bdi_dirty;
}
+ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+ unsigned long elapsed,
+ unsigned long written)
+ {
+ const unsigned long period = roundup_pow_of_two(3 * HZ);
+ unsigned long avg = bdi->avg_write_bandwidth;
+ unsigned long old = bdi->write_bandwidth;
+ u64 bw;
+
+ /*
+ * bw = written * HZ / elapsed
+ *
+ * bw * elapsed + write_bandwidth * (period - elapsed)
+ * write_bandwidth = ---------------------------------------------------
+ * period
+ */
+ bw = written - bdi->written_stamp;
+ bw *= HZ;
+ if (unlikely(elapsed > period)) {
+ do_div(bw, elapsed);
+ avg = bw;
+ goto out;
+ }
+ bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ bw >>= ilog2(period);
+
+ /*
+ * one more level of smoothing, for filtering out sudden spikes
+ */
+ if (avg > old && old >= (unsigned long)bw)
+ avg -= (avg - old) >> 3;
+
+ if (avg < old && old <= (unsigned long)bw)
+ avg += (old - avg) >> 3;
+
+ out:
+ bdi->write_bandwidth = bw;
+ bdi->avg_write_bandwidth = avg;
+ }
+
+ /*
+ * The global dirtyable memory and dirty threshold could be suddenly knocked
+ * down by a large amount (eg. on the startup of KVM in a swapless system).
+ * This may throw the system into deep dirty exceeded state and throttle
+ * heavy/light dirtiers alike. To retain good responsiveness, maintain
+ * global_dirty_limit for tracking slowly down to the knocked down dirty
+ * threshold.
+ */
+ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+ {
+ unsigned long limit = global_dirty_limit;
+
+ /*
+ * Follow up in one step.
+ */
+ if (limit < thresh) {
+ limit = thresh;
+ goto update;
+ }
+
+ /*
+ * Follow down slowly. Use the higher one as the target, because thresh
+ * may drop below dirty. This is exactly the reason to introduce
+ * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ */
+ thresh = max(thresh, dirty);
+ if (limit > thresh) {
+ limit -= (limit - thresh) >> 5;
+ goto update;
+ }
+ return;
+ update:
+ global_dirty_limit = limit;
+ }
+
+ static void global_update_bandwidth(unsigned long thresh,
+ unsigned long dirty,
+ unsigned long now)
+ {
+ static DEFINE_SPINLOCK(dirty_lock);
+ static unsigned long update_time;
+
+ /*
+ * check locklessly first to optimize away locking for the most time
+ */
+ if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ return;
+
+ spin_lock(&dirty_lock);
+ if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+ update_dirty_limit(thresh, dirty);
+ update_time = now;
+ }
+ spin_unlock(&dirty_lock);
+ }
+
+ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+ {
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long written;
+
+ /*
+ * rate-limit, only update once every 200ms.
+ */
+ if (elapsed < BANDWIDTH_INTERVAL)
+ return;
+
+ written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+ /*
+ * Skip quiet periods when disk bandwidth is under-utilized.
+ * (at least 1s idle time between two flusher runs)
+ */
+ if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ goto snapshot;
+
+ if (thresh)
+ global_update_bandwidth(thresh, dirty, now);
+
+ bdi_update_write_bandwidth(bdi, elapsed, written);
+
+ snapshot:
+ bdi->written_stamp = written;
+ bdi->bw_time_stamp = now;
+ }
+
+ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+ {
+ if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+ return;
+ spin_lock(&bdi->wb.list_lock);
+ __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+ start_time);
+ spin_unlock(&bdi->wb.list_lock);
+ }
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
static void balance_dirty_pages(struct address_space *mapping,
unsigned long write_chunk)
{
- long nr_reclaimable, bdi_nr_reclaimable;
- long nr_writeback, bdi_nr_writeback;
+ unsigned long nr_reclaimable, bdi_nr_reclaimable;
+ unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
+ unsigned long bdi_dirty;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
+ unsigned long task_bdi_thresh;
+ unsigned long min_task_bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
bool dirty_exceeded = false;
+ bool clear_dirty_exceeded = true;
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ unsigned long start_time = jiffies;
for (;;) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = write_chunk,
- .range_cyclic = 1,
- };
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
global_dirty_limits(&background_thresh, &dirty_thresh);
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_reclaimable + nr_writeback <=
- (background_thresh + dirty_thresh) / 2)
+ if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
break;
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
- bdi_thresh = task_dirty_limit(current, bdi_thresh);
+ min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+ task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ bdi_dirty = bdi_nr_reclaimable +
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
} else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ bdi_dirty = bdi_nr_reclaimable +
+ bdi_stat(bdi, BDI_WRITEBACK);
}
/*
* bdi or process from holding back light ones; The latter is
* the last resort safeguard.
*/
- dirty_exceeded =
- (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
- || (nr_reclaimable + nr_writeback > dirty_thresh);
+ dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+ (nr_dirty > dirty_thresh);
+ clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+ (nr_dirty <= dirty_thresh);
if (!dirty_exceeded)
break;
if (!bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
+ bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty, start_time);
+
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
* threshold otherwise wait until the disk writes catch
* up.
*/
- trace_wbc_balance_dirty_start(&wbc, bdi);
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wb(&bdi->wb, &wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- trace_wbc_balance_dirty_written(&wbc, bdi);
+ trace_balance_dirty_start(bdi);
+ if (bdi_nr_reclaimable > task_bdi_thresh) {
+ pages_written += writeback_inodes_wb(&bdi->wb,
+ write_chunk);
+ trace_balance_dirty_written(bdi, pages_written);
if (pages_written >= write_chunk)
break; /* We've done our duty */
}
- trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
+ trace_balance_dirty_wait(bdi);
+
+ dirty_thresh = hard_dirty_limit(dirty_thresh);
+ /*
+ * max-pause area. If dirty exceeded but still within this
+ * area, no need to sleep for more than 200ms: (a) 8 pages per
+ * 200ms is typically more than enough to curb heavy dirtiers;
+ * (b) the pause time limit makes the dirtiers more responsive.
+ */
+ if (nr_dirty < dirty_thresh +
+ dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+ time_after(jiffies, start_time + MAX_PAUSE))
+ break;
+ /*
+ * pass-good area. When some bdi gets blocked (eg. NFS server
+ * not responding), or write bandwidth dropped dramatically due
+ * to concurrent reads, or dirty threshold suddenly dropped and
+ * the dirty pages cannot be brought down anytime soon (eg. on
+ * slow USB stick), at least let go of the good bdi's.
+ */
+ if (nr_dirty < dirty_thresh +
+ dirty_thresh / DIRTY_PASSGOOD_AREA &&
+ bdi_dirty < bdi_thresh)
+ break;
/*
* Increase the delay for each loop, up to our previous
pause = HZ / 10;
}
- if (!dirty_exceeded && bdi->dirty_exceeded)
+ /* Clear dirty_exceeded flag only when no task can exceed the limit */
+ if (clear_dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long ratelimit;
unsigned long *p;
+ if (!bdi_cap_account_dirty(bdi))
+ return;
+
ratelimit = ratelimit_pages;
if (mapping->backing_dev_info->dirty_exceeded)
ratelimit = 8;
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
void account_page_writeback(struct page *page)
{
inc_zone_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_WRITTEN);
}
EXPORT_SYMBOL(account_page_writeback);
} else {
ret = TestClearPageWriteback(page);
}
- if (ret)
+ if (ret) {
dec_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+ }
return ret;
}
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- int ret;
- rcu_read_lock();
- ret = radix_tree_tagged(&mapping->page_tree, tag);
- rcu_read_unlock();
- return ret;
+ return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
- * within inode_wb_list_lock in __sync_single_inode)
+ * within bdi.wb->list_lock in __sync_single_inode)
*
- * (code doesn't rely on that order so it could be switched around)
- * ->tasklist_lock
- * anon_vma->mutex (memory_failure, collect_procs_anon)
+ * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
* pte map lock
*/
kmem_cache_free(anon_vma_cachep, anon_vma);
}
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
- return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
return -ENOMEM;
}
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ mutex_unlock(&root->mutex);
+ root = new_root;
+ mutex_lock(&root->mutex);
+ }
+ return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+ if (root)
+ mutex_unlock(&root->mutex);
+}
+
static void anon_vma_chain_link(struct vm_area_struct *vma,
struct anon_vma_chain *avc,
struct anon_vma *anon_vma)
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_lock(anon_vma);
/*
* It's critical to add new vmas to the tail of the anon_vma,
* see comment in huge_memory.c:__split_huge_page().
*/
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
- anon_vma_unlock(anon_vma);
}
/*
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- avc = anon_vma_chain_alloc();
- if (!avc)
- goto enomem_failure;
- anon_vma_chain_link(dst, avc, pavc->anon_vma);
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
}
+ unlock_anon_vma_root(root);
return 0;
enomem_failure:
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_lock(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_unlock(anon_vma);
return 0;
return -ENOMEM;
}
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
- struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
- int empty;
-
- /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
- if (!anon_vma)
- return;
-
- anon_vma_lock(anon_vma);
- list_del(&anon_vma_chain->same_anon_vma);
-
- /* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head);
- anon_vma_unlock(anon_vma);
-
- if (empty)
- put_anon_vma(anon_vma);
-}
-
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
* from newest to oldest, ensuring the root anon_vma gets freed last.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
- anon_vma_unlink(avc);
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ root = lock_anon_vma_root(root, anon_vma);
+ list_del(&avc->same_anon_vma);
+
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (list_empty(&anon_vma->head))
+ continue;
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+ unlock_anon_vma_root(root);
+
+ /*
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to acquire the anon_vma->root->mutex.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ put_anon_vma(anon_vma);
+
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
vm_flags);
if (we_locked)
unlock_page(page);
+
+ if (page_test_and_clear_young(page_to_pfn(page)))
+ referenced++;
}
out:
- if (page_test_and_clear_young(page_to_pfn(page)))
- referenced++;
-
return referenced;
}