kvm_guest_exit();
continue;
}
-
- trace_hardirqs_on();
#endif
kvm_guest_enter();
return kvmppc_core_create_memslot(slot, npages);
}
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem,
#endif
#ifdef CONFIG_KVM_MPIC
case KVM_CAP_IRQ_MPIC: {
- struct file *filp;
+ struct fd f;
struct kvm_device *dev;
r = -EBADF;
- filp = fget(cap->args[0]);
- if (!filp)
+ f = fdget(cap->args[0]);
+ if (!f.file)
break;
r = -EPERM;
- dev = kvm_device_from_filp(filp);
+ dev = kvm_device_from_filp(f.file);
if (dev)
r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
- fput(filp);
+ fdput(f);
break;
}
#endif
#ifdef CONFIG_KVM_XICS
case KVM_CAP_IRQ_XICS: {
- struct file *filp;
+ struct fd f;
struct kvm_device *dev;
r = -EBADF;
- filp = fget(cap->args[0]);
- if (!filp)
+ f = fdget(cap->args[0]);
+ if (!f.file)
break;
r = -EPERM;
- dev = kvm_device_from_filp(filp);
+ dev = kvm_device_from_filp(f.file);
if (dev)
r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
- fput(filp);
+ fdput(f);
break;
}
#endif /* CONFIG_KVM_XICS */
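The two capability cases above replace fget()/fput() with the lighter-weight fdget()/fdput() helpers; the same conversion appears again in copy_module_from_fd() further down. A minimal, hedged sketch of the pattern (example_use_fd() is an illustrative name, not part of the patch): fdget() returns a struct fd whose .file member is NULL for a bad descriptor, and fdput() only drops the reference if one was actually taken.

#include <linux/file.h>		/* struct fd, fdget(), fdput() */
#include <linux/errno.h>

static int example_use_fd(unsigned int fd)
{
	struct fd f = fdget(fd);

	if (!f.file)
		return -EBADF;

	/* use f.file exactly where a struct file * from fget() was used */

	fdput(f);
	return 0;
}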
*/
static void d_free(struct dentry *dentry)
{
- BUG_ON(dentry->d_count);
+ BUG_ON(dentry->d_lockref.count);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
}
if (ref)
- dentry->d_count--;
+ dentry->d_lockref.count--;
/*
* inform the fs via d_prune that this dentry is about to be
* unhashed and destroyed.
*/
- if (dentry->d_flags & DCACHE_OP_PRUNE)
+ if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);
dentry_lru_del(dentry);
return;
repeat:
- if (dentry->d_count == 1)
+ if (dentry->d_lockref.count == 1)
might_sleep();
- spin_lock(&dentry->d_lock);
- BUG_ON(!dentry->d_count);
- if (dentry->d_count > 1) {
- dentry->d_count--;
- spin_unlock(&dentry->d_lock);
+ if (lockref_put_or_lock(&dentry->d_lockref))
return;
- }
if (dentry->d_flags & DCACHE_OP_DELETE) {
if (dentry->d_op->d_delete(dentry))
dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
- dentry->d_count--;
+ dentry->d_lockref.count--;
spin_unlock(&dentry->d_lock);
return;
* We also need to leave mountpoints alone,
* directory or not.
*/
- if (dentry->d_count > 1 && dentry->d_inode) {
+ if (dentry->d_lockref.count > 1 && dentry->d_inode) {
if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
spin_unlock(&dentry->d_lock);
return -EBUSY;
/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
{
- dentry->d_count++;
+ dentry->d_lockref.count++;
}
static inline void __dget(struct dentry *dentry)
{
- spin_lock(&dentry->d_lock);
- __dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
+ lockref_get(&dentry->d_lockref);
}
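The dcache hunks here convert open-coded d_count manipulation under d_lock to the new lockref helpers (lockref_get(), lockref_get_not_zero(), lockref_put_or_lock()). The sketch below only illustrates the semantics lockref_put_or_lock() is assumed to provide for these call sites; the real lib/lockref.c implementation also has a lockless cmpxchg fast path on architectures that support it.

#include <linux/lockref.h>	/* struct lockref: ->lock, ->count */

/* Drop a reference if others remain; otherwise return with the lock held
 * so the caller can proceed to tear the object down (as dput() does above).
 */
static bool lockref_put_or_lock_sketch(struct lockref *lockref)
{
	spin_lock(&lockref->lock);
	if (lockref->count > 1) {
		lockref->count--;
		spin_unlock(&lockref->lock);
		return true;	/* reference dropped, lock not held */
	}
	return false;		/* count <= 1, lock still held on return */
}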
struct dentry *dget_parent(struct dentry *dentry)
{
+ int gotref;
struct dentry *ret;
+ /*
+ * Do optimistic parent lookup without any
+ * locking.
+ */
+ rcu_read_lock();
+ ret = ACCESS_ONCE(dentry->d_parent);
+ gotref = lockref_get_not_zero(&ret->d_lockref);
+ rcu_read_unlock();
+ if (likely(gotref)) {
+ if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+ return ret;
+ dput(ret);
+ }
+
repeat:
/*
* Don't need rcu_dereference because we re-check it was correct under
goto repeat;
}
rcu_read_unlock();
- BUG_ON(!ret->d_count);
- ret->d_count++;
+ BUG_ON(!ret->d_lockref.count);
+ ret->d_lockref.count++;
spin_unlock(&ret->d_lock);
return ret;
}
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
- if (!dentry->d_count) {
+ if (!dentry->d_lockref.count) {
+ /*
+ * inform the fs via d_prune that this dentry
+ * is about to be unhashed and destroyed.
+ */
+ if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+ !d_unhashed(dentry))
+ dentry->d_op->d_prune(dentry);
+
__dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
/* Prune ancestors. */
dentry = parent;
while (dentry) {
- spin_lock(&dentry->d_lock);
- if (dentry->d_count > 1) {
- dentry->d_count--;
- spin_unlock(&dentry->d_lock);
+ if (lockref_put_or_lock(&dentry->d_lockref))
return;
- }
dentry = dentry_kill(dentry, 1);
}
}
* the LRU because of laziness during lookup. Do not free
* it - just keep it off the LRU list.
*/
- if (dentry->d_count) {
+ if (dentry->d_lockref.count) {
dentry_lru_del(dentry);
spin_unlock(&dentry->d_lock);
continue;
* inform the fs that this dentry is about to be
* unhashed and destroyed.
*/
- if (dentry->d_flags & DCACHE_OP_PRUNE)
+ if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+ !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);
dentry_lru_del(dentry);
__d_shrink(dentry);
- if (dentry->d_count != 0) {
+ if (dentry->d_lockref.count != 0) {
printk(KERN_ERR
"BUG: Dentry %p{i=%lx,n=%s}"
" still in use (%d)"
dentry->d_inode ?
dentry->d_inode->i_ino : 0UL,
dentry->d_name.name,
- dentry->d_count,
+ dentry->d_lockref.count,
dentry->d_sb->s_type->name,
dentry->d_sb->s_id);
BUG();
list_del(&dentry->d_u.d_child);
} else {
parent = dentry->d_parent;
- parent->d_count--;
+ parent->d_lockref.count--;
list_del(&dentry->d_u.d_child);
}
dentry = sb->s_root;
sb->s_root = NULL;
- dentry->d_count--;
+ dentry->d_lockref.count--;
shrink_dcache_for_umount_subtree(dentry);
while (!hlist_bl_empty(&sb->s_anon)) {
* loop in shrink_dcache_parent() might not make any progress
* and loop forever.
*/
- if (dentry->d_count) {
+ if (dentry->d_lockref.count) {
dentry_lru_del(dentry);
} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
dentry_lru_move_list(dentry, dispose);
smp_wmb();
dentry->d_name.name = dname;
- dentry->d_count = 1;
+ dentry->d_lockref.count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
seqcount_init(&dentry->d_seq);
* without taking d_lock and checking d_seq sequence count against @seq
* returned here.
*
- * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
* function.
*
* Alternatively, __d_lookup_rcu may be called again to look up the child of
goto next;
}
- dentry->d_count++;
+ dentry->d_lockref.count++;
found = dentry;
spin_unlock(&dentry->d_lock);
break;
spin_lock(&dentry->d_lock);
inode = dentry->d_inode;
isdir = S_ISDIR(inode->i_mode);
- if (dentry->d_count == 1) {
+ if (dentry->d_lockref.count == 1) {
if (!spin_trylock(&inode->i_lock)) {
spin_unlock(&dentry->d_lock);
cpu_relax();
}
if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
dentry->d_flags |= DCACHE_GENOCIDE;
- dentry->d_count--;
+ dentry->d_lockref.count--;
}
spin_unlock(&dentry->d_lock);
}
struct dentry *child = this_parent;
if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
this_parent->d_flags |= DCACHE_GENOCIDE;
- this_parent->d_count--;
+ this_parent->d_lockref.count--;
}
this_parent = try_to_ascend(this_parent, locked, seq);
if (!this_parent)
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
- #define EXT4_IO_END_DIRECT 0x0002
/*
* For converting uninitialized extents on a work queue. 'handle' is used for
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct kiocb *iocb; /* iocb struct for AIO */
- int result; /* error value for AIO */
atomic_t count; /* reference counter */
} ext4_io_end_t;
/* Do not put hole in extent cache */
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
+/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
+
/*
* Flags used by ext4_free_blocks
*/
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RESERVE 0x0040
/*
* ioctl commands
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* Completed IOs that need unwritten extents handling and don't have
* transaction reserved
*/
- struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
- struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for unreserved extent convertions (dio) */
- struct workqueue_struct *unrsv_conversion_wq;
/* workqueue for reserved extent conversions (buffered io) */
struct workqueue_struct *rsv_conversion_wq;
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- /* Writeback has to have coversion transaction reserved */
- WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
- !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
extern void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh);
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
- /*
- * XXX: replace with spinlock if seen contended -bzzz
- */
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+ !mutex_is_locked(&inode->i_mutex));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = newsize;
up_write(&EXT4_I(inode)->i_data_sem);
- return ;
+}
+
+/*
+ * Update i_disksize after writeback has been started. Races with truncate
+ * are avoided by checking i_size under i_data_sem.
+ */
+static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (newsize > i_size)
+ newsize = i_size;
+ if (newsize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
}
struct ext4_group_info {
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
struct ext4_ext_path;
struct ext4_extent;
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
struct ext4_ext_path *,
struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
+ struct ext4_ext_path *,
+ int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern int ext4_find_delalloc_range(struct inode *inode,
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
- extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
}
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
ext4_journal_stop(handle);
goto retry_grab;
}
- wait_on_page_writeback(page);
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
set_buffer_delay(bh);
} else if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
return ret;
}
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
/*
* mballoc gives us at most this number of blocks...
* XXX: That seems to be only a limitation of ext4_mb_normalize_request().
- * The rest of mballoc seems to handle chunks upto full group size.
+ * The rest of mballoc seems to handle chunks up to full group size.
*/
#define MAX_WRITEPAGES_EXTENT_LEN 2048
*
* @mpd - extent of blocks
* @lblk - logical number of the block in the file
- * @b_state - b_state of the buffer head added
+ * @bh - buffer head we want to add to the extent
*
- * the function is used to collect contig. blocks in same state
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
*/
-static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
- unsigned long b_state)
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
{
struct ext4_map_blocks *map = &mpd->map;
- /* Don't go larger than mballoc is willing to allocate */
- if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
- return 0;
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
/* First block in the extent? */
if (map->m_len == 0) {
map->m_lblk = lblk;
map->m_len = 1;
- map->m_flags = b_state & BH_FLAGS;
- return 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
}
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+
/* Can we merge the block to our big extent? */
if (lblk == map->m_lblk + map->m_len &&
- (b_state & BH_FLAGS) == map->m_flags) {
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
map->m_len++;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
- struct buffer_head *head,
- struct buffer_head *bh,
- ext4_lblk_t lblk)
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh up to @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return a
+ * value < 0 in case of an error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
{
struct inode *inode = mpd->inode;
+ int err;
ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
>> inode->i_blkbits;
do {
BUG_ON(buffer_locked(bh));
- if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
- (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
- lblk >= blocks) {
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
/* Found extent to map? */
if (mpd->map.m_len)
- return false;
- if (lblk >= blocks)
- return true;
- continue;
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
}
- if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
- return false;
} while (lblk++, (bh = bh->b_this_page) != head);
- return true;
-}
-
-static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
-{
- int len;
- loff_t size = i_size_read(mpd->inode);
- int err;
-
- BUG_ON(page->index != mpd->first_page);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
- if (!err)
- mpd->wbc->nr_to_write--;
- mpd->first_page++;
-
- return err;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
}
/*
struct inode *inode = mpd->inode;
struct buffer_head *head, *bh;
int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
- ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
- >> inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
sector_t pblock;
if (page->index > end)
break;
- /* Upto 'end' pages must be contiguous */
+ /* Up to 'end' pages must be contiguous */
BUG_ON(page->index != start);
bh = head = page_buffers(page);
do {
*/
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
- add_page_bufs_to_extent(mpd, head, bh,
- lblk);
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
pagevec_release(&pvec);
- return 0;
+ if (err > 0)
+ err = 0;
+ return err;
}
if (buffer_delay(bh)) {
clear_buffer_delay(bh);
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
- } while (++lblk < blocks &&
- (bh = bh->b_this_page) != head);
+ } while (lblk++, (bh = bh->b_this_page) != head);
/*
* FIXME: This is going to break if dioread_nolock
/* Update on-disk size after IO is submitted */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
- ext4_update_i_disksize(inode, disksize);
+ ext4_wb_update_i_disksize(inode, disksize);
err2 = ext4_mark_inode_dirty(handle, inode);
if (err2)
ext4_error(inode->i_sb,
/*
* Calculate the total number of credits to reserve for one writepages
* iteration. This is called from ext4_writepages(). We map an extent of
- * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
* the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
* bpp - 1 blocks in bpp different extents.
*/
lblk = ((ext4_lblk_t)page->index) <<
(PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
goto out;
- /* So far everything mapped? Submit the page for IO. */
- if (mpd->map.m_len == 0) {
- err = mpage_submit_page(mpd, page);
- if (err < 0)
- goto out;
- }
+ err = 0;
/*
* Accumulated enough dirty pages? This doesn't apply
if (ext4_should_dioread_nolock(inode)) {
/*
- * We may need to convert upto one extent per block in
+ * We may need to convert up to one extent per block in
* the page and we may dirty the inode.
*/
rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
goto retry_grab;
}
/* In case writeback began while the page was unlocked */
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO just return */
- if (!io_end) {
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
+ if (!io_end)
return;
- }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- ext4_put_io_end_defer(io_end);
+ ext4_put_io_end(io_end);
}
/*
ret = -ENOMEM;
goto retake_lock;
}
- io_end->flag |= EXT4_IO_END_DIRECT;
/*
* Grab reference for DIO. Will be dropped in ext4_end_io_dio()
*/
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- WARN_ON(io_end->iocb);
- /*
- * Generic code already did inode_dio_done() so we
- * have to clear EXT4_IO_END_DIRECT to not do it for
- * the second time.
- */
- io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL;
}
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+ loff_t oldsize = inode->i_size;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
- }
-
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size)) {
- handle_t *handle;
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
- if (ext4_handle_valid(handle)) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
-
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error) {
- /* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode,
- EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
+ if (error)
goto err_out;
- }
- ext4_orphan_del(handle, inode);
- orphan = 0;
- ext4_journal_stop(handle);
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
goto err_out;
}
- }
- }
-
- if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != inode->i_size) {
- loff_t oldsize = inode->i_size;
-
- i_size_write(inode, attr->ia_size);
- /*
- * Blocks are going to be removed from the inode. Wait
- * for dio in flight. Temporarily disable
- * dioread_nolock to prevent livelock.
- */
- if (orphan) {
- if (!ext4_should_journal_data(inode)) {
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
- ext4_inode_resume_unlocked_dio(inode);
- } else
- ext4_wait_for_tail_page_commit(inode);
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
}
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
/*
- * Truncate pagecache after we've waited for commit
- * in data=journal mode to make pages freeable.
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
*/
- truncate_pagecache(inode, oldsize, inode->i_size);
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
+
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
}
- ext4_truncate(inode);
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, oldsize, inode->i_size);
}
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->unrsv_conversion_wq);
flush_workqueue(sbi->rsv_conversion_wq);
- destroy_workqueue(sbi->unrsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
- INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
- INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
- Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
- Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+ Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_abort, "abort"},
#define MOPT_NO_EXT2 0x0100
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_STRING 0x0400
static const struct mount_opts {
int token;
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
{Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_path, 0, MOPT_STRING},
{Opt_journal_ioprio, 0, MOPT_GTE0},
{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
return -1;
}
- if (args->from && match_int(args, &arg))
+ if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
return -1;
if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
return -1;
return -1;
}
*journal_devnum = arg;
+ } else if (token == Opt_journal_path) {
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext4_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return -1;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext4_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return -1;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext4_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return -1;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
} else if (token == Opt_journal_ioprio) {
if (arg > 7) {
ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
goto failed_mount4;
}
- EXT4_SB(sb)->unrsv_conversion_wq =
- alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->unrsv_conversion_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
- goto failed_mount4;
- }
-
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
ext4_msg(sb, KERN_ERR, "mount failed");
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- if (EXT4_SB(sb)->unrsv_conversion_wq)
- destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
- flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
trace_ext4_sync_fs(sb, wait);
flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
dquot_writeback_dquots(sb, -1);
if (wait && test_opt(sb, BARRIER))
ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
br_read_unlock(&vfsmount_lock);
}
+/*
+ * When we move over from the RCU domain to properly refcounted
+ * long-lived dentries, we need to check the sequence numbers
+ * we got before lookup very carefully.
+ *
+ * We cannot blindly increment a dentry refcount - even if it
+ * is not locked - if it is zero, because it may have gone
+ * through the final d_kill() logic already.
+ *
+ * So for a zero refcount, we need to get the spinlock (which is
+ * safe even for a dead dentry because the de-allocation is
+ * RCU-delayed), and check the sequence count under the lock.
+ *
+ * Once we have checked the sequence count, we know it is live,
+ * and since we hold the spinlock it cannot die from under us.
+ *
+ * In contrast, if the reference count wasn't zero, we can just
+ * increment the lockref without having to take the spinlock.
+ * Even if the sequence number ends up being stale, we haven't
+ * gone through the final dput() and killed the dentry yet.
+ */
+static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
+{
+ int gotref;
+
+ gotref = lockref_get_or_lock(&dentry->d_lockref);
+
+ /* Does the sequence number still match? */
+ if (read_seqcount_retry(validate, seq)) {
+ if (gotref)
+ dput(dentry);
+ else
+ spin_unlock(&dentry->d_lock);
+ return -ECHILD;
+ }
+
+ /* Get the ref now, if we couldn't get it originally */
+ if (!gotref) {
+ dentry->d_lockref.count++;
+ spin_unlock(&dentry->d_lock);
+ }
+ return 0;
+}
+
/**
* unlazy_walk - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
nd->root.dentry != fs->root.dentry)
goto err_root;
}
- spin_lock(&parent->d_lock);
+
+ /*
+ * For a negative lookup, the lookup sequence point is the parent's
+ * sequence point, and it only needs to revalidate the parent dentry.
+ *
+ * For a positive lookup, we need to move both the parent and the
+ * dentry from the RCU domain to be properly refcounted. And the
+ * sequence number in the dentry validates *both* dentry counters,
+ * since we checked the sequence number of the parent after we got
+ * the child sequence number. So we know the parent must still
+ * be valid if the child sequence number is still valid.
+ */
if (!dentry) {
- if (!__d_rcu_to_refcount(parent, nd->seq))
- goto err_parent;
+ if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
+ goto err_root;
BUG_ON(nd->inode != parent->d_inode);
} else {
- if (dentry->d_parent != parent)
+ if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
+ goto err_root;
+ if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
goto err_parent;
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err_child;
- /*
- * If the sequence check on the child dentry passed, then
- * the child has not been removed from its parent. This
- * means the parent dentry must be valid and able to take
- * a reference at this point.
- */
- BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
- BUG_ON(!parent->d_count);
- parent->d_count++;
- spin_unlock(&dentry->d_lock);
}
- spin_unlock(&parent->d_lock);
if (want_root) {
path_get(&nd->root);
spin_unlock(&fs->lock);
nd->flags &= ~LOOKUP_RCU;
return 0;
-err_child:
- spin_unlock(&dentry->d_lock);
err_parent:
- spin_unlock(&parent->d_lock);
+ dput(dentry);
err_root:
if (want_root)
spin_unlock(&fs->lock);
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
- spin_lock(&dentry->d_lock);
- if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
- spin_unlock(&dentry->d_lock);
+
+ if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
unlock_rcu_walk();
return -ECHILD;
}
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
mntget(nd->path.mnt);
unlock_rcu_walk();
}
return s;
}
+ /**
+ * umount_lookup_last - look up last component for umount
+ * @nd: pathwalk nameidata - currently pointing at parent directory of "last"
+ * @path: pointer to container for result
+ *
+ * This is a special lookup_last function just for umount. In this case, we
+ * need to resolve the path without doing any revalidation.
+ *
+ * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+ * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+ * in almost all cases, this lookup will be served out of the dcache. The only
+ * cases where it won't are if nd->last refers to a symlink or the path is
+ * bogus and it doesn't exist.
+ *
+ * Returns:
+ * -error: if there was an error during lookup. This includes -ENOENT if the
+ * lookup found a negative dentry. The nd->path reference will also be
+ * put in this case.
+ *
+ * 0: if we successfully resolved nd->path and found it not to be a
+ * symlink that needs to be followed. "path" will also be populated.
+ * The nd->path reference will also be put.
+ *
+ * 1: if we successfully resolved nd->last and found it to be a symlink
+ * that needs to be followed. "path" will be populated with the path
+ * to the link, and nd->path will *not* be put.
+ */
+ static int
+ umount_lookup_last(struct nameidata *nd, struct path *path)
+ {
+ int error = 0;
+ struct dentry *dentry;
+ struct dentry *dir = nd->path.dentry;
+
+ if (unlikely(nd->flags & LOOKUP_RCU)) {
+ WARN_ON_ONCE(1);
+ error = -ECHILD;
+ goto error_check;
+ }
+
+ nd->flags &= ~LOOKUP_PARENT;
+
+ if (unlikely(nd->last_type != LAST_NORM)) {
+ error = handle_dots(nd, nd->last_type);
+ if (!error)
+ dentry = dget(nd->path.dentry);
+ goto error_check;
+ }
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ dentry = d_lookup(dir, &nd->last);
+ if (!dentry) {
+ /*
+ * No cached dentry. Mounted dentries are pinned in the cache,
+ * so this dentry is probably a symlink or the
+ * path doesn't actually point to a mounted dentry.
+ */
+ dentry = d_alloc(dir, &nd->last);
+ if (!dentry) {
+ error = -ENOMEM;
+ } else {
+ dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
+ }
+ }
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ error_check:
+ if (!error) {
+ if (!dentry->d_inode) {
+ error = -ENOENT;
+ dput(dentry);
+ } else {
+ path->dentry = dentry;
+ path->mnt = mntget(nd->path.mnt);
+ if (should_follow_link(dentry->d_inode,
+ nd->flags & LOOKUP_FOLLOW))
+ return 1;
+ follow_mount(path);
+ }
+ }
+ terminate_walk(nd);
+ return error;
+ }
+
+ /**
+ * path_umountat - look up a path to be umounted
+ * @dfd: directory file descriptor to start walk from
+ * @name: full pathname to walk
+ * @flags: lookup flags
+ * @nd: pathwalk nameidata
+ *
+ * Look up the given name, but don't attempt to revalidate the last component.
+ * Returns 0 and "path" will be valid on success; returns an error otherwise.
+ */
+ static int
+ path_umountat(int dfd, const char *name, struct path *path, unsigned int flags)
+ {
+ struct file *base = NULL;
+ struct nameidata nd;
+ int err;
+
+ err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
+ if (unlikely(err))
+ return err;
+
+ current->total_link_count = 0;
+ err = link_path_walk(name, &nd);
+ if (err)
+ goto out;
+
+ /* If we're in rcuwalk, drop out of it to handle last component */
+ if (nd.flags & LOOKUP_RCU) {
+ err = unlazy_walk(&nd, NULL);
+ if (err) {
+ terminate_walk(&nd);
+ goto out;
+ }
+ }
+
+ err = umount_lookup_last(&nd, path);
+ while (err > 0) {
+ void *cookie;
+ struct path link = *path;
+ err = may_follow_link(&link, &nd);
+ if (unlikely(err))
+ break;
+ nd.flags |= LOOKUP_PARENT;
+ err = follow_link(&link, &nd, &cookie);
+ if (err)
+ break;
+ err = umount_lookup_last(&nd, path);
+ put_link(&nd, &link, cookie);
+ }
+ out:
+ if (base)
+ fput(base);
+
+ if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
+ path_put(&nd.root);
+
+ return err;
+ }
+
+ /**
+ * user_path_umountat - lookup a path from userland in order to umount it
+ * @dfd: directory file descriptor
+ * @name: pathname from userland
+ * @flags: lookup flags
+ * @path: pointer to container to hold result
+ *
+ * A umount is a special case for path walking. We're not actually interested
+ * in the inode in this situation, and ESTALE errors can be a problem. We
+ * simply want to track down the dentry and vfsmount attached at the mountpoint
+ * and avoid revalidating the last component.
+ *
+ * Returns 0 and populates "path" on success.
+ */
+ int
+ user_path_umountat(int dfd, const char __user *name, unsigned int flags,
+ struct path *path)
+ {
+ struct filename *s = getname(name);
+ int error;
+
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ error = path_umountat(dfd, s->name, path, flags | LOOKUP_RCU);
+ if (unlikely(error == -ECHILD))
+ error = path_umountat(dfd, s->name, path, flags);
+ if (unlikely(error == -ESTALE))
+ error = path_umountat(dfd, s->name, path, flags | LOOKUP_REVAL);
+
+ if (likely(!error))
+ audit_inode(s, path->dentry, 0);
+
+ putname(s);
+ return error;
+ }
+
/*
* It's inline, so penalty for filesystems that don't use sticky bit is
* minimal.
{
shrink_dcache_parent(dentry);
spin_lock(&dentry->d_lock);
- if (dentry->d_count == 1)
+ if (dentry->d_lockref.count == 1)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
return -EINVAL;
/*
- * Using empty names is equivalent to using AT_SYMLINK_FOLLOW
- * on /proc/self/fd/<fd>.
+ * To use null names we require CAP_DAC_READ_SEARCH.
+ * This ensures that not everyone will be able to create
+ * a hard link using the passed file descriptor.
*/
- if (flags & AT_EMPTY_PATH)
+ if (flags & AT_EMPTY_PATH) {
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return -ENOENT;
how = LOOKUP_EMPTY;
+ }
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
struct cred;
struct swap_info_struct;
struct seq_file;
+ struct workqueue_struct;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
- ssize_t bytes, void *private, int ret,
- bool is_async);
+ ssize_t bytes, void *private);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
/* Being remounted read-only */
int s_readonly_remount;
+
+ /* AIO completions deferred from interrupt context */
+ struct workqueue_struct *s_dio_done_wq;
};
/* superblock cache pruning functions */
S_VERSION = 8,
};
- extern void touch_atime(struct path *);
+ extern void touch_atime(const struct path *);
static inline void file_accessed(struct file *file)
{
if (!(file->f_flags & O_NOATIME))
extern int vfs_getattr(struct path *, struct kstat *);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
+void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
}
static const struct kernel_param_ops param_ops_bool_enable_only = {
+ .flags = KERNEL_PARAM_FL_NOARG,
.set = param_set_bool_enable_only,
.get = param_get_bool,
};
static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
struct module_kobject *mk, char *buffer) \
{ \
- return sprintf(buffer, "%s\n", mk->mod->field); \
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
} \
static int modinfo_##field##_exists(struct module *mod) \
{ \
kfree(mod->modinfo_attrs);
}
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
static int mod_sysfs_init(struct module *mod)
{
int err;
err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
"%s", mod->name);
if (err)
- kobject_put(&mod->mkobj.kobj);
+ mod_kobject_put(mod);
/* delay uevent until full sysfs population */
out:
out_unreg_holders:
kobject_put(mod->holders_dir);
out_unreg:
- kobject_put(&mod->mkobj.kobj);
+ mod_kobject_put(mod);
out:
return err;
}
{
remove_notes_attrs(mod);
remove_sect_attrs(mod);
- kobject_put(&mod->mkobj.kobj);
+ mod_kobject_put(mod);
}
#else /* !CONFIG_SYSFS */
/* Sets info->hdr and info->len. */
static int copy_module_from_fd(int fd, struct load_info *info)
{
- struct file *file;
+ struct fd f = fdget(fd);
int err;
struct kstat stat;
loff_t pos;
ssize_t bytes = 0;
- file = fget(fd);
- if (!file)
+ if (!f.file)
return -ENOEXEC;
- err = security_kernel_module_from_file(file);
+ err = security_kernel_module_from_file(f.file);
if (err)
goto out;
- err = vfs_getattr(&file->f_path, &stat);
+ err = vfs_getattr(&f.file->f_path, &stat);
if (err)
goto out;
pos = 0;
while (pos < stat.size) {
- bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
+ bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
stat.size - pos);
if (bytes < 0) {
vfree(info->hdr);
info->len = pos;
out:
- fput(file);
+ fdput(f);
return err;
}