#include <linux/time.h>
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
-#include <linux/aio.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
+#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
return 0;
}
-void ext4_unwritten_wait(struct inode *inode)
+static void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
* or one thread will zero the other's data, causing corruption.
*/
static int
-ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- size_t count = iov_length(iov, nr_segs);
- loff_t final_size = pos + count;
- if (pos >= inode->i_size)
+ if (pos >= i_size_read(inode))
return 0;
- if ((pos & blockmask) || (final_size & blockmask))
+ if ((pos | iov_iter_alignment(from)) & blockmask)
return 1;
return 0;
}
static ssize_t
-ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int unaligned_aio = 0;
- ssize_t ret;
+ int o_direct = iocb->ki_flags & IOCB_DIRECT;
int overwrite = 0;
- size_t length = iov_length(iov, nr_segs);
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb))
- unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+ ssize_t ret;
- /* Unaligned direct AIO must be serialized; see comment above */
- if (unaligned_aio) {
- mutex_lock(ext4_aio_mutex(inode));
+ /*
+ * Unaligned direct AIO must be serialized; see comment above
+ * In the case of O_APPEND, assume that we must always serialize
+ */
+ if (o_direct &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) &&
+ (iocb->ki_flags & IOCB_APPEND ||
+ ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
+ aio_mutex = ext4_aio_mutex(inode);
+ mutex_lock(aio_mutex);
ext4_unwritten_wait(inode);
}
- BUG_ON(iocb->ki_pos != pos);
-
mutex_lock(&inode->i_mutex);
- blk_start_plug(&plug);
-
- iocb->private = &overwrite;
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
- /* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
- struct ext4_map_blocks map;
- unsigned int blkbits = inode->i_blkbits;
- int err, len;
+ /*
+ * If we have encountered a bitmap-format file, the size limit
+ * is smaller than s_maxbytes, which is for extent-mapped files.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- map.m_lblk = pos >> blkbits;
- map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
- - map.m_lblk;
- len = map.m_len;
+ if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
+ ret = -EFBIG;
+ goto out;
+ }
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
+ }
- err = ext4_map_blocks(NULL, inode, &map, 0);
- /*
- * 'err==len' means that all of blocks has been preallocated no
- * matter they are initialized or not. For excluding
- * uninitialized extents, we need to check m_flags. There are
- * two conditions that indicate for initialized extents.
- * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned;
- * 2) If we do a real lookup, non-flags are returned.
- * So we should check these two conditions.
- */
- if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
- overwrite = 1;
+ iocb->private = &overwrite;
+ if (o_direct) {
+ size_t length = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
+ blk_start_plug(&plug);
+
+ /* check whether we do a DIO overwrite or not */
+ if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, len;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of blocks has
+ * been preallocated no matter they are
+ * initialized or not. For excluding
+ * unwritten extents, we need to check
+ * m_flags. There are two conditions that
+ * indicate for initialized extents. 1) If we
+ * hit extent cache, EXT4_MAP_MAPPED flag is
+ * returned; 2) If we do a real lookup,
+ * non-flags are returned. So we should check
+ * these two conditions.
+ */
+ if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+ overwrite = 1;
+ }
}
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
ret = err;
}
- blk_finish_plug(&plug);
+ if (o_direct)
+ blk_finish_plug(&plug);
- if (unaligned_aio)
- mutex_unlock(ext4_aio_mutex(inode));
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
+ return ret;
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
return ret;
}
-static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+#ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
- struct inode *inode = file_inode(iocb->ki_filp);
- ssize_t ret;
-
- /*
- * If we have encountered a bitmap-format file, the size limit
- * is smaller than s_maxbytes, which is for extent-mapped files.
- */
+ struct inode *inode = bh->b_assoc_map->host;
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+ int err;
+ if (!uptodate)
+ return;
+ WARN_ON(!buffer_unwritten(bh));
+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- size_t length = iov_length(iov, nr_segs);
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
- if ((pos > sbi->s_bitmap_maxbytes ||
- (pos == sbi->s_bitmap_maxbytes && length > 0)))
- return -EFBIG;
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
- if (pos + length > sbi->s_bitmap_maxbytes) {
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
- sbi->s_bitmap_maxbytes - pos);
- }
- }
+ return result;
+}
- if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
- ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
else
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return result;
+}
+
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int err;
+ struct inode *inode = file_inode(vma->vm_file);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(inode->i_sb);
+
+ return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
return ret;
}
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+ .fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
+ .page_mkwrite = ext4_dax_mkwrite,
+ .pfn_mkwrite = ext4_dax_pfn_mkwrite,
+};
+#else
+#define ext4_dax_vm_ops ext4_file_vm_ops
+#endif
+
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = ext4_filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct address_space *mapping = file->f_mapping;
+ struct inode *inode = file->f_mapping->host;
- if (!mapping->a_ops->readpage)
- return -ENOEXEC;
+ if (ext4_encrypted_inode(inode)) {
+ int err = ext4_get_encryption_info(inode);
+ if (err)
+ return 0;
+ if (ext4_encryption_info(inode) == NULL)
+ return -ENOKEY;
+ }
file_accessed(file);
- vma->vm_ops = &ext4_file_vm_ops;
+ if (IS_DAX(file_inode(file))) {
+ vma->vm_ops = &ext4_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ } else {
+ vma->vm_ops = &ext4_file_vm_ops;
+ }
return 0;
}
{
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct ext4_inode_info *ei = EXT4_I(inode);
struct vfsmount *mnt = filp->f_path.mnt;
struct path path;
char buf[64], *cp;
+ int ret;
if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
!(sb->s_flags & MS_RDONLY))) {
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err) {
ext4_journal_stop(handle);
ext4_journal_stop(handle);
}
}
+ if (ext4_encrypted_inode(inode)) {
+ ret = ext4_get_encryption_info(inode);
+ if (ret)
+ return -EACCES;
+ if (ext4_encryption_info(inode) == NULL)
+ return -ENOKEY;
+ }
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
- if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
- struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
-
- spin_lock(&inode->i_lock);
- if (!ei->jinode) {
- if (!jinode) {
- spin_unlock(&inode->i_lock);
- return -ENOMEM;
- }
- ei->jinode = jinode;
- jbd2_journal_init_jbd_inode(ei->jinode, inode);
- jinode = NULL;
- }
- spin_unlock(&inode->i_lock);
- if (unlikely(jinode != NULL))
- jbd2_free_inode(jinode);
+ if (filp->f_mode & FMODE_WRITE) {
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ return ret;
}
return dquot_file_open(inode, filp);
}
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
}
last++;
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
if (dataoff > isize)
return -ENXIO;
- if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
- return -EINVAL;
- if (dataoff > maxsize)
- return -EINVAL;
-
- if (dataoff != file->f_pos) {
- file->f_pos = dataoff;
- file->f_version = 0;
- }
-
- return dataoff;
+ return vfs_setpos(file, dataoff, maxsize);
}
/*
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
&map, &holeoff);
if (!unwritten) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
}
if (holeoff > isize)
holeoff = isize;
- if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
- return -EINVAL;
- if (holeoff > maxsize)
- return -EINVAL;
-
- if (holeoff != file->f_pos) {
- file->f_pos = holeoff;
- file->f_version = 0;
- }
-
- return holeoff;
+ return vfs_setpos(file, holeoff, maxsize);
}
/*
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext4_file_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};