}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
int level;
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- if (ocfs2_iocb_is_sem_locked(iocb)) {
- up_read(&inode->i_alloc_sem);
+ if (ocfs2_iocb_is_sem_locked(iocb))
ocfs2_iocb_clear_sem_locked(iocb);
- }
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+ waitqueue_active(wq)) {
+ wake_up_all(wq);
+ }
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
if (is_async)
aio_complete(iocb, ret, 0);
+ inode_dio_done(inode);
}
/*
struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;
+ /*
+ * w_target_locked is used for page_mkwrite path indicating no unlocking
+ * against w_target_page in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
/*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
brelse(wc->w_di_bh);
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret) {
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
return 0;
}
- static int ocfs2_sync_file(struct file *file, int datasync)
+ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
int err = 0;
journal_t *journal;
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ /*
+ * Probably don't need the i_mutex at all in here, just putting it here
+ * to be consistent with how fsync used to be called, someone more
+ * familiar with the fs could possibly remove it.
+ */
+ mutex_lock(&inode->i_mutex);
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
/*
* We still have to flush drive's caches to get data to the
bail:
if (err)
mlog_errno(err);
+ mutex_unlock(&inode->i_mutex);
return (err < 0) ? -EIO : 0;
}
if (status)
goto bail_unlock;
+ inode_dio_wait(inode);
+
if (i_size_read(inode) > attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
return err;
}
- int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
+ int ocfs2_permission(struct inode *inode, int mask)
{
int ret;
- if (flags & IPERM_FLAG_RCU)
+ if (mask & MAY_NOT_BLOCK)
return -ECHILD;
ret = ocfs2_inode_lock(inode, NULL, 0);
goto out;
}
- ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
+ ret = generic_permission(inode, mask);
ocfs2_inode_unlock(inode, 0);
out:
return ret;
}
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
+}
+
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
struct file *file,
loff_t pos, size_t count,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ocfs2_iocb_clear_sem_locked(iocb);
relock:
- /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+ /* to match setattr's i_mutex -> rw_lock ordering */
if (direct_io) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_sem_locked(iocb);
goto out;
}
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+ *ppos);
+
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
*/
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
- up_read(&inode->i_alloc_sem);
have_alloc_sem = 0;
rw_level = -1;
goto relock;
}
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ ocfs2_aiodio_wait(inode);
+
+ /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+ atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
+
/*
* To later detect whether a journal commit for sync writes is
* necessary, we sample i_size, and cluster count here.
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock. (it's the clustered equivalent of
- * i_alloc_sem; protects truncate from racing with pending ios).
+ * it can unlock our rw lock.
* Unfortunately there are error cases which call end_io and others
* that don't. so we don't have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
}
+ if (unaligned_dio)
+ atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
out:
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
out_sems:
- if (have_alloc_sem) {
- up_read(&inode->i_alloc_sem);
+ if (have_alloc_sem)
ocfs2_iocb_clear_sem_locked(iocb);
- }
mutex_unlock(&inode->i_mutex);
* need locks to protect pending reads from racing with truncate.
*/
if (filp->f_flags & O_DIRECT) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
ocfs2_iocb_set_sem_locked(iocb);
}
bail:
- if (have_alloc_sem) {
- up_read(&inode->i_alloc_sem);
+ if (have_alloc_sem)
ocfs2_iocb_clear_sem_locked(iocb);
- }
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
return ret;
}
+ /* Refer generic_file_llseek_unlocked() */
+ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+ {
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (origin) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ ret = -EINVAL;
+ if (!ret && offset > inode->i_sb->s_maxbytes)
+ ret = -EINVAL;
+ if (ret)
+ goto out;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+ out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+ }
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
+ .check_acl = ocfs2_check_acl,
};
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .check_acl = ocfs2_check_acl,
};
/*
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
sb->s_magic = OCFS2_SUPER_MAGIC;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
return 0;
}
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
static int __init ocfs2_init(void)
{
- int status;
+ int status, i;
ocfs2_print_version();
+ for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+ init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
status = init_ocfs2_uptodate_cache();
if (status < 0) {
mlog_errno(status);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_dir_start_lookup = 0;
-
+ atomic_set(&oi->ip_unaligned_aio, 0);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
* If we failed before we got a uuid_str yet, we can't stop
* heartbeat. Otherwise, do it.
*/
- if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
if (osb->cconn)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
local = ocfs2_mount_local(osb);