Merge branch 'mw-3.1-jul25' of git://oss.oracle.com/git/smushran/linux-2.6 into ocfs2...

author Joel Becker <jlbec@evilplan.org>

Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)

committer Joel Becker <jlbec@evilplan.org>

Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
author Joel Becker <jlbec@evilplan.org>
Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
committer Joel Becker <jlbec@evilplan.org>
Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
diff --combined fs/ocfs2/aops.c

index ba3ca1e63b51bebc30938276462354b2a8cd6e49,ff98c169b631efb1a0e2d2467f9a31eec9cc5e85..78b68af3b0e32627b1874277d8ae58003501acb5
--- 1/fs/ocfs2/aops.c
--- 2/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@@ -290,15 -290,7 +290,15 @@@ static int ocfs2_readpage(struct file *
         }
   
         if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ +              /*
+ +               * Unlock the page and cycle ip_alloc_sem so that we don't
+ +               * busyloop waiting for ip_alloc_sem to unlock
+ +               */
                 ret = AOP_TRUNCATED_PAGE;
+ +              unlock_page(page);
+ +              unlock = 0;
+ +              down_read(&oi->ip_alloc_sem);
+ +              up_read(&oi->ip_alloc_sem);
                 goto out_inode_unlock;
         }
   
@@@ -559,9 -551,8 +559,8 @@@ bail
   
   /*
    * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
-  * particularly interested in the aio/dio case.  Like the core uses
-  * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
-  * truncation on another.
+  * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
+  * to protect io on one node from truncation on another.
    */
   static void ocfs2_dio_end_io(struct kiocb *iocb,
                              loff_t offset,
@@@ -572,25 -563,13 +571,23 @@@
   {
         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
         int level;
+ +      wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
   
         /* this io's submitter should not have unlocked this before we could */
         BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
   
-       if (ocfs2_iocb_is_sem_locked(iocb)) {
-               up_read(&inode->i_alloc_sem);
+       if (ocfs2_iocb_is_sem_locked(iocb))
                 ocfs2_iocb_clear_sem_locked(iocb);
-       }
   
+ +      if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ +              ocfs2_iocb_clear_unaligned_aio(iocb);
+ +
+ +              if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+ +                  waitqueue_active(wq)) {
+ +                      wake_up_all(wq);
+ +              }
+ +      }
+ +
         ocfs2_iocb_clear_rw_locked(iocb);
   
         level = ocfs2_iocb_rw_locked_level(iocb);
@@@ -598,6 -577,7 +595,7 @@@
   
         if (is_async)
                 aio_complete(iocb, ret, 0);
+       inode_dio_done(inode);
   }
   
   /*
@@@ -882,6 -862,12 +880,12 @@@ struct ocfs2_write_ctxt 
         struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
         struct page                     *w_target_page;
   
+       /*
+        * w_target_locked is used for page_mkwrite path indicating no unlocking
+        * against w_target_page in ocfs2_write_end_nolock.
+        */
+       unsigned int                    w_target_locked:1;
+ 
         /*
          * ocfs2_write_end() uses this to know what the real range to
          * write in the target should be.
@@@ -915,6 -901,24 +919,24 @@@ void ocfs2_unlock_and_free_pages(struc
   
   static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
   {
+       int i;
+ 
+       /*
+        * w_target_locked is only set to true in the page_mkwrite() case.
+        * The intent is to allow us to lock the target page from write_begin()
+        * to write_end(). The caller must hold a ref on w_target_page.
+        */
+       if (wc->w_target_locked) {
+               BUG_ON(!wc->w_target_page);
+               for (i = 0; i < wc->w_num_pages; i++) {
+                       if (wc->w_target_page == wc->w_pages[i]) {
+                               wc->w_pages[i] = NULL;
+                               break;
+                       }
+               }
+               mark_page_accessed(wc->w_target_page);
+               page_cache_release(wc->w_target_page);
+       }
         ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
   
         brelse(wc->w_di_bh);
@@@ -1152,20 -1156,17 +1174,17 @@@ static int ocfs2_grab_pages_for_write(s
                          */
                         lock_page(mmap_page);
   
+                       /* Exit and let the caller retry */
                         if (mmap_page->mapping != mapping) {
+                               WARN_ON(mmap_page->mapping);
                                 unlock_page(mmap_page);
-                               /*
-                                * Sanity check - the locking in
-                                * ocfs2_pagemkwrite() should ensure
-                                * that this code doesn't trigger.
-                                */
-                               ret = -EINVAL;
-                               mlog_errno(ret);
+                               ret = -EAGAIN;
                                 goto out;
                         }
   
                         page_cache_get(mmap_page);
                         wc->w_pages[i] = mmap_page;
+                       wc->w_target_locked = true;
                 } else {
                         wc->w_pages[i] = find_or_create_page(mapping, index,
                                                              GFP_NOFS);
@@@ -1180,6 -1181,8 +1199,8 @@@
                         wc->w_target_page = wc->w_pages[i];
         }
   out:
+       if (ret)
+               wc->w_target_locked = false;
         return ret;
   }
   
@@@ -1837,11 -1840,23 +1858,23 @@@ try_again
          */
         ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
                                          cluster_of_pages, mmap_page);
-       if (ret) {
+       if (ret && ret != -EAGAIN) {
                 mlog_errno(ret);
                 goto out_quota;
         }
   
+       /*
+        * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+        * the target page. In this case, we exit with no error and no target
+        * page. This will trigger the caller, page_mkwrite(), to re-try
+        * the operation.
+        */
+       if (ret == -EAGAIN) {
+               BUG_ON(wc->w_target_page);
+               ret = 0;
+               goto out_quota;
+       }
+ 
         ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                           len);
         if (ret) {
diff --combined fs/ocfs2/file.c

index 145f4533a93644be863cea16fa31ca01fb266e5b,c0f015e11c28f2d45eecbcb62b3886108b3df228..5c4a74e04ab4b67e9e64a533f1968955b4d11eca
--- 1/fs/ocfs2/file.c
--- 2/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@@ -171,7 -171,8 +171,8 @@@ static int ocfs2_dir_release(struct ino
         return 0;
   }
   
- static int ocfs2_sync_file(struct file *file, int datasync)
+ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+                          int datasync)
   {
         int err = 0;
         journal_t *journal;
@@@ -184,6 -185,16 +185,16 @@@
                               file->f_path.dentry->d_name.name,
                               (unsigned long long)datasync);
   
+       err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (err)
+               return err;
+ 
+       /*
+        * Probably don't need the i_mutex at all in here, just putting it here
+        * to be consistent with how fsync used to be called, someone more
+        * familiar with the fs could possibly remove it.
+        */
+       mutex_lock(&inode->i_mutex);
         if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
                 /*
                  * We still have to flush drive's caches to get data to the
@@@ -200,6 -211,7 +211,7 @@@
   bail:
         if (err)
                 mlog_errno(err);
+       mutex_unlock(&inode->i_mutex);
   
         return (err < 0) ? -EIO : 0;
   }
@@@ -1142,6 -1154,8 +1154,8 @@@ int ocfs2_setattr(struct dentry *dentry
                 if (status)
                         goto bail_unlock;
   
+               inode_dio_wait(inode);
+ 
                 if (i_size_read(inode) > attr->ia_size) {
                         if (ocfs2_should_order_data(inode)) {
                                 status = ocfs2_begin_ordered_truncate(inode,
@@@ -1279,11 -1293,11 +1293,11 @@@ bail
         return err;
   }
   
- int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
+ int ocfs2_permission(struct inode *inode, int mask)
   {
         int ret;
   
-       if (flags & IPERM_FLAG_RCU)
+       if (mask & MAY_NOT_BLOCK)
                 return -ECHILD;
   
         ret = ocfs2_inode_lock(inode, NULL, 0);
@@@ -1293,7 -1307,7 +1307,7 @@@
                 goto out;
         }
   
-       ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
+       ret = generic_permission(inode, mask);
   
         ocfs2_inode_unlock(inode, 0);
   out:
@@@ -2038,23 -2052,6 +2052,23 @@@ out
         return ret;
   }
   
+ +static void ocfs2_aiodio_wait(struct inode *inode)
+ +{
+ +      wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+ +
+ +      wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+ +}
+ +
+ +static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+ +{
+ +      int blockmask = inode->i_sb->s_blocksize - 1;
+ +      loff_t final_size = pos + count;
+ +
+ +      if ((pos & blockmask) || (final_size & blockmask))
+ +              return 1;
+ +      return 0;
+ +}
+ +
   static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                                             struct file *file,
                                             loff_t pos, size_t count,
@@@ -2233,7 -2230,6 +2247,7 @@@ static ssize_t ocfs2_file_aio_write(str
         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
         int full_coherency = !(osb->s_mount_opt &
                                OCFS2_MOUNT_COHERENCY_BUFFERED);
+ +      int unaligned_dio = 0;
   
         trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@@ -2254,9 -2250,8 +2268,8 @@@
         ocfs2_iocb_clear_sem_locked(iocb);
   
   relock:
-       /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+       /* to match setattr's i_mutex -> rw_lock ordering */
         if (direct_io) {
-               down_read(&inode->i_alloc_sem);
                 have_alloc_sem = 1;
                 /* communicate with ocfs2_dio_end_io */
                 ocfs2_iocb_set_sem_locked(iocb);
@@@ -2302,17 -2297,12 +2315,16 @@@
                 goto out;
         }
   
+ +      if (direct_io && !is_sync_kiocb(iocb))
+ +              unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+ +                                                    *ppos);
+ +
         /*
          * We can't complete the direct I/O as requested, fall back to
          * buffered I/O.
          */
         if (direct_io && !can_do_direct) {
                 ocfs2_rw_unlock(inode, rw_level);
-               up_read(&inode->i_alloc_sem);
   
                 have_alloc_sem = 0;
                 rw_level = -1;
@@@ -2321,18 -2311,6 +2333,18 @@@
                 goto relock;
         }
   
+ +      if (unaligned_dio) {
+ +              /*
+ +               * Wait on previous unaligned aio to complete before
+ +               * proceeding.
+ +               */
+ +              ocfs2_aiodio_wait(inode);
+ +
+ +              /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+ +              atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+ +              ocfs2_iocb_set_unaligned_aio(iocb);
+ +      }
+ +
         /*
          * To later detect whether a journal commit for sync writes is
          * necessary, we sample i_size, and cluster count here.
@@@ -2395,8 -2373,7 +2407,7 @@@ out_dio
         /*
          * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
          * function pointer which is called when o_direct io completes so that
-        * it can unlock our rw lock.  (it's the clustered equivalent of
-        * i_alloc_sem; protects truncate from racing with pending ios).
+        * it can unlock our rw lock.
          * Unfortunately there are error cases which call end_io and others
          * that don't.  so we don't have to unlock the rw_lock if either an
          * async dio is going to do it in the future or an end_io after an
@@@ -2405,21 -2382,15 +2416,19 @@@
         if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                 rw_level = -1;
                 have_alloc_sem = 0;
+ +              unaligned_dio = 0;
         }
   
+ +      if (unaligned_dio)
+ +              atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+ +
   out:
         if (rw_level != -1)
                 ocfs2_rw_unlock(inode, rw_level);
   
   out_sems:
-       if (have_alloc_sem) {
-               up_read(&inode->i_alloc_sem);
+       if (have_alloc_sem)
                 ocfs2_iocb_clear_sem_locked(iocb);
-       }
   
         mutex_unlock(&inode->i_mutex);
   
@@@ -2569,7 -2540,6 +2578,6 @@@ static ssize_t ocfs2_file_aio_read(stru
          * need locks to protect pending reads from racing with truncate.
          */
         if (filp->f_flags & O_DIRECT) {
-               down_read(&inode->i_alloc_sem);
                 have_alloc_sem = 1;
                 ocfs2_iocb_set_sem_locked(iocb);
   
@@@ -2612,16 -2582,66 +2620,66 @@@
         }
   
   bail:
-       if (have_alloc_sem) {
-               up_read(&inode->i_alloc_sem);
+       if (have_alloc_sem)
                 ocfs2_iocb_clear_sem_locked(iocb);
-       }
+ 
         if (rw_level != -1)
                 ocfs2_rw_unlock(inode, rw_level);
   
         return ret;
   }
   
+ /* Refer generic_file_llseek_unlocked() */
+ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+ {
+       struct inode *inode = file->f_mapping->host;
+       int ret = 0;
+ 
+       mutex_lock(&inode->i_mutex);
+ 
+       switch (origin) {
+       case SEEK_SET:
+               break;
+       case SEEK_END:
+               offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               if (offset == 0) {
+                       offset = file->f_pos;
+                       goto out;
+               }
+               offset += file->f_pos;
+               break;
+       case SEEK_DATA:
+       case SEEK_HOLE:
+               ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+               if (ret)
+                       goto out;
+               break;
+       default:
+               ret = -EINVAL;
+               goto out;
+       }
+ 
+       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               ret = -EINVAL;
+       if (!ret && offset > inode->i_sb->s_maxbytes)
+               ret = -EINVAL;
+       if (ret)
+               goto out;
+ 
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+ 
+ out:
+       mutex_unlock(&inode->i_mutex);
+       if (ret)
+               return ret;
+       return offset;
+ }
+ 
   const struct inode_operations ocfs2_file_iops = {
         .setattr        = ocfs2_setattr,
         .getattr        = ocfs2_getattr,
@@@ -2631,12 -2651,14 +2689,14 @@@
         .listxattr      = ocfs2_listxattr,
         .removexattr    = generic_removexattr,
         .fiemap         = ocfs2_fiemap,
+       .check_acl      = ocfs2_check_acl,
   };
   
   const struct inode_operations ocfs2_special_file_iops = {
         .setattr        = ocfs2_setattr,
         .getattr        = ocfs2_getattr,
         .permission     = ocfs2_permission,
+       .check_acl      = ocfs2_check_acl,
   };
   
   /*
@@@ -2644,7 -2666,7 +2704,7 @@@
    * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
    */
   const struct file_operations ocfs2_fops = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
         .read           = do_sync_read,
         .write          = do_sync_write,
         .mmap           = ocfs2_mmap,
@@@ -2692,7 -2714,7 +2752,7 @@@ const struct file_operations ocfs2_dop
    * the cluster.
    */
   const struct file_operations ocfs2_fops_no_plocks = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
         .read           = do_sync_read,
         .write          = do_sync_write,
         .mmap           = ocfs2_mmap,
diff --combined fs/ocfs2/super.c

index 603f5fe9f81603d1d12699eebb2a5dc63d3ea6f0,3e7850380d2b6145cd917a7baaec0b6bf06f94f7..938e2b2b0c9c9f8635a43ad4686dff675885361f
--- 1/fs/ocfs2/super.c
--- 2/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@@ -54,7 -54,6 +54,7 @@@
   #include "ocfs1_fs_compat.h"
   
   #include "alloc.h"
+ +#include "aops.h"
   #include "blockcheck.h"
   #include "dlmglue.h"
   #include "export.h"
@@@ -1073,7 -1072,7 +1073,7 @@@ static int ocfs2_fill_super(struct supe
   
         sb->s_magic = OCFS2_SUPER_MAGIC;
   
-       sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+       sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
                 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
   
         /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@@ -1108,9 -1107,9 +1108,9 @@@
   
                 ocfs2_set_ro_flag(osb, 1);
   
-               printk(KERN_NOTICE "Readonly device detected. No cluster "
-                      "services will be utilized for this mount. Recovery "
-                      "will be skipped.\n");
+               printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+                      "Cluster services will not be used for this mount. "
+                      "Recovery will be skipped.\n", osb->dev_str);
         }
   
         if (!ocfs2_is_hard_readonly(osb)) {
@@@ -1617,17 -1616,12 +1617,17 @@@ static int ocfs2_show_options(struct se
         return 0;
   }
   
+ +wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+ +
   static int __init ocfs2_init(void)
   {
- -      int status;
+ +      int status, i;
   
         ocfs2_print_version();
   
+ +      for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+ +              init_waitqueue_head(&ocfs2__ioend_wq[i]);
+ +
         status = init_ocfs2_uptodate_cache();
         if (status < 0) {
                 mlog_errno(status);
@@@ -1766,7 -1760,7 +1766,7 @@@ static void ocfs2_inode_init_once(void 
         ocfs2_extent_map_init(&oi->vfs_inode);
         INIT_LIST_HEAD(&oi->ip_io_markers);
         oi->ip_dir_start_lookup = 0;
- -
+ +      atomic_set(&oi->ip_unaligned_aio, 0);
         init_rwsem(&oi->ip_alloc_sem);
         init_rwsem(&oi->ip_xattr_sem);
         mutex_init(&oi->ip_io_mutex);
@@@ -1980,8 -1974,7 +1980,8 @@@ static void ocfs2_dismount_volume(struc
          * If we failed before we got a uuid_str yet, we can't stop
          * heartbeat.  Otherwise, do it.
          */
- -      if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ +      if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ +          !ocfs2_is_hard_readonly(osb))
                 hangup_needed = 1;
   
         if (osb->cconn)
@@@ -2469,8 -2462,8 +2469,8 @@@ static int ocfs2_check_volume(struct oc
                         goto finally;
                 }
         } else {
-               mlog(ML_NOTICE, "File system was not unmounted cleanly, "
-                    "recovering volume.\n");
+               printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+                      "unmounted cleanly, recovering it.\n", osb->dev_str);
         }
   
         local = ocfs2_mount_local(osb);
author	Joel Becker <jlbec@evilplan.org>
	Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
committer	Joel Becker <jlbec@evilplan.org>
	Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
		1	2
fs/ocfs2/aops.c	patch |	diff1 |	diff2 |	blob | history
fs/ocfs2/file.c	patch |	diff1 |	diff2 |	blob | history
fs/ocfs2/super.c	patch |	diff1 |	diff2 |	blob | history