Merge branch 'mw-3.1-jul25' of git://oss.oracle.com/git/smushran/linux-2.6 into ocfs2...
author     Joel Becker <jlbec@evilplan.org>
           Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
committer  Joel Becker <jlbec@evilplan.org>
           Mon, 22 Aug 2011 04:02:57 +0000 (21:02 -0700)
fs/ocfs2/aops.c
fs/ocfs2/file.c
fs/ocfs2/super.c

diff --combined fs/ocfs2/aops.c
index ba3ca1e63b51bebc30938276462354b2a8cd6e49,ff98c169b631efb1a0e2d2467f9a31eec9cc5e85..78b68af3b0e32627b1874277d8ae58003501acb5
@@@ -290,15 -290,7 +290,15 @@@ static int ocfs2_readpage(struct file *
        }
  
        if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
 +              /*
 +               * Unlock the page and cycle ip_alloc_sem so that we don't
 +               * busy-loop waiting for ip_alloc_sem to be released.
 +               */
                ret = AOP_TRUNCATED_PAGE;
 +              unlock_page(page);
 +              unlock = 0;
 +              down_read(&oi->ip_alloc_sem);
 +              up_read(&oi->ip_alloc_sem);
                goto out_inode_unlock;
        }
  
@@@ -559,9 -551,8 +559,8 @@@ bail
  
  /*
   * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
-  * particularly interested in the aio/dio case.  Like the core uses
-  * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
-  * truncation on another.
+  * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
+  * to protect io on one node from truncation on another.
   */
  static void ocfs2_dio_end_io(struct kiocb *iocb,
                             loff_t offset,
  {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
        int level;
 +      wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
  
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
  
-       if (ocfs2_iocb_is_sem_locked(iocb)) {
-               up_read(&inode->i_alloc_sem);
+       if (ocfs2_iocb_is_sem_locked(iocb))
                ocfs2_iocb_clear_sem_locked(iocb);
-       }
  
 +      if (ocfs2_iocb_is_unaligned_aio(iocb)) {
 +              ocfs2_iocb_clear_unaligned_aio(iocb);
 +
 +              if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
 +                  waitqueue_active(wq)) {
 +                      wake_up_all(wq);
 +              }
 +      }
 +
        ocfs2_iocb_clear_rw_locked(iocb);
  
        level = ocfs2_iocb_rw_locked_level(iocb);
  
        if (is_async)
                aio_complete(iocb, ret, 0);
+       inode_dio_done(inode);
  }
  
  /*
@@@ -882,6 -862,12 +880,12 @@@ struct ocfs2_write_ctxt 
        struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
        struct page                     *w_target_page;
  
+       /*
+        * w_target_locked is used in the page_mkwrite path to indicate that
+        * w_target_page should not be unlocked in ocfs2_write_end_nolock().
+        */
+       unsigned int                    w_target_locked:1;
        /*
         * ocfs2_write_end() uses this to know what the real range to
         * write in the target should be.
@@@ -915,6 -901,24 +919,24 @@@ void ocfs2_unlock_and_free_pages(struc
  
  static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
  {
+       int i;
+       /*
+        * w_target_locked is only set to true in the page_mkwrite() case.
+        * The intent is to allow us to lock the target page from write_begin()
+        * to write_end(). The caller must hold a ref on w_target_page.
+        */
+       if (wc->w_target_locked) {
+               BUG_ON(!wc->w_target_page);
+               for (i = 0; i < wc->w_num_pages; i++) {
+                       if (wc->w_target_page == wc->w_pages[i]) {
+                               wc->w_pages[i] = NULL;
+                               break;
+                       }
+               }
+               mark_page_accessed(wc->w_target_page);
+               page_cache_release(wc->w_target_page);
+       }
        ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
  
        brelse(wc->w_di_bh);
@@@ -1152,20 -1156,17 +1174,17 @@@ static int ocfs2_grab_pages_for_write(s
                         */
                        lock_page(mmap_page);
  
+                       /* Exit and let the caller retry */
                        if (mmap_page->mapping != mapping) {
+                               WARN_ON(mmap_page->mapping);
                                unlock_page(mmap_page);
-                               /*
-                                * Sanity check - the locking in
-                                * ocfs2_pagemkwrite() should ensure
-                                * that this code doesn't trigger.
-                                */
-                               ret = -EINVAL;
-                               mlog_errno(ret);
+                               ret = -EAGAIN;
                                goto out;
                        }
  
                        page_cache_get(mmap_page);
                        wc->w_pages[i] = mmap_page;
+                       wc->w_target_locked = true;
                } else {
                        wc->w_pages[i] = find_or_create_page(mapping, index,
                                                             GFP_NOFS);
                        wc->w_target_page = wc->w_pages[i];
        }
  out:
+       if (ret)
+               wc->w_target_locked = false;
        return ret;
  }
  
@@@ -1837,11 -1840,23 +1858,23 @@@ try_again
         */
        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
                                         cluster_of_pages, mmap_page);
-       if (ret) {
+       if (ret && ret != -EAGAIN) {
                mlog_errno(ret);
                goto out_quota;
        }
  
+       /*
+        * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+        * the target page. In this case, we exit with no error and no target
+        * page. This will trigger the caller, page_mkwrite(), to re-try
+        * the operation.
+        */
+       if (ret == -EAGAIN) {
+               BUG_ON(wc->w_target_page);
+               ret = 0;
+               goto out_quota;
+       }
        ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                          len);
        if (ret) {
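
A minimal userspace sketch of the trylock-and-retry pattern that the ocfs2_readpage() hunk above adds: if ip_alloc_sem cannot be taken without risking an inversion against the page lock, the page is unlocked, the semaphore is briefly cycled so the reader does not busy-loop, and AOP_TRUNCATED_PAGE asks the VFS to retry. The sketch is an illustration only, not part of the commit; page_lock, alloc_sem and read_one_page() are hypothetical stand-ins built on POSIX threads rather than kernel primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ the page lock    */
static pthread_rwlock_t alloc_sem;                            /* ~ oi->ip_alloc_sem */

/* Returns 1 if the caller should retry (the analogue of AOP_TRUNCATED_PAGE). */
static int read_one_page(void)
{
	pthread_mutex_lock(&page_lock);

	if (pthread_rwlock_tryrdlock(&alloc_sem) != 0) {
		/* Drop the page lock, wait the current holder out by cycling
		 * the semaphore, and let the caller retry instead of spinning. */
		pthread_mutex_unlock(&page_lock);
		pthread_rwlock_rdlock(&alloc_sem);
		pthread_rwlock_unlock(&alloc_sem);
		return 1;
	}

	/* ... read the page with both locks held ... */
	pthread_rwlock_unlock(&alloc_sem);
	pthread_mutex_unlock(&page_lock);
	return 0;
}

int main(void)
{
	pthread_rwlock_init(&alloc_sem, NULL);
	while (read_one_page())
		;	/* the caller re-locks the page and tries again */
	puts("page read");
	return 0;
}
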
diff --combined fs/ocfs2/file.c
index 145f4533a93644be863cea16fa31ca01fb266e5b,c0f015e11c28f2d45eecbcb62b3886108b3df228..5c4a74e04ab4b67e9e64a533f1968955b4d11eca
@@@ -171,7 -171,8 +171,8 @@@ static int ocfs2_dir_release(struct ino
        return 0;
  }
  
- static int ocfs2_sync_file(struct file *file, int datasync)
+ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
+                          int datasync)
  {
        int err = 0;
        journal_t *journal;
                              file->f_path.dentry->d_name.name,
                              (unsigned long long)datasync);
  
+       err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (err)
+               return err;
+       /*
+        * We probably don't need i_mutex here at all; it is taken only to be
+        * consistent with how fsync used to be called. Someone more familiar
+        * with the fs could possibly remove it.
+        */
+       mutex_lock(&inode->i_mutex);
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
                /*
                 * We still have to flush drive's caches to get data to the
  bail:
        if (err)
                mlog_errno(err);
+       mutex_unlock(&inode->i_mutex);
  
        return (err < 0) ? -EIO : 0;
  }
@@@ -1142,6 -1154,8 +1154,8 @@@ int ocfs2_setattr(struct dentry *dentry
                if (status)
                        goto bail_unlock;
  
+               inode_dio_wait(inode);
                if (i_size_read(inode) > attr->ia_size) {
                        if (ocfs2_should_order_data(inode)) {
                                status = ocfs2_begin_ordered_truncate(inode,
@@@ -1279,11 -1293,11 +1293,11 @@@ bail
        return err;
  }
  
- int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
+ int ocfs2_permission(struct inode *inode, int mask)
  {
        int ret;
  
-       if (flags & IPERM_FLAG_RCU)
+       if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
  
        ret = ocfs2_inode_lock(inode, NULL, 0);
                goto out;
        }
  
-       ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
+       ret = generic_permission(inode, mask);
  
        ocfs2_inode_unlock(inode, 0);
  out:
        return ret;
  }
  
 +static void ocfs2_aiodio_wait(struct inode *inode)
 +{
 +      wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
 +
 +      wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
 +}
 +
 +static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
 +{
 +      int blockmask = inode->i_sb->s_blocksize - 1;
 +      loff_t final_size = pos + count;
 +
 +      if ((pos & blockmask) || (final_size & blockmask))
 +              return 1;
 +      return 0;
 +}
 +
  static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                                            struct file *file,
                                            loff_t pos, size_t count,
@@@ -2233,7 -2230,6 +2247,7 @@@ static ssize_t ocfs2_file_aio_write(str
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int full_coherency = !(osb->s_mount_opt &
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
 +      int unaligned_dio = 0;
  
        trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
        ocfs2_iocb_clear_sem_locked(iocb);
  
  relock:
-       /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+       /* to match setattr's i_mutex -> rw_lock ordering */
        if (direct_io) {
-               down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
                /* communicate with ocfs2_dio_end_io */
                ocfs2_iocb_set_sem_locked(iocb);
                goto out;
        }
  
 +      if (direct_io && !is_sync_kiocb(iocb))
 +              unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
 +                                                    *ppos);
 +
        /*
         * We can't complete the direct I/O as requested, fall back to
         * buffered I/O.
         */
        if (direct_io && !can_do_direct) {
                ocfs2_rw_unlock(inode, rw_level);
-               up_read(&inode->i_alloc_sem);
  
                have_alloc_sem = 0;
                rw_level = -1;
                goto relock;
        }
  
 +      if (unaligned_dio) {
 +              /*
 +               * Wait on previous unaligned aio to complete before
 +               * proceeding.
 +               */
 +              ocfs2_aiodio_wait(inode);
 +
 +              /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
 +              atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
 +              ocfs2_iocb_set_unaligned_aio(iocb);
 +      }
 +
        /*
         * To later detect whether a journal commit for sync writes is
         * necessary, we sample i_size, and cluster count here.
@@@ -2395,8 -2373,7 +2407,7 @@@ out_dio
        /*
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
         * function pointer which is called when o_direct io completes so that
-        * it can unlock our rw lock.  (it's the clustered equivalent of
-        * i_alloc_sem; protects truncate from racing with pending ios).
+        * it can unlock our rw lock.
         * Unfortunately there are error cases which call end_io and others
         * that don't.  so we don't have to unlock the rw_lock if either an
         * async dio is going to do it in the future or an end_io after an
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
                have_alloc_sem = 0;
 +              unaligned_dio = 0;
        }
  
 +      if (unaligned_dio)
 +              atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
 +
  out:
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
  
  out_sems:
-       if (have_alloc_sem) {
-               up_read(&inode->i_alloc_sem);
+       if (have_alloc_sem)
                ocfs2_iocb_clear_sem_locked(iocb);
-       }
  
        mutex_unlock(&inode->i_mutex);
  
@@@ -2569,7 -2540,6 +2578,6 @@@ static ssize_t ocfs2_file_aio_read(stru
         * need locks to protect pending reads from racing with truncate.
         */
        if (filp->f_flags & O_DIRECT) {
-               down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;
                ocfs2_iocb_set_sem_locked(iocb);
  
        }
  
  bail:
-       if (have_alloc_sem) {
-               up_read(&inode->i_alloc_sem);
+       if (have_alloc_sem)
                ocfs2_iocb_clear_sem_locked(iocb);
-       }
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
  
        return ret;
  }
  
+ /* Refer to generic_file_llseek_unlocked(). */
+ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+ {
+       struct inode *inode = file->f_mapping->host;
+       int ret = 0;
+       mutex_lock(&inode->i_mutex);
+       switch (origin) {
+       case SEEK_SET:
+               break;
+       case SEEK_END:
+               offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               if (offset == 0) {
+                       offset = file->f_pos;
+                       goto out;
+               }
+               offset += file->f_pos;
+               break;
+       case SEEK_DATA:
+       case SEEK_HOLE:
+               ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+               if (ret)
+                       goto out;
+               break;
+       default:
+               ret = -EINVAL;
+               goto out;
+       }
+       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               ret = -EINVAL;
+       if (!ret && offset > inode->i_sb->s_maxbytes)
+               ret = -EINVAL;
+       if (ret)
+               goto out;
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+ out:
+       mutex_unlock(&inode->i_mutex);
+       if (ret)
+               return ret;
+       return offset;
+ }
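
The ocfs2_file_llseek() added above routes SEEK_DATA and SEEK_HOLE to ocfs2_seek_data_hole_offset() while keeping the usual SEEK_SET/SEEK_CUR/SEEK_END semantics. As a hypothetical illustration (not part of the commit), a userspace caller would reach this path through plain lseek(2):

#define _GNU_SOURCE		/* for SEEK_HOLE / SEEK_DATA on glibc */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* "testfile" is a placeholder path; any file on the mounted fs works. */
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Find the first hole at or after byte 0; a file with no holes
	 * reports the implicit hole at end-of-file. */
	off_t hole = lseek(fd, 0, SEEK_HOLE);
	if (hole < 0)
		perror("lseek(SEEK_HOLE)");
	else
		printf("first hole at byte %lld\n", (long long)hole);

	close(fd);
	return 0;
}
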
  const struct inode_operations ocfs2_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
        .fiemap         = ocfs2_fiemap,
+       .check_acl      = ocfs2_check_acl,
  };
  
  const struct inode_operations ocfs2_special_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
        .permission     = ocfs2_permission,
+       .check_acl      = ocfs2_check_acl,
  };
  
  /*
   * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
   */
  const struct file_operations ocfs2_fops = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@@ -2692,7 -2714,7 +2752,7 @@@ const struct file_operations ocfs2_dop
   * the cluster.
   */
  const struct file_operations ocfs2_fops_no_plocks = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
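
Across the two files above, unaligned AIO is throttled with a per-inode counter (ip_unaligned_aio) and a wait queue: the write path waits for earlier unaligned i/o in ocfs2_aiodio_wait() and then bumps the counter, and ocfs2_dio_end_io() drops it and wakes waiters. A minimal userspace sketch of that counter-plus-waitqueue pattern, using POSIX threads and hypothetical names (unaligned_aio_begin()/unaligned_aio_end()), not the kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int unaligned_in_flight;		/* ~ oi->ip_unaligned_aio */

/* Before submitting an unaligned async write
 * (~ ocfs2_aiodio_wait() followed by atomic_inc() in the write path). */
static void unaligned_aio_begin(void)
{
	pthread_mutex_lock(&lock);
	while (unaligned_in_flight > 0)		/* wait out earlier unaligned i/o */
		pthread_cond_wait(&drained, &lock);
	unaligned_in_flight++;
	pthread_mutex_unlock(&lock);
}

/* From i/o completion (~ the unaligned-aio branch in ocfs2_dio_end_io()). */
static void unaligned_aio_end(void)
{
	pthread_mutex_lock(&lock);
	if (--unaligned_in_flight == 0)
		pthread_cond_broadcast(&drained);	/* ~ wake_up_all(wq) */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	unaligned_aio_begin();
	unaligned_aio_end();
	puts("unaligned i/o drained");
	return 0;
}
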
diff --combined fs/ocfs2/super.c
index 603f5fe9f81603d1d12699eebb2a5dc63d3ea6f0,3e7850380d2b6145cd917a7baaec0b6bf06f94f7..938e2b2b0c9c9f8635a43ad4686dff675885361f
@@@ -54,7 -54,6 +54,7 @@@
  #include "ocfs1_fs_compat.h"
  
  #include "alloc.h"
 +#include "aops.h"
  #include "blockcheck.h"
  #include "dlmglue.h"
  #include "export.h"
@@@ -1073,7 -1072,7 +1073,7 @@@ static int ocfs2_fill_super(struct supe
  
        sb->s_magic = OCFS2_SUPER_MAGIC;
  
-       sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+       sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
                ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
  
        /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
  
                ocfs2_set_ro_flag(osb, 1);
  
-               printk(KERN_NOTICE "Readonly device detected. No cluster "
-                      "services will be utilized for this mount. Recovery "
-                      "will be skipped.\n");
+               printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+                      "Cluster services will not be used for this mount. "
+                      "Recovery will be skipped.\n", osb->dev_str);
        }
  
        if (!ocfs2_is_hard_readonly(osb)) {
@@@ -1617,17 -1616,12 +1617,17 @@@ static int ocfs2_show_options(struct se
        return 0;
  }
  
 +wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
 +
  static int __init ocfs2_init(void)
  {
 -      int status;
 +      int status, i;
  
        ocfs2_print_version();
  
 +      for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
 +              init_waitqueue_head(&ocfs2__ioend_wq[i]);
 +
        status = init_ocfs2_uptodate_cache();
        if (status < 0) {
                mlog_errno(status);
@@@ -1766,7 -1760,7 +1766,7 @@@ static void ocfs2_inode_init_once(void 
        ocfs2_extent_map_init(&oi->vfs_inode);
        INIT_LIST_HEAD(&oi->ip_io_markers);
        oi->ip_dir_start_lookup = 0;
 -
 +      atomic_set(&oi->ip_unaligned_aio, 0);
        init_rwsem(&oi->ip_alloc_sem);
        init_rwsem(&oi->ip_xattr_sem);
        mutex_init(&oi->ip_io_mutex);
@@@ -1980,8 -1974,7 +1980,8 @@@ static void ocfs2_dismount_volume(struc
         * If we failed before we got a uuid_str yet, we can't stop
         * heartbeat.  Otherwise, do it.
         */
 -      if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
 +      if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
 +          !ocfs2_is_hard_readonly(osb))
                hangup_needed = 1;
  
        if (osb->cconn)
@@@ -2469,8 -2462,8 +2469,8 @@@ static int ocfs2_check_volume(struct oc
                        goto finally;
                }
        } else {
-               mlog(ML_NOTICE, "File system was not unmounted cleanly, "
-                    "recovering volume.\n");
+               printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+                      "unmounted cleanly, recovering it.\n", osb->dev_str);
        }
  
        local = ocfs2_mount_local(osb);
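
The super.c hunk above adds a small global table of wait queue heads (ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]) that ocfs2_init() initializes once, so inodes share a few wait channels instead of each carrying its own. A rough userspace analogue of such a hashed wait-channel table (assumed bucket count, hypothetical names; not the commit's code) is:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define IOEND_WQ_HASH_SZ 37		/* bucket count is an assumption */

static pthread_mutex_t ioend_lock[IOEND_WQ_HASH_SZ];
static pthread_cond_t  ioend_wq[IOEND_WQ_HASH_SZ];

/* ~ the initialization loop added to ocfs2_init(). */
static void ioend_wq_init(void)
{
	for (int i = 0; i < IOEND_WQ_HASH_SZ; i++) {
		pthread_mutex_init(&ioend_lock[i], NULL);
		pthread_cond_init(&ioend_wq[i], NULL);
	}
}

/* ~ ocfs2_ioend_wq(inode): map an object to a shared bucket so that many
 * objects multiplex onto a small table of wait channels. */
static unsigned int ioend_bucket(const void *obj)
{
	return (unsigned int)(((uintptr_t)obj >> 4) % IOEND_WQ_HASH_SZ);
}

int main(void)
{
	int inode_stand_in;

	ioend_wq_init();
	printf("waiters for this object use bucket %u\n",
	       ioend_bucket(&inode_stand_in));
	return 0;
}
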