md/raid5,6: add percpu scribble region for buffer lists
author Dan Williams <dan.j.williams@intel.com>
Tue, 14 Jul 2009 18:50:52 +0000 (11:50 -0700)
committer Dan Williams <dan.j.williams@intel.com>
Sun, 30 Aug 2009 02:09:26 +0000 (19:09 -0700)
Use percpu memory rather than the stack for storing the buffer lists used in
parity calculations.  Include space for dma address conversions and pass that
to async_tx via the async_submit_ctl.scribble pointer.

[ Impact: move memory pressure from stack to heap ]

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
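
For readers following the layout: the scribble region is one allocation that
holds the xor source/destination page list followed by the address-conversion
area handed to async_tx.  A minimal userspace sketch of that layout, using
stand-in definitions for struct page and addr_conv_t (the real types come from
the kernel headers), is:

/*
 * Userspace model of the scribble layout this patch introduces
 * (illustrative only).  The region is one kmalloc'd block laid out as
 *
 *     [ (disks + 2) x struct page *  |  (disks + 2) x addr_conv_t ]
 *
 * where the first part is the xor source/destination list and the
 * second part is passed to async_tx via async_submit_ctl.scribble.
 */
#include <stdio.h>
#include <stdlib.h>

struct page;					/* opaque, as in the kernel */
typedef struct { void *p; } addr_conv_t;	/* stand-in for the async_tx type */

/* mirrors scribble_len() in the patch */
static size_t scribble_len(int num)
{
	return sizeof(struct page *) * (num + 2) + sizeof(addr_conv_t) * (num + 2);
}

/* mirrors to_addr_conv(): the conversion area starts after the page list */
static addr_conv_t *to_addr_conv(void *scribble, int disks)
{
	return (addr_conv_t *)((char *)scribble + sizeof(struct page *) * (disks + 2));
}

int main(void)
{
	int disks = 8;	/* example array size */
	void *scribble = malloc(scribble_len(disks));

	if (!scribble)
		return 1;
	printf("scribble_len(%d) = %zu bytes\n", disks, scribble_len(disks));
	printf("page list at %p, addr_conv area at %p\n",
	       scribble, (void *)to_addr_conv(scribble, disks));
	free(scribble);
	return 0;
}

The to_addr_conv() added in the patch below performs the same offset
arithmetic on percpu->scribble.
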
drivers/md/raid5.c
drivers/md/raid5.h

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5359236a1ec76d94910db17ee1c14d48398bc0ec..7727954cf72617daf4599ae0c2422f14710001c2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -642,11 +642,18 @@ static void ops_complete_compute5(void *stripe_head_ref)
        release_stripe(sh);
 }
 
-static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
+/* return a pointer to the address conversion region of the scribble buffer */
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+                                struct raid5_percpu *percpu)
+{
+       return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       struct page **xor_srcs = percpu->scribble;
        int target = sh->ops.target;
        struct r5dev *tgt = &sh->dev[target];
        struct page *xor_dest = tgt->page;
@@ -666,7 +673,7 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
        atomic_inc(&sh->count);
 
        init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
-                         ops_complete_compute5, sh, NULL);
+                         ops_complete_compute5, sh, to_addr_conv(sh, percpu));
        if (unlikely(count == 1))
                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
        else
@@ -684,11 +691,11 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
+              struct dma_async_tx_descriptor *tx)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       struct page **xor_srcs = percpu->scribble;
        int count = 0, pd_idx = sh->pd_idx, i;
        struct async_submit_ctl submit;
 
@@ -706,7 +713,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
        }
 
        init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
-                         ops_complete_prexor, sh, NULL);
+                         ops_complete_prexor, sh, to_addr_conv(sh, percpu));
        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
        return tx;
@@ -775,11 +782,11 @@ static void ops_complete_postxor(void *stripe_head_ref)
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       struct page **xor_srcs = percpu->scribble;
        struct async_submit_ctl submit;
        int count = 0, pd_idx = sh->pd_idx, i;
        struct page *xor_dest;
@@ -819,7 +826,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
        atomic_inc(&sh->count);
 
-       init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, NULL);
+       init_async_submit(&submit, flags, tx, ops_complete_postxor, sh,
+                         to_addr_conv(sh, percpu));
        if (unlikely(count == 1))
                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
        else
@@ -838,11 +846,10 @@ static void ops_complete_check(void *stripe_head_ref)
        release_stripe(sh);
 }
 
-static void ops_run_check(struct stripe_head *sh)
+static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       struct page **xor_srcs = percpu->scribble;
        struct dma_async_tx_descriptor *tx;
        struct async_submit_ctl submit;
 
@@ -858,7 +865,8 @@ static void ops_run_check(struct stripe_head *sh)
                        xor_srcs[count++] = dev->page;
        }
 
-       init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
+       init_async_submit(&submit, 0, NULL, NULL, NULL,
+                         to_addr_conv(sh, percpu));
        tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
                           &sh->ops.zero_sum_result, &submit);
 
@@ -871,21 +879,26 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
        int overlap_clear = 0, i, disks = sh->disks;
        struct dma_async_tx_descriptor *tx = NULL;
+       raid5_conf_t *conf = sh->raid_conf;
+       struct raid5_percpu *percpu;
+       unsigned long cpu;
 
+       cpu = get_cpu();
+       percpu = per_cpu_ptr(conf->percpu, cpu);
        if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
                ops_run_biofill(sh);
                overlap_clear++;
        }
 
        if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
-               tx = ops_run_compute5(sh);
+               tx = ops_run_compute5(sh, percpu);
                /* terminate the chain if postxor is not set to be run */
                if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
                        async_tx_ack(tx);
        }
 
        if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-               tx = ops_run_prexor(sh, tx);
+               tx = ops_run_prexor(sh, percpu, tx);
 
        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                tx = ops_run_biodrain(sh, tx);
@@ -893,10 +906,10 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
        }
 
        if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
-               ops_run_postxor(sh, tx);
+               ops_run_postxor(sh, percpu, tx);
 
        if (test_bit(STRIPE_OP_CHECK, &ops_request))
-               ops_run_check(sh);
+               ops_run_check(sh, percpu);
 
        if (overlap_clear)
                for (i = disks; i--; ) {
@@ -904,6 +917,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        if (test_and_clear_bit(R5_Overlap, &dev->flags))
                                wake_up(&sh->raid_conf->wait_for_overlap);
                }
+       put_cpu();
 }
 
 static int grow_one_stripe(raid5_conf_t *conf)
@@ -953,6 +967,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        return 0;
 }
 
+/**
+ * scribble_len - return the required size of the scribble region
+ * @num - total number of disks in the array
+ *
+ * The size must be enough to contain:
+ * 1/ a struct page pointer for each device in the array +2
+ * 2/ room to convert each entry in (1) to its corresponding dma
+ *    (dma_map_page()) or page (page_address()) address.
+ *
+ * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
+ * calculate over all devices (not just the data blocks), using zeros in place
+ * of the P and Q blocks.
+ */
+static size_t scribble_len(int num)
+{
+       size_t len;
+
+       len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
+
+       return len;
+}
+
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
        /* Make all the stripes able to hold 'newsize' devices.
@@ -981,6 +1017,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        struct stripe_head *osh, *nsh;
        LIST_HEAD(newstripes);
        struct disk_info *ndisks;
+       unsigned long cpu;
        int err;
        struct kmem_cache *sc;
        int i;
@@ -1046,7 +1083,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        /* Step 3.
         * At this point, we are holding all the stripes so the array
         * is completely stalled, so now is a good time to resize
-        * conf->disks.
+        * conf->disks and the scribble region
         */
        ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
        if (ndisks) {
@@ -1057,10 +1094,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        } else
                err = -ENOMEM;
 
+       get_online_cpus();
+       conf->scribble_len = scribble_len(newsize);
+       for_each_present_cpu(cpu) {
+               struct raid5_percpu *percpu;
+               void *scribble;
+
+               percpu = per_cpu_ptr(conf->percpu, cpu);
+               scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+
+               if (scribble) {
+                       kfree(percpu->scribble);
+                       percpu->scribble = scribble;
+               } else {
+                       err = -ENOMEM;
+                       break;
+               }
+       }
+       put_online_cpus();
+
        /* Step 4, return new stripes to service */
        while(!list_empty(&newstripes)) {
                nsh = list_entry(newstripes.next, struct stripe_head, lru);
                list_del_init(&nsh->lru);
+
                for (i=conf->raid_disks; i < newsize; i++)
                        if (nsh->dev[i].page == NULL) {
                                struct page *p = alloc_page(GFP_NOIO);
@@ -4318,6 +4375,7 @@ static void raid5_free_percpu(raid5_conf_t *conf)
        for_each_possible_cpu(cpu) {
                percpu = per_cpu_ptr(conf->percpu, cpu);
                safe_put_page(percpu->spare_page);
+               kfree(percpu->scribble);
        }
 #ifdef CONFIG_HOTPLUG_CPU
        unregister_cpu_notifier(&conf->cpu_notify);
@@ -4347,9 +4405,15 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               if (!percpu->spare_page)
+               if (conf->level == 6 && !percpu->spare_page)
                        percpu->spare_page = alloc_page(GFP_KERNEL);
-               if (!percpu->spare_page) {
+               if (!percpu->scribble)
+                       percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+               if (!percpu->scribble ||
+                   (conf->level == 6 && !percpu->spare_page)) {
+                       safe_put_page(percpu->spare_page);
+                       kfree(percpu->scribble);
                        pr_err("%s: failed memory allocation for cpu%ld\n",
                               __func__, cpu);
                        return NOTIFY_BAD;
@@ -4358,7 +4422,9 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                safe_put_page(percpu->spare_page);
+               kfree(percpu->scribble);
                percpu->spare_page = NULL;
+               percpu->scribble = NULL;
                break;
        default:
                break;
@@ -4372,12 +4438,9 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
        unsigned long cpu;
        struct page *spare_page;
        struct raid5_percpu *allcpus;
+       void *scribble;
        int err;
 
-       /* the only percpu data is the raid6 spare page */
-       if (conf->level != 6)
-               return 0;
-
        allcpus = alloc_percpu(struct raid5_percpu);
        if (!allcpus)
                return -ENOMEM;
@@ -4386,12 +4449,20 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
        get_online_cpus();
        err = 0;
        for_each_present_cpu(cpu) {
-               spare_page = alloc_page(GFP_KERNEL);
-               if (!spare_page) {
+               if (conf->level == 6) {
+                       spare_page = alloc_page(GFP_KERNEL);
+                       if (!spare_page) {
+                               err = -ENOMEM;
+                               break;
+                       }
+                       per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+               }
+               scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
+               if (!scribble) {
                        err = -ENOMEM;
                        break;
                }
-               per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+               per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
        }
 #ifdef CONFIG_HOTPLUG_CPU
        conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -4443,6 +4514,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                goto abort;
 
        conf->raid_disks = mddev->raid_disks;
+       conf->scribble_len = scribble_len(conf->raid_disks);
        if (mddev->reshape_position == MaxSector)
                conf->previous_raid_disks = mddev->raid_disks;
        else
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 07a7a4102f0545b39a2be49718882a76014434d1..e7baabffee861ff0df68766e151ac5244cc6242d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -386,7 +386,15 @@ struct raid5_private_data {
        /* per cpu variables */
        struct raid5_percpu {
                struct page     *spare_page; /* Used when checking P/Q in raid6 */
+               void            *scribble;   /* space for constructing buffer
+                                             * lists and performing address
+                                             * conversions
+                                             */
        } *percpu;
+       size_t                  scribble_len; /* size of scribble region must be
+                                              * associated with conf to handle
+                                              * cpu hotplug while reshaping
+                                              */
 #ifdef CONFIG_HOTPLUG_CPU
        struct notifier_block   cpu_notify;
 #endif
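
As an aside on the resize path above: resize_stripes() allocates each CPU's
new, larger scribble buffer before freeing the old one, so an allocation
failure leaves the previous region intact and the resize can be aborted with
-ENOMEM.  A compact userspace model of that per-CPU swap (CPU iteration,
get_online_cpus()/put_online_cpus(), and the conf->scribble_len update elided;
the names here are illustrative, not from the patch):

/*
 * Userspace model (illustrative only) of the per-CPU reallocation done
 * by the resize path: allocate the larger buffer first, and free the
 * old one only once the new allocation has succeeded.
 */
#include <errno.h>
#include <stdlib.h>

struct percpu_model {
	void *scribble;
};

static int resize_scribble(struct percpu_model *percpu, size_t new_len)
{
	void *scribble = malloc(new_len);	/* kmalloc(..., GFP_NOIO) in the patch */

	if (!scribble)
		return -ENOMEM;			/* old buffer left in place */

	free(percpu->scribble);			/* kfree() of the old region */
	percpu->scribble = scribble;
	return 0;
}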