x86/irq: Plug vector cleanup race
[firefly-linux-kernel-4.4.55.git] / arch / x86 / kernel / apic / vector.c
index 74f7dd6f69b12819e273bb2f75c082cc52a9585f..a35f6b5473f48d8833bd549a80c799260df3dc6a 100644 (file)
@@ -30,7 +30,7 @@ struct apic_chip_data {
 
 struct irq_domain *x86_vector_domain;
 static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask, searched_cpumask;
+static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
 static struct irq_chip lapic_controller;
 #ifdef CONFIG_X86_IO_APIC
 static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -116,9 +116,14 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
         */
        static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
        static int current_offset = VECTOR_OFFSET_START % 16;
-       int cpu;
+       int cpu, vector;
 
-       if (d->move_in_progress)
+       /*
+        * If there is still a move in progress or the previous move has not
+        * been cleaned up completely, tell the caller to come back later.
+        */
+       if (d->move_in_progress ||
+           cpumask_intersects(d->old_domain, cpu_online_mask))
                return -EBUSY;
 
        /* Only try and allocate irqs on cpus that are present */
@@ -126,24 +131,32 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
        cpumask_clear(searched_cpumask);
        cpu = cpumask_first_and(mask, cpu_online_mask);
        while (cpu < nr_cpu_ids) {
-               int new_cpu, vector, offset;
+               int new_cpu, offset;
 
+               /* Get the possible target cpus for @mask/@cpu from the apic */
                apic->vector_allocation_domain(cpu, vector_cpumask, mask);
 
+               /*
+                * Clear the offline cpus from @vector_cpumask for searching
+                * and verify whether the result overlaps with @mask. If true,
+                * then the call to apic->cpu_mask_to_apicid_and() will
+                * succeed as well. If not, no point in trying to find a
+                * vector in this mask.
+                */
+               cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask);
+               if (!cpumask_intersects(vector_searchmask, mask))
+                       goto next_cpu;
+
                if (cpumask_subset(vector_cpumask, d->domain)) {
                        if (cpumask_equal(vector_cpumask, d->domain))
                                goto success;
                        /*
-                        * New cpumask using the vector is a proper subset of
-                        * the current in use mask. So cleanup the vector
-                        * allocation for the members that are not used anymore.
+                        * Mark the cpus which are not longer in the mask for
+                        * cleanup.
                         */
-                       cpumask_andnot(d->old_domain, d->domain,
-                                      vector_cpumask);
-                       d->move_in_progress =
-                          cpumask_intersects(d->old_domain, cpu_online_mask);
-                       cpumask_and(d->domain, d->domain, vector_cpumask);
-                       goto success;
+                       cpumask_andnot(d->old_domain, d->domain, vector_cpumask);
+                       vector = d->cfg.vector;
+                       goto update;
                }
 
                vector = current_vector;
@@ -155,41 +168,60 @@ next:
                        vector = FIRST_EXTERNAL_VECTOR + offset;
                }
 
-               if (unlikely(current_vector == vector)) {
-                       cpumask_or(searched_cpumask, searched_cpumask,
-                                  vector_cpumask);
-                       cpumask_andnot(vector_cpumask, mask, searched_cpumask);
-                       cpu = cpumask_first_and(vector_cpumask,
-                                               cpu_online_mask);
-                       continue;
-               }
+               /* If the search wrapped around, try the next cpu */
+               if (unlikely(current_vector == vector))
+                       goto next_cpu;
 
                if (test_bit(vector, used_vectors))
                        goto next;
 
-               for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
+               for_each_cpu(new_cpu, vector_searchmask) {
                        if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
                                goto next;
                }
                /* Found one! */
                current_vector = vector;
                current_offset = offset;
-               if (d->cfg.vector) {
+               /* Schedule the old vector for cleanup on all cpus */
+               if (d->cfg.vector)
                        cpumask_copy(d->old_domain, d->domain);
-                       d->move_in_progress =
-                          cpumask_intersects(d->old_domain, cpu_online_mask);
-               }
-               for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
+               for_each_cpu(new_cpu, vector_searchmask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
-               d->cfg.vector = vector;
-               cpumask_copy(d->domain, vector_cpumask);
-               goto success;
+               goto update;
+
+next_cpu:
+               /*
+                * We exclude the current @vector_cpumask from the requested
+                * @mask and try again with the next online cpu in the
+                * result. We cannot modify @mask, so we use @vector_cpumask
+                * as a temporary buffer here as it will be reassigned when
+                * calling apic->vector_allocation_domain() above.
+                */
+               cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
+               cpumask_andnot(vector_cpumask, mask, searched_cpumask);
+               cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
+               continue;
        }
        return -ENOSPC;
 
+update:
+       /*
+        * Exclude offline cpus from the cleanup mask and set the
+        * move_in_progress flag when the result is not empty.
+        */
+       cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
+       d->move_in_progress = !cpumask_empty(d->old_domain);
+       d->cfg.vector = vector;
+       cpumask_copy(d->domain, vector_cpumask);
 success:
-       /* cache destination APIC IDs into cfg->dest_apicid */
-       return apic->cpu_mask_to_apicid_and(mask, d->domain, &d->cfg.dest_apicid);
+       /*
+        * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
+        * as we already established, that mask & d->domain & cpu_online_mask
+        * is not empty.
+        */
+       BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
+                                           &d->cfg.dest_apicid));
+       return 0;
 }
 
 static int assign_irq_vector(int irq, struct apic_chip_data *data,
@@ -230,7 +262,12 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
        data->cfg.vector = 0;
        cpumask_clear(data->domain);
 
-       if (likely(!data->move_in_progress))
+       /*
+        * If move is in progress or the old_domain mask is not empty,
+        * i.e. the cleanup IPI has not been processed yet, we need to remove
+        * the old references to desc from all cpus vector tables.
+        */
+       if (!data->move_in_progress && cpumask_empty(data->old_domain))
                return;
 
        desc = irq_to_desc(irq);
@@ -399,6 +436,7 @@ int __init arch_early_irq_init(void)
        arch_init_htirq_domain(x86_vector_domain);
 
        BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+       BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
 
        return arch_early_ioapic_init();
@@ -488,14 +526,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
                return -EINVAL;
 
        err = assign_irq_vector(irq, data, dest);
-       if (err) {
-               if (assign_irq_vector(irq, data,
-                                     irq_data_get_affinity_mask(irq_data)))
-                       pr_err("Failed to recover vector for irq %d\n", irq);
-               return err;
-       }
-
-       return IRQ_SET_MASK_OK;
+       return err ? err : IRQ_SET_MASK_OK;
 }
 
 static struct irq_chip lapic_controller = {
@@ -507,20 +538,12 @@ static struct irq_chip lapic_controller = {
 #ifdef CONFIG_SMP
 static void __send_cleanup_vector(struct apic_chip_data *data)
 {
-       cpumask_var_t cleanup_mask;
-
-       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-               unsigned int i;
-
-               for_each_cpu_and(i, data->old_domain, cpu_online_mask)
-                       apic->send_IPI_mask(cpumask_of(i),
-                                           IRQ_MOVE_CLEANUP_VECTOR);
-       } else {
-               cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
-               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               free_cpumask_var(cleanup_mask);
-       }
+       raw_spin_lock(&vector_lock);
+       cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
        data->move_in_progress = 0;
+       if (!cpumask_empty(data->old_domain))
+               apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+       raw_spin_unlock(&vector_lock);
 }
 
 void send_cleanup_vector(struct irq_cfg *cfg)
@@ -564,12 +587,25 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
                        goto unlock;
 
                /*
-                * Check if the irq migration is in progress. If so, we
-                * haven't received the cleanup request yet for this irq.
+                * Nothing to cleanup if irq migration is in progress
+                * or this cpu is not set in the cleanup mask.
                 */
-               if (data->move_in_progress)
+               if (data->move_in_progress ||
+                   !cpumask_test_cpu(me, data->old_domain))
                        goto unlock;
 
+               /*
+                * We have two cases to handle here:
+                * 1) vector is unchanged but the target mask got reduced
+                * 2) vector and the target mask has changed
+                *
+                * #1 is obvious, but in #2 we have two vectors with the same
+                * irq descriptor: the old and the new vector. So we need to
+                * make sure that we only cleanup the old vector. The new
+                * vector has the current @vector number in the config and
+                * this cpu is part of the target mask. We better leave that
+                * one alone.
+                */
                if (vector == data->cfg.vector &&
                    cpumask_test_cpu(me, data->domain))
                        goto unlock;
@@ -587,6 +623,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
                        goto unlock;
                }
                __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+               cpumask_clear_cpu(me, data->old_domain);
 unlock:
                raw_spin_unlock(&desc->lock);
        }
@@ -615,12 +652,48 @@ void irq_complete_move(struct irq_cfg *cfg)
        __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
 }
 
-void irq_force_complete_move(int irq)
+/*
+ * Called with @desc->lock held and interrupts disabled.
+ */
+void irq_force_complete_move(struct irq_desc *desc)
 {
-       struct irq_cfg *cfg = irq_cfg(irq);
+       struct irq_data *irqdata = irq_desc_get_irq_data(desc);
+       struct apic_chip_data *data = apic_chip_data(irqdata);
+       struct irq_cfg *cfg = data ? &data->cfg : NULL;
+
+       if (!cfg)
+               return;
 
-       if (cfg)
-               __irq_complete_move(cfg, cfg->vector);
+       __irq_complete_move(cfg, cfg->vector);
+
+       /*
+        * This is tricky. If the cleanup of @data->old_domain has not been
+        * done yet, then the following setaffinity call will fail with
+        * -EBUSY. This can leave the interrupt in a stale state.
+        *
+        * The cleanup cannot make progress because we hold @desc->lock. So in
+        * case @data->old_domain is not yet cleaned up, we need to drop the
+        * lock and acquire it again. @desc cannot go away, because the
+        * hotplug code holds the sparse irq lock.
+        */
+       raw_spin_lock(&vector_lock);
+       /* Clean out all offline cpus (including ourself) first. */
+       cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
+       while (!cpumask_empty(data->old_domain)) {
+               raw_spin_unlock(&vector_lock);
+               raw_spin_unlock(&desc->lock);
+               cpu_relax();
+               raw_spin_lock(&desc->lock);
+               /*
+                * Reevaluate apic_chip_data. It might have been cleared after
+                * we dropped @desc->lock.
+                */
+               data = apic_chip_data(irqdata);
+               if (!data)
+                       return;
+               raw_spin_lock(&vector_lock);
+       }
+       raw_spin_unlock(&vector_lock);
 }
 #endif