x86/vector: Simplify vector move cleanup The vector move cleanup needs to walk the vector space and do a lot of sanity checks to find a vector to cleanup. With single CPU affinities this can be simplified and made more robust by queueing the vector configuration which needs to be cleaned up in a hlist on the CPU which was the previous target. That removes all the race conditions because the cleanup either finds a valid list entry or not. The latter happens when the interrupt was torn down before the cleanup handler was able to run. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Juergen Gross <jgross@suse.com> Tested-by: Yu Chen <yu.c.chen@intel.com> Acked-by: Juergen Gross <jgross@suse.com> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: Tony Luck <tony.luck@intel.com> Cc: Marc Zyngier <marc.zyngier@arm.com> Cc: Alok Kataria <akataria@vmware.com> Cc: Joerg Roedel <joro@8bytes.org> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Rui Zhang <rui.zhang@intel.com> Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Len Brown <lenb@kernel.org> Link: https://lkml.kernel.org/r/20170913213154.622727892@linutronix.de

commit: dccfe3147b42b78458ab8e4440822c805ee76d72 [log] [tgz]
author: Thomas Gleixner <tglx@linutronix.de> Wed Sep 13 23:29:32 2017 +0200
committer: Thomas Gleixner <tglx@linutronix.de> Mon Sep 25 20:51:54 2017 +0200
tree: 8abff1331b010bfdbafbc078e2a22b7065efa2f3
parent: 029c6e1c9df776fe1b2ba756a28fb65e9f9e9f69 [diff] [blame]
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 7a9e0c6..68f88591 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c

@@ -25,6 +25,7 @@
 	struct irq_cfg		cfg;
 	unsigned int		cpu;
 	unsigned int		prev_cpu;
+	struct hlist_node	clist;
 	cpumask_var_t		domain;
 	cpumask_var_t		old_domain;
 	u8			move_in_progress : 1;
@@ -38,6 +39,9 @@
 #ifdef	CONFIG_X86_IO_APIC
 static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
 #endif
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
+#endif
 
 void lock_vector_lock(void)
 {
@@ -87,6 +91,7 @@
 		goto out_data;
 	if (!zalloc_cpumask_var_node(&apicd->old_domain, GFP_KERNEL, node))
 		goto out_domain;
+	INIT_HLIST_NODE(&apicd->clist);
 	return apicd;
 out_domain:
 	free_cpumask_var(apicd->domain);
@@ -127,8 +132,7 @@
 	 * If there is still a move in progress or the previous move has not
 	 * been cleaned up completely, tell the caller to come back later.
 	 */
-	if (d->move_in_progress ||
-	    cpumask_intersects(d->old_domain, cpu_online_mask))
+	if (d->cfg.old_vector)
 		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
@@ -263,38 +267,22 @@
 
 static void clear_irq_vector(int irq, struct apic_chip_data *apicd)
 {
-	struct irq_desc *desc;
-	int cpu, vector;
+	unsigned int vector = apicd->cfg.vector;
 
-	if (!apicd->cfg.vector)
+	if (!vector)
 		return;
 
-	vector = apicd->cfg.vector;
-	for_each_cpu_and(cpu, apicd->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-
+	per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED;
 	apicd->cfg.vector = 0;
-	cpumask_clear(apicd->domain);
 
-	/*
-	 * If move is in progress or the old_domain mask is not empty,
-	 * i.e. the cleanup IPI has not been processed yet, we need to remove
-	 * the old references to desc from all cpus vector tables.
-	 */
-	if (!apicd->move_in_progress && cpumask_empty(apicd->old_domain))
+	/* Clean up move in progress */
+	vector = apicd->cfg.old_vector;
+	if (!vector)
 		return;
 
-	desc = irq_to_desc(irq);
-	for_each_cpu_and(cpu, apicd->old_domain, cpu_online_mask) {
-		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
-		     vector++) {
-			if (per_cpu(vector_irq, cpu)[vector] != desc)
-				continue;
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-			break;
-		}
-	}
+	per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
 	apicd->move_in_progress = 0;
+	hlist_del_init(&apicd->clist);
 }
 
 void init_irq_alloc_info(struct irq_alloc_info *info,
@@ -474,7 +462,7 @@
 		struct irq_data *irqd = irq_desc_get_irq_data(desc);
 		struct apic_chip_data *ad = apic_chip_data(irqd);
 
-		if (ad && cpumask_test_cpu(cpu, ad->domain) && ad->cfg.vector)
+		if (ad && ad->cfg.vector && ad->cpu == smp_processor_id())
 			this_cpu_write(vector_irq[ad->cfg.vector], desc);
 	}
 }
@@ -524,11 +512,9 @@
 {
 	struct apic_chip_data *apicd = apic_chip_data(irqd);
 	unsigned long flags;
-	int cpu;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	cpu = cpumask_first_and(apicd->domain, cpu_online_mask);
-	apic->send_IPI_mask(cpumask_of(cpu), apicd->cfg.vector);
+	apic->send_IPI(apicd->cpu, apicd->cfg.vector);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
@@ -565,13 +551,56 @@
 };
 
 #ifdef CONFIG_SMP
+
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
+{
+	struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
+	struct apic_chip_data *apicd;
+	struct hlist_node *tmp;
+
+	entering_ack_irq();
+	/* Prevent vectors vanishing under us */
+	raw_spin_lock(&vector_lock);
+
+	hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
+		unsigned int irr, vector = apicd->cfg.old_vector;
+
+		/*
+		 * Paranoia: Check if the vector that needs to be cleaned
+		 * up is registered at the APICs IRR. If so, then this is
+		 * not the best time to clean it up. Clean it up in the
+		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+		 * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
+		 * priority external vector, so on return from this
+		 * interrupt the device interrupt will happen first.
+		 */
+		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+		if (irr & (1U << (vector % 32))) {
+			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+			continue;
+		}
+		hlist_del_init(&apicd->clist);
+		__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+		apicd->cfg.old_vector = 0;
+	}
+
+	raw_spin_unlock(&vector_lock);
+	exiting_irq();
+}
+
 static void __send_cleanup_vector(struct apic_chip_data *apicd)
 {
+	unsigned int cpu;
+
 	raw_spin_lock(&vector_lock);
-	cpumask_and(apicd->old_domain, apicd->old_domain, cpu_online_mask);
 	apicd->move_in_progress = 0;
-	if (!cpumask_empty(apicd->old_domain))
-		apic->send_IPI_mask(apicd->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+	cpu = apicd->prev_cpu;
+	if (cpu_online(cpu)) {
+		hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
+		apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
+	} else {
+		apicd->cfg.old_vector = 0;
+	}
 	raw_spin_unlock(&vector_lock);
 }
 
@@ -584,95 +613,15 @@
 		__send_cleanup_vector(apicd);
 }
 
-asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
-{
-	unsigned vector, me;
-
-	entering_ack_irq();
-
-	/* Prevent vectors vanishing under us */
-	raw_spin_lock(&vector_lock);
-
-	me = smp_processor_id();
-	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		struct apic_chip_data *apicd;
-		struct irq_desc *desc;
-		unsigned int irr;
-
-	retry:
-		desc = __this_cpu_read(vector_irq[vector]);
-		if (IS_ERR_OR_NULL(desc))
-			continue;
-
-		if (!raw_spin_trylock(&desc->lock)) {
-			raw_spin_unlock(&vector_lock);
-			cpu_relax();
-			raw_spin_lock(&vector_lock);
-			goto retry;
-		}
-
-		apicd = apic_chip_data(irq_desc_get_irq_data(desc));
-		if (!apicd)
-			goto unlock;
-
-		/*
-		 * Nothing to cleanup if irq migration is in progress
-		 * or this cpu is not set in the cleanup mask.
-		 */
-		if (apicd->move_in_progress ||
-		    !cpumask_test_cpu(me, apicd->old_domain))
-			goto unlock;
-
-		/*
-		 * We have two cases to handle here:
-		 * 1) vector is unchanged but the target mask got reduced
-		 * 2) vector and the target mask has changed
-		 *
-		 * #1 is obvious, but in #2 we have two vectors with the same
-		 * irq descriptor: the old and the new vector. So we need to
-		 * make sure that we only cleanup the old vector. The new
-		 * vector has the current @vector number in the config and
-		 * this cpu is part of the target mask. We better leave that
-		 * one alone.
-		 */
-		if (vector == apicd->cfg.vector &&
-		    cpumask_test_cpu(me, apicd->domain))
-			goto unlock;
-
-		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
-		/*
-		 * Check if the vector that needs to be cleanedup is
-		 * registered at the cpu's IRR. If so, then this is not
-		 * the best time to clean it up. Lets clean it up in the
-		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
-		 * to myself.
-		 */
-		if (irr  & (1 << (vector % 32))) {
-			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
-			goto unlock;
-		}
-		__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
-		cpumask_clear_cpu(me, apicd->old_domain);
-unlock:
-		raw_spin_unlock(&desc->lock);
-	}
-
-	raw_spin_unlock(&vector_lock);
-
-	exiting_irq();
-}
-
 static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
 {
-	unsigned me;
 	struct apic_chip_data *apicd;
 
 	apicd = container_of(cfg, struct apic_chip_data, cfg);
 	if (likely(!apicd->move_in_progress))
 		return;
 
-	me = smp_processor_id();
-	if (vector == apicd->cfg.vector && cpumask_test_cpu(me, apicd->domain))
+	if (vector == apicd->cfg.vector && apicd->cpu == smp_processor_id())
 		__send_cleanup_vector(apicd);
 }
 
@@ -686,10 +635,9 @@
  */
 void irq_force_complete_move(struct irq_desc *desc)
 {
-	struct irq_data *irqd;
 	struct apic_chip_data *apicd;
-	struct irq_cfg *cfg;
-	unsigned int cpu;
+	struct irq_data *irqd;
+	unsigned int vector;
 
 	/*
 	 * The function is called for all descriptors regardless of which
@@ -701,42 +649,30 @@
 	 * (apic_chip_data) before touching it any further.
 	 */
 	irqd = irq_domain_get_irq_data(x86_vector_domain,
-					  irq_desc_get_irq(desc));
+				       irq_desc_get_irq(desc));
 	if (!irqd)
 		return;
 
+	raw_spin_lock(&vector_lock);
 	apicd = apic_chip_data(irqd);
-	cfg = apicd ? &apicd->cfg : NULL;
-
-	if (!cfg)
-		return;
+	if (!apicd)
+		goto unlock;
 
 	/*
-	 * This is tricky. If the cleanup of @data->old_domain has not been
+	 * If old_vector is empty, no action required.
+	 */
+	vector = apicd->cfg.old_vector;
+	if (!vector)
+		goto unlock;
+
+	/*
+	 * This is tricky. If the cleanup of the old vector has not been
 	 * done yet, then the following setaffinity call will fail with
 	 * -EBUSY. This can leave the interrupt in a stale state.
 	 *
 	 * All CPUs are stuck in stop machine with interrupts disabled so
 	 * calling __irq_complete_move() would be completely pointless.
-	 */
-	raw_spin_lock(&vector_lock);
-	/*
-	 * Clean out all offline cpus (including the outgoing one) from the
-	 * old_domain mask.
-	 */
-	cpumask_and(apicd->old_domain, apicd->old_domain, cpu_online_mask);
-
-	/*
-	 * If move_in_progress is cleared and the old_domain mask is empty,
-	 * then there is nothing to cleanup. fixup_irqs() will take care of
-	 * the stale vectors on the outgoing cpu.
-	 */
-	if (!apicd->move_in_progress && cpumask_empty(apicd->old_domain)) {
-		raw_spin_unlock(&vector_lock);
-		return;
-	}
-
-	/*
+	 *
 	 * 1) The interrupt is in move_in_progress state. That means that we
 	 *    have not seen an interrupt since the io_apic was reprogrammed to
 	 *    the new vector.
@@ -778,18 +714,15 @@
 		 * area arises.
 		 */
 		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
-			irqd->irq, cfg->old_vector);
+			irqd->irq, vector);
 	}
-	/*
-	 * If old_domain is not empty, then other cpus still have the irq
-	 * descriptor set in their vector array. Clean it up.
-	 */
-	for_each_cpu(cpu, apicd->old_domain)
-		per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
-
+	per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
 	/* Cleanup the left overs of the (half finished) move */
 	cpumask_clear(apicd->old_domain);
+	apicd->cfg.old_vector = 0;
 	apicd->move_in_progress = 0;
+	hlist_del_init(&apicd->clist);
+unlock:
 	raw_spin_unlock(&vector_lock);
 }
 #endif
commit	dccfe3147b42b78458ab8e4440822c805ee76d72	[log] [tgz]
author	Thomas Gleixner <tglx@linutronix.de>	Wed Sep 13 23:29:32 2017 +0200
committer	Thomas Gleixner <tglx@linutronix.de>	Mon Sep 25 20:51:54 2017 +0200
tree	8abff1331b010bfdbafbc078e2a22b7065efa2f3
parent	029c6e1c9df776fe1b2ba756a28fb65e9f9e9f69 [diff] [blame]