From 17549b0f184d870f2cfa4e5cfa79f4c4905ed757 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 16 Dec 2022 16:04:41 +0100 Subject: [PATCH 01/42] genirq: Add might_sleep() to disable_irq() With the introduction of threaded interrupt handlers, it is virtually never safe to call disable_irq() from non-premptible context. Thus: Update the documentation, add an explicit might_sleep() to catch any offenders. This is more obvious and straight forward than the implicit might_sleep() check deeper down in the disable_irq() call chain. Fixes: 3aa551c9b4c4 ("genirq: add threaded interrupt handler support") Signed-off-by: Manfred Spraul Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221216150441.200533-3-manfred@colorfullife.com --- kernel/irq/manage.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5b7cf28df2905..8ce75495e04f1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -723,10 +723,13 @@ EXPORT_SYMBOL(disable_irq_nosync); * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * - * This function may be called - with care - from IRQ context. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. + * */ void disable_irq(unsigned int irq) { + might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } From 379af13b31fa8a36ad4abd59a5c511f25c5d4d42 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Mon, 12 Dec 2022 17:37:15 +0100 Subject: [PATCH 02/42] docs: locking: Discourage from calling disable_irq() in atomic Correct the example in the documentation so that disable_irq() is not being called in atomic context. disable_irq() calls sleeping synchronize_irq(), it's not allowed to call them in atomic context. Signed-off-by: Alexander Sverdlin Signed-off-by: Thomas Gleixner Reviewed-by: Manfred Spraul Cc: linux-doc@vger.kernel.org Link: https://lore.kernel.org/lkml/87k02wbs2n.ffs@tglx/ Link: https://lore.kernel.org/r/20221212163715.830315-1-alexander.sverdlin@siemens.com --- Documentation/kernel-hacking/locking.rst | 4 ++-- Documentation/translations/it_IT/kernel-hacking/locking.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst index c756786e17aea..dff0646a717bf 100644 --- a/Documentation/kernel-hacking/locking.rst +++ b/Documentation/kernel-hacking/locking.rst @@ -1277,11 +1277,11 @@ Manfred Spraul points out that you can still do this, even if the data is very occasionally accessed in user context or softirqs/tasklets. The irq handler doesn't use a lock, and all other accesses are done as so:: - spin_lock(&lock); + mutex_lock(&lock); disable_irq(irq); ... enable_irq(irq); - spin_unlock(&lock); + mutex_unlock(&lock); The disable_irq() prevents the irq handler from running (and waits for it to finish if it's currently running on other CPUs). diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst index b8ecf41273c57..05d362b16bf0b 100644 --- a/Documentation/translations/it_IT/kernel-hacking/locking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst @@ -1307,11 +1307,11 @@ se i dati vengono occasionalmente utilizzati da un contesto utente o da un'interruzione software. Il gestore d'interruzione non utilizza alcun *lock*, e tutti gli altri accessi verranno fatti così:: - spin_lock(&lock); + mutex_lock(&lock); disable_irq(irq); ... enable_irq(irq); - spin_unlock(&lock); + mutex_unlock(&lock); La funzione disable_irq() impedisce al gestore d'interruzioni d'essere eseguito (e aspetta che finisca nel caso fosse in esecuzione su From 0e2213fe0ab4c04da0e2354e84ec3b90e59939a4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 13 Dec 2022 15:08:43 +0100 Subject: [PATCH 03/42] irqchip: Use irq_domain_alloc_irqs() Use the irq_domain_alloc_irqs() wrapper instead of the full __irq_domain_alloc_irqs() interface, which was only intended for some legacy (x86) use cases. Signed-off-by: Johan Hovold Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20221213140844.15470-2-johan+linaro@kernel.org --- drivers/irqchip/irq-apple-aic.c | 4 +--- drivers/irqchip/irq-armada-370-xp.c | 3 +-- drivers/irqchip/irq-bcm2836.c | 5 +---- drivers/irqchip/irq-gic-v3.c | 4 +--- drivers/irqchip/irq-gic-v4.c | 9 +++------ drivers/irqchip/irq-gic.c | 4 +--- 6 files changed, 8 insertions(+), 21 deletions(-) diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index ae3437f03e6c2..cf513b6576547 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -924,9 +924,7 @@ static int __init aic_init_smp(struct aic_irq_chip *irqc, struct device_node *no ipi_domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; irq_domain_update_bus_token(ipi_domain, DOMAIN_BUS_IPI); - base_ipi = __irq_domain_alloc_irqs(ipi_domain, -1, AIC_NR_SWIPI, - NUMA_NO_NODE, NULL, false, NULL); - + base_ipi = irq_domain_alloc_irqs(ipi_domain, AIC_NR_SWIPI, NUMA_NO_NODE, NULL); if (WARN_ON(!base_ipi)) { irq_domain_remove(ipi_domain); return -ENODEV; diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index ee18eb3e72b72..a55528469278c 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -454,8 +454,7 @@ static __init void armada_xp_ipi_init(struct device_node *node) return; irq_domain_update_bus_token(ipi_domain, DOMAIN_BUS_IPI); - base_ipi = __irq_domain_alloc_irqs(ipi_domain, -1, IPI_DOORBELL_END, - NUMA_NO_NODE, NULL, false, NULL); + base_ipi = irq_domain_alloc_irqs(ipi_domain, IPI_DOORBELL_END, NUMA_NO_NODE, NULL); if (WARN_ON(!base_ipi)) return; diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 51491c3c6fdd2..e5f1059b989fe 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -268,10 +268,7 @@ static void __init bcm2836_arm_irqchip_smp_init(void) ipi_domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; irq_domain_update_bus_token(ipi_domain, DOMAIN_BUS_IPI); - base_ipi = __irq_domain_alloc_irqs(ipi_domain, -1, BITS_PER_MBOX, - NUMA_NO_NODE, NULL, - false, NULL); - + base_ipi = irq_domain_alloc_irqs(ipi_domain, BITS_PER_MBOX, NUMA_NO_NODE, NULL); if (WARN_ON(!base_ipi)) return; diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 997104d4338e7..bb57ab8bff6a3 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1310,9 +1310,7 @@ static void __init gic_smp_init(void) gic_starting_cpu, NULL); /* Register all 8 non-secure SGIs */ - base_sgi = __irq_domain_alloc_irqs(gic_data.domain, -1, 8, - NUMA_NO_NODE, &sgi_fwspec, - false, NULL); + base_sgi = irq_domain_alloc_irqs(gic_data.domain, 8, NUMA_NO_NODE, &sgi_fwspec); if (WARN_ON(base_sgi <= 0)) return; diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index a6277dea4c7a7..94d56a03b1757 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -139,9 +139,7 @@ static int its_alloc_vcpu_sgis(struct its_vpe *vpe, int idx) if (!vpe->sgi_domain) goto err; - sgi_base = __irq_domain_alloc_irqs(vpe->sgi_domain, -1, 16, - NUMA_NO_NODE, vpe, - false, NULL); + sgi_base = irq_domain_alloc_irqs(vpe->sgi_domain, 16, NUMA_NO_NODE, vpe); if (sgi_base <= 0) goto err; @@ -176,9 +174,8 @@ int its_alloc_vcpu_irqs(struct its_vm *vm) vm->vpes[i]->idai = true; } - vpe_base_irq = __irq_domain_alloc_irqs(vm->domain, -1, vm->nr_vpes, - NUMA_NO_NODE, vm, - false, NULL); + vpe_base_irq = irq_domain_alloc_irqs(vm->domain, vm->nr_vpes, + NUMA_NO_NODE, vm); if (vpe_base_irq <= 0) goto err; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 210bc2f4d5550..4fa4d8ac76d9b 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -868,9 +868,7 @@ static __init void gic_smp_init(void) "irqchip/arm/gic:starting", gic_starting_cpu, NULL); - base_sgi = __irq_domain_alloc_irqs(gic_data[0].domain, -1, 8, - NUMA_NO_NODE, &sgi_fwspec, - false, NULL); + base_sgi = irq_domain_alloc_irqs(gic_data[0].domain, 8, NUMA_NO_NODE, &sgi_fwspec); if (WARN_ON(base_sgi <= 0)) return; From cdf07f0ea48a3b52f924714d477366ac510ee870 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:00 +0800 Subject: [PATCH 04/42] genirq/affinity: Remove the 'firstvec' parameter from irq_build_affinity_masks The 'firstvec' parameter is always same with the parameter of 'startvec', so use 'startvec' directly inside irq_build_affinity_masks(). Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-2-ming.lei@redhat.com --- kernel/irq/affinity.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d9a5c1d65a79d..3361e36ebaa1e 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec, * 2) spread other possible CPUs on these vectors */ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, - unsigned int firstvec, struct irq_affinity_desc *masks) { unsigned int curvec = startvec, nr_present = 0, nr_others = 0; + unsigned int firstvec = startvec; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(curvec, this_vecs, - curvec, masks); + ret = irq_build_affinity_masks(curvec, this_vecs, masks); if (ret) { kfree(masks); return NULL; From 1f962d91a15af54301c63febb8ac2ba07aa3654f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:01 +0800 Subject: [PATCH 05/42] genirq/affinity: Pass affinity managed mask array to irq_build_affinity_masks Pass affinity managed mask array to irq_build_affinity_masks() so that the index of the first affinity managed vector is always zero. This allows to simplify the implementation a bit. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 3361e36ebaa1e..da6379cd27fd4 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs, static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, - unsigned int firstvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct irq_affinity_desc *masks) { unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; - unsigned int last_affv = firstvec + numvecs; + unsigned int last_affv = numvecs; unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; struct node_vectors *node_vectors; @@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); if (++curvec == last_affv) - curvec = firstvec; + curvec = 0; } return numvecs; } @@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, * may start anywhere */ if (curvec >= last_affv) - curvec = firstvec; + curvec = 0; irq_spread_init_one(&masks[curvec].mask, nmsk, cpus_per_vec); } @@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, +static int irq_build_affinity_masks(unsigned int numvecs, struct irq_affinity_desc *masks) { - unsigned int curvec = startvec, nr_present = 0, nr_others = 0; - unsigned int firstvec = startvec; + unsigned int curvec = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, - node_to_cpumask, cpu_present_mask, - nmsk, masks); + ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, + cpu_present_mask, nmsk, masks); if (ret < 0) goto fail_build_affinity; nr_present = ret; @@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, * out vectors. */ if (nr_present >= numvecs) - curvec = firstvec; + curvec = 0; else - curvec = firstvec + nr_present; + curvec = nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, - node_to_cpumask, npresmsk, nmsk, - masks); + ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, + npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; @@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(curvec, this_vecs, masks); + ret = irq_build_affinity_masks(this_vecs, &masks[curvec]); if (ret) { kfree(masks); return NULL; From e7bdd7f0cbd1c001bb9b4d3313edc5ee094bc3f8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:02 +0800 Subject: [PATCH 06/42] genirq/affinity: Don't pass irq_affinity_desc array to irq_build_affinity_masks Prepare for abstracting irq_build_affinity_masks() into a public function for assigning all CPUs evenly into several groups. Don't pass irq_affinity_desc array to irq_build_affinity_masks, instead return a cpumask array by storing each assigned group into one element of the array. This allows to provide a generic interface for grouping all CPUs evenly from a NUMA and CPU locality viewpoint, and the cost is one extra allocation in irq_build_affinity_masks(), which should be fine since it is done via GFP_KERNEL and irq_build_affinity_masks() is a slow path anyway. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-4-ming.lei@redhat.com --- kernel/irq/affinity.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index da6379cd27fd4..00bba1020ecb2 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, - struct irq_affinity_desc *masks) + struct cpumask *masks) { unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; unsigned int last_affv = numvecs; @@ -270,7 +270,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, for_each_node_mask(n, nodemsk) { /* Ensure that only CPUs which are in both masks are set */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); + cpumask_or(&masks[curvec], &masks[curvec], nmsk); if (++curvec == last_affv) curvec = 0; } @@ -321,7 +321,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, */ if (curvec >= last_affv) curvec = 0; - irq_spread_init_one(&masks[curvec].mask, nmsk, + irq_spread_init_one(&masks[curvec], nmsk, cpus_per_vec); } done += nv->nvectors; @@ -335,16 +335,16 @@ static int __irq_build_affinity_masks(unsigned int startvec, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(unsigned int numvecs, - struct irq_affinity_desc *masks) +static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) { unsigned int curvec = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; + struct cpumask *masks = NULL; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return ret; + return NULL; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) goto fail_nmsk; @@ -353,6 +353,10 @@ static int irq_build_affinity_masks(unsigned int numvecs, if (!node_to_cpumask) goto fail_npresmsk; + masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto fail_node_to_cpumask; + /* Stabilize the cpumasks */ cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); @@ -386,6 +390,7 @@ static int irq_build_affinity_masks(unsigned int numvecs, if (ret >= 0) WARN_ON(nr_present + nr_others < numvecs); + fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); fail_npresmsk: @@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs, fail_nmsk: free_cpumask_var(nmsk); - return ret < 0 ? ret : 0; + if (ret < 0) { + kfree(masks); + return NULL; + } + return masks; } static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) @@ -457,13 +466,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) */ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int this_vecs = affd->set_size[i]; - int ret; + int j; + struct cpumask *result = irq_build_affinity_masks(this_vecs); - ret = irq_build_affinity_masks(this_vecs, &masks[curvec]); - if (ret) { + if (!result) { kfree(masks); return NULL; } + + for (j = 0; j < this_vecs; j++) + cpumask_copy(&masks[curvec + j].mask, &result[j]); + kfree(result); + curvec += this_vecs; usedvecs += this_vecs; } From 523f1ea76aad9025f9bd5258d77f4406fa9dbe5d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:03 +0800 Subject: [PATCH 07/42] genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly Map irq vector into group, which allows to abstract the algorithm for a generic use case outside of the interrupt core. Rename irq_build_affinity_masks as group_cpus_evenly, so the API can be reused for blk-mq to make default queue mapping even though irq vectors aren't involved. No functional change, just rename vector as group. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-5-ming.lei@redhat.com --- kernel/irq/affinity.c | 242 +++++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 121 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 00bba1020ecb2..54083331f1bcb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -9,13 +9,13 @@ #include #include -static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - unsigned int cpus_per_vec) +static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + unsigned int cpus_per_grp) { const struct cpumask *siblmsk; int cpu, sibl; - for ( ; cpus_per_vec > 0; ) { + for ( ; cpus_per_grp > 0; ) { cpu = cpumask_first(nmsk); /* Should not happen, but I'm too lazy to think about it */ @@ -24,18 +24,18 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, cpumask_clear_cpu(cpu, nmsk); cpumask_set_cpu(cpu, irqmsk); - cpus_per_vec--; + cpus_per_grp--; /* If the cpu has siblings, use them first */ siblmsk = topology_sibling_cpumask(cpu); - for (sibl = -1; cpus_per_vec > 0; ) { + for (sibl = -1; cpus_per_grp > 0; ) { sibl = cpumask_next(sibl, siblmsk); if (sibl >= nr_cpu_ids) break; if (!cpumask_test_and_clear_cpu(sibl, nmsk)) continue; cpumask_set_cpu(sibl, irqmsk); - cpus_per_vec--; + cpus_per_grp--; } } } @@ -95,48 +95,48 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -struct node_vectors { +struct node_groups { unsigned id; union { - unsigned nvectors; + unsigned ngroups; unsigned ncpus; }; }; static int ncpus_cmp_func(const void *l, const void *r) { - const struct node_vectors *ln = l; - const struct node_vectors *rn = r; + const struct node_groups *ln = l; + const struct node_groups *rn = r; return ln->ncpus - rn->ncpus; } /* - * Allocate vector number for each node, so that for each node: + * Allocate group number for each node, so that for each node: * * 1) the allocated number is >= 1 * - * 2) the allocated numbver is <= active CPU number of this node + * 2) the allocated number is <= active CPU number of this node * - * The actual allocated total vectors may be less than @numvecs when - * active total CPU number is less than @numvecs. + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. * * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' * for each node. */ -static void alloc_nodes_vectors(unsigned int numvecs, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_vectors *node_vectors) +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) { unsigned n, remaining_ncpus = 0; for (n = 0; n < nr_node_ids; n++) { - node_vectors[n].id = n; - node_vectors[n].ncpus = UINT_MAX; + node_groups[n].id = n; + node_groups[n].ncpus = UINT_MAX; } for_each_node_mask(n, nodemsk) { @@ -148,61 +148,61 @@ static void alloc_nodes_vectors(unsigned int numvecs, if (!ncpus) continue; remaining_ncpus += ncpus; - node_vectors[n].ncpus = ncpus; + node_groups[n].ncpus = ncpus; } - numvecs = min_t(unsigned, remaining_ncpus, numvecs); + numgrps = min_t(unsigned, remaining_ncpus, numgrps); - sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), + sort(node_groups, nr_node_ids, sizeof(node_groups[0]), ncpus_cmp_func, NULL); /* - * Allocate vectors for each node according to the ratio of this - * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is + * Allocate groups for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is * bigger than number of active numa nodes. Always start the * allocation from the node with minimized nr_cpus. * * This way guarantees that each active node gets allocated at - * least one vector, and the theory is simple: over-allocation - * is only done when this node is assigned by one vector, so - * other nodes will be allocated >= 1 vector, since 'numvecs' is + * least one group, and the theory is simple: over-allocation + * is only done when this node is assigned by one group, so + * other nodes will be allocated >= 1 groups, since 'numgrps' is * bigger than number of numa nodes. * - * One perfect invariant is that number of allocated vectors for + * One perfect invariant is that number of allocated groups for * each node is <= CPU count of this node: * * 1) suppose there are two nodes: A and B * ncpu(X) is CPU count of node X - * vecs(X) is the vector count allocated to node X via this + * grps(X) is the group count allocated to node X via this * algorithm * * ncpu(A) <= ncpu(B) * ncpu(A) + ncpu(B) = N - * vecs(A) + vecs(B) = V + * grps(A) + grps(B) = G * - * vecs(A) = max(1, round_down(V * ncpu(A) / N)) - * vecs(B) = V - vecs(A) + * grps(A) = max(1, round_down(G * ncpu(A) / N)) + * grps(B) = G - grps(A) * - * both N and V are integer, and 2 <= V <= N, suppose - * V = N - delta, and 0 <= delta <= N - 2 + * both N and G are integer, and 2 <= G <= N, suppose + * G = N - delta, and 0 <= delta <= N - 2 * - * 2) obviously vecs(A) <= ncpu(A) because: + * 2) obviously grps(A) <= ncpu(A) because: * - * if vecs(A) is 1, then vecs(A) <= ncpu(A) given + * if grps(A) is 1, then grps(A) <= ncpu(A) given * ncpu(A) >= 1 * * otherwise, - * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N + * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N * - * 3) prove how vecs(B) <= ncpu(B): + * 3) prove how grps(B) <= ncpu(B): * - * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be - * over-allocated, so vecs(B) <= ncpu(B), + * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be + * over-allocated, so grps(B) <= ncpu(B), * * otherwise: * - * vecs(A) = - * round_down(V * ncpu(A) / N) = + * grps(A) = + * round_down(G * ncpu(A) / N) = * round_down((N - delta) * ncpu(A) / N) = * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= * round_down((N * ncpu(A) - delta * N) / N) = @@ -210,52 +210,50 @@ static void alloc_nodes_vectors(unsigned int numvecs, * * then: * - * vecs(A) - V >= ncpu(A) - delta - V + * grps(A) - G >= ncpu(A) - delta - G * => - * V - vecs(A) <= V + delta - ncpu(A) + * G - grps(A) <= G + delta - ncpu(A) * => - * vecs(B) <= N - ncpu(A) + * grps(B) <= N - ncpu(A) * => - * vecs(B) <= cpu(B) + * grps(B) <= cpu(B) * * For nodes >= 3, it can be thought as one node and another big * node given that is exactly what this algorithm is implemented, - * and we always re-calculate 'remaining_ncpus' & 'numvecs', and - * finally for each node X: vecs(X) <= ncpu(X). + * and we always re-calculate 'remaining_ncpus' & 'numgrps', and + * finally for each node X: grps(X) <= ncpu(X). * */ for (n = 0; n < nr_node_ids; n++) { - unsigned nvectors, ncpus; + unsigned ngroups, ncpus; - if (node_vectors[n].ncpus == UINT_MAX) + if (node_groups[n].ncpus == UINT_MAX) continue; - WARN_ON_ONCE(numvecs == 0); + WARN_ON_ONCE(numgrps == 0); - ncpus = node_vectors[n].ncpus; - nvectors = max_t(unsigned, 1, - numvecs * ncpus / remaining_ncpus); - WARN_ON_ONCE(nvectors > ncpus); + ncpus = node_groups[n].ncpus; + ngroups = max_t(unsigned, 1, + numgrps * ncpus / remaining_ncpus); + WARN_ON_ONCE(ngroups > ncpus); - node_vectors[n].nvectors = nvectors; + node_groups[n].ngroups = ngroups; remaining_ncpus -= ncpus; - numvecs -= nvectors; + numgrps -= ngroups; } } -static int __irq_build_affinity_masks(unsigned int startvec, - unsigned int numvecs, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, - struct cpumask *masks) +static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, struct cpumask *masks) { - unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; - unsigned int last_affv = numvecs; - unsigned int curvec = startvec; + unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; + unsigned int last_grp = numgrps; + unsigned int curgrp = startgrp; nodemask_t nodemsk = NODE_MASK_NONE; - struct node_vectors *node_vectors; + struct node_groups *node_groups; if (cpumask_empty(cpu_mask)) return 0; @@ -264,34 +262,33 @@ static int __irq_build_affinity_masks(unsigned int startvec, /* * If the number of nodes in the mask is greater than or equal the - * number of vectors we just spread the vectors across the nodes. + * number of groups we just spread the groups across the nodes. */ - if (numvecs <= nodes) { + if (numgrps <= nodes) { for_each_node_mask(n, nodemsk) { /* Ensure that only CPUs which are in both masks are set */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curvec], &masks[curvec], nmsk); - if (++curvec == last_affv) - curvec = 0; + cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); + if (++curgrp == last_grp) + curgrp = 0; } - return numvecs; + return numgrps; } - node_vectors = kcalloc(nr_node_ids, - sizeof(struct node_vectors), + node_groups = kcalloc(nr_node_ids, + sizeof(struct node_groups), GFP_KERNEL); - if (!node_vectors) + if (!node_groups) return -ENOMEM; - /* allocate vector number for each node */ - alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, - nodemsk, nmsk, node_vectors); - + /* allocate group number for each node */ + alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_groups); for (i = 0; i < nr_node_ids; i++) { unsigned int ncpus, v; - struct node_vectors *nv = &node_vectors[i]; + struct node_groups *nv = &node_groups[i]; - if (nv->nvectors == UINT_MAX) + if (nv->ngroups == UINT_MAX) continue; /* Get the cpus on this node which are in the mask */ @@ -300,44 +297,47 @@ static int __irq_build_affinity_masks(unsigned int startvec, if (!ncpus) continue; - WARN_ON_ONCE(nv->nvectors > ncpus); + WARN_ON_ONCE(nv->ngroups > ncpus); /* Account for rounding errors */ - extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); + extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); - /* Spread allocated vectors on CPUs of the current node */ - for (v = 0; v < nv->nvectors; v++, curvec++) { - cpus_per_vec = ncpus / nv->nvectors; + /* Spread allocated groups on CPUs of the current node */ + for (v = 0; v < nv->ngroups; v++, curgrp++) { + cpus_per_grp = ncpus / nv->ngroups; - /* Account for extra vectors to compensate rounding errors */ - if (extra_vecs) { - cpus_per_vec++; - --extra_vecs; + /* Account for extra groups to compensate rounding errors */ + if (extra_grps) { + cpus_per_grp++; + --extra_grps; } /* - * wrapping has to be considered given 'startvec' + * wrapping has to be considered given 'startgrp' * may start anywhere */ - if (curvec >= last_affv) - curvec = 0; - irq_spread_init_one(&masks[curvec], nmsk, - cpus_per_vec); + if (curgrp >= last_grp) + curgrp = 0; + grp_spread_init_one(&masks[curgrp], nmsk, + cpus_per_grp); } - done += nv->nvectors; + done += nv->ngroups; } - kfree(node_vectors); + kfree(node_groups); return done; } /* - * build affinity in two stages: - * 1) spread present CPU on these vectors - * 2) spread other possible CPUs on these vectors + * build affinity in two stages for each group, and try to put close CPUs + * in viewpoint of CPU and NUMA locality into same group, and we run + * two-stage grouping: + * + * 1) allocate present CPUs on these groups evenly first + * 2) allocate other possible CPUs on these groups evenly */ -static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) +static struct cpumask *group_cpus_evenly(unsigned int numgrps) { - unsigned int curvec = 0, nr_present = 0, nr_others = 0; + unsigned int curgrp = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -353,7 +353,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) if (!node_to_cpumask) goto fail_npresmsk; - masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL); + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); if (!masks) goto fail_node_to_cpumask; @@ -361,26 +361,26 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); - /* Spread on present CPUs starting from affd->pre_vectors */ - ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, - cpu_present_mask, nmsk, masks); + /* grouping present CPUs first */ + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + cpu_present_mask, nmsk, masks); if (ret < 0) goto fail_build_affinity; nr_present = ret; /* - * Spread on non present CPUs starting from the next vector to be - * handled. If the spreading of present CPUs already exhausted the - * vector space, assign the non present CPUs to the already spread - * out vectors. + * Allocate non present CPUs starting from the next group to be + * handled. If the grouping of present CPUs already exhausted the + * group space, assign the non present CPUs to the already + * allocated out groups. */ - if (nr_present >= numvecs) - curvec = 0; + if (nr_present >= numgrps) + curgrp = 0; else - curvec = nr_present; + curgrp = nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, - npresmsk, nmsk, masks); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; @@ -388,7 +388,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) cpus_read_unlock(); if (ret >= 0) - WARN_ON(nr_present + nr_others < numvecs); + WARN_ON(nr_present + nr_others < numgrps); fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); @@ -467,7 +467,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int this_vecs = affd->set_size[i]; int j; - struct cpumask *result = irq_build_affinity_masks(this_vecs); + struct cpumask *result = group_cpus_evenly(this_vecs); if (!result) { kfree(masks); From f7b3ea8cf72f3d6060fe08e461805181e7450a13 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:04 +0800 Subject: [PATCH 08/42] genirq/affinity: Move group_cpus_evenly() into lib/ group_cpus_evenly() has become a generic function which can be used for other subsystems than the interrupt subsystem, so move it into lib/. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com --- MAINTAINERS | 2 + include/linux/group_cpus.h | 14 ++ kernel/irq/affinity.c | 398 +--------------------------------- lib/Makefile | 2 + lib/group_cpus.c | 427 +++++++++++++++++++++++++++++++++++++ 5 files changed, 446 insertions(+), 397 deletions(-) create mode 100644 include/linux/group_cpus.h create mode 100644 lib/group_cpus.c diff --git a/MAINTAINERS b/MAINTAINERS index a36df9ed283d3..9a07bd4f097f4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10935,6 +10935,8 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core F: kernel/irq/ +F: include/linux/group_cpus.h +F: lib/group_cpus.c IRQCHIP DRIVERS M: Thomas Gleixner diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h new file mode 100644 index 0000000000000..e42807ec61f6e --- /dev/null +++ b/include/linux/group_cpus.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ + +#ifndef __LINUX_GROUP_CPUS_H +#define __LINUX_GROUP_CPUS_H +#include +#include + +struct cpumask *group_cpus_evenly(unsigned int numgrps); + +#endif diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 54083331f1bcb..44a4eba80315c 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,403 +7,7 @@ #include #include #include -#include - -static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - unsigned int cpus_per_grp) -{ - const struct cpumask *siblmsk; - int cpu, sibl; - - for ( ; cpus_per_grp > 0; ) { - cpu = cpumask_first(nmsk); - - /* Should not happen, but I'm too lazy to think about it */ - if (cpu >= nr_cpu_ids) - return; - - cpumask_clear_cpu(cpu, nmsk); - cpumask_set_cpu(cpu, irqmsk); - cpus_per_grp--; - - /* If the cpu has siblings, use them first */ - siblmsk = topology_sibling_cpumask(cpu); - for (sibl = -1; cpus_per_grp > 0; ) { - sibl = cpumask_next(sibl, siblmsk); - if (sibl >= nr_cpu_ids) - break; - if (!cpumask_test_and_clear_cpu(sibl, nmsk)) - continue; - cpumask_set_cpu(sibl, irqmsk); - cpus_per_grp--; - } - } -} - -static cpumask_var_t *alloc_node_to_cpumask(void) -{ - cpumask_var_t *masks; - int node; - - masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); - if (!masks) - return NULL; - - for (node = 0; node < nr_node_ids; node++) { - if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) - goto out_unwind; - } - - return masks; - -out_unwind: - while (--node >= 0) - free_cpumask_var(masks[node]); - kfree(masks); - return NULL; -} - -static void free_node_to_cpumask(cpumask_var_t *masks) -{ - int node; - - for (node = 0; node < nr_node_ids; node++) - free_cpumask_var(masks[node]); - kfree(masks); -} - -static void build_node_to_cpumask(cpumask_var_t *masks) -{ - int cpu; - - for_each_possible_cpu(cpu) - cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); -} - -static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, - const struct cpumask *mask, nodemask_t *nodemsk) -{ - int n, nodes = 0; - - /* Calculate the number of nodes in the supplied affinity mask */ - for_each_node(n) { - if (cpumask_intersects(mask, node_to_cpumask[n])) { - node_set(n, *nodemsk); - nodes++; - } - } - return nodes; -} - -struct node_groups { - unsigned id; - - union { - unsigned ngroups; - unsigned ncpus; - }; -}; - -static int ncpus_cmp_func(const void *l, const void *r) -{ - const struct node_groups *ln = l; - const struct node_groups *rn = r; - - return ln->ncpus - rn->ncpus; -} - -/* - * Allocate group number for each node, so that for each node: - * - * 1) the allocated number is >= 1 - * - * 2) the allocated number is <= active CPU number of this node - * - * The actual allocated total groups may be less than @numgrps when - * active total CPU number is less than @numgrps. - * - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' - * for each node. - */ -static void alloc_nodes_groups(unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_groups *node_groups) -{ - unsigned n, remaining_ncpus = 0; - - for (n = 0; n < nr_node_ids; n++) { - node_groups[n].id = n; - node_groups[n].ncpus = UINT_MAX; - } - - for_each_node_mask(n, nodemsk) { - unsigned ncpus; - - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - ncpus = cpumask_weight(nmsk); - - if (!ncpus) - continue; - remaining_ncpus += ncpus; - node_groups[n].ncpus = ncpus; - } - - numgrps = min_t(unsigned, remaining_ncpus, numgrps); - - sort(node_groups, nr_node_ids, sizeof(node_groups[0]), - ncpus_cmp_func, NULL); - - /* - * Allocate groups for each node according to the ratio of this - * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is - * bigger than number of active numa nodes. Always start the - * allocation from the node with minimized nr_cpus. - * - * This way guarantees that each active node gets allocated at - * least one group, and the theory is simple: over-allocation - * is only done when this node is assigned by one group, so - * other nodes will be allocated >= 1 groups, since 'numgrps' is - * bigger than number of numa nodes. - * - * One perfect invariant is that number of allocated groups for - * each node is <= CPU count of this node: - * - * 1) suppose there are two nodes: A and B - * ncpu(X) is CPU count of node X - * grps(X) is the group count allocated to node X via this - * algorithm - * - * ncpu(A) <= ncpu(B) - * ncpu(A) + ncpu(B) = N - * grps(A) + grps(B) = G - * - * grps(A) = max(1, round_down(G * ncpu(A) / N)) - * grps(B) = G - grps(A) - * - * both N and G are integer, and 2 <= G <= N, suppose - * G = N - delta, and 0 <= delta <= N - 2 - * - * 2) obviously grps(A) <= ncpu(A) because: - * - * if grps(A) is 1, then grps(A) <= ncpu(A) given - * ncpu(A) >= 1 - * - * otherwise, - * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N - * - * 3) prove how grps(B) <= ncpu(B): - * - * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be - * over-allocated, so grps(B) <= ncpu(B), - * - * otherwise: - * - * grps(A) = - * round_down(G * ncpu(A) / N) = - * round_down((N - delta) * ncpu(A) / N) = - * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= - * round_down((N * ncpu(A) - delta * N) / N) = - * cpu(A) - delta - * - * then: - * - * grps(A) - G >= ncpu(A) - delta - G - * => - * G - grps(A) <= G + delta - ncpu(A) - * => - * grps(B) <= N - ncpu(A) - * => - * grps(B) <= cpu(B) - * - * For nodes >= 3, it can be thought as one node and another big - * node given that is exactly what this algorithm is implemented, - * and we always re-calculate 'remaining_ncpus' & 'numgrps', and - * finally for each node X: grps(X) <= ncpu(X). - * - */ - for (n = 0; n < nr_node_ids; n++) { - unsigned ngroups, ncpus; - - if (node_groups[n].ncpus == UINT_MAX) - continue; - - WARN_ON_ONCE(numgrps == 0); - - ncpus = node_groups[n].ncpus; - ngroups = max_t(unsigned, 1, - numgrps * ncpus / remaining_ncpus); - WARN_ON_ONCE(ngroups > ncpus); - - node_groups[n].ngroups = ngroups; - - remaining_ncpus -= ncpus; - numgrps -= ngroups; - } -} - -static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, struct cpumask *masks) -{ - unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; - unsigned int last_grp = numgrps; - unsigned int curgrp = startgrp; - nodemask_t nodemsk = NODE_MASK_NONE; - struct node_groups *node_groups; - - if (cpumask_empty(cpu_mask)) - return 0; - - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); - - /* - * If the number of nodes in the mask is greater than or equal the - * number of groups we just spread the groups across the nodes. - */ - if (numgrps <= nodes) { - for_each_node_mask(n, nodemsk) { - /* Ensure that only CPUs which are in both masks are set */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); - if (++curgrp == last_grp) - curgrp = 0; - } - return numgrps; - } - - node_groups = kcalloc(nr_node_ids, - sizeof(struct node_groups), - GFP_KERNEL); - if (!node_groups) - return -ENOMEM; - - /* allocate group number for each node */ - alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, - nodemsk, nmsk, node_groups); - for (i = 0; i < nr_node_ids; i++) { - unsigned int ncpus, v; - struct node_groups *nv = &node_groups[i]; - - if (nv->ngroups == UINT_MAX) - continue; - - /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); - ncpus = cpumask_weight(nmsk); - if (!ncpus) - continue; - - WARN_ON_ONCE(nv->ngroups > ncpus); - - /* Account for rounding errors */ - extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); - - /* Spread allocated groups on CPUs of the current node */ - for (v = 0; v < nv->ngroups; v++, curgrp++) { - cpus_per_grp = ncpus / nv->ngroups; - - /* Account for extra groups to compensate rounding errors */ - if (extra_grps) { - cpus_per_grp++; - --extra_grps; - } - - /* - * wrapping has to be considered given 'startgrp' - * may start anywhere - */ - if (curgrp >= last_grp) - curgrp = 0; - grp_spread_init_one(&masks[curgrp], nmsk, - cpus_per_grp); - } - done += nv->ngroups; - } - kfree(node_groups); - return done; -} - -/* - * build affinity in two stages for each group, and try to put close CPUs - * in viewpoint of CPU and NUMA locality into same group, and we run - * two-stage grouping: - * - * 1) allocate present CPUs on these groups evenly first - * 2) allocate other possible CPUs on these groups evenly - */ -static struct cpumask *group_cpus_evenly(unsigned int numgrps) -{ - unsigned int curgrp = 0, nr_present = 0, nr_others = 0; - cpumask_var_t *node_to_cpumask; - cpumask_var_t nmsk, npresmsk; - int ret = -ENOMEM; - struct cpumask *masks = NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail_nmsk; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto fail_npresmsk; - - masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto fail_node_to_cpumask; - - /* Stabilize the cpumasks */ - cpus_read_lock(); - build_node_to_cpumask(node_to_cpumask); - - /* grouping present CPUs first */ - ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - cpu_present_mask, nmsk, masks); - if (ret < 0) - goto fail_build_affinity; - nr_present = ret; - - /* - * Allocate non present CPUs starting from the next group to be - * handled. If the grouping of present CPUs already exhausted the - * group space, assign the non present CPUs to the already - * allocated out groups. - */ - if (nr_present >= numgrps) - curgrp = 0; - else - curgrp = nr_present; - cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - npresmsk, nmsk, masks); - if (ret >= 0) - nr_others = ret; - - fail_build_affinity: - cpus_read_unlock(); - - if (ret >= 0) - WARN_ON(nr_present + nr_others < numgrps); - - fail_node_to_cpumask: - free_node_to_cpumask(node_to_cpumask); - - fail_npresmsk: - free_cpumask_var(npresmsk); - - fail_nmsk: - free_cpumask_var(nmsk); - if (ret < 0) { - kfree(masks); - return NULL; - } - return masks; -} +#include static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) { diff --git a/lib/Makefile b/lib/Makefile index 4d9461bfea42c..a4665a802e871 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -353,6 +353,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o +obj-y += group_cpus.o + # GCC library routines obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o diff --git a/lib/group_cpus.c b/lib/group_cpus.c new file mode 100644 index 0000000000000..99f08c6cb9d97 --- /dev/null +++ b/lib/group_cpus.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ +#include +#include +#include +#include +#include + +static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + unsigned int cpus_per_grp) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_grp > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_grp--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_grp > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_grp--; + } + } +} + +static cpumask_var_t *alloc_node_to_cpumask(void) +{ + cpumask_var_t *masks; + int node; + + masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); + if (!masks) + return NULL; + + for (node = 0; node < nr_node_ids; node++) { + if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) + goto out_unwind; + } + + return masks; + +out_unwind: + while (--node >= 0) + free_cpumask_var(masks[node]); + kfree(masks); + return NULL; +} + +static void free_node_to_cpumask(cpumask_var_t *masks) +{ + int node; + + for (node = 0; node < nr_node_ids; node++) + free_cpumask_var(masks[node]); + kfree(masks); +} + +static void build_node_to_cpumask(cpumask_var_t *masks) +{ + int cpu; + + for_each_possible_cpu(cpu) + cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +} + +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, + const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes = 0; + + /* Calculate the number of nodes in the supplied affinity mask */ + for_each_node(n) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +struct node_groups { + unsigned id; + + union { + unsigned ngroups; + unsigned ncpus; + }; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ + const struct node_groups *ln = l; + const struct node_groups *rn = r; + + return ln->ncpus - rn->ncpus; +} + +/* + * Allocate group number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated number is <= active CPU number of this node + * + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) +{ + unsigned n, remaining_ncpus = 0; + + for (n = 0; n < nr_node_ids; n++) { + node_groups[n].id = n; + node_groups[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus = cpumask_weight(nmsk); + + if (!ncpus) + continue; + remaining_ncpus += ncpus; + node_groups[n].ncpus = ncpus; + } + + numgrps = min_t(unsigned, remaining_ncpus, numgrps); + + sort(node_groups, nr_node_ids, sizeof(node_groups[0]), + ncpus_cmp_func, NULL); + + /* + * Allocate groups for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. + * + * This way guarantees that each active node gets allocated at + * least one group, and the theory is simple: over-allocation + * is only done when this node is assigned by one group, so + * other nodes will be allocated >= 1 groups, since 'numgrps' is + * bigger than number of numa nodes. + * + * One perfect invariant is that number of allocated groups for + * each node is <= CPU count of this node: + * + * 1) suppose there are two nodes: A and B + * ncpu(X) is CPU count of node X + * grps(X) is the group count allocated to node X via this + * algorithm + * + * ncpu(A) <= ncpu(B) + * ncpu(A) + ncpu(B) = N + * grps(A) + grps(B) = G + * + * grps(A) = max(1, round_down(G * ncpu(A) / N)) + * grps(B) = G - grps(A) + * + * both N and G are integer, and 2 <= G <= N, suppose + * G = N - delta, and 0 <= delta <= N - 2 + * + * 2) obviously grps(A) <= ncpu(A) because: + * + * if grps(A) is 1, then grps(A) <= ncpu(A) given + * ncpu(A) >= 1 + * + * otherwise, + * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N + * + * 3) prove how grps(B) <= ncpu(B): + * + * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be + * over-allocated, so grps(B) <= ncpu(B), + * + * otherwise: + * + * grps(A) = + * round_down(G * ncpu(A) / N) = + * round_down((N - delta) * ncpu(A) / N) = + * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= + * round_down((N * ncpu(A) - delta * N) / N) = + * cpu(A) - delta + * + * then: + * + * grps(A) - G >= ncpu(A) - delta - G + * => + * G - grps(A) <= G + delta - ncpu(A) + * => + * grps(B) <= N - ncpu(A) + * => + * grps(B) <= cpu(B) + * + * For nodes >= 3, it can be thought as one node and another big + * node given that is exactly what this algorithm is implemented, + * and we always re-calculate 'remaining_ncpus' & 'numgrps', and + * finally for each node X: grps(X) <= ncpu(X). + * + */ + for (n = 0; n < nr_node_ids; n++) { + unsigned ngroups, ncpus; + + if (node_groups[n].ncpus == UINT_MAX) + continue; + + WARN_ON_ONCE(numgrps == 0); + + ncpus = node_groups[n].ncpus; + ngroups = max_t(unsigned, 1, + numgrps * ncpus / remaining_ncpus); + WARN_ON_ONCE(ngroups > ncpus); + + node_groups[n].ngroups = ngroups; + + remaining_ncpus -= ncpus; + numgrps -= ngroups; + } +} + +static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, struct cpumask *masks) +{ + unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; + unsigned int last_grp = numgrps; + unsigned int curgrp = startgrp; + nodemask_t nodemsk = NODE_MASK_NONE; + struct node_groups *node_groups; + + if (cpumask_empty(cpu_mask)) + return 0; + + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); + + /* + * If the number of nodes in the mask is greater than or equal the + * number of groups we just spread the groups across the nodes. + */ + if (numgrps <= nodes) { + for_each_node_mask(n, nodemsk) { + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); + if (++curgrp == last_grp) + curgrp = 0; + } + return numgrps; + } + + node_groups = kcalloc(nr_node_ids, + sizeof(struct node_groups), + GFP_KERNEL); + if (!node_groups) + return -ENOMEM; + + /* allocate group number for each node */ + alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_groups); + for (i = 0; i < nr_node_ids; i++) { + unsigned int ncpus, v; + struct node_groups *nv = &node_groups[i]; + + if (nv->ngroups == UINT_MAX) + continue; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); + ncpus = cpumask_weight(nmsk); + if (!ncpus) + continue; + + WARN_ON_ONCE(nv->ngroups > ncpus); + + /* Account for rounding errors */ + extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); + + /* Spread allocated groups on CPUs of the current node */ + for (v = 0; v < nv->ngroups; v++, curgrp++) { + cpus_per_grp = ncpus / nv->ngroups; + + /* Account for extra groups to compensate rounding errors */ + if (extra_grps) { + cpus_per_grp++; + --extra_grps; + } + + /* + * wrapping has to be considered given 'startgrp' + * may start anywhere + */ + if (curgrp >= last_grp) + curgrp = 0; + grp_spread_init_one(&masks[curgrp], nmsk, + cpus_per_grp); + } + done += nv->ngroups; + } + kfree(node_groups); + return done; +} + +#ifdef CONFIG_SMP +/** + * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality + * @numgrps: number of groups + * + * Return: cpumask array if successful, NULL otherwise. And each element + * includes CPUs assigned to this group + * + * Try to put close CPUs from viewpoint of CPU and NUMA locality into + * same group, and run two-stage grouping: + * 1) allocate present CPUs on these groups evenly first + * 2) allocate other possible CPUs on these groups evenly + * + * We guarantee in the resulted grouping that all CPUs are covered, and + * no same CPU is assigned to multiple groups + */ +struct cpumask *group_cpus_evenly(unsigned int numgrps) +{ + unsigned int curgrp = 0, nr_present = 0, nr_others = 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; + struct cpumask *masks = NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail_nmsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; + + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto fail_node_to_cpumask; + + /* Stabilize the cpumasks */ + cpus_read_lock(); + build_node_to_cpumask(node_to_cpumask); + + /* grouping present CPUs first */ + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + cpu_present_mask, nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; + + /* + * Allocate non present CPUs starting from the next group to be + * handled. If the grouping of present CPUs already exhausted the + * group space, assign the non present CPUs to the already + * allocated out groups. + */ + if (nr_present >= numgrps) + curgrp = 0; + else + curgrp = nr_present; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); + if (ret >= 0) + nr_others = ret; + + fail_build_affinity: + cpus_read_unlock(); + + if (ret >= 0) + WARN_ON(nr_present + nr_others < numgrps); + + fail_node_to_cpumask: + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: + free_cpumask_var(npresmsk); + + fail_nmsk: + free_cpumask_var(nmsk); + if (ret < 0) { + kfree(masks); + return NULL; + } + return masks; +} +#else +struct cpumask *group_cpus_evenly(unsigned int numgrps) +{ + struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + + if (!masks) + return NULL; + + /* assign all CPUs(cpu 0) to the 1st group only */ + cpumask_copy(&masks[0], cpu_possible_mask); + return masks; +} +#endif From 6a6dcae8f486c3f3298d0767d34505121c7b0b81 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:05 +0800 Subject: [PATCH 09/42] blk-mq: Build default queue map via group_cpus_evenly() The default queue mapping builder of blk_mq_map_queues doesn't take NUMA topo into account, so the built mapping is pretty bad, since CPUs belonging to different NUMA node are assigned to same queue. It is observed that IOPS drops by ~30% when running two jobs on same hctx of null_blk from two CPUs belonging to two NUMA nodes compared with from same NUMA node. Address the issue by reusing group_cpus_evenly() for building queue mapping since group_cpus_evenly() does group cpus according to CPU/NUMA locality. Also performance data becomes more stable with this given correct queue mapping is applied wrt. numa locality viewpoint, for example, on one two nodes arm64 machine with 160 cpus, node 0(cpu 0~79), node 1(cpu 80~159): 1) modprobe null_blk nr_devices=1 submit_queues=2 2) run 'fio(t/io_uring -p 0 -n 4 -r 20 /dev/nullb0)', and observe that IOPS becomes much stable on multiple tests: - unpatched: IOPS is 2.5M ~ 4.5M - patched: IOPS is 4.3M ~ 5.0M Lots of drivers may benefit from the change, such as nvme pci poll, nvme tcp, ... Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-7-ming.lei@redhat.com --- block/blk-mq-cpumap.c | 63 +++++++++---------------------------------- 1 file changed, 13 insertions(+), 50 deletions(-) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9c2fce1a7b50e..0c612c19feb8b 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -10,66 +10,29 @@ #include #include #include +#include #include #include "blk.h" #include "blk-mq.h" -static int queue_index(struct blk_mq_queue_map *qmap, - unsigned int nr_queues, const int q) -{ - return qmap->queue_offset + (q % nr_queues); -} - -static int get_first_sibling(unsigned int cpu) -{ - unsigned int ret; - - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - - return cpu; -} - void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { - unsigned int *map = qmap->mq_map; - unsigned int nr_queues = qmap->nr_queues; - unsigned int cpu, first_sibling, q = 0; - - for_each_possible_cpu(cpu) - map[cpu] = -1; - - /* - * Spread queues among present CPUs first for minimizing - * count of dead queues which are mapped by all un-present CPUs - */ - for_each_present_cpu(cpu) { - if (q >= nr_queues) - break; - map[cpu] = queue_index(qmap, nr_queues, q++); + const struct cpumask *masks; + unsigned int queue, cpu; + + masks = group_cpus_evenly(qmap->nr_queues); + if (!masks) { + for_each_possible_cpu(cpu) + qmap->mq_map[cpu] = qmap->queue_offset; + return; } - for_each_possible_cpu(cpu) { - if (map[cpu] != -1) - continue; - /* - * First do sequential mapping between CPUs and queues. - * In case we still have CPUs to map, and we have some number of - * threads per cores then map sibling threads to the same queue - * for performance optimizations. - */ - if (q < nr_queues) { - map[cpu] = queue_index(qmap, nr_queues, q++); - } else { - first_sibling = get_first_sibling(cpu); - if (first_sibling == cpu) - map[cpu] = queue_index(qmap, nr_queues, q++); - else - map[cpu] = map[first_sibling]; - } + for (queue = 0; queue < qmap->nr_queues; queue++) { + for_each_cpu(cpu, &masks[queue]) + qmap->mq_map[cpu] = qmap->queue_offset + queue; } + kfree(masks); } EXPORT_SYMBOL_GPL(blk_mq_map_queues); From 188a569658584e93930ab60334c5a1079c0330d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Jan 2023 12:14:01 +0100 Subject: [PATCH 10/42] genirq/affinity: Only build SMP-only helper functions on SMP kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit allnoconfig grew these new build warnings in lib/group_cpus.c: lib/group_cpus.c:247:12: warning: ‘__group_cpus_evenly’ defined but not used [-Wunused-function] lib/group_cpus.c:75:13: warning: ‘build_node_to_cpumask’ defined but not used [-Wunused-function] lib/group_cpus.c:66:13: warning: ‘free_node_to_cpumask’ defined but not used [-Wunused-function] lib/group_cpus.c:43:23: warning: ‘alloc_node_to_cpumask’ defined but not used [-Wunused-function] Widen the #ifdef CONFIG_SMP block to not expose unused helpers on non-SMP builds. Also annotate the preprocessor branches for better readability. Fixes: f7b3ea8cf72f ("genirq/affinity: Move group_cpus_evenly() into lib/") Cc: Ming Lei Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com Signed-off-by: Ingo Molnar --- lib/group_cpus.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/group_cpus.c b/lib/group_cpus.c index 99f08c6cb9d97..9c837a35fef7b 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -9,6 +9,8 @@ #include #include +#ifdef CONFIG_SMP + static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, unsigned int cpus_per_grp) { @@ -327,7 +329,6 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, return done; } -#ifdef CONFIG_SMP /** * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality * @numgrps: number of groups @@ -412,7 +413,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) } return masks; } -#else +#else /* CONFIG_SMP */ struct cpumask *group_cpus_evenly(unsigned int numgrps) { struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); @@ -424,4 +425,4 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) cpumask_copy(&masks[0], cpu_possible_mask); return masks; } -#endif +#endif /* CONFIG_SMP */ From e740604232dc5c3097808f3e91fd02d9316010c5 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 30 Jan 2023 16:54:30 +0800 Subject: [PATCH 11/42] irqchip/aspeed-scu-ic: Correctly initialise status and enable registers The status and enable registers are never initialised with sensible default values. Fix those. Signed-off-by: Ryan Chen [maz: commit message] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230130085430.635583-1-ryan_chen@aspeedtech.com --- drivers/irqchip/irq-aspeed-scu-ic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-aspeed-scu-ic.c b/drivers/irqchip/irq-aspeed-scu-ic.c index 279e92cf0b16b..94a7223e95df6 100644 --- a/drivers/irqchip/irq-aspeed-scu-ic.c +++ b/drivers/irqchip/irq-aspeed-scu-ic.c @@ -17,8 +17,9 @@ #define ASPEED_SCU_IC_REG 0x018 #define ASPEED_SCU_IC_SHIFT 0 -#define ASPEED_SCU_IC_ENABLE GENMASK(6, ASPEED_SCU_IC_SHIFT) +#define ASPEED_SCU_IC_ENABLE GENMASK(15, ASPEED_SCU_IC_SHIFT) #define ASPEED_SCU_IC_NUM_IRQS 7 +#define ASPEED_SCU_IC_STATUS GENMASK(28, 16) #define ASPEED_SCU_IC_STATUS_SHIFT 16 #define ASPEED_AST2600_SCU_IC0_REG 0x560 @@ -155,6 +156,8 @@ static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, rc = PTR_ERR(scu_ic->scu); goto err; } + regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_STATUS, ASPEED_SCU_IC_STATUS); + regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_ENABLE, 0); irq = irq_of_parse_and_map(node, 0); if (!irq) { From fc98adb9a8435cdb4e8349138ac0b728df80ade9 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 7 Dec 2022 22:06:43 +0800 Subject: [PATCH 12/42] irqchip/loongson-liointc: Save/restore int_edge/int_pol registers during S3/S4 If int_edge/int_pol registers are configured to not be the default values, we should save/restore them during S3/S4. Signed-off-by: Yingkun Meng Signed-off-by: Huacai Chen Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221207140643.1600743-1-chenhuacai@loongson.cn --- drivers/irqchip/irq-loongson-liointc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/irqchip/irq-loongson-liointc.c b/drivers/irqchip/irq-loongson-liointc.c index 85b754f7f4e6e..8d00a9ad5b005 100644 --- a/drivers/irqchip/irq-loongson-liointc.c +++ b/drivers/irqchip/irq-loongson-liointc.c @@ -55,6 +55,8 @@ struct liointc_priv { struct liointc_handler_data handler[LIOINTC_NUM_PARENT]; void __iomem *core_isr[LIOINTC_NUM_CORES]; u8 map_cache[LIOINTC_CHIP_IRQ]; + u32 int_pol; + u32 int_edge; bool has_lpc_irq_errata; }; @@ -138,6 +140,14 @@ static int liointc_set_type(struct irq_data *data, unsigned int type) return 0; } +static void liointc_suspend(struct irq_chip_generic *gc) +{ + struct liointc_priv *priv = gc->private; + + priv->int_pol = readl(gc->reg_base + LIOINTC_REG_INTC_POL); + priv->int_edge = readl(gc->reg_base + LIOINTC_REG_INTC_EDGE); +} + static void liointc_resume(struct irq_chip_generic *gc) { struct liointc_priv *priv = gc->private; @@ -150,6 +160,8 @@ static void liointc_resume(struct irq_chip_generic *gc) /* Restore map cache */ for (i = 0; i < LIOINTC_CHIP_IRQ; i++) writeb(priv->map_cache[i], gc->reg_base + i); + writel(priv->int_pol, gc->reg_base + LIOINTC_REG_INTC_POL); + writel(priv->int_edge, gc->reg_base + LIOINTC_REG_INTC_EDGE); /* Restore mask cache */ writel(gc->mask_cache, gc->reg_base + LIOINTC_REG_INTC_ENABLE); irq_gc_unlock_irqrestore(gc, flags); @@ -269,6 +281,7 @@ static int liointc_init(phys_addr_t addr, unsigned long size, int revision, gc->private = priv; gc->reg_base = base; gc->domain = domain; + gc->suspend = liointc_suspend; gc->resume = liointc_resume; ct = gc->chip_types; From 835a486cd9f55790dee9f6b67ce0057d49f15da5 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 3 Jan 2023 19:42:15 +0530 Subject: [PATCH 13/42] genirq: Add mechanism to multiplex a single HW IPI All RISC-V platforms have a single HW IPI provided by the INTC local interrupt controller. The HW method to trigger INTC IPI can be through external irqchip (e.g. RISC-V AIA), through platform specific device (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call). To support multiple IPIs on RISC-V, add a generic IPI multiplexing mechanism which help us create multiple virtual IPIs using a single HW IPI. This generic IPI multiplexing is inspired by the Apple AIC irqchip driver and it is shared by various RISC-V irqchip drivers. Signed-off-by: Anup Patel Reviewed-by: Hector Martin Tested-by: Hector Martin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230103141221.772261-4-apatel@ventanamicro.com --- include/linux/irq.h | 3 + kernel/irq/Kconfig | 5 ++ kernel/irq/Makefile | 1 + kernel/irq/ipi-mux.c | 207 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 216 insertions(+) create mode 100644 kernel/irq/ipi-mux.c diff --git a/include/linux/irq.h b/include/linux/irq.h index c3eb89606c2b1..b1b28affb32a7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1266,6 +1266,9 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); +void ipi_mux_process(void); +int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)); + #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER /* * Registers a generic IRQ handling function as the top-level IRQ handler in diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index b64c44ae4c250..2531f3496ab6d 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI depends on SMP select IRQ_DOMAIN_HIERARCHY +# Generic IRQ IPI Mux support +config GENERIC_IRQ_IPI_MUX + bool + depends on SMP + # Generic MSI hierarchical interrupt domain support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index b4f53717d1434..f19d3080bf11a 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c new file mode 100644 index 0000000000000..3a403c3a785d1 --- /dev/null +++ b/kernel/irq/ipi-mux.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Multiplex several virtual IPIs over a single HW IPI. + * + * Copyright The Asahi Linux Contributors + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "ipi-mux: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ipi_mux_cpu { + atomic_t enable; + atomic_t bits; +}; + +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu; +static struct irq_domain *ipi_mux_domain; +static void (*ipi_mux_send)(unsigned int cpu); + +static void ipi_mux_mask(struct irq_data *d) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + + atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable); +} + +static void ipi_mux_unmask(struct irq_data *d) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + u32 ibit = BIT(irqd_to_hwirq(d)); + + atomic_or(ibit, &icpu->enable); + + /* + * The atomic_or() above must complete before the atomic_read() + * below to avoid racing ipi_mux_send_mask(). + */ + smp_mb__after_atomic(); + + /* If a pending IPI was unmasked, raise a parent IPI immediately. */ + if (atomic_read(&icpu->bits) & ibit) + ipi_mux_send(smp_processor_id()); +} + +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + u32 ibit = BIT(irqd_to_hwirq(d)); + unsigned long pending; + int cpu; + + for_each_cpu(cpu, mask) { + icpu = per_cpu_ptr(ipi_mux_pcpu, cpu); + + /* + * This sequence is the mirror of the one in ipi_mux_unmask(); + * see the comment there. Additionally, release semantics + * ensure that the vIPI flag set is ordered after any shared + * memory accesses that precede it. This therefore also pairs + * with the atomic_fetch_andnot in ipi_mux_process(). + */ + pending = atomic_fetch_or_release(ibit, &icpu->bits); + + /* + * The atomic_fetch_or_release() above must complete + * before the atomic_read() below to avoid racing with + * ipi_mux_unmask(). + */ + smp_mb__after_atomic(); + + /* + * The flag writes must complete before the physical IPI is + * issued to another CPU. This is implied by the control + * dependency on the result of atomic_read() below, which is + * itself already ordered after the vIPI flag write. + */ + if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit)) + ipi_mux_send(cpu); + } +} + +static const struct irq_chip ipi_mux_chip = { + .name = "IPI Mux", + .irq_mask = ipi_mux_mask, + .irq_unmask = ipi_mux_unmask, + .ipi_send_mask = ipi_mux_send_mask, +}; + +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_set_percpu_devid(virq + i); + irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL, + handle_percpu_devid_irq, NULL, NULL); + } + + return 0; +} + +static const struct irq_domain_ops ipi_mux_domain_ops = { + .alloc = ipi_mux_domain_alloc, + .free = irq_domain_free_irqs_top, +}; + +/** + * ipi_mux_process - Process multiplexed virtual IPIs + */ +void ipi_mux_process(void) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + irq_hw_number_t hwirq; + unsigned long ipis; + unsigned int en; + + /* + * Reading enable mask does not need to be ordered as long as + * this function is called from interrupt handler because only + * the CPU itself can change it's own enable mask. + */ + en = atomic_read(&icpu->enable); + + /* + * Clear the IPIs we are about to handle. This pairs with the + * atomic_fetch_or_release() in ipi_mux_send_mask(). + */ + ipis = atomic_fetch_andnot(en, &icpu->bits) & en; + + for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int)) + generic_handle_domain_irq(ipi_mux_domain, hwirq); +} + +/** + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single + * parent IPI. + * @nr_ipi: number of virtual IPIs to create. This should + * be <= BITS_PER_TYPE(int) + * @mux_send: callback to trigger parent IPI for a particular CPU + * + * Returns first virq of the newly created virtual IPIs upon success + * or <=0 upon failure + */ +int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)) +{ + struct fwnode_handle *fwnode; + struct irq_domain *domain; + int rc; + + if (ipi_mux_domain) + return -EEXIST; + + if (BITS_PER_TYPE(int) < nr_ipi || !mux_send) + return -EINVAL; + + ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu)); + if (!ipi_mux_pcpu) + return -ENOMEM; + + fwnode = irq_domain_alloc_named_fwnode("IPI-Mux"); + if (!fwnode) { + pr_err("unable to create IPI Mux fwnode\n"); + rc = -ENOMEM; + goto fail_free_cpu; + } + + domain = irq_domain_create_linear(fwnode, nr_ipi, + &ipi_mux_domain_ops, NULL); + if (!domain) { + pr_err("unable to add IPI Mux domain\n"); + rc = -ENOMEM; + goto fail_free_fwnode; + } + + domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; + irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI); + + rc = __irq_domain_alloc_irqs(domain, -1, nr_ipi, + NUMA_NO_NODE, NULL, false, NULL); + if (rc <= 0) { + pr_err("unable to alloc IRQs from IPI Mux domain\n"); + goto fail_free_domain; + } + + ipi_mux_domain = domain; + ipi_mux_send = mux_send; + + return rc; + +fail_free_domain: + irq_domain_remove(domain); +fail_free_fwnode: + irq_domain_free_fwnode(fwnode); +fail_free_cpu: + free_percpu(ipi_mux_pcpu); + return rc; +} From c19f897194288ec286bb52001b9ee9551876a614 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jan 2023 19:42:21 +0530 Subject: [PATCH 14/42] irqchip/apple-aic: Move over to core ipi-mux Now that the complexity of the AIC IPI mux has been copied into the core code for the benefit of the riscv architecture, shrink the AIC driver by the same amount by using that infrastructure. Signed-off-by: Marc Zyngier Signed-off-by: Anup Patel Acked-by: Hector Martin Link: https://lore.kernel.org/r/20230103141221.772261-10-apatel@ventanamicro.com --- drivers/irqchip/Kconfig | 1 + drivers/irqchip/irq-apple-aic.c | 161 ++------------------------------ 2 files changed, 9 insertions(+), 153 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index caa952c40ff92..cbf504794e7c1 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -658,6 +658,7 @@ config APPLE_AIC bool "Apple Interrupt Controller (AIC)" depends on ARM64 depends on ARCH_APPLE || COMPILE_TEST + select GENERIC_IRQ_IPI_MUX help Support for the Apple Interrupt Controller found on Apple Silicon SoCs, such as the M1. diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index ae3437f03e6c2..eabb3b92965b2 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -292,7 +292,6 @@ struct aic_irq_chip { void __iomem *base; void __iomem *event; struct irq_domain *hw_domain; - struct irq_domain *ipi_domain; struct { cpumask_t aff; } *fiq_aff[AIC_NR_FIQ]; @@ -307,9 +306,6 @@ struct aic_irq_chip { static DEFINE_PER_CPU(uint32_t, aic_fiq_unmasked); -static DEFINE_PER_CPU(atomic_t, aic_vipi_flag); -static DEFINE_PER_CPU(atomic_t, aic_vipi_enable); - static struct aic_irq_chip *aic_irqc; static void aic_handle_ipi(struct pt_regs *regs); @@ -751,98 +747,8 @@ static void aic_ipi_send_fast(int cpu) isb(); } -static void aic_ipi_mask(struct irq_data *d) -{ - u32 irq_bit = BIT(irqd_to_hwirq(d)); - - /* No specific ordering requirements needed here. */ - atomic_andnot(irq_bit, this_cpu_ptr(&aic_vipi_enable)); -} - -static void aic_ipi_unmask(struct irq_data *d) -{ - struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d); - u32 irq_bit = BIT(irqd_to_hwirq(d)); - - atomic_or(irq_bit, this_cpu_ptr(&aic_vipi_enable)); - - /* - * The atomic_or() above must complete before the atomic_read() - * below to avoid racing aic_ipi_send_mask(). - */ - smp_mb__after_atomic(); - - /* - * If a pending vIPI was unmasked, raise a HW IPI to ourselves. - * No barriers needed here since this is a self-IPI. - */ - if (atomic_read(this_cpu_ptr(&aic_vipi_flag)) & irq_bit) { - if (static_branch_likely(&use_fast_ipi)) - aic_ipi_send_fast(smp_processor_id()); - else - aic_ic_write(ic, AIC_IPI_SEND, AIC_IPI_SEND_CPU(smp_processor_id())); - } -} - -static void aic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask) -{ - struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d); - u32 irq_bit = BIT(irqd_to_hwirq(d)); - u32 send = 0; - int cpu; - unsigned long pending; - - for_each_cpu(cpu, mask) { - /* - * This sequence is the mirror of the one in aic_ipi_unmask(); - * see the comment there. Additionally, release semantics - * ensure that the vIPI flag set is ordered after any shared - * memory accesses that precede it. This therefore also pairs - * with the atomic_fetch_andnot in aic_handle_ipi(). - */ - pending = atomic_fetch_or_release(irq_bit, per_cpu_ptr(&aic_vipi_flag, cpu)); - - /* - * The atomic_fetch_or_release() above must complete before the - * atomic_read() below to avoid racing aic_ipi_unmask(). - */ - smp_mb__after_atomic(); - - if (!(pending & irq_bit) && - (atomic_read(per_cpu_ptr(&aic_vipi_enable, cpu)) & irq_bit)) { - if (static_branch_likely(&use_fast_ipi)) - aic_ipi_send_fast(cpu); - else - send |= AIC_IPI_SEND_CPU(cpu); - } - } - - /* - * The flag writes must complete before the physical IPI is issued - * to another CPU. This is implied by the control dependency on - * the result of atomic_read_acquire() above, which is itself - * already ordered after the vIPI flag write. - */ - if (send) - aic_ic_write(ic, AIC_IPI_SEND, send); -} - -static struct irq_chip ipi_chip = { - .name = "AIC-IPI", - .irq_mask = aic_ipi_mask, - .irq_unmask = aic_ipi_unmask, - .ipi_send_mask = aic_ipi_send_mask, -}; - -/* - * IPI IRQ domain - */ - static void aic_handle_ipi(struct pt_regs *regs) { - int i; - unsigned long enabled, firing; - /* * Ack the IPI. We need to order this after the AIC event read, but * that is enforced by normal MMIO ordering guarantees. @@ -857,27 +763,7 @@ static void aic_handle_ipi(struct pt_regs *regs) aic_ic_write(aic_irqc, AIC_IPI_ACK, AIC_IPI_OTHER); } - /* - * The mask read does not need to be ordered. Only we can change - * our own mask anyway, so no races are possible here, as long as - * we are properly in the interrupt handler (which is covered by - * the barrier that is part of the top-level AIC handler's readl()). - */ - enabled = atomic_read(this_cpu_ptr(&aic_vipi_enable)); - - /* - * Clear the IPIs we are about to handle. This pairs with the - * atomic_fetch_or_release() in aic_ipi_send_mask(), and needs to be - * ordered after the aic_ic_write() above (to avoid dropping vIPIs) and - * before IPI handling code (to avoid races handling vIPIs before they - * are signaled). The former is taken care of by the release semantics - * of the write portion, while the latter is taken care of by the - * acquire semantics of the read portion. - */ - firing = atomic_fetch_andnot(enabled, this_cpu_ptr(&aic_vipi_flag)) & enabled; - - for_each_set_bit(i, &firing, AIC_NR_SWIPI) - generic_handle_domain_irq(aic_irqc->ipi_domain, i); + ipi_mux_process(); /* * No ordering needed here; at worst this just changes the timing of @@ -887,55 +773,24 @@ static void aic_handle_ipi(struct pt_regs *regs) aic_ic_write(aic_irqc, AIC_IPI_MASK_CLR, AIC_IPI_OTHER); } -static int aic_ipi_alloc(struct irq_domain *d, unsigned int virq, - unsigned int nr_irqs, void *args) +static void aic_ipi_send_single(unsigned int cpu) { - int i; - - for (i = 0; i < nr_irqs; i++) { - irq_set_percpu_devid(virq + i); - irq_domain_set_info(d, virq + i, i, &ipi_chip, d->host_data, - handle_percpu_devid_irq, NULL, NULL); - } - - return 0; -} - -static void aic_ipi_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs) -{ - /* Not freeing IPIs */ + if (static_branch_likely(&use_fast_ipi)) + aic_ipi_send_fast(cpu); + else + aic_ic_write(aic_irqc, AIC_IPI_SEND, AIC_IPI_SEND_CPU(cpu)); } -static const struct irq_domain_ops aic_ipi_domain_ops = { - .alloc = aic_ipi_alloc, - .free = aic_ipi_free, -}; - static int __init aic_init_smp(struct aic_irq_chip *irqc, struct device_node *node) { - struct irq_domain *ipi_domain; int base_ipi; - ipi_domain = irq_domain_create_linear(irqc->hw_domain->fwnode, AIC_NR_SWIPI, - &aic_ipi_domain_ops, irqc); - if (WARN_ON(!ipi_domain)) - return -ENODEV; - - ipi_domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; - irq_domain_update_bus_token(ipi_domain, DOMAIN_BUS_IPI); - - base_ipi = __irq_domain_alloc_irqs(ipi_domain, -1, AIC_NR_SWIPI, - NUMA_NO_NODE, NULL, false, NULL); - - if (WARN_ON(!base_ipi)) { - irq_domain_remove(ipi_domain); + base_ipi = ipi_mux_create(AIC_NR_SWIPI, aic_ipi_send_single); + if (WARN_ON(base_ipi <= 0)) return -ENODEV; - } set_smp_ipi_range(base_ipi, AIC_NR_SWIPI); - irqc->ipi_domain = ipi_domain; - return 0; } From 6caa5a2b78f5f53c433d3a3781e53325da22f0ac Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 2 Jan 2023 16:13:18 +0400 Subject: [PATCH 15/42] irqchip: Fix refcount leak in platform_irqchip_probe of_irq_find_parent() returns a node pointer with refcount incremented, We should use of_node_put() on it when not needed anymore. Add missing of_node_put() to avoid refcount leak. Fixes: f8410e626569 ("irqchip: Add IRQCHIP_PLATFORM_DRIVER_BEGIN/END and IRQCHIP_MATCH helper macros") Signed-off-by: Miaoqian Lin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230102121318.3990586-1-linmq006@gmail.com --- drivers/irqchip/irqchip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c index 3570f0a588c4b..7899607fbee8d 100644 --- a/drivers/irqchip/irqchip.c +++ b/drivers/irqchip/irqchip.c @@ -38,8 +38,10 @@ int platform_irqchip_probe(struct platform_device *pdev) struct device_node *par_np = of_irq_find_parent(np); of_irq_init_cb_t irq_init_cb = of_device_get_match_data(&pdev->dev); - if (!irq_init_cb) + if (!irq_init_cb) { + of_node_put(par_np); return -EINVAL; + } if (par_np == np) par_np = NULL; @@ -52,8 +54,10 @@ int platform_irqchip_probe(struct platform_device *pdev) * interrupt controller. The actual initialization callback of this * interrupt controller can check for specific domains as necessary. */ - if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) + if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) { + of_node_put(par_np); return -EPROBE_DEFER; + } return irq_init_cb(np, par_np); } From 071d068b89e95d1b078aa6bbcb9d0961b77d6aa1 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 2 Jan 2023 12:28:10 +0400 Subject: [PATCH 16/42] irqchip/alpine-msi: Fix refcount leak in alpine_msix_init_domains of_irq_find_parent() returns a node pointer with refcount incremented, We should use of_node_put() on it when not needed anymore. Add missing of_node_put() to avoid refcount leak. Fixes: e6b78f2c3e14 ("irqchip: Add the Alpine MSIX interrupt controller") Signed-off-by: Miaoqian Lin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230102082811.3947760-1-linmq006@gmail.com --- drivers/irqchip/irq-alpine-msi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c index 5ddb8e578ac6a..fc1ef7de37973 100644 --- a/drivers/irqchip/irq-alpine-msi.c +++ b/drivers/irqchip/irq-alpine-msi.c @@ -199,6 +199,7 @@ static int alpine_msix_init_domains(struct alpine_msix_data *priv, } gic_domain = irq_find_host(gic_node); + of_node_put(gic_node); if (!gic_domain) { pr_err("Failed to find the GIC domain\n"); return -ENXIO; From 9419e700021a393f67be36abd0c4f3acc6139041 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 2 Jan 2023 12:42:08 +0400 Subject: [PATCH 17/42] irqchip/irq-mvebu-gicp: Fix refcount leak in mvebu_gicp_probe of_irq_find_parent() returns a node pointer with refcount incremented, We should use of_node_put() on it when not needed anymore. Add missing of_node_put() to avoid refcount leak. Fixes: a68a63cb4dfc ("irqchip/irq-mvebu-gicp: Add new driver for Marvell GICP") Signed-off-by: Miaoqian Lin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230102084208.3951758-1-linmq006@gmail.com --- drivers/irqchip/irq-mvebu-gicp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c index fe88a782173dd..c43a345061d53 100644 --- a/drivers/irqchip/irq-mvebu-gicp.c +++ b/drivers/irqchip/irq-mvebu-gicp.c @@ -221,6 +221,7 @@ static int mvebu_gicp_probe(struct platform_device *pdev) } parent_domain = irq_find_host(irq_parent_dn); + of_node_put(irq_parent_dn); if (!parent_domain) { dev_err(&pdev->dev, "failed to find parent IRQ domain\n"); return -ENODEV; From 02298b7bae12936ca313975b02e7f98b06670d37 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 2 Jan 2023 12:56:10 +0400 Subject: [PATCH 18/42] irqchip/ti-sci: Fix refcount leak in ti_sci_intr_irq_domain_probe of_irq_find_parent() returns a node pointer with refcount incremented, We should use of_node_put() on it when not needed anymore. Add missing of_node_put() to avoid refcount leak. Fixes: cd844b0715ce ("irqchip/ti-sci-intr: Add support for Interrupt Router driver") Signed-off-by: Miaoqian Lin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230102085611.3955984-1-linmq006@gmail.com --- drivers/irqchip/irq-ti-sci-intr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/irq-ti-sci-intr.c b/drivers/irqchip/irq-ti-sci-intr.c index fe8fad22bcf96..020ddf29efb80 100644 --- a/drivers/irqchip/irq-ti-sci-intr.c +++ b/drivers/irqchip/irq-ti-sci-intr.c @@ -236,6 +236,7 @@ static int ti_sci_intr_irq_domain_probe(struct platform_device *pdev) } parent_domain = irq_find_host(parent_node); + of_node_put(parent_node); if (!parent_domain) { dev_err(dev, "Failed to find IRQ parent domain\n"); return -ENODEV; From 9c1a7bfc2993112cfb3056b18301fcafe5c2fde5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 11 Jan 2023 11:05:54 +0100 Subject: [PATCH 19/42] irqchip/ls-scfg-msi: Simplify Kconfig dependencies Having both PCI_MSI and PCI is redundant. Drop PCI. Signed-off-by: Lukas Bulwahn [maz: cut commit message extra verbosity] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230111100554.24500-1-lukas.bulwahn@gmail.com --- drivers/irqchip/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index caa952c40ff92..f3d88e9c87a3b 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -389,7 +389,7 @@ config LS_EXTIRQ config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE - depends on PCI && PCI_MSI + depends on PCI_MSI config PARTITION_PERCPU bool From 3d812a0f27baa2d094f2c18298d48b012878dc0b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 6 Feb 2023 17:21:15 +0000 Subject: [PATCH 20/42] genirq/ipi-mux: Use irq_domain_alloc_irqs() Using __irq_domain_alloc_irqs() is an unnecessary complexity. Use irq_domain_alloc_irqs(), which is simpler and makes the code more readable. Reported-by: Stephen Rothwell Signed-off-by: Marc Zyngier --- kernel/irq/ipi-mux.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c index 3a403c3a785d1..fa4fc18c61316 100644 --- a/kernel/irq/ipi-mux.c +++ b/kernel/irq/ipi-mux.c @@ -185,8 +185,7 @@ int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)) domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI); - rc = __irq_domain_alloc_irqs(domain, -1, nr_ipi, - NUMA_NO_NODE, NULL, false, NULL); + rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL); if (rc <= 0) { pr_err("unable to alloc IRQs from IPI Mux domain\n"); goto fail_free_domain; From b06730a571a9ff1ba5bd6b20bf9e50e5a12f1ec6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:43 +0100 Subject: [PATCH 21/42] irqdomain: Fix association race The sanity check for an already mapped virq is done outside of the irq_domain_mutex-protected section which means that an (unlikely) racing association may not be detected. Fix this by factoring out the association implementation, which will also be used in a follow-on change to fix a shared-interrupt mapping race. Fixes: ddaf144c61da ("irqdomain: Refactor irq_domain_associate_many()") Cc: stable@vger.kernel.org # 3.11 Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-2-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8fe1da9614ee8..6661de18550e9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -559,8 +559,8 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) irq_domain_clear_mapping(domain, hwirq); } -int irq_domain_associate(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); int ret; @@ -573,7 +573,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) return -EINVAL; - mutex_lock(&irq_domain_mutex); irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map) { @@ -590,7 +589,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, } irq_data->domain = NULL; irq_data->hwirq = 0; - mutex_unlock(&irq_domain_mutex); return ret; } @@ -601,12 +599,23 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, domain->mapcount++; irq_domain_set_mapping(domain, hwirq, irq_data); - mutex_unlock(&irq_domain_mutex); irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } + +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + int ret; + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_associate_locked(domain, virq, hwirq); + mutex_unlock(&irq_domain_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, From 3f883c38f5628f46b30bccf090faec054088e262 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:44 +0100 Subject: [PATCH 22/42] irqdomain: Fix disassociation race The global irq_domain_mutex is held when mapping interrupts from non-hierarchical domains but currently not when disposing them. This specifically means that updates of the domain mapcount is racy (currently only used for statistics in debugfs). Make sure to hold the global irq_domain_mutex also when disposing mappings from non-hierarchical domains. Fixes: 9dc6be3d4193 ("genirq/irqdomain: Add map counter") Cc: stable@vger.kernel.org # 4.13 Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-3-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6661de18550e9..f77549a2a1783 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -538,6 +538,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) return; hwirq = irq_data->hwirq; + + mutex_lock(&irq_domain_mutex); + irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ @@ -557,6 +560,8 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); + + mutex_unlock(&irq_domain_mutex); } static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, From e3b7ab025e931accdc2c12acf9b75c6197f1c062 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:45 +0100 Subject: [PATCH 23/42] irqdomain: Drop bogus fwspec-mapping error handling In case a newly allocated IRQ ever ends up not having any associated struct irq_data it would not even be possible to dispose the mapping. Replace the bogus disposal with a WARN_ON(). This will also be used to fix a shared-interrupt mapping race, hence the CC-stable tag. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Cc: stable@vger.kernel.org # 4.8 Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-4-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f77549a2a1783..9f5b96cf6c5cd 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -847,13 +847,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) } irq_data = irq_get_irq_data(virq); - if (!irq_data) { - if (irq_domain_is_hierarchy(domain)) - irq_domain_free_irqs(virq, 1); - else - irq_dispose_mapping(virq); + if (WARN_ON(!irq_data)) return 0; - } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); From 6e6f75c9c98d2d246d90411ff2b6f0cd271f4cba Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:46 +0100 Subject: [PATCH 24/42] irqdomain: Look for existing mapping only once Avoid looking for an existing mapping twice when creating a new mapping using irq_create_fwspec_mapping() by factoring out the actual allocation which is shared with irq_create_mapping_affinity(). The new helper function will also be used to fix a shared-interrupt mapping race, hence the Fixes tag. Fixes: b62b2cf5759b ("irqdomain: Fix handling of type settings for existing mappings") Cc: stable@vger.kernel.org # 4.8 Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-5-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 60 +++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9f5b96cf6c5cd..9f95047e4bc7f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -682,6 +682,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); #endif +static unsigned int __irq_create_mapping_affinity(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) +{ + struct device_node *of_node = irq_domain_get_of_node(domain); + int virq; + + pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + + /* Allocate a virtual interrupt number */ + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); + if (virq <= 0) { + pr_debug("-> virq allocation failed\n"); + return 0; + } + + if (irq_domain_associate(domain, virq, hwirq)) { + irq_free_desc(virq); + return 0; + } + + pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", + hwirq, of_node_full_name(of_node), virq); + + return virq; +} + /** * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain @@ -694,14 +722,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); * on the number returned from that call. */ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity) + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { - struct device_node *of_node; int virq; - pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); - /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; @@ -709,34 +734,15 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } - pr_debug("-> using domain @%p\n", domain); - - of_node = irq_domain_get_of_node(domain); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { - pr_debug("-> existing mapping on virq %d\n", virq); + pr_debug("existing mapping on virq %d\n", virq); return virq; } - /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), - affinity); - if (virq <= 0) { - pr_debug("-> virq allocation failed\n"); - return 0; - } - - if (irq_domain_associate(domain, virq, hwirq)) { - irq_free_desc(virq); - return 0; - } - - pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, of_node_full_name(of_node), virq); - - return virq; + return __irq_create_mapping_affinity(domain, hwirq, affinity); } EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); @@ -841,7 +847,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) return 0; } else { /* Create mapping */ - virq = irq_create_mapping(domain, hwirq); + virq = __irq_create_mapping_affinity(domain, hwirq, NULL); if (!virq) return virq; } From d55f7f4c58c07beb5050a834bf57ae2ede599c7e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:47 +0100 Subject: [PATCH 25/42] irqdomain: Refactor __irq_domain_alloc_irqs() Refactor __irq_domain_alloc_irqs() so that it can be called internally while holding the irq_domain_mutex. This will be used to fix a shared-interrupt mapping race, hence the Fixes tag. Fixes: b62b2cf5759b ("irqdomain: Fix handling of type settings for existing mappings") Cc: stable@vger.kernel.org # 4.8 Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-6-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 88 +++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9f95047e4bc7f..78fb4800c0d27 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1441,40 +1441,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } -/** - * __irq_domain_alloc_irqs - Allocate IRQs from domain - * @domain: domain to allocate from - * @irq_base: allocate specified IRQ number if irq_base >= 0 - * @nr_irqs: number of IRQs to allocate - * @node: NUMA node id for memory allocation - * @arg: domain specific argument - * @realloc: IRQ descriptors have already been allocated if true - * @affinity: Optional irq affinity mask for multiqueue devices - * - * Allocate IRQ numbers and initialized all data structures to support - * hierarchy IRQ domains. - * Parameter @realloc is mainly to support legacy IRQs. - * Returns error code or allocated IRQ number - * - * The whole process to setup an IRQ has been split into two steps. - * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ - * descriptor and required hardware resources. The second step, - * irq_domain_activate_irq(), is to program the hardware with preallocated - * resources. In this way, it's easier to rollback when failing to - * allocate resources. - */ -int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, const struct irq_affinity_desc *affinity) +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; - if (domain == NULL) { - domain = irq_default_domain; - if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) - return -EINVAL; - } - if (realloc && irq_base >= 0) { virq = irq_base; } else { @@ -1493,24 +1465,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, goto out_free_desc; } - mutex_lock(&irq_domain_mutex); ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); - if (ret < 0) { - mutex_unlock(&irq_domain_mutex); + if (ret < 0) goto out_free_irq_data; - } for (i = 0; i < nr_irqs; i++) { ret = irq_domain_trim_hierarchy(virq + i); - if (ret) { - mutex_unlock(&irq_domain_mutex); + if (ret) goto out_free_irq_data; - } } - + for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); - mutex_unlock(&irq_domain_mutex); return virq; @@ -1520,6 +1486,48 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, irq_free_descs(virq, nr_irqs); return ret; } + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @irq_base: allocate specified IRQ number if irq_base >= 0 + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program the hardware with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity) +{ + int ret; + + if (domain == NULL) { + domain = irq_default_domain; + if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) + return -EINVAL; + } + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, + realloc, affinity); + mutex_unlock(&irq_domain_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ From 601363cc08da25747feb87c55573dd54de91d66a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:48 +0100 Subject: [PATCH 26/42] irqdomain: Fix mapping-creation race Parallel probing of devices that share interrupts (e.g. when a driver uses asynchronous probing) can currently result in two mappings for the same hardware interrupt to be created due to missing serialisation. Make sure to hold the irq_domain_mutex when creating mappings so that looking for an existing mapping before creating a new one is done atomically. Fixes: 765230b5f084 ("driver-core: add asynchronous probing support for drivers") Fixes: b62b2cf5759b ("irqdomain: Fix handling of type settings for existing mappings") Link: https://lore.kernel.org/r/YuJXMHoT4ijUxnRb@hovoldconsulting.com Cc: stable@vger.kernel.org # 4.8 Cc: Dmitry Torokhov Cc: Jon Hunter Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-7-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 64 ++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 78fb4800c0d27..df0cbad1b0d71 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex); static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -682,9 +685,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); #endif -static unsigned int __irq_create_mapping_affinity(struct irq_domain *domain, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity) +static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { struct device_node *of_node = irq_domain_get_of_node(domain); int virq; @@ -699,7 +702,7 @@ static unsigned int __irq_create_mapping_affinity(struct irq_domain *domain, return 0; } - if (irq_domain_associate(domain, virq, hwirq)) { + if (irq_domain_associate_locked(domain, virq, hwirq)) { irq_free_desc(virq); return 0; } @@ -735,14 +738,20 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, return 0; } + mutex_lock(&irq_domain_mutex); + /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("existing mapping on virq %d\n", virq); - return virq; + goto out; } - return __irq_create_mapping_affinity(domain, hwirq, affinity); + virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); +out: + mutex_unlock(&irq_domain_mutex); + + return virq; } EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); @@ -809,6 +818,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) type &= IRQ_TYPE_SENSE_MASK; + mutex_lock(&irq_domain_mutex); + /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. @@ -821,7 +832,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) * interrupt number. */ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) - return virq; + goto out; /* * If the trigger type has not been set yet, then set @@ -829,35 +840,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) */ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { irq_data = irq_get_irq_data(virq); - if (!irq_data) - return 0; + if (!irq_data) { + virq = 0; + goto out; + } irqd_set_trigger_type(irq_data, type); - return virq; + goto out; } pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); - return 0; + virq = 0; + goto out; } if (irq_domain_is_hierarchy(domain)) { - virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); - if (virq <= 0) - return 0; + virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, + fwspec, false, NULL); + if (virq <= 0) { + virq = 0; + goto out; + } } else { /* Create mapping */ - virq = __irq_create_mapping_affinity(domain, hwirq, NULL); + virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL); if (!virq) - return virq; + goto out; } irq_data = irq_get_irq_data(virq); - if (WARN_ON(!irq_data)) - return 0; + if (WARN_ON(!irq_data)) { + virq = 0; + goto out; + } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); +out: + mutex_unlock(&irq_domain_mutex); return virq; } @@ -1888,6 +1909,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_set_handler_data(virq, handler_data); } +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity) +{ + return -EINVAL; +} + static void irq_domain_check_hierarchy(struct irq_domain *domain) { } From 8932c32c3053accd50702b36e944ac2016cd103c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Feb 2023 11:42:49 +0100 Subject: [PATCH 27/42] irqdomain: Fix domain registration race Hierarchical domains created using irq_domain_create_hierarchy() are currently added to the domain list before having been fully initialised. This specifically means that a racing allocation request might fail to allocate irq data for the inner domains of a hierarchy in case the parent domain pointer has not yet been set up. Note that this is not really any issue for irqchip drivers that are registered early (e.g. via IRQCHIP_DECLARE() or IRQCHIP_ACPI_DECLARE()) but could potentially cause trouble with drivers that are registered later (e.g. modular drivers using IRQCHIP_PLATFORM_DRIVER_BEGIN(), gpiochip drivers, etc.). Fixes: afb7da83b9f4 ("irqdomain: Introduce helper function irq_domain_add_hierarchy()") Cc: stable@vger.kernel.org # 3.19 Signed-off-by: Marc Zyngier [ johan: add commit message ] Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-8-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 62 +++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index df0cbad1b0d71..a6d1b108b8f7c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -126,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); -/** - * __irq_domain_add() - Allocate a new irq_domain data structure - * @fwnode: firmware node for the interrupt controller - * @size: Size of linear map; 0 for radix mapping only - * @hwirq_max: Maximum number of interrupts supported by controller - * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no - * direct mapping - * @ops: domain callbacks - * @host_data: Controller private data pointer - * - * Allocates and initializes an irq_domain structure. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, - irq_hw_number_t hwirq_max, int direct_max, - const struct irq_domain_ops *ops, - void *host_data) +static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, + unsigned int size, + irq_hw_number_t hwirq_max, + int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irqchip_fwid *fwid; struct irq_domain *domain; @@ -230,12 +219,44 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s irq_domain_check_hierarchy(domain); + return domain; +} + +static void __irq_domain_publish(struct irq_domain *domain) +{ mutex_lock(&irq_domain_mutex); debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); pr_debug("Added domain %s\n", domain->name); +} + +/** + * __irq_domain_add() - Allocate a new irq_domain data structure + * @fwnode: firmware node for the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping + * @ops: domain callbacks + * @host_data: Controller private data pointer + * + * Allocates and initializes an irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, + irq_hw_number_t hwirq_max, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + + domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max, + ops, host_data); + if (domain) + __irq_domain_publish(domain); + return domain; } EXPORT_SYMBOL_GPL(__irq_domain_add); @@ -1138,12 +1159,15 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, struct irq_domain *domain; if (size) - domain = irq_domain_create_linear(fwnode, size, ops, host_data); + domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data); else - domain = irq_domain_create_tree(fwnode, ops, host_data); + domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data); + if (domain) { domain->parent = parent; domain->flags |= flags; + + __irq_domain_publish(domain); } return domain; From 47d1932f37de99bae3345bb93f098ac8750ab0fb Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:50 +0100 Subject: [PATCH 28/42] irqdomain: Drop revmap mutex The revmap mutex is essentially only used to maintain the integrity of the radix tree during updates (lookups use RCU). As the global irq_domain_mutex is now held in all paths that update the revmap structures there is strictly no longer any need for the dedicated mutex, which can be removed. Drop the revmap mutex and add lockdep assertions to the revmap helpers to make sure that the global lock is always held when updating the revmap. Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-9-johan+linaro@kernel.org --- include/linux/irqdomain.h | 2 -- kernel/irq/irqdomain.c | 13 ++++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a372086750ca5..16399de00b480 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -143,7 +143,6 @@ struct irq_domain_chip_generic; * Revmap data, used internally by the irq domain code: * @revmap_size: Size of the linear map table @revmap[] * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map - * @revmap_mutex: Lock for the revmap * @revmap: Linear table of irq_data pointers */ struct irq_domain { @@ -171,7 +170,6 @@ struct irq_domain { irq_hw_number_t hwirq_max; unsigned int revmap_size; struct radix_tree_root revmap_tree; - struct mutex revmap_mutex; struct irq_data __rcu *revmap[]; }; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a6d1b108b8f7c..c7113e776543a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -206,7 +206,6 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); - mutex_init(&domain->revmap_mutex); domain->ops = ops; domain->host_data = host_data; domain->hwirq_max = hwirq_max; @@ -526,30 +525,30 @@ static bool irq_domain_is_nomap(struct irq_domain *domain) static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { + lockdep_assert_held(&irq_domain_mutex); + if (irq_domain_is_nomap(domain)) return; - mutex_lock(&domain->revmap_mutex); if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], NULL); else radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&domain->revmap_mutex); } static void irq_domain_set_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_data *irq_data) { + lockdep_assert_held(&irq_domain_mutex); + if (irq_domain_is_nomap(domain)) return; - mutex_lock(&domain->revmap_mutex); if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], irq_data); else radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&domain->revmap_mutex); } static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) @@ -1580,11 +1579,12 @@ static void irq_domain_fix_revmap(struct irq_data *d) { void __rcu **slot; + lockdep_assert_held(&irq_domain_mutex); + if (irq_domain_is_nomap(d->domain)) return; /* Fix up the revmap. */ - mutex_lock(&d->domain->revmap_mutex); if (d->hwirq < d->domain->revmap_size) { /* Not using radix tree */ rcu_assign_pointer(d->domain->revmap[d->hwirq], d); @@ -1593,7 +1593,6 @@ static void irq_domain_fix_revmap(struct irq_data *d) if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); } - mutex_unlock(&d->domain->revmap_mutex); } /** From 28a9ff23d8b56db09cb01cef174a205ea5e2ca49 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:51 +0100 Subject: [PATCH 29/42] irqdomain: Drop dead domain-name assignment Since commit d59f6617eef0 ("genirq: Allow fwnode to carry name information only") an IRQ domain is always given a name during allocation (e.g. used for the debugfs entry). Drop the leftover name assignment when allocating the first IRQ. Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-10-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c7113e776543a..6bd6b610568cb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -619,10 +619,6 @@ static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int v irq_data->hwirq = 0; return ret; } - - /* If not already assigned, give the domain the chip's name */ - if (!domain->name && irq_data->chip) - domain->name = irq_data->chip->name; } domain->mapcount++; @@ -1182,10 +1178,6 @@ static void irq_domain_insert_irq(int virq) domain->mapcount++; irq_domain_set_mapping(domain, data->hwirq, data); - - /* If not already assigned, give the domain the chip's name */ - if (!domain->name && data->chip) - domain->name = data->chip->name; } irq_clear_status_flags(virq, IRQ_NOREQUEST); From 4e0d86df9344bfd1951eb2571e4ef8f3d37000a4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:52 +0100 Subject: [PATCH 30/42] irqdomain: Drop leftover brackets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop some unnecessary brackets that were left in place when the corresponding code was updated. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-11-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6bd6b610568cb..3a3213d730ee6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -210,9 +210,8 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, domain->host_data = host_data; domain->hwirq_max = hwirq_max; - if (direct_max) { + if (direct_max) domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; - } domain->revmap_size = size; @@ -652,9 +651,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, of_node_full_name(of_node), irq_base, (int)hwirq_base, count); - for (i = 0; i < count; i++) { + for (i = 0; i < count; i++) irq_domain_associate(domain, irq_base + i, hwirq_base + i); - } } EXPORT_SYMBOL_GPL(irq_domain_associate_many); From 930a1bbbef01cdcd682d9c2b4bc9e36b9618fed3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:53 +0100 Subject: [PATCH 31/42] irqdomain: Clean up irq_domain_push/pop_irq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The irq_domain_push_irq() interface is used to add a new (outmost) level to a hierarchical domain after IRQs have been allocated. Possibly due to differing mental images of hierarchical domains, the names used for the irq_data variables make these functions much harder to understand than what they need to be. Rename the struct irq_data pointer to the data embedded in the descriptor as simply 'irq_data' and refer to the data allocated by this interface as 'parent_irq_data' so that the names reflect how hierarchical domains are implemented. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-12-johan+linaro@kernel.org --- kernel/irq/irqdomain.c | 65 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3a3213d730ee6..6d480dc6ab53b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1598,8 +1598,8 @@ static void irq_domain_fix_revmap(struct irq_data *d) */ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) { - struct irq_data *child_irq_data; - struct irq_data *root_irq_data = irq_get_irq_data(virq); + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_data *parent_irq_data; struct irq_desc *desc; int rv = 0; @@ -1624,45 +1624,44 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; - if (!root_irq_data) + if (!irq_data) return -EINVAL; - if (domain->parent != root_irq_data->domain) + if (domain->parent != irq_data->domain) return -EINVAL; - child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, - irq_data_get_node(root_irq_data)); - if (!child_irq_data) + parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL, + irq_data_get_node(irq_data)); + if (!parent_irq_data) return -ENOMEM; mutex_lock(&irq_domain_mutex); /* Copy the original irq_data. */ - *child_irq_data = *root_irq_data; + *parent_irq_data = *irq_data; /* - * Overwrite the root_irq_data, which is embedded in struct - * irq_desc, with values for this domain. + * Overwrite the irq_data, which is embedded in struct irq_desc, with + * values for this domain. */ - root_irq_data->parent_data = child_irq_data; - root_irq_data->domain = domain; - root_irq_data->mask = 0; - root_irq_data->hwirq = 0; - root_irq_data->chip = NULL; - root_irq_data->chip_data = NULL; + irq_data->parent_data = parent_irq_data; + irq_data->domain = domain; + irq_data->mask = 0; + irq_data->hwirq = 0; + irq_data->chip = NULL; + irq_data->chip_data = NULL; /* May (probably does) set hwirq, chip, etc. */ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (rv) { /* Restore the original irq_data. */ - *root_irq_data = *child_irq_data; - kfree(child_irq_data); + *irq_data = *parent_irq_data; + kfree(parent_irq_data); goto error; } - irq_domain_fix_revmap(child_irq_data); - irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data); - + irq_domain_fix_revmap(parent_irq_data); + irq_domain_set_mapping(domain, irq_data->hwirq, irq_data); error: mutex_unlock(&irq_domain_mutex); @@ -1680,8 +1679,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq); */ int irq_domain_pop_irq(struct irq_domain *domain, int virq) { - struct irq_data *root_irq_data = irq_get_irq_data(virq); - struct irq_data *child_irq_data; + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_data *parent_irq_data; struct irq_data *tmp_irq_data; struct irq_desc *desc; @@ -1703,37 +1702,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq) if (domain == NULL) return -EINVAL; - if (!root_irq_data) + if (!irq_data) return -EINVAL; tmp_irq_data = irq_domain_get_irq_data(domain, virq); /* We can only "pop" if this domain is at the top of the list */ - if (WARN_ON(root_irq_data != tmp_irq_data)) + if (WARN_ON(irq_data != tmp_irq_data)) return -EINVAL; - if (WARN_ON(root_irq_data->domain != domain)) + if (WARN_ON(irq_data->domain != domain)) return -EINVAL; - child_irq_data = root_irq_data->parent_data; - if (WARN_ON(!child_irq_data)) + parent_irq_data = irq_data->parent_data; + if (WARN_ON(!parent_irq_data)) return -EINVAL; mutex_lock(&irq_domain_mutex); - root_irq_data->parent_data = NULL; + irq_data->parent_data = NULL; - irq_domain_clear_mapping(domain, root_irq_data->hwirq); + irq_domain_clear_mapping(domain, irq_data->hwirq); irq_domain_free_irqs_hierarchy(domain, virq, 1); /* Restore the original irq_data. */ - *root_irq_data = *child_irq_data; + *irq_data = *parent_irq_data; - irq_domain_fix_revmap(root_irq_data); + irq_domain_fix_revmap(irq_data); mutex_unlock(&irq_domain_mutex); - kfree(child_irq_data); + kfree(parent_irq_data); return 0; } From bc1bc1b309b42ff3faea957df7ac9382366c5eef Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:54 +0100 Subject: [PATCH 32/42] x86/ioapic: Use irq_domain_create_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-13-johan+linaro@kernel.org --- arch/x86/kernel/apic/io_apic.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a868b76cd3d42..1f83b052bb74e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2364,9 +2364,8 @@ static int mp_irqdomain_create(int ioapic) return -ENODEV; } - ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, - (void *)(long)ioapic); - + ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops, + (void *)(long)ioapic); if (!ip->irqdomain) { /* Release fw handle if it was allocated above */ if (!cfg->dev) @@ -2374,8 +2373,6 @@ static int mp_irqdomain_create(int ioapic) return -ENOMEM; } - ip->irqdomain->parent = parent; - if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) ioapic_dynirq_base = max(ioapic_dynirq_base, From a14e7fdd43040a2a5da2cc979063b3c958015aa9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:55 +0100 Subject: [PATCH 33/42] x86/uv: Use irq_domain_create_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-14-johan+linaro@kernel.org --- arch/x86/platform/uv/uv_irq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 1a536a187d748..ee21d6a36a80d 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -166,10 +166,9 @@ static struct irq_domain *uv_get_irq_domain(void) if (!fn) goto out; - uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL); - if (uv_domain) - uv_domain->parent = x86_vector_domain; - else + uv_domain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0, fn, + &uv_domain_ops, NULL); + if (!uv_domain) irq_domain_free_fwnode(fn); out: mutex_unlock(&uv_lock); From 6c889231e04dffa4b668c1f0fbc26db679600147 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:56 +0100 Subject: [PATCH 34/42] irqchip/alpine-msi: Use irq_domain_add_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_add_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-15-johan+linaro@kernel.org --- drivers/irqchip/irq-alpine-msi.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c index 5ddb8e578ac6a..604459372fdd2 100644 --- a/drivers/irqchip/irq-alpine-msi.c +++ b/drivers/irqchip/irq-alpine-msi.c @@ -204,16 +204,14 @@ static int alpine_msix_init_domains(struct alpine_msix_data *priv, return -ENXIO; } - middle_domain = irq_domain_add_tree(NULL, - &alpine_msix_middle_domain_ops, - priv); + middle_domain = irq_domain_add_hierarchy(gic_domain, 0, 0, NULL, + &alpine_msix_middle_domain_ops, + priv); if (!middle_domain) { pr_err("Failed to create the MSIX middle domain\n"); return -ENOMEM; } - middle_domain->parent = gic_domain; - msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node), &alpine_msix_domain_info, middle_domain); From e6e8cd62a56f9ed0dcdf8d01147709b59a111418 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:57 +0100 Subject: [PATCH 35/42] irqchip/gic-v2m: Use irq_domain_create_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-16-johan+linaro@kernel.org --- drivers/irqchip/irq-gic-v2m.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index f4d7eeb13951a..f1e75b35a52af 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -287,15 +287,14 @@ static __init int gicv2m_allocate_domains(struct irq_domain *parent) if (!v2m) return 0; - inner_domain = irq_domain_create_tree(v2m->fwnode, - &gicv2m_domain_ops, v2m); + inner_domain = irq_domain_create_hierarchy(parent, 0, 0, v2m->fwnode, + &gicv2m_domain_ops, v2m); if (!inner_domain) { pr_err("Failed to create GICv2m domain\n"); return -ENOMEM; } irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS); - inner_domain->parent = parent; pci_domain = pci_msi_create_irq_domain(v2m->fwnode, &gicv2m_msi_domain_info, inner_domain); From 1e46e040decede1723a5b11f0689942134c29d9a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:58 +0100 Subject: [PATCH 36/42] irqchip/gic-v3-its: Use irq_domain_create_hierarchy() Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Note that the domain host_data was first set to the struct its_node during allocation only to immediately be overwritten with the struct msi_domain_info. Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-17-johan+linaro@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 973ede0197e36..5634d29b644d8 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4909,18 +4909,19 @@ static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) if (!info) return -ENOMEM; - inner_domain = irq_domain_create_tree(handle, &its_domain_ops, its); + info->ops = &its_msi_domain_ops; + info->data = its; + + inner_domain = irq_domain_create_hierarchy(its_parent, + its->msi_domain_flags, 0, + handle, &its_domain_ops, + info); if (!inner_domain) { kfree(info); return -ENOMEM; } - inner_domain->parent = its_parent; irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS); - inner_domain->flags |= its->msi_domain_flags; - info->ops = &its_msi_domain_ops; - info->data = its; - inner_domain->host_data = info; return 0; } From 331f9aac03267f76a15776260eda4f47b81731f7 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:42:59 +0100 Subject: [PATCH 37/42] irqchip/gic-v3-mbi: Use irq_domain_create_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-18-johan+linaro@kernel.org --- drivers/irqchip/irq-gic-v3-mbi.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index e1efdec9e9acf..dbb8b1efda44c 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -233,13 +233,12 @@ static int mbi_allocate_domains(struct irq_domain *parent) struct irq_domain *nexus_domain, *pci_domain, *plat_domain; int err; - nexus_domain = irq_domain_create_tree(parent->fwnode, - &mbi_domain_ops, NULL); + nexus_domain = irq_domain_create_hierarchy(parent, 0, 0, parent->fwnode, + &mbi_domain_ops, NULL); if (!nexus_domain) return -ENOMEM; irq_domain_update_bus_token(nexus_domain, DOMAIN_BUS_NEXUS); - nexus_domain->parent = parent; err = mbi_allocate_pci_domain(nexus_domain, &pci_domain); From 6159c470f812eab0a2f1900c70acbb3ca7b9e14a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:43:00 +0100 Subject: [PATCH 38/42] irqchip/loongson-pch-msi: Use irq_domain_create_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-19-johan+linaro@kernel.org --- drivers/irqchip/irq-loongson-pch-msi.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-loongson-pch-msi.c b/drivers/irqchip/irq-loongson-pch-msi.c index a72ede90ffc69..6e1e1f011bb29 100644 --- a/drivers/irqchip/irq-loongson-pch-msi.c +++ b/drivers/irqchip/irq-loongson-pch-msi.c @@ -163,16 +163,15 @@ static int pch_msi_init_domains(struct pch_msi_data *priv, { struct irq_domain *middle_domain, *msi_domain; - middle_domain = irq_domain_create_linear(domain_handle, - priv->num_irqs, - &pch_msi_middle_domain_ops, - priv); + middle_domain = irq_domain_create_hierarchy(parent, 0, priv->num_irqs, + domain_handle, + &pch_msi_middle_domain_ops, + priv); if (!middle_domain) { pr_err("Failed to create the MSI middle domain\n"); return -ENOMEM; } - middle_domain->parent = parent; irq_domain_update_bus_token(middle_domain, DOMAIN_BUS_NEXUS); msi_domain = pci_msi_create_irq_domain(domain_handle, From f743f54fa8d2bcb3f2891b783687d91b76a144f5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:43:01 +0100 Subject: [PATCH 39/42] irqchip/mvebu-odmi: Use irq_domain_create_hierarchy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irq_domain_create_hierarchy() helper to create the hierarchical domain, which both serves as documentation and avoids poking at irqdomain internals. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-20-johan+linaro@kernel.org --- drivers/irqchip/irq-mvebu-odmi.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c index dc4145abdd6f7..108091533e10d 100644 --- a/drivers/irqchip/irq-mvebu-odmi.c +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -161,7 +161,7 @@ static struct msi_domain_info odmi_msi_domain_info = { static int __init mvebu_odmi_init(struct device_node *node, struct device_node *parent) { - struct irq_domain *inner_domain, *plat_domain; + struct irq_domain *parent_domain, *inner_domain, *plat_domain; int ret, i; if (of_property_read_u32(node, "marvell,odmi-frames", &odmis_count)) @@ -197,16 +197,17 @@ static int __init mvebu_odmi_init(struct device_node *node, } } - inner_domain = irq_domain_create_linear(of_node_to_fwnode(node), - odmis_count * NODMIS_PER_FRAME, - &odmi_domain_ops, NULL); + parent_domain = irq_find_host(parent); + + inner_domain = irq_domain_create_hierarchy(parent_domain, 0, + odmis_count * NODMIS_PER_FRAME, + of_node_to_fwnode(node), + &odmi_domain_ops, NULL); if (!inner_domain) { ret = -ENOMEM; goto err_unmap; } - inner_domain->parent = irq_find_host(parent); - plat_domain = platform_msi_create_irq_domain(of_node_to_fwnode(node), &odmi_msi_domain_info, inner_domain); From 9dbb8e3452aba34e6fa4f63054b3adc66aceb7ec Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Feb 2023 11:43:02 +0100 Subject: [PATCH 40/42] irqdomain: Switch to per-domain locking The IRQ domain structures are currently protected by the global irq_domain_mutex. Switch to using more fine-grained per-domain locking, which can speed up parallel probing by reducing lock contention. On a recent arm64 laptop, the total time spent waiting for the locks during boot drops from 160 to 40 ms on average, while the maximum aggregate wait time drops from 550 to 90 ms over ten runs for example. Note that the domain lock of the root domain (innermost domain) must be used for hierarchical domains. For non-hierarchical domains (as for root domains), the new root pointer is set to the domain itself so that &domain->root->mutex always points to the right lock. Also note that hierarchical domains should be constructed using irq_domain_create_hierarchy() (or irq_domain_add_hierarchy()) to avoid having racing allocations access a not fully initialised domain. As a safeguard, the lockdep assertion in irq_domain_set_mapping() will catch any offenders that also fail to set the root domain pointer. Tested-by: Hsin-Yi Wang Tested-by: Mark-PK Tsai Signed-off-by: Johan Hovold Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230213104302.17307-21-johan+linaro@kernel.org --- include/linux/irqdomain.h | 4 +++ kernel/irq/irqdomain.c | 59 ++++++++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 16399de00b480..d320d15d4fba8 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -125,6 +125,8 @@ struct irq_domain_chip_generic; * core code. * @flags: Per irq_domain flags * @mapcount: The number of mapped interrupts + * @mutex: Domain lock, hierarchical domains use root domain's lock + * @root: Pointer to root domain, or containing structure if non-hierarchical * * Optional elements: * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy @@ -152,6 +154,8 @@ struct irq_domain { void *host_data; unsigned int flags; unsigned int mapcount; + struct mutex mutex; + struct irq_domain *root; /* Optional data */ struct fwnode_handle *fwnode; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6d480dc6ab53b..1983f1beeec70 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -215,6 +215,17 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, domain->revmap_size = size; + /* + * Hierarchical domains use the domain lock of the root domain + * (innermost domain). + * + * For non-hierarchical domains (as for root domains), the root + * pointer is set to the domain itself so that &domain->root->mutex + * always points to the right lock. + */ + mutex_init(&domain->mutex); + domain->root = domain; + irq_domain_check_hierarchy(domain); return domain; @@ -524,7 +535,7 @@ static bool irq_domain_is_nomap(struct irq_domain *domain) static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - lockdep_assert_held(&irq_domain_mutex); + lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; @@ -539,7 +550,11 @@ static void irq_domain_set_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_data *irq_data) { - lockdep_assert_held(&irq_domain_mutex); + /* + * This also makes sure that all domains point to the same root when + * called from irq_domain_insert_irq() for each domain in a hierarchy. + */ + lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; @@ -561,7 +576,7 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) hwirq = irq_data->hwirq; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); irq_set_status_flags(irq, IRQ_NOREQUEST); @@ -583,7 +598,7 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); } static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, @@ -633,9 +648,9 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, { int ret; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); ret = irq_domain_associate_locked(domain, virq, hwirq); - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); return ret; } @@ -752,7 +767,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, return 0; } - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); @@ -763,7 +778,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); out: - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); return virq; } @@ -832,7 +847,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) type &= IRQ_TYPE_SENSE_MASK; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); /* * If we've already configured this interrupt, @@ -892,7 +907,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) /* Store trigger type */ irqd_set_trigger_type(irq_data, type); out: - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); return virq; } @@ -1157,6 +1172,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data); if (domain) { + domain->root = parent->root; domain->parent = parent; domain->flags |= flags; @@ -1555,10 +1571,10 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, return -EINVAL; } - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, realloc, affinity); - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); return ret; } @@ -1569,7 +1585,7 @@ static void irq_domain_fix_revmap(struct irq_data *d) { void __rcu **slot; - lockdep_assert_held(&irq_domain_mutex); + lockdep_assert_held(&d->domain->root->mutex); if (irq_domain_is_nomap(d->domain)) return; @@ -1635,7 +1651,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (!parent_irq_data) return -ENOMEM; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); /* Copy the original irq_data. */ *parent_irq_data = *irq_data; @@ -1663,7 +1679,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) irq_domain_fix_revmap(parent_irq_data); irq_domain_set_mapping(domain, irq_data->hwirq, irq_data); error: - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); return rv; } @@ -1718,7 +1734,7 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq) if (WARN_ON(!parent_irq_data)) return -EINVAL; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); irq_data->parent_data = NULL; @@ -1730,7 +1746,7 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq) irq_domain_fix_revmap(irq_data); - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); kfree(parent_irq_data); @@ -1746,17 +1762,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq); void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) { struct irq_data *data = irq_get_irq_data(virq); + struct irq_domain *domain; int i; if (WARN(!data || !data->domain || !data->domain->ops->free, "NULL pointer, cannot free irq\n")) return; - mutex_lock(&irq_domain_mutex); + domain = data->domain; + + mutex_lock(&domain->root->mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); - irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs); - mutex_unlock(&irq_domain_mutex); + irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs); + mutex_unlock(&domain->root->mutex); irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); From 94debe03e8afa1267f95a9001786a6aa506b9ff3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 16 Dec 2022 15:09:33 -0800 Subject: [PATCH 41/42] irqchip/irq-brcmstb-l2: Set IRQ_LEVEL for level triggered interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When support for the level triggered interrupt controller flavor was added with c0ca7262088e, we forgot to update the flags to be set to contain IRQ_LEVEL. While the flow handler is correct, the output from /proc/interrupts does not show such interrupts as being level triggered when they are, correct that. Fixes: c0ca7262088e ("irqchip/brcmstb-l2: Add support for the BCM7271 L2 controller") Signed-off-by: Florian Fainelli Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221216230934.2478345-2-f.fainelli@gmail.com --- drivers/irqchip/irq-brcmstb-l2.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index e4efc08ac5948..091b0fe7e3242 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -161,6 +161,7 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, *init_params) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; + unsigned int set = 0; struct brcmstb_l2_intc_data *data; struct irq_chip_type *ct; int ret; @@ -208,9 +209,12 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, if (IS_ENABLED(CONFIG_MIPS) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) flags |= IRQ_GC_BE_IO; + if (init_params->handler == handle_level_irq) + set |= IRQ_LEVEL; + /* Allocate a single Generic IRQ chip for this node */ ret = irq_alloc_domain_generic_chips(data->domain, 32, 1, - np->full_name, init_params->handler, clr, 0, flags); + np->full_name, init_params->handler, clr, set, flags); if (ret) { pr_err("failed to allocate generic irq chip\n"); goto out_free_domain; From 13a157b38ca5b4f9eed81442b8821db293755961 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 16 Dec 2022 15:09:34 -0800 Subject: [PATCH 42/42] irqchip/irq-bcm7120-l2: Set IRQ_LEVEL for level triggered interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When support for the interrupt controller was added with a5042de2688d, we forgot to update the flags to be set to contain IRQ_LEVEL. While the flow handler is correct, the output from /proc/interrupts does not show such interrupts as being level triggered when they are, correct that. Fixes: a5042de2688d ("irqchip: bcm7120-l2: Add Broadcom BCM7120-style Level 2 interrupt controller") Signed-off-by: Florian Fainelli Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221216230934.2478345-3-f.fainelli@gmail.com --- drivers/irqchip/irq-bcm7120-l2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c index bb6609cebdbce..1e9dab6e0d86f 100644 --- a/drivers/irqchip/irq-bcm7120-l2.c +++ b/drivers/irqchip/irq-bcm7120-l2.c @@ -279,7 +279,8 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn, flags |= IRQ_GC_BE_IO; ret = irq_alloc_domain_generic_chips(data->domain, IRQS_PER_WORD, 1, - dn->full_name, handle_level_irq, clr, 0, flags); + dn->full_name, handle_level_irq, clr, + IRQ_LEVEL, flags); if (ret) { pr_err("failed to allocate generic irq chip\n"); goto out_free_domain;