Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

 

 

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded


agordeev at redhat

May 18, 2012, 3:26 AM

Post #1 of 28 (262 views)
Permalink
[PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode

Currently x2APIC in logical destination mode delivers interrupts to a
single CPU, no matter how many CPUs were specified in the destination
cpumask.

This fix enables delivery of interrupts to multiple CPUs by bit-ORing
Logical IDs of destination CPUs that have matching Cluster ID.

Because only one cluster could be specified in a message destination
address, the destination cpumask is tried for a cluster that contains
maximum number of CPUs matching this cpumask. The CPUs in this cluster
are selected to receive the interrupts while all other CPUs (in the
cpumask) are ignored.

Signed-off-by: Alexander Gordeev <agordeev [at] redhat>
---
arch/x86/include/asm/x2apic.h | 9 --
arch/x86/kernel/apic/x2apic_cluster.c | 140 +++++++++++++++++++++++++++++----
arch/x86/kernel/apic/x2apic_phys.c | 9 ++-
3 files changed, 131 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54ab..7a5a832 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void)
return 1;
}

-/*
- * For now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
static void
__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8f012b2..f8fa4c4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -96,36 +96,142 @@ static void x2apic_send_IPI_all(int vector)
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}

+static inline unsigned int
+__x2apic_cluster_to_apicid(int cpu_in_cluster, const struct cpumask *cpumask)
+{
+ unsigned int apicid = 0;
+ int cpu;
+
+ for_each_cpu_and(cpu, per_cpu(cpus_in_cluster, cpu_in_cluster), cpumask)
+ apicid |= per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ return apicid;
+}
+
+static int
+__x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid)
+{
+ int ret = 0;
+ int cpu, heaviest;
+ unsigned int weight, max_weight;
+ cpumask_var_t target_cpus, cluster_cpus;
+
+ if (unlikely(!alloc_cpumask_var(&target_cpus, GFP_ATOMIC))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (unlikely(!alloc_cpumask_var(&cluster_cpus, GFP_ATOMIC))) {
+ ret = -ENOMEM;
+ goto out_free_target_cpus;
+ }
+
+ cpumask_and(target_cpus, cpumask, cpu_online_mask);
+ max_weight = 0;
+
+ for_each_cpu(cpu, target_cpus) {
+ cpumask_and(cluster_cpus, per_cpu(cpus_in_cluster, cpu), cpumask);
+
+ weight = cpumask_weight(cluster_cpus);
+ if (weight > max_weight) {
+ max_weight = weight;
+ heaviest = cpu;
+ }
+
+ cpumask_andnot(target_cpus, target_cpus, cluster_cpus);
+ }
+
+ if (!max_weight) {
+ ret = -EINVAL;
+ goto out_free_cluster_cpus;
+ }
+
+ *apicid = __x2apic_cluster_to_apicid(heaviest, cpumask);
+
+out_free_cluster_cpus:
+ free_cpumask_var(cluster_cpus);
+out_free_target_cpus:
+ free_cpumask_var(target_cpus);
+out:
+ return ret;
+}
+
static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
+ int err;
+ int cpu;
+ unsigned int apicid;

- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
- return BAD_APICID;
+ err = __x2apic_cpu_mask_to_apicid(cpumask, &apicid);
+ WARN_ON(err);
+
+ if (!err)
+ return apicid;
+
+ if (err == -ENOMEM) {
+ for_each_cpu(cpu, cpumask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ if (cpu < nr_cpu_ids)
+ return __x2apic_cluster_to_apicid(cpu, cpumask);
+ }
+
+ return BAD_APICID;
}

static unsigned int
x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
const struct cpumask *andmask)
{
- int cpu;
+ int err;
+ int cpu, first_cpu;
+ unsigned int apicid;
+ cpumask_var_t target_cpus;
+
+ if (likely(alloc_cpumask_var(&target_cpus, GFP_ATOMIC))) {
+ cpumask_and(target_cpus, cpumask, andmask);
+
+ err = __x2apic_cpu_mask_to_apicid(target_cpus, &apicid);
+
+ free_cpumask_var(target_cpus);
+
+ if (!err)
+ return apicid;
+ } else {
+ err = -ENOMEM;
+ }
+
+ WARN_ON(err);
+
+ if (err != -ENOMEM)
+ return 0;
+
+ apicid = 0;
+ first_cpu = nr_cpu_ids;

- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
+ if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+ first_cpu = cpu;
break;
+ }
+ }
+
+ if (first_cpu < nr_cpu_ids) {
+ for_each_cpu_and(cpu, per_cpu(cpus_in_cluster, first_cpu),
+ cpumask) {
+ if (!cpumask_test_cpu(cpu, andmask))
+ continue;
+ apicid |= per_cpu(x86_cpu_to_logical_apicid, cpu);
+ }
}

- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return apicid;
+}
+
+static void
+x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_copy(retmask, cpu_possible_mask);
}

static void init_x2apic_ldr(void)
@@ -225,7 +331,7 @@ static struct apic apic_x2apic_cluster = {
.check_apicid_used = NULL,
.check_apicid_present = NULL,

- .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .vector_allocation_domain = x2apic_cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,

.ioapic_phys_id_map = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315..f0ee4a4 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -108,6 +108,13 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_apicid, cpu);
}

+static void
+x2apic_phys_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
static void init_x2apic_ldr(void)
{
}
@@ -137,7 +144,7 @@ static struct apic apic_x2apic_phys = {
.check_apicid_used = NULL,
.check_apicid_present = NULL,

- .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .vector_allocation_domain = x2apic_phys_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,

.ioapic_phys_id_map = NULL,
--
1.7.6.5

--
Regards,
Alexander Gordeev
agordeev [at] redhat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


gorcunov at openvz

May 18, 2012, 7:41 AM

Post #2 of 28 (254 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Fri, May 18, 2012 at 12:26:41PM +0200, Alexander Gordeev wrote:
> Currently x2APIC in logical destination mode delivers interrupts to a
> single CPU, no matter how many CPUs were specified in the destination
> cpumask.
>
> This fix enables delivery of interrupts to multiple CPUs by bit-ORing
> Logical IDs of destination CPUs that have matching Cluster ID.
>
> Because only one cluster could be specified in a message destination
> address, the destination cpumask is tried for a cluster that contains
> maximum number of CPUs matching this cpumask. The CPUs in this cluster
> are selected to receive the interrupts while all other CPUs (in the
> cpumask) are ignored.

Hi Alexander,

I'm a bit confused, we do compose destination id from target cpumask
and send one ipi per _cluster_ with all cpus belonging to this cluster
OR'ed. So if my memory doesn't betray me all specified cpus in cluster
should obtain this message. Thus I'm wondering where do you find that
only one apic obtains ipi? (note i don't have the real physical
machine with x2apic enabled handy to test, so please share
the experience).

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


agordeev at redhat

May 18, 2012, 8:42 AM

Post #3 of 28 (254 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Fri, May 18, 2012 at 06:41:53PM +0400, Cyrill Gorcunov wrote:
> On Fri, May 18, 2012 at 12:26:41PM +0200, Alexander Gordeev wrote:
> > Currently x2APIC in logical destination mode delivers interrupts to a
> > single CPU, no matter how many CPUs were specified in the destination
> > cpumask.
> >
> > This fix enables delivery of interrupts to multiple CPUs by bit-ORing
> > Logical IDs of destination CPUs that have matching Cluster ID.
> >
> > Because only one cluster could be specified in a message destination
> > address, the destination cpumask is tried for a cluster that contains
> > maximum number of CPUs matching this cpumask. The CPUs in this cluster
> > are selected to receive the interrupts while all other CPUs (in the
> > cpumask) are ignored.
>
> Hi Alexander,
>
> I'm a bit confused, we do compose destination id from target cpumask
> and send one ipi per _cluster_ with all cpus belonging to this cluster
> OR'ed. So if my memory doesn't betray me all specified cpus in cluster
> should obtain this message. Thus I'm wondering where do you find that
> only one apic obtains ipi? (note i don't have the real physical
> machine with x2apic enabled handy to test, so please share
> the experience).

Cyrill,

This patchset is not about IPIs at all. It is about interrupts coming from
IO-APICs.

I.e. if you check /proc/interrupts for some IR-IO-APIC you will notice that just
a single (first found) CPU receives the interrupts, even though the
corresponding /proc/irq/<irq#>/smp_affinity would specify multiple CPUs.

Given the current design it is unavoidable in physical destination mode (well,
as long as we do not broadcast, which I did not try yet). But it is well
avoidable in clustered logical addressing mode + lowest priority delivery mode.

I am attaching my debugging patches so you could check actual values of IRTEs.

>
> Cyrill

--
Regards,
Alexander Gordeev
agordeev [at] redhat
Attachments: 0001-x86-io-apic-Dump-IO-APICs-to-sys-kernel-ioapic.patch (6.53 KB)
  0002-x86-intremap-Fix-get_irte-NULL-pointer-assignment.patch (0.80 KB)
  0003-x86-intremap-Dump-IRTEs-to-sys-kernel-intremap.patch (2.63 KB)


gorcunov at openvz

May 18, 2012, 8:51 AM

Post #4 of 28 (255 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Fri, May 18, 2012 at 05:42:37PM +0200, Alexander Gordeev wrote:
> > Hi Alexander,
> >
> > I'm a bit confused, we do compose destination id from target cpumask
> > and send one ipi per _cluster_ with all cpus belonging to this cluster
> > OR'ed. So if my memory doesn't betray me all specified cpus in cluster
> > should obtain this message. Thus I'm wondering where do you find that
> > only one apic obtains ipi? (note i don't have the real physical
> > machine with x2apic enabled handy to test, so please share
> > the experience).
>
> Cyrill,
>
> This patchset is not about IPIs at all. It is about interrupts coming from
> IO-APICs.

Ah, you mean io-apic here. I'll try to find some time tonight/tomorrow-morning
for review. Thanks!

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


gorcunov at openvz

May 19, 2012, 3:47 AM

Post #5 of 28 (264 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Fri, May 18, 2012 at 07:51:19PM +0400, Cyrill Gorcunov wrote:
> On Fri, May 18, 2012 at 05:42:37PM +0200, Alexander Gordeev wrote:
> > > Hi Alexander,
> > >
> > > I'm a bit confused, we do compose destination id from target cpumask
> > > and send one ipi per _cluster_ with all cpus belonging to this cluster
> > > OR'ed. So if my memory doesn't betray me all specified cpus in cluster
> > > should obtain this message. Thus I'm wondering where do you find that
> > > only one apic obtains ipi? (note i don't have the real physical
> > > machine with x2apic enabled handy to test, so please share
> > > the experience).
> >
> > This patchset is not about IPIs at all. It is about interrupts coming from
> > IO-APICs.
>
> Ah, you mean io-apic here. I'll try to find some time tonight/tomorrow-morning
> for review. Thanks!

Sorry for delay, Alexander. I can only review the code (I've no x2apic
testing machine at the moment) and it looks good for me.

Reviewed-by: Cyrill Gorcunov <gorcunov [at] openvz>

p.s. Hope Suresh and Yinghai will take a look too. The thing which bothers
me is that we use LowPrio delivery mode, while there is no lowprio in
x2apic ipi messages and I assume the same should apply to ioapic generated
messages.

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


yinghai at kernel

May 19, 2012, 1:53 PM

Post #6 of 28 (254 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Fri, May 18, 2012 at 3:26 AM, Alexander Gordeev <agordeev [at] redhat> wrote:
> Currently x2APIC in logical destination mode delivers interrupts to a
> single CPU, no matter how many CPUs were specified in the destination
> cpumask.
>
> This fix enables delivery of interrupts to multiple CPUs by bit-ORing
> Logical IDs of destination CPUs that have matching Cluster ID.
>
> Because only one cluster could be specified in a message destination
> address, the destination cpumask is tried for a cluster that contains
> maximum number of CPUs matching this cpumask. The CPUs in this cluster
> are selected to receive the interrupts while all other CPUs (in the
> cpumask) are ignored.
>
> Signed-off-by: Alexander Gordeev <agordeev [at] redhat>
> ---
>  arch/x86/include/asm/x2apic.h         |    9 --
>  arch/x86/kernel/apic/x2apic_cluster.c |  140 +++++++++++++++++++++++++++++----
>  arch/x86/kernel/apic/x2apic_phys.c    |    9 ++-
>  3 files changed, 131 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
> index 92e54ab..7a5a832 100644
> --- a/arch/x86/include/asm/x2apic.h
> +++ b/arch/x86/include/asm/x2apic.h
> @@ -28,15 +28,6 @@ static int x2apic_apic_id_registered(void)
>        return 1;
>  }
>
> -/*
> - * For now each logical cpu is in its own vector allocation domain.
> - */
> -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
> -{
> -       cpumask_clear(retmask);
> -       cpumask_set_cpu(cpu, retmask);
> -}
> -
>  static void
>  __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
>  {
> diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
> index 8f012b2..f8fa4c4 100644
> --- a/arch/x86/kernel/apic/x2apic_cluster.c
> +++ b/arch/x86/kernel/apic/x2apic_cluster.c
> @@ -96,36 +96,142 @@ static void x2apic_send_IPI_all(int vector)
>        __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
>  }
>
> +static inline unsigned int
> +__x2apic_cluster_to_apicid(int cpu_in_cluster, const struct cpumask *cpumask)
> +{
> +       unsigned int apicid = 0;
> +       int cpu;
> +
> +       for_each_cpu_and(cpu, per_cpu(cpus_in_cluster, cpu_in_cluster), cpumask)
> +               apicid |= per_cpu(x86_cpu_to_logical_apicid, cpu);
> +
> +       return apicid;
> +}
> +
> +static int
> +__x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *apicid)
> +{
> +       int ret = 0;
> +       int cpu, heaviest;
> +       unsigned int weight, max_weight;
> +       cpumask_var_t target_cpus, cluster_cpus;
> +
> +       if (unlikely(!alloc_cpumask_var(&target_cpus, GFP_ATOMIC))) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +       if (unlikely(!alloc_cpumask_var(&cluster_cpus, GFP_ATOMIC))) {
> +               ret = -ENOMEM;
> +               goto out_free_target_cpus;
> +       }
> +
> +       cpumask_and(target_cpus, cpumask, cpu_online_mask);
> +       max_weight = 0;
> +
> +       for_each_cpu(cpu, target_cpus) {
> +               cpumask_and(cluster_cpus, per_cpu(cpus_in_cluster, cpu), cpumask);
> +
> +               weight = cpumask_weight(cluster_cpus);
> +               if (weight > max_weight) {
> +                       max_weight = weight;
> +                       heaviest = cpu;
> +               }
> +
> +               cpumask_andnot(target_cpus, target_cpus, cluster_cpus);
> +       }
> +
> +       if (!max_weight) {
> +               ret = -EINVAL;
> +               goto out_free_cluster_cpus;
> +       }
> +
> +       *apicid = __x2apic_cluster_to_apicid(heaviest, cpumask);
> +
> +out_free_cluster_cpus:
> +       free_cpumask_var(cluster_cpus);
> +out_free_target_cpus:
> +       free_cpumask_var(target_cpus);
> +out:
> +       return ret;
> +}
> +
>  static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
>  {
> -       /*
> -        * We're using fixed IRQ delivery, can only return one logical APIC ID.
> -        * May as well be the first.
> -        */
> -       int cpu = cpumask_first(cpumask);
> +       int err;
> +       int cpu;
> +       unsigned int apicid;
>
> -       if ((unsigned)cpu < nr_cpu_ids)
> -               return per_cpu(x86_cpu_to_logical_apicid, cpu);
> -       else
> -               return BAD_APICID;
> +       err = __x2apic_cpu_mask_to_apicid(cpumask, &apicid);
> +       WARN_ON(err);
> +
> +       if (!err)
> +               return apicid;
> +
> +       if (err == -ENOMEM) {
> +               for_each_cpu(cpu, cpumask) {
> +                       if (cpumask_test_cpu(cpu, cpu_online_mask))
> +                               break;
> +               }
> +               if (cpu < nr_cpu_ids)
> +                       return __x2apic_cluster_to_apicid(cpu, cpumask);
> +       }
> +
> +       return BAD_APICID;
>  }
>
>  static unsigned int
>  x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
>                              const struct cpumask *andmask)
>  {
> -       int cpu;
> +       int err;
> +       int cpu, first_cpu;
> +       unsigned int apicid;
> +       cpumask_var_t target_cpus;
> +
> +       if (likely(alloc_cpumask_var(&target_cpus, GFP_ATOMIC))) {
> +               cpumask_and(target_cpus, cpumask, andmask);
> +
> +               err = __x2apic_cpu_mask_to_apicid(target_cpus, &apicid);
> +
> +               free_cpumask_var(target_cpus);
> +
> +               if (!err)
> +                       return apicid;
> +       } else {
> +               err = -ENOMEM;
> +       }
> +
> +       WARN_ON(err);
> +
> +       if (err != -ENOMEM)
> +               return 0;
> +
> +       apicid = 0;
> +       first_cpu = nr_cpu_ids;
>
> -       /*
> -        * We're using fixed IRQ delivery, can only return one logical APIC ID.
> -        * May as well be the first.
> -        */
>        for_each_cpu_and(cpu, cpumask, andmask) {
> -               if (cpumask_test_cpu(cpu, cpu_online_mask))
> +               if (cpumask_test_cpu(cpu, cpu_online_mask)) {
> +                       first_cpu = cpu;
>                        break;
> +               }
> +       }
> +
> +       if (first_cpu < nr_cpu_ids) {
> +               for_each_cpu_and(cpu, per_cpu(cpus_in_cluster, first_cpu),
> +                                cpumask) {
> +                       if (!cpumask_test_cpu(cpu, andmask))
> +                               continue;
> +                       apicid |= per_cpu(x86_cpu_to_logical_apicid, cpu);
> +               }
>        }
>
> -       return per_cpu(x86_cpu_to_logical_apicid, cpu);
> +       return apicid;
> +}
> +
> +static void
> +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> +{
> +       cpumask_copy(retmask, cpu_possible_mask);

why not using per_cpu(cpus_in_cluster, cpu) instead?

also you may add one per cpu var like x86_cpu_to_logical_cluster_apicid.


Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


agordeev at redhat

May 21, 2012, 12:11 AM

Post #7 of 28 (248 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Sat, May 19, 2012 at 02:47:25PM +0400, Cyrill Gorcunov wrote:
> On Fri, May 18, 2012 at 07:51:19PM +0400, Cyrill Gorcunov wrote:

> > Ah, you mean io-apic here. I'll try to find some time tonight/tomorrow-morning
> > for review. Thanks!
>
> Sorry for delay, Alexander. I can only review the code (I've no x2apic
> testing machine at the moment) and it looks good for me.
>
> Reviewed-by: Cyrill Gorcunov <gorcunov [at] openvz>

Thank you, Cyrill.

> p.s. Hope Suresh and Yinghai will take a look too. The thing which bothers
> me is that we use LowPrio delivery mode, while there is no lowprio in
> x2apic ipi messages and I assume the same should apply to ioapic generated
> messages.

My understanding of the specification is LowestPrio is not supported only for
IPIs on x2APIC. Can not imagine why IO-APICs need to be limited here. But let's
hear what the guys think.


--
Regards,
Alexander Gordeev
agordeev [at] redhat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


agordeev at redhat

May 21, 2012, 1:13 AM

Post #8 of 28 (238 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
> > +static void
> > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
> > +{
> > +       cpumask_copy(retmask, cpu_possible_mask);
>
> why not using per_cpu(cpus_in_cluster, cpu) instead?

Because it would lead to suboptimal results when updating IRQ affinity:

int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
unsigned int *dest_id)
{
struct irq_cfg *cfg = data->chip_data;

if (!cpumask_intersects(mask, cpu_online_mask))
return -1;

if (assign_irq_vector(data->irq, data->chip_data, mask))
return -1;

This call ^^^ will update cfg->domain with the value returned by the call to
apic->vector_allocation_domain(). If per_cpu(cpus_in_cluster, cpu) is returned
as cfg->domain here then all other clusters contained in the 'mask' will not
be taken into consideration by the apic->cpu_mask_to_apicid_and() call below.

cpumask_copy(data->affinity, mask);

*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);

So we really need to submit all possible CPUs here ^^^ to be able to find the
best/heaviest cluster out of the 'mask'.

return 0;
}


> also you may add one per cpu var like x86_cpu_to_logical_cluster_apicid.

Both cpu_mask_to_apicid() and cpu_mask_to_apicid_and() take a cpumask to
derive the apicid from. Even though we could cache the value of apicid in
'x86_cpu_to_logical_cluster_apicid' variable, we still would have to unset
CPUs which are not in the requested cpumask. That means scanning through the
cpumask etc -- exactly what the the patch does now.

Or I am missing your point here..

> Yinghai

--
Regards,
Alexander Gordeev
agordeev [at] redhat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


mingo at kernel

May 21, 2012, 1:22 AM

Post #9 of 28 (241 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

* Alexander Gordeev <agordeev [at] redhat> wrote:

> Currently x2APIC in logical destination mode delivers
> interrupts to a single CPU, no matter how many CPUs were
> specified in the destination cpumask.
>
> This fix enables delivery of interrupts to multiple CPUs by
> bit-ORing Logical IDs of destination CPUs that have matching
> Cluster ID.
>
> Because only one cluster could be specified in a message
> destination address, the destination cpumask is tried for a
> cluster that contains maximum number of CPUs matching this
> cpumask. The CPUs in this cluster are selected to receive the
> interrupts while all other CPUs (in the cpumask) are ignored.

I'm wondering how you tested this. AFAICS current irqbalanced
will create masks but on x2apic only the first CPU is targeted
by the kernel.

So, in theory, prior the patch you should be seeing irqs go to
only one CPU, while after the patch they are spread out amongst
the CPU. If it's using LowestPrio delivery then we depend on the
hardware doing this for us - how does this work out in practice,
are the target CPUs round-robin-ed, with a new CPU for every new
IRQ delivered?

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


agordeev at redhat

May 21, 2012, 2:36 AM

Post #10 of 28 (240 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 10:22:40AM +0200, Ingo Molnar wrote:
>
> * Alexander Gordeev <agordeev [at] redhat> wrote:
>
> > Currently x2APIC in logical destination mode delivers
> > interrupts to a single CPU, no matter how many CPUs were
> > specified in the destination cpumask.
> >
> > This fix enables delivery of interrupts to multiple CPUs by
> > bit-ORing Logical IDs of destination CPUs that have matching
> > Cluster ID.
> >
> > Because only one cluster could be specified in a message
> > destination address, the destination cpumask is tried for a
> > cluster that contains maximum number of CPUs matching this
> > cpumask. The CPUs in this cluster are selected to receive the
> > interrupts while all other CPUs (in the cpumask) are ignored.
>
> I'm wondering how you tested this. AFAICS current irqbalanced
> will create masks but on x2apic only the first CPU is targeted
> by the kernel.

Right, that is what this patch is intended to change. So I use:
'hwclock --test' to generate rtc interrupts
/proc/interrupts to check where/how many interrupts were delivered
/proc/irq/8/smp_affinity to check how clusters are chosen

> So, in theory, prior the patch you should be seeing irqs go to
> only one CPU, while after the patch they are spread out amongst
> the CPU. If it's using LowestPrio delivery then we depend on the
> hardware doing this for us - how does this work out in practice,
> are the target CPUs round-robin-ed, with a new CPU for every new
> IRQ delivered?


That is exactly what I can observe.

As for 'target CPUs round-robin-ed' and 'with a new CPU for every new IRQ
delivered' -- that is something we can not control as you noted. Nor do we
care, to my understanding.

I can not commit on every h/w out there obviously, but on my PowerEdge M910
with some half-dozen clusters with six CPU per each, the interrupts are
perfectly balanced among those ones present in IRTEs.


> Thanks,
>
> Ingo

--
Regards,
Alexander Gordeev
agordeev [at] redhat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


gorcunov at openvz

May 21, 2012, 2:46 AM

Post #11 of 28 (240 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 09:11:36AM +0200, Alexander Gordeev wrote:
>
> > p.s. Hope Suresh and Yinghai will take a look too. The thing which bothers
> > me is that we use LowPrio delivery mode, while there is no lowprio in
> > x2apic ipi messages and I assume the same should apply to ioapic generated
> > messages.
>
> My understanding of the specification is LowestPrio is not supported only for
> IPIs on x2APIC. Can not imagine why IO-APICs need to be limited here. But let's
> hear what the guys think.

True. This limitation applies to IPIs only.

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


mingo at kernel

May 21, 2012, 5:40 AM

Post #12 of 28 (243 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

* Alexander Gordeev <agordeev [at] redhat> wrote:

> > So, in theory, prior the patch you should be seeing irqs go
> > to only one CPU, while after the patch they are spread out
> > amongst the CPU. If it's using LowestPrio delivery then we
> > depend on the hardware doing this for us - how does this
> > work out in practice, are the target CPUs round-robin-ed,
> > with a new CPU for every new IRQ delivered?
>
> That is exactly what I can observe.
>
> As of 'target CPUs round-robin-ed' and 'with a new CPU for
> every new IRQ delivered' -- that is something we can not
> control as you noted. Nor do we care to my understanding.
>
> I can not commit on every h/w out there obviously, but on my
> PowerEdge M910 with some half-dozen clusters with six CPU per
> each, the interrupts are perfectly balanced among those ones
> present in IRTEs.

But that is not 'perfectly balanced' in many cases.

When the hardware round-robins the interrupts then each
interrupt will go to a 'cache cold' CPU in essence. This is
pretty much the worst possible thing to do in most cases:
while it's "perfectly balanced" in the sense of distributing
cycles evenly between CPUs, each interrupt handler execution
will generate an avalanche of cache misses, for cachelines that
were modified in the previous invocation of the irq.

One notable exception is when the CPUs are SMT/Hyperthreading
siblings, in that case they are sharing even the L1 cache, so
there's very little cost to round-robining the IRQs within the
CPU mask.

But AFAICS irqbalanced will spread irqs on wider masks than SMT
sibling boundaries, exposing us to the above performance
problem.

So I think we need to tread carefully here.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


agordeev at redhat

May 21, 2012, 7:48 AM

Post #13 of 28 (243 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 02:40:26PM +0200, Ingo Molnar wrote:
> But that is not 'perfectly balanced' in many cases.
>
> When the hardware round-robins the interrupts then each
> interrupt will go to a 'cache cold' CPU in essence. This is
> pretty much the worst possible thing to do in most cases:
> while it's "perfectly balanced" in the sense of distributing
> cycles evenly between CPUs, each interrupt handler execution
> will generate an avalanche of cache misses, for cachelines that
> were modified in the previous invocation of the irq.

Absolutely.

There are at least two more offenders :) exercising lowest priority + logical
addressing similarly. So in this regard the patch is nothing new:


static inline unsigned int
default_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
}

static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
unsigned int round = 0;
int cpu, apicid = 0;

/*
* The cpus in the mask must all be on the apic cluster.
*/
for_each_cpu(cpu, cpumask) {
int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);

if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
printk("%s: Not a valid mask!\n", __func__);
return BAD_APICID;
}
apicid |= new_apicid;
round++;
}
return apicid;
}

> One notable exception is when the CPUs are SMT/Hyperthreading
> siblings, in that case they are sharing even the L1 cache, so
> there's very little cost to round-robining the IRQs within the
> CPU mask.
>
> But AFAICS irqbalanced will spread irqs on wider masks than SMT
> sibling boundaries, exposing us to the above performance
> problem.

I would speculate it is irqbalanced who should be (in case of x86) cluster-
agnostic and ask for a mask while the apic layer is just execute or at least
report what was set. But that is a different topic.

Considering a bigger picture, it appears strange to me that apic is the layer
to take decision whether to make CPU a target or not. It is especially true
when one means a particular cpumask, wants it to be set, but still is not able
to do that due to the current limitation.

> So I think we need to tread carefully here.

Kernel parameter? IRQ line flag? Totally opposed? :)

> Thanks,

>
> Ingo

--
Regards,
Alexander Gordeev
agordeev [at] redhat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


mingo at kernel

May 21, 2012, 7:59 AM

Post #14 of 28 (244 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

* Alexander Gordeev <agordeev [at] redhat> wrote:

> > So I think we need to tread carefully here.
>
> Kernel parameter? IRQ line flag? Totally opposed? :)

Undecided and sceptical, because: -ENONUMBERS :-)

The kernel actually tries to have as much locality as we can
hide from the user.

For example we don't execute tasks for 100 usecs on one CPU,
then jump to another CPU and execute 100 usecs there, then to
yet another CPU to create an 'absolutely balanced use of CPU
resources'. Why? Because the cache-misses would be killing us.

When a device is generating ten thousand irqs a second then
round-robining the IRQs is equivalent to switching a CPU every
100 usecs.

There's another cost: tasks tend to try to migrate to sources of
wakeups. So if an IRQ wakes up a task then we will try to
execute the task locally, to have locality of access between the
data that the IRQ handler has touched, and the task that makes
use of it. Round-robining the IRQ in that situations means that
the task either follows the IRQ and round-robins (bad), or that
it stays remote to the IRQ data (not good either).

So, I'm sceptical :-/

Your other two patches looked sensible - will they apply fine
after each other, without the round-robin patch which is still
under discussion, or is there a dependency?

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


agordeev at redhat

May 21, 2012, 8:22 AM

Post #15 of 28 (250 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 04:59:04PM +0200, Ingo Molnar wrote:
>
> * Alexander Gordeev <agordeev [at] redhat> wrote:
> Your other two patches looked sensible - will they apply fine
> after each other, without the round-robin patch which is still
> under discussion, or is there a dependency?

1/3 will apply cleanly; not worth worrying though -- whitespace oneliner
3/3 depends on 2/3 round-robin patch, and doubtful :)

> Thanks,
>
> Ingo

--
Regards,
Alexander Gordeev
agordeev [at] redhat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


gorcunov at openvz

May 21, 2012, 8:34 AM

Post #16 of 28 (244 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 04:59:04PM +0200, Ingo Molnar wrote:
>
> When a device is generating ten thousand irqs a second then
> round-robining the IRQs is equivalent to switching a CPU every
> 100 usecs.
>

As far as I remember it's even more complex: with lowest priority it
might not be a completely fair round-robin, since if a focused
processor is found then it will be assigned to handle the incoming
irq again.

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


torvalds at linux-foundation

May 21, 2012, 8:36 AM

Post #17 of 28 (243 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 7:59 AM, Ingo Molnar <mingo [at] kernel> wrote:
>
> For example we don't execute tasks for 100 usecs on one CPU,
> then jump to another CPU and execute 100 usecs there, then to
> yet another CPU to create an 'absolutely balanced use of CPU
> resources'. Why? Because the cache-misses would be killing us.

That is likely generally not true within a single socket, though.

Interrupt handlers will basically never hit in the L1 anyway (*maybe*
it happens if the CPU is totally idle, but quite frankly, I doubt it).
Even the L2 is likely not large enough to have much cache across irqs,
unless it's one of the big Core 2 L2's that are largely shared per
socket anyway.

So it may well make perfect sense to allow a mask of CPU's for
interrupt delivery, but just make sure that the mask all points to
CPU's on the same socket. That would give the hardware some leeway in
choosing the actual core - it's very possible that hardware could
avoid cores that are running with irq's disabled (possibly improving
latency) or even more likely - avoid cores that are in deeper
powersaving modes.

Avoiding waking up CPU's that are in C6 would not only help latency,
it would help power use. I don't know how well the irq handling
actually works on a hw level, but that's exactly the kind of thing I
would expect HW to do well (and sw would do badly, because the
latencies for things like CPU power states are low enough that trying
to do SW irq balancing at that level is entirely and completely
idiotic).

So I do think that we should aim for *allowing* hardware to do these
kinds of choices for us. Limiting irq delivery to a particular core is
very limiting for very little gain (almost no cache benefits), but
limiting it to a particular socket could certainly be a valid thing.
You might want to limit it to a particular socket anyway, just because
the hardware itself may well be closer to one socket (coming off the
PCIe lanes of that particular socket) than anything else.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


suresh.b.siddha at intel

May 21, 2012, 11:07 AM

Post #18 of 28 (247 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, 2012-05-21 at 08:36 -0700, Linus Torvalds wrote:
> On Mon, May 21, 2012 at 7:59 AM, Ingo Molnar <mingo [at] kernel> wrote:
> >
> > For example we don't execute tasks for 100 usecs on one CPU,
> > then jump to another CPU and execute 100 usecs there, then to
> > yet another CPU to create an 'absolutely balanced use of CPU
> > resources'. Why? Because the cache-misses would be killing us.
>
> That is likely generally not true within a single socket, though.
>
> Interrupt handlers will basically never hit in the L1 anyway (*maybe*
> it happens if the CPU is totally idle, but quite frankly, I doubt it).
> Even the L2 is likely not large enough to have much cache across irqs,
> unless it's one of the big Core 2 L2's that are largely shared per
> socket anyway.
>
> So it may well make perfect sense to allow a mask of CPU's for
> interrupt delivery, but just make sure that the mask all points to
> CPU's on the same socket.

All the cluster members of a given x2apic cluster belong to the same
package. These x2apic cluster id's are setup by the HW and not by the
SW. And only one cluster (with one or multiple members of that cluster
set) can be specified in the interrupt destination field of the routing
table entry.

> That would give the hardware some leeway in
> choosing the actual core - it's very possible that hardware could
> avoid cores that are running with irq's disabled (possibly improving
> latency) or even more likely - avoid cores that are in deeper
> powersaving modes.

Power aware interrupt routing in IVB does this. And the policy of
whether you want the interrupt to be routed to the busy core (to save
power) or an idle core (for minimizing the interruptions on the busy
core) can be selected by the SW (using IA32_ENERGY_PERF_BIAS MSR).

thanks,
suresh

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


torvalds at linux-foundation

May 21, 2012, 11:18 AM

Post #19 of 28 (240 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 11:07 AM, Suresh Siddha
<suresh.b.siddha [at] intel> wrote:
>
> All the cluster members of a given x2apic cluster belong to the same
> package. These x2apic cluster id's are setup by the HW and not by the
> SW. And only one cluster (with one or multiple members of that cluster
> set) can be specified in the interrupt destination field of the routing
> table entry.

Ok, then the main question ends up being if there are enough cache or
power domains within a cluster to still worry about it.

For example, you say "package", but that can sometimes mean multiple
dies, or even just split caches that are big enough to matter
(although I can't think of any such right now on the x86 side - Core2
Duo had huge L2's, but they were shared, not split).

> Power aware interrupt routing in IVB does this. And the policy of
> whether you want the interrupt to be routed to the busy core (to save
> power) or an idle core (for minimizing the interruptions on the busy
> core) can be selected by the SW (using IA32_ENERGY_PERF_BIAS MSR).

Sounds like we definitely would want to support this at least in the
IVB timeframe then.

But I do agree with Ingo that it would be really good to actually see
numbers (and no, I don't mean "look here, now the irq's are nicely
spread out", but power and/or performance numbers showing that it
actually helps something).

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


suresh.b.siddha at intel

May 21, 2012, 11:37 AM

Post #20 of 28 (255 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, 2012-05-21 at 11:18 -0700, Linus Torvalds wrote:
> On Mon, May 21, 2012 at 11:07 AM, Suresh Siddha
> <suresh.b.siddha [at] intel> wrote:
> >
> > All the cluster members of a given x2apic cluster belong to the same
> > package. These x2apic cluster id's are setup by the HW and not by the
> > SW. And only one cluster (with one or multiple members of that cluster
> > set) can be specified in the interrupt destination field of the routing
> > table entry.
>
> Ok, then the main question ends up being if there are enough cache or
> power domains within a cluster to still worry about it.

There are 16 members with in a x2apic cluster. With two HT siblings,
that will still leave 8-cores.

>
> For example, you say "package", but that can sometimes mean multiple
> dies, or even just split caches that are big enough to matter
> (although I can't think of any such right now on the x86 side - Core2
> Duo had huge L2's, but they were shared, not split).

Most likely multiple dies or split caches will have different
cluster-id's. I don't know of any upcoming implementations that will
have such an implementation supporting x2apic, but will keep an eye.

>
> > Power aware interrupt routing in IVB does this. And the policy of
> > whether you want the interrupt to be routed to the busy core (to save
> > power) or an idle core (for minimizing the interruptions on the busy
> > core) can be selected by the SW (using IA32_ENERGY_PERF_BIAS MSR).
>
> Sounds like we definitely would want to support this at least in the
> IVB timeframe then.
>
> But I do agree with Ingo that it would be really good to actually see
> numbers (and no, I don't mean "look here, now the irq's are nicely
> spread out", but power and/or performance numbers showing that it
> actually helps something).

I agree. This is the reason why I held up posting these patches before.
I can come up with micro-benchmarks that can show some difference but
the key is to find good workload/benchmark that can show measurable
difference. Any suggestions?

thanks,
suresh

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


mingo at kernel

May 21, 2012, 12:15 PM

Post #21 of 28 (256 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

* Linus Torvalds <torvalds [at] linux-foundation> wrote:

> On Mon, May 21, 2012 at 7:59 AM, Ingo Molnar <mingo [at] kernel> wrote:
> >
> > For example we don't execute tasks for 100 usecs on one CPU,
> > then jump to another CPU and execute 100 usecs there, then
> > to yet another CPU to create an 'absolutely balanced use of
> > CPU resources'. Why? Because the cache-misses would be
> > killing us.
>
> That is likely generally not true within a single socket,
> though.
>
> Interrupt handlers will basically never hit in the L1 anyway
> (*maybe* it happens if the CPU is totally idle, but quite
> frankly, I doubt it). Even the L2 is likely not large enough
> to have much cache across irqs, unless it's one of the big
> Core 2 L2's that are largely shared per socket anyway.

Indeed, you are right as far as the L1 cache is concerned:

For networking we have about 180 L1 misses per IRQ handler
invocation, if the L1 cache is cold - so most IRQ handlers
easily fit in the L1 cache. I have measured this using a
constant rate of networking IRQs and doing:

perf stat -a -C 0 --repeat 10 -e L1-dcache-load-misses:k sleep 1

It only takes another 180 cache misses for these lines to get
evicted, and this happens very easily:

On the same testsystem a parallel kernel build will evict about
25 million L1 cachelines/sec/CPU. That means that an IRQ
handler's working set in the L1 cache is indeed gone in less
than 8 microseconds.

When the workload is RAM-bound then the L1 working set of an IRQ
handler is gone in about 80 usecs. That corresponds to an IRQ
rate of about 12,500/sec/CPU: if IRQs are coming in faster than
this then they can still see some of the previous execution's
footprint in the cache.

Wrt. the L2 cache the numbers come in much more in favor of not
moving IRQ handlers across L2 cache domains - this means that
allowing the hardware to distribute them per socket is a pretty
sensible default.

> So it may well make perfect sense to allow a mask of CPU's for
> interrupt delivery, but just make sure that the mask all
> points to CPU's on the same socket. That would give the
> hardware some leeway in choosing the actual core - it's very
> possible that hardware could avoid cores that are running with
> irq's disabled (possibly improving latency) or even more likely
> - avoid cores that are in deeper powersaving modes.

Indeed, and that's an important argument.

The one negative effect I mentioned, affine wakeups done by the
scheduler, could still bite us - this has to be measured and
affine wakeups have to be made less prominent if IRQ handlers
start jumping around. We definitely don't want tasks to follow
round-robin IRQs around.

> Avoiding waking up CPU's that are in C6 would not only help
> latency, it would help power use. I don't know how well the
> irq handling actually works on a hw level, but that's exactly
> the kind of thing I would expect HW to do well (and sw would
> do badly, because the latencies for things like CPU power
> states are low enough that trying to do SW irq balancing at
> that level is entirely and completely idiotic).
>
> So I do think that we should aim for *allowing* hardware to do
> these kinds of choices for us. Limiting irq delivery to a
> particular core is very limiting for very little gain (almost
> no cache benefits), but limiting it to a particular socket
> could certainly be a valid thing. You might want to limit it
> to a particular socket anyway, just because the hardware
> itself may well be closer to one socket (coming off the PCIe
> lanes of that particular socket) than anything else.

Ok, I'm convinced.

Limiting to a socket is I suspect an important constraint: we
shouldn't just feed the hardware whatever mask user-space sends
us, user-space might not be aware of (or confused about) socket
boundaries.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


mingo at kernel

May 21, 2012, 12:30 PM

Post #22 of 28 (246 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

* Suresh Siddha <suresh.b.siddha [at] intel> wrote:

> > But I do agree with Ingo that it would be really good to
> > actually see numbers (and no, I don't mean "look here, now
> > the irq's are nicely spread out", but power and/or
> > performance numbers showing that it actually helps
> > something).
>
> I agree. This is the reason why I held up posting these
> patches before. I can come up with micro-benchmarks that can
> show some difference but the key is to find good
> workload/benchmark that can show measurable difference. Any
> suggestions?

It's rather difficult to measure this reliably. The main
complication is the inherent noise of cache stats on SMP/NUMA
systems, which all modern multi-socket systems are ...

But, since you asked, if you can generate a *very* precise
incoming external IRQ rate, it's possible:

Generate say 10,000 irqs/sec of a workload directed at a single
CPU - something like multiple copies of ping -i 0.001 -q
executed on a nearby system might do.

Then run a user-space cycle soaker, nice -19 running NOPs on all
CPUs. It's important that it *only* a user-space infinite loop,
with no kernel instructions executed at all - see later.

Then play around with variants of:

perf stat -a --repeat 10 -e cycles:u -e instructions:u sleep 1

this will tell you the number of user-space cycles and
instructions executed, per second. The ':u' attribute to limit
to user-space cycles filters apart the IRQ handler overhead from
your user-space cycle soaker.

This number of 'available user-space performance' should not get
worse when you switch from single-CPU APIC target to a
hardware-round-robin target mask. You can switch the mask using
/proc/irq/nr/smp_affinity with very low overhead, while all the
above measurements are running - this allows you to see how
user-space throughput reacts to the IRQ details.

Double check that the irq rate is constant, via 'vmstat 1'.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


suresh.b.siddha at intel

May 21, 2012, 12:56 PM

Post #23 of 28 (242 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, 2012-05-21 at 21:15 +0200, Ingo Molnar wrote:
> The one negative effect I mentioned, affine wakeups done by the
> scheduler, could still bite us - this has to be measured and
> affine wakeups have to be made less prominent if IRQ handlers
> start jumping around. We definitely don't want tasks to follow
> round-robin IRQs around.

Actually this (ping-ponging around idle cpu's in a socket) happens
already today, as the affine wakeups use select_idle_sibling() to wake
up the task on an idle sibling (HT or core sibling) if the cpu on which
the task is woken-up is busy with something else.

But I agree, somehow all these need to work together. For example to
save power, what we want is the idle core to stay in idle with least
number of interruptions, with interrupts routed to busy cores, scheduler
not aggressively waking the tasks up on idle cores etc.

One quick thought is to use the cpufreq governor decisions and if all
the cpu's in the package are in lower p-states, then we can default for
power-saving decisions, otherwise if any cpu is at P0, switch for
performance policy(in terms of irq routing, scheduler wake-up decisions
etc) dynamically etc.

thanks,
suresh



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


yinghai at kernel

May 21, 2012, 4:02 PM

Post #24 of 28 (241 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 1:13 AM, Alexander Gordeev <agordeev [at] redhat> wrote:
> On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
>> > +static void
>> > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
>> > +{
>> > +       cpumask_copy(retmask, cpu_possible_mask);
>>
>> why not using per_cpu(cpus_in_cluster, cpu) instead?
>
> Because it would lead to suboptimal results when updating IRQ affinity:
>
> int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
>                          unsigned int *dest_id)
> {
>        struct irq_cfg *cfg = data->chip_data;
>
>        if (!cpumask_intersects(mask, cpu_online_mask))
>                return -1;
>
>        if (assign_irq_vector(data->irq, data->chip_data, mask))
>                return -1;
>
> This call ^^^ will update cfg->domain with the value returned by the call to
> apic->vector_allocation_domain(). If per_cpu(cpus_in_cluster, cpu) is returned
> as cfg->domain here then all other clusters contained in the 'mask' will not
> be taken into consideration by the apic->cpu_mask_to_apicid_and() call below.
>
>        cpumask_copy(data->affinity, mask);
>
>        *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
>
> So we really need to submit all possible CPUs here ^^^ to be able finding the
> best/heaviest cluster out of the 'mask'.

ok.

>
>        return 0;
> }
>
>
>> also you may add one per cpu var like x86_cpu_to_logical_cluster_apicid.
>
> Both cpu_mask_to_apicid() and cpu_mask_to_apicid_and() take a cpumask to
> derive the apicid from. Even though we could cache the value of apicid in
> 'x86_cpu_to_logical_cluster_apicid' variable, we still would have to unset
> CPUs which are not in the requested cpumask. That means scanning through the
> cpumask etc -- exactly what the the patch does now.

ok, i got it. thanks for the explanation.

I was thinking: pick one cluster from the cpumask that the user wants, and
then just use all cpus in that cluster as dest cpus.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


yinghai at kernel

May 21, 2012, 4:33 PM

Post #25 of 28 (245 views)
Permalink
Re: [PATCH 2/3] x86: x2apic/cluster: Make use of lowest priority delivery mode [In reply to]

On Mon, May 21, 2012 at 4:02 PM, Yinghai Lu <yinghai [at] kernel> wrote:
> On Mon, May 21, 2012 at 1:13 AM, Alexander Gordeev <agordeev [at] redhat> wrote:
>> On Sat, May 19, 2012 at 01:53:36PM -0700, Yinghai Lu wrote:
>>> > +static void
>>> > +x2apic_cluster_vector_allocation_domain(int cpu, struct cpumask *retmask)
>>> > +{
>>> > +       cpumask_copy(retmask, cpu_possible_mask);
>>>
>>> why not using per_cpu(cpus_in_cluster, cpu) instead?
>>
>> Because it would lead to suboptimal results when updating IRQ affinity:
>>
>> int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
>>                          unsigned int *dest_id)
>> {
>>        struct irq_cfg *cfg = data->chip_data;
>>
>>        if (!cpumask_intersects(mask, cpu_online_mask))
>>                return -1;
>>
>>        if (assign_irq_vector(data->irq, data->chip_data, mask))
>>                return -1;
>>
>> This call ^^^ will update cfg->domain with the value returned by the call to
>> apic->vector_allocation_domain(). If per_cpu(cpus_in_cluster, cpu) is returned
>> as cfg->domain here then all other clusters contained in the 'mask' will not
>> be taken into consideration by the apic->cpu_mask_to_apicid_and() call below.
>>
>>        cpumask_copy(data->affinity, mask);
>>
>>        *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
>>
>> So we really need to submit all possible CPUs here ^^^ to be able finding the
>> best/heaviest cluster out of the 'mask'.
>
> ok.

Maybe you can not simply to change that to possible_cpu_mask.

otherwise assign_irq_vector will try to get same vector for all cpus
for one irq.

then will make x2apic cluster mode will one support 224 irqs instead.

so you need to make vector_allocation_domain() to be less cpu set.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.