Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints

 

 

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded


peter.p.waskiewicz.jr at intel

Nov 22, 2009, 10:46 PM

Post #1 of 35 (605 views)
Permalink
[PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints

This patchset adds a new CPU mask for SMP systems to the irq_desc
struct. It also exposes an API for underlying device drivers to
assist irqbalance in making smarter decisions when balancing, especially
in a NUMA environment. For example, an ethernet driver with MSI-X may
wish to limit the CPUs that an interrupt can be balanced within to
stay on a single NUMA node. Current irqbalance operation can move the
interrupt off the node, resulting in cross-node memory accesses and
locks.

The API is a get/set API within the kernel, along with a /proc entry
for the interrupt.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr [at] intel>
---

include/linux/interrupt.h | 8 ++++++
include/linux/irq.h | 2 ++
kernel/irq/manage.c | 32 +++++++++++++++++++++++++
kernel/irq/proc.c | 57 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 99 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9fd08aa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -208,6 +208,8 @@ extern cpumask_var_t irq_default_affinity;
extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
+extern int irq_set_node_affinity(unsigned int irq,
+ const struct cpumask *cpumask);

#else /* CONFIG_SMP */

@@ -223,6 +225,12 @@ static inline int irq_can_set_affinity(unsigned int irq)

static inline int irq_select_affinity(unsigned int irq) { return 0; }

+static inline int irq_set_node_affinity(unsigned int irq,
+ const struct cpumask *m)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */

#ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ae9653d..26d7d07 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -166,6 +166,7 @@ struct irq_2_iommu;
* @lock: locking for SMP
* @affinity: IRQ affinity on SMP
* @node: node index useful for balancing
+ * @node_affinity: irq mask hints for irqbalance
* @pending_mask: pending rebalanced interrupts
* @threads_active: number of irqaction threads currently running
* @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
@@ -196,6 +197,7 @@ struct irq_desc {
#ifdef CONFIG_SMP
cpumask_var_t affinity;
unsigned int node;
+ cpumask_var_t node_affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_var_t pending_mask;
#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b29..9e80783 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,38 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
return 0;
}

+/**
+ * irq_set_node_affinity - Set the CPU mask this interrupt can run on
+ * @irq: Interrupt to modify
+ * @cpumask: CPU mask to assign to the interrupt
+ *
+ */
+int irq_set_node_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ cpumask_copy(desc->node_affinity, cpumask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(irq_set_node_affinity);
+
+/**
+ * irq_get_node_affinity - Get the CPU mask this interrupt can run on
+ * @irq: Interrupt to get information
+ *
+ */
+struct cpumask *irq_get_node_affinity(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc->node_affinity;
+}
+EXPORT_SYMBOL(irq_get_node_affinity);
+
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145..192e3fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -31,6 +31,16 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
return 0;
}

+static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ const struct cpumask *mask = desc->node_affinity;
+
+ seq_cpumask(m, mask);
+ seq_putc(m, '\n');
+ return 0;
+}
+
#ifndef is_affinity_mask_valid
#define is_affinity_mask_valid(val) 1
#endif
@@ -78,11 +88,46 @@ free_cpumask:
return err;
}

+static ssize_t irq_node_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ cpumask_var_t new_value;
+ int err;
+
+ if (no_irq_affinity || irq_balancing_disabled(irq))
+ return -EIO;
+
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ goto free_cpumask;
+
+ if (!is_affinity_mask_valid(new_value)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
+ irq_set_node_affinity(irq, new_value);
+ err = count;
+
+free_cpumask:
+ free_cpumask_var(new_value);
+ return err;
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}

+static int irq_node_affinity_proc_open(struct inode *inode, struct file *f)
+{
+ return single_open(f, irq_node_affinity_proc_show, PDE(inode)->data);
+}
+
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
@@ -91,6 +136,14 @@ static const struct file_operations irq_affinity_proc_fops = {
.write = irq_affinity_proc_write,
};

+static const struct file_operations irq_node_affinity_proc_fops = {
+ .open = irq_node_affinity_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_node_affinity_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -230,6 +283,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0600, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
+
+ /* create /proc/irq/<irq>/node_affinity */
+ proc_create_data("node_affinity", 0600, desc->dir,
+ &irq_node_affinity_proc_fops, (void *)(long)irq);
#endif

proc_create_data("spurious", 0444, desc->dir,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 22, 2009, 11:12 PM

Post #2 of 35 (596 views)
Permalink
[PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

This patchset adds a new CPU mask for SMP systems to the irq_desc
struct. It also exposes an API for underlying device drivers to
assist irqbalance in making smarter decisions when balancing, especially
in a NUMA environment. For example, an ethernet driver with MSI-X may
wish to limit the CPUs that an interrupt can be balanced within to
stay on a single NUMA node. Current irqbalance operation can move the
interrupt off the node, resulting in cross-node memory accesses and
locks.

The API is a get/set API within the kernel, along with a /proc entry
for the interrupt.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr [at] intel>
---

include/linux/interrupt.h | 8 ++++++
include/linux/irq.h | 2 ++
kernel/irq/manage.c | 32 +++++++++++++++++++++++++
kernel/irq/proc.c | 57 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 99 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9fd08aa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -208,6 +208,8 @@ extern cpumask_var_t irq_default_affinity;
extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
+extern int irq_set_node_affinity(unsigned int irq,
+ const struct cpumask *cpumask);

#else /* CONFIG_SMP */

@@ -223,6 +225,12 @@ static inline int irq_can_set_affinity(unsigned int irq)

static inline int irq_select_affinity(unsigned int irq) { return 0; }

+static inline int irq_set_node_affinity(unsigned int irq,
+ const struct cpumask *m)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */

#ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ae9653d..26d7d07 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -166,6 +166,7 @@ struct irq_2_iommu;
* @lock: locking for SMP
* @affinity: IRQ affinity on SMP
* @node: node index useful for balancing
+ * @node_affinity: irq mask hints for irqbalance
* @pending_mask: pending rebalanced interrupts
* @threads_active: number of irqaction threads currently running
* @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
@@ -196,6 +197,7 @@ struct irq_desc {
#ifdef CONFIG_SMP
cpumask_var_t affinity;
unsigned int node;
+ cpumask_var_t node_affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_var_t pending_mask;
#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b29..9e80783 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,38 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
return 0;
}

+/**
+ * irq_set_node_affinity - Set the CPU mask this interrupt can run on
+ * @irq: Interrupt to modify
+ * @cpumask: CPU mask to assign to the interrupt
+ *
+ */
+int irq_set_node_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ cpumask_copy(desc->node_affinity, cpumask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(irq_set_node_affinity);
+
+/**
+ * irq_get_node_affinity - Get the CPU mask this interrupt can run on
+ * @irq: Interrupt to get information
+ *
+ */
+struct cpumask *irq_get_node_affinity(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc->node_affinity;
+}
+EXPORT_SYMBOL(irq_get_node_affinity);
+
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145..192e3fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -31,6 +31,16 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
return 0;
}

+static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ const struct cpumask *mask = desc->node_affinity;
+
+ seq_cpumask(m, mask);
+ seq_putc(m, '\n');
+ return 0;
+}
+
#ifndef is_affinity_mask_valid
#define is_affinity_mask_valid(val) 1
#endif
@@ -78,11 +88,46 @@ free_cpumask:
return err;
}

+static ssize_t irq_node_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ cpumask_var_t new_value;
+ int err;
+
+ if (no_irq_affinity || irq_balancing_disabled(irq))
+ return -EIO;
+
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ goto free_cpumask;
+
+ if (!is_affinity_mask_valid(new_value)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
+ irq_set_node_affinity(irq, new_value);
+ err = count;
+
+free_cpumask:
+ free_cpumask_var(new_value);
+ return err;
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}

+static int irq_node_affinity_proc_open(struct inode *inode, struct file *f)
+{
+ return single_open(f, irq_node_affinity_proc_show, PDE(inode)->data);
+}
+
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
@@ -91,6 +136,14 @@ static const struct file_operations irq_affinity_proc_fops = {
.write = irq_affinity_proc_write,
};

+static const struct file_operations irq_node_affinity_proc_fops = {
+ .open = irq_node_affinity_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_node_affinity_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -230,6 +283,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
/* create /proc/irq/<irq>/smp_affinity */
proc_create_data("smp_affinity", 0600, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
+
+ /* create /proc/irq/<irq>/node_affinity */
+ proc_create_data("node_affinity", 0600, desc->dir,
+ &irq_node_affinity_proc_fops, (void *)(long)irq);
#endif

proc_create_data("spurious", 0444, desc->dir,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


yong.zhang0 at gmail

Nov 22, 2009, 11:32 PM

Post #3 of 35 (599 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Mon, Nov 23, 2009 at 2:46 PM, Peter P Waskiewicz Jr
<peter.p.waskiewicz.jr [at] intel> wrote:
> This patchset adds a new CPU mask for SMP systems to the irq_desc
> struct.  It also exposes an API for underlying device drivers to
> assist irqbalance in making smarter decisions when balancing, especially
> in a NUMA environment.  For example, an ethernet driver with MSI-X may
> wish to limit the CPUs that an interrupt can be balanced within to
> stay on a single NUMA node.  Current irqbalance operation can move the
> interrupt off the node, resulting in cross-node memory accesses and
> locks.
>
> The API is a get/set API within the kernel, along with a /proc entry
> for the interrupt.
>
> Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr [at] intel>
> ---

1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
node_affinity.
2) It seems like this patch can't work with SPARSE_IRQ.

Thanks,
Yong
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 23, 2009, 1:36 AM

Post #4 of 35 (592 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Sun, 2009-11-22 at 23:32 -0800, Yong Zhang wrote:
> On Mon, Nov 23, 2009 at 2:46 PM, Peter P Waskiewicz Jr
> <peter.p.waskiewicz.jr [at] intel> wrote:
> > This patchset adds a new CPU mask for SMP systems to the irq_desc
> > struct. It also exposes an API for underlying device drivers to
> > assist irqbalance in making smarter decisions when balancing, especially
> > in a NUMA environment. For example, an ethernet driver with MSI-X may
> > wish to limit the CPUs that an interrupt can be balanced within to
> > stay on a single NUMA node. Current irqbalance operation can move the
> > interrupt off the node, resulting in cross-node memory accesses and
> > locks.
> >
> > The API is a get/set API within the kernel, along with a /proc entry
> > for the interrupt.
> >
> > Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr [at] intel>
> > ---
>
> 1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
> node_affinity.
> 2) It seems like this patch can't work with SPARSE_IRQ.

This mechanism isn't going to be used by any internal kernel mechanism
for determining interrupt placement or operation. It's purely something
that either a driver can modify, or external script (through /proc),
that irqbalance will make use of. If irqbalance isn't running, or the
current version of irqbalance doesn't support reading node_affinity,
then it won't affect the system's operation.

If irqbalance does support it, it'll read whatever the supplied mask is,
and then will try and balance interrupts within that mask. It will bail
if the mask is invalid, or won't apply to the running system, just like
how putting a bogus mask into smp_affinity is ignored.

If there's something I'm missing beyond this with the two suggestions
you've made (I looked into those two parameters and tried to draw
conclusions), please let me know.

Cheers,
-PJ Waskiewicz

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

Nov 23, 2009, 9:05 AM

Post #5 of 35 (584 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:

> This mechanism isn't going to be used by any internal kernel mechanism
> for determining interrupt placement or operation. It's purely something
> that either a driver can modify, or external script (through /proc),
> that irqbalance will make use of. If irqbalance isn't running, or the
> current version of irqbalance doesn't support reading node_affinity,
> then it won't affect the system's operation.
>
> If irqbalance does support it, it'll read whatever the supplied mask is,
> and then will try and balance interrupts within that mask. It will bail
> if the mask is invalid, or won't apply to the running system, just like
> how putting a bogus mask into smp_affinity is ignored.
>
> If there's something I'm missing beyond this with the two suggestions
> you've made (I looked into those two parameters and tried to draw
> conclusions), please let me know.

I don't see the point in adding it, if the driver wants to set a node
cpu mask it can already do that using the regular smp affinity settings.

Same for userspace.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 23, 2009, 3:32 PM

Post #6 of 35 (584 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Mon, 23 Nov 2009, Peter Zijlstra wrote:

> On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:
>
> > This mechanism isn't going to be used by any internal kernel mechanism
> > for determining interrupt placement or operation. It's purely something
> > that either a driver can modify, or external script (through /proc),
> > that irqbalance will make use of. If irqbalance isn't running, or the
> > current version of irqbalance doesn't support reading node_affinity,
> > then it won't affect the system's operation.
> >
> > If irqbalance does support it, it'll read whatever the supplied mask is,
> > and then will try and balance interrupts within that mask. It will bail
> > if the mask is invalid, or won't apply to the running system, just like
> > how putting a bogus mask into smp_affinity is ignored.
> >
> > If there's something I'm missing beyond this with the two suggestions
> > you've made (I looked into those two parameters and tried to draw
> > conclusions), please let me know.
>
> I don't see the point in adding it, if the driver wants to set a node
> cpu mask it can already do that using the regular smp affinity settings.

Unfortunately, a driver can't. The irq_set_affinity() function isn't
exported. I proposed a patch on netdev to export it, and then to tie down
an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it. That
was rejected, since the driver is enforcing policy of the interrupt
balancing, not irqbalance.

I and Jesse Brandeburg had a meeting with Arjan about this. What we came
up with was this interface, so drivers can set what they'd like to see, if
irqbalance decides to honor it. That way interrupt affinity policies are
set only by irqbalance, but this interface gives us a mechanism to hint to
irqbalance what we'd like it to do.

Also, if you use the /proc interface to change smp_affinity on an
interrupt without any of these changes, irqbalance will override it on its
next poll interval. This also is not desirable.

Cheers,
-PJ
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


yong.zhang0 at gmail

Nov 23, 2009, 9:17 PM

Post #7 of 35 (583 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

[snip]
>>
>> 1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
>>    node_affinity.
>> 2) It seems like this patch can't work with SPARSE_IRQ.
>
> This mechanism isn't going to be used by any internal kernel mechanism
> for determining interrupt placement or operation.  It's purely something
> that either a driver can modify, or external script (through /proc),
> that irqbalance will make use of.  If irqbalance isn't running, or the
> current version of irqbalance doesn't support reading node_affinity,
> then it won't affect the system's operation.
>
> If irqbalance does support it, it'll read whatever the supplied mask is,
> and then will try and balance interrupts within that mask.  It will bail
> if the mask is invalid, or won't apply to the running system, just like
> how putting a bogus mask into smp_affinity is ignored.
>
> If there's something I'm missing beyond this with the two suggestions
> you've made (I looked into those two parameters and tried to draw
> conclusions), please let me know.

My two suggestions are both about your adding node_affinity. Before you can
use this element, you must initialise it firstly. You can refer how
irq_desc::affinity
is used in function alloc_desc_masks().
include/linux/irq.h:
static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
bool boot)
{
gfp_t gfp = GFP_ATOMIC;

if (boot)
gfp = GFP_NOWAIT;

#ifdef CONFIG_CPUMASK_OFFSTACK
if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
return false;

#ifdef CONFIG_GENERIC_PENDING_IRQ
if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
free_cpumask_var(desc->affinity);
return false;
}
#endif
#endif
return true;
}

Thanks,
Yong

>
> Cheers,
> -PJ Waskiewicz
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


arjan at linux

Nov 23, 2009, 10:07 PM

Post #8 of 35 (582 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

Peter Zijlstra wrote:
> On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:
>
>> This mechanism isn't going to be used by any internal kernel mechanism
>> for determining interrupt placement or operation. It's purely something
>> that either a driver can modify, or external script (through /proc),
>> that irqbalance will make use of. If irqbalance isn't running, or the
>> current version of irqbalance doesn't support reading node_affinity,
>> then it won't affect the system's operation.
>>
>> If irqbalance does support it, it'll read whatever the supplied mask is,
>> and then will try and balance interrupts within that mask. It will bail
>> if the mask is invalid, or won't apply to the running system, just like
>> how putting a bogus mask into smp_affinity is ignored.
>>
>> If there's something I'm missing beyond this with the two suggestions
>> you've made (I looked into those two parameters and tried to draw
>> conclusions), please let me know.
>
> I don't see the point in adding it, if the driver wants to set a node
> cpu mask it can already do that using the regular smp affinity settings.
>
> Same for userspace.

the problem is that there is no way currently that the driver can communicate
"I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
that to not make really stupid decisions.....
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

Nov 24, 2009, 12:38 AM

Post #9 of 35 (577 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:

> Unfortunately, a driver can't. The irq_set_affinity() function isn't
> exported. I proposed a patch on netdev to export it, and then to tie down
> an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it. That
> was rejected, since the driver is enforcing policy of the interrupt
> balancing, not irqbalance.

Why would a patch touching the irq subsystem go to netdev?

What is wrong with exporting irq_set_affinity(), and wtf do you need
IRQF_NOBALANCING for?

> I and Jesse Brandeburg had a meeting with Arjan about this. What we came
> up with was this interface, so drivers can set what they'd like to see, if
> irqbalance decides to honor it. That way interrupt affinity policies are
> set only by irqbalance, but this interface gives us a mechanism to hint to
> irqbalance what we'd like it to do.

If all you want is to expose policy to userspace then you don't need any
of this, simply expose the NICs home node through a sysfs device thingy
(I was under the impression its already there somewhere, but I can't
ever find anything in /sys).

No need what so ever to poke at the IRQ subsystem.

> Also, if you use the /proc interface to change smp_affinity on an
> interrupt without any of these changes, irqbalance will override it on its
> next poll interval. This also is not desirable.

This all sounds backwards.. we've got a perfectly functional interface
for affinity -- which people object to being used for some reason. So
you add another interface on top, and that is ok?

All the while not CC'ing the IRQ folks,.. brilliant approach.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 24, 2009, 12:39 AM

Post #10 of 35 (577 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Mon, 2009-11-23 at 22:17 -0700, Yong Zhang wrote:
> [snip]
> >>
> >> 1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
> >> node_affinity.
> >> 2) It seems like this patch can't work with SPARSE_IRQ.
> >
> > This mechanism isn't going to be used by any internal kernel mechanism
> > for determining interrupt placement or operation. It's purely something
> > that either a driver can modify, or external script (through /proc),
> > that irqbalance will make use of. If irqbalance isn't running, or the
> > current version of irqbalance doesn't support reading node_affinity,
> > then it won't affect the system's operation.
> >
> > If irqbalance does support it, it'll read whatever the supplied mask is,
> > and then will try and balance interrupts within that mask. It will bail
> > if the mask is invalid, or won't apply to the running system, just like
> > how putting a bogus mask into smp_affinity is ignored.
> >
> > If there's something I'm missing beyond this with the two suggestions
> > you've made (I looked into those two parameters and tried to draw
> > conclusions), please let me know.
>
> My two suggestions are both about your adding node_affinity. Before you can
> use this element, you must initialise it firstly. You can refer how
> irq_desc::affinity
> is used in function alloc_desc_masks().
> include/linux/irq.h:
> static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
> bool boot)
> {
> gfp_t gfp = GFP_ATOMIC;
>
> if (boot)
> gfp = GFP_NOWAIT;
>
> #ifdef CONFIG_CPUMASK_OFFSTACK
> if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
> return false;
>
> #ifdef CONFIG_GENERIC_PENDING_IRQ
> if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
> free_cpumask_var(desc->affinity);
> return false;
> }
> #endif
> #endif
> return true;
> }
>

Ah, ok. I see what you were referring to now. Let me respin the patch
and send a second version.

Thanks Yong,

-PJ Waskiewicz

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

Nov 24, 2009, 12:39 AM

Post #11 of 35 (589 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Mon, 2009-11-23 at 22:07 -0800, Arjan van de Ven wrote:
> Peter Zijlstra wrote:
> > On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:
> >
> >> This mechanism isn't going to be used by any internal kernel mechanism
> >> for determining interrupt placement or operation. It's purely something
> >> that either a driver can modify, or external script (through /proc),
> >> that irqbalance will make use of. If irqbalance isn't running, or the
> >> current version of irqbalance doesn't support reading node_affinity,
> >> then it won't affect the system's operation.
> >>
> >> If irqbalance does support it, it'll read whatever the supplied mask is,
> >> and then will try and balance interrupts within that mask. It will bail
> >> if the mask is invalid, or won't apply to the running system, just like
> >> how putting a bogus mask into smp_affinity is ignored.
> >>
> >> If there's something I'm missing beyond this with the two suggestions
> >> you've made (I looked into those two parameters and tried to draw
> >> conclusions), please let me know.
> >
> > I don't see the point in adding it, if the driver wants to set a node
> > cpu mask it can already do that using the regular smp affinity settings.
> >
> > Same for userspace.
>
> the problem is that there is no way currently that the driver can communicate
> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
> that to not make really stupid decisions.....

And what exactly is struct device::numa_node good for then?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 24, 2009, 12:59 AM

Post #12 of 35 (585 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 01:38 -0700, Peter Zijlstra wrote:
> On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:
>
> > Unfortunately, a driver can't. The irq_set_affinity() function isn't
> > exported. I proposed a patch on netdev to export it, and then to tie down
> > an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it. That
> > was rejected, since the driver is enforcing policy of the interrupt
> > balancing, not irqbalance.
>
> Why would a patch touching the irq subsystem go to netdev?

The only change to the IRQ subsystem was:

EXPORT_SYMBOL(irq_set_affinity);

The majority of the changeset was for the ixgbe driver.

> What is wrong with exporting irq_set_affinity(), and wtf do you need
> IRQF_NOBALANCING for?
>

Again, the pushback I received was with allowing anything other than
irqbalance to dictate interrupt affinity policy.

And if I set interrupt affinity from the driver or from /proc,
irqbalance will happily rebalance the interrupt elsewhere. The
IRQF_NOBALANCING flag will prevent irqbalance from being able to move
the interrupt.

> > I and Jesse Brandeburg had a meeting with Arjan about this. What we came
> > up with was this interface, so drivers can set what they'd like to see, if
> > irqbalance decides to honor it. That way interrupt affinity policies are
> > set only by irqbalance, but this interface gives us a mechanism to hint to
> > irqbalance what we'd like it to do.
>
> If all you want is to expose policy to userspace then you don't need any
> of this, simply expose the NICs home node through a sysfs device thingy
> (I was under the impression its already there somewhere, but I can't
> ever find anything in /sys).
>
> No need what so ever to poke at the IRQ subsystem.

The point is we need something common that the kernel side (whether a
driver or /proc can modify) that irqbalance can use.

> > Also, if you use the /proc interface to change smp_affinity on an
> > interrupt without any of these changes, irqbalance will override it on its
> > next poll interval. This also is not desirable.
>
> This all sounds backwards.. we've got a perfectly functional interface
> for affinity -- which people object to being used for some reason. So
> you add another interface on top, and that is ok?
>

But it's not functional. If I set the affinity in smp_affinity, then
irqbalance will override it 10 seconds later.

> All the while not CC'ing the IRQ folks,.. brilliant approach.

If I knew who I should CC, I'd be happy to add them. Can you provide
email addresses please?

Cheers,
-PJ Waskiewicz

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

Nov 24, 2009, 1:08 AM

Post #13 of 35 (581 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
> > This all sounds backwards.. we've got a perfectly functional interface
> > for affinity -- which people object to being used for some reason. So
> > you add another interface on top, and that is ok?
> >
>
> But it's not functional. If I set the affinity in smp_affinity, then
> irqbalance will override it 10 seconds later.

And here I was thinking the kernel round-robins IRQ delivery on the mask
specified there. Are you talking about some daft userspace thing that
writes into the irq smp_affinity to effect irq balancing?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 24, 2009, 1:15 AM

Post #14 of 35 (581 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 02:08 -0700, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
> > > This all sounds backwards.. we've got a perfectly functional interface
> > > for affinity -- which people object to being used for some reason. So
> > > you add another interface on top, and that is ok?
> > >
> >
> > But it's not functional. If I set the affinity in smp_affinity, then
> > irqbalance will override it 10 seconds later.
>
> And here I was thinking the kernel round-robins IRQ delivery on the mask
> specified there. Are you talking about some daft userspace thing that
> writes into the irq smp_affinity to effect irq balancing?
>

Yep. That's exactly what irqbalance does.

Cheers,
-PJ

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

Nov 24, 2009, 1:15 AM

Post #15 of 35 (577 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
>
> > All the while not CC'ing the IRQ folks,.. brilliant approach.
>
> If I knew who I should CC, I'd be happy to add them. Can you provide
> email addresses please?

Since most people can't seen to read a simple MAINTAINERS file, some
other people wrote a script to read it for you:

# scripts/get_maintainer.pl -f kernel/irq/manage.c
Ingo Molnar <mingo [at] elte>
Thomas Gleixner <tglx [at] linutronix>
linux-kernel [at] vger


Another option is to do something like:

# git log kernel/irq/manage.c | grep Author | head -30 | awk
'{ t[$0]++; } END { for (i in t) { print t[i] " " i; }}' | sort -rn

10 Author: Thomas Gleixner <tglx [at] linutronix>
9 Author: Ingo Molnar <mingo [at] elte>
3 Author: Magnus Damm <damm [at] igel>
2 Author: Linus Torvalds <torvalds [at] linux-foundation>
...

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


tglx at linutronix

Nov 24, 2009, 2:07 AM

Post #16 of 35 (581 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> On Tue, 2009-11-24 at 01:38 -0700, Peter Zijlstra wrote:
> > On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:
> >
> > > Unfortunately, a driver can't. The irq_set_affinity() function isn't
> > > exported. I proposed a patch on netdev to export it, and then to tie down
> > > an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it. That
> > > was rejected, since the driver is enforcing policy of the interrupt
> > > balancing, not irqbalance.
> >
> > Why would a patch touching the irq subsystem go to netdev?
>
> The only change to the IRQ subsystem was:
>
> EXPORT_SYMBOL(irq_set_affinity);

Which is still touching the generic irq subsystem and needs the ack of
the relevant maintainer. If there is a need to expose such an
interface to drivers then the maintainer wants to know exactly why and
needs to be part of the discussion of alternative solutions. Otherwise
you waste time on implementing stuff like the current patch which is
definitely not going anywhere near the irq subsystem.

> > If all you want is to expose policy to userspace then you don't need any
> > of this, simply expose the NICs home node through a sysfs device thingy
> > (I was under the impression its already there somewhere, but I can't
> > ever find anything in /sys).
> >
> > No need what so ever to poke at the IRQ subsystem.
>
> The point is we need something common that the kernel side (whether a
> driver or /proc can modify) that irqbalance can use.

/sys/class/net/ethX/device/numa_node

perhaps ?

> > > Also, if you use the /proc interface to change smp_affinity on an
> > > interrupt without any of these changes, irqbalance will override it on its
> > > next poll interval. This also is not desirable.
> >
> > This all sounds backwards.. we've got a perfectly functional interface
> > for affinity -- which people object to being used for some reason. So
> > you add another interface on top, and that is ok?
> >
>
> But it's not functional. If I set the affinity in smp_affinity, then
> irqbalance will override it 10 seconds later.

And to work around the brain wreckage of irqbalanced you want to
fiddle in the irq code instead of teaching irqbalanced to handle node
affinities ?

The only thing which is worth to investigate is whether the irq core
code should honour the dev->numa_node setting and restrict the
possible irq affinity settings to that node. If a device is tied to a
node it makes a certain amount of sense to do that.

But such a change would not need a new interface in the irq core and
definitely not a new cpumask_t member in the irq_desc structure to
store a node affinity which can be expressed with a simple
integer.

But this needs more thoughts and I want to know more about the
background and the reasoning for such a change.

Thanks,

tglx



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


arjan at linux

Nov 24, 2009, 6:42 AM

Post #17 of 35 (582 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

Peter Zijlstra wrote:
>>> Same for userspace.
>> the problem is that there is no way currently that the driver can communicate
>> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
>> that to not make really stupid decisions.....
>
> And what exactly is struct device::numa_node good for then?
>

and that is exported to userspace.. how?

... that has nothing to do with an irq, and also falls flat for a driver that
supports multiple irqs, and assigns one to each numa node.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


arjan at linux

Nov 24, 2009, 6:43 AM

Post #18 of 35 (572 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
>>> This all sounds backwards.. we've got a perfectly functional interface
>>> for affinity -- which people object to being used for some reason. So
>>> you add another interface on top, and that is ok?
>>>
>> But it's not functional. If I set the affinity in smp_affinity, then
>> irqbalance will override it 10 seconds later.
>
> And here I was thinking the kernel round-robins IRQ delivery on the mask
> specified there.

the kernel does no such thing, nor has code to do so.

> Are you talking about some daft userspace thing that
> writes into the irq smp_affinity to effect irq balancing?

thanks ;)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


davem at davemloft

Nov 24, 2009, 9:39 AM

Post #19 of 35 (573 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

From: Peter Zijlstra <peterz [at] infradead>
Date: Tue, 24 Nov 2009 09:39:46 +0100

> On Mon, 2009-11-23 at 22:07 -0800, Arjan van de Ven wrote:
>> the problem is that there is no way currently that the driver can communicate
>> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
>> that to not make really stupid decisions.....
>
> And what exactly is struct device::numa_node good for then?

device->numa_node just says where the device is.

For better performance, it can make sense to, for example, allocate the ring
buffers for different device queues on other NUMA nodes.

That's the kind of thing PJ is trying to make available.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 24, 2009, 9:55 AM

Post #20 of 35 (574 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 03:07 -0700, Thomas Gleixner wrote:
> On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> > On Tue, 2009-11-24 at 01:38 -0700, Peter Zijlstra wrote:
> > > On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:
> > >
> > > > Unfortunately, a driver can't. The irq_set_affinity() function isn't
> > > > exported. I proposed a patch on netdev to export it, and then to tie down
> > > > an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it. That
> > > > was rejected, since the driver is enforcing policy of the interrupt
> > > > balancing, not irqbalance.
> > >
> > > Why would a patch touching the irq subsystem go to netdev?
> >
> > The only change to the IRQ subsystem was:
> >
> > EXPORT_SYMBOL(irq_set_affinity);
>
> Which is still touching the generic irq subsystem and needs the ack of
> the relevant maintainer. If there is a need to expose such an
> interface to drivers then the maintainer wants to know exactly why and
> needs to be part of the discussion of alternative solutions. Otherwise
> you waste time on implementing stuff like the current patch which is
> definitely not going anywhere near the irq subsystem.
>

Understood, and duly noted.

> > > If all you want is to expose policy to userspace then you don't need any
> > > of this, simply expose the NICs home node through a sysfs device thingy
> > > (I was under the impression its already there somewhere, but I can't
> > > ever find anything in /sys).
> > >
> > > No need what so ever to poke at the IRQ subsystem.
> >
> > The point is we need something common that the kernel side (whether a
> > driver or /proc can modify) that irqbalance can use.
>
> /sys/class/net/ethX/device/numa_node
>
> perhaps ?

What I'm trying to do though is one to many NUMA node assignments. See
below for a better overview of what the issue is we're trying to solve.

>
> > > > Also, if you use the /proc interface to change smp_affinity on an
> > > > interrupt without any of these changes, irqbalance will override it on its
> > > > next poll interval. This also is not desirable.
> > >
> > > This all sounds backwards.. we've got a perfectly functional interface
> > > for affinity -- which people object to being used for some reason. So
> > > you add another interface on top, and that is ok?
> > >
> >
> > But it's not functional. If I set the affinity in smp_affinity, then
> > irqbalance will override it 10 seconds later.
>
> And to work around the brain wreckage of irqbalanced you want to
> fiddle in the irq code instead of teaching irqbalanced to handle node
> affinities ?
>
> The only thing which is worth to investigate is whether the irq core
> code should honour the dev->numa_node setting and restrict the
> possible irq affinity settings to that node. If a device is tied to a
> node it makes a certain amount of sense to do that.
>
> But such a change would not need a new interface in the irq core and
> definitely not a new cpumask_t member in the irq_desc structure to
> store a node affinity which can be expressed with a simple
> integer.
>
> But this needs more thoughts and I want to know more about the
> background and the reasoning for such a change.
>

I'll use the ixgbe driver as my example, since that is where my
immediate problems are. This is our 10GbE device, and supports 128 Rx
queues, 128 Tx queues, and has a maximum of 64 MSI-X vectors. In a
typical case, let's say an 8-core machine (Nehalem-EP with
hyperthreading off) brings one port online. We'll allocate 8 Rx and 8
Tx queues. When these allocations occur, we want to allocate the memory
for our descriptor rings and buffer structs and DMA areas onto the
various NUMA nodes. This will promote spreading of the load not just
across CPUs, but also the memory controllers.

If we were to just run like that and have irqbalance move our vectors to
a single node, then we'd have half of our network resources creating
cross-node traffic, which is undesirable, since the OS may have to take
locks node to node to get the memory it's looking for.

The bottom line is we need some mechanism that allows a driver/user to
deterministically assign the underlying interrupt resources to the
correct NUMA node for each interrupt. And in the example above, we may
have more than one NUMA node we need to balance into.

Please let me know if I've explained this well enough. I appreciate the
time.

Cheers,
-PJ Waskiewicz

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 24, 2009, 9:56 AM

Post #21 of 35 (572 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 09:39 -0800, David Miller wrote:
> From: Peter Zijlstra <peterz [at] infradead>
> Date: Tue, 24 Nov 2009 09:39:46 +0100
>
> > On Mon, 2009-11-23 at 22:07 -0800, Arjan van de Ven wrote:
> >> the problem is that there is no way currently that the driver can communicate
> >> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
> >> that to not make really stupid decisions.....
> >
> > And what exactly is struct device::numa_node good for then?
>
> device->numa_node just says where the device is.
>
> For better performance, it can make sense to, for example, allocate the ring
> buffers for different device queues on other NUMA nodes.
>
> That's the kind of thing PJ is trying to make available.

Yes, that's exactly what I'm trying to do. Even further, we want to
allocate the ring SW struct itself and descriptor structures on other
NUMA nodes, and make sure the interrupt lines up with those allocations.

Cheers,
-PJ

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


eric.dumazet at gmail

Nov 24, 2009, 10:26 AM

Post #22 of 35 (572 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

Peter P Waskiewicz Jr a écrit :
That's the kind of thing PJ is trying to make available.
>
> Yes, that's exactly what I'm trying to do. Even further, we want to
> allocate the ring SW struct itself and descriptor structures on other
> NUMA nodes, and make sure the interrupt lines up with those allocations.
>

Say you allocate ring buffers on NUMA node of the CPU handling interrupt
on a particular queue.

If irqbalance or an admin changes /proc/irq/{number}/smp_affinities,
do you want to realloc ring buffer to another NUMA node ?

It seems complex to me, maybe optimal thing would be to use a NUMA policy to
spread vmalloc() allocations to all nodes to get a good bandwidth...

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peter.p.waskiewicz.jr at intel

Nov 24, 2009, 10:33 AM

Post #23 of 35 (571 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

On Tue, 2009-11-24 at 10:26 -0800, Eric Dumazet wrote:
> Peter P Waskiewicz Jr a écrit :
> That's the kind of thing PJ is trying to make available.
> >
> > Yes, that's exactly what I'm trying to do. Even further, we want to
> > allocate the ring SW struct itself and descriptor structures on other
> > NUMA nodes, and make sure the interrupt lines up with those allocations.
> >
>
> Say you allocate ring buffers on NUMA node of the CPU handling interrupt
> on a particular queue.
>
> If irqbalance or an admin changes /proc/irq/{number}/smp_affinities,
> do you want to realloc ring buffer to another NUMA node ?
>

That's why I'm trying to add the node_affinity mechanism that irqbalance
can use to prevent the interrupt being moved to another node.

> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
> spread vmalloc() allocations to all nodes to get a good bandwidth...

That's exactly what we're doing in our 10GbE driver right now (isn't
pushed upstream yet, still finalizing our testing). We spread to all
NUMA nodes in a semi-intelligent fashion when allocating our rings and
buffers. The last piece is ensuring the interrupts tied to the various
queues all route to the NUMA nodes those CPUs belong to. irqbalance
needs some kind of hint to make sure it does the right thing, which
today it does not.

I don't see how this is complex though. Driver loads, allocates across
the NUMA nodes for optimal throughput, then writes CPU masks for the
NUMA nodes each interrupt belongs to. irqbalance comes along and looks
at the new mask "hint," and then balances that interrupt within that
hinted mask.

Cheers,
-PJ

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


davem at davemloft

Nov 24, 2009, 10:54 AM

Post #24 of 35 (572 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

From: Eric Dumazet <eric.dumazet [at] gmail>
Date: Tue, 24 Nov 2009 19:26:15 +0100

> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
> spread vmalloc() allocations to all nodes to get a good bandwidth...

vmalloc() and sk_buff's don't currently mix and I really don't see us
every allowing them to :-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


eric.dumazet at gmail

Nov 24, 2009, 10:58 AM

Post #25 of 35 (575 views)
Permalink
Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints [In reply to]

David Miller a écrit :
> From: Eric Dumazet <eric.dumazet [at] gmail>
> Date: Tue, 24 Nov 2009 19:26:15 +0100
>
>> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
>> spread vmalloc() allocations to all nodes to get a good bandwidth...
>
> vmalloc() and sk_buff's don't currently mix and I really don't see us
> every allowing them to :-)

I think Peter was referring to tx/rx rings buffers, not sk_buffs.

They (ring buffers) are allocated with vmalloc() at driver init time.

And Tom pointed out that our rx sk_buff allocation should be using the node
of requester, no need to hardcode node number per rx queue (or per device as of today)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.