Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH RFC] rcu: Limit GP initialization to CPUs that have been online

 

 

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded


paulmck at linux

Mar 13, 2012, 5:24 PM

Post #1 of 32 (322 views)
Permalink
[PATCH RFC] rcu: Limit GP initialization to CPUs that have been online

The following builds, but is only very lightly tested. Probably full
of bugs, especially when exercising CPU hotplug.

Thanx, Paul

------------------------------------------------------------------------

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich [at] sgi>
Signed-off-by: Paul E. McKenney <paulmck [at] linux>

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c3b05ef..5688443 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);

static struct rcu_state *rcu_state;

+int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
unsigned long gp_duration;
- struct rcu_node *rnp = rcu_get_root(rsp);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);

WARN_ON_ONCE(!rcu_gp_in_progress(rsp));

@@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* completed.
*/
if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */

/*
* Propagate new ->completed value to rcu_node structures
* so that other CPUs don't have to wait until the start
* of the next grace period to process their callbacks.
+ * We must hold the root rcu_node structure's ->lock
+ * across rcu_for_each_node_breadth_first() in order to
+ * synchronize with CPUs coming online for the first time.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->completed = rsp->gpnum;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_root->lock); /* already disabled. */
}
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
}

rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
}

/*
@@ -2440,6 +2445,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp_init;

/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2462,6 +2468,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Exclude any attempts to start a new GP on large systems. */
raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */

+ /* Initialize any rcu_node structures that will see their first use. */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+ rnp_init <= rdp->mynode;
+ rnp_init++) {
+ rnp_init->gpnum = rsp->gpnum;
+ rnp_init->completed = rsp->completed;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -2495,6 +2511,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
+ if (cpu > rcu_max_cpu)
+ rcu_max_cpu = cpu;
}

/*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1e49c56..1dc74e0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,11 +192,13 @@ struct rcu_node {

/*
* Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure. The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
*/
+extern int rcu_max_cpu;
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) < per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)

/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 14, 2012, 2:24 AM

Post #2 of 32 (308 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> The following builds, but is only very lightly tested. Probably full
> of bugs, especially when exercising CPU hotplug.

You didn't say RFT, but...

To beat on this in a rotund 3.0 kernel, the equivalent patch would be
the below? My box may well answer that before you can.. hope not ;-)

---
kernel/rcutree.c | 31 ++++++++++++++++++++++++++-----
kernel/rcutree.h | 6 ++++--
2 files changed, 30 insertions(+), 7 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d

static struct rcu_state *rcu_state;

+int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
struct rcu_node *rnp = rcu_get_root(rsp);

if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+ struct rcu_node *rnp_root = rnp;
+
if (cpu_needs_another_gp(rsp, rdp))
rsp->fqs_need_gp = 1;
- if (rnp->completed == rsp->completed) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (rnp_root->completed == rsp->completed) {
+ raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
return;
}
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */

/*
* Propagate new ->completed value to rcu_node structures
* so that other CPUs don't have to wait until the start
* of the next grace period to process their callbacks.
+ * We must hold the root rcu_node structure's ->lock
+ * across rcu_for_each_node_breadth_first() in order to
+ * synchronize with CPUs coming online for the first time.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->completed = rsp->completed;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_root->lock); /* already disabled. */
}
- local_irq_restore(flags);
+ raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
return;
}

@@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
rsp->gp_max = gp_duration;
rsp->completed = rsp->gpnum;
rsp->signaled = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
}

/*
@@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp_init;

/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1891,16 @@ rcu_init_percpu_data(int cpu, struct rcu
/* Exclude any attempts to start a new GP on large systems. */
raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */

+ /* Initialize any rcu_node structures that will see their first use. */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+ rnp_init <= rdp->mynode;
+ rnp_init++) {
+ rnp_init->gpnum = rsp->gpnum;
+ rnp_init->completed = rsp->completed;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -1907,6 +1926,8 @@ static void __cpuinit rcu_prepare_cpu(in
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
+ if (cpu > rcu_max_cpu)
+ rcu_max_cpu = cpu;
}

/*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,13 @@ struct rcu_node {

/*
* Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure. The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
*/
+extern int rcu_max_cpu;
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) < per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)

/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 14, 2012, 5:40 AM

Post #3 of 32 (311 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > The following builds, but is only very lightly tested. Probably full
> > of bugs, especially when exercising CPU hotplug.
>
> You didn't say RFT, but...
>
> To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> the below? My box may well answer that before you can.. hope not ;-)

(Darn, it did. Box says boot stall with virgin patch in tip too though.
Wedging it straight into 3.0 was perhaps a tad premature;)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 14, 2012, 6:08 AM

Post #4 of 32 (315 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > The following builds, but is only very lightly tested. Probably full
> > > of bugs, especially when exercising CPU hotplug.
> >
> > You didn't say RFT, but...
> >
> > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > the below? My box may well answer that before you can.. hope not ;-)
>
> (Darn, it did. Box says boot stall with virgin patch in tip too though.
> Wedging it straight into 3.0 was perhaps a tad premature;)

I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 14, 2012, 8:17 AM

Post #5 of 32 (309 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > The following builds, but is only very lightly tested. Probably full
> > > > of bugs, especially when exercising CPU hotplug.
> > >
> > > You didn't say RFT, but...
> > >
> > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > the below? My box may well answer that before you can.. hope not ;-)
> >
> > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > Wedging it straight into 3.0 was perhaps a tad premature;)
>
> I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.

Right... Bozo here forgot to set the kernel parameters for large-system
emulation during testing. Apologies for the busted patch, will fix.

And thank you both for the testing!!!

Hey, at least I labeled it "RFC". ;-)

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 14, 2012, 9:56 AM

Post #6 of 32 (307 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > > The following builds, but is only very lightly tested. Probably full
> > > > > of bugs, especially when exercising CPU hotplug.
> > > >
> > > > You didn't say RFT, but...
> > > >
> > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > the below? My box may well answer that before you can.. hope not ;-)
> > >
> > > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > > Wedging it straight into 3.0 was perhaps a tad premature;)
> >
> > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
>
> Right... Bozo here forgot to set the kernel parameters for large-system
> emulation during testing. Apologies for the busted patch, will fix.
>
> And thank you both for the testing!!!
>
> Hey, at least I labeled it "RFC". ;-)

Does the following work better? It does pass my fake-big-system tests
(more testing in the works).

Thanx, Paul

------------------------------------------------------------------------

rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich [at] sgi>
Signed-off-by: Paul E. McKenney <paulmck [at] linux>

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c3b05ef..5688443 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);

static struct rcu_state *rcu_state;

+int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
unsigned long gp_duration;
- struct rcu_node *rnp = rcu_get_root(rsp);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);

WARN_ON_ONCE(!rcu_gp_in_progress(rsp));

@@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* completed.
*/
if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */

/*
* Propagate new ->completed value to rcu_node structures
* so that other CPUs don't have to wait until the start
* of the next grace period to process their callbacks.
+ * We must hold the root rcu_node structure's ->lock
+ * across rcu_for_each_node_breadth_first() in order to
+ * synchronize with CPUs coming online for the first time.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->completed = rsp->gpnum;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_root->lock); /* already disabled. */
}
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
}

rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
}

/*
@@ -2440,6 +2445,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp_init;

/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2462,6 +2468,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Exclude any attempts to start a new GP on large systems. */
raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */

+ /* Initialize any rcu_node structures that will see their first use. */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+ rnp_init <= rdp->mynode;
+ rnp_init++) {
+ rnp_init->gpnum = rsp->gpnum;
+ rnp_init->completed = rsp->completed;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -2495,6 +2511,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
+ if (cpu > rcu_max_cpu)
+ rcu_max_cpu = cpu;
}

/*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1e49c56..afdf410 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,11 +192,13 @@ struct rcu_node {

/*
* Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure. The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
*/
+extern int rcu_max_cpu;
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) <= per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)

/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 14, 2012, 10:07 AM

Post #7 of 32 (308 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, 2012-03-14 at 08:17 -0700, Paul E. McKenney wrote:

> And thank you both for the testing!!!

Try to beat RCU into submission with rocks and sticks, or test patch
from RCU expert... not a hard choice. I will most happily test ;-)

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 14, 2012, 7:42 PM

Post #8 of 32 (317 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:

> Does the following work better? It does pass my fake-big-system tests
> (more testing in the works).

Yup, tip booted fine. Thanks! I'll test, see if it gets upset.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 14, 2012, 8:07 PM

Post #9 of 32 (310 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
>
> > Does the following work better? It does pass my fake-big-system tests
> > (more testing in the works).
>
> Yup, tip booted fine. Thanks! I'll test, see if it gets upset.

Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
I'll add hotplug after it runs explosion free for a while. Any
suggestions for giving both virgin and 3.0 a thorough trouncing?

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 15, 2012, 10:02 AM

Post #10 of 32 (307 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> >
> > > Does the following work better? It does pass my fake-big-system tests
> > > (more testing in the works).
> >
> > Yup, tip booted fine. Thanks! I'll test, see if it gets upset.
>
> Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> I'll add hotplug after it runs explosion free for a while. Any
> suggestions for giving both virgin and 3.0 a thorough trouncing?

It would be good to check the latency of RCU's grace-period
initialization, and comparing the results with and without this patch.
Dimitri might have scripting for that.

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 15, 2012, 10:21 AM

Post #11 of 32 (306 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, Mar 15, 2012 at 10:02:36AM -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > >
> > > > Does the following work better? It does pass my fake-big-system tests
> > > > (more testing in the works).
> > >
> > > Yup, tip booted fine. Thanks! I'll test, see if it gets upset.
> >
> > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > I'll add hotplug after it runs explosion free for a while. Any
> > suggestions for giving both virgin and 3.0 a thorough trouncing?
>
> It would be good to check the latency of RCU's grace-period
> initialization, and comparing the results with and without this patch.
> Dimitri might have scripting for that.
>

I'll attempt to get that tested yet this week.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 15, 2012, 10:58 AM

Post #12 of 32 (307 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > > > The following builds, but is only very lightly tested. Probably full
> > > > > > of bugs, especially when exercising CPU hotplug.
> > > > >
> > > > > You didn't say RFT, but...
> > > > >
> > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > the below? My box may well answer that before you can.. hope not ;-)
> > > >
> > > > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > >
> > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
> >
> > Right... Bozo here forgot to set the kernel parameters for large-system
> > emulation during testing. Apologies for the busted patch, will fix.
> >
> > And thank you both for the testing!!!
> >
> > Hey, at least I labeled it "RFC". ;-)
>
> Does the following work better? It does pass my fake-big-system tests
> (more testing in the works).

This one stalls for me at the same place the other one did. Once again,
if I remove the patch and rebuild, it boots just fine.

Is there some debug/trace information that you would like me to provide?

>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> rcu: Limit GP initialization to CPUs that have been online
>
> The current grace-period initialization initializes all leaf rcu_node
> structures, even those corresponding to CPUs that have never been online.
> This is harmless in many configurations, but results in 200-microsecond
> latency spikes for kernels built with NR_CPUS=4096.
>
> This commit therefore keeps track of the largest-numbered CPU that has
> ever been online, and limits grace-period initialization to rcu_node
> structures corresponding to that CPU and to smaller-numbered CPUs.
>
> Reported-by: Dimitri Sivanich <sivanich [at] sgi>
> Signed-off-by: Paul E. McKenney <paulmck [at] linux>
>
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index c3b05ef..5688443 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
>
> static struct rcu_state *rcu_state;
>
> +int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
> +
> /*
> * The rcu_scheduler_active variable transitions from zero to one just
> * before the first task is spawned. So when this variable is zero, RCU
> @@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
> __releases(rcu_get_root(rsp)->lock)
> {
> unsigned long gp_duration;
> - struct rcu_node *rnp = rcu_get_root(rsp);
> struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
> + struct rcu_node *rnp;
> + struct rcu_node *rnp_root = rcu_get_root(rsp);
>
> WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
>
> @@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
> * completed.
> */
> if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
> - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
>
> /*
> * Propagate new ->completed value to rcu_node structures
> * so that other CPUs don't have to wait until the start
> * of the next grace period to process their callbacks.
> + * We must hold the root rcu_node structure's ->lock
> + * across rcu_for_each_node_breadth_first() in order to
> + * synchronize with CPUs coming online for the first time.
> */
> rcu_for_each_node_breadth_first(rsp, rnp) {
> + raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
> raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> rnp->completed = rsp->gpnum;
> raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> + raw_spin_lock(&rnp_root->lock); /* already disabled. */
> }
> - rnp = rcu_get_root(rsp);
> - raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> }
>
> rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
> trace_rcu_grace_period(rsp->name, rsp->completed, "end");
> rsp->fqs_state = RCU_GP_IDLE;
> - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
> + rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
> }
>
> /*
> @@ -2440,6 +2445,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
> unsigned long mask;
> struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
> struct rcu_node *rnp = rcu_get_root(rsp);
> + struct rcu_node *rnp_init;
>
> /* Set up local state, ensuring consistent view of global state. */
> raw_spin_lock_irqsave(&rnp->lock, flags);
> @@ -2462,6 +2468,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
> /* Exclude any attempts to start a new GP on large systems. */
> raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
>
> + /* Initialize any rcu_node structures that will see their first use. */
> + raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> + for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> + rnp_init <= rdp->mynode;
> + rnp_init++) {
> + rnp_init->gpnum = rsp->gpnum;
> + rnp_init->completed = rsp->completed;
> + }
> + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> +
> /* Add CPU to rcu_node bitmasks. */
> rnp = rdp->mynode;
> mask = rdp->grpmask;
> @@ -2495,6 +2511,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
> rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
> rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
> rcu_preempt_init_percpu_data(cpu);
> + if (cpu > rcu_max_cpu)
> + rcu_max_cpu = cpu;
> }
>
> /*
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 1e49c56..afdf410 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -192,11 +192,13 @@ struct rcu_node {
>
> /*
> * Do a full breadth-first scan of the rcu_node structures for the
> - * specified rcu_state structure.
> + * specified rcu_state structure. The caller must hold either the
> + * ->onofflock or the root rcu_node structure's ->lock.
> */
> +extern int rcu_max_cpu;
> #define rcu_for_each_node_breadth_first(rsp, rnp) \
> for ((rnp) = &(rsp)->node[0]; \
> - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> + (rnp) <= per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)
>
> /*
> * Do a breadth-first scan of the non-leaf rcu_node structures for the
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 15, 2012, 10:59 AM

Post #13 of 32 (308 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> >
> > > Does the following work better? It does pass my fake-big-system tests
> > > (more testing in the works).
> >
> > Yup, tip booted fine. Thanks! I'll test, see if it gets upset.
>
> Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> I'll add hotplug after it runs explosion free for a while. Any
> suggestions for giving both virgin and 3.0 a thorough trouncing?

Mike,

Could I try your 3.0 enterprise patch?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 15, 2012, 11:23 AM

Post #14 of 32 (309 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > > > > The following builds, but is only very lightly tested. Probably full
> > > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > >
> > > > > > You didn't say RFT, but...
> > > > > >
> > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > the below? My box may well answer that before you can.. hope not ;-)
> > > > >
> > > > > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > >
> > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
> > >
> > > Right... Bozo here forgot to set the kernel parameters for large-system
> > > emulation during testing. Apologies for the busted patch, will fix.
> > >
> > > And thank you both for the testing!!!
> > >
> > > Hey, at least I labeled it "RFC". ;-)
> >
> > Does the following work better? It does pass my fake-big-system tests
> > (more testing in the works).
>
> This one stalls for me at the same place the other one did. Once again,
> if I remove the patch and rebuild, it boots just fine.
>
> Is there some debug/trace information that you would like me to provide?

Very strange.

Could you please send your dmesg and .config?

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 15, 2012, 2:07 PM

Post #15 of 32 (309 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, Mar 15, 2012 at 11:23:14AM -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> > On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > > > > > The following builds, but is only very lightly tested. Probably full
> > > > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > > >
> > > > > > > You didn't say RFT, but...
> > > > > > >
> > > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > > the below? My box may well answer that before you can.. hope not ;-)
> > > > > >
> > > > > > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > >
> > > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
> > > >
> > > > Right... Bozo here forgot to set the kernel parameters for large-system
> > > > emulation during testing. Apologies for the busted patch, will fix.
> > > >
> > > > And thank you both for the testing!!!
> > > >
> > > > Hey, at least I labeled it "RFC". ;-)
> > >
> > > Does the following work better? It does pass my fake-big-system tests
> > > (more testing in the works).
> >
> > This one stalls for me at the same place the other one did. Once again,
> > if I remove the patch and rebuild, it boots just fine.
> >
> > Is there some debug/trace information that you would like me to provide?
>
> Very strange.
>
> Could you please send your dmesg and .config?

Hmmm... Memory ordering could be a problem, though in that case I would
have expected the hang during the onlining process. However, the memory
ordering does need to be cleaned up in any case, please see below.

Thanx, Paul

------------------------------------------------------------------------

rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich [at] sgi>
Signed-off-by: Paul E. McKenney <paulmck [at] linux>
Tested-by: Mike Galbraith <efault [at] gmx>

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8269656..7247fa8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);

static struct rcu_state *rcu_state;

+int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
unsigned long gp_duration;
- struct rcu_node *rnp = rcu_get_root(rsp);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);

WARN_ON_ONCE(!rcu_gp_in_progress(rsp));

@@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* completed.
*/
if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */

/*
* Propagate new ->completed value to rcu_node structures
* so that other CPUs don't have to wait until the start
* of the next grace period to process their callbacks.
+ * We must hold the root rcu_node structure's ->lock
+ * across rcu_for_each_node_breadth_first() in order to
+ * synchronize with CPUs coming online for the first time.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->completed = rsp->gpnum;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_root->lock); /* already disabled. */
}
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
}

rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
}

/*
@@ -2447,6 +2452,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp_init;

/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2469,6 +2475,20 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Exclude any attempts to start a new GP on large systems. */
raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */

+ /*
+ * Initialize any rcu_node structures that will see their first use.
+ * Note that rcu_max_cpu cannot change out from under us because the
+ * hotplug locks are held.
+ */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+ rnp_init <= rdp->mynode;
+ rnp_init++) {
+ rnp_init->gpnum = rsp->gpnum;
+ rnp_init->completed = rsp->completed;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -2502,6 +2522,11 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
+ if (cpu > rcu_max_cpu) {
+ smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+ rcu_max_cpu = cpu;
+ smp_mb(); /* rcu_max_cpu assignment before later uses. */
+ }
}

/*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1e49c56..772df1c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,11 +192,23 @@ struct rcu_node {

/*
* Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure. The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
*/
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+ int ret;
+
+ smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
+ ret = rcu_max_cpu;
+ smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
+ return ret;
+}
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+ (rnp)++)

/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 15, 2012, 9:45 PM

Post #16 of 32 (309 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, 2012-03-15 at 10:02 -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > >
> > > > Does the following work better? It does pass my fake-big-system tests
> > > > (more testing in the works).
> > >
> > > Yup, tip booted fine. Thanks! I'll test, see if it gets upset.
> >
> > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > I'll add hotplug after it runs explosion free for a while. Any
> > suggestions for giving both virgin and 3.0 a thorough trouncing?
>
> It would be good to check the latency of RCU's grace-period
> initialization, and comparing the results with and without this patch.
> Dimitri might have scripting for that.

Yeah, I'll zoom in. Generic irqsoff tracing with rcutorture running
was... not the cleverest of ideas. (between fbcon and serial console, a
300us RCU blip would be a pimple on giant's left butt cheek)

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 16, 2012, 12:27 AM

Post #17 of 32 (308 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote:
> On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > >
> > > > Does the following work better? It does pass my fake-big-system tests
> > > > (more testing in the works).
> > >
> > > Yup, tip booted fine. Thanks! I'll test, see if it gets upset.
> >
> > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > I'll add hotplug after it runs explosion free for a while. Any
> > suggestions for giving both virgin and 3.0 a thorough trouncing?
>
> Mike,
>
> Could I try your 3.0 enterprise patch?

Sure, v3 below. Boots on my little boxen.

caveat: looks to me like it should be equivalent, but what I know about
RCUs gizzard will cover the bottom of a thimble.. maybe.

rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich [at] sgi>
Signed-off-by: Paul E. McKenney <paulmck [at] linux>
Acked-by: Mike Galbraith <mgalbraith [at] suse>

---
kernel/rcutree.c | 24 +++++++++++++++++++++++-
kernel/rcutree.h | 16 ++++++++++++++--
2 files changed, 37 insertions(+), 3 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d

static struct rcu_state *rcu_state;

+int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -935,7 +937,7 @@ static void rcu_report_qs_rsp(struct rcu
rsp->gp_max = gp_duration;
rsp->completed = rsp->gpnum;
rsp->signaled = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
}

/*
@@ -1862,6 +1864,7 @@ rcu_init_percpu_data(int cpu, struct rcu
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp_init;

/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1885,20 @@ rcu_init_percpu_data(int cpu, struct rcu
/* Exclude any attempts to start a new GP on large systems. */
raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */

+ /*
+ * Initialize any rcu_node structures that will see their first use.
+ * Note that rcu_max_cpu cannot change out from under us because the
+ * hotplug locks are held.
+ */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+ rnp_init <= rdp->mynode;
+ rnp_init++) {
+ rnp_init->gpnum = rsp->gpnum;
+ rnp_init->completed = rsp->completed;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -1907,6 +1924,11 @@ static void __cpuinit rcu_prepare_cpu(in
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
+ if (cpu > rcu_max_cpu) {
+ smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+ rcu_max_cpu = cpu;
+ smp_mb(); /* rcu_max_cpu assignment before later uses. */
+ }
}

/*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,23 @@ struct rcu_node {

/*
* Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure. The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
*/
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+ int ret;
+
+ smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
+ ret = rcu_max_cpu;
+ smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
+ return ret;
+}
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+ (rnp)++)

/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 16, 2012, 1:09 AM

Post #18 of 32 (306 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote:
> On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote:
> > On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote:
> > > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > > >
> > > > > Does the following work better? It does pass my fake-big-system tests
> > > > > (more testing in the works).
> > > >
> > > > Yup, tip booted fine. Thanks! I'll test, see if it gets upset.
> > >
> > > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > > I'll add hotplug after it runs explosion free for a while. Any
> > > suggestions for giving both virgin and 3.0 a thorough trouncing?
> >
> > Mike,
> >
> > Could I try your 3.0 enterprise patch?
>
> Sure, v3 below.

Erm, a bit of that went missing. I'll put it back.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 16, 2012, 1:45 AM

Post #19 of 32 (311 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote:
> On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote:

> > > Could I try your 3.0 enterprise patch?
> >
> > Sure, v3 below.
>
> Erm, a bit of that went missing. I'll put it back.

OK, box finally finished rebuild/boot.


rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich [at] sgi>
Signed-off-by: Paul E. McKenney <paulmck [at] linux>
Acked-by: Mike Galbraith <mgalbraith [at] suse>

---
kernel/rcutree.c | 36 ++++++++++++++++++++++++++++++++----
kernel/rcutree.h | 16 ++++++++++++++--
2 files changed, 46 insertions(+), 6 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d

static struct rcu_state *rcu_state;

+int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
+
/*
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
@@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
struct rcu_node *rnp = rcu_get_root(rsp);

if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+ struct rcu_node *rnp_root = rnp;
+
if (cpu_needs_another_gp(rsp, rdp))
rsp->fqs_need_gp = 1;
if (rnp->completed == rsp->completed) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
return;
}
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */

/*
* Propagate new ->completed value to rcu_node structures
* so that other CPUs don't have to wait until the start
* of the next grace period to process their callbacks.
+ * We must hold the root rcu_node structure's ->lock
+ * across rcu_for_each_node_breadth_first() in order to
+ * synchronize with CPUs coming online for the first time.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->completed = rsp->completed;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_root->lock); /* already disabled. */
}
- local_irq_restore(flags);
+ raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
return;
}

@@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
rsp->gp_max = gp_duration;
rsp->completed = rsp->gpnum;
rsp->signaled = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
}

/*
@@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp_init;

/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
/* Exclude any attempts to start a new GP on large systems. */
raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */

+ /*
+ * Initialize any rcu_node structures that will see their first use.
+ * Note that rcu_max_cpu cannot change out from under us because the
+ * hotplug locks are held.
+ */
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+ rnp_init <= rdp->mynode;
+ rnp_init++) {
+ rnp_init->gpnum = rsp->gpnum;
+ rnp_init->completed = rsp->completed;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
+ if (cpu > rcu_max_cpu) {
+ smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+ rcu_max_cpu = cpu;
+ smp_mb(); /* rcu_max_cpu assignment before later uses. */
+ }
}

/*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,23 @@ struct rcu_node {

/*
* Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure. The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
*/
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+ int ret;
+
+ smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
+ ret = rcu_max_cpu;
+ smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
+ return ret;
+}
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+ (rnp)++)

/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 16, 2012, 8:46 AM

Post #20 of 32 (310 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Thu, Mar 15, 2012 at 02:07:53PM -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 11:23:14AM -0700, Paul E. McKenney wrote:
> > On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> > > On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > > > > > > The following builds, but is only very lightly tested. Probably full
> > > > > > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > > > >
> > > > > > > > You didn't say RFT, but...
> > > > > > > >
> > > > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > > > the below? My box may well answer that before you can.. hope not ;-)
> > > > > > >
> > > > > > > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > > > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > > >
> > > > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
> > > > >
> > > > > Right... Bozo here forgot to set the kernel parameters for large-system
> > > > > emulation during testing. Apologies for the busted patch, will fix.
> > > > >
> > > > > And thank you both for the testing!!!
> > > > >
> > > > > Hey, at least I labeled it "RFC". ;-)
> > > >
> > > > Does the following work better? It does pass my fake-big-system tests
> > > > (more testing in the works).
> > >
> > > This one stalls for me at the same place the other one did. Once again,
> > > if I remove the patch and rebuild, it boots just fine.
> > >
> > > Is there some debug/trace information that you would like me to provide?
> >
> > Very strange.
> >
> > Could you please send your dmesg and .config?
>
> Hmmm... Memory ordering could be a problem, though in that case I would
> have expected the hang during the onlining process. However, the memory
> ordering does need to be cleaned up in any case, please see below.
>
After testing this on 3.3.0-rc7+ I can say that this very much improves the
latency in the two rcu_for_each_node_breadth_first() loops.

Without the patch, under moderate load and while running an interrupt latency
test, I see the majority of loops taking 100-200 usec.

With the patch there are a few that take between 20-30, the rest are below
that.

Not that everything is OK latency-wise in RCU land. There is still an
interrupt holdoff in force_quiescent_state() that is taking > 100usec,
with or without the patch. I'm having difficulty finding exactly where
the other holdoff is happening because the kernel isn't accepting my nmi
handler.

That said, this fix is a nice improvement in those two loops.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 16, 2012, 10:21 AM

Post #21 of 32 (306 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, Mar 16, 2012 at 10:46:23AM -0500, Dimitri Sivanich wrote:
> On Thu, Mar 15, 2012 at 02:07:53PM -0700, Paul E. McKenney wrote:
> > On Thu, Mar 15, 2012 at 11:23:14AM -0700, Paul E. McKenney wrote:
> > > On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> > > > On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > > > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote:
> > > > > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote:
> > > > > > > > > > The following builds, but is only very lightly tested. Probably full
> > > > > > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > > > > >
> > > > > > > > > You didn't say RFT, but...
> > > > > > > > >
> > > > > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > > > > the below? My box may well answer that before you can.. hope not ;-)
> > > > > > > >
> > > > > > > > (Darn, it did. Box says boot stall with virgin patch in tip too though.
> > > > > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > > > >
> > > > > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV. Boots fine without the patch.
> > > > > >
> > > > > > Right... Bozo here forgot to set the kernel parameters for large-system
> > > > > > emulation during testing. Apologies for the busted patch, will fix.
> > > > > >
> > > > > > And thank you both for the testing!!!
> > > > > >
> > > > > > Hey, at least I labeled it "RFC". ;-)
> > > > >
> > > > > Does the following work better? It does pass my fake-big-system tests
> > > > > (more testing in the works).
> > > >
> > > > This one stalls for me at the same place the other one did. Once again,
> > > > if I remove the patch and rebuild, it boots just fine.
> > > >
> > > > Is there some debug/trace information that you would like me to provide?
> > >
> > > Very strange.
> > >
> > > Could you please send your dmesg and .config?
> >
> > Hmmm... Memory ordering could be a problem, though in that case I would
> > have expected the hang during the onlining process. However, the memory
> > ordering does need to be cleaned up in any case, please see below.
> >
> After testing this on 3.3.0-rc7+ I can say that this very much improves the
> latency in the two rcu_for_each_node_breadth_first() loops.
>
> Without the patch, under moderate load and while running an interrupt latency
> test, I see the majority of loops taking 100-200 usec.
>
> With the patch there are a few that take between 20-30, the rest are below
> that.
>
> Not that everything is OK latency-wise in RCU land. There is still an
> interrupt holdoff in force_quiescent_state() that is taking > 100usec,
> with or without the patch. I'm having difficulty finding exactly where
> the other holdoff is happening because the kernel isn't accepting my nmi
> handler.

Please see my subsequent patch for force_quiescent_state(). ;-)

It is at https://lkml.org/lkml/2012/3/16/286 in case you missed it.

> That said, this fix is a nice improvement in those two loops.

Glad it helps, I have documented the improvement in the commit message.

Thank you both again for your testing efforts!

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 16, 2012, 10:28 AM

Post #22 of 32 (307 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote:
> > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote:
> > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote:
>
> > > > Could I try your 3.0 enterprise patch?
> > >
> > > Sure, v3 below.
> >
> > Erm, a bit of that went missing. I'll put it back.
>
> OK, box finally finished rebuild/boot.
>

This patch also shows great improvement in the two
rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
10 in initial testing).

However, there are spinlock holdoffs at the following tracebacks (my nmi
handler does work on the 3.0 kernel):

[ 584.157019] [<ffffffff8144c5a0>] nmi+0x20/0x30
[ 584.157023] [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
[ 584.157026] [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
[ 584.157030] [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
[ 584.157033] [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
[ 584.157037] [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
[ 584.157041] [<ffffffff81061eaf>] __do_softirq+0xef/0x220
[ 584.157044] [<ffffffff81454cbc>] call_softirq+0x1c/0x30
[ 584.157048] [<ffffffff810043a5>] do_softirq+0x65/0xa0
[ 584.157051] [<ffffffff81061c85>] irq_exit+0xb5/0xe0
[ 584.157054] [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
[ 584.157057] [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
[ 584.157061] [<ffffffff8102b352>] native_safe_halt+0x2/0x10
[ 584.157064] [<ffffffff8100adf5>] default_idle+0x145/0x150
[ 584.157067] [<ffffffff810020c6>] cpu_idle+0x66/0xc0

>
> rcu: Limit GP initialization to CPUs that have been online
>
> The current grace-period initialization initializes all leaf rcu_node
> structures, even those corresponding to CPUs that have never been online.
> This is harmless in many configurations, but results in 200-microsecond
> latency spikes for kernels built with NR_CPUS=4096.
>
> This commit therefore keeps track of the largest-numbered CPU that has
> ever been online, and limits grace-period initialization to rcu_node
> structures corresponding to that CPU and to smaller-numbered CPUs.
>
> Reported-by: Dimitri Sivanich <sivanich [at] sgi>
> Signed-off-by: Paul E. McKenney <paulmck [at] linux>
> Acked-by: Mike Galbraith <mgalbraith [at] suse>
>
> ---
> kernel/rcutree.c | 36 ++++++++++++++++++++++++++++++++----
> kernel/rcutree.h | 16 ++++++++++++++--
> 2 files changed, 46 insertions(+), 6 deletions(-)
>
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
>
> static struct rcu_state *rcu_state;
>
> +int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
> +
> /*
> * The rcu_scheduler_active variable transitions from zero to one just
> * before the first task is spawned. So when this variable is zero, RCU
> @@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
> struct rcu_node *rnp = rcu_get_root(rsp);
>
> if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> + struct rcu_node *rnp_root = rnp;
> +
> if (cpu_needs_another_gp(rsp, rdp))
> rsp->fqs_need_gp = 1;
> if (rnp->completed == rsp->completed) {
> - raw_spin_unlock_irqrestore(&rnp->lock, flags);
> + raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> return;
> }
> - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
>
> /*
> * Propagate new ->completed value to rcu_node structures
> * so that other CPUs don't have to wait until the start
> * of the next grace period to process their callbacks.
> + * We must hold the root rcu_node structure's ->lock
> + * across rcu_for_each_node_breadth_first() in order to
> + * synchronize with CPUs coming online for the first time.
> */
> rcu_for_each_node_breadth_first(rsp, rnp) {
> + raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
> raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> rnp->completed = rsp->completed;
> raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> + raw_spin_lock(&rnp_root->lock); /* already disabled. */
> }
> - local_irq_restore(flags);
> + raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> return;
> }
>
> @@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
> rsp->gp_max = gp_duration;
> rsp->completed = rsp->gpnum;
> rsp->signaled = RCU_GP_IDLE;
> - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
> + rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
> }
>
> /*
> @@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
> unsigned long mask;
> struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
> struct rcu_node *rnp = rcu_get_root(rsp);
> + struct rcu_node *rnp_init;
>
> /* Set up local state, ensuring consistent view of global state. */
> raw_spin_lock_irqsave(&rnp->lock, flags);
> @@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
> /* Exclude any attempts to start a new GP on large systems. */
> raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
>
> + /*
> + * Initialize any rcu_node structures that will see their first use.
> + * Note that rcu_max_cpu cannot change out from under us because the
> + * hotplug locks are held.
> + */
> + raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> + for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> + rnp_init <= rdp->mynode;
> + rnp_init++) {
> + rnp_init->gpnum = rsp->gpnum;
> + rnp_init->completed = rsp->completed;
> + }
> + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> +
> /* Add CPU to rcu_node bitmasks. */
> rnp = rdp->mynode;
> mask = rdp->grpmask;
> @@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
> rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
> rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
> rcu_preempt_init_percpu_data(cpu);
> + if (cpu > rcu_max_cpu) {
> + smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> + rcu_max_cpu = cpu;
> + smp_mb(); /* rcu_max_cpu assignment before later uses. */
> + }
> }
>
> /*
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -191,11 +191,23 @@ struct rcu_node {
>
> /*
> * Do a full breadth-first scan of the rcu_node structures for the
> - * specified rcu_state structure.
> + * specified rcu_state structure. The caller must hold either the
> + * ->onofflock or the root rcu_node structure's ->lock.
> */
> +extern int rcu_max_cpu;
> +static inline int rcu_get_max_cpu(void)
> +{
> + int ret;
> +
> + smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
> + ret = rcu_max_cpu;
> + smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
> + return ret;
> +}
> #define rcu_for_each_node_breadth_first(rsp, rnp) \
> for ((rnp) = &(rsp)->node[0]; \
> - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> + (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> + (rnp)++)
>
> /*
> * Do a breadth-first scan of the non-leaf rcu_node structures for the
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


paulmck at linux

Mar 16, 2012, 10:51 AM

Post #23 of 32 (308 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, Mar 16, 2012 at 12:28:50PM -0500, Dimitri Sivanich wrote:
> On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> > On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote:
> > > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote:
> > > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote:
> >
> > > > > Could I try your 3.0 enterprise patch?
> > > >
> > > > Sure, v3 below.
> > >
> > > Erm, a bit of that went missing. I'll put it back.
> >
> > OK, box finally finished rebuild/boot.
> >
>
> This patch also shows great improvement in the two
> rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> 10 in initial testing).
>
> However, there are spinlock holdoffs at the following tracebacks (my nmi
> handler does work on the 3.0 kernel):
>
> [ 584.157019] [<ffffffff8144c5a0>] nmi+0x20/0x30
> [ 584.157023] [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> [ 584.157026] [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170

This is a holdoff, not a deadlock hang, correct?

If so...

Presumably this is the last raw_spin_lock_irqsave() in force_qs_rnp(),
the one right before the call to rcu_initiate_boost(). Could you please
verify for me?

If so, someone is holding the root rcu_node structure's lock for longer
than is good. Or that there is significant contention on that lock,
which there might well be on large configurations. Any info on who
is holding or contending for this lock would be very helpful!

Are you running TREE_RCU or TREE_PREEMPT_RCU?

Thanx, Paul

> [ 584.157030] [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> [ 584.157033] [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> [ 584.157037] [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> [ 584.157041] [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> [ 584.157044] [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> [ 584.157048] [<ffffffff810043a5>] do_softirq+0x65/0xa0
> [ 584.157051] [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> [ 584.157054] [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> [ 584.157057] [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> [ 584.157061] [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> [ 584.157064] [<ffffffff8100adf5>] default_idle+0x145/0x150
> [ 584.157067] [<ffffffff810020c6>] cpu_idle+0x66/0xc0
>
> >
> > rcu: Limit GP initialization to CPUs that have been online
> >
> > The current grace-period initialization initializes all leaf rcu_node
> > structures, even those corresponding to CPUs that have never been online.
> > This is harmless in many configurations, but results in 200-microsecond
> > latency spikes for kernels built with NR_CPUS=4096.
> >
> > This commit therefore keeps track of the largest-numbered CPU that has
> > ever been online, and limits grace-period initialization to rcu_node
> > structures corresponding to that CPU and to smaller-numbered CPUs.
> >
> > Reported-by: Dimitri Sivanich <sivanich [at] sgi>
> > Signed-off-by: Paul E. McKenney <paulmck [at] linux>
> > Acked-by: Mike Galbraith <mgalbraith [at] suse>
> >
> > ---
> > kernel/rcutree.c | 36 ++++++++++++++++++++++++++++++++----
> > kernel/rcutree.h | 16 ++++++++++++++--
> > 2 files changed, 46 insertions(+), 6 deletions(-)
> >
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
> >
> > static struct rcu_state *rcu_state;
> >
> > +int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
> > +
> > /*
> > * The rcu_scheduler_active variable transitions from zero to one just
> > * before the first task is spawned. So when this variable is zero, RCU
> > @@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
> > struct rcu_node *rnp = rcu_get_root(rsp);
> >
> > if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> > + struct rcu_node *rnp_root = rnp;
> > +
> > if (cpu_needs_another_gp(rsp, rdp))
> > rsp->fqs_need_gp = 1;
> > if (rnp->completed == rsp->completed) {
> > - raw_spin_unlock_irqrestore(&rnp->lock, flags);
> > + raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> > return;
> > }
> > - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> >
> > /*
> > * Propagate new ->completed value to rcu_node structures
> > * so that other CPUs don't have to wait until the start
> > * of the next grace period to process their callbacks.
> > + * We must hold the root rcu_node structure's ->lock
> > + * across rcu_for_each_node_breadth_first() in order to
> > + * synchronize with CPUs coming online for the first time.
> > */
> > rcu_for_each_node_breadth_first(rsp, rnp) {
> > + raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
> > raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> > rnp->completed = rsp->completed;
> > raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > + raw_spin_lock(&rnp_root->lock); /* already disabled. */
> > }
> > - local_irq_restore(flags);
> > + raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> > return;
> > }
> >
> > @@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
> > rsp->gp_max = gp_duration;
> > rsp->completed = rsp->gpnum;
> > rsp->signaled = RCU_GP_IDLE;
> > - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
> > + rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
> > }
> >
> > /*
> > @@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
> > unsigned long mask;
> > struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
> > struct rcu_node *rnp = rcu_get_root(rsp);
> > + struct rcu_node *rnp_init;
> >
> > /* Set up local state, ensuring consistent view of global state. */
> > raw_spin_lock_irqsave(&rnp->lock, flags);
> > @@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
> > /* Exclude any attempts to start a new GP on large systems. */
> > raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
> >
> > + /*
> > + * Initialize any rcu_node structures that will see their first use.
> > + * Note that rcu_max_cpu cannot change out from under us because the
> > + * hotplug locks are held.
> > + */
> > + raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> > + for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> > + rnp_init <= rdp->mynode;
> > + rnp_init++) {
> > + rnp_init->gpnum = rsp->gpnum;
> > + rnp_init->completed = rsp->completed;
> > + }
> > + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > +
> > /* Add CPU to rcu_node bitmasks. */
> > rnp = rdp->mynode;
> > mask = rdp->grpmask;
> > @@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
> > rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
> > rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
> > rcu_preempt_init_percpu_data(cpu);
> > + if (cpu > rcu_max_cpu) {
> > + smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> > + rcu_max_cpu = cpu;
> > + smp_mb(); /* rcu_max_cpu assignment before later uses. */
> > + }
> > }
> >
> > /*
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -191,11 +191,23 @@ struct rcu_node {
> >
> > /*
> > * Do a full breadth-first scan of the rcu_node structures for the
> > - * specified rcu_state structure.
> > + * specified rcu_state structure. The caller must hold either the
> > + * ->onofflock or the root rcu_node structure's ->lock.
> > */
> > +extern int rcu_max_cpu;
> > +static inline int rcu_get_max_cpu(void)
> > +{
> > + int ret;
> > +
> > + smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
> > + ret = rcu_max_cpu;
> > + smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
> > + return ret;
> > +}
> > #define rcu_for_each_node_breadth_first(rsp, rnp) \
> > for ((rnp) = &(rsp)->node[0]; \
> > - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> > + (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> > + (rnp)++)
> >
> > /*
> > * Do a breadth-first scan of the non-leaf rcu_node structures for the
> >
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


sivanich at sgi

Mar 16, 2012, 10:56 AM

Post #24 of 32 (306 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, Mar 16, 2012 at 10:51:42AM -0700, Paul E. McKenney wrote:
> On Fri, Mar 16, 2012 at 12:28:50PM -0500, Dimitri Sivanich wrote:
> > On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> > > On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote:
> > > > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote:
> > > > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote:
> > >
> > > > > > Could I try your 3.0 enterprise patch?
> > > > >
> > > > > Sure, v3 below.
> > > >
> > > > Erm, a bit of that went missing. I'll put it back.
> > >
> > > OK, box finally finished rebuild/boot.
> > >
> >
> > This patch also shows great improvement in the two
> > rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> > 10 in initial testing).
> >
> > However, there are spinlock holdoffs at the following tracebacks (my nmi
> > handler does work on the 3.0 kernel):
> >
> > [ 584.157019] [<ffffffff8144c5a0>] nmi+0x20/0x30
> > [ 584.157023] [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> > [ 584.157026] [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
>
> This is a holdoff, not a deadlock hang, correct?

Correct. Likely < 200 usec.

>
> If so...
>
> Presumably this is the last raw_spin_lock_irqsave() in force_qs_rnp(),
> the one right before the call to rcu_initiate_boost(). Could you please
> verify for me?

Stay tuned. I need to reproduce this without the extra tracecode I had
added around the loops this morning (it -shouldn't- have had an effect..).
>
> If so, someone is holding the root rcu_node structure's lock for longer
> than is good. Or that there is significant contention on that lock,
> which there might well be on large configurations. Any info on who
> is holding or contending for this lock would be very helpful!
>
> Are you running TREE_RCU or TREE_PREEMPT_RCU?

TREE_RCU


>
> Thanx, Paul
>
> > [ 584.157030] [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> > [ 584.157033] [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> > [ 584.157037] [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> > [ 584.157041] [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> > [ 584.157044] [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> > [ 584.157048] [<ffffffff810043a5>] do_softirq+0x65/0xa0
> > [ 584.157051] [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> > [ 584.157054] [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> > [ 584.157057] [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> > [ 584.157061] [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> > [ 584.157064] [<ffffffff8100adf5>] default_idle+0x145/0x150
> > [ 584.157067] [<ffffffff810020c6>] cpu_idle+0x66/0xc0
> >
> > >
> > > rcu: Limit GP initialization to CPUs that have been online
> > >
> > > The current grace-period initialization initializes all leaf rcu_node
> > > structures, even those corresponding to CPUs that have never been online.
> > > This is harmless in many configurations, but results in 200-microsecond
> > > latency spikes for kernels built with NR_CPUS=4096.
> > >
> > > This commit therefore keeps track of the largest-numbered CPU that has
> > > ever been online, and limits grace-period initialization to rcu_node
> > > structures corresponding to that CPU and to smaller-numbered CPUs.
> > >
> > > Reported-by: Dimitri Sivanich <sivanich [at] sgi>
> > > Signed-off-by: Paul E. McKenney <paulmck [at] linux>
> > > Acked-by: Mike Galbraith <mgalbraith [at] suse>
> > >
> > > ---
> > > kernel/rcutree.c | 36 ++++++++++++++++++++++++++++++++----
> > > kernel/rcutree.h | 16 ++++++++++++++--
> > > 2 files changed, 46 insertions(+), 6 deletions(-)
> > >
> > > --- a/kernel/rcutree.c
> > > +++ b/kernel/rcutree.c
> > > @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
> > >
> > > static struct rcu_state *rcu_state;
> > >
> > > +int rcu_max_cpu __read_mostly; /* Largest # CPU that has ever been online. */
> > > +
> > > /*
> > > * The rcu_scheduler_active variable transitions from zero to one just
> > > * before the first task is spawned. So when this variable is zero, RCU
> > > @@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
> > > struct rcu_node *rnp = rcu_get_root(rsp);
> > >
> > > if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> > > + struct rcu_node *rnp_root = rnp;
> > > +
> > > if (cpu_needs_another_gp(rsp, rdp))
> > > rsp->fqs_need_gp = 1;
> > > if (rnp->completed == rsp->completed) {
> > > - raw_spin_unlock_irqrestore(&rnp->lock, flags);
> > > + raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> > > return;
> > > }
> > > - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > >
> > > /*
> > > * Propagate new ->completed value to rcu_node structures
> > > * so that other CPUs don't have to wait until the start
> > > * of the next grace period to process their callbacks.
> > > + * We must hold the root rcu_node structure's ->lock
> > > + * across rcu_for_each_node_breadth_first() in order to
> > > + * synchronize with CPUs coming online for the first time.
> > > */
> > > rcu_for_each_node_breadth_first(rsp, rnp) {
> > > + raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
> > > raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> > > rnp->completed = rsp->completed;
> > > raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > > + raw_spin_lock(&rnp_root->lock); /* already disabled. */
> > > }
> > > - local_irq_restore(flags);
> > > + raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> > > return;
> > > }
> > >
> > > @@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
> > > rsp->gp_max = gp_duration;
> > > rsp->completed = rsp->gpnum;
> > > rsp->signaled = RCU_GP_IDLE;
> > > - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
> > > + rcu_start_gp(rsp, flags); /* releases root node's ->lock. */
> > > }
> > >
> > > /*
> > > @@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
> > > unsigned long mask;
> > > struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
> > > struct rcu_node *rnp = rcu_get_root(rsp);
> > > + struct rcu_node *rnp_init;
> > >
> > > /* Set up local state, ensuring consistent view of global state. */
> > > raw_spin_lock_irqsave(&rnp->lock, flags);
> > > @@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
> > > /* Exclude any attempts to start a new GP on large systems. */
> > > raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
> > >
> > > + /*
> > > + * Initialize any rcu_node structures that will see their first use.
> > > + * Note that rcu_max_cpu cannot change out from under us because the
> > > + * hotplug locks are held.
> > > + */
> > > + raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> > > + for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> > > + rnp_init <= rdp->mynode;
> > > + rnp_init++) {
> > > + rnp_init->gpnum = rsp->gpnum;
> > > + rnp_init->completed = rsp->completed;
> > > + }
> > > + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > > +
> > > /* Add CPU to rcu_node bitmasks. */
> > > rnp = rdp->mynode;
> > > mask = rdp->grpmask;
> > > @@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
> > > rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
> > > rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
> > > rcu_preempt_init_percpu_data(cpu);
> > > + if (cpu > rcu_max_cpu) {
> > > + smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> > > + rcu_max_cpu = cpu;
> > > + smp_mb(); /* rcu_max_cpu assignment before later uses. */
> > > + }
> > > }
> > >
> > > /*
> > > --- a/kernel/rcutree.h
> > > +++ b/kernel/rcutree.h
> > > @@ -191,11 +191,23 @@ struct rcu_node {
> > >
> > > /*
> > > * Do a full breadth-first scan of the rcu_node structures for the
> > > - * specified rcu_state structure.
> > > + * specified rcu_state structure. The caller must hold either the
> > > + * ->onofflock or the root rcu_node structure's ->lock.
> > > */
> > > +extern int rcu_max_cpu;
> > > +static inline int rcu_get_max_cpu(void)
> > > +{
> > > + int ret;
> > > +
> > > + smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
> > > + ret = rcu_max_cpu;
> > > + smp_mb(); /* Pairs with barriers in rcu_prepare_cpu(). */
> > > + return ret;
> > > +}
> > > #define rcu_for_each_node_breadth_first(rsp, rnp) \
> > > for ((rnp) = &(rsp)->node[0]; \
> > > - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> > > + (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> > > + (rnp)++)
> > >
> > > /*
> > > * Do a breadth-first scan of the non-leaf rcu_node structures for the
> > >
> >
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


efault at gmx

Mar 16, 2012, 12:11 PM

Post #25 of 32 (307 views)
Permalink
Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online [In reply to]

On Fri, 2012-03-16 at 12:28 -0500, Dimitri Sivanich wrote:

> However, there are spinlock holdoffs at the following tracebacks (my nmi
> handler does work on the 3.0 kernel):

I'll try wedging/testing the subsequent patch Paul mentioned in both RT,
and NR_CPUS=max NOPREEMPT kernels.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.