Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions

 

 

Linux kernel RSS feed   Index | Next | Previous | View Threaded


jack at suse

May 15, 2012, 8:43 AM

Post #1 of 14 (211 views)
Permalink
[PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions

Convert the calculation of the proportion of writeback each bdi does to the
new flexible proportion code. That allows us to use an aging period of fixed
wallclock time, which gives better proportion estimates given the hugely
varying throughput of different devices.

Signed-off-by: Jan Kara <jack [at] suse>
---
include/linux/backing-dev.h | 6 +-
mm/backing-dev.c | 5 +-
mm/page-writeback.c | 91 +++++++++++++++++++++++--------------------
3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd..64a3617 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@

#include <linux/percpu_counter.h>
#include <linux/log2.h>
-#include <linux/proportions.h>
+#include <linux/flex_proportions.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
@@ -89,11 +89,11 @@ struct backing_dev_info {
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;

- struct prop_local_percpu completions;
+ struct fprop_local_percpu completions;
int dirty_exceeded;

unsigned int min_ratio;
- unsigned int max_ratio, max_prop_frac;
+ unsigned int max_ratio;

struct bdi_writeback wb; /* default writeback info for this bdi */
spinlock_t wb_lock; /* protects work_list */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aa..f3a2608 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,6 @@ int bdi_init(struct backing_dev_info *bdi)

bdi->min_ratio = 0;
bdi->max_ratio = 100;
- bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +699,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;

- err = prop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions);

if (err) {
err:
@@ -744,7 +743,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);

- prop_local_destroy_percpu(&bdi->completions);
+ fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8..97c6396 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
+#include <linux/timer.h>
#include <trace/events/writeback.h>

/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
* measured in page writeback completions.
*
*/
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+ TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+bool writeout_period_timer_running = false;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

/*
* Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
zone_page_state(zone, NR_WRITEBACK) <= limit;
}

-/*
- * couple the period to the dirty_ratio:
- *
- * period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
- unsigned long dirty_total;
-
- if (vm_dirty_bytes)
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
- else
- dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
- 100;
- return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
- int shift = calc_period_shift();
- prop_change_shift(&vm_completions, shift);
-
- writeback_set_ratelimit();
-}
-
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,

ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_bytes = 0;
}
return ret;
@@ -398,7 +384,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,

ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
return ret;
@@ -411,8 +397,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
__inc_bdi_stat(bdi, BDI_WRITTEN);
- __prop_inc_percpu_max(&vm_completions, &bdi->completions,
- bdi->max_prop_frac);
+ __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+ bdi->max_ratio);
+ /*
+ * This check is racy but it's not important which of the currently
+ * running events will arm the timer or even whether we lose the race
+ * with writeout_period() and writeout_period_timer_running will be
+ * false despite the timer being armed...
+ */
+ if (!writeout_period_timer_running) {
+ writeout_period_timer_running = true;
+ mod_timer(&writeout_period_timer,
+ jiffies + VM_COMPLETIONS_PERIOD_LEN);
+ }
}

void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,10 +428,25 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ fprop_fraction_percpu(&writeout_completions, &bdi->completions,
numerator, denominator);
}

+
+static void writeout_period(unsigned long t)
+{
+ if (fprop_new_period(&writeout_completions)) {
+ mod_timer(&writeout_period_timer,
+ jiffies + VM_COMPLETIONS_PERIOD_LEN);
+ } else {
+ /*
+ * Aging has zeroed all fractions. Stop wasting CPU on period
+ * updates.
+ */
+ writeout_period_timer_running = false;
+ }
+}
+
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
@@ -471,12 +483,10 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
return -EINVAL;

spin_lock_bh(&bdi_lock);
- if (bdi->min_ratio > max_ratio) {
+ if (bdi->min_ratio > max_ratio)
ret = -EINVAL;
- } else {
+ else
bdi->max_ratio = max_ratio;
- bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
- }
spin_unlock_bh(&bdi_lock);

return ret;
@@ -1605,13 +1615,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
- int shift;
-
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);

- shift = calc_period_shift();
- prop_descriptor_init(&vm_completions, shift);
+ fprop_global_init(&writeout_completions);
}

/**
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

May 17, 2012, 3:04 PM

Post #2 of 14 (204 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> +static struct timer_list writeout_period_timer =
> + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);

So the problem with using a deferred timer is that it 'ignores' idle
time. So if a very busy period is followed by a real quiet period you'd
expect all the proportions to have aged to 0, but they won't have.

One way to solve that is to track a jiffies count of the last time the
timer triggered and compute the missed periods from that and extend
fprop_new_period() to deal with period increments of more than 1.

The other is of course to not use deferred timers.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jack at suse

May 18, 2012, 7:24 AM

Post #3 of 14 (204 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Fri 18-05-12 00:04:33, Peter Zijlstra wrote:
> On Tue, 2012-05-15 at 17:43 +0200, Jan Kara wrote:
> > +static struct timer_list writeout_period_timer =
> > + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
>
> So the problem with using a deferred timer is that it 'ignores' idle
> time. So if a very busy period is followed by a real quiet period you'd
> expect all the proportions to have aged to 0, but they won't have.
Ah, I see. Thanks for warning me.

> One way to solve that is to track a jiffies count of the last time the
> timer triggered and compute the missed periods from that and extend
> fprop_new_period() to deal with period increments of more than 1.
Yeah, that should be easy enough, so I'll try it that way since I presume
it's better for power usage to use deferred timers if it's reasonably
possible.

Honza
--
Jan Kara <jack [at] suse>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

May 18, 2012, 7:34 AM

Post #4 of 14 (205 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Fri, 2012-05-18 at 16:24 +0200, Jan Kara wrote:
> Yeah, that should be easy enough so I'll try it that way since I presume
> it's nicer to power usage to use deferred timers if it's reasonably
> possible.

Btw, your current scheme also drifts. Since you do jiffies + 3*HZ, your
period might actually be longer if the timer got delayed.

If you keep an external jiffies count like:

unsigned long period_jiffies = jiffies;

void my_timer_func()
{
unsigned long delta = jiffies - period_jiffies;
unsigned long periods = delta / 3*HZ;

age(periods);

period_jiffies += 3*HZ * periods;
mod_timer(&my_timer, period_jiffies);
}


it all works without drift (+- bugs of course).
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jack at suse

May 24, 2012, 9:59 AM

Post #5 of 14 (205 views)
Permalink
[PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

Convert the calculation of the proportion of writeback each bdi does to the
new flexible proportion code. That allows us to use an aging period of fixed
wallclock time, which gives better proportion estimates given the hugely
varying throughput of different devices.

Signed-off-by: Jan Kara <jack [at] suse>
---
include/linux/backing-dev.h | 4 +-
mm/backing-dev.c | 6 +-
mm/page-writeback.c | 103 ++++++++++++++++++++++++++----------------
3 files changed, 69 insertions(+), 44 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd..489de62 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@

#include <linux/percpu_counter.h>
#include <linux/log2.h>
-#include <linux/proportions.h>
+#include <linux/flex_proportions.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
@@ -89,7 +89,7 @@ struct backing_dev_info {
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;

- struct prop_local_percpu completions;
+ struct fprop_local_percpu completions;
int dirty_exceeded;

unsigned int min_ratio;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aa..3387aea 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi)

bdi->min_ratio = 0;
bdi->max_ratio = 100;
- bdi->max_prop_frac = PROP_FRAC_BASE;
+ bdi->max_prop_frac = FPROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;

- err = prop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions);

if (err) {
err:
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);

- prop_local_destroy_percpu(&bdi->completions);
+ fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8..647daa3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
+#include <linux/timer.h>
#include <trace/events/writeback.h>

/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
* measured in page writeback completions.
*
*/
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+ TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

/*
* Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
zone_page_state(zone, NR_WRITEBACK) <= limit;
}

-/*
- * couple the period to the dirty_ratio:
- *
- * period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
- unsigned long dirty_total;
-
- if (vm_dirty_bytes)
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
- else
- dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
- 100;
- return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
- int shift = calc_period_shift();
- prop_change_shift(&vm_completions, shift);
-
- writeback_set_ratelimit();
-}
-
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,

ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_bytes = 0;
}
return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,

ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
return ret;
}

+static unsigned long wp_next_time(unsigned long cur_time)
+{
+ cur_time += VM_COMPLETIONS_PERIOD_LEN;
+ /* 0 has a special meaning... */
+ if (!cur_time)
+ return 1;
+ return cur_time;
+}
+
/*
* Increment the BDI's writeout completion count and the global writeout
* completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
__inc_bdi_stat(bdi, BDI_WRITTEN);
- __prop_inc_percpu_max(&vm_completions, &bdi->completions,
- bdi->max_prop_frac);
+ __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+ bdi->max_prop_frac);
+ /* First event after period switching was turned off? */
+ if (!unlikely(writeout_period_time)) {
+ /*
+ * We can race with other __bdi_writeout_inc calls here but
+ * it does not cause any harm since the resulting time when
+ * timer will fire and what is in writeout_period_time will be
+ * roughly the same.
+ */
+ writeout_period_time = wp_next_time(jiffies);
+ mod_timer(&writeout_period_timer, writeout_period_time);
+ }
}

void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ fprop_fraction_percpu(&writeout_completions, &bdi->completions,
numerator, denominator);
}

/*
+ * On an idle system, we can be called long after we were scheduled because
+ * we use deferred timers, so account for missed periods.
+ */
+static void writeout_period(unsigned long t)
+{
+ int miss_periods = (jiffies - writeout_period_time) /
+ VM_COMPLETIONS_PERIOD_LEN;
+
+ if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
+ writeout_period_time = wp_next_time(writeout_period_time +
+ miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+ mod_timer(&writeout_period_timer, writeout_period_time);
+ } else {
+ /*
+ * Aging has zeroed all fractions. Stop wasting CPU on period
+ * updates.
+ */
+ writeout_period_time = 0;
+ }
+}
+
+/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
* exceed 100%.
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
- bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+ bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
}
spin_unlock_bh(&bdi_lock);

@@ -1605,13 +1633,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
- int shift;
-
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);

- shift = calc_period_shift();
- prop_descriptor_init(&vm_completions, shift);
+ fprop_global_init(&writeout_completions);
}

/**
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


levinsasha928 at gmail

May 28, 2012, 8:49 AM

Post #6 of 14 (198 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

Hi Jan,

On Thu, 2012-05-24 at 18:59 +0200, Jan Kara wrote:
> Convert calculations of proportion of writeback each bdi does to new flexible
> proportion code. That allows us to use aging period of fixed wallclock time
> which gives better proportion estimates given the hugely varying throughput of
> different devices.
>
> Signed-off-by: Jan Kara <jack [at] suse>
> ---

This patch appears to be causing lockdep warnings over here:

[ 20.545016] =================================
[ 20.545016] [ INFO: inconsistent lock state ]
[ 20.545016] 3.4.0-next-20120528-sasha-00008-g11ef39f #307 Tainted: G W
[ 20.545016] ---------------------------------
[ 20.545016] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
[ 20.545016] rcu_torture_rea/2493 [HC0[0]:SC1[1]:HE1:SE0] takes:
[ 20.545016] (key#3){?.-...}, at: [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
[ 20.545016] {IN-HARDIRQ-W} state was registered at:
[ 20.545016] [<ffffffff8114ffab>] mark_irqflags+0x6b/0x170
[ 20.545016] [<ffffffff811519bb>] __lock_acquire+0x2bb/0x4c0
[ 20.545016] [<ffffffff81151d4a>] lock_acquire+0x18a/0x1e0
[ 20.545016] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
[ 20.545016] [<ffffffff81993620>] __percpu_counter_add+0x50/0xb0
[ 20.545016] [<ffffffff8195b53a>] __fprop_inc_percpu_max+0x8a/0xa0
[ 20.545016] [<ffffffff811daf8d>] test_clear_page_writeback+0x12d/0x1c0
[ 20.545016] [<ffffffff811ccc44>] end_page_writeback+0x24/0x50
[ 20.545016] [<ffffffff8126ed2a>] end_buffer_async_write+0x26a/0x350
[ 20.545016] [<ffffffff8126bfdd>] end_bio_bh_io_sync+0x3d/0x50
[ 20.545016] [<ffffffff81270b59>] bio_endio+0x29/0x30
[ 20.545016] [<ffffffff819330e9>] req_bio_endio+0xb9/0xd0
[ 20.545016] [<ffffffff81936318>] blk_update_request+0x1a8/0x3c0
[ 20.545016] [<ffffffff81936552>] blk_update_bidi_request+0x22/0x90
[ 20.545016] [<ffffffff8193673c>] __blk_end_bidi_request+0x1c/0x40
[ 20.545016] [<ffffffff81936788>] __blk_end_request_all+0x28/0x40
[ 20.545016] [<ffffffff81e04f2e>] blk_done+0x9e/0xf0
[ 20.545016] [<ffffffff81afb106>] vring_interrupt+0x86/0xa0
[ 20.680186] [<ffffffff81187c01>] handle_irq_event_percpu+0x151/0x3e0
[ 20.680186] [<ffffffff81187ed3>] handle_irq_event+0x43/0x70
[ 20.680186] [<ffffffff8118b5a8>] handle_edge_irq+0xe8/0x120
[ 20.680186] [<ffffffff81069444>] handle_irq+0x164/0x180
[ 20.680186] [<ffffffff81068638>] do_IRQ+0x58/0xd0
[ 20.680186] [<ffffffff8325beef>] ret_from_intr+0x0/0x1a
[ 20.680186] [<ffffffff81937bed>] blk_queue_bio+0x30d/0x430
[ 20.680186] [<ffffffff8193423e>] generic_make_request+0xbe/0x120
[ 20.680186] [<ffffffff81934398>] submit_bio+0xf8/0x120
[ 20.680186] [<ffffffff8126bf72>] submit_bh+0x122/0x150
[ 20.680186] [<ffffffff8126ded7>] __block_write_full_page+0x287/0x3b0
[ 20.680186] [<ffffffff8126f2cc>] block_write_full_page_endio+0xfc/0x120
[ 20.680186] [<ffffffff8126f300>] block_write_full_page+0x10/0x20
[ 20.680186] [<ffffffff81273d83>] blkdev_writepage+0x13/0x20
[ 20.680186] [<ffffffff811d90c5>] __writepage+0x15/0x40
[ 20.680186] [<ffffffff811db78f>] write_cache_pages+0x49f/0x650
[ 20.680186] [<ffffffff811db98f>] generic_writepages+0x4f/0x70
[ 20.680186] [<ffffffff811db9ce>] do_writepages+0x1e/0x50
[ 20.680186] [<ffffffff811cd219>] __filemap_fdatawrite_range+0x49/0x50
[ 20.680186] [<ffffffff811cd44a>] filemap_fdatawrite+0x1a/0x20
[ 20.680186] [<ffffffff811cd475>] filemap_write_and_wait+0x25/0x50
[ 20.680186] [<ffffffff812740bd>] __sync_blockdev+0x2d/0x40
[ 20.680186] [<ffffffff812740de>] sync_blockdev+0xe/0x10
[ 20.680186] [<ffffffff813917d2>] journal_recover+0x182/0x1c0
[ 20.680186] [<ffffffff81396ae8>] journal_load+0x58/0xa0
[ 20.680186] [<ffffffff8132b750>] ext3_load_journal+0x200/0x2b0
[ 20.680186] [<ffffffff8132e2c8>] ext3_fill_super+0xc18/0x10d0
[ 20.680186] [<ffffffff8123c636>] mount_bdev+0x176/0x210
[ 20.680186] [<ffffffff81327e00>] ext3_mount+0x10/0x20
[ 20.680186] [<ffffffff8123bf75>] mount_fs+0x85/0x1a0
[ 20.680186] [<ffffffff812592a4>] vfs_kern_mount+0x74/0x100
[ 20.680186] [<ffffffff8125b991>] do_kern_mount+0x51/0x120
[ 20.680186] [<ffffffff8125bc34>] do_mount+0x1d4/0x240
[ 20.680186] [<ffffffff8125bd3d>] sys_mount+0x9d/0xe0
[ 20.680186] [<ffffffff84cb6232>] do_mount_root+0x1e/0x94
[ 20.680186] [<ffffffff84cb64c2>] mount_block_root+0xe2/0x224
[ 20.680186] [<ffffffff84cb672f>] mount_root+0x12b/0x136
[ 20.680186] [<ffffffff84cb689f>] prepare_namespace+0x165/0x19e
[ 20.680186] [<ffffffff84cb5afb>] kernel_init+0x274/0x28a
[ 20.680186] [<ffffffff8325dd34>] kernel_thread_helper+0x4/0x10
[ 20.680186] irq event stamp: 1551906
[ 20.680186] hardirqs last enabled at (1551906): [<ffffffff8325b7db>] _raw_spin_unlock_irq+0x2b/0x80
[ 20.680186] hardirqs last disabled at (1551905): [<ffffffff8325aea4>] _raw_spin_lock_irq+0x34/0xa0
[ 20.680186] softirqs last enabled at (1551022): [<ffffffff810e316b>] __do_softirq+0x3db/0x460
[ 20.680186] softirqs last disabled at (1551903): [<ffffffff8325de2c>] call_softirq+0x1c/0x30
[ 20.680186]
[ 20.680186] other info that might help us debug this:
[ 20.680186] Possible unsafe locking scenario:
[ 20.680186]
[ 20.680186] CPU0
[ 20.680186] ----
[ 20.680186] lock(key#3);
[ 20.680186] <Interrupt>
[ 20.680186] lock(key#3);
[ 20.680186]
[ 20.680186] *** DEADLOCK ***
[ 20.680186]
[ 20.680186] 2 locks held by rcu_torture_rea/2493:
[ 20.680186] #0: (rcu_read_lock){.+.+..}, at: [<ffffffff811914f0>] rcu_torture_read_lock+0x0/0x80
[ 20.680186] #1: (mm/page-writeback.c:144){+.-...}, at: [<ffffffff810ebf90>] call_timer_fn+0x0/0x260
[ 20.680186]
[ 20.680186] stack backtrace:
[ 20.680186] Pid: 2493, comm: rcu_torture_rea Tainted: G W 3.4.0-next-20120528-sasha-00008-g11ef39f #307
[ 20.680186] Call Trace:
[ 20.680186] <IRQ> [<ffffffff8114f6b9>] print_usage_bug+0x1a9/0x1d0
[ 20.680186] [<ffffffff8114eed0>] ? check_usage_forwards+0xf0/0xf0
[ 20.680186] [<ffffffff8114fb99>] mark_lock_irq+0xc9/0x270
[ 20.680186] [<ffffffff8114fe5d>] mark_lock+0x11d/0x200
[ 20.680186] [<ffffffff81150030>] mark_irqflags+0xf0/0x170
[ 20.680186] [<ffffffff811519bb>] __lock_acquire+0x2bb/0x4c0
[ 20.680186] [<ffffffff81151d4a>] lock_acquire+0x18a/0x1e0
[ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
[ 20.680186] [<ffffffff811d9260>] ? laptop_io_completion+0x30/0x30
[ 20.680186] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
[ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
[ 20.680186] [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
[ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
[ 20.680186] [<ffffffff8195b5c2>] fprop_new_period+0x12/0x60
[ 20.680186] [<ffffffff811d929d>] writeout_period+0x3d/0xa0
[ 20.680186] [<ffffffff810ec0bf>] call_timer_fn+0x12f/0x260
[ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
[ 20.680186] [<ffffffff8325b7db>] ? _raw_spin_unlock_irq+0x2b/0x80
[ 20.680186] [<ffffffff811d9260>] ? laptop_io_completion+0x30/0x30
[ 20.680186] [<ffffffff810ecd6e>] run_timer_softirq+0x29e/0x2f0
[ 20.680186] [<ffffffff810e2fb1>] __do_softirq+0x221/0x460
[ 20.680186] [<ffffffff8109a516>] ? kvm_clock_read+0x46/0x80
[ 20.680186] [<ffffffff8325de2c>] call_softirq+0x1c/0x30
[ 20.680186] [<ffffffff81069235>] do_softirq+0x75/0x120
[ 20.680186] [<ffffffff810e1fbb>] irq_exit+0x5b/0xf0
[ 20.680186] [<ffffffff8108e88a>] smp_apic_timer_interrupt+0x8a/0xa0
[ 20.680186] [<ffffffff8325d42f>] apic_timer_interrupt+0x6f/0x80
[ 20.680186] <EOI> [<ffffffff81151d7e>] ? lock_acquire+0x1be/0x1e0
[ 20.680186] [<ffffffff811914f0>] ? rcu_torture_reader+0x380/0x380
[ 20.680186] [<ffffffff81191523>] rcu_torture_read_lock+0x33/0x80
[ 20.680186] [<ffffffff811914f0>] ? rcu_torture_reader+0x380/0x380
[ 20.680186] [<ffffffff81191293>] rcu_torture_reader+0x123/0x380
[ 20.680186] [<ffffffff8118ff50>] ? T.841+0x50/0x50
[ 20.680186] [<ffffffff81191170>] ? rcu_torture_read_unlock+0x60/0x60
[ 20.680186] [<ffffffff811071c2>] kthread+0xb2/0xc0
[ 20.680186] [<ffffffff8325dd34>] kernel_thread_helper+0x4/0x10
[ 20.680186] [<ffffffff8325bfb4>] ? retint_restore_args+0x13/0x13
[ 20.680186] [<ffffffff81107110>] ? __init_kthread_worker+0x70/0x70
[ 20.680186] [<ffffffff8325dd30>] ? gs_change+0x13/0x13

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jack at suse

May 29, 2012, 5:34 AM

Post #7 of 14 (197 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Mon 28-05-12 17:49:45, Sasha Levin wrote:
> Hi Jan,
>
> On Thu, 2012-05-24 at 18:59 +0200, Jan Kara wrote:
> > Convert calculations of proportion of writeback each bdi does to new flexible
> > proportion code. That allows us to use aging period of fixed wallclock time
> > which gives better proportion estimates given the hugely varying throughput of
> > different devices.
> >
> > Signed-off-by: Jan Kara <jack [at] suse>
> > ---
>
> This patch appears to be causing lockdep warnings over here:
Actually, this is not caused directly by my patch. My patch just makes
the problem more likely because I use a smaller counter batch in
__fprop_inc_percpu_max() than is used in the original __prop_inc_percpu_max(),
so the probability that the percpu counter takes the spinlock (which is what
triggers the warning) is higher.

The only safe solution seems to be to create a variant of percpu counters
that can be used from an interrupt. Or do you have another idea, Peter?

Honza
>
> [ 20.545016] =================================
> [ 20.545016] [ INFO: inconsistent lock state ]
> [ 20.545016] 3.4.0-next-20120528-sasha-00008-g11ef39f #307 Tainted: G W
> [ 20.545016] ---------------------------------
> [ 20.545016] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
> [ 20.545016] rcu_torture_rea/2493 [HC0[0]:SC1[1]:HE1:SE0] takes:
> [ 20.545016] (key#3){?.-...}, at: [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
> [ 20.545016] {IN-HARDIRQ-W} state was registered at:
> [ 20.545016] [<ffffffff8114ffab>] mark_irqflags+0x6b/0x170
> [ 20.545016] [<ffffffff811519bb>] __lock_acquire+0x2bb/0x4c0
> [ 20.545016] [<ffffffff81151d4a>] lock_acquire+0x18a/0x1e0
> [ 20.545016] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
> [ 20.545016] [<ffffffff81993620>] __percpu_counter_add+0x50/0xb0
> [ 20.545016] [<ffffffff8195b53a>] __fprop_inc_percpu_max+0x8a/0xa0
> [ 20.545016] [<ffffffff811daf8d>] test_clear_page_writeback+0x12d/0x1c0
> [ 20.545016] [<ffffffff811ccc44>] end_page_writeback+0x24/0x50
> [ 20.545016] [<ffffffff8126ed2a>] end_buffer_async_write+0x26a/0x350
> [ 20.545016] [<ffffffff8126bfdd>] end_bio_bh_io_sync+0x3d/0x50
> [ 20.545016] [<ffffffff81270b59>] bio_endio+0x29/0x30
> [ 20.545016] [<ffffffff819330e9>] req_bio_endio+0xb9/0xd0
> [ 20.545016] [<ffffffff81936318>] blk_update_request+0x1a8/0x3c0
> [ 20.545016] [<ffffffff81936552>] blk_update_bidi_request+0x22/0x90
> [ 20.545016] [<ffffffff8193673c>] __blk_end_bidi_request+0x1c/0x40
> [ 20.545016] [<ffffffff81936788>] __blk_end_request_all+0x28/0x40
> [ 20.545016] [<ffffffff81e04f2e>] blk_done+0x9e/0xf0
> [ 20.545016] [<ffffffff81afb106>] vring_interrupt+0x86/0xa0
> [ 20.680186] [<ffffffff81187c01>] handle_irq_event_percpu+0x151/0x3e0
> [ 20.680186] [<ffffffff81187ed3>] handle_irq_event+0x43/0x70
> [ 20.680186] [<ffffffff8118b5a8>] handle_edge_irq+0xe8/0x120
> [ 20.680186] [<ffffffff81069444>] handle_irq+0x164/0x180
> [ 20.680186] [<ffffffff81068638>] do_IRQ+0x58/0xd0
> [ 20.680186] [<ffffffff8325beef>] ret_from_intr+0x0/0x1a
> [ 20.680186] [<ffffffff81937bed>] blk_queue_bio+0x30d/0x430
> [ 20.680186] [<ffffffff8193423e>] generic_make_request+0xbe/0x120
> [ 20.680186] [<ffffffff81934398>] submit_bio+0xf8/0x120
> [ 20.680186] [<ffffffff8126bf72>] submit_bh+0x122/0x150
> [ 20.680186] [<ffffffff8126ded7>] __block_write_full_page+0x287/0x3b0
> [ 20.680186] [<ffffffff8126f2cc>] block_write_full_page_endio+0xfc/0x120
> [ 20.680186] [<ffffffff8126f300>] block_write_full_page+0x10/0x20
> [ 20.680186] [<ffffffff81273d83>] blkdev_writepage+0x13/0x20
> [ 20.680186] [<ffffffff811d90c5>] __writepage+0x15/0x40
> [ 20.680186] [<ffffffff811db78f>] write_cache_pages+0x49f/0x650
> [ 20.680186] [<ffffffff811db98f>] generic_writepages+0x4f/0x70
> [ 20.680186] [<ffffffff811db9ce>] do_writepages+0x1e/0x50
> [ 20.680186] [<ffffffff811cd219>] __filemap_fdatawrite_range+0x49/0x50
> [ 20.680186] [<ffffffff811cd44a>] filemap_fdatawrite+0x1a/0x20
> [ 20.680186] [<ffffffff811cd475>] filemap_write_and_wait+0x25/0x50
> [ 20.680186] [<ffffffff812740bd>] __sync_blockdev+0x2d/0x40
> [ 20.680186] [<ffffffff812740de>] sync_blockdev+0xe/0x10
> [ 20.680186] [<ffffffff813917d2>] journal_recover+0x182/0x1c0
> [ 20.680186] [<ffffffff81396ae8>] journal_load+0x58/0xa0
> [ 20.680186] [<ffffffff8132b750>] ext3_load_journal+0x200/0x2b0
> [ 20.680186] [<ffffffff8132e2c8>] ext3_fill_super+0xc18/0x10d0
> [ 20.680186] [<ffffffff8123c636>] mount_bdev+0x176/0x210
> [ 20.680186] [<ffffffff81327e00>] ext3_mount+0x10/0x20
> [ 20.680186] [<ffffffff8123bf75>] mount_fs+0x85/0x1a0
> [ 20.680186] [<ffffffff812592a4>] vfs_kern_mount+0x74/0x100
> [ 20.680186] [<ffffffff8125b991>] do_kern_mount+0x51/0x120
> [ 20.680186] [<ffffffff8125bc34>] do_mount+0x1d4/0x240
> [ 20.680186] [<ffffffff8125bd3d>] sys_mount+0x9d/0xe0
> [ 20.680186] [<ffffffff84cb6232>] do_mount_root+0x1e/0x94
> [ 20.680186] [<ffffffff84cb64c2>] mount_block_root+0xe2/0x224
> [ 20.680186] [<ffffffff84cb672f>] mount_root+0x12b/0x136
> [ 20.680186] [<ffffffff84cb689f>] prepare_namespace+0x165/0x19e
> [ 20.680186] [<ffffffff84cb5afb>] kernel_init+0x274/0x28a
> [ 20.680186] [<ffffffff8325dd34>] kernel_thread_helper+0x4/0x10
> [ 20.680186] irq event stamp: 1551906
> [ 20.680186] hardirqs last enabled at (1551906): [<ffffffff8325b7db>] _raw_spin_unlock_irq+0x2b/0x80
> [ 20.680186] hardirqs last disabled at (1551905): [<ffffffff8325aea4>] _raw_spin_lock_irq+0x34/0xa0
> [ 20.680186] softirqs last enabled at (1551022): [<ffffffff810e316b>] __do_softirq+0x3db/0x460
> [ 20.680186] softirqs last disabled at (1551903): [<ffffffff8325de2c>] call_softirq+0x1c/0x30
> [ 20.680186]
> [ 20.680186] other info that might help us debug this:
> [ 20.680186] Possible unsafe locking scenario:
> [ 20.680186]
> [ 20.680186] CPU0
> [ 20.680186] ----
> [ 20.680186] lock(key#3);
> [ 20.680186] <Interrupt>
> [ 20.680186] lock(key#3);
> [ 20.680186]
> [ 20.680186] *** DEADLOCK ***
> [ 20.680186]
> [ 20.680186] 2 locks held by rcu_torture_rea/2493:
> [ 20.680186] #0: (rcu_read_lock){.+.+..}, at: [<ffffffff811914f0>] rcu_torture_read_lock+0x0/0x80
> [ 20.680186] #1: (mm/page-writeback.c:144){+.-...}, at: [<ffffffff810ebf90>] call_timer_fn+0x0/0x260
> [ 20.680186]
> [ 20.680186] stack backtrace:
> [ 20.680186] Pid: 2493, comm: rcu_torture_rea Tainted: G W 3.4.0-next-20120528-sasha-00008-g11ef39f #307
> [ 20.680186] Call Trace:
> [ 20.680186] <IRQ> [<ffffffff8114f6b9>] print_usage_bug+0x1a9/0x1d0
> [ 20.680186] [<ffffffff8114eed0>] ? check_usage_forwards+0xf0/0xf0
> [ 20.680186] [<ffffffff8114fb99>] mark_lock_irq+0xc9/0x270
> [ 20.680186] [<ffffffff8114fe5d>] mark_lock+0x11d/0x200
> [ 20.680186] [<ffffffff81150030>] mark_irqflags+0xf0/0x170
> [ 20.680186] [<ffffffff811519bb>] __lock_acquire+0x2bb/0x4c0
> [ 20.680186] [<ffffffff81151d4a>] lock_acquire+0x18a/0x1e0
> [ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
> [ 20.680186] [<ffffffff811d9260>] ? laptop_io_completion+0x30/0x30
> [ 20.680186] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
> [ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
> [ 20.680186] [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
> [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
> [ 20.680186] [<ffffffff8195b5c2>] fprop_new_period+0x12/0x60
> [ 20.680186] [<ffffffff811d929d>] writeout_period+0x3d/0xa0
> [ 20.680186] [<ffffffff810ec0bf>] call_timer_fn+0x12f/0x260
> [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
> [ 20.680186] [<ffffffff8325b7db>] ? _raw_spin_unlock_irq+0x2b/0x80
> [ 20.680186] [<ffffffff811d9260>] ? laptop_io_completion+0x30/0x30
> [ 20.680186] [<ffffffff810ecd6e>] run_timer_softirq+0x29e/0x2f0
> [ 20.680186] [<ffffffff810e2fb1>] __do_softirq+0x221/0x460
> [ 20.680186] [<ffffffff8109a516>] ? kvm_clock_read+0x46/0x80
> [ 20.680186] [<ffffffff8325de2c>] call_softirq+0x1c/0x30
> [ 20.680186] [<ffffffff81069235>] do_softirq+0x75/0x120
> [ 20.680186] [<ffffffff810e1fbb>] irq_exit+0x5b/0xf0
> [ 20.680186] [<ffffffff8108e88a>] smp_apic_timer_interrupt+0x8a/0xa0
> [ 20.680186] [<ffffffff8325d42f>] apic_timer_interrupt+0x6f/0x80
> [ 20.680186] <EOI> [<ffffffff81151d7e>] ? lock_acquire+0x1be/0x1e0
> [ 20.680186] [<ffffffff811914f0>] ? rcu_torture_reader+0x380/0x380
> [ 20.680186] [<ffffffff81191523>] rcu_torture_read_lock+0x33/0x80
> [ 20.680186] [<ffffffff811914f0>] ? rcu_torture_reader+0x380/0x380
> [ 20.680186] [<ffffffff81191293>] rcu_torture_reader+0x123/0x380
> [ 20.680186] [<ffffffff8118ff50>] ? T.841+0x50/0x50
> [ 20.680186] [<ffffffff81191170>] ? rcu_torture_read_unlock+0x60/0x60
> [ 20.680186] [<ffffffff811071c2>] kthread+0xb2/0xc0
> [ 20.680186] [<ffffffff8325dd34>] kernel_thread_helper+0x4/0x10
> [ 20.680186] [<ffffffff8325bfb4>] ? retint_restore_args+0x13/0x13
> [ 20.680186] [<ffffffff81107110>] ? __init_kthread_worker+0x70/0x70
> [ 20.680186] [<ffffffff8325dd30>] ? gs_change+0x13/0x13
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

May 29, 2012, 5:38 AM

Post #8 of 14 (199 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Tue, 2012-05-29 at 14:34 +0200, Jan Kara wrote:

> The only safe solution seems to be to create a variant of percpu counters
> that can be used from an interrupt. Or do you have other idea Peter?

> > [ 20.680186] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
> > [ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
> > [ 20.680186] [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
> > [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
> > [ 20.680186] [<ffffffff8195b5c2>] fprop_new_period+0x12/0x60
> > [ 20.680186] [<ffffffff811d929d>] writeout_period+0x3d/0xa0
> > [ 20.680186] [<ffffffff810ec0bf>] call_timer_fn+0x12f/0x260
> > [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20

Yeah, just make sure IRQs are disabled around doing that ;-)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jack at suse

May 29, 2012, 5:54 AM

Post #9 of 14 (195 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Tue 29-05-12 14:38:31, Peter Zijlstra wrote:
> On Tue, 2012-05-29 at 14:34 +0200, Jan Kara wrote:
>
> > The only safe solution seems to be to create a variant of percpu counters
> > that can be used from an interrupt. Or do you have other idea Peter?
>
> > > [ 20.680186] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
> > > [ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
> > > [ 20.680186] [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
> > > [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
> > > [ 20.680186] [<ffffffff8195b5c2>] fprop_new_period+0x12/0x60
> > > [ 20.680186] [<ffffffff811d929d>] writeout_period+0x3d/0xa0
> > > [ 20.680186] [<ffffffff810ec0bf>] call_timer_fn+0x12f/0x260
> > > [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
>
> Yeah, just make sure IRQs are disabled around doing that ;-)
Evil ;) But we'd need to have IRQs disabled also in each
fprop_fraction_percpu() call, and generally, if we want things clean, we'd
need to disable them in all entry points to proportion code (or at least
around all percpu calls)...

Honza
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jack at suse

May 31, 2012, 3:11 PM

Post #10 of 14 (192 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Tue 29-05-12 14:54:52, Jan Kara wrote:
> On Tue 29-05-12 14:38:31, Peter Zijlstra wrote:
> > On Tue, 2012-05-29 at 14:34 +0200, Jan Kara wrote:
> >
> > > The only safe solution seems to be to create a variant of percpu counters
> > > that can be used from an interrupt. Or do you have other idea Peter?
> >
> > > > [ 20.680186] [<ffffffff8325ac9b>] _raw_spin_lock+0x3b/0x70
> > > > [ 20.680186] [<ffffffff81993527>] ? __percpu_counter_sum+0x17/0xc0
> > > > [ 20.680186] [<ffffffff81993527>] __percpu_counter_sum+0x17/0xc0
> > > > [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
> > > > [ 20.680186] [<ffffffff8195b5c2>] fprop_new_period+0x12/0x60
> > > > [ 20.680186] [<ffffffff811d929d>] writeout_period+0x3d/0xa0
> > > > [ 20.680186] [<ffffffff810ec0bf>] call_timer_fn+0x12f/0x260
> > > > [ 20.680186] [<ffffffff810ebf90>] ? init_timer_deferrable_key+0x20/0x20
> >
> > Yeah, just make sure IRQs are disabled around doing that ;-)
> Evil ;) But we'd need to have IRQs disabled also in each
> fprop_fraction_percpu() call, and generally, if we want things clean, we'd
> need to disable them in all entry points to proportion code (or at least
> around all percpu calls)...
OK, after some thought I realized I was wrong: fixing fprop_new_period() is
enough. The attached patch should fix the warning (and the possible deadlock).
Fengguang, should I resend you a fixed patch implementing flexible proportions,
or do you prefer an incremental patch against your tree?

Honza
--
Jan Kara <jack [at] suse>
SUSE Labs, CR
Attachments: flex-proportion-irq-save.diff (1.25 KB)


peterz at infradead

May 31, 2012, 3:26 PM

Post #11 of 14 (194 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Fri, 2012-06-01 at 00:11 +0200, Jan Kara wrote:
> bool fprop_new_period(struct fprop_global *p, int periods)
> {
> - u64 events = percpu_counter_sum(&p->events);
> + u64 events;
> + unsigned long flags;
>
> + local_irq_save(flags);
> + events = percpu_counter_sum(&p->events);
> + local_irq_restore(flags);
> /*
> * Don't do anything if there are no events.
> */
> @@ -73,7 +77,9 @@ bool fprop_new_period(struct fprop_global *p, int periods)
> if (periods < 64)
> events -= events >> periods;
> /* Use addition to avoid losing events happening between sum and set */
> + local_irq_save(flags);
> percpu_counter_add(&p->events, -events);
> + local_irq_restore(flags);
> p->period += periods;
> write_seqcount_end(&p->sequence);

Uhm, why bother enabling it in between? Just wrap the whole function in
a single IRQ disable.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jack at suse

May 31, 2012, 3:42 PM

Post #12 of 14 (191 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Fri 01-06-12 00:26:05, Peter Zijlstra wrote:
> On Fri, 2012-06-01 at 00:11 +0200, Jan Kara wrote:
> > bool fprop_new_period(struct fprop_global *p, int periods)
> > {
> > - u64 events = percpu_counter_sum(&p->events);
> > + u64 events;
> > + unsigned long flags;
> >
> > + local_irq_save(flags);
> > + events = percpu_counter_sum(&p->events);
> > + local_irq_restore(flags);
> > /*
> > * Don't do anything if there are no events.
> > */
> > @@ -73,7 +77,9 @@ bool fprop_new_period(struct fprop_global *p, int periods)
> > if (periods < 64)
> > events -= events >> periods;
> > /* Use addition to avoid losing events happening between sum and set */
> > + local_irq_save(flags);
> > percpu_counter_add(&p->events, -events);
> > + local_irq_restore(flags);
> > p->period += periods;
> > write_seqcount_end(&p->sequence);
>
> Uhm, why bother enabling it in between? Just wrap the whole function in
> a single IRQ disable.
I wanted to have interrupts disabled for as short a time as possible, but if
you think it doesn't matter, I'll take your advice. The result is attached.

Honza
--
Jan Kara <jack [at] suse>
SUSE Labs, CR
Attachments: flex-proportion-irq-save.diff (1.14 KB)


fengguang.wu at intel

May 31, 2012, 8:10 PM

Post #13 of 14 (179 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Fri, Jun 01, 2012 at 12:42:06AM +0200, Jan Kara wrote:
> On Fri 01-06-12 00:26:05, Peter Zijlstra wrote:
> > On Fri, 2012-06-01 at 00:11 +0200, Jan Kara wrote:
> > > bool fprop_new_period(struct fprop_global *p, int periods)
> > > {
> > > - u64 events = percpu_counter_sum(&p->events);
> > > + u64 events;
> > > + unsigned long flags;
> > >
> > > + local_irq_save(flags);
> > > + events = percpu_counter_sum(&p->events);
> > > + local_irq_restore(flags);
> > > /*
> > > * Don't do anything if there are no events.
> > > */
> > > @@ -73,7 +77,9 @@ bool fprop_new_period(struct fprop_global *p, int periods)
> > > if (periods < 64)
> > > events -= events >> periods;
> > > /* Use addition to avoid losing events happening between sum and set */
> > > + local_irq_save(flags);
> > > percpu_counter_add(&p->events, -events);
> > > + local_irq_restore(flags);
> > > p->period += periods;
> > > write_seqcount_end(&p->sequence);
> >
> > Uhm, why bother enabling it in between? Just wrap the whole function in
> > a single IRQ disable.
> I wanted to have interrupts disabled for as short as possible but if you
> think it doesn't matter, I'll take your advice. The result is attached.

Thank you! I applied this incremental fix next to the commit
"lib: Proportions with flexible period".

Thanks,
Fengguang

> From: Jan Kara <jack [at] suse>
> Subject: lib: Fix possible deadlock in flexible proportion code
>
> When percpu counter function in fprop_new_period() is interrupted by an
> interrupt while holding counter lock, it can cause deadlock when the
> interrupt wants to take the lock as well. Fix the problem by disabling
> interrupts when calling percpu counter functions.
>
> Signed-off-by: Jan Kara <jack [at] suse>
>
> diff -u b/lib/flex_proportions.c b/lib/flex_proportions.c
> --- b/lib/flex_proportions.c
> +++ b/lib/flex_proportions.c
> @@ -62,13 +62,18 @@
> */
> bool fprop_new_period(struct fprop_global *p, int periods)
> {
> - u64 events = percpu_counter_sum(&p->events);
> + u64 events;
> + unsigned long flags;
>
> + local_irq_save(flags);
> + events = percpu_counter_sum(&p->events);
> /*
> * Don't do anything if there are no events.
> */
> - if (events <= 1)
> + if (events <= 1) {
> + local_irq_restore(flags);
> return false;
> + }
> write_seqcount_begin(&p->sequence);
> if (periods < 64)
> events -= events >> periods;
> @@ -76,6 +81,7 @@
> percpu_counter_add(&p->events, -events);
> p->period += periods;
> write_seqcount_end(&p->sequence);
> + local_irq_restore(flags);
>
> return true;
> }

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


peterz at infradead

Jun 1, 2012, 3:13 AM

Post #14 of 14 (181 views)
Permalink
Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions [In reply to]

On Fri, 2012-06-01 at 00:42 +0200, Jan Kara wrote:
> On Fri 01-06-12 00:26:05, Peter Zijlstra wrote:
> > On Fri, 2012-06-01 at 00:11 +0200, Jan Kara wrote:
> > > bool fprop_new_period(struct fprop_global *p, int periods)
> > > {
> > > - u64 events = percpu_counter_sum(&p->events);
> > > + u64 events;
> > > + unsigned long flags;
> > >
> > > + local_irq_save(flags);
> > > + events = percpu_counter_sum(&p->events);
> > > + local_irq_restore(flags);
> > > /*
> > > * Don't do anything if there are no events.
> > > */
> > > @@ -73,7 +77,9 @@ bool fprop_new_period(struct fprop_global *p, int periods)
> > > if (periods < 64)
> > > events -= events >> periods;
> > > /* Use addition to avoid losing events happening between sum and set */
> > > + local_irq_save(flags);
> > > percpu_counter_add(&p->events, -events);
> > > + local_irq_restore(flags);
> > > p->period += periods;
> > > write_seqcount_end(&p->sequence);
> >
> > Uhm, why bother enabling it in between? Just wrap the whole function in
> > a single IRQ disable.
> I wanted to have interrupts disabled for as short as possible but if you
> think it doesn't matter, I'll take your advice. The result is attached.

Thing is, disabling interrupts is quite expensive and the extra few
instructions covered aren't much.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.