Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead

 

 

Linux kernel RSS feed   Index | Next | Previous | View Threaded


cl at linux-foundation

Nov 5, 2009, 11:20 AM

Post #1 of 13 (316 views)
Permalink
Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead

From: Christoph Lameter <cl [at] linux-foundation>
Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead

Instead of a rw semaphore, use a mutex and a per cpu counter for the number
of current readers. Read locking then becomes very cheap, requiring only
the increment of a per cpu counter.

Write locking is more expensive since the writer must scan the percpu array
and wait until all readers are complete. Since the readers are not holding
semaphores, we have no wait queue from which the writer could be woken up. In
this draft we simply wait for one millisecond between scans of the percpu
array. A different solution must be found there.

Patch is on top of -next and the percpu counter patches that I posted
yesterday. The patch adds another per cpu counter to the file and anon rss
counters.

Signed-off-by: Christoph Lameter <cl [at] linux-foundation>

---
include/linux/mm_types.h | 68 ++++++++++++++++++++++++++++++++++++++---------
mm/init-mm.c | 2 -
2 files changed, 56 insertions(+), 14 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2009-11-05 13:03:11.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h 2009-11-05 13:06:31.000000000 -0600
@@ -14,6 +14,7 @@
#include <linux/page-debug-flags.h>
#include <asm/page.h>
#include <asm/mmu.h>
+#include <linux/percpu.h>

#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
@@ -27,6 +28,7 @@ struct address_space;
struct mm_counter {
long file;
long anon;
+ long readers;
};

/*
@@ -214,7 +216,7 @@ struct mm_struct {
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
int map_count; /* number of VMAs */
- struct rw_semaphore sem;
+ struct mutex lock;
spinlock_t page_table_lock; /* Protects page tables and some counters */

struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
@@ -285,64 +287,104 @@ struct mm_struct {
#endif
};

+static inline int mm_readers(struct mm_struct *mm)
+{
+ int cpu;
+ int readers = 0;
+
+ for_each_possible_cpu(cpu)
+ readers += per_cpu(mm->rss->readers, cpu);
+
+ return readers;
+}
+
static inline void mm_reader_lock(struct mm_struct *mm)
{
- down_read(&mm->sem);
+redo:
+ this_cpu_inc(mm->rss->readers);
+ if (mutex_is_locked(&mm->lock)) {
+ this_cpu_dec(mm->rss->readers);
+ /* Need to wait till mutex is released */
+ mutex_lock(&mm->lock);
+ mutex_unlock(&mm->lock);
+ goto redo;
+ }
}

static inline void mm_reader_unlock(struct mm_struct *mm)
{
- up_read(&mm->sem);
+ this_cpu_dec(mm->rss->readers);
}

static inline int mm_reader_trylock(struct mm_struct *mm)
{
- return down_read_trylock(&mm->sem);
+ this_cpu_inc(mm->rss->readers);
+ if (mutex_is_locked(&mm->lock)) {
+ this_cpu_dec(mm->rss->readers);
+ return 0;
+ }
+ return 1;
}

static inline void mm_writer_lock(struct mm_struct *mm)
{
- down_write(&mm->sem);
+redo:
+ mutex_lock(&mm->lock);
+ if (mm_readers(mm) == 0)
+ return;
+
+ mutex_unlock(&mm->lock);
+ msleep(1);
+ goto redo;
}

static inline void mm_writer_unlock(struct mm_struct *mm)
{
- up_write(&mm->sem);
+ mutex_unlock(&mm->lock);
}

static inline int mm_writer_trylock(struct mm_struct *mm)
{
- return down_write_trylock(&mm->sem);
+ if (!mutex_trylock(&mm->lock))
+ goto fail;
+
+ if (mm_readers(mm) == 0)
+ return 1;
+
+ mutex_unlock(&mm->lock);
+fail:
+ return 0;
}

static inline int mm_locked(struct mm_struct *mm)
{
- return rwsem_is_locked(&mm->sem);
+ return mutex_is_locked(&mm->lock) || mm_readers(mm);
}

static inline void mm_writer_to_reader_lock(struct mm_struct *mm)
{
- downgrade_write(&mm->sem);
+ this_cpu_inc(mm->rss->readers);
+ mutex_unlock(&mm->lock);
}

static inline void mm_writer_lock_nested(struct mm_struct *mm, int x)
{
- down_write_nested(&mm->sem, x);
+ mutex_lock_nested(&mm->lock, x);
}

static inline void mm_lock_init(struct mm_struct *mm)
{
- init_rwsem(&mm->sem);
+ mutex_init(&mm->lock);
}

static inline void mm_lock_prefetch(struct mm_struct *mm)
{
- prefetchw(&mm->sem);
+ prefetchw(&mm->lock);
}

static inline void mm_nest_lock(spinlock_t *s, struct mm_struct *mm)
{
- spin_lock_nest_lock(s, &mm->sem);
+ spin_lock_nest_lock(s, &mm->lock);
}

/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c 2009-11-05 13:02:54.000000000 -0600
+++ linux-2.6/mm/init-mm.c 2009-11-05 13:03:22.000000000 -0600
@@ -15,7 +15,7 @@ struct mm_struct init_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
- .sem = __RWSEM_INITIALIZER(init_mm.sem),
+ .lock = __MUTEX_INITIALIZER(init_mm.lock),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andi at firstfloor

Nov 5, 2009, 12:56 PM

Post #2 of 13 (284 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

Christoph Lameter <cl [at] linux-foundation> writes:

> Instead of a rw semaphore use a mutex and a per cpu counter for the number
> of the current readers. read locking then becomes very cheap requiring only
> the increment of a per cpu counter.
>
> Write locking is more expensive since the writer must scan the percpu array
> and wait until all readers are complete. Since the readers are not holding
> semaphores we have no wait queue from which the writer could wakeup. In this
> draft we simply wait for one millisecond between scans of the percpu
> array. A different solution must be found there.

I'm not sure making all writers more expensive is really a good idea.

For example it will definitely impact the AIM7 multi brk() issue
or the mysql allocation case, which are all writer intensive. I assume
doing a lot of mmaps/brks in parallel is not that uncommon.

My thinking was more that we simply need per VMA locking or
some other per larger address range locking. Unfortunately that
needs changes in a lot of users that mess with the VMA lists
(perhaps really needs some better abstractions for VMA list management
first)

That said also addressing the convoying issues in the current
semaphores would be a good idea, which is what your patch does.

-Andi

--
ak [at] linux -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 5, 2009, 1:03 PM

Post #3 of 13 (286 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Thu, 5 Nov 2009, Andi Kleen wrote:

> I'm not sure making all writers more expensive is really a good idea.

The scaling problems that I have seen (like simple concurrent page faults)
are due to lock contention on mmap_sem and due to counter updates in
mm_struct.

> For example it will definitely impact the AIM7 multi brk() issue
> or the mysql allocation case, which are all writer intensive. I assume
> doing a lot of mmaps/brks in parallel is not that uncommon.

No, it's not that common. Page faults are much more common. The AIM7 seems
to be an artificial case? What does mysql do for allocation? If it's brk()
related, then simply going to larger increases may fix the issue?

> My thinking was more that we simply need per VMA locking or
> some other per larger address range locking. Unfortunately that
> needs changes in a lot of users that mess with the VMA lists
> (perhaps really needs some better abstractions for VMA list management
> first)

We have range locking through the distribution of the ptl for systems with
more than 4 processors. One can use that today to lock ranges of the
address space.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andi at firstfloor

Nov 5, 2009, 11:39 PM

Post #4 of 13 (280 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Thu, Nov 05, 2009 at 04:03:39PM -0500, Christoph Lameter wrote:
> > For example it will definitely impact the AIM7 multi brk() issue
> > or the mysql allocation case, which are all writer intensive. I assume
> > doing a lot of mmaps/brks in parallel is not that uncommon.
>
> No its not that common. Page faults are much more common. The AIM7 seems
> to be an artificial case? What does mysql do for allocation? If its brk()

AIM7 is artificial yes, but I suspect similar problems (to a less
extreme degree) are in other workloads.

> related then simply going to larger increases may fix the issue??

For mysql it's mmap through malloc(). There has been some tuning in
glibc for it. But I suspect it's a more general problem that will
still need kernel improvements.

>
> > My thinking was more that we simply need per VMA locking or
> > some other per larger address range locking. Unfortunately that
> > needs changes in a lot of users that mess with the VMA lists
> > (perhaps really needs some better abstractions for VMA list management
> > first)
>
> We have range locking through the distribution of the ptl for systems with
> more than 4 processors. One can use that today to lock ranges of the
> address space.

Yes but all the major calls still take mmap_sem, which is not ranged.

-Andi
--
ak [at] linux -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


minchan.kim at gmail

Nov 6, 2009, 7:41 AM

Post #5 of 13 (279 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

Hi, Christoph.

How about changing from 'mm_readers' to 'is_readers' to further your
goal of 'scalability'?
===
static inline int is_readers(struct mm_struct *mm)
{
int cpu;
int ret = 0;

for_each_possible_cpu(cpu) {
if (per_cpu(mm->rss->readers, cpu)) {
ret = 1;
break;
}
}

return ret;
}
===


On Fri, Nov 6, 2009 at 4:20 AM, Christoph Lameter
<cl [at] linux-foundation> wrote:
> From: Christoph Lamter <cl [at] linux-foundation>
> Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead
>
> Instead of a rw semaphore use a mutex and a per cpu counter for the number
> of the current readers. read locking then becomes very cheap requiring only
> the increment of a per cpu counter.
>
> Write locking is more expensive since the writer must scan the percpu array
> and wait until all readers are complete. Since the readers are not holding
> semaphores we have no wait queue from which the writer could wakeup. In this
> draft we simply wait for one millisecond between scans of the percpu
> array. A different solution must be found there.
>
> Patch is on top of -next and the percpu counter patches that I posted
> yesterday. The patch adds another per cpu counter to the file and anon rss
> counters.
>
> Signed-off-by: Christoph Lamter <cl [at] linux-foundation>
>
> ---
>  include/linux/mm_types.h |   68 ++++++++++++++++++++++++++++++++++++++---------
>  mm/init-mm.c             |    2 -
>  2 files changed, 56 insertions(+), 14 deletions(-)
>
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h     2009-11-05 13:03:11.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h  2009-11-05 13:06:31.000000000 -0600
> @@ -14,6 +14,7 @@
>  #include <linux/page-debug-flags.h>
>  #include <asm/page.h>
>  #include <asm/mmu.h>
> +#include <linux/percpu.h>
>
>  #ifndef AT_VECTOR_SIZE_ARCH
>  #define AT_VECTOR_SIZE_ARCH 0
> @@ -27,6 +28,7 @@ struct address_space;
>  struct mm_counter {
>        long file;
>        long anon;
> +       long readers;
>  };
>
>  /*
> @@ -214,7 +216,7 @@ struct mm_struct {
>        atomic_t mm_users;                      /* How many users with user space? */
>        atomic_t mm_count;                      /* How many references to "struct mm_struct" (users count as 1) */
>        int map_count;                          /* number of VMAs */
> -       struct rw_semaphore sem;
> +       struct mutex lock;
>        spinlock_t page_table_lock;             /* Protects page tables and some counters */
>
>        struct list_head mmlist;                /* List of maybe swapped mm's.  These are globally strung
> @@ -285,64 +287,104 @@ struct mm_struct {
>  #endif
>  };
>
> +static inline int mm_readers(struct mm_struct *mm)
> +{
> +       int cpu;
> +       int readers = 0;
> +
> +       for_each_possible_cpu(cpu)
> +               readers += per_cpu(mm->rss->readers, cpu);
> +
> +       return readers;
> +}
> +
>  static inline void mm_reader_lock(struct mm_struct *mm)
>  {
> -       down_read(&mm->sem);
> +redo:
> +       this_cpu_inc(mm->rss->readers);
> +       if (mutex_is_locked(&mm->lock)) {
> +               this_cpu_dec(mm->rss->readers);
> +               /* Need to wait till mutex is released */
> +               mutex_lock(&mm->lock);
> +               mutex_unlock(&mm->lock);
> +               goto redo;
> +       }
>  }
>
>  static inline void mm_reader_unlock(struct mm_struct *mm)
>  {
> -       up_read(&mm->sem);
> +       this_cpu_dec(mm->rss->readers);
>  }
>
>  static inline int mm_reader_trylock(struct mm_struct *mm)
>  {
> -       return down_read_trylock(&mm->sem);
> +       this_cpu_inc(mm->rss->readers);
> +       if (mutex_is_locked(&mm->lock)) {
> +               this_cpu_dec(mm->rss->readers);
> +               return 0;
> +       }
> +       return 1;
>  }
>
>  static inline void mm_writer_lock(struct mm_struct *mm)
>  {
> -       down_write(&mm->sem);
> +redo:
> +       mutex_lock(&mm->lock);
> +       if (mm_readers(mm) == 0)

We can change this.

if (!is_readers(mm))
return;

> +               return;
> +
> +       mutex_unlock(&mm->lock);
> +       msleep(1);
> +       goto redo;
>  }
>
>  static inline void mm_writer_unlock(struct mm_struct *mm)
>  {
> -       up_write(&mm->sem);
> +       mutex_unlock(&mm->lock);
>  }
>
>  static inline int mm_writer_trylock(struct mm_struct *mm)
>  {
> -       return down_write_trylock(&mm->sem);
> +       if (!mutex_trylock(&mm->lock))
> +               goto fail;
> +
> +       if (mm_readers(mm) == 0)
> +               return 1;

if (!is_readers(mm))
return 1;

> +
> +       mutex_unlock(&mm->lock);
> +fail:
> +       return 0;
>  }
>

--
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 6, 2009, 9:08 AM

Post #6 of 13 (278 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Fri, 6 Nov 2009, Andi Kleen wrote:

> Yes but all the major calls still take mmap_sem, which is not ranged.

But exactly that issue is addressed by this patch!

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 6, 2009, 9:10 AM

Post #7 of 13 (278 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Sat, 7 Nov 2009, Minchan Kim wrote:

> How about change from 'mm_readers' to 'is_readers' to improve your
> goal 'scalibility'?

Good idea. Thanks. Next rev will use your suggestion.

Any creative thoughts on what to do about the 1 millisecond wait period?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andi at firstfloor

Nov 6, 2009, 9:44 AM

Post #8 of 13 (277 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Fri, Nov 06, 2009 at 12:08:54PM -0500, Christoph Lameter wrote:
> On Fri, 6 Nov 2009, Andi Kleen wrote:
>
> > Yes but all the major calls still take mmap_sem, which is not ranged.
>
> But exactly that issue is addressed by this patch!

Major calls = mmap, brk, etc.

Only for page faults, not for anything that takes it for write.

Anyways the better reader lock is a step in the right direction, but
I have my doubts it's a good idea to make write really slow here.

-Andi
--
ak [at] linux -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 6, 2009, 9:54 AM

Post #9 of 13 (278 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Fri, 6 Nov 2009, Andi Kleen wrote:

> On Fri, Nov 06, 2009 at 12:08:54PM -0500, Christoph Lameter wrote:
> > On Fri, 6 Nov 2009, Andi Kleen wrote:
> >
> > > Yes but all the major calls still take mmap_sem, which is not ranged.
> >
> > But exactly that issue is addressed by this patch!
>
> Major calls = mmap, brk, etc.

Those are rare. Faults, get_user_pages and similar operations are far
more frequent.

brk depends on process wide settings and has to be
serialized using a processor wide locks.

mmap and other address space local modification may be able to avoid
taking mmap write lock by taking the read lock and then locking the
ptls in the page struct relevant to the address space being modified.

This is also enabled by this patchset.

> Only for page faults, not for anything that takes it for write.
>
> Anyways the better reader lock is a step in the right direction, but
> I have my doubts it's a good idea to make write really slow here.

The bigger the system the larger the problems with mmap. This is one key
scaling issue important for the VM. We can work on that. I have a patch
here that restricts the per cpu checks to only those cpus on which the
process has at some time run before.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


minchan.kim at gmail

Nov 6, 2009, 8:19 PM

Post #10 of 13 (271 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Sat, Nov 7, 2009 at 2:10 AM, Christoph Lameter
<cl [at] linux-foundation> wrote:
> On Sat, 7 Nov 2009, Minchan Kim wrote:
>
>> How about change from 'mm_readers' to 'is_readers' to improve your
>> goal 'scalibility'?
>
> Good idea. Thanks. Next rev will use your suggestion.
>
> Any creative thoughts on what to do about the 1 millisecond wait period?
>

Hmm,
it would be more important to prevent the livelock where readers hold the
lock continuously before a writer can take it than to address the 1 msec
write overhead.
First we should solve that; the second step is to optimize the write
overhead, I think.

--
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


kosaki.motohiro at jp

Nov 9, 2009, 10:21 PM

Post #11 of 13 (243 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

> On Fri, 6 Nov 2009, Andi Kleen wrote:
>
> > On Fri, Nov 06, 2009 at 12:08:54PM -0500, Christoph Lameter wrote:
> > > On Fri, 6 Nov 2009, Andi Kleen wrote:
> > >
> > > > Yes but all the major calls still take mmap_sem, which is not ranged.
> > >
> > > But exactly that issue is addressed by this patch!
> >
> > Major calls = mmap, brk, etc.
>
> Those are rare. More frequently are for faults, get_user_pages and
> the like operations that are frequent.
>
> brk depends on process wide settings and has to be
> serialized using a processor wide locks.
>
> mmap and other address space local modification may be able to avoid
> taking mmap write lock by taking the read lock and then locking the
> ptls in the page struct relevant to the address space being modified.
>
> This is also enabled by this patchset.

Andi, why do you ignore fork? fork() holds the mmap_sem write-side lock and
it is one of the critical paths.
Ah yes, I know HPC workloads don't call fork() so frequently; I mean the
typical desktop and small server case.

I half agree with Christoph: if the issue is only in mmap, it isn't
so important.

Perhaps I haven't caught your point.


Plus, the most critical mmap_sem issue is not the locking cost itself. In a stress workload,
the process grabbing mmap_sem frequently sleeps, and the fair rw-semaphore logic
frequently prevents reader-side locking.
At least, this improvement doesn't help Google-like workloads.

Thanks.


> > Only for page faults, not for anything that takes it for write.
> >
> > Anyways the better reader lock is a step in the right direction, but
> > I have my doubts it's a good idea to make write really slow here.
>
> The bigger the system the larger the problems with mmap. This is one key
> scaling issue important for the VM. We can work on that. I have a patch
> here that restricts the per cpu checks to only those cpus on which the
> process has at some times run before.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andi at firstfloor

Nov 10, 2009, 1:19 AM

Post #12 of 13 (243 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Tue, Nov 10, 2009 at 03:21:11PM +0900, KOSAKI Motohiro wrote:
> > On Fri, 6 Nov 2009, Andi Kleen wrote:
> >
> > > On Fri, Nov 06, 2009 at 12:08:54PM -0500, Christoph Lameter wrote:
> > > > On Fri, 6 Nov 2009, Andi Kleen wrote:
> > > >
> > > > > Yes but all the major calls still take mmap_sem, which is not ranged.
> > > >
> > > > But exactly that issue is addressed by this patch!
> > >
> > > Major calls = mmap, brk, etc.
> >
> > Those are rare. More frequently are for faults, get_user_pages and
> > the like operations that are frequent.
> >
> > brk depends on process wide settings and has to be
> > serialized using a processor wide locks.
> >
> > mmap and other address space local modification may be able to avoid
> > taking mmap write lock by taking the read lock and then locking the
> > ptls in the page struct relevant to the address space being modified.
> >
> > This is also enabled by this patchset.
>
> Andi, Why do you ignore fork? fork() hold mmap_sem write-side lock and
> it is one of critical path.

I have not seen profile logs where fork was critical. But that's not saying
that it can't be. But fork is so intrusive that locking it fine grained
is probably very hard.

> Plus, most critical mmap_sem issue is not locking cost itself. In stree workload,
> the procss grabbing mmap_sem frequently sleep. and fair rw-semaphoe logic
> frequently prevent reader side locking.
> At least, this improvement doesn't help google like workload.

Not helping is not too bad, the problem I had was just that it makes
writers even slower.

-Andi
--
ak [at] linux -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 10, 2009, 12:20 PM

Post #13 of 13 (240 views)
Permalink
Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead [In reply to]

On Sat, 7 Nov 2009, Minchan Kim wrote:

> On Sat, Nov 7, 2009 at 2:10 AM, Christoph Lameter
> <cl [at] linux-foundation> wrote:
> > On Sat, 7 Nov 2009, Minchan Kim wrote:
> >
> >> How about change from 'mm_readers' to 'is_readers' to improve your
> >> goal 'scalibility'?
> >
> > Good idea. Thanks. Next rev will use your suggestion.
> >
> > Any creative thoughts on what to do about the 1 millisecond wait period?
> >
>
> Hmm,
> it would be importatn to prevent livelock for reader to hold lock
> continuously before
> hodling writer than 1 msec write ovhead.

Livelock because there are too frequent readers?

We could just keep the mutex locked to ensure that no new readers arrive.

> First of all, After we solve it, second step is that optimize write
> overhead, I think.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.