Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen

 

 

Linux kernel RSS feed   Index | Next | Previous | View Threaded


andrea at qumranet

Apr 8, 2008, 8:44 AM

Post #1 of 17 (756 views)
Permalink
[PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen

# HG changeset patch
# User Andrea Arcangeli <andrea[at]qumranet.com>
# Date 1207666462 -7200
# Node ID ec6d8f91b299cf26cce5c3d49bb25d35ee33c137
# Parent d4c25404de6376297ed34fada14cd6b894410eb0
Lock the entire mm to prevent any mmu related operation to happen.

Signed-off-by: Andrea Arcangeli <andrea[at]qumranet.com>

diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1050,6 +1050,15 @@
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);

+struct mm_lock_data {
+ spinlock_t **i_mmap_locks;
+ spinlock_t **anon_vma_locks;
+ unsigned long nr_i_mmap_locks;
+ unsigned long nr_anon_vma_locks;
+};
+extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
+extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
+
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/vmalloc.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -2242,3 +2243,140 @@

return 0;
}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults. The holder
+ * must not hold any mm related lock. A single task can't take more
+ * than one mm lock in a row or it would deadlock.
+ */
+struct mm_lock_data *mm_lock(struct mm_struct * mm)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *i_mmap_lock_last, *anon_vma_lock_last;
+ unsigned long nr_i_mmap_locks, nr_anon_vma_locks, i;
+ struct mm_lock_data *data;
+ int err;
+
+ down_write(&mm->mmap_sem);
+
+ err = -EINTR;
+ nr_i_mmap_locks = nr_anon_vma_locks = 0;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ if (unlikely(signal_pending(current)))
+ goto out;
+
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ nr_i_mmap_locks++;
+ if (vma->anon_vma)
+ nr_anon_vma_locks++;
+ }
+
+ err = -ENOMEM;
+ data = kmalloc(sizeof(struct mm_lock_data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ if (nr_i_mmap_locks) {
+ data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
+ sizeof(spinlock_t));
+ if (!data->i_mmap_locks)
+ goto out_kfree;
+ } else
+ data->i_mmap_locks = NULL;
+
+ if (nr_anon_vma_locks) {
+ data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
+ sizeof(spinlock_t));
+ if (!data->anon_vma_locks)
+ goto out_vfree;
+ } else
+ data->anon_vma_locks = NULL;
+
+ err = -EINTR;
+ i_mmap_lock_last = NULL;
+ nr_i_mmap_locks = 0;
+ for (;;) {
+ spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ if (unlikely(signal_pending(current)))
+ goto out_vfree_both;
+
+ if (!vma->vm_file || !vma->vm_file->f_mapping)
+ continue;
+ if ((unsigned long) i_mmap_lock >
+ (unsigned long)
+ &vma->vm_file->f_mapping->i_mmap_lock &&
+ (unsigned long)
+ &vma->vm_file->f_mapping->i_mmap_lock >
+ (unsigned long) i_mmap_lock_last)
+ i_mmap_lock =
+ &vma->vm_file->f_mapping->i_mmap_lock;
+ }
+ if (i_mmap_lock == (spinlock_t *) -1UL)
+ break;
+ i_mmap_lock_last = i_mmap_lock;
+ data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
+ }
+ data->nr_i_mmap_locks = nr_i_mmap_locks;
+
+ anon_vma_lock_last = NULL;
+ nr_anon_vma_locks = 0;
+ for (;;) {
+ spinlock_t *anon_vma_lock = (spinlock_t *) -1UL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ if (unlikely(signal_pending(current)))
+ goto out_vfree_both;
+
+ if (!vma->anon_vma)
+ continue;
+ if ((unsigned long) anon_vma_lock >
+ (unsigned long) &vma->anon_vma->lock &&
+ (unsigned long) &vma->anon_vma->lock >
+ (unsigned long) anon_vma_lock_last)
+ anon_vma_lock = &vma->anon_vma->lock;
+ }
+ if (anon_vma_lock == (spinlock_t *) -1UL)
+ break;
+ anon_vma_lock_last = anon_vma_lock;
+ data->anon_vma_locks[nr_anon_vma_locks++] = anon_vma_lock;
+ }
+ data->nr_anon_vma_locks = nr_anon_vma_locks;
+
+ for (i = 0; i < nr_i_mmap_locks; i++)
+ spin_lock(data->i_mmap_locks[i]);
+ for (i = 0; i < nr_anon_vma_locks; i++)
+ spin_lock(data->anon_vma_locks[i]);
+
+ return data;
+
+out_vfree_both:
+ vfree(data->anon_vma_locks);
+out_vfree:
+ vfree(data->i_mmap_locks);
+out_kfree:
+ kfree(data);
+out:
+ up_write(&mm->mmap_sem);
+ return ERR_PTR(err);
+}
+
+void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data)
+{
+ unsigned long i;
+
+ for (i = 0; i < data->nr_i_mmap_locks; i++)
+ spin_unlock(data->i_mmap_locks[i]);
+ for (i = 0; i < data->nr_anon_vma_locks; i++)
+ spin_unlock(data->anon_vma_locks[i]);
+
+ up_write(&mm->mmap_sem);
+
+ vfree(data->i_mmap_locks);
+ vfree(data->anon_vma_locks);
+ kfree(data);
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


holt at sgi

Apr 16, 2008, 9:33 AM

Post #2 of 17 (726 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

I don't think this lock mechanism is completely working. I have
gotten a few failures trying to dereference 0x100100 which appears to
be LIST_POISON1.

Thanks,
Robin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


clameter at sgi

Apr 16, 2008, 11:35 AM

Post #3 of 17 (727 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Wed, 16 Apr 2008, Robin Holt wrote:

> I don't think this lock mechanism is completely working. I have
> gotten a few failures trying to dereference 0x100100 which appears to
> be LIST_POISON1.

How does xpmem unregistering of notifiers work?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


holt at sgi

Apr 16, 2008, 12:02 PM

Post #4 of 17 (724 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> On Wed, 16 Apr 2008, Robin Holt wrote:
>
> > I don't think this lock mechanism is completely working. I have
> > gotten a few failures trying to dereference 0x100100 which appears to
> > be LIST_POISON1.
>
> How does xpmem unregistering of notifiers work?

For the tests I have been running, we are waiting for the release
callout as part of exit.

Thanks,
Robin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


clameter at sgi

Apr 16, 2008, 12:15 PM

Post #5 of 17 (724 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Wed, 16 Apr 2008, Robin Holt wrote:

> On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> > On Wed, 16 Apr 2008, Robin Holt wrote:
> >
> > > I don't think this lock mechanism is completely working. I have
> > > gotten a few failures trying to dereference 0x100100 which appears to
> > > be LIST_POISON1.
> >
> > How does xpmem unregistering of notifiers work?
>
> For the tests I have been running, we are waiting for the release
> callout as part of exit.

Some more details on the failure may be useful. AFAICT list_del[_rcu] is
the culprit here and that is only used on release or unregister.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


holt at sgi

Apr 17, 2008, 4:14 AM

Post #6 of 17 (721 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Wed, Apr 16, 2008 at 12:15:08PM -0700, Christoph Lameter wrote:
> On Wed, 16 Apr 2008, Robin Holt wrote:
>
> > On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> > > On Wed, 16 Apr 2008, Robin Holt wrote:
> > >
> > > > I don't think this lock mechanism is completely working. I have
> > > > gotten a few failures trying to dereference 0x100100 which appears to
> > > > be LIST_POISON1.
> > >
> > > How does xpmem unregistering of notifiers work?
> >
> > For the tests I have been running, we are waiting for the release
> > callout as part of exit.
>
> Some more details on the failure may be useful. AFAICT list_del[_rcu] is
> the culprit here and that is only used on release or unregister.

I think I have this understood now. It happens quite quickly (within
10 minutes) on a 128 rank job of small data set in a loop.

In these failing jobs, all the ranks are nearly symmetric. There is
a certain part of each ranks address space that has access granted.
All the ranks have included all the other ranks including themselves in
exactly the same layout at exactly the same virtual address.

Rank 3 has hit _release and is beginning to clean up, but has not deleted
the notifier from its list.

Rank 9 calls the xpmem_invalidate_page() callout. That page was attached
by rank 3 so we call zap_page_range on rank 3 which then calls back into
xpmem's invalidate_range_start callout.

The rank 3 _release callout begins and deletes its notifier from the list.

Rank 9's call to rank 3's zap_page_range notifier returns and dereferences
LIST_POISON1.

I often confuse myself while trying to explain these so please kick me
where the holes in the flow appear. The console output from the simple
debugging stuff I put in is a bit overwhelming.


I am trying to figure out now which locks we hold as part of the zap
callout that should have prevented the _release callout.

Thanks,
Robin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andrea at qumranet

Apr 17, 2008, 8:51 AM

Post #7 of 17 (720 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> On Wed, 16 Apr 2008, Robin Holt wrote:
>
> > I don't think this lock mechanism is completely working. I have
> > gotten a few failures trying to dereference 0x100100 which appears to
> > be LIST_POISON1.
>
> How does xpmem unregistering of notifiers work?

Especially are you using mmu_notifier_unregister?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


holt at sgi

Apr 17, 2008, 9:36 AM

Post #8 of 17 (716 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Thu, Apr 17, 2008 at 05:51:57PM +0200, Andrea Arcangeli wrote:
> On Wed, Apr 16, 2008 at 11:35:38AM -0700, Christoph Lameter wrote:
> > On Wed, 16 Apr 2008, Robin Holt wrote:
> >
> > > I don't think this lock mechanism is completely working. I have
> > > gotten a few failures trying to dereference 0x100100 which appears to
> > > be LIST_POISON1.
> >
> > How does xpmem unregistering of notifiers work?
>
> Especially are you using mmu_notifier_unregister?

In this case, we are not making the call to unregister, we are waiting
for the _release callout which has already removed it from the list.

In the event that the user has removed all the grants, we use unregister.
That typically does not occur. We merely wait for exit processing to
clean up the structures.

Thanks,
Robin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andrea at qumranet

Apr 17, 2008, 10:14 AM

Post #9 of 17 (715 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Thu, Apr 17, 2008 at 11:36:42AM -0500, Robin Holt wrote:
> In this case, we are not making the call to unregister, we are waiting
> for the _release callout which has already removed it from the list.
>
> In the event that the user has removed all the grants, we use unregister.
> That typically does not occur. We merely wait for exit processing to
> clean up the structures.

Then it's very strange. LIST_POISON1 is set in n->next. If it was a
second hlist_del triggering the bug in theory list_poison2 should
trigger first, so perhaps it's really a notifier running despite a
mm_lock is taken? Could you post a full stack trace so I can see who's
running into LIST_POISON1? If it's really a notifier running outside
of some mm_lock that will be _immediately_ visible from the stack
trace that triggered the LIST_POISON1!

Also note, EMM isn't using the clean hlist_del, it's implementing list
by hand (with zero runtime gain) so all the debugging may not be
existent in EMM, so if it's really a mm_lock race, and it only
triggers with mmu notifiers and not with EMM, it doesn't necessarily
mean EMM is bug free. If you've a full stack trace it would greatly
help to verify what is mangling over the list when the oops triggers.

Thanks!
Andrea
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


holt at sgi

Apr 17, 2008, 10:25 AM

Post #10 of 17 (714 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Thu, Apr 17, 2008 at 07:14:43PM +0200, Andrea Arcangeli wrote:
> On Thu, Apr 17, 2008 at 11:36:42AM -0500, Robin Holt wrote:
> > In this case, we are not making the call to unregister, we are waiting
> > for the _release callout which has already removed it from the list.
> >
> > In the event that the user has removed all the grants, we use unregister.
> > That typically does not occur. We merely wait for exit processing to
> > clean up the structures.
>
> Then it's very strange. LIST_POISON1 is set in n->next. If it was a
> second hlist_del triggering the bug in theory list_poison2 should
> trigger first, so perhaps it's really a notifier running despite a
> mm_lock is taken? Could you post a full stack trace so I can see who's
> running into LIST_POISON1? If it's really a notifier running outside
> of some mm_lock that will be _immediately_ visible from the stack
> trace that triggered the LIST_POISON1!
>
> Also note, EMM isn't using the clean hlist_del, it's implementing list
> by hand (with zero runtime gain) so all the debugging may not be
> existent in EMM, so if it's really a mm_lock race, and it only
> triggers with mmu notifiers and not with EMM, it doesn't necessarily
> mean EMM is bug free. If you've a full stack trace it would greatly
> help to verify what is mangling over the list when the oops triggers.

The stack trace is below. I did not do this level of testing on emm so
I can not compare the two in this area.

This is for a different, but equivalent failure. I just reproduce the
LIST_POISON1 failure without trying to reproduce the exact same failure
as I had documented earlier (lost that stack trace, sorry).

Thanks,
Robin


<1>Unable to handle kernel paging request at virtual address 0000000000100100
<4>mpi006.f.x[23403]: Oops 11012296146944 [1]
<4>Modules linked in: nfs lockd sunrpc binfmt_misc thermal processor fan button loop md_mod dm_mod xpmem xp mspec sg
<4>
<4>Pid: 23403, CPU 114, comm: mpi006.f.x
<4>psr : 0000121008526010 ifs : 800000000000038b ip : [<a00000010015d6a1>] Not tainted (2.6.25-rc8)
<4>ip is at __mmu_notifier_invalidate_range_start+0x81/0x120
<4>unat: 0000000000000000 pfs : 000000000000038b rsc : 0000000000000003
<4>rnat: a000000100149a00 bsps: a000000000010740 pr : 66555666a9599aa9
<4>ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c0270033f
<4>csd : 0000000000000000 ssd : 0000000000000000
<4>b0 : a00000010015d670 b6 : a0000002101ddb40 b7 : a00000010000eb50
<4>f6 : 1003e2222222222222222 f7 : 000000000000000000000
<4>f8 : 000000000000000000000 f9 : 000000000000000000000
<4>f10 : 000000000000000000000 f11 : 000000000000000000000
<4>r1 : a000000100ef1190 r2 : e0000e6080cc1940 r3 : a0000002101edd10
<4>r8 : e0000e6080cc1970 r9 : 0000000000000000 r10 : e0000e6080cc19c8
<4>r11 : 20000003a6480000 r12 : e0000c60d31efb90 r13 : e0000c60d31e0000
<4>r14 : 000000000000004d r15 : e0000e6080cc1914 r16 : e0000e6080cc1970
<4>r17 : 20000003a6480000 r18 : 20000007bf900000 r19 : 0000000000040000
<4>r20 : e0000c60d31e0000 r21 : 0000000000000010 r22 : e0000e6080cc19a8
<4>r23 : e0000c60c55f1120 r24 : e0000c60d31efda0 r25 : e0000c60d31efd98
<4>r26 : e0000e60812166d0 r27 : e0000c60d31efdc0 r28 : e0000c60d31efdb8
<4>r29 : e0000c60d31e0b60 r30 : 0000000000000000 r31 : 0000000000000081
<4>
<4>Call Trace:
<4> [<a000000100014a20>] show_stack+0x40/0xa0
<4> sp=e0000c60d31ef760 bsp=e0000c60d31e11f0
<4> [<a000000100015330>] show_regs+0x850/0x8a0
<4> sp=e0000c60d31ef930 bsp=e0000c60d31e1198
<4> [<a000000100035ed0>] die+0x1b0/0x2e0
<4> sp=e0000c60d31ef930 bsp=e0000c60d31e1150
<4> [<a000000100060e90>] ia64_do_page_fault+0x8d0/0xa40
<4> sp=e0000c60d31ef930 bsp=e0000c60d31e1100
<4> [<a00000010000ab00>] ia64_leave_kernel+0x0/0x270
<4> sp=e0000c60d31ef9c0 bsp=e0000c60d31e1100
<4> [<a00000010015d6a0>] __mmu_notifier_invalidate_range_start+0x80/0x120
<4> sp=e0000c60d31efb90 bsp=e0000c60d31e10a8
<4> [<a00000010011b1d0>] unmap_vmas+0x70/0x14c0
<4> sp=e0000c60d31efb90 bsp=e0000c60d31e0fa8
<4> [<a00000010011c660>] zap_page_range+0x40/0x60
<4> sp=e0000c60d31efda0 bsp=e0000c60d31e0f70
<4> [<a0000002101d62d0>] xpmem_clear_PTEs+0x350/0x560 [xpmem]
<4> sp=e0000c60d31efdb0 bsp=e0000c60d31e0ef0
<4> [<a0000002101d1e30>] xpmem_remove_seg+0x3f0/0x700 [xpmem]
<4> sp=e0000c60d31efde0 bsp=e0000c60d31e0ea8
<4> [<a0000002101d2500>] xpmem_remove_segs_of_tg+0x80/0x140 [xpmem]
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e78
<4> [<a0000002101dda40>] xpmem_mmu_notifier_release+0x40/0x80 [xpmem]
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e58
<4> [<a00000010015d7f0>] __mmu_notifier_release+0xb0/0x100
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e38
<4> [<a000000100124430>] exit_mmap+0x50/0x180
<4> sp=e0000c60d31efe10 bsp=e0000c60d31e0e10
<4> [<a00000010008fb30>] mmput+0x70/0x180
<4> sp=e0000c60d31efe20 bsp=e0000c60d31e0dd8
<4> [<a000000100098df0>] exit_mm+0x1f0/0x220
<4> sp=e0000c60d31efe20 bsp=e0000c60d31e0da0
<4> [<a00000010009ca60>] do_exit+0x4e0/0xf40
<4> sp=e0000c60d31efe20 bsp=e0000c60d31e0d58
<4> [<a00000010009d640>] do_group_exit+0x180/0x1c0
<4> sp=e0000c60d31efe30 bsp=e0000c60d31e0d20
<4> [<a00000010009d6a0>] sys_exit_group+0x20/0x40
<4> sp=e0000c60d31efe30 bsp=e0000c60d31e0cc8
<4> [<a00000010000a960>] ia64_ret_from_syscall+0x0/0x20
<4> sp=e0000c60d31efe30 bsp=e0000c60d31e0cc8
<4> [<a000000000010720>] __kernel_syscall_via_break+0x0/0x20
<4> sp=e0000c60d31f0000 bsp=e0000c60d31e0cc8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


clameter at sgi

Apr 17, 2008, 12:10 PM

Post #11 of 17 (713 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Thu, 17 Apr 2008, Andrea Arcangeli wrote:

> Also note, EMM isn't using the clean hlist_del, it's implementing list
> by hand (with zero runtime gain) so all the debugging may not be
> existent in EMM, so if it's really a mm_lock race, and it only
> triggers with mmu notifiers and not with EMM, it doesn't necessarily
> mean EMM is bug free. If you've a full stack trace it would greatly
> help to verify what is mangling over the list when the oops triggers.

EMM was/is using a single linked list which allows atomic updates. Looked
cleaner to me since doubly linked list must update two pointers.

I have not seen docs on the locking so not sure why you use rcu
operations here? Isnt the requirement to have either rmap locks or
mmap_sem held enough to guarantee the consistency of the doubly linked list?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andrea at qumranet

Apr 17, 2008, 3:16 PM

Post #12 of 17 (706 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Thu, Apr 17, 2008 at 12:10:52PM -0700, Christoph Lameter wrote:
> EMM was/is using a single linked list which allows atomic updates. Looked
> cleaner to me since doubly linked list must update two pointers.

Cleaner would be if it would provide an abstraction in list.h. The
important is the memory taken by the head for this usage.

> I have not seen docs on the locking so not sure why you use rcu
> operations here? Isnt the requirement to have either rmap locks or
> mmap_sem held enough to guarantee the consistency of the doubly linked list?

Yes, exactly, I'm not using rcu anymore.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


rusty at rustcorp

Apr 21, 2008, 10:06 PM

Post #13 of 17 (693 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Wednesday 09 April 2008 01:44:04 Andrea Arcangeli wrote:
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1050,6 +1050,15 @@
> unsigned long addr, unsigned long len,
> unsigned long flags, struct page **pages);
>
> +struct mm_lock_data {
> + spinlock_t **i_mmap_locks;
> + spinlock_t **anon_vma_locks;
> + unsigned long nr_i_mmap_locks;
> + unsigned long nr_anon_vma_locks;
> +};
> +extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
> +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);

As far as I can tell you don't actually need to expose this struct at all?

> + data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
> + sizeof(spinlock_t));

This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)'
here.

> + data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
> + sizeof(spinlock_t));

and here.

> + err = -EINTR;
> + i_mmap_lock_last = NULL;
> + nr_i_mmap_locks = 0;
> + for (;;) {
> + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
...
> + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
> + }
> + data->nr_i_mmap_locks = nr_i_mmap_locks;

How about you track your running counter in data->nr_i_mmap_locks, leave
nr_i_mmap_locks alone, and BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks)?

Even nicer would be to wrap this in a "get_sorted_mmap_locks()" function.

Similarly for anon_vma locks.

Unfortunately, I just don't think we can fail locking like this. In your next
patch unregistering a notifier can fail because of it: that not usable.

I think it means you need to add a linked list element to the vma for the
CONFIG_MMU_NOTIFIER case. Or track the max number of vmas for any mm, and
keep a pool to handle mm_lock for this number (ie. if you can't enlarge the
pool, fail the vma allocation).

Both have their problems though...
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andrea at qumranet

Apr 25, 2008, 9:56 AM

Post #14 of 17 (671 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

I somehow lost missed this email in my inbox, found it now because it
was strangely still unread... Sorry for the late reply!

On Tue, Apr 22, 2008 at 03:06:24PM +1000, Rusty Russell wrote:
> On Wednesday 09 April 2008 01:44:04 Andrea Arcangeli wrote:
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1050,6 +1050,15 @@
> > unsigned long addr, unsigned long len,
> > unsigned long flags, struct page **pages);
> >
> > +struct mm_lock_data {
> > + spinlock_t **i_mmap_locks;
> > + spinlock_t **anon_vma_locks;
> > + unsigned long nr_i_mmap_locks;
> > + unsigned long nr_anon_vma_locks;
> > +};
> > +extern struct mm_lock_data *mm_lock(struct mm_struct * mm);
> > +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
>
> As far as I can tell you don't actually need to expose this struct at all?

Yes, it should be possible to only expose 'struct mm_lock_data;'.

> > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
> > + sizeof(spinlock_t));
>
> This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)'
> here.
>
> > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
> > + sizeof(spinlock_t));
>
> and here.

Great catch! (it was temporarily wasting some ram which isn't nice at all)

> > + err = -EINTR;
> > + i_mmap_lock_last = NULL;
> > + nr_i_mmap_locks = 0;
> > + for (;;) {
> > + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
> > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> ...
> > + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock;
> > + }
> > + data->nr_i_mmap_locks = nr_i_mmap_locks;
>
> How about you track your running counter in data->nr_i_mmap_locks, leave
> nr_i_mmap_locks alone, and BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks)?
>
> Even nicer would be to wrap this in a "get_sorted_mmap_locks()" function.

I'll try to clean this up further and I'll make a further update for review.

> Unfortunately, I just don't think we can fail locking like this. In your next
> patch unregistering a notifier can fail because of it: that not usable.

Fortunately I figured out we don't really need mm_lock in unregister
because it's ok to unregister in the middle of the range_begin/end
critical section (that's definitely not ok for register that's why
register needs mm_lock). And it's perfectly ok to fail in register().

Also it wasn't ok to unpin the module count in ->release as ->release
needs to 'ret' to get back to the mmu notifier code. And without any
unregister at all, the module can't be unloaded at all which
is quite unacceptable...

The logic is to prevent mmu_notifier_register to race with
mmu_notifier_release because it takes the mm_users pin (implicit or
explicit, and then mmput just after mmu_notifier_register
returns). Then _register serializes against all the mmu notifier
methods (except ->release) with srcu (->release can't run thanks to
the mm_users pin). The mmu_notifier_mm->lock then serializes the
modification on the list (register vs unregister) and it ensures one
and only one between _unregister and _releases calls ->release before
_unregister returns. All other methods runs freely with srcu. Having
the guarante that ->release is called just before all pages are freed
or inside _unregister, allows the module to zap and freeze its
secondary mmu inside ->release with the race condition of exit()
against mmu_notifier_unregister internally by the mmu notifier code
and without dependency on exit_files/exit_mm ordering depending if the
fd of the driver is open the filetables or in the vma only. The
mmu_notifier_mm can be reset to 0 only after the last mmdrop.

About the mm_count refcounting for _release and _unregiste: no mmu
notifier and not even mmu_notifier_unregister and _release can cope
with mmu_notfier_mm list and srcu structures going away out of
order. exit_mmap is safe as it holds an mm_count implicitly because
mmdrop is run after exit_mmap returns. mmu_notifier_unregister is safe
too as _register takes the mm_count pin. We can't prevent
mmu_notifer_mm to go away with mm_users as that will screwup the vma
filedescriptor closure that only happens inside exit_mmap (mm_users
pinned prevents exit_mmap to run, and it can only be taken temporarily
until _register returns).
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andrea at qumranet

Apr 25, 2008, 10:04 AM

Post #15 of 17 (669 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Fri, Apr 25, 2008 at 06:56:39PM +0200, Andrea Arcangeli wrote:
> > > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks *
> > > + sizeof(spinlock_t));
> >
> > This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)'
> > here.
> >
> > > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks *
> > > + sizeof(spinlock_t));
> >
> > and here.
>
> Great catch! (it was temporarily wasting some ram which isn't nice at all)

As I went into the editor I just found the above already fixed in
#v14-pre3. And I can't move the structure into the file anymore
without kmallocing it. Exposing that structure avoids the
ERR_PTR/PTR_ERR on the retvals and one kmalloc so I think it makes the
code simpler in the end to keep it as it is now. I'd rather avoid
further changes to the 1/N patch, as long as they don't make any
difference at runtime and as long as they involve more than
cut-and-pasting a structure from .h to .c file.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


holt at sgi

Apr 25, 2008, 12:25 PM

Post #16 of 17 (668 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Fri, Apr 25, 2008 at 06:56:40PM +0200, Andrea Arcangeli wrote:
> Fortunately I figured out we don't really need mm_lock in unregister
> because it's ok to unregister in the middle of the range_begin/end
> critical section (that's definitely not ok for register that's why
> register needs mm_lock). And it's perfectly ok to fail in register().

I think you still need mm_lock (unless I miss something). What happens
when one callout is scanning mmu_notifier_invalidate_range_start() and
you unlink. That list next pointer with LIST_POISON1 which is a really
bad address for the processor to track.

Maybe I misunderstood your description.

Thanks,
Robin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


andrea at qumranet

Apr 25, 2008, 5:57 PM

Post #17 of 17 (664 views)
Permalink
Re: [PATCH 1 of 9] Lock the entire mm to prevent any mmu related operation to happen [In reply to]

On Fri, Apr 25, 2008 at 02:25:32PM -0500, Robin Holt wrote:
> I think you still need mm_lock (unless I miss something). What happens
> when one callout is scanning mmu_notifier_invalidate_range_start() and
> you unlink. That list next pointer with LIST_POISON1 which is a really
> bad address for the processor to track.

Ok, _release list_del_init qcan't race with that because it happens in
exit_mmap when no other mmu notifier can trigger anymore.

_unregister can run concurrently but it does list_del_rcu, that only
overwrites the pprev pointer with LIST_POISON2. The
mmu_notifier_invalidate_range_start won't crash on LIST_POISON1 thanks
to srcu.

Actually I did more changes than necessary, for example I noticed the
mmu_notifier_register can return a list_add_head instead of
list_add_head_rcu. _register can't race against _release thanks to the
mm_users temporary or implicit pin. _register can't race against
_unregister thanks to the mmu_notifier_mm->lock. And register can't
race against all other mmu notifiers thanks to the mm_lock.

At this time I've no other pending patches on top of v14-pre3 other
than the below micro-optimizing cleanup. It'd be great to have
confirmation that v14-pre3 passes GRU/XPMEM regressions tests as well
as my KVM testing already passed successfully on it. I'll forward
v14-pre3 mmu-notifier-core plus the below to Andrew tomorrow, I'm
trying to be optimistic here! ;)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -187,7 +187,7 @@ int mmu_notifier_register(struct mmu_not
* current->mm or explicitly with get_task_mm() or similar).
*/
spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
out_unlock:
mm_unlock(mm, &data);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo[at]vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact lists@gossamer-threads.com
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.