Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH] show per-process swap usage via procfs

 

 

Linux kernel RSS feed   Index | Next | Previous | View Threaded


kamezawa.hiroyu at jp

Nov 3, 2009, 10:24 PM

Post #1 of 10 (179 views)
Permalink
[PATCH] show per-process swap usage via procfs

This passed several tests, and one bug has been fixed since the RFC version.
This patch is against mmotm.
=
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu [at] jp>

Currently, anon_rss and file_rss are counted as RSS and exported via /proc.
RSS usage is important information, but another piece of information that
users often ask for is "usage of swap" (according to our user support team).

This patch counts swap entry usage per process and shows it via
/proc/<pid>/status. I think the status file is robust against new entries,
so it is the first candidate.

After this, /proc/<pid>/status includes the following line:
<snip>
VmPeak: 315360 kB
VmSize: 315360 kB
VmLck: 0 kB
VmHWM: 180452 kB
VmRSS: 180452 kB
VmData: 311624 kB
VmStk: 84 kB
VmExe: 4 kB
VmLib: 1568 kB
VmPTE: 640 kB
VmSwap: 131240 kB <=== new information

Note:
Because this patch counts the swap PTEs found in the page table, it does
not catch shmem's swapout. That is already accounted for per shmem
inode, so we don't need to do more.

Changelog: 2009/11/03
- clean up.
- fixed initialization bug at fork (mm_init())

Acked-by: David Rientjes <rientjes [at] google>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu [at] jp>
---
fs/proc/task_mmu.c | 9 ++++++---
include/linux/mm_types.h | 1 +
kernel/fork.c | 1 +
mm/memory.c | 30 +++++++++++++++++++++---------
mm/rmap.c | 1 +
mm/swapfile.c | 1 +
6 files changed, 31 insertions(+), 12 deletions(-)

Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
+++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;
+ mm_counter_t _swap_usage;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
return 0;
}

-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
{
if (file_rss)
add_mm_counter(mm, file_rss, file_rss);
if (anon_rss)
add_mm_counter(mm, anon_rss, anon_rss);
+ if (swap_usage)
+ add_mm_counter(mm, swap_usage, swap_usage);
}

/*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
+ if (!is_migration_entry(entry))
+ rss[2]++;
+ else if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[3];
swp_entry_t entry = (swp_entry_t){0};

again:
- rss[1] = rss[0] = 0;
+ rss[2] = rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -688,7 +693,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();

@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
spinlock_t *ptl;
int file_rss = 0;
int anon_rss = 0;
+ int swap_usage = 0;

pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!is_migration_entry(ent))
+ swap_usage--;
+ if (unlikely(!free_swap_and_cache(ent)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss(mm, file_rss, anon_rss, swap_usage);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);

@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
*/

inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, swap_usage);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mmotm-2.6.32-Nov2/mm/swapfile.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
+++ mmotm-2.6.32-Nov2/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
}

inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, swap_usage);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
+++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long data, text, lib;
- unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;

/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+ swap = get_mm_counter(mm, swap_usage);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n",
+ "VmPTE:\t%8lu kB\n"
+ "VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ swap << (PAGE_SHIFT - 10));
}

unsigned long task_vsize(struct mm_struct *mm)
Index: mmotm-2.6.32-Nov2/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/rmap.c
+++ mmotm-2.6.32-Nov2/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
+ inc_mm_counter(mm, swap_usage);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
Index: mmotm-2.6.32-Nov2/kernel/fork.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/kernel/fork.c
+++ mmotm-2.6.32-Nov2/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
+ set_mm_counter(mm, swap_usage, 0);
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 4, 2009, 11:15 AM

Post #2 of 10 (167 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)

Hmmm... Could we do some rework of the counters first so that they are per
cpu?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


kosaki.motohiro at jp

Nov 4, 2009, 3:25 PM

Post #3 of 10 (165 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

> On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > RSS usage is important information but one more information which
> > is often asked by users is "usage of swap".(user support team said.)
>
> Hmmm... Could we do some rework of the counters first so that they are per
> cpu?

A per-cpu swap counter?
That seems like overkill....



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


kamezawa.hiroyu at jp

Nov 4, 2009, 4:06 PM

Post #4 of 10 (167 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

On Wed, 4 Nov 2009 14:15:40 -0500 (EST)
Christoph Lameter <cl [at] linux-foundation> wrote:

> On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > RSS usage is important information but one more information which
> > is often asked by users is "usage of swap".(user support team said.)
>
> Hmmm... Could we do some rework of the counters first so that they are per
> cpu?
>
I don't think the swap_usage counter costs much, because its call path
is always a slow path. But I'm not in a hurry, so a rework is ok.

I'll post my percpu array counter with some rework, CCing you.
Maybe it can be used in this case.

Thanks,
-Kame

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


kamezawa.hiroyu at jp

Nov 4, 2009, 6:28 PM

Post #5 of 10 (158 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

On Thu, 5 Nov 2009 08:25:28 +0900 (JST)
KOSAKI Motohiro <kosaki.motohiro [at] jp> wrote:

> > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> >
> > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > RSS usage is important information but one more information which
> > > is often asked by users is "usage of swap".(user support team said.)
> >
> > Hmmm... Could we do some rework of the counters first so that they are per
> > cpu?
>
> per-cpu swap counter?
> It seems overkill effort....
>
I nearly agree with you.

Thanks,
-Kame

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


kosaki.motohiro at jp

Nov 5, 2009, 6:41 AM

Post #6 of 10 (157 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

2009/11/4 KAMEZAWA Hiroyuki <kamezawa.hiroyu [at] jp>:
>
> Passed several tests and one bug was fixed since RFC version.
> This patch is against mmotm.
> =
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu [at] jp>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
>  After this, /proc/<pid>/status includes following line
>  <snip>
>  VmPeak:   315360 kB
>  VmSize:   315360 kB
>  VmLck:         0 kB
>  VmHWM:    180452 kB
>  VmRSS:    180452 kB
>  VmData:   311624 kB
>  VmStk:        84 kB
>  VmExe:         4 kB
>  VmLib:      1568 kB
>  VmPTE:       640 kB
>  VmSwap:   131240 kB <=== new information
>
> Note:
>  Because this patch catches swap_pte on page table, this will
>  not catch shmem's swapout. It's already accounted in per-shmem
>  inode and we don't need to do more.

Sidenote: top(1) can show SWAP usage, but its implementation is crazily
buggy: it defines VIRT = SWAP + RES (see man top or the actual source
code). This patch helps to fix that insane calculation.

Acked-by: KOSAKI Motohiro <kosaki.motohiro [at] jp>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


cl at linux-foundation

Nov 5, 2009, 7:04 AM

Post #7 of 10 (154 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

On Thu, 5 Nov 2009, KOSAKI Motohiro wrote:

> > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> >
> > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > RSS usage is important information but one more information which
> > > is often asked by users is "usage of swap".(user support team said.)
> >
> > Hmmm... Could we do some rework of the counters first so that they are per
> > cpu?
>
> per-cpu swap counter?
> It seems overkill effort....

The other alternative is to use atomic ops which are significantly slower
and have an impact on critical sections.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


minchan.kim at gmail

Nov 5, 2009, 7:11 AM

Post #8 of 10 (155 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

Hi, Kame.

On Wed, Nov 4, 2009 at 3:24 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu [at] jp> wrote:
>
> Passed several tests and one bug was fixed since RFC version.
> This patch is against mmotm.
> =
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu [at] jp>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
>  After this, /proc/<pid>/status includes following line
>  <snip>
>  VmPeak:   315360 kB
>  VmSize:   315360 kB
>  VmLck:         0 kB
>  VmHWM:    180452 kB
>  VmRSS:    180452 kB
>  VmData:   311624 kB
>  VmStk:        84 kB
>  VmExe:         4 kB
>  VmLib:      1568 kB
>  VmPTE:       640 kB
>  VmSwap:   131240 kB <=== new information
>
> Note:
>  Because this patch catches swap_pte on page table, this will
>  not catch shmem's swapout. It's already accounted in per-shmem
>  inode and we don't need to do more.
>
> Changelog: 2009/11/03
>  - clean up.
>  - fixed initialization bug at fork (init_mm())
>
> Acked-by: Acked-by; David Rientjes <rientjes [at] google>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu [at] jp>
> ---
>  fs/proc/task_mmu.c       |    9 ++++++---
>  include/linux/mm_types.h |    1 +
>  kernel/fork.c            |    1 +
>  mm/memory.c              |   30 +++++++++++++++++++++---------
>  mm/rmap.c                |    1 +
>  mm/swapfile.c            |    1 +
>  6 files changed, 31 insertions(+), 12 deletions(-)
>
> Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
> +++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
> @@ -228,6 +228,7 @@ struct mm_struct {
>         */
>        mm_counter_t _file_rss;
>        mm_counter_t _anon_rss;
> +       mm_counter_t _swap_usage;
>
>        unsigned long hiwater_rss;      /* High-watermark of RSS usage */
>        unsigned long hiwater_vm;       /* High-water virtual memory usage */
> Index: mmotm-2.6.32-Nov2/mm/memory.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/memory.c
> +++ mmotm-2.6.32-Nov2/mm/memory.c
> @@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
>        return 0;
>  }
>
> -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
> +static inline void
> +add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
>  {
>        if (file_rss)
>                add_mm_counter(mm, file_rss, file_rss);
>        if (anon_rss)
>                add_mm_counter(mm, anon_rss, anon_rss);
> +       if (swap_usage)
> +               add_mm_counter(mm, swap_usage, swap_usage);
>  }
>
>  /*
> @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
>                                                 &src_mm->mmlist);
>                                spin_unlock(&mmlist_lock);
>                        }
> -                       if (is_write_migration_entry(entry) &&
> +                       if (!is_migration_entry(entry))
> +                               rss[2]++;

My first thought is: are we assuming that !is_migration_entry(entry) means
a swap entry?
We have begun supporting HWPOISON.
HWPOISON should be a rare event, so somewhat less exact swap accounting may
be acceptable, I think. Is that enough to justify this?

> +                       else if (is_write_migration_entry(entry) &&
>                                        is_cow_mapping(vm_flags)) {
>                                /*
>                                 * COW mappings require pages in both parent
> @@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
>        pte_t *src_pte, *dst_pte;
>        spinlock_t *src_ptl, *dst_ptl;
>        int progress = 0;
> -       int rss[2];
> +       int rss[3];
>        swp_entry_t entry = (swp_entry_t){0};
>
>  again:
> -       rss[1] = rss[0] = 0;
> +       rss[2] = rss[1] = rss[0] = 0;
>        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
>        if (!dst_pte)
>                return -ENOMEM;
> @@ -688,7 +693,7 @@ again:
>        arch_leave_lazy_mmu_mode();
>        spin_unlock(src_ptl);
>        pte_unmap_nested(orig_src_pte);
> -       add_mm_rss(dst_mm, rss[0], rss[1]);
> +       add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
>        pte_unmap_unlock(orig_dst_pte, dst_ptl);
>        cond_resched();
>
> @@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
>        spinlock_t *ptl;
>        int file_rss = 0;
>        int anon_rss = 0;
> +       int swap_usage = 0;
>
>        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
>        arch_enter_lazy_mmu_mode();
> @@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
>                if (pte_file(ptent)) {
>                        if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
>                                print_bad_pte(vma, addr, ptent, NULL);
> -               } else if
> -                 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
> -                       print_bad_pte(vma, addr, ptent, NULL);
> +               } else {
> +                       swp_entry_t ent = pte_to_swp_entry(ptent);
> +
> +                       if (!is_migration_entry(ent))
> +                               swap_usage--;

ditto

> +                       if (unlikely(!free_swap_and_cache(ent)))
> +                               print_bad_pte(vma, addr, ptent, NULL);
> +               }
>                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
>        } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
>
> -       add_mm_rss(mm, file_rss, anon_rss);
> +       add_mm_rss(mm, file_rss, anon_rss, swap_usage);
>        arch_leave_lazy_mmu_mode();
>        pte_unmap_unlock(pte - 1, ptl);
>
> @@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
>         */
>
>        inc_mm_counter(mm, anon_rss);
> +       dec_mm_counter(mm, swap_usage);
>        pte = mk_pte(page, vma->vm_page_prot);
>        if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
>                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> Index: mmotm-2.6.32-Nov2/mm/swapfile.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
> +++ mmotm-2.6.32-Nov2/mm/swapfile.c
> @@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
>        }
>
>        inc_mm_counter(vma->vm_mm, anon_rss);
> +       dec_mm_counter(vma->vm_mm, swap_usage);
>        get_page(page);
>        set_pte_at(vma->vm_mm, addr, pte,
>                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
> +++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> @@ -17,7 +17,7 @@
>  void task_mem(struct seq_file *m, struct mm_struct *mm)
>  {
>        unsigned long data, text, lib;
> -       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> +       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
>
>        /*
>         * Note: to minimize their overhead, mm maintains hiwater_vm and
> @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
>        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
>        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
>        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
> +       swap = get_mm_counter(mm, swap_usage);
>        seq_printf(m,
>                "VmPeak:\t%8lu kB\n"
>                "VmSize:\t%8lu kB\n"
> @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
>                "VmStk:\t%8lu kB\n"
>                "VmExe:\t%8lu kB\n"
>                "VmLib:\t%8lu kB\n"
> -               "VmPTE:\t%8lu kB\n",
> +               "VmPTE:\t%8lu kB\n"
> +               "VmSwap:\t%8lu kB\n",
>                hiwater_vm << (PAGE_SHIFT-10),
>                (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
>                mm->locked_vm << (PAGE_SHIFT-10),
> @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
>                total_rss << (PAGE_SHIFT-10),
>                data << (PAGE_SHIFT-10),
>                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
> -               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
> +               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
> +               swap << (PAGE_SHIFT - 10));
>  }
>
>  unsigned long task_vsize(struct mm_struct *mm)
> Index: mmotm-2.6.32-Nov2/mm/rmap.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/rmap.c
> +++ mmotm-2.6.32-Nov2/mm/rmap.c
> @@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
>                                spin_unlock(&mmlist_lock);
>                        }
>                        dec_mm_counter(mm, anon_rss);
> +                       inc_mm_counter(mm, swap_usage);
>                } else if (PAGE_MIGRATION) {
>                        /*
>                         * Store the pfn of the page in a special migration
> Index: mmotm-2.6.32-Nov2/kernel/fork.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/kernel/fork.c
> +++ mmotm-2.6.32-Nov2/kernel/fork.c
> @@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
>        mm->nr_ptes = 0;
>        set_mm_counter(mm, file_rss, 0);
>        set_mm_counter(mm, anon_rss, 0);
> +       set_mm_counter(mm, swap_usage, 0);
>        spin_lock_init(&mm->page_table_lock);
>        mm->free_area_cache = TASK_UNMAPPED_BASE;
>        mm->cached_hole_size = ~0UL;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo [at] kvack  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont [at] kvack"> email [at] kvack </a>
>

That's good.
From now on, we can change the pte scanning used to find swap ptes
in smaps_pte_range, too. :)

--
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


kamezawa.hiroyu at jp

Nov 5, 2009, 3:48 PM

Post #9 of 10 (150 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

On Fri, 6 Nov 2009 00:11:32 +0900
Minchan Kim <minchan.kim [at] gmail> wrote:

> Hi, Kame.
>
Hi,

<snip>
> >  /*
> > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> >                                                 &src_mm->mmlist);
> >                                spin_unlock(&mmlist_lock);
> >                        }
> > -                       if (is_write_migration_entry(entry) &&
> > +                       if (!is_migration_entry(entry))
> > +                               rss[2]++;
>
> First thought I come to is that we believe !is_migration_entry(entry) equal
> swap entry?
> We began supporting HWPOISON.
> HWPOISON would be rare event so some less exact swap accouting may
> be allowed, I think. Is this enough to jusitfy that?
>
Ah, ok, I'll fix here.


> > +                       else if (is_write_migration_entry(entry) &&
> >                                        is_cow_mapping(vm_flags)) {
> >                                /*
> >                                 * COW mappings require pages in both parent
> > @@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
> >        pte_t *src_pte, *dst_pte;
> >        spinlock_t *src_ptl, *dst_ptl;
> >        int progress = 0;
> > -       int rss[2];
> > +       int rss[3];
> >        swp_entry_t entry = (swp_entry_t){0};
> >
> >  again:
> > -       rss[1] = rss[0] = 0;
> > +       rss[2] = rss[1] = rss[0] = 0;
> >        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
> >        if (!dst_pte)
> >                return -ENOMEM;
> > @@ -688,7 +693,7 @@ again:
> >        arch_leave_lazy_mmu_mode();
> >        spin_unlock(src_ptl);
> >        pte_unmap_nested(orig_src_pte);
> > -       add_mm_rss(dst_mm, rss[0], rss[1]);
> > +       add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
> >        pte_unmap_unlock(orig_dst_pte, dst_ptl);
> >        cond_resched();
> >
> > @@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
> >        spinlock_t *ptl;
> >        int file_rss = 0;
> >        int anon_rss = 0;
> > +       int swap_usage = 0;
> >
> >        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> >        arch_enter_lazy_mmu_mode();
> > @@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
> >                if (pte_file(ptent)) {
> >                        if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
> >                                print_bad_pte(vma, addr, ptent, NULL);
> > -               } else if
> > -                 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
> > -                       print_bad_pte(vma, addr, ptent, NULL);
> > +               } else {
> > +                       swp_entry_t ent = pte_to_swp_entry(ptent);
> > +
> > +                       if (!is_migration_entry(ent))
> > +                               swap_usage--;
>
> ditto
>
ok, will do.


> > +                       if (unlikely(!free_swap_and_cache(ent)))
> > +                               print_bad_pte(vma, addr, ptent, NULL);
> > +               }
> >                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> >        } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
> >
> > -       add_mm_rss(mm, file_rss, anon_rss);
> > +       add_mm_rss(mm, file_rss, anon_rss, swap_usage);
> >        arch_leave_lazy_mmu_mode();
> >        pte_unmap_unlock(pte - 1, ptl);
> >
> > @@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
> >         */
> >
> >        inc_mm_counter(mm, anon_rss);
> > +       dec_mm_counter(mm, swap_usage);
> >        pte = mk_pte(page, vma->vm_page_prot);
> >        if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
> >                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> > Index: mmotm-2.6.32-Nov2/mm/swapfile.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
> > +++ mmotm-2.6.32-Nov2/mm/swapfile.c
> > @@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
> >        }
> >
> >        inc_mm_counter(vma->vm_mm, anon_rss);
> > +       dec_mm_counter(vma->vm_mm, swap_usage);
> >        get_page(page);
> >        set_pte_at(vma->vm_mm, addr, pte,
> >                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> > Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
> > +++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> > @@ -17,7 +17,7 @@
> >  void task_mem(struct seq_file *m, struct mm_struct *mm)
> >  {
> >        unsigned long data, text, lib;
> > -       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> > +       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
> >
> >        /*
> >         * Note: to minimize their overhead, mm maintains hiwater_vm and
> > @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
> >        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
> >        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
> >        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
> > +       swap = get_mm_counter(mm, swap_usage);
> >        seq_printf(m,
> >                "VmPeak:\t%8lu kB\n"
> >                "VmSize:\t%8lu kB\n"
> > @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
> >                "VmStk:\t%8lu kB\n"
> >                "VmExe:\t%8lu kB\n"
> >                "VmLib:\t%8lu kB\n"
> > -               "VmPTE:\t%8lu kB\n",
> > +               "VmPTE:\t%8lu kB\n"
> > +               "VmSwap:\t%8lu kB\n",
> >                hiwater_vm << (PAGE_SHIFT-10),
> >                (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
> >                mm->locked_vm << (PAGE_SHIFT-10),
> > @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
> >                total_rss << (PAGE_SHIFT-10),
> >                data << (PAGE_SHIFT-10),
> >                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
> > -               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
> > +               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
> > +               swap << (PAGE_SHIFT - 10));
> >  }
> >
> >  unsigned long task_vsize(struct mm_struct *mm)
> > Index: mmotm-2.6.32-Nov2/mm/rmap.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/mm/rmap.c
> > +++ mmotm-2.6.32-Nov2/mm/rmap.c
> > @@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
> >                                spin_unlock(&mmlist_lock);
> >                        }
> >                        dec_mm_counter(mm, anon_rss);
> > +                       inc_mm_counter(mm, swap_usage);
> >                } else if (PAGE_MIGRATION) {
> >                        /*
> >                         * Store the pfn of the page in a special migration
> > Index: mmotm-2.6.32-Nov2/kernel/fork.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/kernel/fork.c
> > +++ mmotm-2.6.32-Nov2/kernel/fork.c
> > @@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
> >        mm->nr_ptes = 0;
> >        set_mm_counter(mm, file_rss, 0);
> >        set_mm_counter(mm, anon_rss, 0);
> > +       set_mm_counter(mm, swap_usage, 0);
> >        spin_lock_init(&mm->page_table_lock);
> >        mm->free_area_cache = TASK_UNMAPPED_BASE;
> >        mm->cached_hole_size = ~0UL;
> >
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo [at] kvack  For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont [at] kvack"> email [at] kvack </a>
> >
>
> That's good.
> From now on, we can chagne scanning of pte to find swap pte
> in smaps_pte_rangem, too. :)
>

Thanks, I'll update this.
-Kame


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


pavel at ucw

Nov 8, 2009, 9:04 AM

Post #10 of 10 (146 views)
Permalink
Re: [PATCH] show per-process swap usage via procfs [In reply to]

On Thu 2009-11-05 10:04:01, Christoph Lameter wrote:
> On Thu, 5 Nov 2009, KOSAKI Motohiro wrote:
>
> > > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> > >
> > > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > > RSS usage is important information but one more information which
> > > > is often asked by users is "usage of swap".(user support team said.)
> > >
> > > Hmmm... Could we do some rework of the counters first so that they are per
> > > cpu?
> >
> > per-cpu swap counter?
> > It seems overkill effort....
>
> The other alternative is to use atomic ops which are significantly slower
> and have an impact on critical sections.

...but compared to disk i/o, overhead should be almost zero, right?
Keep it simple...

Pavel

--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.