Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH 36/40] autonuma: page_autonuma

 

 

Linux kernel RSS feed   Index | Next | Previous | View Threaded


aarcange at redhat

Jun 28, 2012, 5:56 AM

Post #1 of 5 (68 views)
Permalink
[PATCH 36/40] autonuma: page_autonuma

Move the AutoNUMA per page information from the "struct page" to a
separate page_autonuma data structure allocated in the memsection
(with sparsemem) or in the pgdat (with flatmem).

This is done to avoid growing the size of the "struct page". The
page_autonuma data is only allocated if the kernel has been booted on
real NUMA hardware and "noautonuma" has not been passed as a parameter
to the kernel.

Signed-off-by: Andrea Arcangeli <aarcange [at] redhat>
---
include/linux/autonuma.h | 18 +++-
include/linux/autonuma_flags.h | 6 +
include/linux/autonuma_types.h | 55 ++++++++++
include/linux/mm_types.h | 26 -----
include/linux/mmzone.h | 14 +++-
include/linux/page_autonuma.h | 53 +++++++++
init/main.c | 2 +
mm/Makefile | 2 +-
mm/autonuma.c | 98 ++++++++++-------
mm/huge_memory.c | 26 +++--
mm/page_alloc.c | 21 +---
mm/page_autonuma.c | 234 ++++++++++++++++++++++++++++++++++++++++
mm/sparse.c | 126 ++++++++++++++++++++-
13 files changed, 577 insertions(+), 104 deletions(-)
create mode 100644 include/linux/page_autonuma.h
create mode 100644 mm/page_autonuma.c

diff --git a/include/linux/autonuma.h b/include/linux/autonuma.h
index 85ca5eb..67af86a 100644
--- a/include/linux/autonuma.h
+++ b/include/linux/autonuma.h
@@ -7,15 +7,26 @@

extern void autonuma_enter(struct mm_struct *mm);
extern void autonuma_exit(struct mm_struct *mm);
-extern void __autonuma_migrate_page_remove(struct page *page);
+extern void __autonuma_migrate_page_remove(struct page *,
+ struct page_autonuma *);
extern void autonuma_migrate_split_huge_page(struct page *page,
struct page *page_tail);
extern void autonuma_setup_new_exec(struct task_struct *p);
+extern struct page_autonuma *lookup_page_autonuma(struct page *page);

static inline void autonuma_migrate_page_remove(struct page *page)
{
- if (ACCESS_ONCE(page->autonuma_migrate_nid) >= 0)
- __autonuma_migrate_page_remove(page);
+ struct page_autonuma *page_autonuma = lookup_page_autonuma(page);
+ if (ACCESS_ONCE(page_autonuma->autonuma_migrate_nid) >= 0)
+ __autonuma_migrate_page_remove(page, page_autonuma);
+}
+
+static inline void autonuma_free_page(struct page *page)
+{
+ if (!autonuma_impossible()) {
+ autonuma_migrate_page_remove(page);
+ lookup_page_autonuma(page)->autonuma_last_nid = -1;
+ }
}

#define autonuma_printk(format, args...) \
@@ -29,6 +40,7 @@ static inline void autonuma_migrate_page_remove(struct page *page) {}
static inline void autonuma_migrate_split_huge_page(struct page *page,
struct page *page_tail) {}
static inline void autonuma_setup_new_exec(struct task_struct *p) {}
+static inline void autonuma_free_page(struct page *page) {}

#endif /* CONFIG_AUTONUMA */

diff --git a/include/linux/autonuma_flags.h b/include/linux/autonuma_flags.h
index 5e29a75..035d993 100644
--- a/include/linux/autonuma_flags.h
+++ b/include/linux/autonuma_flags.h
@@ -15,6 +15,12 @@ enum autonuma_flag {

extern unsigned long autonuma_flags;

+static inline bool autonuma_impossible(void)
+{
+ return num_possible_nodes() <= 1 ||
+ test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
+}
+
static inline bool autonuma_enabled(void)
{
return !!test_bit(AUTONUMA_FLAG, &autonuma_flags);
diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h
index 9e697e3..1e860f6 100644
--- a/include/linux/autonuma_types.h
+++ b/include/linux/autonuma_types.h
@@ -39,6 +39,61 @@ struct task_autonuma {
unsigned long task_numa_fault[0];
};

+/*
+ * Per page (or per-pageblock) structure dynamically allocated only if
+ * autonuma is not impossible.
+ */
+struct page_autonuma {
+ /*
+ * To modify autonuma_last_nid locklessly, the architecture
+ * needs SMP atomic granularity < sizeof(long). Not all archs
+ * have that, notably some ancient alpha (but none of those
+ * should run in NUMA systems). Archs without that require
+ * autonuma_last_nid to be a long.
+ */
+#if BITS_PER_LONG > 32
+ /*
+ * autonuma_migrate_nid is -1 if the page_autonuma structure
+ * is not linked into any
+ * pgdat->autonuma_migrate_head. Otherwise it means the
+ * page_autonuma structure is linked into the
+ * &NODE_DATA(autonuma_migrate_nid)->autonuma_migrate_head[page_nid].
+ * page_nid is the nid that the page (referenced by the
+ * page_autonuma structure) belongs to.
+ */
+ int autonuma_migrate_nid;
+ /*
+ * autonuma_last_nid records which is the NUMA nid that tried
+ * to access this page at the last NUMA hinting page fault.
+ * If it changed, AutoNUMA will not try to migrate the page to
+ * the nid where the thread is running; on the contrary, it
+ * will make the different threads thrashing on the same pages
+ * converge on the same NUMA node (if possible).
+ */
+ int autonuma_last_nid;
+#else
+#if MAX_NUMNODES >= 32768
+#error "too many nodes"
+#endif
+ short autonuma_migrate_nid;
+ short autonuma_last_nid;
+#endif
+ /*
+ * This is the list node that links the page (referenced by
+ * the page_autonuma structure) in the
+ * &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid] lru.
+ */
+ struct list_head autonuma_migrate_node;
+
+ /*
+ * To find the page starting from the autonuma_migrate_node we
+ * need a backlink.
+ *
+ * FIXME: drop it;
+ */
+ struct page *page;
+};
+
extern int alloc_task_autonuma(struct task_struct *tsk,
struct task_struct *orig,
int node);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d1248cf..f0c6379 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -136,32 +136,6 @@ struct page {
struct page *first_page; /* Compound tail pages */
};

-#ifdef CONFIG_AUTONUMA
- /*
- * FIXME: move to pgdat section along with the memcg and allocate
- * at runtime only in presence of a numa system.
- */
- /*
- * To modify autonuma_last_nid lockless the architecture,
- * needs SMP atomic granularity < sizeof(long), not all archs
- * have that, notably some ancient alpha (but none of those
- * should run in NUMA systems). Archs without that requires
- * autonuma_last_nid to be a long.
- */
-#if BITS_PER_LONG > 32
- int autonuma_migrate_nid;
- int autonuma_last_nid;
-#else
-#if MAX_NUMNODES >= 32768
-#error "too many nodes"
-#endif
- /* FIXME: remember to check the updates are atomic */
- short autonuma_migrate_nid;
- short autonuma_last_nid;
-#endif
- struct list_head autonuma_migrate_node;
-#endif
-
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d53b26a..e66da74 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -698,10 +698,13 @@ typedef struct pglist_data {
int kswapd_max_order;
enum zone_type classzone_idx;
#ifdef CONFIG_AUTONUMA
- spinlock_t autonuma_lock;
+#if !defined(CONFIG_SPARSEMEM)
+ struct page_autonuma *node_page_autonuma;
+#endif
struct list_head autonuma_migrate_head[MAX_NUMNODES];
unsigned long autonuma_nr_migrate_pages;
wait_queue_head_t autonuma_knuma_migrated_wait;
+ spinlock_t autonuma_lock;
#endif
} pg_data_t;

@@ -1064,6 +1067,15 @@ struct mem_section {
* section. (see memcontrol.h/page_cgroup.h about this.)
*/
struct page_cgroup *page_cgroup;
+#endif
+#ifdef CONFIG_AUTONUMA
+ /*
+ * If SPARSEMEM, pgdat doesn't have page_autonuma pointer. We use
+ * section.
+ */
+ struct page_autonuma *section_page_autonuma;
+#endif
+#if defined(CONFIG_CGROUP_MEM_RES_CTLR) ^ defined(CONFIG_AUTONUMA)
unsigned long pad;
#endif
};
diff --git a/include/linux/page_autonuma.h b/include/linux/page_autonuma.h
new file mode 100644
index 0000000..d748aa2
--- /dev/null
+++ b/include/linux/page_autonuma.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_PAGE_AUTONUMA_H
+#define _LINUX_PAGE_AUTONUMA_H
+
+#if defined(CONFIG_AUTONUMA) && !defined(CONFIG_SPARSEMEM)
+extern void __init page_autonuma_init_flatmem(void);
+#else
+static inline void __init page_autonuma_init_flatmem(void) {}
+#endif
+
+#ifdef CONFIG_AUTONUMA
+
+#include <linux/autonuma_flags.h>
+
+extern void __meminit page_autonuma_map_init(struct page *page,
+ struct page_autonuma *page_autonuma,
+ int nr_pages);
+
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_AUTONUMA_SIZE (sizeof(struct page_autonuma))
+#define SECTION_PAGE_AUTONUMA_SIZE (PAGE_AUTONUMA_SIZE * \
+ PAGES_PER_SECTION)
+#endif
+
+extern void __meminit pgdat_autonuma_init(struct pglist_data *);
+
+#else /* CONFIG_AUTONUMA */
+
+#ifdef CONFIG_SPARSEMEM
+struct page_autonuma;
+#define PAGE_AUTONUMA_SIZE 0
+#define SECTION_PAGE_AUTONUMA_SIZE 0
+
+#define autonuma_impossible() true
+
+#endif
+
+static inline void pgdat_autonuma_init(struct pglist_data *pgdat) {}
+
+#endif /* CONFIG_AUTONUMA */
+
+#ifdef CONFIG_SPARSEMEM
+extern struct page_autonuma * __meminit __kmalloc_section_page_autonuma(int nid,
+ unsigned long nr_pages);
+extern void __kfree_section_page_autonuma(struct page_autonuma *page_autonuma,
+ unsigned long nr_pages);
+extern void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma **page_autonuma_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nodeid);
+#endif
+
+#endif /* _LINUX_PAGE_AUTONUMA_H */
diff --git a/init/main.c b/init/main.c
index b5cc0a7..070a377 100644
--- a/init/main.c
+++ b/init/main.c
@@ -68,6 +68,7 @@
#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
+#include <linux/page_autonuma.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -455,6 +456,7 @@ static void __init mm_init(void)
* bigger than MAX_ORDER unless SPARSEMEM.
*/
page_cgroup_init_flatmem();
+ page_autonuma_init_flatmem();
mem_init();
kmem_cache_init();
percpu_init_late();
diff --git a/mm/Makefile b/mm/Makefile
index 15900fd..a4d8354 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
-obj-$(CONFIG_AUTONUMA) += autonuma.o
+obj-$(CONFIG_AUTONUMA) += autonuma.o page_autonuma.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
diff --git a/mm/autonuma.c b/mm/autonuma.c
index f44272b..ec4d492 100644
--- a/mm/autonuma.c
+++ b/mm/autonuma.c
@@ -51,12 +51,6 @@ static struct knumad_scan {
.mm_head = LIST_HEAD_INIT(knumad_scan.mm_head),
};

-static inline bool autonuma_impossible(void)
-{
- return num_possible_nodes() <= 1 ||
- test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
-}
-
static inline void autonuma_migrate_lock(int nid)
{
spin_lock(&NODE_DATA(nid)->autonuma_lock);
@@ -82,54 +76,63 @@ void autonuma_migrate_split_huge_page(struct page *page,
struct page *page_tail)
{
int nid, last_nid;
+ struct page_autonuma *page_autonuma, *page_tail_autonuma;

- nid = page->autonuma_migrate_nid;
+ if (autonuma_impossible())
+ return;
+
+ page_autonuma = lookup_page_autonuma(page);
+ page_tail_autonuma = lookup_page_autonuma(page_tail);
+
+ nid = page_autonuma->autonuma_migrate_nid;
VM_BUG_ON(nid >= MAX_NUMNODES);
VM_BUG_ON(nid < -1);
- VM_BUG_ON(page_tail->autonuma_migrate_nid != -1);
+ VM_BUG_ON(page_tail_autonuma->autonuma_migrate_nid != -1);
if (nid >= 0) {
VM_BUG_ON(page_to_nid(page) != page_to_nid(page_tail));

compound_lock(page_tail);
autonuma_migrate_lock(nid);
- list_add_tail(&page_tail->autonuma_migrate_node,
- &page->autonuma_migrate_node);
+ list_add_tail(&page_tail_autonuma->autonuma_migrate_node,
+ &page_autonuma->autonuma_migrate_node);
autonuma_migrate_unlock(nid);

- page_tail->autonuma_migrate_nid = nid;
+ page_tail_autonuma->autonuma_migrate_nid = nid;
compound_unlock(page_tail);
}

- last_nid = ACCESS_ONCE(page->autonuma_last_nid);
+ last_nid = ACCESS_ONCE(page_autonuma->autonuma_last_nid);
if (last_nid >= 0)
- page_tail->autonuma_last_nid = last_nid;
+ page_tail_autonuma->autonuma_last_nid = last_nid;
}

-void __autonuma_migrate_page_remove(struct page *page)
+void __autonuma_migrate_page_remove(struct page *page,
+ struct page_autonuma *page_autonuma)
{
unsigned long flags;
int nid;

flags = compound_lock_irqsave(page);

- nid = page->autonuma_migrate_nid;
+ nid = page_autonuma->autonuma_migrate_nid;
VM_BUG_ON(nid >= MAX_NUMNODES);
VM_BUG_ON(nid < -1);
if (nid >= 0) {
int numpages = hpage_nr_pages(page);
autonuma_migrate_lock(nid);
- list_del(&page->autonuma_migrate_node);
+ list_del(&page_autonuma->autonuma_migrate_node);
NODE_DATA(nid)->autonuma_nr_migrate_pages -= numpages;
autonuma_migrate_unlock(nid);

- page->autonuma_migrate_nid = -1;
+ page_autonuma->autonuma_migrate_nid = -1;
}

compound_unlock_irqrestore(page, flags);
}

-static void __autonuma_migrate_page_add(struct page *page, int dst_nid,
- int page_nid)
+static void __autonuma_migrate_page_add(struct page *page,
+ struct page_autonuma *page_autonuma,
+ int dst_nid, int page_nid)
{
unsigned long flags;
int nid;
@@ -148,25 +151,25 @@ static void __autonuma_migrate_page_add(struct page *page, int dst_nid,
flags = compound_lock_irqsave(page);

numpages = hpage_nr_pages(page);
- nid = page->autonuma_migrate_nid;
+ nid = page_autonuma->autonuma_migrate_nid;
VM_BUG_ON(nid >= MAX_NUMNODES);
VM_BUG_ON(nid < -1);
if (nid >= 0) {
autonuma_migrate_lock(nid);
- list_del(&page->autonuma_migrate_node);
+ list_del(&page_autonuma->autonuma_migrate_node);
NODE_DATA(nid)->autonuma_nr_migrate_pages -= numpages;
autonuma_migrate_unlock(nid);
}

autonuma_migrate_lock(dst_nid);
- list_add(&page->autonuma_migrate_node,
+ list_add(&page_autonuma->autonuma_migrate_node,
&NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid]);
NODE_DATA(dst_nid)->autonuma_nr_migrate_pages += numpages;
nr_migrate_pages = NODE_DATA(dst_nid)->autonuma_nr_migrate_pages;

autonuma_migrate_unlock(dst_nid);

- page->autonuma_migrate_nid = dst_nid;
+ page_autonuma->autonuma_migrate_nid = dst_nid;

compound_unlock_irqrestore(page, flags);

@@ -182,9 +185,13 @@ static void __autonuma_migrate_page_add(struct page *page, int dst_nid,
static void autonuma_migrate_page_add(struct page *page, int dst_nid,
int page_nid)
{
- int migrate_nid = ACCESS_ONCE(page->autonuma_migrate_nid);
+ int migrate_nid;
+ struct page_autonuma *page_autonuma = lookup_page_autonuma(page);
+
+ migrate_nid = ACCESS_ONCE(page_autonuma->autonuma_migrate_nid);
if (migrate_nid != dst_nid)
- __autonuma_migrate_page_add(page, dst_nid, page_nid);
+ __autonuma_migrate_page_add(page, page_autonuma,
+ dst_nid, page_nid);
}

static bool balance_pgdat(struct pglist_data *pgdat,
@@ -255,23 +262,26 @@ static inline bool last_nid_set(struct task_struct *p,
struct page *page, int cpu_nid)
{
bool ret = true;
- int autonuma_last_nid = ACCESS_ONCE(page->autonuma_last_nid);
+ struct page_autonuma *page_autonuma = lookup_page_autonuma(page);
+ int autonuma_last_nid = ACCESS_ONCE(page_autonuma->autonuma_last_nid);
VM_BUG_ON(cpu_nid < 0);
VM_BUG_ON(cpu_nid >= MAX_NUMNODES);
if (autonuma_last_nid >= 0 && autonuma_last_nid != cpu_nid) {
- int migrate_nid = ACCESS_ONCE(page->autonuma_migrate_nid);
+ int migrate_nid;
+ migrate_nid = ACCESS_ONCE(page_autonuma->autonuma_migrate_nid);
if (migrate_nid >= 0 && migrate_nid != cpu_nid)
- __autonuma_migrate_page_remove(page);
+ __autonuma_migrate_page_remove(page, page_autonuma);
ret = false;
}
if (autonuma_last_nid != cpu_nid)
- ACCESS_ONCE(page->autonuma_last_nid) = cpu_nid;
+ ACCESS_ONCE(page_autonuma->autonuma_last_nid) = cpu_nid;
return ret;
}

static int __page_migrate_nid(struct page *page, int page_nid)
{
- int migrate_nid = ACCESS_ONCE(page->autonuma_migrate_nid);
+ struct page_autonuma *page_autonuma = lookup_page_autonuma(page);
+ int migrate_nid = ACCESS_ONCE(page_autonuma->autonuma_migrate_nid);
if (migrate_nid < 0)
migrate_nid = page_nid;
#if 0
@@ -810,6 +820,7 @@ static int isolate_migratepages(struct list_head *migratepages,
struct zone *zone;
struct page *page;
struct lruvec *lruvec;
+ struct page_autonuma *page_autonuma;

cond_resched();
VM_BUG_ON(numa_node_id() != pgdat->node_id);
@@ -833,16 +844,17 @@ static int isolate_migratepages(struct list_head *migratepages,
autonuma_migrate_unlock_irq(pgdat->node_id);
continue;
}
- page = list_entry(heads[nid].prev,
- struct page,
- autonuma_migrate_node);
+ page_autonuma = list_entry(heads[nid].prev,
+ struct page_autonuma,
+ autonuma_migrate_node);
+ page = page_autonuma->page;
if (unlikely(!get_page_unless_zero(page))) {
/*
* Is getting freed and will remove self from the
* autonuma list shortly, skip it for now.
*/
- list_del(&page->autonuma_migrate_node);
- list_add(&page->autonuma_migrate_node,
+ list_del(&page_autonuma->autonuma_migrate_node);
+ list_add(&page_autonuma->autonuma_migrate_node,
&heads[nid]);
autonuma_migrate_unlock_irq(pgdat->node_id);
autonuma_printk("autonuma migrate page is free\n");
@@ -851,7 +863,7 @@ static int isolate_migratepages(struct list_head *migratepages,
if (!PageLRU(page)) {
autonuma_migrate_unlock_irq(pgdat->node_id);
autonuma_printk("autonuma migrate page not in LRU\n");
- __autonuma_migrate_page_remove(page);
+ __autonuma_migrate_page_remove(page, page_autonuma);
put_page(page);
continue;
}
@@ -871,7 +883,7 @@ static int isolate_migratepages(struct list_head *migratepages,
}
}

- __autonuma_migrate_page_remove(page);
+ __autonuma_migrate_page_remove(page, page_autonuma);

zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
@@ -917,11 +929,16 @@ static struct page *alloc_migrate_dst_page(struct page *page,
{
int nid = (int) data;
struct page *newpage;
+ struct page_autonuma *page_autonuma, *newpage_autonuma;
newpage = alloc_pages_exact_node(nid,
GFP_HIGHUSER_MOVABLE | GFP_THISNODE,
0);
- if (newpage)
- newpage->autonuma_last_nid = page->autonuma_last_nid;
+ if (newpage) {
+ page_autonuma = lookup_page_autonuma(page);
+ newpage_autonuma = lookup_page_autonuma(newpage);
+ newpage_autonuma->autonuma_last_nid =
+ page_autonuma->autonuma_last_nid;
+ }
return newpage;
}

@@ -1345,7 +1362,8 @@ static int __init noautonuma_setup(char *str)
}
return 1;
}
-__setup("noautonuma", noautonuma_setup);
+/* early so sparse.c also can see it */
+early_param("noautonuma", noautonuma_setup);

static int __init autonuma_init(void)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bcaa8ac..c5e47bc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1831,6 +1831,13 @@ static bool __collapse_huge_page_copy(pte_t *pte, struct page *page,
{
pte_t *_pte;
bool mknuma = false;
+#ifdef CONFIG_AUTONUMA
+ struct page_autonuma *src_page_an, *page_an = NULL;
+
+ if (!autonuma_impossible())
+ page_an = lookup_page_autonuma(page);
+#endif
+
for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
pte_t pteval = *_pte;
struct page *src_page;
@@ -1839,17 +1846,18 @@ static bool __collapse_huge_page_copy(pte_t *pte, struct page *page,
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
} else {
-#ifdef CONFIG_AUTONUMA
- int autonuma_last_nid;
-#endif
src_page = pte_page(pteval);
#ifdef CONFIG_AUTONUMA
- /* pick the last one, better than nothing */
- autonuma_last_nid =
- ACCESS_ONCE(src_page->autonuma_last_nid);
- if (autonuma_last_nid >= 0)
- ACCESS_ONCE(page->autonuma_last_nid) =
- autonuma_last_nid;
+ if (!autonuma_impossible()) {
+ int autonuma_last_nid;
+ src_page_an = lookup_page_autonuma(src_page);
+ /* pick the last one, better than nothing */
+ autonuma_last_nid =
+ ACCESS_ONCE(src_page_an->autonuma_last_nid);
+ if (autonuma_last_nid >= 0)
+ ACCESS_ONCE(page_an->autonuma_last_nid) =
+ autonuma_last_nid;
+ }
#endif
copy_user_highpage(page, src_page, address, vma);
VM_BUG_ON(page_mapcount(src_page) != 1);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8c4ae8e..2d53a1f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,6 +60,7 @@
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
#include <linux/autonuma.h>
+#include <linux/page_autonuma.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -615,10 +616,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
- autonuma_migrate_page_remove(page);
-#ifdef CONFIG_AUTONUMA
- page->autonuma_last_nid = -1;
-#endif
+ autonuma_free_page(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -3729,10 +3727,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);

INIT_LIST_HEAD(&page->lru);
-#ifdef CONFIG_AUTONUMA
- page->autonuma_last_nid = -1;
- page->autonuma_migrate_nid = -1;
-#endif
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
@@ -4357,22 +4351,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
-#ifdef CONFIG_AUTONUMA
- int node_iter;
-#endif

pgdat_resize_init(pgdat);
-#ifdef CONFIG_AUTONUMA
- spin_lock_init(&pgdat->autonuma_lock);
- init_waitqueue_head(&pgdat->autonuma_knuma_migrated_wait);
- pgdat->autonuma_nr_migrate_pages = 0;
- for_each_node(node_iter)
- INIT_LIST_HEAD(&pgdat->autonuma_migrate_head[node_iter]);
-#endif
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
+ pgdat_autonuma_init(pgdat);

for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
diff --git a/mm/page_autonuma.c b/mm/page_autonuma.c
new file mode 100644
index 0000000..bace9b8
--- /dev/null
+++ b/mm/page_autonuma.c
@@ -0,0 +1,234 @@
+#include <linux/mm.h>
+#include <linux/memory.h>
+#include <linux/autonuma_flags.h>
+#include <linux/page_autonuma.h>
+#include <linux/bootmem.h>
+
+void __meminit page_autonuma_map_init(struct page *page,
+ struct page_autonuma *page_autonuma,
+ int nr_pages)
+{
+ struct page *end;
+ for (end = page + nr_pages; page < end; page++, page_autonuma++) {
+ page_autonuma->autonuma_last_nid = -1;
+ page_autonuma->autonuma_migrate_nid = -1;
+ page_autonuma->page = page;
+ }
+}
+
+static void __meminit __pgdat_autonuma_init(struct pglist_data *pgdat)
+{
+ int node_iter;
+
+ spin_lock_init(&pgdat->autonuma_lock);
+ init_waitqueue_head(&pgdat->autonuma_knuma_migrated_wait);
+ pgdat->autonuma_nr_migrate_pages = 0;
+ for_each_node(node_iter)
+ INIT_LIST_HEAD(&pgdat->autonuma_migrate_head[node_iter]);
+}
+
+#if !defined(CONFIG_SPARSEMEM)
+
+static unsigned long total_usage;
+
+void __meminit pgdat_autonuma_init(struct pglist_data *pgdat)
+{
+ __pgdat_autonuma_init(pgdat);
+ pgdat->node_page_autonuma = NULL;
+}
+
+struct page_autonuma *lookup_page_autonuma(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long offset;
+ struct page_autonuma *base;
+
+ base = NODE_DATA(page_to_nid(page))->node_page_autonuma;
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_autonuma arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (unlikely(!base))
+ return NULL;
+#endif
+ offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
+ return base + offset;
+}
+
+static int __init alloc_node_page_autonuma(int nid)
+{
+ struct page_autonuma *base;
+ unsigned long table_size;
+ unsigned long nr_pages;
+
+ nr_pages = NODE_DATA(nid)->node_spanned_pages;
+ if (!nr_pages)
+ return 0;
+
+ table_size = sizeof(struct page_autonuma) * nr_pages;
+
+ base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!base)
+ return -ENOMEM;
+ NODE_DATA(nid)->node_page_autonuma = base;
+ total_usage += table_size;
+ page_autonuma_map_init(NODE_DATA(nid)->node_mem_map, base, nr_pages);
+ return 0;
+}
+
+void __init page_autonuma_init_flatmem(void)
+{
+
+ int nid, fail;
+
+ if (autonuma_impossible())
+ return;
+
+ for_each_online_node(nid) {
+ fail = alloc_node_page_autonuma(nid);
+ if (fail)
+ goto fail;
+ }
+ printk(KERN_INFO "allocated %lu KBytes of page_autonuma\n",
+ total_usage >> 10);
+ printk(KERN_INFO "please try the 'noautonuma' option if you"
+ " don't want to allocate page_autonuma memory\n");
+ return;
+fail:
+ printk(KERN_CRIT "allocation of page_autonuma failed.\n");
+ printk(KERN_CRIT "please try the 'noautonuma' boot option\n");
+ panic("Out of memory");
+}
+
+#else /* CONFIG_SPARSEMEM */
+
+struct page_autonuma *lookup_page_autonuma(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct mem_section *section = __pfn_to_section(pfn);
+
+ /* if it's not a power of two we may be wasting memory */
+ BUILD_BUG_ON(SECTION_PAGE_AUTONUMA_SIZE &
+ (SECTION_PAGE_AUTONUMA_SIZE-1));
+
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_autonuma arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (!section->section_page_autonuma)
+ return NULL;
+#endif
+ return section->section_page_autonuma + pfn;
+}
+
+void __meminit pgdat_autonuma_init(struct pglist_data *pgdat)
+{
+ __pgdat_autonuma_init(pgdat);
+}
+
+struct page_autonuma * __meminit __kmalloc_section_page_autonuma(int nid,
+ unsigned long nr_pages)
+{
+ struct page_autonuma *ret;
+ struct page *page;
+ unsigned long memmap_size = PAGE_AUTONUMA_SIZE * nr_pages;
+
+ page = alloc_pages_node(nid, GFP_KERNEL|__GFP_NOWARN,
+ get_order(memmap_size));
+ if (page)
+ goto got_map_page_autonuma;
+
+ ret = vmalloc(memmap_size);
+ if (ret)
+ goto out;
+
+ return NULL;
+got_map_page_autonuma:
+ ret = (struct page_autonuma *)pfn_to_kaddr(page_to_pfn(page));
+out:
+ return ret;
+}
+
+void __kfree_section_page_autonuma(struct page_autonuma *page_autonuma,
+ unsigned long nr_pages)
+{
+ if (is_vmalloc_addr(page_autonuma))
+ vfree(page_autonuma);
+ else
+ free_pages((unsigned long)page_autonuma,
+ get_order(PAGE_AUTONUMA_SIZE * nr_pages));
+}
+
+static struct page_autonuma __init *sparse_page_autonuma_map_populate(unsigned long pnum,
+ int nid)
+{
+ struct page_autonuma *map;
+ unsigned long size;
+
+ map = alloc_remap(nid, SECTION_PAGE_AUTONUMA_SIZE);
+ if (map)
+ return map;
+
+ size = PAGE_ALIGN(SECTION_PAGE_AUTONUMA_SIZE);
+ map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ return map;
+}
+
+void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma **page_autonuma_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nodeid)
+{
+ void *map;
+ unsigned long pnum;
+ unsigned long size = SECTION_PAGE_AUTONUMA_SIZE;
+
+ map = alloc_remap(nodeid, size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ page_autonuma_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ size = PAGE_ALIGN(size);
+ map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ page_autonuma_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ /* fallback */
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ page_autonuma_map[pnum] = sparse_page_autonuma_map_populate(pnum, nodeid);
+ if (page_autonuma_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem page_autonuma map backing failed "
+ "some memory will not be available.\n", __func__);
+ }
+}
+
+#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf91..1eb301e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -9,6 +9,7 @@
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+#include <linux/page_autonuma.h>
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
@@ -242,7 +243,8 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn

static int __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
- unsigned long *pageblock_bitmap)
+ unsigned long *pageblock_bitmap,
+ struct page_autonuma *page_autonuma)
{
if (!present_section(ms))
return -EINVAL;
@@ -251,6 +253,14 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
SECTION_HAS_MEM_MAP;
ms->pageblock_flags = pageblock_bitmap;
+#ifdef CONFIG_AUTONUMA
+ if (page_autonuma) {
+ ms->section_page_autonuma = page_autonuma - section_nr_to_pfn(pnum);
+ page_autonuma_map_init(mem_map, page_autonuma, PAGES_PER_SECTION);
+ }
+#else
+ BUG_ON(page_autonuma);
+#endif

return 1;
}
@@ -484,6 +494,9 @@ void __init sparse_init(void)
int size2;
struct page **map_map;
#endif
+ struct page_autonuma **uninitialized_var(page_autonuma_map);
+ struct page_autonuma *page_autonuma;
+ int size3;

/*
* map is using big page (aka 2M in x86 64 bit)
@@ -578,6 +591,62 @@ void __init sparse_init(void)
map_count, nodeid_begin);
#endif

+ if (!autonuma_impossible()) {
+ unsigned long total_page_autonuma;
+ unsigned long page_autonuma_count;
+
+ size3 = sizeof(struct page_autonuma *) * NR_MEM_SECTIONS;
+ page_autonuma_map = alloc_bootmem(size3);
+ if (!page_autonuma_map)
+ panic("can not allocate page_autonuma_map\n");
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ total_page_autonuma = 0;
+ page_autonuma_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ page_autonuma_count++;
+ continue;
+ }
+ /* ok, we need to take care of from pnum_begin to pnum - 1 */
+ sparse_early_page_autonuma_alloc_node(page_autonuma_map,
+ pnum_begin,
+ NR_MEM_SECTIONS,
+ page_autonuma_count,
+ nodeid_begin);
+ total_page_autonuma += SECTION_PAGE_AUTONUMA_SIZE * page_autonuma_count;
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ page_autonuma_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_page_autonuma_alloc_node(page_autonuma_map, pnum_begin,
+ NR_MEM_SECTIONS,
+ page_autonuma_count, nodeid_begin);
+ total_page_autonuma += SECTION_PAGE_AUTONUMA_SIZE * page_autonuma_count;
+ printk("allocated %lu KBytes of page_autonuma\n",
+ total_page_autonuma >> 10);
+ printk(KERN_INFO "please try the 'noautonuma' option if you"
+ " don't want to allocate page_autonuma memory\n");
+ }
+
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
continue;
@@ -586,6 +655,14 @@ void __init sparse_init(void)
if (!usemap)
continue;

+ if (autonuma_impossible())
+ page_autonuma = NULL;
+ else {
+ page_autonuma = page_autonuma_map[pnum];
+ if (!page_autonuma)
+ continue;
+ }
+
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
map = map_map[pnum];
#else
@@ -595,11 +672,13 @@ void __init sparse_init(void)
continue;

sparse_init_one_section(__nr_to_section(pnum), pnum, map,
- usemap);
+ usemap, page_autonuma);
}

vmemmap_populate_print_last();

+ if (!autonuma_impossible())
+ free_bootmem(__pa(page_autonuma_map), size3);
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
free_bootmem(__pa(map_map), size2);
#endif
@@ -686,7 +765,8 @@ static void free_map_bootmem(struct page *page, unsigned long nr_pages)
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

-static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+static void free_section_usemap(struct page *memmap, unsigned long *usemap,
+ struct page_autonuma *page_autonuma)
{
struct page *usemap_page;
unsigned long nr_pages;
@@ -700,8 +780,14 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
*/
if (PageSlab(usemap_page)) {
kfree(usemap);
- if (memmap)
+ if (memmap) {
__kfree_section_memmap(memmap, PAGES_PER_SECTION);
+ if (!autonuma_impossible())
+ __kfree_section_page_autonuma(page_autonuma,
+ PAGES_PER_SECTION);
+ else
+ BUG_ON(page_autonuma);
+ }
return;
}

@@ -718,6 +804,13 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
>> PAGE_SHIFT;

free_map_bootmem(memmap_page, nr_pages);
+
+ if (!autonuma_impossible()) {
+ struct page *page_autonuma_page;
+ page_autonuma_page = virt_to_page(page_autonuma);
+ free_map_bootmem(page_autonuma_page, nr_pages);
+ } else
+ BUG_ON(page_autonuma);
}
}

@@ -733,6 +826,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
struct pglist_data *pgdat = zone->zone_pgdat;
struct mem_section *ms;
struct page *memmap;
+ struct page_autonuma *page_autonuma;
unsigned long *usemap;
unsigned long flags;
int ret;
@@ -752,6 +846,16 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
__kfree_section_memmap(memmap, nr_pages);
return -ENOMEM;
}
+ if (!autonuma_impossible()) {
+ page_autonuma = __kmalloc_section_page_autonuma(pgdat->node_id,
+ nr_pages);
+ if (!page_autonuma) {
+ kfree(usemap);
+ __kfree_section_memmap(memmap, nr_pages);
+ return -ENOMEM;
+ }
+ } else
+ page_autonuma = NULL;

pgdat_resize_lock(pgdat, &flags);

@@ -763,11 +867,16 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,

ms->section_mem_map |= SECTION_MARKED_PRESENT;

- ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
+ ret = sparse_init_one_section(ms, section_nr, memmap, usemap,
+ page_autonuma);

out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0) {
+ if (!autonuma_impossible())
+ __kfree_section_page_autonuma(page_autonuma, nr_pages);
+ else
+ BUG_ON(page_autonuma);
kfree(usemap);
__kfree_section_memmap(memmap, nr_pages);
}
@@ -778,6 +887,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
struct page *memmap = NULL;
unsigned long *usemap = NULL;
+ struct page_autonuma *page_autonuma = NULL;

if (ms->section_mem_map) {
usemap = ms->pageblock_flags;
@@ -785,8 +895,12 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
__section_nr(ms));
ms->section_mem_map = 0;
ms->pageblock_flags = NULL;
+
+#ifdef CONFIG_AUTONUMA
+ page_autonuma = ms->section_page_autonuma;
+#endif
}

- free_section_usemap(memmap, usemap);
+ free_section_usemap(memmap, usemap, page_autonuma);
}
#endif
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


konrad at darnok

Jun 29, 2012, 10:24 PM

Post #2 of 5 (65 views)
Permalink
Re: [PATCH 36/40] autonuma: page_autonuma [In reply to]

On Thu, Jun 28, 2012 at 02:56:16PM +0200, Andrea Arcangeli wrote:
> Move the AutoNUMA per page information from the "struct page" to a
> separate page_autonuma data structure allocated in the memsection
> (with sparsemem) or in the pgdat (with flatmem).
>
> This is done to avoid growing the size of the "struct page" and the
> page_autonuma data is only allocated if the kernel has been booted on
> real NUMA hardware (or if noautonuma is passed as parameter to the
> kernel).
>
> Signed-off-by: Andrea Arcangeli <aarcange [at] redhat>
> ---
> include/linux/autonuma.h | 18 +++-
> include/linux/autonuma_flags.h | 6 +
> include/linux/autonuma_types.h | 55 ++++++++++
> include/linux/mm_types.h | 26 -----
> include/linux/mmzone.h | 14 +++-
> include/linux/page_autonuma.h | 53 +++++++++
> init/main.c | 2 +
> mm/Makefile | 2 +-
> mm/autonuma.c | 98 ++++++++++-------
> mm/huge_memory.c | 26 +++--
> mm/page_alloc.c | 21 +---
> mm/page_autonuma.c | 234 ++++++++++++++++++++++++++++++++++++++++
> mm/sparse.c | 126 ++++++++++++++++++++-
> 13 files changed, 577 insertions(+), 104 deletions(-)
> create mode 100644 include/linux/page_autonuma.h
> create mode 100644 mm/page_autonuma.c
>
> diff --git a/include/linux/autonuma.h b/include/linux/autonuma.h
> index 85ca5eb..67af86a 100644
> --- a/include/linux/autonuma.h
> +++ b/include/linux/autonuma.h
> @@ -7,15 +7,26 @@
>
> extern void autonuma_enter(struct mm_struct *mm);
> extern void autonuma_exit(struct mm_struct *mm);
> -extern void __autonuma_migrate_page_remove(struct page *page);
> +extern void __autonuma_migrate_page_remove(struct page *,
> + struct page_autonuma *);
> extern void autonuma_migrate_split_huge_page(struct page *page,
> struct page *page_tail);
> extern void autonuma_setup_new_exec(struct task_struct *p);
> +extern struct page_autonuma *lookup_page_autonuma(struct page *page);
>
> static inline void autonuma_migrate_page_remove(struct page *page)
> {
> - if (ACCESS_ONCE(page->autonuma_migrate_nid) >= 0)
> - __autonuma_migrate_page_remove(page);
> + struct page_autonuma *page_autonuma = lookup_page_autonuma(page);
> + if (ACCESS_ONCE(page_autonuma->autonuma_migrate_nid) >= 0)
> + __autonuma_migrate_page_remove(page, page_autonuma);
> +}
> +
> +static inline void autonuma_free_page(struct page *page)
> +{
> + if (!autonuma_impossible()) {

I think you are better off using a different name.

Perhaps 'if (autonuma_on())'

> + autonuma_migrate_page_remove(page);
> + lookup_page_autonuma(page)->autonuma_last_nid = -1;
> + }
> }
>
> #define autonuma_printk(format, args...) \
> @@ -29,6 +40,7 @@ static inline void autonuma_migrate_page_remove(struct page *page) {}
> static inline void autonuma_migrate_split_huge_page(struct page *page,
> struct page *page_tail) {}
> static inline void autonuma_setup_new_exec(struct task_struct *p) {}
> +static inline void autonuma_free_page(struct page *page) {}
>
> #endif /* CONFIG_AUTONUMA */
>
> diff --git a/include/linux/autonuma_flags.h b/include/linux/autonuma_flags.h
> index 5e29a75..035d993 100644
> --- a/include/linux/autonuma_flags.h
> +++ b/include/linux/autonuma_flags.h
> @@ -15,6 +15,12 @@ enum autonuma_flag {
>
> extern unsigned long autonuma_flags;
>
> +static inline bool autonuma_impossible(void)
> +{
> + return num_possible_nodes() <= 1 ||
> + test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
> +}
> +
> static inline bool autonuma_enabled(void)
> {
> return !!test_bit(AUTONUMA_FLAG, &autonuma_flags);
> diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h
> index 9e697e3..1e860f6 100644
> --- a/include/linux/autonuma_types.h
> +++ b/include/linux/autonuma_types.h
> @@ -39,6 +39,61 @@ struct task_autonuma {
> unsigned long task_numa_fault[0];
> };
>
> +/*
> + * Per page (or per-pageblock) structure dynamically allocated only if
> + * autonuma is not impossible.

not impossible? So possible?

> + */
> +struct page_autonuma {
> + /*
> + * To modify autonuma_last_nid lockless the architecture,
> + * needs SMP atomic granularity < sizeof(long), not all archs
> + * have that, notably some ancient alpha (but none of those
> + * should run in NUMA systems). Archs without that requires
> + * autonuma_last_nid to be a long.
> + */
> +#if BITS_PER_LONG > 32
> + /*
> + * autonuma_migrate_nid is -1 if the page_autonuma structure
> + * is not linked into any
> + * pgdat->autonuma_migrate_head. Otherwise it means the
> + * page_autonuma structure is linked into the
> + * &NODE_DATA(autonuma_migrate_nid)->autonuma_migrate_head[page_nid].
> + * page_nid is the nid that the page (referenced by the
> + * page_autonuma structure) belongs to.
> + */
> + int autonuma_migrate_nid;
> + /*
> + * autonuma_last_nid records which is the NUMA nid that tried
> + * to access this page at the last NUMA hinting page fault.
> + * If it changed, AutoNUMA will not try to migrate the page to
> + * the nid where the thread is running on and to the contrary,
> + * it will make different threads trashing on the same pages,
> + * converge on the same NUMA node (if possible).
> + */
> + int autonuma_last_nid;
> +#else
> +#if MAX_NUMNODES >= 32768
> +#error "too many nodes"
> +#endif
> + short autonuma_migrate_nid;
> + short autonuma_last_nid;
> +#endif
> + /*
> + * This is the list node that links the page (referenced by
> + * the page_autonuma structure) in the
> + * &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid] lru.
> + */
> + struct list_head autonuma_migrate_node;
> +
> + /*
> + * To find the page starting from the autonuma_migrate_node we
> + * need a backlink.
> + *
> + * FIXME: drop it;
> + */
> + struct page *page;
> +};
> +
> extern int alloc_task_autonuma(struct task_struct *tsk,
> struct task_struct *orig,
> int node);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d1248cf..f0c6379 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -136,32 +136,6 @@ struct page {
> struct page *first_page; /* Compound tail pages */
> };
>
> -#ifdef CONFIG_AUTONUMA
> - /*
> - * FIXME: move to pgdat section along with the memcg and allocate
> - * at runtime only in presence of a numa system.
> - */
> - /*
> - * To modify autonuma_last_nid lockless the architecture,
> - * needs SMP atomic granularity < sizeof(long), not all archs
> - * have that, notably some ancient alpha (but none of those
> - * should run in NUMA systems). Archs without that requires
> - * autonuma_last_nid to be a long.
> - */
> -#if BITS_PER_LONG > 32
> - int autonuma_migrate_nid;
> - int autonuma_last_nid;
> -#else
> -#if MAX_NUMNODES >= 32768
> -#error "too many nodes"
> -#endif
> - /* FIXME: remember to check the updates are atomic */
> - short autonuma_migrate_nid;
> - short autonuma_last_nid;
> -#endif
> - struct list_head autonuma_migrate_node;
> -#endif
> -
> /*
> * On machines where all RAM is mapped into kernel address space,
> * we can simply calculate the virtual address. On machines with
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d53b26a..e66da74 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -698,10 +698,13 @@ typedef struct pglist_data {
> int kswapd_max_order;
> enum zone_type classzone_idx;
> #ifdef CONFIG_AUTONUMA
> - spinlock_t autonuma_lock;
> +#if !defined(CONFIG_SPARSEMEM)
> + struct page_autonuma *node_page_autonuma;
> +#endif
> struct list_head autonuma_migrate_head[MAX_NUMNODES];
> unsigned long autonuma_nr_migrate_pages;
> wait_queue_head_t autonuma_knuma_migrated_wait;
> + spinlock_t autonuma_lock;
> #endif
> } pg_data_t;
>
> @@ -1064,6 +1067,15 @@ struct mem_section {
> * section. (see memcontrol.h/page_cgroup.h about this.)
> */
> struct page_cgroup *page_cgroup;
> +#endif
> +#ifdef CONFIG_AUTONUMA
> + /*
> + * If !SPARSEMEM, pgdat doesn't have page_autonuma pointer. We use
> + * section.
> + */
> + struct page_autonuma *section_page_autonuma;
> +#endif
> +#if defined(CONFIG_CGROUP_MEM_RES_CTLR) ^ defined(CONFIG_AUTONUMA)
> unsigned long pad;
> #endif
> };
> diff --git a/include/linux/page_autonuma.h b/include/linux/page_autonuma.h
> new file mode 100644
> index 0000000..d748aa2
> --- /dev/null
> +++ b/include/linux/page_autonuma.h
> @@ -0,0 +1,53 @@
> +#ifndef _LINUX_PAGE_AUTONUMA_H
> +#define _LINUX_PAGE_AUTONUMA_H
> +
> +#if defined(CONFIG_AUTONUMA) && !defined(CONFIG_SPARSEMEM)
> +extern void __init page_autonuma_init_flatmem(void);
> +#else
> +static inline void __init page_autonuma_init_flatmem(void) {}
> +#endif
> +
> +#ifdef CONFIG_AUTONUMA
> +
> +#include <linux/autonuma_flags.h>
> +
> +extern void __meminit page_autonuma_map_init(struct page *page,
> + struct page_autonuma *page_autonuma,
> + int nr_pages);
> +
> +#ifdef CONFIG_SPARSEMEM
> +#define PAGE_AUTONUMA_SIZE (sizeof(struct page_autonuma))
> +#define SECTION_PAGE_AUTONUMA_SIZE (PAGE_AUTONUMA_SIZE * \
> + PAGES_PER_SECTION)
> +#endif
> +
> +extern void __meminit pgdat_autonuma_init(struct pglist_data *);
> +
> +#else /* CONFIG_AUTONUMA */
> +
> +#ifdef CONFIG_SPARSEMEM
> +struct page_autonuma;
> +#define PAGE_AUTONUMA_SIZE 0
> +#define SECTION_PAGE_AUTONUMA_SIZE 0
> +
> +#define autonuma_impossible() true
> +
> +#endif
> +
> +static inline void pgdat_autonuma_init(struct pglist_data *pgdat) {}
> +
> +#endif /* CONFIG_AUTONUMA */
> +
> +#ifdef CONFIG_SPARSEMEM
> +extern struct page_autonuma * __meminit __kmalloc_section_page_autonuma(int nid,
> + unsigned long nr_pages);
> +extern void __kfree_section_page_autonuma(struct page_autonuma *page_autonuma,
> + unsigned long nr_pages);
> +extern void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma **page_autonuma_map,
> + unsigned long pnum_begin,
> + unsigned long pnum_end,
> + unsigned long map_count,
> + int nodeid);
> +#endif
> +
> +#endif /* _LINUX_PAGE_AUTONUMA_H */
> diff --git a/init/main.c b/init/main.c
> index b5cc0a7..070a377 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -68,6 +68,7 @@
> #include <linux/shmem_fs.h>
> #include <linux/slab.h>
> #include <linux/perf_event.h>
> +#include <linux/page_autonuma.h>
>
> #include <asm/io.h>
> #include <asm/bugs.h>
> @@ -455,6 +456,7 @@ static void __init mm_init(void)
> * bigger than MAX_ORDER unless SPARSEMEM.
> */
> page_cgroup_init_flatmem();
> + page_autonuma_init_flatmem();
> mem_init();
> kmem_cache_init();
> percpu_init_late();
> diff --git a/mm/Makefile b/mm/Makefile
> index 15900fd..a4d8354 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -33,7 +33,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
> obj-$(CONFIG_HAS_DMA) += dmapool.o
> obj-$(CONFIG_HUGETLBFS) += hugetlb.o
> obj-$(CONFIG_NUMA) += mempolicy.o
> -obj-$(CONFIG_AUTONUMA) += autonuma.o
> +obj-$(CONFIG_AUTONUMA) += autonuma.o page_autonuma.o
> obj-$(CONFIG_SPARSEMEM) += sparse.o
> obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> obj-$(CONFIG_SLOB) += slob.o
> diff --git a/mm/autonuma.c b/mm/autonuma.c
> index f44272b..ec4d492 100644
> --- a/mm/autonuma.c
> +++ b/mm/autonuma.c
> @@ -51,12 +51,6 @@ static struct knumad_scan {
> .mm_head = LIST_HEAD_INIT(knumad_scan.mm_head),
> };
>
> -static inline bool autonuma_impossible(void)
> -{
> - return num_possible_nodes() <= 1 ||
> - test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
> -}
> -
> static inline void autonuma_migrate_lock(int nid)
> {
> spin_lock(&NODE_DATA(nid)->autonuma_lock);
> @@ -82,54 +76,63 @@ void autonuma_migrate_split_huge_page(struct page *page,
> struct page *page_tail)
> {
> int nid, last_nid;
> + struct page_autonuma *page_autonuma, *page_tail_autonuma;
>
> - nid = page->autonuma_migrate_nid;
> + if (autonuma_impossible())

Would it be better to just call it 'autonuma_off()'?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


riel at redhat

Jul 1, 2012, 11:37 PM

Post #3 of 5 (64 views)
Permalink
Re: [PATCH 36/40] autonuma: page_autonuma [In reply to]

On 06/28/2012 08:56 AM, Andrea Arcangeli wrote:

> +++ b/include/linux/autonuma_flags.h
> @@ -15,6 +15,12 @@ enum autonuma_flag {
>
> extern unsigned long autonuma_flags;
>
> +static inline bool autonuma_impossible(void)
> +{
> + return num_possible_nodes()<= 1 ||
> + test_bit(AUTONUMA_IMPOSSIBLE_FLAG,&autonuma_flags);
> +}

When you fix the name of this function, could you also put it
in the right spot, in the patch where it is originally introduced?

Moving stuff around for no reason in a patch series is not very
reviewer friendly.

> diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h
> index 9e697e3..1e860f6 100644
> --- a/include/linux/autonuma_types.h
> +++ b/include/linux/autonuma_types.h
> @@ -39,6 +39,61 @@ struct task_autonuma {
> unsigned long task_numa_fault[0];
> };
>
> +/*
> + * Per page (or per-pageblock) structure dynamically allocated only if
> + * autonuma is not impossible.
> + */

Double negatives are not easy to read.

s/not impossible/enabled/

> +struct page_autonuma {
> + /*
> + * To modify autonuma_last_nid lockless the architecture,
> + * needs SMP atomic granularity< sizeof(long), not all archs
> + * have that, notably some ancient alpha (but none of those
> + * should run in NUMA systems). Archs without that requires
> + * autonuma_last_nid to be a long.
> + */

If only all your data structures were documented like this.

I guess that will give you something to do, when addressing
the comments on the other patches :)

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index bcaa8ac..c5e47bc 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c

> #ifdef CONFIG_AUTONUMA
> - /* pick the last one, better than nothing */
> - autonuma_last_nid =
> - ACCESS_ONCE(src_page->autonuma_last_nid);
> - if (autonuma_last_nid>= 0)
> - ACCESS_ONCE(page->autonuma_last_nid) =
> - autonuma_last_nid;
> + if (!autonuma_impossible()) {
> + int autonuma_last_nid;
> + src_page_an = lookup_page_autonuma(src_page);
> + /* pick the last one, better than nothing */
> + autonuma_last_nid =
> + ACCESS_ONCE(src_page_an->autonuma_last_nid);
> + if (autonuma_last_nid>= 0)
> + ACCESS_ONCE(page_an->autonuma_last_nid) =
> + autonuma_last_nid;
> + }

Remembering the last page the loop went through, and then
looking up the autonuma struct after you exit the loop could
be better.

> diff --git a/mm/page_autonuma.c b/mm/page_autonuma.c
> new file mode 100644
> index 0000000..bace9b8
> --- /dev/null
> +++ b/mm/page_autonuma.c
> @@ -0,0 +1,234 @@
> +#include<linux/mm.h>
> +#include<linux/memory.h>
> +#include<linux/autonuma_flags.h>

This should be <linux/autonuma.h>

There is absolutely no good reason why that one-liner change
is a separate patch.

> +struct page_autonuma *lookup_page_autonuma(struct page *page)
> +{

> + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
> + return base + offset;
> +}

Doing this and the reverse allows you to drop the page pointer
in struct autonuma.

It would make sense to do that either in this patch, or in a
new one, but either way pulling it forward out of patch 40
would make the series easier to review for the next round.

> +fail:
> + printk(KERN_CRIT "allocation of page_autonuma failed.\n");
> + printk(KERN_CRIT "please try the 'noautonuma' boot option\n");
> + panic("Out of memory");
> +}

The system can run just fine without autonuma.

Would it make sense to simply disable autonuma at this point,
but to try continue running?

> @@ -700,8 +780,14 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
> */
> if (PageSlab(usemap_page)) {
> kfree(usemap);
> - if (memmap)
> + if (memmap) {
> __kfree_section_memmap(memmap, PAGES_PER_SECTION);
> + if (!autonuma_impossible())
> + __kfree_section_page_autonuma(page_autonuma,
> + PAGES_PER_SECTION);
> + else
> + BUG_ON(page_autonuma);

VM_BUG_ON ?

> + if (!autonuma_impossible()) {
> + struct page *page_autonuma_page;
> + page_autonuma_page = virt_to_page(page_autonuma);
> + free_map_bootmem(page_autonuma_page, nr_pages);
> + } else
> + BUG_ON(page_autonuma);

ditto

> pgdat_resize_unlock(pgdat,&flags);
> if (ret<= 0) {
> + if (!autonuma_impossible())
> + __kfree_section_page_autonuma(page_autonuma, nr_pages);
> + else
> + BUG_ON(page_autonuma);

VM_BUG_ON ?

--
All rights reversed
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


aarcange at redhat

Jul 12, 2012, 12:43 PM

Post #4 of 5 (40 views)
Permalink
Re: [PATCH 36/40] autonuma: page_autonuma [In reply to]

On Sat, Jun 30, 2012 at 01:24:05AM -0400, Konrad Rzeszutek Wilk wrote:
> I think you are better using a different name.
>
> Perhaps 'if (autonuma_on())'

I changed it to AUTONUMA_POSSIBLE_FLAG/autonuma_possible() and
optimized the implementation to a single test_bit on the read mostly
flag variable.

"possible" is the term all NUMA code in the kernel already uses with
almost the same meaning (modulo noautonuma parameter) as
num_possible_nodes() etc...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


aarcange at redhat

Jul 12, 2012, 12:58 PM

Post #5 of 5 (40 views)
Permalink
Re: [PATCH 36/40] autonuma: page_autonuma [In reply to]

On Mon, Jul 02, 2012 at 02:37:10AM -0400, Rik van Riel wrote:
> > +fail:
> > + printk(KERN_CRIT "allocation of page_autonuma failed.\n");
> > + printk(KERN_CRIT "please try the 'noautonuma' boot option\n");
> > + panic("Out of memory");
> > +}
>
> The system can run just fine without autonuma.
>
> Would it make sense to simply disable autonuma at this point,
> but to try continue running?

BTW, the same would apply to mm/page_cgroup.c, but I think the idea
here is that something serious went wrong. A workaround with the
'noautonuma' boot option is enough.

>
> > @@ -700,8 +780,14 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
> > */
> > if (PageSlab(usemap_page)) {
> > kfree(usemap);
> > - if (memmap)
> > + if (memmap) {
> > __kfree_section_memmap(memmap, PAGES_PER_SECTION);
> > + if (!autonuma_impossible())
> > + __kfree_section_page_autonuma(page_autonuma,
> > + PAGES_PER_SECTION);
> > + else
> > + BUG_ON(page_autonuma);
>
> VM_BUG_ON ?
>
> > + if (!autonuma_impossible()) {
> > + struct page *page_autonuma_page;
> > + page_autonuma_page = virt_to_page(page_autonuma);
> > + free_map_bootmem(page_autonuma_page, nr_pages);
> > + } else
> > + BUG_ON(page_autonuma);
>
> ditto
>
> > pgdat_resize_unlock(pgdat,&flags);
> > if (ret<= 0) {
> > + if (!autonuma_impossible())
> > + __kfree_section_page_autonuma(page_autonuma, nr_pages);
> > + else
> > + BUG_ON(page_autonuma);
>
> VM_BUG_ON ?

These only run at the very early boot stage, so performance is irrelevant
and it's safer to keep them on.

The rest was corrected.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.