Thread: [PATCH] [0/18] GB pages hugetlb support

  1. Re: [PATCH] [13/18] Add support to allocate hugepages of different size with hugepages=...

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > Signed-off-by: Andi Kleen
    >
    > ---
    > include/linux/hugetlb.h | 1 +
    > mm/hugetlb.c | 23 ++++++++++++++++++-----
    > 2 files changed, 19 insertions(+), 5 deletions(-)
    >
    > Index: linux/mm/hugetlb.c
    > ===================================================================
    > --- linux.orig/mm/hugetlb.c
    > +++ linux/mm/hugetlb.c
    > @@ -552,19 +552,23 @@ static int __init hugetlb_init_hstate(st
    > {
    > unsigned long i;
    >
    > - for (i = 0; i < MAX_NUMNODES; ++i)
    > - INIT_LIST_HEAD(&h->hugepage_freelists[i]);
    > + /* Don't reinitialize lists if they have been already init'ed */
    > + if (!h->hugepage_freelists[0].next) {
    > + for (i = 0; i < MAX_NUMNODES; ++i)
    > + INIT_LIST_HEAD(&h->hugepage_freelists[i]);
    >
    > - h->hugetlb_next_nid = first_node(node_online_map);
    > + h->hugetlb_next_nid = first_node(node_online_map);
    > + }



    hmm, it's not very clear to me how hugetlb_init_hstate() would get
    called twice for the same hstate. Should it be VM_BUG_ON() if a hstate
    gets initialised twice instead?
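
    Something along these lines, perhaps (untested sketch reusing the lines
    from the hunk above):

        /* sketch: trap double initialisation instead of skipping it */
        VM_BUG_ON(h->hugepage_freelists[0].next);

        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&h->hugepage_freelists[i]);

        h->hugetlb_next_nid = first_node(node_online_map);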

    >
    > - for (i = 0; i < max_huge_pages[h - hstates]; ++i) {
    > + while (h->parsed_hugepages < max_huge_pages[h - hstates]) {
    > if (h->order > MAX_ORDER) {
    > if (!alloc_bm_huge_page(h))
    > break;
    > } else if (!alloc_fresh_huge_page(h))
    > break;
    > + h->parsed_hugepages++;
    > }
    > - max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i;
    > + max_huge_pages[h - hstates] = h->parsed_hugepages;
    >
    > printk(KERN_INFO "Total HugeTLB memory allocated, %ld %dMB pages\n",
    > h->free_huge_pages,
    > @@ -602,6 +606,15 @@ static int __init hugetlb_setup(char *s)
    > unsigned long *mhp = &max_huge_pages[parsed_hstate - hstates];
    > if (sscanf(s, "%lu", mhp) <= 0)
    > *mhp = 0;
    > + /*
    > + * Global state is always initialized later in hugetlb_init.
    > + * But we need to allocate > MAX_ORDER hstates here early to still
    > + * use the bootmem allocator.
    > + * If you add additional hstates <= MAX_ORDER you'll need
    > + * to fix that.
    > + */
    > + if (parsed_hstate != &global_hstate)
    > + hugetlb_init_hstate(parsed_hstate);
    > return 1;
    > }
    > __setup("hugepages=", hugetlb_setup);
    > Index: linux/include/linux/hugetlb.h
    > ===================================================================
    > --- linux.orig/include/linux/hugetlb.h
    > +++ linux/include/linux/hugetlb.h
    > @@ -212,6 +212,7 @@ struct hstate {
    > unsigned int nr_huge_pages_node[MAX_NUMNODES];
    > unsigned int free_huge_pages_node[MAX_NUMNODES];
    > unsigned int surplus_huge_pages_node[MAX_NUMNODES];
    > + unsigned long parsed_hugepages;
    > };
    >
    > void __init huge_add_hstate(unsigned order);
    >


    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  2. Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
    > not practical to enlarge MAX_ORDER to 1GB.
    >
    > Instead the 1GB pages are only allocated at boot using the bootmem
    > allocator using the hugepages=... option.
    >
    > These 1G bootmem pages are never freed. In theory it would be possible
    > to implement that with some complications, but since it would be a one-way
    > street (> MAX_ORDER pages cannot be allocated later) I decided not to currently.
    >
    > The > MAX_ORDER code is not ifdef'ed per architecture. It is not very big
    > and the ifdef ugliness did not seem worth it.
    >
    > Known problems: /proc/meminfo and "free" do not display the memory
    > allocated for gb pages in "Total". This is a little confusing for the
    > user.
    >
    > Signed-off-by: Andi Kleen
    >
    > ---
    > mm/hugetlb.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
    > 1 file changed, 62 insertions(+), 2 deletions(-)
    >
    > Index: linux/mm/hugetlb.c
    > ===================================================================
    > --- linux.orig/mm/hugetlb.c
    > +++ linux/mm/hugetlb.c
    > @@ -14,6 +14,7 @@
    > #include
    > #include
    > #include
    > +#include <linux/bootmem.h>
    >
    > #include
    > #include
    > @@ -153,7 +154,7 @@ static void free_huge_page(struct page *
    > INIT_LIST_HEAD(&page->lru);
    >
    > spin_lock(&hugetlb_lock);
    > - if (h->surplus_huge_pages_node[nid]) {
    > + if (h->surplus_huge_pages_node[nid] && h->order <= MAX_ORDER) {
    > update_and_free_page(h, page);
    > h->surplus_huge_pages--;
    > h->surplus_huge_pages_node[nid]--;
    > @@ -215,6 +216,9 @@ static struct page *alloc_fresh_huge_pag
    > {
    > struct page *page;
    >
    > + if (h->order > MAX_ORDER)
    > + return NULL;
    > +


    Should this print out a KERN_INFO message to the effect that pages of
    that size must be reserved at boot-time?
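
    Something like this at the early return, perhaps (sketch only; the
    message wording is invented):

        if (h->order > MAX_ORDER) {
                printk(KERN_INFO "hugetlb: %lu byte pages can only be "
                        "reserved at boot time\n", huge_page_size(h));
                return NULL;
        }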

    > page = alloc_pages_node(nid,
    > htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
    > huge_page_order(h));
    > @@ -271,6 +275,9 @@ static struct page *alloc_buddy_huge_pag
    > struct page *page;
    > unsigned int nid;
    >
    > + if (h->order > MAX_ORDER)
    > + return NULL;
    > +
    > /*
    > * Assume we will successfully allocate the surplus page to
    > * prevent racing processes from causing the surplus to exceed
    > @@ -422,6 +429,10 @@ return_unused_surplus_pages(struct hstat
    > /* Uncommit the reservation */
    > h->resv_huge_pages -= unused_resv_pages;
    >
    > + /* Cannot return gigantic pages currently */
    > + if (h->order > MAX_ORDER)
    > + return;
    > +
    > nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
    >
    > while (nr_pages) {
    > @@ -499,6 +510,44 @@ static struct page *alloc_huge_page(stru
    > return page;
    > }
    >
    > +static __initdata LIST_HEAD(huge_boot_pages);
    > +
    > +struct huge_bm_page {
    > + struct list_head list;
    > + struct hstate *hstate;
    > +};
    > +
    > +static int __init alloc_bm_huge_page(struct hstate *h)
    > +{
    > + struct huge_bm_page *m;
    > + m = __alloc_bootmem_node_nopanic(NODE_DATA(h->hugetlb_next_nid),
    > + huge_page_size(h), huge_page_size(h),
    > + 0);
    > + if (!m)
    > + return 0;
    > + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
    > + /* Put them into a private list first because mem_map is not up yet */
    > + list_add(&m->list, &huge_boot_pages);
    > + m->hstate = h;
    > + huge_next_node(h);
    > + return 1;
    > +}
    > +
    > +/* Put bootmem huge pages into the standard lists after mem_map is up */
    > +static int __init huge_init_bm(void)
    > +{
    > + struct huge_bm_page *m;
    > + list_for_each_entry (m, &huge_boot_pages, list) {
    > + struct page *page = virt_to_page(m);
    > + struct hstate *h = m->hstate;
    > + __ClearPageReserved(page);
    > + prep_compound_page(page, h->order);
    > + huge_new_page(h, page);
    > + }
    > + return 0;
    > +}
    > +__initcall(huge_init_bm);
    > +
    > static int __init hugetlb_init_hstate(struct hstate *h)
    > {
    > unsigned long i;
    > @@ -509,7 +558,10 @@ static int __init hugetlb_init_hstate(st
    > h->hugetlb_next_nid = first_node(node_online_map);
    >
    > for (i = 0; i < max_huge_pages[h - hstates]; ++i) {
    > - if (!alloc_fresh_huge_page(h))
    > + if (h->order > MAX_ORDER) {
    > + if (!alloc_bm_huge_page(h))
    > + break;
    > + } else if (!alloc_fresh_huge_page(h))
    > break;
    > }
    > max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i;
    > @@ -581,6 +633,9 @@ static void do_try_to_free_low(struct hs
    > {
    > int i;
    >
    > + if (h->order > MAX_ORDER)
    > + return;
    > +
    > for (i = 0; i < MAX_NUMNODES; ++i) {
    > struct page *page, *next;
    > struct list_head *freel = &h->hugepage_freelists[i];
    > @@ -618,6 +673,11 @@ set_max_huge_pages(struct hstate *h, uns
    >
    > *err = 0;
    >
    > + if (h->order > MAX_ORDER) {
    > + *err = -EINVAL;
    > + return max_huge_pages[h - hstates];
    > + }
    > +


    Ah, scratch the comment on an earlier patch where I said I cannot see
    where err ever gets updated in set_max_huge_pages().

    > /*
    > * Increase the pool size
    > * First take pages out of surplus state. Then make up the
    >


    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  3. Re: [PATCH] [7/18] Abstract out the NUMA node round robin code into a separate function

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > Need this as a separate function for a future patch.
    >
    > No behaviour change.
    >
    > Signed-off-by: Andi Kleen


    Maybe if you moved this beside patch 1, they could both be tested in
    isolation as a fairly reasonable cleanup that does not alter
    functionality? Not a big deal.

    >
    > ---
    > mm/hugetlb.c | 37 ++++++++++++++++++++++---------------
    > 1 file changed, 22 insertions(+), 15 deletions(-)
    >
    > Index: linux/mm/hugetlb.c
    > ===================================================================
    > --- linux.orig/mm/hugetlb.c
    > +++ linux/mm/hugetlb.c
    > @@ -219,6 +219,27 @@ static struct page *alloc_fresh_huge_pag
    > return page;
    > }
    >
    > +/*
    > + * Use a helper variable to find the next node and then
    > + * copy it back to hugetlb_next_nid afterwards:
    > + * otherwise there's a window in which a racer might
    > + * pass invalid nid MAX_NUMNODES to alloc_pages_node.
    > + * But we don't need to use a spin_lock here: it really
    > + * doesn't matter if occasionally a racer chooses the
    > + * same nid as we do. Move nid forward in the mask even
    > + * if we just successfully allocated a hugepage so that
    > + * the next caller gets hugepages on the next node.
    > + */
    > +static int huge_next_node(struct hstate *h)
    > +{
    > + int next_nid;
    > + next_nid = next_node(h->hugetlb_next_nid, node_online_map);
    > + if (next_nid == MAX_NUMNODES)
    > + next_nid = first_node(node_online_map);
    > + h->hugetlb_next_nid = next_nid;
    > + return next_nid;
    > +}
    > +
    > static int alloc_fresh_huge_page(struct hstate *h)
    > {
    > struct page *page;
    > @@ -232,21 +253,7 @@ static int alloc_fresh_huge_page(struct
    > page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
    > if (page)
    > ret = 1;
    > - /*
    > - * Use a helper variable to find the next node and then
    > - * copy it back to hugetlb_next_nid afterwards:
    > - * otherwise there's a window in which a racer might
    > - * pass invalid nid MAX_NUMNODES to alloc_pages_node.
    > - * But we don't need to use a spin_lock here: it really
    > - * doesn't matter if occasionally a racer chooses the
    > - * same nid as we do. Move nid forward in the mask even
    > - * if we just successfully allocated a hugepage so that
    > - * the next caller gets hugepages on the next node.
    > - */
    > - next_nid = next_node(h->hugetlb_next_nid, node_online_map);
    > - if (next_nid == MAX_NUMNODES)
    > - next_nid = first_node(node_online_map);
    > - h->hugetlb_next_nid = next_nid;
    > + next_nid = huge_next_node(h);


    hmm, I'm not seeing where next_nid gets declared locally here as it
    should have been removed in an earlier patch. Maybe it's reintroduced
    later but if you do reshuffle the patchset so that the cleanups can be
    merged on their own, it'll show up in a compile test.

    > } while (!page && h->hugetlb_next_nid != start_nid);
    >
    > return ret;
    >


    Other than the possible gotcha with next_nid declared locally, the move
    seems fine.

    Acked-by: Mel Gorman

    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  4. Re: [PATCH] [11/18] Fix alignment bug in bootmem allocator

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > Without this fix bootmem can return unaligned addresses when the start of a
    > node is not aligned to the align value. Needed for reliably allocating
    > gigabyte pages.
    > Signed-off-by: Andi Kleen


    Seems like something that should be fixed anyway independently of your
    patchset. If moved to the start of the set, it can be treated in batch with
    the cleanups as well.

    >
    > ---
    > mm/bootmem.c | 4 +++-
    > 1 file changed, 3 insertions(+), 1 deletion(-)
    >
    > Index: linux/mm/bootmem.c
    > ===================================================================
    > --- linux.orig/mm/bootmem.c
    > +++ linux/mm/bootmem.c
    > @@ -197,6 +197,7 @@ __alloc_bootmem_core(struct bootmem_data
    > {
    > unsigned long offset, remaining_size, areasize, preferred;
    > unsigned long i, start = 0, incr, eidx, end_pfn;
    > + unsigned long pfn;
    > void *ret;
    >
    > if (!size) {
    > @@ -239,12 +240,13 @@ __alloc_bootmem_core(struct bootmem_data
    > preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
    > areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
    > incr = align >> PAGE_SHIFT ? : 1;
    > + pfn = PFN_DOWN(bdata->node_boot_start);
    >


    hmm, preferred has already been aligned above and it appears that "offset"
    was meant to handle the situation you are dealing with here. Is the caller
    passing in "goal" (to avoid DMA32 for example) and messing up how "offset"
    is calculated?
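
    To make the failure mode concrete, a standalone sketch with hypothetical
    numbers (plain userspace C, not kernel code):

        #include <stdio.h>

        #define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

        int main(void)
        {
                unsigned long pfn = 0x88000;  /* node_boot_start >> PAGE_SHIFT, not 1GB aligned */
                unsigned long incr = 0x40000; /* 1GB in 4KB pages */
                unsigned long i = 0;          /* first free bit found in the node bitmap */

                /* old code aligns the node-relative index only */
                unsigned long old = ALIGN(i, incr);
                /* fixed code aligns the absolute pfn, then converts back */
                unsigned long fix = ALIGN(pfn + i, incr) - pfn;

                printf("old: pfn %#lx (not 1GB aligned)\n", pfn + old);
                printf("fix: pfn %#lx (1GB aligned)\n", pfn + fix);
                return 0;
        }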

    > restart_scan:
    > for (i = preferred; i < eidx; i += incr) {
    > unsigned long j;
    > i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
    > - i = ALIGN(i, incr);
    > + i = ALIGN(pfn + i, incr) - pfn;
    > if (i >= eidx)
    > break;
    > if (test_bit(i, bdata->node_bootmem_map))
    >


    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  5. Re: [PATCH] [5/18] Expand the hugetlbfs sysctls to handle arrays for all hstates

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > - I didn't bother with hugetlb_shm_group and treat_as_movable,
    > these are still single global.


    I cannot imagine why either of those would be per-pool anyway.
    Potentially shm_group could become a per-mount value which is both
    outside the scope of this patchset and not per-pool so unsuitable for
    hstate.

    > - Also improve error propagation for the sysctl handlers a bit
    >
    >
    > Signed-off-by: Andi Kleen
    >
    > ---
    > include/linux/hugetlb.h | 5 +++--
    > kernel/sysctl.c | 2 +-
    > mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++------------
    > 3 files changed, 35 insertions(+), 15 deletions(-)
    >
    > Index: linux/include/linux/hugetlb.h
    > ===================================================================
    > --- linux.orig/include/linux/hugetlb.h
    > +++ linux/include/linux/hugetlb.h
    > @@ -32,8 +32,6 @@ int hugetlb_fault(struct mm_struct *mm,
    > int hugetlb_reserve_pages(struct inode *inode, long from, long to);
    > void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
    >
    > -extern unsigned long max_huge_pages;
    > -extern unsigned long sysctl_overcommit_huge_pages;
    > extern unsigned long hugepages_treat_as_movable;
    > extern const unsigned long hugetlb_zero, hugetlb_infinity;
    > extern int sysctl_hugetlb_shm_group;
    > @@ -258,6 +256,9 @@ static inline unsigned huge_page_shift(s
    > return h->order + PAGE_SHIFT;
    > }
    >
    > +extern unsigned long max_huge_pages[HUGE_MAX_HSTATE];
    > +extern unsigned long sysctl_overcommit_huge_pages[HUGE_MAX_HSTATE];


    Any particular reason for moving them?

    Also, offhand it's not super-clear why max_huge_pages is not part of
    hstate as we only expect one hstate per pagesize anyway.

    > +
    > #else
    > struct hstate {};
    > #define hstate_file(f) NULL
    > Index: linux/kernel/sysctl.c
    > ===================================================================
    > --- linux.orig/kernel/sysctl.c
    > +++ linux/kernel/sysctl.c
    > @@ -935,7 +935,7 @@ static struct ctl_table vm_table[] = {
    > {
    > .procname = "nr_hugepages",
    > .data = &max_huge_pages,
    > - .maxlen = sizeof(unsigned long),
    > + .maxlen = sizeof(max_huge_pages),
    > .mode = 0644,
    > .proc_handler = &hugetlb_sysctl_handler,
    > .extra1 = (void *)&hugetlb_zero,
    > Index: linux/mm/hugetlb.c
    > ===================================================================
    > --- linux.orig/mm/hugetlb.c
    > +++ linux/mm/hugetlb.c
    > @@ -22,8 +22,8 @@
    > #include "internal.h"
    >
    > const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
    > -unsigned long max_huge_pages;
    > -unsigned long sysctl_overcommit_huge_pages;
    > +unsigned long max_huge_pages[HUGE_MAX_HSTATE];
    > +unsigned long sysctl_overcommit_huge_pages[HUGE_MAX_HSTATE];
    > static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
    > unsigned long hugepages_treat_as_movable;
    >
    > @@ -496,11 +496,11 @@ static int __init hugetlb_init_hstate(st
    >
    > h->hugetlb_next_nid = first_node(node_online_map);
    >
    > - for (i = 0; i < max_huge_pages; ++i) {
    > + for (i = 0; i < max_huge_pages[h - hstates]; ++i) {
    > if (!alloc_fresh_huge_page(h))
    > break;
    > }
    > - max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
    > + max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i;
    >


    hmm ok, it looks a little weird to be working out h - hstates multiple times
    in a loop when it is invariant but functionally, it's fine.
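
    The index could be hoisted, e.g. (equivalent sketch):

        unsigned long idx = h - hstates;       /* invariant for this hstate */

        for (i = 0; i < max_huge_pages[idx]; ++i) {
                if (!alloc_fresh_huge_page(h))
                        break;
        }
        max_huge_pages[idx] = h->free_huge_pages = h->nr_huge_pages = i;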

    > printk(KERN_INFO "Total HugeTLB memory allocated, %ld %dMB pages\n",
    > h->free_huge_pages,
    > @@ -531,8 +531,9 @@ void __init huge_add_hstate(unsigned ord
    >
    > static int __init hugetlb_setup(char *s)
    > {
    > - if (sscanf(s, "%lu", &max_huge_pages) <= 0)
    > - max_huge_pages = 0;
    > + unsigned long *mhp = &max_huge_pages[parsed_hstate - hstates];


    This looks like we are assuming there is only ever one other
    parsed_hstate. For the purposes of what you aim to achieve in this set,
    it's not important but a comment over parsed_hstate about this
    assumption is probably necessary.
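
    For instance, a comment along these lines over the variable (the wording
    is only a suggestion):

        /*
         * parsed_hstate is the hstate set up by the most recent
         * hugepagesz= option; a hugepages= option that follows applies
         * to it. The code currently assumes at most one non-default
         * hstate is ever parsed.
         */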

    > + if (sscanf(s, "%lu", mhp) <= 0)
    > + *mhp = 0;
    > return 1;
    > }
    > __setup("hugepages=", hugetlb_setup);
    > @@ -584,10 +585,12 @@ static inline void try_to_free_low(unsig
    > #endif
    >
    > #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
    > -static unsigned long set_max_huge_pages(unsigned long count)
    > +static unsigned long
    > +set_max_huge_pages(struct hstate *h, unsigned long count, int *err)
    > {
    > unsigned long min_count, ret;
    > - struct hstate *h = &global_hstate;
    > +
    > + *err = 0;
    >


    What is updating err to anything else in set_max_huge_pages()?

    > /*
    > * Increase the pool size
    > @@ -659,8 +662,20 @@ int hugetlb_sysctl_handler(struct ctl_ta
    > struct file *file, void __user *buffer,
    > size_t *length, loff_t *ppos)
    > {
    > - proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
    > - max_huge_pages = set_max_huge_pages(max_huge_pages);
    > + int err = 0;
    > + struct hstate *h;
    > + int i;
    > + err = proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
    > + if (err)
    > + return err;
    > + i = 0;
    > + for_each_hstate (h) {
    > + max_huge_pages[i] = set_max_huge_pages(h, max_huge_pages[i],
    > + &err);


    hmm, this is saying when I write 10 to nr_hugepages, I am asking for 10
    2MB pages and 10 1GB pages potentially. Is that what you want?

    > + if (err)
    > + return err;


    I'm failing to see how the error handling is improved when
    set_max_huge_pages() is not updating err. Maybe it happens in another
    patch.

    > + i++;
    > + }
    > return 0;
    > }
    >
    > @@ -680,10 +695,14 @@ int hugetlb_overcommit_handler(struct ct
    > struct file *file, void __user *buffer,
    > size_t *length, loff_t *ppos)
    > {
    > - struct hstate *h = &global_hstate;
    > + struct hstate *h;
    > + int i = 0;
    > proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
    > spin_lock(&hugetlb_lock);
    > - h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
    > + for_each_hstate (h) {
    > + h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages[i];
    > + i++;
    > + }


    Similar to the other sysctl here, the overcommit value is being set for
    all the huge page sizes.

    > spin_unlock(&hugetlb_lock);
    > return 0;
    > }
    >


    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  6. Re: [PATCH] [13/18] Add support to allocate hugepages of different size with hugepages=...

    On (18/03/08 17:45), Andi Kleen didst pronounce:
    > > hmm, it's not very clear to me how hugetlb_init_hstate() would get
    > > called twice for the same hstate. Should it be VM_BUG_ON() if a hstate

    >
    > It is called from a __setup function and the user can specify them multiple
    > times. Also, when the user has already specified HPAGE_SIZE and it got set
    > up, it should not be called again.
    >


    Ok, that is a fair explanation. Thanks.

    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  7. Re: [PATCH] [8/18] Add a __alloc_bootmem_node_nopanic

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > Straight forward variant of the existing __alloc_bootmem_node, only
    > Signed-off-by: Andi Kleen
    >
    > difference is that it doesn't panic on failure


    Seems to be a bit of cut&paste jumbling there.

    >
    > Signed-off-by: Andi Kleen
    > ---
    > include/linux/bootmem.h | 4 ++++
    > mm/bootmem.c | 12 ++++++++++++
    > 2 files changed, 16 insertions(+)
    >
    > Index: linux/mm/bootmem.c
    > ===================================================================
    > --- linux.orig/mm/bootmem.c
    > +++ linux/mm/bootmem.c
    > @@ -471,6 +471,18 @@ void * __init __alloc_bootmem_node(pg_da
    > return __alloc_bootmem(size, align, goal);
    > }
    >
    > +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
    > + unsigned long align, unsigned long goal)
    > +{
    > + void *ptr;
    > +
    > + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
    > + if (ptr)
    > + return ptr;
    > +
    > + return __alloc_bootmem_nopanic(size, align, goal);
    > +}


    Straight-forward. Mildly irritating that there are multiple variants that
    only differ by whether they panic on allocation failure or not. Probably
    should be a separate removal of duplicated code there but outside the
    scope of this patch.
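
    The consolidation hinted at might look roughly like this (sketch only;
    the function name is made up):

        static void * __init ___alloc_bootmem_node(pg_data_t *pgdat,
                        unsigned long size, unsigned long align,
                        unsigned long goal, int panic)
        {
                void *ptr = __alloc_bootmem_core(pgdat->bdata, size,
                                                 align, goal, 0);
                if (ptr)
                        return ptr;
                return panic ? __alloc_bootmem(size, align, goal)
                             : __alloc_bootmem_nopanic(size, align, goal);
        }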

    > +
    > #ifndef ARCH_LOW_ADDRESS_LIMIT
    > #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
    > #endif
    > Index: linux/include/linux/bootmem.h
    > ===================================================================
    > --- linux.orig/include/linux/bootmem.h
    > +++ linux/include/linux/bootmem.h
    > @@ -90,6 +90,10 @@ extern void *__alloc_bootmem_node(pg_dat
    > unsigned long size,
    > unsigned long align,
    > unsigned long goal);
    > +extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
    > + unsigned long size,
    > + unsigned long align,
    > + unsigned long goal);
    > extern unsigned long init_bootmem_node(pg_data_t *pgdat,
    > unsigned long freepfn,
    > unsigned long startpfn,
    >


    Acked-by: Mel Gorman

    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  8. Re: [PATCH] [5/18] Expand the hugetlbfs sysctls to handle arrays for all hstates

    > Also, offhand it's not super-clear why max_huge_pages is not part of
    > hstate as we only expect one hstate per pagesize anyway.


    They need to be a separate array for the sysctl parsing function.
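
    The constraint is that the ctl_table entry points .data at one flat
    array of unsigned longs, as in the hunk quoted in the review:

        {
                .procname     = "nr_hugepages",
                .data         = &max_huge_pages,        /* flat per-hstate array */
                .maxlen       = sizeof(max_huge_pages), /* covers all hstates */
                .mode         = 0644,
                .proc_handler = &hugetlb_sysctl_handler,
                .extra1       = (void *)&hugetlb_zero,
        },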

    -Andi


  9. Re: [PATCH] [13/18] Add support to allocate hugepages of different size with hugepages=...

    > hmm, it's not very clear to me how hugetlb_init_hstate() would get
    > called twice for the same hstate. Should it be VM_BUG_ON() if a hstate

    It is called from a __setup function and the user can specify them multiple
    times. Also, when the user has already specified HPAGE_SIZE and it got set
    up, it should not be called again.
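
    For example, a (hypothetical) command line like

        hugepagesz=1G hugepages=4 hugepages=8

    runs the hugepages= handler twice for the same 1GB hstate, so
    hugetlb_init_hstate() must tolerate the second call.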

    -Andi

  10. Re: [PATCH] [14/18] Clean up hugetlb boot time printk

    On (17/03/08 02:58), Andi Kleen didst pronounce:
    > - Reword sentence to clarify meaning with multiple options
    > - Add support for using GB prefixes for the page size
    > - Add extra printk to delayed > MAX_ORDER allocation code
    >


    Scratch earlier comments about this printk. If the printk fix
    was broken out, it could be moved to the start of the set so it can be
    tested/merged separately. The remainder of this patch could then be
    folded into the patch allowing 1GB pages to be reserved at boot-time.

    > Signed-off-by: Andi Kleen
    >
    > ---
    > mm/hugetlb.c | 33 ++++++++++++++++++++++++++++++---
    > 1 file changed, 30 insertions(+), 3 deletions(-)
    >
    > Index: linux/mm/hugetlb.c
    > ===================================================================
    > --- linux.orig/mm/hugetlb.c
    > +++ linux/mm/hugetlb.c
    > @@ -510,6 +510,15 @@ static struct page *alloc_huge_page(stru
    > return page;
    > }
    >
    > +static __init char *memfmt(char *buf, unsigned long n)
    > +{
    > + if (n >= (1UL << 30))
    > + sprintf(buf, "%lu GB", n >> 30);
    > + else
    > + sprintf(buf, "%lu MB", n >> 20);
    > + return buf;
    > +}
    > +
    > static __initdata LIST_HEAD(huge_boot_pages);
    >
    > struct huge_bm_page {
    > @@ -536,14 +545,28 @@ static int __init alloc_bm_huge_page(str
    > /* Put bootmem huge pages into the standard lists after mem_map is up */
    > static int __init huge_init_bm(void)
    > {
    > + unsigned long pages = 0;
    > struct huge_bm_page *m;
    > + struct hstate *h = NULL;
    > + char buf[32];
    > +
    > list_for_each_entry (m, &huge_boot_pages, list) {
    > struct page *page = virt_to_page(m);
    > - struct hstate *h = m->hstate;
    > + h = m->hstate;
    > __ClearPageReserved(page);
    > prep_compound_page(page, h->order);
    > huge_new_page(h, page);
    > + pages++;
    > }
    > +
    > + /*
    > + * This only prints for a single hstate. This works for x86-64,
    > + * but if you do multiple > MAX_ORDER hstates you'll need to fix it.
    > + */
    > + if (pages > 0)
    > + printk(KERN_INFO "HugeTLB pre-allocated %ld %s pages\n",
    > + h->free_huge_pages,
    > + memfmt(buf, huge_page_size(h)));
    > return 0;
    > }
    > __initcall(huge_init_bm);
    > @@ -551,6 +574,8 @@ __initcall(huge_init_bm);
    > static int __init hugetlb_init_hstate(struct hstate *h)
    > {
    > unsigned long i;
    > + char buf[32];
    > + unsigned long pages = 0;
    >
    > /* Don't reinitialize lists if they have been already init'ed */
    > if (!h->hugepage_freelists[0].next) {
    > @@ -567,12 +592,14 @@ static int __init hugetlb_init_hstate(st
    > } else if (!alloc_fresh_huge_page(h))
    > break;
    > h->parsed_hugepages++;
    > + pages++;
    > }
    > max_huge_pages[h - hstates] = h->parsed_hugepages;
    >
    > - printk(KERN_INFO "Total HugeTLB memory allocated, %ld %dMB pages\n",
    > + if (pages > 0)
    > + printk(KERN_INFO "HugeTLB pre-allocated %ld %s pages\n",
    > h->free_huge_pages,
    > - 1 << (h->order + PAGE_SHIFT - 20));
    > + memfmt(buf, huge_page_size(h)));
    > return 0;
    > }
    >
    >


    --
    Mel Gorman
    Part-time PhD Student, University of Limerick
    Linux Technology Center, IBM Dublin Software Lab

  11. Re: [PATCH] [2/18] Add basic support for more than one hstate in hugetlbfs

    Hi Andi

    sorry for the very late review.

    > @@ -497,11 +501,34 @@ static int __init hugetlb_init(void)
    > break;
    > }
    > max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
    > - printk("Total HugeTLB memory allocated, %ld\n", h->free_huge_pages);
    > +
    > + printk(KERN_INFO "Total HugeTLB memory allocated, %ld %dMB pages\n",
    > + h->free_huge_pages,
    > + 1 << (h->order + PAGE_SHIFT - 20));
    > return 0;
    > }


    The IA64 arch supports 64KB huge pages, so the assumption that every huge
    page size is at least 1MB is wrong: with a 64KB page, the shift count in
    1 << (h->order + PAGE_SHIFT - 20) is negative.


    > +/* Should be called on processing a hugepagesz=... option */
    > +void __init huge_add_hstate(unsigned order)
    > +{
    > + struct hstate *h;
    > + BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
    > + BUG_ON(order <= HPAGE_SHIFT - PAGE_SHIFT);
    > + h = &hstates[max_hstate++];
    > + h->order = order;
    > + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
    > + hugetlb_init_hstate(h);
    > + parsed_hstate = h;
    > +}


    this function is called once per boot parameter, right?
    If so, it causes a panic when a user passes too many
    hugepagesz= boot parameters.

    Why don't you use the following check instead?

    if (max_hstate >= HUGE_MAX_HSTATE) {
            printk(KERN_WARNING "hugepagesz: too many huge page sizes\n");
            return;
    }



    - kosaki



  12. Re: [PATCH] [2/18] Add basic support for more than one hstate in hugetlbfs

    > this function is called once per boot parameter, right?
    > If so, it causes a panic when a user passes too many
    > hugepagesz= boot parameters.


    A later patch fixes that up by looking up the hstate explicitly. Also it
    is bisect safe because the callers are only added later.

    -Andi

  13. Re: [PATCH] [2/18] Add basic support for more than one hstate in hugetlbfs

    > > this function is called once per boot parameter, right?
    > > If so, it causes a panic when a user passes too many
    > > hugepagesz= boot parameters.

    >
    > A later patch fixes that up by looking up the hstate explicitly. Also it
    > is bisect safe because the callers are only added later.


    Oops, sorry.



  14. Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER

    > This looks like an off-by-one error here and in the code below -- it
    > should be ">= MAX_ORDER" not "> MAX_ORDER". Cf alloc_pages() in gfp.h:
    >
    > if (unlikely(order >= MAX_ORDER))
    > return NULL;


    True, good point. Although it will only matter if some architecture
    has MAX_ORDER-sized huge pages, which x86-64 definitely hasn't.

    I passed this code over to Nick so he'll hopefully incorporate the fix.

    -Andi

  15. Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER

    Andi Kleen wrote:
    > This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
    > not practical to enlarge MAX_ORDER to 1GB.
    >
    > Instead the 1GB pages are only allocated at boot using the bootmem
    > allocator using the hugepages=... option.
    >
    > These 1G bootmem pages are never freed. In theory it would be possible
    > to implement that with some complications, but since it would be a one-way
    > street (> MAX_ORDER pages cannot be allocated later) I decided not to currently.
    >
    > The > MAX_ORDER code is not ifdef'ed per architecture. It is not very big
    > and the ifdef ugliness did not seem worth it.


    This looks like an off-by-one error here and in the code below -- it
    should be ">= MAX_ORDER" not "> MAX_ORDER". Cf alloc_pages() in gfp.h:

    if (unlikely(order >= MAX_ORDER))
    return NULL;

    > [remainder of the patch snipped; it is quoted in full, with comments, in reply 2 above]

    -Andrew Hastings
    Cray Inc.
