[PATCH 0/12] memcg updates v5 - Kernel

Thread: [PATCH 0/12] memcg updates v5

  1. [PATCH 0/12] memcg updates v5

    Hi, I updated the stack and reflected comments.
    Against the latest mmotm. (rc7-mm1)

    Major changes from the previous version are:
    - page_cgroup allocation/lookup is reworked.
      All of FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG are supported.
    - force_empty is totally rewritten, and the "force_empty takes a long time"
      problem of the previous version is fixed (I think...).
    - reordered patches:
      - the first half are easy ones.
      - the second half are big ones.

    I'm still testing with full debug options. No problem found yet.
    (I'm afraid of race conditions which have not been caught yet.)

    [1/12] avoid accounting special mappings not on LRU. (fix)
    [2/12] move charge() call for swapped-in pages under lock_page(). (clean up)
    [3/12] make root cgroup unlimited. (change semantics.)
    [4/12] make page->mapping NULL before calling uncharge. (clean up)
    [5/12] make page_cgroup->flags use atomic ops. (changes in infrastructure)
    [6/12] optimize stat. (clean up)
    [7/12] add support function for moving account. (new function)
    [8/12] rewrite force_empty to use move_account. (change semantics.)
    [9/12] allocate all page_cgroup at boot. (changes in infrastructure)
    [10/12] free page_cgroup from LRU in a lazy way. (optimize)
    [11/12] add page_cgroup to LRU in a lazy way. (optimize)
    [12/12] fix race at charging swap. (fix by new logic.)

    *Any* comment is welcome.

    Thanks,
    -Kame


  2. [PATCH 1/12] memcg avoid accounting special mappings not on LRU

    There are not-on-LRU pages which can be mapped, and they are not worth
    accounting (because we can't shrink them, and handling the special case
    would need dirty code). We'd like to rely on the usual objrmap/radix-tree
    protocol and don't want to account pages that are out of the VM's control.

    When special_mapping_fault() is called, page->mapping tends to be NULL
    and the page gets charged as an anonymous page.
    insert_page() also handles some special pages from drivers.

    This patch avoids accounting such special pages.
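
    (Editorial illustration, not part of the patch: a minimal userspace sketch
    of the resulting policy. The struct and helpers below are hypothetical
    stand-ins; the point is only that anonymous pages are charged at fault time
    and uncharged in page_remove_rmap(), while file pages go through the
    page-cache path, so insert_page()/special_mapping_fault() pages are never
    accounted.)

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the page properties that matter here. */
    struct demo_page {
            bool anon;              /* PageAnon() */
            bool on_lru;            /* managed by the global LRU */
    };

    /* Post-patch rule in __do_fault(): only anonymous pages are charged. */
    static bool charge_at_fault(const struct demo_page *p)
    {
            return p->anon;
    }

    /* Post-patch rule in page_remove_rmap(): only anonymous pages are
     * uncharged here; file pages are uncharged via the page-cache path. */
    static bool uncharge_at_unmap(const struct demo_page *p)
    {
            return p->anon;
    }

    int main(void)
    {
            struct demo_page driver_page = { .anon = false, .on_lru = false };
            struct demo_page anon_page   = { .anon = true,  .on_lru = true  };

            printf("driver page: charge=%d uncharge=%d\n",
                   charge_at_fault(&driver_page), uncharge_at_unmap(&driver_page));
            printf("anon page:   charge=%d uncharge=%d\n",
                   charge_at_fault(&anon_page), uncharge_at_unmap(&anon_page));
            return 0;
    }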

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memory.c | 18 ++++++------------
    mm/rmap.c | 4 ++--
    2 files changed, 8 insertions(+), 14 deletions(-)

    Index: mmotm-2.6.27-rc6+/mm/memory.c
    ===================================================================
    --- mmotm-2.6.27-rc6+.orig/mm/memory.c
    +++ mmotm-2.6.27-rc6+/mm/memory.c
    @@ -1323,18 +1323,14 @@ static int insert_page(struct vm_area_st
    pte_t *pte;
    spinlock_t *ptl;

    - retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
    - if (retval)
    - goto out;
    -
    retval = -EINVAL;
    if (PageAnon(page))
    - goto out_uncharge;
    + goto out;
    retval = -ENOMEM;
    flush_dcache_page(page);
    pte = get_locked_pte(mm, addr, &ptl);
    if (!pte)
    - goto out_uncharge;
    + goto out;
    retval = -EBUSY;
    if (!pte_none(*pte))
    goto out_unlock;
    @@ -1350,8 +1346,6 @@ static int insert_page(struct vm_area_st
    return retval;
    out_unlock:
    pte_unmap_unlock(pte, ptl);
    -out_uncharge:
    - mem_cgroup_uncharge_page(page);
    out:
    return retval;
    }
    @@ -2542,7 +2536,7 @@ static int __do_fault(struct mm_struct *

    }

    - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
    + if (anon && mem_cgroup_charge(page, mm, GFP_KERNEL)) {
    ret = VM_FAULT_OOM;
    goto out;
    }
    @@ -2584,10 +2578,10 @@ static int __do_fault(struct mm_struct *
    /* no need to invalidate: a not-present page won't be cached */
    update_mmu_cache(vma, address, entry);
    } else {
    - mem_cgroup_uncharge_page(page);
    - if (anon)
    + if (anon) {
    + mem_cgroup_uncharge_page(page);
    page_cache_release(page);
    - else
    + } else
    anon = 1; /* no anon but release faulted_page */
    }

    Index: mmotm-2.6.27-rc6+/mm/rmap.c
    ===================================================================
    --- mmotm-2.6.27-rc6+.orig/mm/rmap.c
    +++ mmotm-2.6.27-rc6+/mm/rmap.c
    @@ -725,8 +725,8 @@ void page_remove_rmap(struct page *page,
    page_clear_dirty(page);
    set_page_dirty(page);
    }
    -
    - mem_cgroup_uncharge_page(page);
    + if (PageAnon(page))
    + mem_cgroup_uncharge_page(page);
    __dec_zone_page_state(page,
    PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
    /*


  3. [PATCH 5/12] memcg make page_cgroup->flags atomic

    This patch makes page_cgroup->flags use atomic ops and defines
    functions (and macros) to access it.

    This patch by itself makes memcg slower, but its final purpose is
    to remove lock_page_cgroup() and allow fast access to page_cgroup.
    (Total performance will increase after all patches are applied.)

    Before trying to modify the memory resource controller, this atomic
    operation on flags is necessary. Most of the flags in this patch are for
    LRU and modified under mz->lru_lock, but we'll soon add other flags which
    are not for LRU. So we use the atomic version here.
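
    (Editorial illustration, not from the patch: the TESTPCGFLAG/SETPCGFLAG
    macros generate one tiny accessor per flag, in the same style as the
    PageXxx()/SetPageXxx() helpers for page->flags. A minimal userspace sketch
    of the pattern, with plain C bit operations standing in for the kernel's
    atomic test_bit()/set_bit(); all names here are demo stand-ins.)

    #include <stdio.h>

    enum { PCG_CACHE, PCG_ACTIVE, PCG_FILE, PCG_UNEVICTABLE };

    struct demo_page_cgroup { unsigned long flags; };

    /* Each macro expands to a static inline accessor for one bit. */
    #define TESTPCGFLAG(uname, lname) \
    static inline int PageCgroup##uname(struct demo_page_cgroup *pc) \
            { return !!(pc->flags & (1UL << PCG_##lname)); }

    #define SETPCGFLAG(uname, lname) \
    static inline void SetPageCgroup##uname(struct demo_page_cgroup *pc) \
            { pc->flags |= 1UL << PCG_##lname; }

    TESTPCGFLAG(Active, ACTIVE)     /* -> PageCgroupActive(pc) */
    SETPCGFLAG(Active, ACTIVE)      /* -> SetPageCgroupActive(pc) */

    int main(void)
    {
            struct demo_page_cgroup pc = { .flags = 0 };

            SetPageCgroupActive(&pc);
            printf("active=%d\n", PageCgroupActive(&pc));   /* prints active=1 */
            return 0;
    }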


    Changelog: (v4) -> (v5)
    - removed unused operations.
    - adjusted to new ctype MEM_CGROUP_CHARGE_TYPE_SHMEM

    Changelog: (v3) -> (v4)
    - no changes.

    Changelog: (v2) -> (v3)
    - renamed macros and flags to be longer name.
    - added comments.
    - added "default bit set" for File, Shmem, Anon.

    Changelog: (preview) -> (v1):
    - patch ordering is changed.
    - Added macro for defining functions for Test/Set/Clear bit.
    - made the names of flags shorter.

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memcontrol.c | 122 +++++++++++++++++++++++++++++++++++++-------------------
    1 file changed, 82 insertions(+), 40 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -161,12 +161,46 @@ struct page_cgroup {
    struct list_head lru; /* per cgroup LRU list */
    struct page *page;
    struct mem_cgroup *mem_cgroup;
    - int flags;
    + unsigned long flags;
    };
    -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
    -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
    -#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */
    -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8) /* page is unevictableable */
    +
    +enum {
    + /* flags for mem_cgroup */
    + PCG_CACHE, /* charged as cache */
    + /* flags for LRU placement */
    + PCG_ACTIVE, /* page is active in this cgroup */
    + PCG_FILE, /* page is file system backed */
    + PCG_UNEVICTABLE, /* page is unevictableable */
    +};
    +
    +#define TESTPCGFLAG(uname, lname) \
    +static inline int PageCgroup##uname(struct page_cgroup *pc) \
    + { return test_bit(PCG_##lname, &pc->flags); }
    +
    +#define SETPCGFLAG(uname, lname) \
    +static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
    + { set_bit(PCG_##lname, &pc->flags); }
    +
    +#define CLEARPCGFLAG(uname, lname) \
    +static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    + { clear_bit(PCG_##lname, &pc->flags); }
    +
    +
    +/* Cache flag is set only once (at allocation) */
    +TESTPCGFLAG(Cache, CACHE)
    +
    +/* LRU management flags (from global-lru definition) */
    +TESTPCGFLAG(File, FILE)
    +SETPCGFLAG(File, FILE)
    +CLEARPCGFLAG(File, FILE)
    +
    +TESTPCGFLAG(Active, ACTIVE)
    +SETPCGFLAG(Active, ACTIVE)
    +CLEARPCGFLAG(Active, ACTIVE)
    +
    +TESTPCGFLAG(Unevictable, UNEVICTABLE)
    +SETPCGFLAG(Unevictable, UNEVICTABLE)
    +CLEARPCGFLAG(Unevictable, UNEVICTABLE)

    static int page_cgroup_nid(struct page_cgroup *pc)
    {
    @@ -181,21 +215,31 @@ static enum zone_type page_cgroup_zid(st
    enum charge_type {
    MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
    MEM_CGROUP_CHARGE_TYPE_MAPPED,
    - MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
    MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
    + MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
    + NR_CHARGE_TYPE,
    +};
    +
    +static const unsigned long
    +pcg_default_flags[NR_CHARGE_TYPE] = {
    + ((1 << PCG_CACHE) | (1 << PCG_FILE)),
    + ((1 << PCG_ACTIVE)),
    + ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
    + 0,
    };

    /*
    * Always modified under lru lock. Then, not necessary to preempt_disable()
    */
    -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
    - bool charge)
    +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
    + struct page_cgroup *pc,
    + bool charge)
    {
    int val = (charge)? 1 : -1;
    struct mem_cgroup_stat *stat = &mem->stat;

    VM_BUG_ON(!irqs_disabled());
    - if (flags & PAGE_CGROUP_FLAG_CACHE)
    + if (PageCgroupCache(pc))
    __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
    else
    __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
    @@ -296,18 +340,18 @@ static void __mem_cgroup_remove_list(str
    {
    int lru = LRU_BASE;

    - if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
    + if (PageCgroupUnevictable(pc))
    lru = LRU_UNEVICTABLE;
    else {
    - if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
    + if (PageCgroupActive(pc))
    lru += LRU_ACTIVE;
    - if (pc->flags & PAGE_CGROUP_FLAG_FILE)
    + if (PageCgroupFile(pc))
    lru += LRU_FILE;
    }

    MEM_CGROUP_ZSTAT(mz, lru) -= 1;

    - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
    + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
    list_del(&pc->lru);
    }

    @@ -316,27 +360,27 @@ static void __mem_cgroup_add_list(struct
    {
    int lru = LRU_BASE;

    - if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
    + if (PageCgroupUnevictable(pc))
    lru = LRU_UNEVICTABLE;
    else {
    - if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
    + if (PageCgroupActive(pc))
    lru += LRU_ACTIVE;
    - if (pc->flags & PAGE_CGROUP_FLAG_FILE)
    + if (PageCgroupFile(pc))
    lru += LRU_FILE;
    }

    MEM_CGROUP_ZSTAT(mz, lru) += 1;
    list_add(&pc->lru, &mz->lists[lru]);

    - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
    + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
    }

    static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
    {
    struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
    - int active = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
    - int file = pc->flags & PAGE_CGROUP_FLAG_FILE;
    - int unevictable = pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE;
    + int active = PageCgroupActive(pc);
    + int file = PageCgroupFile(pc);
    + int unevictable = PageCgroupUnevictable(pc);
    enum lru_list from = unevictable ? LRU_UNEVICTABLE :
    (LRU_FILE * !!file + !!active);

    @@ -344,16 +388,20 @@ static void __mem_cgroup_move_lists(stru
    return;

    MEM_CGROUP_ZSTAT(mz, from) -= 1;
    -
    + /*
    + * However this is done under mz->lru_lock, another flags, which
    + * are not related to LRU, will be modified from out-of-lock.
    + * We have to use atomic set/clear flags.
    + */
    if (is_unevictable_lru(lru)) {
    - pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
    - pc->flags |= PAGE_CGROUP_FLAG_UNEVICTABLE;
    + ClearPageCgroupActive(pc);
    + SetPageCgroupUnevictable(pc);
    } else {
    if (is_active_lru(lru))
    - pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
    + SetPageCgroupActive(pc);
    else
    - pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
    - pc->flags &= ~PAGE_CGROUP_FLAG_UNEVICTABLE;
    + ClearPageCgroupActive(pc);
    + ClearPageCgroupUnevictable(pc);
    }

    MEM_CGROUP_ZSTAT(mz, lru) += 1;
    @@ -590,16 +638,7 @@ static int mem_cgroup_charge_common(stru
    * If a page is accounted as a page cache, insert to inactive list.
    * If anon, insert to active list.
    */
    - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
    - pc->flags = PAGE_CGROUP_FLAG_CACHE;
    - if (page_is_file_cache(page))
    - pc->flags |= PAGE_CGROUP_FLAG_FILE;
    - else
    - pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
    - } else if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    - pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
    - else /* MEM_CGROUP_CHARGE_TYPE_SHMEM */
    - pc->flags = PAGE_CGROUP_FLAG_CACHE | PAGE_CGROUP_FLAG_ACTIVE;
    + pc->flags = pcg_default_flags[ctype];

    lock_page_cgroup(page);
    if (unlikely(page_get_page_cgroup(page))) {
    @@ -678,8 +717,12 @@ int mem_cgroup_cache_charge(struct page
    if (unlikely(!mm))
    mm = &init_mm;

    - return mem_cgroup_charge_common(page, mm, gfp_mask,
    + if (page_is_file_cache(page))
    + return mem_cgroup_charge_common(page, mm, gfp_mask,
    MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
    + else
    + return mem_cgroup_charge_common(page, mm, gfp_mask,
    + MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
    }

    /*
    @@ -707,8 +750,7 @@ __mem_cgroup_uncharge_common(struct page
    VM_BUG_ON(pc->page != page);

    if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    - && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
    - || page_mapped(page)))
    + && ((PageCgroupCache(pc) || page_mapped(page))))
    goto unlock;

    mz = page_cgroup_zoneinfo(pc);
    @@ -759,7 +801,7 @@ int mem_cgroup_prepare_migration(struct
    if (pc) {
    mem = pc->mem_cgroup;
    css_get(&mem->css);
    - if (pc->flags & PAGE_CGROUP_FLAG_CACHE) {
    + if (PageCgroupCache(pc)) {
    if (page_is_file_cache(page))
    ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
    else


  4. [PATCH 6/12] memcg optimize percpu stat

    Some obvious optimizations to memcg.

    I found mem_cgroup_charge_statistics() is a little big (in object size) and
    does unnecessary address calculation.
    This patch is an optimization to reduce the size of this function.

    And res_counter_charge() is 'likely' to succeed.
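
    (Editorial sketch of the shape of this optimization, not the kernel code:
    the per-cpu slot address is computed once in the caller and passed down,
    instead of being recomputed on every stat update. Userspace stand-ins, with
    a fixed index playing the role of smp_processor_id().)

    #include <stdio.h>

    enum { STAT_CACHE, STAT_RSS, STAT_PGPGIN, STAT_PGPGOUT, NR_STATS };

    struct demo_stat_cpu { long count[NR_STATS]; };
    struct demo_stat     { struct demo_stat_cpu cpustat[2]; }; /* two fake CPUs */

    /* After the patch: the helper takes the per-cpu slot directly. */
    static inline void stat_add(struct demo_stat_cpu *cpustat, int idx, long val)
    {
            cpustat->count[idx] += val;
    }

    int main(void)
    {
            static struct demo_stat s;                       /* zero-initialized */
            int cpu = 0;                                     /* smp_processor_id() stand-in */
            struct demo_stat_cpu *cpustat = &s.cpustat[cpu]; /* one address calculation */

            stat_add(cpustat, STAT_RSS, 1);         /* charge of an anon page */
            stat_add(cpustat, STAT_PGPGIN, 1);      /* ...and the page-in event */

            printf("rss=%ld pgpgin=%ld\n",
                   s.cpustat[cpu].count[STAT_RSS],
                   s.cpustat[cpu].count[STAT_PGPGIN]);
            return 0;
    }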

    Changelog v3->v4:
    - merged with another leaf patch.

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memcontrol.c | 18 ++++++++++--------
    1 file changed, 10 insertions(+), 8 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -66,11 +66,10 @@ struct mem_cgroup_stat {
    /*
    * For accounting under irq disable, no need for increment preempt count.
    */
    -static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
    +static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
    enum mem_cgroup_stat_index idx, int val)
    {
    - int cpu = smp_processor_id();
    - stat->cpustat[cpu].count[idx] += val;
    + stat->count[idx] += val;
    }

    static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
    @@ -237,18 +236,21 @@ static void mem_cgroup_charge_statistics
    {
    int val = (charge)? 1 : -1;
    struct mem_cgroup_stat *stat = &mem->stat;
    + struct mem_cgroup_stat_cpu *cpustat;

    VM_BUG_ON(!irqs_disabled());
    +
    + cpustat = &stat->cpustat[smp_processor_id()];
    if (PageCgroupCache(pc))
    - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
    + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
    else
    - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
    + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

    if (charge)
    - __mem_cgroup_stat_add_safe(stat,
    + __mem_cgroup_stat_add_safe(cpustat,
    MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
    else
    - __mem_cgroup_stat_add_safe(stat,
    + __mem_cgroup_stat_add_safe(cpustat,
    MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
    }

    @@ -609,7 +611,7 @@ static int mem_cgroup_charge_common(stru
    css_get(&memcg->css);
    }

    - while (res_counter_charge(&mem->res, PAGE_SIZE)) {
    + while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
    if (!(gfp_mask & __GFP_WAIT))
    goto out;



  5. [PATCH 2/12] memcg move charge() call to swapped-in page under lock_page()

    While page-cache charge/uncharge is done under lock_page(), swap-cache
    charging isn't. (An anonymous page is charged when it's newly allocated.)

    This patch moves do_swap_page()'s charge() call under the page lock. This
    helps us avoid unnecessary calls and charging an already-mapped page.

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memory.c | 7 +++----
    1 file changed, 3 insertions(+), 4 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memory.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memory.c
    +++ mmotm-2.6.27-rc7+/mm/memory.c
    @@ -2320,15 +2320,14 @@ static int do_swap_page(struct mm_struct
    count_vm_event(PGMAJFAULT);
    }

    + lock_page(page);
    + delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
    +
    if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
    - delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
    ret = VM_FAULT_OOM;
    goto out;
    }
    -
    mark_page_accessed(page);
    - lock_page(page);
    - delayacct_clear_flag(DELAYACCT_PF_SWAPIN);

    /*
    * Back out if somebody else already faulted in this pte.


  6. [PATCH 3/12] memcg make root cgroup unlimited.

    Make the root cgroup of the memory resource controller have no limit.

    With this, users cannot set a limit on the root cgroup. This makes the root
    cgroup a kind of trash-can.

    For accounting pages which have no owner, which are created by force_empty,
    we need some cgroup with no limit. A patch rewriting force_empty will
    follow this one.

    Signed-off-by: KAMEZAWA Hiroyuki

    Documentation/controllers/memory.txt | 4 ++++
    mm/memcontrol.c | 7 +++++++
    2 files changed, 11 insertions(+)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -136,6 +136,9 @@ struct mem_cgroup {
    };
    static struct mem_cgroup init_mem_cgroup;

    +#define is_root_cgroup(cgrp) ((cgrp) == &init_mem_cgroup)
    +
    +
    /*
    * We use the lower bit of the page->page_cgroup pointer as a bit spin
    * lock. We need to ensure that page->page_cgroup is at least two
    @@ -945,6 +948,10 @@ static int mem_cgroup_write(struct cgrou

    switch (cft->private) {
    case RES_LIMIT:
    + if (is_root_cgroup(memcg)) {
    + ret = -EINVAL;
    + break;
    + }
    /* This function does all necessary parse...reuse it */
    ret = res_counter_memparse_write_strategy(buffer, &val);
    if (!ret)
    Index: mmotm-2.6.27-rc7+/Documentation/controllers/memory.txt
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/Documentation/controllers/memory.txt
    +++ mmotm-2.6.27-rc7+/Documentation/controllers/memory.txt
    @@ -121,6 +121,9 @@ The corresponding routines that remove a
    a page from Page Cache is used to decrement the accounting counters of the
    cgroup.

    +The root cgroup cannot have its limit set, but its usage is accounted.
    +For controlling memory usage, you need to create a cgroup.
    +
    2.3 Shared Page Accounting

    Shared pages are accounted on the basis of the first touch approach. The
    @@ -172,6 +175,7 @@ We can alter the memory limit:

    NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
    mega or gigabytes.
    +Note: the root cgroup cannot have a limit set.

    # cat /cgroups/0/memory.limit_in_bytes
    4194304


  7. [PATCH 4/12] memcg make page->mapping NULL before calling uncharge

    This patch tries to make page->mapping NULL before
    mem_cgroup_uncharge_cache_page() is called.

    "page->mapping == NULL" is a good check for "whether the page is still
    on the radix-tree or not".
    This patch also adds a VM_BUG_ON() to mem_cgroup_uncharge_cache_page().


    Signed-off-by: KAMEZAWA Hiroyuki

    mm/filemap.c | 2 +-
    mm/memcontrol.c | 1 +
    mm/migrate.c | 12 +++++++++---
    3 files changed, 11 insertions(+), 4 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/filemap.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/filemap.c
    +++ mmotm-2.6.27-rc7+/mm/filemap.c
    @@ -116,12 +116,12 @@ void __remove_from_page_cache(struct pag
    {
    struct address_space *mapping = page->mapping;

    - mem_cgroup_uncharge_cache_page(page);
    radix_tree_delete(&mapping->page_tree, page->index);
    page->mapping = NULL;
    mapping->nrpages--;
    __dec_zone_page_state(page, NR_FILE_PAGES);
    BUG_ON(page_mapped(page));
    + mem_cgroup_uncharge_cache_page(page);

    /*
    * Some filesystems seem to re-dirty the page even after
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -737,6 +737,7 @@ void mem_cgroup_uncharge_page(struct pag
    void mem_cgroup_uncharge_cache_page(struct page *page)
    {
    VM_BUG_ON(page_mapped(page));
    + VM_BUG_ON(page->mapping);
    __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
    }

    Index: mmotm-2.6.27-rc7+/mm/migrate.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/migrate.c
    +++ mmotm-2.6.27-rc7+/mm/migrate.c
    @@ -330,8 +330,6 @@ static int migrate_page_move_mapping(str
    __inc_zone_page_state(newpage, NR_FILE_PAGES);

    spin_unlock_irq(&mapping->tree_lock);
    - if (!PageSwapCache(newpage))
    - mem_cgroup_uncharge_cache_page(page);

    return 0;
    }
    @@ -378,7 +376,15 @@ static void migrate_page_copy(struct pag
    #endif
    ClearPagePrivate(page);
    set_page_private(page, 0);
    - page->mapping = NULL;
    + /* page->mapping contains a flag for PageAnon() */
    + if (PageAnon(page)) {
    + /* This page is uncharged at try_to_unmap(). */
    + page->mapping = NULL;
    + } else {
    + /* Obsolete file cache should be uncharged */
    + page->mapping = NULL;
    + mem_cgroup_uncharge_cache_page(page);
    + }

    /*
    * If any waiters have accumulated on the new page then


  8. [PATCH 8/12] memcg rewrite force empty to move account to root

    The current force_empty of the memory resource controller just removes the
    page_cgroup. This means the page is no longer accounted at all, creating an
    in-use page which has no page_cgroup.

    This patch instead moves the account to the "root" cgroup. With this patch,
    force_empty doesn't leak an account but moves it to the "root" cgroup. Maybe
    someone can think of other enhancements, such as moving the account to the
    parent cgroup. (But moving to the parent means we have to handle the "limit"
    of pages; that needs more complicated work.)

    For now, just move the account to the root cgroup.

    Note: all locks other than the old mem_cgroup's lru_lock
    in this path are try_lock()s.
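
    (Editorial sketch of the new control flow, not the kernel code: walk every
    per-zone LRU list once, moving each reachable charge to the root cgroup,
    then re-check whether anything is left and loop again if so, because pages
    that were temporarily off the LRU can be missed in a single pass. The
    counters below are userspace stand-ins for the lists.)

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_LISTS 4      /* stand-in for the per-node/per-zone/per-lru lists */

    static int lists[NR_LISTS] = { 5, 0, 3, 1 };    /* charges left on each list */

    /* Stand-in for mem_cgroup_force_empty_list(): move what we can reach. */
    static void force_empty_list(int i)
    {
            while (lists[i] > 0)
                    lists[i]--;     /* "move this charge to the root cgroup" */
    }

    int main(void)
    {
            bool busy = true;

            while (busy) {
                    /* lru_add_drain_all() goes here: flush pagevecs first. */
                    for (int i = 0; i < NR_LISTS; i++)
                            force_empty_list(i);

                    /* Re-check: loop again if any list is still non-empty. */
                    busy = false;
                    for (int i = 0; i < NR_LISTS; i++)
                            busy |= (lists[i] != 0);
            }
            printf("all charges moved to the root cgroup\n");
            return 0;
    }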

    Changelog (v4) -> (5)
    - removed yield()
    - remove lock_page().
    - use list_for_each_entry_safe() instead of list_empty() loop.
    - check list is empty or not rather than see usage.
    - added lru_add_drain_all() at the start of loops.

    Changelog (v2) -> (v4)
    - splitted out mem_cgroup_move_account().
    - replaced get_page() with get_page_unless_zero().
    (This is necessary to avoid conflicts with migration.)

    Signed-off-by: KAMEZAWA Hiroyuki

    Documentation/controllers/memory.txt | 7 ++-
    mm/memcontrol.c | 68 ++++++++++++++++++++---------------
    2 files changed, 43 insertions(+), 32 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -29,10 +29,12 @@
    #include
    #include
    #include
    +#include
    #include
    #include
    #include
    #include
    +#include

    #include

    @@ -979,45 +981,34 @@ int mem_cgroup_resize_limit(struct mem_c


    /*
    - * This routine traverse page_cgroup in given list and drop them all.
    - * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
    + * This routine traverse page_cgroup in given list and move them all.
    */
    -#define FORCE_UNCHARGE_BATCH (128)
    static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
    struct mem_cgroup_per_zone *mz,
    enum lru_list lru)
    {
    - struct page_cgroup *pc;
    + struct page_cgroup *pc, *tmp;
    struct page *page;
    - int count = FORCE_UNCHARGE_BATCH;
    unsigned long flags;
    struct list_head *list;

    list = &mz->lists[lru];

    spin_lock_irqsave(&mz->lru_lock, flags);
    - while (!list_empty(list)) {
    - pc = list_entry(list->prev, struct page_cgroup, lru);
    + list_for_each_entry_safe(pc, tmp, list, lru) {
    page = pc->page;
    - get_page(page);
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - /*
    - * Check if this page is on LRU. !LRU page can be found
    - * if it's under page migration.
    - */
    - if (PageLRU(page)) {
    - __mem_cgroup_uncharge_common(page,
    - MEM_CGROUP_CHARGE_TYPE_FORCE);
    - put_page(page);
    - if (--count <= 0) {
    - count = FORCE_UNCHARGE_BATCH;
    - cond_resched();
    - }
    - } else
    - cond_resched();
    - spin_lock_irqsave(&mz->lru_lock, flags);
    + /* For avoiding race with speculative page cache handling. */
    + if (!PageLRU(page) || !get_page_unless_zero(page)) {
    + continue;
    + }
    + mem_cgroup_move_account(page, pc, mem, &init_mem_cgroup);
    + put_page(page);
    + if (atomic_read(&mem->css.cgroup->count) > 0)
    + break;
    }
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    +
    + cond_resched();
    }

    /*
    @@ -1027,7 +1018,9 @@ static void mem_cgroup_force_empty_list(
    static int mem_cgroup_force_empty(struct mem_cgroup *mem)
    {
    int ret = -EBUSY;
    - int node, zid;
    + int node, zid, busy;
    + struct mem_cgroup_per_zone *mz;
    + enum lru_list l;

    css_get(&mem->css);
    /*
    @@ -1035,17 +1028,34 @@ static int mem_cgroup_force_empty(struct
    * active_list <-> inactive_list while we don't take a lock.
    * So, we have to do loop here until all lists are empty.
    */
    - while (mem->res.usage > 0) {
    + busy = 1;
    +
    + while (busy) {
    if (atomic_read(&mem->css.cgroup->count) > 0)
    goto out;
    - for_each_node_state(node, N_POSSIBLE)
    + /*
    + * While walking our own LRU, we also checks LRU bit on page.
    + * If a page is on pagevec, it's not on LRU and we cannot
    + * grab it. Calling lru_add_drain_all() here.
    + */
    + lru_add_drain_all();
    + for_each_node_state(node, N_HIGH_MEMORY) {
    for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    - struct mem_cgroup_per_zone *mz;
    - enum lru_list l;
    mz = mem_cgroup_zoneinfo(mem, node, zid);
    for_each_lru(l)
    mem_cgroup_force_empty_list(mem, mz, l);
    }
    + }
    + busy = 0;
    + for_each_node_state(node, N_HIGH_MEMORY) {
    + for (zid = 0; !busy && zid < MAX_NR_ZONES; zid++) {
    + mz = mem_cgroup_zoneinfo(mem, node, zid);
    + for_each_lru(l)
    + busy |= !list_empty(&mz->lists[l]);
    + }
    + if (busy)
    + break;
    + }
    }
    ret = 0;
    out:
    Index: mmotm-2.6.27-rc7+/Documentation/controllers/memory.txt
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/Documentation/controllers/memory.txt
    +++ mmotm-2.6.27-rc7+/Documentation/controllers/memory.txt
    @@ -207,7 +207,8 @@ The memory.force_empty gives an interfac

    # echo 1 > memory.force_empty

    -will drop all charges in cgroup. Currently, this is maintained for test.
    +will move all charges to root cgroup.
    +(This policy may be modified in future.)

    4. Testing

    @@ -238,8 +239,8 @@ reclaimed.

    A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
    cgroup might have some charge associated with it, even though all
    -tasks have migrated away from it. Such charges are automatically dropped at
    -rmdir() if there are no tasks.
    +tasks have migrated away from it. Such charges are automatically moved to
    +root cgroup at rmdir() if there are no tasks. (This policy may be changed.)

    5. TODO



  9. [PATCH 11/12] memcg add to LRU in a lazy way

    Delay add_to_lru() and do it in a batched manner, like pagevec.
    For doing that, two flags are used: PCG_USED and PCG_LRU.

    If PCG_LRU is set, the page is on the LRU and it is safe to access the LRU
    via page_cgroup (under some lock).

    For avoiding races, this patch uses TestSetPageCgroupUsed()
    and checks the PCG_USED bit and PCG_LRU bit in the add/free vectors.
    By this, lock_page_cgroup() in mem_cgroup_charge() is removed.

    (I don't want to call lock_page_cgroup() under mz->lru_lock in the
    add/free vector core logic. So, the TestSetPageCgroupUsed() logic is added.
    This TestSet is an easy way to avoid unnecessary nesting of locks.)
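
    (Editorial sketch of the batching pattern, not the kernel code: like a
    pagevec, page_cgroups are queued in a small per-CPU vector and mz->lru_lock
    is taken only when the vector fills up or is explicitly drained. The lock
    and the per-CPU machinery are only simulated below.)

    #include <stdio.h>

    #define PCPVEC_SIZE 4   /* stand-in for MEMCG_PCPVEC_SIZE */

    struct demo_vec {
            int nr;
            int vec[PCPVEC_SIZE];
    };

    /* Stand-in for __set_page_cgroup_lru(): one lock round trip per batch. */
    static void flush_vec(struct demo_vec *v)
    {
            if (!v->nr)
                    return;
            printf("take lru_lock once, add %d entries, release lru_lock\n", v->nr);
            v->nr = 0;
    }

    /* Stand-in for set_page_cgroup_lru(): queue, flush only when full. */
    static void queue_entry(struct demo_vec *v, int pc)
    {
            v->vec[v->nr++] = pc;
            if (v->nr >= PCPVEC_SIZE)
                    flush_vec(v);
    }

    int main(void)
    {
            struct demo_vec v = { .nr = 0 };

            for (int pc = 0; pc < 10; pc++)
                    queue_entry(&v, pc);
            flush_vec(&v);  /* explicit drain, like drain_page_cgroup_local() */
            return 0;
    }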


    Changelog: v3 -> v5.
    - removed css_get/put per page_cgroup struct.
    Now, new force_empty checks there is page_cgroup on the memcg.
    We don't need to be afraid of leak.

    Changelog: v2 -> v3
    - added TRANSIT flag and removed lock from core logic.
    Changelog: v1 -> v2:
    - renamed function name from use_page_cgroup to set_page_cgroup_lru().

    Signed-off-by: KAMEZAWA Hiroyuki

    include/linux/page_cgroup.h | 10 +++
    mm/memcontrol.c | 121 +++++++++++++++++++++++++++++++-------------
    2 files changed, 96 insertions(+), 35 deletions(-)

    Index: mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/include/linux/page_cgroup.h
    +++ mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    @@ -24,6 +24,7 @@ enum {
    PCG_LOCK, /* page cgroup is locked */
    PCG_CACHE, /* charged as cache */
    PCG_USED, /* this object is in use. */
    + PCG_LRU, /* this is on LRU */
    /* flags for LRU placement */
    PCG_ACTIVE, /* page is active in this cgroup */
    PCG_FILE, /* page is file system backed */
    @@ -42,11 +43,20 @@ static inline void SetPageCgroup##uname(
    static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    { clear_bit(PCG_##lname, &pc->flags); }

    +#define TESTSETPCGFLAG(uname, lname)\
    +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \
    + { return test_and_set_bit(PCG_##lname, &pc->flags); }
    +
    /* Cache flag is set only once (at allocation) */
    TESTPCGFLAG(Cache, CACHE)

    TESTPCGFLAG(Used, USED)
    CLEARPCGFLAG(Used, USED)
    +TESTSETPCGFLAG(Used, USED)
    +
    +TESTPCGFLAG(LRU, LRU)
    +SETPCGFLAG(LRU, LRU)
    +CLEARPCGFLAG(LRU, LRU)

    /* LRU management flags (from global-lru definition) */
    TESTPCGFLAG(File, FILE)
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -150,9 +150,9 @@ enum charge_type {

    static const unsigned long
    pcg_default_flags[NR_CHARGE_TYPE] = {
    - (1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
    - (1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
    - (1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
    + (1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED),
    + (1 << PCG_ACTIVE) | (1 << PCG_USED),
    + (1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED),
    0,
    };

    @@ -195,7 +195,6 @@ page_cgroup_zoneinfo(struct page_cgroup
    struct mem_cgroup *mem = pc->mem_cgroup;
    int nid = page_cgroup_nid(pc);
    int zid = page_cgroup_zid(pc);
    -
    return mem_cgroup_zoneinfo(mem, nid, zid);
    }

    @@ -343,7 +342,7 @@ void mem_cgroup_move_lists(struct page *
    if (!trylock_page_cgroup(pc))
    return;

    - if (PageCgroupUsed(pc)) {
    + if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    @@ -502,6 +501,9 @@ int mem_cgroup_move_account(struct page
    from_mz = mem_cgroup_zoneinfo(from, nid, zid);
    to_mz = mem_cgroup_zoneinfo(to, nid, zid);

    + if (!PageCgroupLRU(pc))
    + return ret;
    +
    if (res_counter_charge(&to->res, PAGE_SIZE)) {
    /* Now, we assume no_limit...no failure here. */
    return ret;
    @@ -518,10 +520,8 @@ int mem_cgroup_move_account(struct page

    if (spin_trylock(&to_mz->lru_lock)) {
    __mem_cgroup_remove_list(from_mz, pc);
    - css_put(&from->css);
    res_counter_uncharge(&from->res, PAGE_SIZE);
    pc->mem_cgroup = to;
    - css_get(&to->css);
    __mem_cgroup_add_list(to_mz, pc);
    ret = 0;
    spin_unlock(&to_mz->lru_lock);
    @@ -542,6 +542,7 @@ struct memcg_percpu_vec {
    struct page_cgroup *vec[MEMCG_PCPVEC_SIZE];
    };
    static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_free_vec);
    +static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_add_vec);

    static void
    __release_page_cgroup(struct memcg_percpu_vec *mpv)
    @@ -557,7 +558,6 @@ __release_page_cgroup(struct memcg_percp
    prev_mz = NULL;
    for (i = nr - 1; i >= 0; i--) {
    pc = mpv->vec[i];
    - VM_BUG_ON(PageCgroupUsed(pc));
    mz = page_cgroup_zoneinfo(pc);
    if (prev_mz != mz) {
    if (prev_mz)
    @@ -565,9 +565,10 @@ __release_page_cgroup(struct memcg_percp
    prev_mz = mz;
    spin_lock(&mz->lru_lock);
    }
    - __mem_cgroup_remove_list(mz, pc);
    - css_put(&pc->mem_cgroup->css);
    - pc->mem_cgroup = NULL;
    + if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    + __mem_cgroup_remove_list(mz, pc);
    + ClearPageCgroupLRU(pc);
    + }
    }
    if (prev_mz)
    spin_unlock(&prev_mz->lru_lock);
    @@ -576,10 +577,43 @@ __release_page_cgroup(struct memcg_percp
    }

    static void
    +__set_page_cgroup_lru(struct memcg_percpu_vec *mpv)
    +{
    + unsigned long flags;
    + struct mem_cgroup_per_zone *mz, *prev_mz;
    + struct page_cgroup *pc;
    + int i, nr;
    +
    + local_irq_save(flags);
    + nr = mpv->nr;
    + mpv->nr = 0;
    + prev_mz = NULL;
    +
    + for (i = nr - 1; i >= 0; i--) {
    + pc = mpv->vec[i];
    + mz = page_cgroup_zoneinfo(pc);
    + if (prev_mz != mz) {
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + prev_mz = mz;
    + spin_lock(&mz->lru_lock);
    + }
    + if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    + SetPageCgroupLRU(pc);
    + __mem_cgroup_add_list(mz, pc);
    + }
    + }
    +
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + local_irq_restore(flags);
    +
    +}
    +
    +static void
    release_page_cgroup(struct page_cgroup *pc)
    {
    struct memcg_percpu_vec *mpv;
    -
    mpv = &get_cpu_var(memcg_free_vec);
    mpv->vec[mpv->nr++] = pc;
    if (mpv->nr >= mpv->limit)
    @@ -587,11 +621,25 @@ release_page_cgroup(struct page_cgroup *
    put_cpu_var(memcg_free_vec);
    }

    +static void
    +set_page_cgroup_lru(struct page_cgroup *pc)
    +{
    + struct memcg_percpu_vec *mpv;
    +
    + mpv = &get_cpu_var(memcg_add_vec);
    + mpv->vec[mpv->nr++] = pc;
    + if (mpv->nr >= mpv->limit)
    + __set_page_cgroup_lru(mpv);
    + put_cpu_var(memcg_add_vec);
    +}
    +
    static void page_cgroup_start_cache_cpu(int cpu)
    {
    struct memcg_percpu_vec *mpv;
    mpv = &per_cpu(memcg_free_vec, cpu);
    mpv->limit = MEMCG_PCPVEC_SIZE;
    + mpv = &per_cpu(memcg_add_vec, cpu);
    + mpv->limit = MEMCG_PCPVEC_SIZE;
    }

    #ifdef CONFIG_HOTPLUG_CPU
    @@ -600,6 +648,8 @@ static void page_cgroup_stop_cache_cpu(i
    struct memcg_percpu_vec *mpv;
    mpv = &per_cpu(memcg_free_vec, cpu);
    mpv->limit = 0;
    + mpv = &per_cpu(memcg_add_vec, cpu);
    + mpv->limit = 0;
    }
    #endif

    @@ -613,6 +663,9 @@ static DEFINE_MUTEX(memcg_force_drain_mu
    static void drain_page_cgroup_local(struct work_struct *work)
    {
    struct memcg_percpu_vec *mpv;
    + mpv = &get_cpu_var(memcg_add_vec);
    + __set_page_cgroup_lru(mpv);
    + put_cpu_var(mpv);
    mpv = &get_cpu_var(memcg_free_vec);
    __release_page_cgroup(mpv);
    put_cpu_var(mpv);
    @@ -679,14 +732,9 @@ static int mem_cgroup_charge_common(stru
    rcu_read_unlock();
    return 0;
    }
    - /*
    - * For every charge from the cgroup, increment reference count
    - */
    - css_get(&mem->css);
    rcu_read_unlock();
    } else {
    mem = memcg;
    - css_get(&memcg->css);
    }

    while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
    @@ -713,33 +761,36 @@ static int mem_cgroup_charge_common(stru
    }

    preempt_disable();
    - lock_page_cgroup(pc);
    - if (unlikely(PageCgroupUsed(pc))) {
    - unlock_page_cgroup(pc);
    + if (TestSetPageCgroupUsed(pc)) {
    res_counter_uncharge(&mem->res, PAGE_SIZE);
    - css_put(&mem->css);
    preempt_enable();
    goto done;
    }
    - pc->mem_cgroup = mem;
    /*
    - * If a page is accounted as a page cache, insert to inactive list.
    - * If anon, insert to active list.
    - */
    - pc->flags = pcg_default_flags[ctype];
    -
    - mz = page_cgroup_zoneinfo(pc);
    + * page cgroup is *unused* now....but....
    + * We can assume old mem_cgroup's metadata is still available
    + * because pc is not on stale LRU after force_empty() is called.
    + */
    + if (likely(!PageCgroupLRU(pc)))
    + pc->flags = pcg_default_flags[ctype];
    + else {
    + mz = page_cgroup_zoneinfo(pc);
    + spin_lock_irqsave(&mz->lru_lock, flags);
    + if (PageCgroupLRU(pc)) {
    + __mem_cgroup_remove_list(mz, pc);
    + ClearPageCgroupLRU(pc);
    + }
    + pc->flags = pcg_default_flags[ctype];
    + spin_unlock_irqrestore(&mz->lru_lock, flags);
    + }

    - spin_lock_irqsave(&mz->lru_lock, flags);
    - __mem_cgroup_add_list(mz, pc);
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - unlock_page_cgroup(pc);
    + pc->mem_cgroup = mem;
    + set_page_cgroup_lru(pc);
    preempt_enable();

    done:
    return 0;
    out:
    - css_put(&mem->css);
    return -ENOMEM;
    }

    @@ -830,12 +881,12 @@ __mem_cgroup_uncharge_common(struct page
    return;
    }
    ClearPageCgroupUsed(pc);
    + mem = pc->mem_cgroup;
    unlock_page_cgroup(pc);
    preempt_enable();
    + res_counter_uncharge(&mem->res, PAGE_SIZE);

    - mem = pc->mem_cgroup;
    release_page_cgroup(pc);
    - res_counter_uncharge(&mem->res, PAGE_SIZE);

    return;
    }


  10. [PATCH 9/12] memcg allocate all page_cgroup at boot

    Allocate all page_cgroup structs at boot and remove the page_cgroup pointer
    from struct page. This patch adds an interface:

    struct page_cgroup *lookup_page_cgroup(struct page *page)

    All of FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG are supported.

    Removing the page_cgroup pointer reduces the amount of memory by
    - 4 bytes per PAGE_SIZE, or
    - 8 bytes per PAGE_SIZE,
    if the memory controller is disabled (even if configured).
    The metadata usage of this is no problem in FLATMEM/DISCONTIGMEM.
    On SPARSEMEM, this doubles the size of mem_section[].

    On a usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
    On my x86-64 server with 48GB of memory, this saves 96MB of memory.
    (and uses xx kbytes for mem_section.)
    I think this reduction makes sense.

    By pre-allocation, kmalloc/kfree in charge/uncharge are removed.
    This means
    - we don't need to be afraid of kmalloc failure.
    (This can happen because of the gfp_mask type.)
    - we can avoid calling kmalloc/kfree.
    - we can avoid allocating tons of small objects which can cause fragmentation.
    - we know what amount of memory will be used for this extra LRU handling.

    I added printk messages:

    "allocated %ld bytes of page_cgroup"
    "please try cgroup_disable=memory option if you don't want"

    which are maybe informative enough for users.
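
    (Editorial sketch of what lookup_page_cgroup() boils down to, not the
    kernel code: a pre-allocated table of page_cgroup entries per node (or per
    section on SPARSEMEM), indexed by pfn with plain pointer arithmetic. The
    types and sizes below are illustrative userspace stand-ins.)

    #include <stdio.h>
    #include <stdlib.h>

    /* Cut-down page_cgroup for illustration. */
    struct demo_page_cgroup {
            unsigned long flags;
            unsigned long pfn;
    };

    static struct demo_page_cgroup *node_base;      /* node_page_cgroup */
    static unsigned long node_start_pfn = 0x1000;
    static unsigned long node_spanned_pages = 64;

    /* FLATMEM-style lookup: base + (pfn - node_start_pfn). */
    static struct demo_page_cgroup *demo_lookup_page_cgroup(unsigned long pfn)
    {
            return node_base + (pfn - node_start_pfn);
    }

    int main(void)
    {
            node_base = calloc(node_spanned_pages, sizeof(*node_base));
            if (!node_base)
                    return 1;
            for (unsigned long i = 0; i < node_spanned_pages; i++)
                    node_base[i].pfn = node_start_pfn + i;

            printf("pfn 0x1005 maps to the page_cgroup for pfn 0x%lx\n",
                   demo_lookup_page_cgroup(0x1005)->pfn);
            free(node_base);
            return 0;
    }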

    Signed-off-by: KAMEZAWA Hiroyuki

    include/linux/memcontrol.h | 11 -
    include/linux/mm_types.h | 4
    include/linux/mmzone.h | 9 +
    include/linux/page_cgroup.h | 90 +++++++++++++++
    mm/Makefile | 2
    mm/memcontrol.c | 258 ++++++++++++--------------------------------
    mm/page_alloc.c | 12 --
    mm/page_cgroup.c | 253 +++++++++++++++++++++++++++++++++++++++++++
    8 files changed, 431 insertions(+), 208 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/page_cgroup.c
    ===================================================================
    --- /dev/null
    +++ mmotm-2.6.27-rc7+/mm/page_cgroup.c
    @@ -0,0 +1,253 @@
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +
    +static void __meminit
    +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
    +{
    + pc->flags = 0;
    + pc->mem_cgroup = NULL;
    + pc->page = pfn_to_page(pfn);
    +}
    +static unsigned long total_usage = 0;
    +
    +#ifdef CONFIG_FLAT_NODE_MEM_MAP
    +
    +
    +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
    +{
    + pgdat->node_page_cgroup = NULL;
    +}
    +
    +struct page_cgroup *lookup_page_cgroup(struct page *page)
    +{
    + unsigned long pfn = page_to_pfn(page);
    + unsigned long offset;
    + struct page_cgroup *base;
    +
    + base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
    + if (unlikely(!base))
    + return NULL;
    +
    + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
    + return base + offset;
    +}
    +
    +static int __init alloc_node_page_cgroup(int nid)
    +{
    + struct page_cgroup *base, *pc;
    + unsigned long table_size;
    + unsigned long start_pfn, nr_pages, index;
    +
    + start_pfn = NODE_DATA(nid)->node_start_pfn;
    + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    +
    + table_size = sizeof(struct page_cgroup) * nr_pages;
    +
    + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
    + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
    + if (!base)
    + return -ENOMEM;
    + for (index = 0; index < nr_pages; index++) {
    + pc = base + index;
    + __init_page_cgroup(pc, start_pfn + index);
    + }
    + NODE_DATA(nid)->node_page_cgroup = base;
    + total_usage += table_size;
    + return 0;
    +}
    +
    +void __init free_node_page_cgroup(int nid)
    +{
    + unsigned long table_size;
    + unsigned long nr_pages;
    + struct page_cgroup *base;
    +
    + base = NODE_DATA(nid)->node_page_cgroup;
    + if (!base)
    + return;
    + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    +
    + table_size = sizeof(struct page_cgroup) * nr_pages;
    +
    + free_bootmem_node(NODE_DATA(nid),
    + (unsigned long)base, table_size);
    + NODE_DATA(nid)->node_page_cgroup = NULL;
    +}
    +
    +void __init page_cgroup_init(void)
    +{
    +
    + int nid, fail;
    +
    + for_each_online_node(nid) {
    + fail = alloc_node_page_cgroup(nid);
    + if (fail)
    + goto fail;
    + }
    + printk("allocated %ld bytes of page_cgroup\n", total_usage);
    + printk("please try cgroup_disable=memory option if you don't want\n");
    + return;
    +fail:
    + printk("allocation of page_cgroup was failed.\n");
    + printk("please try cgroup_disable=memory boot option\n");
    + panic("Out of memory");
    +}
    +
    +#else /* CONFIG_FLAT_NODE_MEM_MAP */
    +
    +struct page_cgroup *lookup_page_cgroup(struct page *page)
    +{
    + unsigned long pfn = page_to_pfn(page);
    + struct mem_section *section = __pfn_to_section(pfn);
    +
    + return section->page_cgroup + pfn;
    +}
    +
    +int __meminit init_section_page_cgroup(unsigned long pfn)
    +{
    + struct mem_section *section;
    + struct page_cgroup *base, *pc;
    + unsigned long table_size;
    + int nid, index;
    +
    + section = __pfn_to_section(pfn);
    +
    + if (section->page_cgroup)
    + return 0;
    +
    + nid = page_to_nid(pfn_to_page(pfn));
    +
    + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
    + base = kmalloc_node(table_size, GFP_KERNEL, nid);
    + if (!base)
    + base = vmalloc_node(table_size, nid);
    +
    + if (!base) {
    + printk("page cgroup allocation failure\n");
    + return -ENOMEM;
    + }
    +
    + for (index = 0; index < PAGES_PER_SECTION; index++) {
    + pc = base + index;
    + __init_page_cgroup(pc, pfn + index);
    + }
    +
    + section = __pfn_to_section(pfn);
    + section->page_cgroup = base - pfn;
    + total_usage += table_size;
    + return 0;
    +}
    +#ifdef CONFIG_MEMORY_HOTPLUG
    +void __free_page_cgroup(unsigned long pfn)
    +{
    + struct mem_section *ms;
    + struct page_cgroup *base;
    +
    + ms = __pfn_to_section(pfn);
    + if (!ms || !ms->page_cgroup)
    + return;
    + base = ms->page_cgroup + pfn;
    + ms->page_cgroup = NULL;
    + if (is_vmalloc_addr(base))
    + vfree(base);
    + else
    + kfree(base);
    +}
    +
    +int online_page_cgroup(unsigned long start_pfn,
    + unsigned long nr_pages,
    + int nid)
    +{
    + unsigned long start, end, pfn;
    + int fail = 0;
    +
    + start = start_pfn & ~(PAGES_PER_SECTION - 1);
    + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
    +
    + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
    + if (!pfn_present(pfn))
    + continue;
    + fail = init_section_page_cgroup(pfn);
    + }
    + if (!fail)
    + return 0;
    +
    + /* rollback */
    + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
    + __free_page_cgroup(pfn);
    +
    + return -ENOMEM;
    +}
    +
    +int offline_page_cgroup(unsigned long start_pfn,
    + unsigned long nr_pages, int nid)
    +{
    + unsigned long start, end, pfn;
    +
    + start = start_pfn & ~(PAGES_PER_SECTION - 1);
    + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
    +
    + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
    + __free_page_cgroup(pfn);
    + return 0;
    +
    +}
    +
    +static int page_cgroup_callback(struct notifier_block *self,
    + unsigned long action, void *arg)
    +{
    + struct memory_notify *mn = arg;
    + int ret = 0;
    + switch (action) {
    + case MEM_GOING_ONLINE:
    + ret = online_page_cgroup(mn->start_pfn,
    + mn->nr_pages, mn->status_change_nid);
    + break;
    + case MEM_CANCEL_ONLINE:
    + case MEM_OFFLINE:
    + offline_page_cgroup(mn->start_pfn,
    + mn->nr_pages, mn->status_change_nid);
    + break;
    + case MEM_GOING_OFFLINE:
    + break;
    + case MEM_ONLINE:
    + case MEM_CANCEL_OFFLINE:
    + break;
    + }
    + ret = notifier_from_errno(ret);
    + return ret;
    +}
    +
    +#endif
    +
    +void __init page_cgroup_init(void)
    +{
    + unsigned long pfn;
    + int fail = 0;
    +
    + for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
    + if (!pfn_present(pfn))
    + continue;
    + fail = init_section_page_cgroup(pfn);
    + }
    + if (fail) {
    + printk("please try cgroup_disable=memory boot option\n");
    + panic("Out of memory");
    + } else {
    + hotplug_memory_notifier(page_cgroup_callback, 0);
    + }
    + printk("allocated %ld bytes of page_cgroup\n", total_usage);
    + printk("please try cgroup_disable=memory option if you don't want\n");
    +}
    +
    +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
    +{
    + return;
    +}
    +
    +#endif
    Index: mmotm-2.6.27-rc7+/include/linux/mm_types.h
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/include/linux/mm_types.h
    +++ mmotm-2.6.27-rc7+/include/linux/mm_types.h
    @@ -94,10 +94,6 @@ struct page {
    void *virtual; /* Kernel virtual address (NULL if
    not kmapped, ie. highmem) */
    #endif /* WANT_PAGE_VIRTUAL */
    -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    - unsigned long page_cgroup;
    -#endif
    -
    #ifdef CONFIG_KMEMCHECK
    void *shadow;
    #endif
    Index: mmotm-2.6.27-rc7+/mm/Makefile
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/Makefile
    +++ mmotm-2.6.27-rc7+/mm/Makefile
    @@ -34,6 +34,6 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
    obj-$(CONFIG_MIGRATION) += migrate.o
    obj-$(CONFIG_SMP) += allocpercpu.o
    obj-$(CONFIG_QUICKLIST) += quicklist.o
    -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
    +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
    obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
    obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
    Index: mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    ===================================================================
    --- /dev/null
    +++ mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    @@ -0,0 +1,90 @@
    +#ifndef __LINUX_PAGE_CGROUP_H
    +#define __LINUX_PAGE_CGROUP_H
    +#include
    +/*
    + * Page Cgroup can be considered as an extended mem_map.
    + * A page_cgroup page is associated with every page descriptor. The
    + * page_cgroup helps us identify information about the cgroup
    + * All page cgroups are allocated at boot or memory hotplug event,
    + * then the page cgroup for pfn always exists.
    + */
    +struct page_cgroup {
    + unsigned long flags;
    + struct mem_cgroup *mem_cgroup;
    + struct page *page;
    + struct list_head lru; /* per cgroup LRU list */
    +};
    +
    +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat);
    +void __init page_cgroup_init(void);
    +struct page_cgroup *lookup_page_cgroup(struct page *page);
    +
    +enum {
    + /* flags for mem_cgroup */
    + PCG_LOCK, /* page cgroup is locked */
    + PCG_CACHE, /* charged as cache */
    + PCG_USED, /* this object is in use. */
    + /* flags for LRU placement */
    + PCG_ACTIVE, /* page is active in this cgroup */
    + PCG_FILE, /* page is file system backed */
    + PCG_UNEVICTABLE, /* page is unevictableable */
    +};
    +
    +#define TESTPCGFLAG(uname, lname) \
    +static inline int PageCgroup##uname(struct page_cgroup *pc) \
    + { return test_bit(PCG_##lname, &pc->flags); }
    +
    +#define SETPCGFLAG(uname, lname) \
    +static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
    + { set_bit(PCG_##lname, &pc->flags); }
    +
    +#define CLEARPCGFLAG(uname, lname) \
    +static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    + { clear_bit(PCG_##lname, &pc->flags); }
    +
    +/* Cache flag is set only once (at allocation) */
    +TESTPCGFLAG(Cache, CACHE)
    +
    +TESTPCGFLAG(Used, USED)
    +CLEARPCGFLAG(Used, USED)
    +
    +/* LRU management flags (from global-lru definition) */
    +TESTPCGFLAG(File, FILE)
    +SETPCGFLAG(File, FILE)
    +CLEARPCGFLAG(File, FILE)
    +
    +TESTPCGFLAG(Active, ACTIVE)
    +SETPCGFLAG(Active, ACTIVE)
    +CLEARPCGFLAG(Active, ACTIVE)
    +
    +TESTPCGFLAG(Unevictable, UNEVICTABLE)
    +SETPCGFLAG(Unevictable, UNEVICTABLE)
    +CLEARPCGFLAG(Unevictable, UNEVICTABLE)
    +
    +static inline int page_cgroup_nid(struct page_cgroup *pc)
    +{
    + return page_to_nid(pc->page);
    +}
    +
    +static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
    +{
    + return page_zonenum(pc->page);
    +}
    +
    +static inline void lock_page_cgroup(struct page_cgroup *pc)
    +{
    + bit_spin_lock(PCG_LOCK, &pc->flags);
    +}
    +
    +static inline int trylock_page_cgroup(struct page_cgroup *pc)
    +{
    + return bit_spin_trylock(PCG_LOCK, &pc->flags);
    +}
    +
    +static inline void unlock_page_cgroup(struct page_cgroup *pc)
    +{
    + bit_spin_unlock(PCG_LOCK, &pc->flags);
    +}
    +
    +
    +#endif
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -35,11 +35,11 @@
    #include
    #include
    #include
    +#include

    #include

    struct cgroup_subsys mem_cgroup_subsys __read_mostly;
    -static struct kmem_cache *page_cgroup_cache __read_mostly;
    #define MEM_CGROUP_RECLAIM_RETRIES 5

    /*
    @@ -139,80 +139,6 @@ static struct mem_cgroup init_mem_cgroup

    #define is_root_cgroup(cgrp) ((cgrp) == &init_mem_cgroup)

    -
    -/*
    - * We use the lower bit of the page->page_cgroup pointer as a bit spin
    - * lock. We need to ensure that page->page_cgroup is at least two
    - * byte aligned (based on comments from Nick Piggin). But since
    - * bit_spin_lock doesn't actually set that lock bit in a non-debug
    - * uniprocessor kernel, we should avoid setting it here too.
    - */
    -#define PAGE_CGROUP_LOCK_BIT 0x0
    -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
    -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
    -#else
    -#define PAGE_CGROUP_LOCK 0x0
    -#endif
    -
    -/*
    - * A page_cgroup page is associated with every page descriptor. The
    - * page_cgroup helps us identify information about the cgroup
    - */
    -struct page_cgroup {
    - struct list_head lru; /* per cgroup LRU list */
    - struct page *page;
    - struct mem_cgroup *mem_cgroup;
    - unsigned long flags;
    -};
    -
    -enum {
    - /* flags for mem_cgroup */
    - PCG_CACHE, /* charged as cache */
    - /* flags for LRU placement */
    - PCG_ACTIVE, /* page is active in this cgroup */
    - PCG_FILE, /* page is file system backed */
    - PCG_UNEVICTABLE, /* page is unevictableable */
    -};
    -
    -#define TESTPCGFLAG(uname, lname) \
    -static inline int PageCgroup##uname(struct page_cgroup *pc) \
    - { return test_bit(PCG_##lname, &pc->flags); }
    -
    -#define SETPCGFLAG(uname, lname) \
    -static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
    - { set_bit(PCG_##lname, &pc->flags); }
    -
    -#define CLEARPCGFLAG(uname, lname) \
    -static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    - { clear_bit(PCG_##lname, &pc->flags); }
    -
    -
    -/* Cache flag is set only once (at allocation) */
    -TESTPCGFLAG(Cache, CACHE)
    -
    -/* LRU management flags (from global-lru definition) */
    -TESTPCGFLAG(File, FILE)
    -SETPCGFLAG(File, FILE)
    -CLEARPCGFLAG(File, FILE)
    -
    -TESTPCGFLAG(Active, ACTIVE)
    -SETPCGFLAG(Active, ACTIVE)
    -CLEARPCGFLAG(Active, ACTIVE)
    -
    -TESTPCGFLAG(Unevictable, UNEVICTABLE)
    -SETPCGFLAG(Unevictable, UNEVICTABLE)
    -CLEARPCGFLAG(Unevictable, UNEVICTABLE)
    -
    -static int page_cgroup_nid(struct page_cgroup *pc)
    -{
    - return page_to_nid(pc->page);
    -}
    -
    -static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
    -{
    - return page_zonenum(pc->page);
    -}
    -
    enum charge_type {
    MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
    MEM_CGROUP_CHARGE_TYPE_MAPPED,
    @@ -223,9 +149,9 @@ enum charge_type {

    static const unsigned long
    pcg_default_flags[NR_CHARGE_TYPE] = {
    - ((1 << PCG_CACHE) | (1 << PCG_FILE)),
    - ((1 << PCG_ACTIVE)),
    - ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
    + (1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
    + (1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
    + (1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
    0,
    };

    @@ -308,37 +234,6 @@ struct mem_cgroup *mem_cgroup_from_task(
    struct mem_cgroup, css);
    }

    -static inline int page_cgroup_locked(struct page *page)
    -{
    - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
    -{
    - VM_BUG_ON(!page_cgroup_locked(page));
    - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
    -}
    -
    -struct page_cgroup *page_get_page_cgroup(struct page *page)
    -{
    - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
    -}
    -
    -static void lock_page_cgroup(struct page *page)
    -{
    - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    -static int try_lock_page_cgroup(struct page *page)
    -{
    - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    -static void unlock_page_cgroup(struct page *page)
    -{
    - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
    struct page_cgroup *pc)
    {
    @@ -442,22 +337,19 @@ void mem_cgroup_move_lists(struct page *
    * safely get to page_cgroup without it, so just try_lock it:
    * mem_cgroup_isolate_pages allows for page left on wrong list.
    */
    - if (!try_lock_page_cgroup(page))
    + pc = lookup_page_cgroup(page);
    +
    + if (!trylock_page_cgroup(pc))
    return;

    - pc = page_get_page_cgroup(page);
    - if (pc) {
    + if (PageCgroupUsed(pc)) {
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    - /*
    - * check against the race with move_account.
    - */
    - if (likely(mem == pc->mem_cgroup))
    - __mem_cgroup_move_lists(pc, lru);
    + __mem_cgroup_move_lists(pc, lru);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    }
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);
    }

    /*
    @@ -544,6 +436,8 @@ unsigned long mem_cgroup_isolate_pages(u
    list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
    if (scan >= nr_to_scan)
    break;
    + if (unlikely(!PageCgroupUsed(pc)))
    + continue;
    page = pc->page;

    if (unlikely(!PageLRU(page)))
    @@ -611,12 +505,12 @@ int mem_cgroup_move_account(struct page
    /* Now, we assume no_limit...no failure here. */
    return ret;
    }
    - if (!try_lock_page_cgroup(page)) {
    + if (!trylock_page_cgroup(pc)) {
    res_counter_uncharge(&to->res, PAGE_SIZE);
    return ret;
    }

    - if (page_get_page_cgroup(page) != pc) {
    + if (!PageCgroupUsed(pc)) {
    res_counter_uncharge(&to->res, PAGE_SIZE);
    goto out;
    }
    @@ -634,7 +528,7 @@ int mem_cgroup_move_account(struct page
    res_counter_uncharge(&to->res, PAGE_SIZE);
    }
    out:
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);

    return ret;
    }
    @@ -651,26 +545,27 @@ static int mem_cgroup_charge_common(stru
    {
    struct mem_cgroup *mem;
    struct page_cgroup *pc;
    - unsigned long flags;
    unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
    struct mem_cgroup_per_zone *mz;
    + unsigned long flags;

    - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
    - if (unlikely(pc == NULL))
    - goto err;
    -
    + pc = lookup_page_cgroup(page);
    + /* can happen at boot */
    + if (unlikely(!pc))
    + return 0;
    + prefetchw(pc);
    /*
    * We always charge the cgroup the mm_struct belongs to.
    * The mm_struct's mem_cgroup changes on task migration if the
    * thread group leader migrates. It's possible that mm is not
    * set, if so charge the init_mm (happens for pagecache usage).
    */
    +
    if (likely(!memcg)) {
    rcu_read_lock();
    mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
    if (unlikely(!mem)) {
    rcu_read_unlock();
    - kmem_cache_free(page_cgroup_cache, pc);
    return 0;
    }
    /*
    @@ -706,36 +601,34 @@ static int mem_cgroup_charge_common(stru
    }
    }

    + preempt_disable();
    + lock_page_cgroup(pc);
    + if (unlikely(PageCgroupUsed(pc))) {
    + unlock_page_cgroup(pc);
    + res_counter_uncharge(&mem->res, PAGE_SIZE);
    + css_put(&mem->css);
    + preempt_enable();
    + goto done;
    + }
    pc->mem_cgroup = mem;
    - pc->page = page;
    /*
    * If a page is accounted as a page cache, insert to inactive list.
    * If anon, insert to active list.
    */
    pc->flags = pcg_default_flags[ctype];

    - lock_page_cgroup(page);
    - if (unlikely(page_get_page_cgroup(page))) {
    - unlock_page_cgroup(page);
    - res_counter_uncharge(&mem->res, PAGE_SIZE);
    - css_put(&mem->css);
    - kmem_cache_free(page_cgroup_cache, pc);
    - goto done;
    - }
    - page_assign_page_cgroup(page, pc);
    -
    mz = page_cgroup_zoneinfo(pc);
    +
    spin_lock_irqsave(&mz->lru_lock, flags);
    __mem_cgroup_add_list(mz, pc);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    + unlock_page_cgroup(pc);
    + preempt_enable();

    - unlock_page_cgroup(page);
    done:
    return 0;
    out:
    css_put(&mem->css);
    - kmem_cache_free(page_cgroup_cache, pc);
    -err:
    return -ENOMEM;
    }

    @@ -743,7 +636,8 @@ int mem_cgroup_charge(struct page *page,
    {
    if (mem_cgroup_subsys.disabled)
    return 0;
    -
    + if (PageCompound(page))
    + return 0;
    /*
    * If already mapped, we don't have to account.
    * If page cache, page->mapping has address_space.
    @@ -764,7 +658,8 @@ int mem_cgroup_cache_charge(struct page
    {
    if (mem_cgroup_subsys.disabled)
    return 0;
    -
    + if (PageCompound(page))
    + return 0;
    /*
    * Corner case handling. This is called from add_to_page_cache()
    * in usual. But some FS (shmem) precharges this page before calling it
    @@ -777,15 +672,16 @@ int mem_cgroup_cache_charge(struct page
    if (!(gfp_mask & __GFP_WAIT)) {
    struct page_cgroup *pc;

    - lock_page_cgroup(page);
    - pc = page_get_page_cgroup(page);
    - if (pc) {
    - VM_BUG_ON(pc->page != page);
    - VM_BUG_ON(!pc->mem_cgroup);
    - unlock_page_cgroup(page);
    +
    + pc = lookup_page_cgroup(page);
    + if (!pc)
    + return 0;
    + lock_page_cgroup(pc);
    + if (PageCgroupUsed(pc)) {
    + unlock_page_cgroup(pc);
    return 0;
    }
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);
    }

    if (unlikely(!mm))
    @@ -812,49 +708,41 @@ __mem_cgroup_uncharge_common(struct page

    if (mem_cgroup_subsys.disabled)
    return;
    + /* check the condition we can know from page */

    - /*
    - * Check if our page_cgroup is valid
    - */
    - lock_page_cgroup(page);
    - pc = page_get_page_cgroup(page);
    - if (unlikely(!pc))
    - goto unlock;
    -
    - VM_BUG_ON(pc->page != page);
    + pc = lookup_page_cgroup(page);
    + if (unlikely(!pc || !PageCgroupUsed(pc)))
    + return;
    + preempt_disable();
    + lock_page_cgroup(pc);
    + if (unlikely(page_mapped(page))) {
    + unlock_page_cgroup(pc);
    + preempt_enable();
    + return;
    + }
    + ClearPageCgroupUsed(pc);
    + unlock_page_cgroup(pc);

    - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    - && ((PageCgroupCache(pc) || page_mapped(page))))
    - goto unlock;
    -retry:
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    +
    spin_lock_irqsave(&mz->lru_lock, flags);
    - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    - unlikely(mem != pc->mem_cgroup)) {
    - /* MAPPED account can be done without lock_page().
    - Check race with mem_cgroup_move_account() */
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - goto retry;
    - }
    __mem_cgroup_remove_list(mz, pc);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    -
    - page_assign_page_cgroup(page, NULL);
    - unlock_page_cgroup(page);
    -
    -
    - res_counter_uncharge(&mem->res, PAGE_SIZE);
    + pc->mem_cgroup = NULL;
    css_put(&mem->css);
    + preempt_enable();
    + res_counter_uncharge(&mem->res, PAGE_SIZE);

    - kmem_cache_free(page_cgroup_cache, pc);
    return;
    -unlock:
    - unlock_page_cgroup(page);
    }

    void mem_cgroup_uncharge_page(struct page *page)
    {
    + if (page_mapped(page))
    + return;
    + if (page->mapping && !PageAnon(page))
    + return;
    __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
    }

    @@ -878,9 +766,9 @@ int mem_cgroup_prepare_migration(struct
    if (mem_cgroup_subsys.disabled)
    return 0;

    - lock_page_cgroup(page);
    - pc = page_get_page_cgroup(page);
    - if (pc) {
    + pc = lookup_page_cgroup(page);
    + lock_page_cgroup(pc);
    + if (PageCgroupUsed(pc)) {
    mem = pc->mem_cgroup;
    css_get(&mem->css);
    if (PageCgroupCache(pc)) {
    @@ -890,7 +778,7 @@ int mem_cgroup_prepare_migration(struct
    ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
    }
    }
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);
    if (mem) {
    ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
    ctype, mem);
    @@ -997,6 +885,8 @@ static void mem_cgroup_force_empty_list(
    spin_lock_irqsave(&mz->lru_lock, flags);
    list_for_each_entry_safe(pc, tmp, list, lru) {
    page = pc->page;
    + if (!PageCgroupUsed(pc))
    + continue;
    /* For avoiding race with speculative page cache handling. */
    if (!PageLRU(page) || !get_page_unless_zero(page)) {
    continue;
    @@ -1270,8 +1160,8 @@ mem_cgroup_create(struct cgroup_subsys *
    int node;

    if (unlikely((cont->parent) == NULL)) {
    + page_cgroup_init();
    mem = &init_mem_cgroup;
    - page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
    } else {
    mem = mem_cgroup_alloc();
    if (!mem)
    Index: mmotm-2.6.27-rc7+/mm/page_alloc.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/page_alloc.c
    +++ mmotm-2.6.27-rc7+/mm/page_alloc.c
    @@ -44,7 +44,7 @@
    #include
    #include
    #include
    -#include
    +#include
    #include

    #include
    @@ -223,17 +223,12 @@ static inline int bad_range(struct zone

    static void bad_page(struct page *page)
    {
    - void *pc = page_get_page_cgroup(page);
    -
    printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
    "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
    current->comm, page, (int)(2*sizeof(unsigned long)),
    (unsigned long)page->flags, page->mapping,
    page_mapcount(page), page_count(page));
    - if (pc) {
    - printk(KERN_EMERG "cgroup:%p\n", pc);
    - page_reset_bad_cgroup(page);
    - }
    +
    printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
    KERN_EMERG "Backtrace:\n");
    dump_stack();
    @@ -472,7 +467,6 @@ static inline void free_pages_check(stru
    free_page_mlock(page);
    if (unlikely(page_mapcount(page) |
    (page->mapping != NULL) |
    - (page_get_page_cgroup(page) != NULL) |
    (page_count(page) != 0) |
    (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
    bad_page(page);
    @@ -609,7 +603,6 @@ static void prep_new_page(struct page *p
    {
    if (unlikely(page_mapcount(page) |
    (page->mapping != NULL) |
    - (page_get_page_cgroup(page) != NULL) |
    (page_count(page) != 0) |
    (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
    bad_page(page);
    @@ -3495,6 +3488,7 @@ static void __paginginit free_area_init_
    pgdat->nr_zones = 0;
    init_waitqueue_head(&pgdat->kswapd_wait);
    pgdat->kswapd_max_order = 0;
    + pgdat_page_cgroup_init(pgdat);

    for (j = 0; j < MAX_NR_ZONES; j++) {
    struct zone *zone = pgdat->node_zones + j;
    Index: mmotm-2.6.27-rc7+/include/linux/mmzone.h
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/include/linux/mmzone.h
    +++ mmotm-2.6.27-rc7+/include/linux/mmzone.h
    @@ -604,6 +604,9 @@ typedef struct pglist_data {
    int nr_zones;
    #ifdef CONFIG_FLAT_NODE_MEM_MAP
    struct page *node_mem_map;
    +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    + struct page_cgroup *node_page_cgroup;
    +#endif
    #endif
    struct bootmem_data *bdata;
    #ifdef CONFIG_MEMORY_HOTPLUG
    @@ -932,6 +935,7 @@ static inline unsigned long early_pfn_to
    #endif

    struct page;
    +struct page_cgroup;
    struct mem_section {
    /*
    * This is, logically, a pointer to an array of struct
    @@ -949,6 +953,11 @@ struct mem_section {

    /* See declaration of similar field in struct zone */
    unsigned long *pageblock_flags;
    +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    + /* see page_cgroup.h */
    + struct page_cgroup *page_cgroup;
    + unsigned long pad;
    +#endif
    };

    #ifdef CONFIG_SPARSEMEM_EXTREME
    Index: mmotm-2.6.27-rc7+/include/linux/memcontrol.h
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/include/linux/memcontrol.h
    +++ mmotm-2.6.27-rc7+/include/linux/memcontrol.h
    @@ -29,7 +29,6 @@ struct mm_struct;

    #define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0)

    -extern struct page_cgroup *page_get_page_cgroup(struct page *page);
    extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
    gfp_t gfp_mask);
    extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
    @@ -72,16 +71,8 @@ extern void mem_cgroup_record_reclaim_pr
    extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
    int priority, enum lru_list lru);

    -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
    -static inline void page_reset_bad_cgroup(struct page *page)
    -{
    -}
    -
    -static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
    -{
    - return NULL;
    -}

    +#else /* CONFIG_CGROUP_MEM_RES_CTLR */
    static inline int mem_cgroup_charge(struct page *page,
    struct mm_struct *mm, gfp_t gfp_mask)
    {


  11. [PATCH 10/12] memcg free page_cgroup from LRU in lazy way

    Free page_cgroup from its LRU in a batched manner.

    When uncharge() is called, the page_cgroup is pushed onto a per-cpu vector and
    removed from the LRU later. This routine resembles the global LRU's pagevec.
    This patch is one half of the whole change and forms a set with the following
    lazy-LRU-add patch.
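
    In short: uncharge only queues the page_cgroup on a per-cpu vector; the drain
    pass then walks the vector and re-takes each per-zone lru_lock once per batch
    instead of once per page. A condensed sketch, using the names from the hunk
    below (not the full patch):

	static void release_page_cgroup(struct page_cgroup *pc)
	{
		struct memcg_percpu_vec *mpv;

		mpv = &get_cpu_var(memcg_free_vec);
		mpv->vec[mpv->nr++] = pc;		/* LRU removal is deferred */
		if (mpv->nr >= mpv->limit)
			__release_page_cgroup(mpv);	/* drains the batch under lru_lock */
		put_cpu_var(memcg_free_vec);
	}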


    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memcontrol.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++ +-----
    1 file changed, 152 insertions(+), 12 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -36,6 +36,7 @@
    #include
    #include
    #include
    +#include

    #include

    @@ -533,6 +534,116 @@ out:
    return ret;
    }

    +
    +#define MEMCG_PCPVEC_SIZE (14) /* size of pagevec */
    +struct memcg_percpu_vec {
    + int nr;
    + int limit;
    + struct page_cgroup *vec[MEMCG_PCPVEC_SIZE];
    +};
    +static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_free_vec);
    +
    +static void
    +__release_page_cgroup(struct memcg_percpu_vec *mpv)
    +{
    + unsigned long flags;
    + struct mem_cgroup_per_zone *mz, *prev_mz;
    + struct page_cgroup *pc;
    + int i, nr;
    +
    + local_irq_save(flags);
    + nr = mpv->nr;
    + mpv->nr = 0;
    + prev_mz = NULL;
    + for (i = nr - 1; i >= 0; i--) {
    + pc = mpv->vec[i];
    + VM_BUG_ON(PageCgroupUsed(pc));
    + mz = page_cgroup_zoneinfo(pc);
    + if (prev_mz != mz) {
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + prev_mz = mz;
    + spin_lock(&mz->lru_lock);
    + }
    + __mem_cgroup_remove_list(mz, pc);
    + css_put(&pc->mem_cgroup->css);
    + pc->mem_cgroup = NULL;
    + }
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + local_irq_restore(flags);
    +
    +}
    +
    +static void
    +release_page_cgroup(struct page_cgroup *pc)
    +{
    + struct memcg_percpu_vec *mpv;
    +
    + mpv = &get_cpu_var(memcg_free_vec);
    + mpv->vec[mpv->nr++] = pc;
    + if (mpv->nr >= mpv->limit)
    + __release_page_cgroup(mpv);
    + put_cpu_var(memcg_free_vec);
    +}
    +
    +static void page_cgroup_start_cache_cpu(int cpu)
    +{
    + struct memcg_percpu_vec *mpv;
    + mpv = &per_cpu(memcg_free_vec, cpu);
    + mpv->limit = MEMCG_PCPVEC_SIZE;
    +}
    +
    +#ifdef CONFIG_HOTPLUG_CPU
    +static void page_cgroup_stop_cache_cpu(int cpu)
    +{
    + struct memcg_percpu_vec *mpv;
    + mpv = &per_cpu(memcg_free_vec, cpu);
    + mpv->limit = 0;
    +}
    +#endif
    +
    +
    +/*
    + * Used when freeing memory resource controller to remove all
    + * page_cgroup (in obsolete list).
    + */
    +static DEFINE_MUTEX(memcg_force_drain_mutex);
    +
    +static void drain_page_cgroup_local(struct work_struct *work)
    +{
    + struct memcg_percpu_vec *mpv;
    + mpv = &get_cpu_var(memcg_free_vec);
    + __release_page_cgroup(mpv);
    + put_cpu_var(mpv);
    +}
    +
    +static void drain_page_cgroup_cpu(int cpu)
    +{
    + int local_cpu;
    + struct work_struct work;
    +
    + local_cpu = get_cpu();
    + if (local_cpu == cpu) {
    + drain_page_cgroup_local(NULL);
    + put_cpu();
    + return;
    + }
    + put_cpu();
    +
    + INIT_WORK(&work, drain_page_cgroup_local);
    + schedule_work_on(cpu, &work);
    + flush_work(&work);
    +}
    +
    +static void drain_page_cgroup_all(void)
    +{
    + mutex_lock(&memcg_force_drain_mutex);
    + schedule_on_each_cpu(drain_page_cgroup_local);
    + mutex_unlock(&memcg_force_drain_mutex);
    +}
    +
    +
    /*
    * Charge the memory controller for page usage.
    * Return
    @@ -703,8 +814,6 @@ __mem_cgroup_uncharge_common(struct page
    {
    struct page_cgroup *pc;
    struct mem_cgroup *mem;
    - struct mem_cgroup_per_zone *mz;
    - unsigned long flags;

    if (mem_cgroup_subsys.disabled)
    return;
    @@ -722,16 +831,10 @@ __mem_cgroup_uncharge_common(struct page
    }
    ClearPageCgroupUsed(pc);
    unlock_page_cgroup(pc);
    + preempt_enable();

    mem = pc->mem_cgroup;
    - mz = page_cgroup_zoneinfo(pc);
    -
    - spin_lock_irqsave(&mz->lru_lock, flags);
    - __mem_cgroup_remove_list(mz, pc);
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - pc->mem_cgroup = NULL;
    - css_put(&mem->css);
    - preempt_enable();
    + release_page_cgroup(pc);
    res_counter_uncharge(&mem->res, PAGE_SIZE);

    return;
    @@ -888,9 +991,8 @@ static void mem_cgroup_force_empty_list(
    if (!PageCgroupUsed(pc))
    continue;
    /* For avoiding race with speculative page cache handling. */
    - if (!PageLRU(page) || !get_page_unless_zero(page)) {
    + if (!PageLRU(page) || !get_page_unless_zero(page))
    continue;
    - }
    mem_cgroup_move_account(page, pc, mem, &init_mem_cgroup);
    put_page(page);
    if (atomic_read(&mem->css.cgroup->count) > 0)
    @@ -927,8 +1029,10 @@ static int mem_cgroup_force_empty(struct
    * While walking our own LRU, we also checks LRU bit on page.
    * If a page is on pagevec, it's not on LRU and we cannot
    * grab it. Calling lru_add_drain_all() here.
    + * memory cgroup's its own vector shold be flushed, too.
    */
    lru_add_drain_all();
    + drain_page_cgroup_all();
    for_each_node_state(node, N_HIGH_MEMORY) {
    for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    mz = mem_cgroup_zoneinfo(mem, node, zid);
    @@ -1152,6 +1256,38 @@ static void mem_cgroup_free(struct mem_c
    vfree(mem);
    }

    +static void mem_cgroup_init_pcp(int cpu)
    +{
    + page_cgroup_start_cache_cpu(cpu);
    +}
    +
    +static int cpu_memcgroup_callback(struct notifier_block *nb,
    + unsigned long action, void *hcpu)
    +{
    + int cpu = (long)hcpu;
    +
    + switch(action) {
    + case CPU_UP_PREPARE:
    + case CPU_UP_PREPARE_FROZEN:
    + mem_cgroup_init_pcp(cpu);
    + break;
    +#ifdef CONFIG_HOTPLUG_CPU
    + case CPU_DOWN_PREPARE:
    + case CPU_DOWN_PREPARE_FROZEN:
    + page_cgroup_stop_cache_cpu(cpu);
    + drain_page_cgroup_cpu(cpu);
    + break;
    +#endif
    + default:
    + break;
    + }
    + return NOTIFY_OK;
    +}
    +
    +static struct notifier_block __refdata memcgroup_nb =
    +{
    + .notifier_call = cpu_memcgroup_callback,
    +};

    static struct cgroup_subsys_state *
    mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
    @@ -1162,6 +1298,10 @@ mem_cgroup_create(struct cgroup_subsys *
    if (unlikely((cont->parent) == NULL)) {
    page_cgroup_init();
    mem = &init_mem_cgroup;
    + cpu_memcgroup_callback(&memcgroup_nb,
    + (unsigned long)CPU_UP_PREPARE,
    + (void *)(long)smp_processor_id());
    + register_hotcpu_notifier(&memcgroup_nb);
    } else {
    mem = mem_cgroup_alloc();
    if (!mem)


  12. [PATCH 7/12] memcg add function to move account

    This patch provides a function to move account information of a page between
    mem_cgroups.

    This moving of page_cgroup is done with
    - the lru_lock of the source/destination mem_cgroup held, and
    - lock_page_cgroup() held.

    Then, a routine which touches pc->mem_cgroup without lock_page_cgroup() should
    confirm whether pc->mem_cgroup is still valid. Typical code looks like the following.

    (while the page is not under lock_page())
	mem = pc->mem_cgroup;
	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	if (pc->mem_cgroup == mem)
		..... /* some list handling */
	spin_unlock_irqrestore(&mz->lru_lock, flags);

    Or the better way is
	lock_page_cgroup(pc);
	....
	unlock_page_cgroup(pc);

    But you should check the lock nesting and avoid deadlock.
    (trylock is better if it's acceptable.)

    If you found the page_cgroup on the mem_cgroup's LRU while holding mz->lru_lock,
    you don't have to worry about what pc->mem_cgroup points to.
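
    For a concrete user, see the mem_cgroup_move_lists() hunk below, which combines
    the trylock form with the lru_lock re-check. Condensed here (same names as in
    this patch set; the real hunk is authoritative):

	if (!try_lock_page_cgroup(page))	/* avoid deadlock: give up on contention */
		return;
	pc = page_get_page_cgroup(page);
	if (pc) {
		mem = pc->mem_cgroup;
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		if (likely(mem == pc->mem_cgroup))	/* re-check against move_account() */
			__mem_cgroup_move_lists(pc, lru);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(page);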

    Changelog: (v4) -> (v5)
    - check for lock_page() is removed.
    - rewrote description.

    Changelog: (v2) -> (v4)
    - added lock_page_cgroup().
    - split out from the new force_empty patch.
    - added how-to-use text.
    - fixed race in __mem_cgroup_uncharge_common().

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memcontrol.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ ++++--
    1 file changed, 81 insertions(+), 3 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -426,6 +426,7 @@ int task_in_mem_cgroup(struct task_struc
    void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
    {
    struct page_cgroup *pc;
    + struct mem_cgroup *mem;
    struct mem_cgroup_per_zone *mz;
    unsigned long flags;

    @@ -444,9 +445,14 @@ void mem_cgroup_move_lists(struct page *

    pc = page_get_page_cgroup(page);
    if (pc) {
    + mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    - __mem_cgroup_move_lists(pc, lru);
    + /*
    + * check against the race with move_account.
    + */
    + if (likely(mem == pc->mem_cgroup))
    + __mem_cgroup_move_lists(pc, lru);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    }
    unlock_page_cgroup(page);
    @@ -567,6 +573,70 @@ unsigned long mem_cgroup_isolate_pages(u
    return nr_taken;
    }

    +/**
    + * mem_cgroup_move_account - move account of the page
    + * @page ... the target page of being moved.
    + * @pc ... page_cgroup of the page.
    + * @from ... mem_cgroup which the page is moved from.
    + * @to ... mem_cgroup which the page is moved to.
    + *
    + * The caller must confirm following.
    + * 1. disable irq.
    + * 2. lru_lock of old mem_cgroup should be held.
    + * 3. pc is guaranteed to be valid and on mem_cgroup's LRU.
    + *
    + * Because we cannot call try_to_free_page() here, the caller must guarantee
    + * this moving of charge never fails. (if charge fails, this call fails.)
    + * Currently this is called only against root cgroup.
    + * which has no limitation of resource.
    + * Returns 0 at success, returns 1 at failure.
    + */
    +int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc,
    + struct mem_cgroup *from, struct mem_cgroup *to)
    +{
    + struct mem_cgroup_per_zone *from_mz, *to_mz;
    + int nid, zid;
    + int ret = 1;
    +
    + VM_BUG_ON(!irqs_disabled());
    +
    + nid = page_to_nid(page);
    + zid = page_zonenum(page);
    + from_mz = mem_cgroup_zoneinfo(from, nid, zid);
    + to_mz = mem_cgroup_zoneinfo(to, nid, zid);
    +
    + if (res_counter_charge(&to->res, PAGE_SIZE)) {
    + /* Now, we assume no_limit...no failure here. */
    + return ret;
    + }
    + if (!try_lock_page_cgroup(page)) {
    + res_counter_uncharge(&to->res, PAGE_SIZE);
    + return ret;
    + }
    +
    + if (page_get_page_cgroup(page) != pc) {
    + res_counter_uncharge(&to->res, PAGE_SIZE);
    + goto out;
    + }
    +
    + if (spin_trylock(&to_mz->lru_lock)) {
    + __mem_cgroup_remove_list(from_mz, pc);
    + css_put(&from->css);
    + res_counter_uncharge(&from->res, PAGE_SIZE);
    + pc->mem_cgroup = to;
    + css_get(&to->css);
    + __mem_cgroup_add_list(to_mz, pc);
    + ret = 0;
    + spin_unlock(&to_mz->lru_lock);
    + } else {
    + res_counter_uncharge(&to->res, PAGE_SIZE);
    + }
    +out:
    + unlock_page_cgroup(page);
    +
    + return ret;
    +}
    +
    /*
    * Charge the memory controller for page usage.
    * Return
    @@ -754,16 +824,24 @@ __mem_cgroup_uncharge_common(struct page
    if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    && ((PageCgroupCache(pc) || page_mapped(page))))
    goto unlock;
    -
    +retry:
    + mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    + if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    + unlikely(mem != pc->mem_cgroup)) {
    + /* MAPPED account can be done without lock_page().
    + Check race with mem_cgroup_move_account() */
    + spin_unlock_irqrestore(&mz->lru_lock, flags);
    + goto retry;
    + }
    __mem_cgroup_remove_list(mz, pc);
    spin_unlock_irqrestore(&mz->lru_lock, flags);

    page_assign_page_cgroup(page, NULL);
    unlock_page_cgroup(page);

    - mem = pc->mem_cgroup;
    +
    res_counter_uncharge(&mem->res, PAGE_SIZE);
    css_put(&mem->css);



  13. [PATCH 12/12] memcg: fix race at charging swap-in

    There is a small race in do_swap_page(). When the swapped-in page is charged,
    its mapcount can be greater than 0. But at the same time, some process sharing
    it may call unmap, drop the mapcount from 1 to 0, and the page gets uncharged.

    To fix this, I added a new interface.
    - precharge
      account PAGE_SIZE to the res_counter and try to free pages if necessary.
    - commit
      register the page_cgroup and add it to the LRU if necessary.
    - cancel
      uncharge PAGE_SIZE because do_swap_page() failed.

    This protocol uses the PCG_USED bit on page_cgroup to avoid over-accounting.
    The usual mem_cgroup_charge_common() does precharge -> commit in one go.

    This precharge/commit/cancel scheme is useful and can be used in other places:
    - shmem (and other places that need precharge)
    - migration
    - move_account (force_empty)
    etc. We'll revisit these later. A condensed sketch of the swap-in usage follows.
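
    The do_swap_page() hunk below is the first user. Condensed, with error paths
    trimmed (names as in this patch), the call sequence is:

	struct mem_cgroup *ptr = NULL;

	/* 1. precharge: reserve PAGE_SIZE against mm's cgroup (may reclaim) */
	if (mem_cgroup_precharge(mm, GFP_KERNEL, &ptr) == -ENOMEM)
		return VM_FAULT_OOM;

	/* ... install the pte, page_add_anon_rmap(), ... */

	/* 2. commit: set PCG_USED on the page_cgroup and put it on the LRU */
	mem_cgroup_commit_charge_swap(page, ptr);

	/* 3. or, on a failure path instead, give the reserved PAGE_SIZE back */
	mem_cgroup_cancel_charge_swap(ptr);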

    Signed-off-by: KAMEZAWA Hiroyuki

    include/linux/memcontrol.h | 21 +++++++
    mm/memcontrol.c | 135 +++++++++++++++++++++++++++++++--------------
    mm/memory.c | 6 +-
    3 files changed, 120 insertions(+), 42 deletions(-)

    Index: mmotm-2.6.27-rc7+/include/linux/memcontrol.h
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/include/linux/memcontrol.h
    +++ mmotm-2.6.27-rc7+/include/linux/memcontrol.h
    @@ -31,6 +31,13 @@ struct mm_struct;

    extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
    gfp_t gfp_mask);
    +/* for swap handling */
    +extern int mem_cgroup_precharge(struct mm_struct *mm,
    + gfp_t gfp_mask, struct mem_cgroup **ptr);
    +extern void mem_cgroup_commit_charge_swap(struct page *page,
    + struct mem_cgroup *ptr);
    +extern void mem_cgroup_cancel_charge_swap(struct mem_cgroup *ptr);
    +
    extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
    gfp_t gfp_mask);
    extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
    @@ -85,6 +92,20 @@ static inline int mem_cgroup_cache_charg
    return 0;
    }

    +static int mem_cgroup_precharge(struct mm_struct *mm,
    + gfp_t gfp_mask, struct mem_cgroup **ptr)
    +{
    + return 0;
    +}
    +
    +static void mem_cgroup_commit_charge_swap(struct page *page,
    + struct mem_cgroup *ptr)
    +{
    +}
    +static void mem_cgroup_cancel_charge_swap(struct mem_cgroup *ptr)
    +{
    +}
    +
    static inline void mem_cgroup_uncharge_page(struct page *page)
    {
    }
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -698,52 +698,44 @@ static void drain_page_cgroup_all(void)


    /*
    - * Charge the memory controller for page usage.
    - * Return
    - * 0 if the charge was successful
    - * < 0 if the cgroup is over its limit
    + * charge against mem_cgroup linked to this mm. (or *ptr)
    + *
    + * This just charge PAGE_SIZE and reduce memory usage if necessary.
    + *
    + * Pages on radix-tree is charged at radix-tree add/remove under lock.
    + * new pages are charged at allocation and both are guaranteed to be that
    + * there are no racy users. We does precharge->commit at once.
    + *
    + * About swapcache, we can't trust page->mapcount until it's mapped.
    + * Then we do precharge before map and commit/cancel after the mapping is
    + * established. (see below, we have commit_swap and cancel_swap)
    */
    -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
    - gfp_t gfp_mask, enum charge_type ctype,
    - struct mem_cgroup *memcg)
    +
    +int mem_cgroup_precharge(struct mm_struct *mm,
    + gfp_t mask, struct mem_cgroup **ptr)
    {
    struct mem_cgroup *mem;
    - struct page_cgroup *pc;
    - unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
    - struct mem_cgroup_per_zone *mz;
    - unsigned long flags;
    -
    - pc = lookup_page_cgroup(page);
    - /* can happen at boot */
    - if (unlikely(!pc))
    - return 0;
    - prefetchw(pc);
    - /*
    - * We always charge the cgroup the mm_struct belongs to.
    - * The mm_struct's mem_cgroup changes on task migration if the
    - * thread group leader migrates. It's possible that mm is not
    - * set, if so charge the init_mm (happens for pagecache usage).
    - */
    + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

    - if (likely(!memcg)) {
    - rcu_read_lock();
    + rcu_read_lock();
    + if (!*ptr) {
    mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
    if (unlikely(!mem)) {
    rcu_read_unlock();
    - return 0;
    + return -ESRCH;
    }
    - rcu_read_unlock();
    + *ptr = mem;
    } else {
    - mem = memcg;
    + mem = *ptr;
    }
    + rcu_read_unlock();

    + css_get(&mem->css);
    while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
    - if (!(gfp_mask & __GFP_WAIT))
    - goto out;
    -
    - if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
    + if (!(mask & __GFP_WAIT))
    + goto nomem;
    + if (try_to_free_mem_cgroup_pages(mem, mask))
    continue;
    -
    /*
    * try_to_free_mem_cgroup_pages() might not give us a full
    * picture of reclaim. Some pages are reclaimed and might be
    @@ -755,16 +747,31 @@ static int mem_cgroup_charge_common(stru
    continue;

    if (!nr_retries--) {
    - mem_cgroup_out_of_memory(mem, gfp_mask);
    - goto out;
    + mem_cgroup_out_of_memory(mem, mask);
    + goto nomem;
    }
    }
    + return 0;
    +nomem:
    + css_put(&mem->css);
    + return -ENOMEM;
    +}

    +void mem_cgroup_commit_charge(struct page_cgroup *pc,
    + struct mem_cgroup *mem,
    + enum charge_type ctype)
    +{
    + struct mem_cgroup_per_zone *mz;
    + unsigned long flags;
    +
    + if (!mem)
    + return;
    preempt_disable();
    if (TestSetPageCgroupUsed(pc)) {
    res_counter_uncharge(&mem->res, PAGE_SIZE);
    + css_put(&mem->css);
    preempt_enable();
    - goto done;
    + return;
    }
    /*
    * page cgroup is *unused* now....but....
    @@ -786,14 +793,43 @@ static int mem_cgroup_charge_common(stru

    pc->mem_cgroup = mem;
    set_page_cgroup_lru(pc);
    + css_put(&mem->css);
    preempt_enable();
    +}

    -done:
    +/*
    + * Charge the memory controller for page usage.
    + * Return
    + * 0 if the charge was successful
    + * < 0 if the cgroup is over its limit
    + */
    +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
    + gfp_t gfp_mask, enum charge_type ctype,
    + struct mem_cgroup *memcg)
    +{
    + struct page_cgroup *pc;
    + struct mem_cgroup *ptr = memcg;
    + int ret;
    +
    + pc = lookup_page_cgroup(page);
    + /* can happen at boot */
    + if (unlikely(!pc))
    + return 0;
    + prefetchw(pc);
    +
    + ret = mem_cgroup_precharge(mm, gfp_mask, &ptr);
    + if (likely(!ret)) {
    + mem_cgroup_commit_charge(pc, ptr, ctype);
    + return 0;
    + }
    + if (unlikely((ret == -ENOMEM)))
    + return ret;
    + /* ESRCH case */
    return 0;
    -out:
    - return -ENOMEM;
    }

    +
    +
    int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
    {
    if (mem_cgroup_subsys.disabled)
    @@ -806,7 +842,7 @@ int mem_cgroup_charge(struct page *page,
    * But page->mapping may have out-of-use anon_vma pointer,
    * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
    * is NULL.
    - */
    + */
    if (page_mapped(page) || (page->mapping && !PageAnon(page)))
    return 0;
    if (unlikely(!mm))
    @@ -857,6 +893,25 @@ int mem_cgroup_cache_charge(struct page
    MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
    }

    +
    +void mem_cgroup_commit_charge_swap(struct page *page, struct mem_cgroup *ptr)
    +{
    + struct page_cgroup *pc;
    + if (!ptr)
    + return;
    + pc = lookup_page_cgroup(page);
    + mem_cgroup_commit_charge(pc, ptr, MEM_CGROUP_CHARGE_TYPE_MAPPED);
    +}
    +
    +void mem_cgroup_cancel_charge_swap(struct mem_cgroup *mem)
    +{
    + if (!mem)
    + return;
    + res_counter_uncharge(&mem->res, PAGE_SIZE);
    + css_put(&mem->css);
    +}
    +
    +
    /*
    * uncharge if !page_mapped(page)
    */
    Index: mmotm-2.6.27-rc7+/mm/memory.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memory.c
    +++ mmotm-2.6.27-rc7+/mm/memory.c
    @@ -2287,6 +2287,7 @@ static int do_swap_page(struct mm_struct
    struct page *page;
    swp_entry_t entry;
    pte_t pte;
    + struct mem_cgroup *ptr = NULL;
    int ret = 0;

    if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
    @@ -2323,7 +2324,7 @@ static int do_swap_page(struct mm_struct
    lock_page(page);
    delayacct_clear_flag(DELAYACCT_PF_SWAPIN);

    - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
    + if (mem_cgroup_precharge(mm, GFP_KERNEL, &ptr) == -ENOMEM) {
    ret = VM_FAULT_OOM;
    goto out;
    }
    @@ -2353,6 +2354,7 @@ static int do_swap_page(struct mm_struct
    flush_icache_page(vma, page);
    set_pte_at(mm, address, page_table, pte);
    page_add_anon_rmap(page, vma, address);
    + mem_cgroup_commit_charge_swap(page, ptr);

    swap_free(entry);
    if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
    @@ -2373,7 +2375,7 @@ unlock:
    out:
    return ret;
    out_nomap:
    - mem_cgroup_uncharge_page(page);
    + mem_cgroup_cancel_charge_swap(ptr);
    pte_unmap_unlock(page_table, ptl);
    unlock_page(page);
    page_cache_release(page);


  14. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Thu, 2008-09-25 at 15:32 +0900, KAMEZAWA Hiroyuki wrote:
    > @@ -949,6 +953,11 @@ struct mem_section {
    >
    > /* See declaration of similar field in struct zone */
    > unsigned long *pageblock_flags;
    > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    > + /* see page_cgroup.h */
    > + struct page_cgroup *page_cgroup;
    > + unsigned long pad;
    > +#endif
    > };


    I thought the use of this variable was under the

    +#ifdef CONFIG_FLAT_NODE_MEM_MAP

    options. Otherwise, we unconditionally bloat mem_section, right?

    -- Dave


  15. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Thu, 25 Sep 2008 11:40:47 -0700
    Dave Hansen wrote:

    > On Thu, 2008-09-25 at 15:32 +0900, KAMEZAWA Hiroyuki wrote:
    > > @@ -949,6 +953,11 @@ struct mem_section {
    > >
    > > /* See declaration of similar field in struct zone */
    > > unsigned long *pageblock_flags;
    > > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    > > + /* see page_cgroup.h */
    > > + struct page_cgroup *page_cgroup;
    > > + unsigned long pad;
    > > +#endif
    > > };

    >
    > I thought the use of this variable was under the
    >
    > +#ifdef CONFIG_FLAT_NODE_MEM_MAP
    >
    > options. Otherwise, we unconditionally bloat mem_section, right?
    >

    Hmmm......Oh, yes ! nice catch.

    Thanks, I'll fix.
    -Kame


  16. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Fri, 26 Sep 2008 10:17:54 +0900
    KAMEZAWA Hiroyuki wrote:

    > On Thu, 25 Sep 2008 11:40:47 -0700
    > Dave Hansen wrote:
    >
    > > On Thu, 2008-09-25 at 15:32 +0900, KAMEZAWA Hiroyuki wrote:
    > > > @@ -949,6 +953,11 @@ struct mem_section {
    > > >
    > > > /* See declaration of similar field in struct zone */
    > > > unsigned long *pageblock_flags;
    > > > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    > > > + /* see page_cgroup.h */
    > > > + struct page_cgroup *page_cgroup;
    > > > + unsigned long pad;
    > > > +#endif
    > > > };

    > >
    > > I thought the use of this variable was under the
    > >
    > > +#ifdef CONFIG_FLAT_NODE_MEM_MAP
    > >
    > > options. Otherwise, we unconditionally bloat mem_section, right?
    > >

    > Hmmm......Oh, yes ! nice catch.
    >
    > Thanks, I'll fix.


    But in reality, this is under CONFIG_SPARSEMEM, and if CONFIG_SPARSEMEM is set,
    FLAT_NODE_MEM_MAP is not true (I think).
    Hmm.. maybe I shouldn't check CONFIG_FLAT_NODE_MEM_MAP and should just check
    CONFIG_SPARSEMEM instead. I'll rewrite.

    Thanks,
    -Kame


  17. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Thu, 25 Sep 2008 15:32:06 +0900, KAMEZAWA Hiroyuki wrote:
    > Allocate all page_cgroup at boot and remove page_cgroup poitner
    > from struct page. This patch adds an interface as
    >
    > struct page_cgroup *lookup_page_cgroup(struct page*)
    >
    > All FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG is supported.
    >
    > Remove page_cgroup pointer reduces the amount of memory by
    > - 4 bytes per PAGE_SIZE.
    > - 8 bytes per PAGE_SIZE
    > if memory controller is disabled. (even if configured.)
    > meta data usage of this is no problem in FLATMEM/DISCONTIGMEM.
    > On SPARSEMEM, this makes mem_section[] size twice.
    >
    > On usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
    > On my x86-64 server with 48GB of memory, this saves 96MB of memory.
    > (and uses xx kbytes for mem_section.)
    > I think this reduction makes sense.
    >
    > By pre-allocation, kmalloc/kfree in charge/uncharge are removed.
    > This means
    > - we're not necessary to be afraid of kmalloc faiulre.
    > (this can happen because of gfp_mask type.)
    > - we can avoid calling kmalloc/kfree.
    > - we can avoid allocating tons of small objects which can be fragmented.
    > - we can know what amount of memory will be used for this extra-lru handling.
    >
    > I added printk message as
    >
    > "allocated %ld bytes of page_cgroup"
    > "please try cgroup_disable=memory option if you don't want"
    >
    > maybe enough informative for users.
    >
    > Signed-off-by: KAMEZAWA Hiroyuki
    >
    > include/linux/memcontrol.h | 11 -
    > include/linux/mm_types.h | 4
    > include/linux/mmzone.h | 9 +
    > include/linux/page_cgroup.h | 90 +++++++++++++++
    > mm/Makefile | 2
    > mm/memcontrol.c | 258 ++++++++++++--------------------------------
    > mm/page_alloc.c | 12 --
    > mm/page_cgroup.c | 253 +++++++++++++++++++++++++++++++++++++++++++
    > 8 files changed, 431 insertions(+), 208 deletions(-)
    >
    > Index: mmotm-2.6.27-rc7+/mm/page_cgroup.c
    > ================================================== =================
    > --- /dev/null
    > +++ mmotm-2.6.27-rc7+/mm/page_cgroup.c
    > @@ -0,0 +1,253 @@
    > +#include
    > +#include
    > +#include
    > +#include
    > +#include
    > +#include
    > +#include
    > +
    > +static void __meminit
    > +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
    > +{
    > + pc->flags = 0;
    > + pc->mem_cgroup = NULL;
    > + pc->page = pfn_to_page(pfn);
    > +}
    > +static unsigned long total_usage = 0;
    > +
    > +#ifdef CONFIG_FLAT_NODE_MEM_MAP
    > +
    > +
    > +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
    > +{
    > + pgdat->node_page_cgroup = NULL;
    > +}
    > +
    > +struct page_cgroup *lookup_page_cgroup(struct page *page)
    > +{
    > + unsigned long pfn = page_to_pfn(page);
    > + unsigned long offset;
    > + struct page_cgroup *base;
    > +
    > + base = NODE_DATA(page_to_nid(nid))->node_page_cgroup;

    page_to_nid(page)

    > + if (unlikely(!base))
    > + return NULL;
    > +
    > + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
    > + return base + offset;
    > +}
    > +
    > +static int __init alloc_node_page_cgroup(int nid)
    > +{
    > + struct page_cgroup *base, *pc;
    > + unsigned long table_size;
    > + unsigned long start_pfn, nr_pages, index;
    > +
    > + start_pfn = NODE_DATA(nid)->node_start_pfn;
    > + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    > +
    > + table_size = sizeof(struct page_cgroup) * nr_pages;
    > +
    > + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
    > + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
    > + if (!base)
    > + return -ENOMEM;
    > + for (index = 0; index < nr_pages; index++) {
    > + pc = base + index;
    > + __init_page_cgroup(pc, start_pfn + index);
    > + }
    > + NODE_DATA(nid)->node_page_cgroup = base;
    > + total_usage += table_size;
    > + return 0;
    > +}
    > +
    > +void __init free_node_page_cgroup(int nid)
    > +{
    > + unsigned long table_size;
    > + unsigned long nr_pages;
    > + struct page_cgroup *base;
    > +
    > + base = NODE_DATA(nid)->node_page_cgroup;
    > + if (!base)
    > + return;
    > + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    > +
    > + table_size = sizeof(struct page_cgroup) * nr_pages;
    > +
    > + free_bootmem_node(NODE_DATA(nid),
    > + (unsigned long)base, table_size);
    > + NODE_DATA(nid)->node_page_cgroup = NULL;
    > +}
    > +

    Hmm, who uses this function?

    (snip)

    > @@ -812,49 +708,41 @@ __mem_cgroup_uncharge_common(struct page
    >
    > if (mem_cgroup_subsys.disabled)
    > return;
    > + /* check the condition we can know from page */
    >
    > - /*
    > - * Check if our page_cgroup is valid
    > - */
    > - lock_page_cgroup(page);
    > - pc = page_get_page_cgroup(page);
    > - if (unlikely(!pc))
    > - goto unlock;
    > -
    > - VM_BUG_ON(pc->page != page);
    > + pc = lookup_page_cgroup(page);
    > + if (unlikely(!pc || !PageCgroupUsed(pc)))
    > + return;
    > + preempt_disable();
    > + lock_page_cgroup(pc);
    > + if (unlikely(page_mapped(page))) {
    > + unlock_page_cgroup(pc);
    > + preempt_enable();
    > + return;
    > + }

    Just for clarification, in what sequence will the page be mapped here?
    mem_cgroup_uncharge_page checks whether the page is mapped.

    > + ClearPageCgroupUsed(pc);
    > + unlock_page_cgroup(pc);
    >
    > - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    > - && ((PageCgroupCache(pc) || page_mapped(page))))
    > - goto unlock;
    > -retry:
    > mem = pc->mem_cgroup;
    > mz = page_cgroup_zoneinfo(pc);
    > +
    > spin_lock_irqsave(&mz->lru_lock, flags);
    > - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    > - unlikely(mem != pc->mem_cgroup)) {
    > - /* MAPPED account can be done without lock_page().
    > - Check race with mem_cgroup_move_account() */
    > - spin_unlock_irqrestore(&mz->lru_lock, flags);
    > - goto retry;
    > - }

    By these changes, ctype becomes unnecessary so it can be removed.

    > __mem_cgroup_remove_list(mz, pc);
    > spin_unlock_irqrestore(&mz->lru_lock, flags);
    > -
    > - page_assign_page_cgroup(page, NULL);
    > - unlock_page_cgroup(page);
    > -
    > -
    > - res_counter_uncharge(&mem->res, PAGE_SIZE);
    > + pc->mem_cgroup = NULL;
    > css_put(&mem->css);
    > + preempt_enable();
    > + res_counter_uncharge(&mem->res, PAGE_SIZE);
    >
    > - kmem_cache_free(page_cgroup_cache, pc);
    > return;
    > -unlock:
    > - unlock_page_cgroup(page);
    > }
    >



    Thanks,
    Daisuke Nishimura.

  18. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Fri, 26 Sep 2008 10:00:22 +0900
    Daisuke Nishimura wrote:

    > On Thu, 25 Sep 2008 15:32:06 +0900, KAMEZAWA Hiroyuki wrote:
    > > Allocate all page_cgroup at boot and remove page_cgroup poitner
    > > from struct page. This patch adds an interface as
    > >
    > > struct page_cgroup *lookup_page_cgroup(struct page*)
    > >
    > > All FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG is supported.
    > >
    > > Remove page_cgroup pointer reduces the amount of memory by
    > > - 4 bytes per PAGE_SIZE.
    > > - 8 bytes per PAGE_SIZE
    > > if memory controller is disabled. (even if configured.)
    > > meta data usage of this is no problem in FLATMEM/DISCONTIGMEM.
    > > On SPARSEMEM, this makes mem_section[] size twice.
    > >
    > > On usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
    > > On my x86-64 server with 48GB of memory, this saves 96MB of memory.
    > > (and uses xx kbytes for mem_section.)
    > > I think this reduction makes sense.
    > >
    > > By pre-allocation, kmalloc/kfree in charge/uncharge are removed.
    > > This means
    > > - we're not necessary to be afraid of kmalloc faiulre.
    > > (this can happen because of gfp_mask type.)
    > > - we can avoid calling kmalloc/kfree.
    > > - we can avoid allocating tons of small objects which can be fragmented.
    > > - we can know what amount of memory will be used for this extra-lru handling.
    > >
    > > I added printk message as
    > >
    > > "allocated %ld bytes of page_cgroup"
    > > "please try cgroup_disable=memory option if you don't want"
    > >
    > > maybe enough informative for users.
    > >
    > > Signed-off-by: KAMEZAWA Hiroyuki
    > >
    > > include/linux/memcontrol.h | 11 -
    > > include/linux/mm_types.h | 4
    > > include/linux/mmzone.h | 9 +
    > > include/linux/page_cgroup.h | 90 +++++++++++++++
    > > mm/Makefile | 2
    > > mm/memcontrol.c | 258 ++++++++++++--------------------------------
    > > mm/page_alloc.c | 12 --
    > > mm/page_cgroup.c | 253 +++++++++++++++++++++++++++++++++++++++++++
    > > 8 files changed, 431 insertions(+), 208 deletions(-)
    > >
    > > Index: mmotm-2.6.27-rc7+/mm/page_cgroup.c
    > > ================================================== =================
    > > --- /dev/null
    > > +++ mmotm-2.6.27-rc7+/mm/page_cgroup.c
    > > @@ -0,0 +1,253 @@
    > > +#include
    > > +#include
    > > +#include
    > > +#include
    > > +#include
    > > +#include
    > > +#include
    > > +
    > > +static void __meminit
    > > +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
    > > +{
    > > + pc->flags = 0;
    > > + pc->mem_cgroup = NULL;
    > > + pc->page = pfn_to_page(pfn);
    > > +}
    > > +static unsigned long total_usage = 0;
    > > +
    > > +#ifdef CONFIG_FLAT_NODE_MEM_MAP
    > > +
    > > +
    > > +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
    > > +{
    > > + pgdat->node_page_cgroup = NULL;
    > > +}
    > > +
    > > +struct page_cgroup *lookup_page_cgroup(struct page *page)
    > > +{
    > > + unsigned long pfn = page_to_pfn(page);
    > > + unsigned long offset;
    > > + struct page_cgroup *base;
    > > +
    > > + base = NODE_DATA(page_to_nid(nid))->node_page_cgroup;

    > page_to_nid(page)
    >

    yes..

    > > + if (unlikely(!base))
    > > + return NULL;
    > > +
    > > + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
    > > + return base + offset;
    > > +}
    > > +
    > > +static int __init alloc_node_page_cgroup(int nid)
    > > +{
    > > + struct page_cgroup *base, *pc;
    > > + unsigned long table_size;
    > > + unsigned long start_pfn, nr_pages, index;
    > > +
    > > + start_pfn = NODE_DATA(nid)->node_start_pfn;
    > > + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    > > +
    > > + table_size = sizeof(struct page_cgroup) * nr_pages;
    > > +
    > > + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
    > > + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
    > > + if (!base)
    > > + return -ENOMEM;
    > > + for (index = 0; index < nr_pages; index++) {
    > > + pc = base + index;
    > > + __init_page_cgroup(pc, start_pfn + index);
    > > + }
    > > + NODE_DATA(nid)->node_page_cgroup = base;
    > > + total_usage += table_size;
    > > + return 0;
    > > +}
    > > +
    > > +void __init free_node_page_cgroup(int nid)
    > > +{
    > > + unsigned long table_size;
    > > + unsigned long nr_pages;
    > > + struct page_cgroup *base;
    > > +
    > > + base = NODE_DATA(nid)->node_page_cgroup;
    > > + if (!base)
    > > + return;
    > > + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    > > +
    > > + table_size = sizeof(struct page_cgroup) * nr_pages;
    > > +
    > > + free_bootmem_node(NODE_DATA(nid),
    > > + (unsigned long)base, table_size);
    > > + NODE_DATA(nid)->node_page_cgroup = NULL;
    > > +}
    > > +

    > Hmm, who uses this function?
    >

    Uh, ok, it's unnecessary. (In my first version, this allocation error
    just showed a warning. Now it panics.)

    Apparently, the FLATMEM check is not enough...

    > (snip)
    >
    > > @@ -812,49 +708,41 @@ __mem_cgroup_uncharge_common(struct page
    > >
    > > if (mem_cgroup_subsys.disabled)
    > > return;
    > > + /* check the condition we can know from page */
    > >
    > > - /*
    > > - * Check if our page_cgroup is valid
    > > - */
    > > - lock_page_cgroup(page);
    > > - pc = page_get_page_cgroup(page);
    > > - if (unlikely(!pc))
    > > - goto unlock;
    > > -
    > > - VM_BUG_ON(pc->page != page);
    > > + pc = lookup_page_cgroup(page);
    > > + if (unlikely(!pc || !PageCgroupUsed(pc)))
    > > + return;
    > > + preempt_disable();
    > > + lock_page_cgroup(pc);
    > > + if (unlikely(page_mapped(page))) {
    > > + unlock_page_cgroup(pc);
    > > + preempt_enable();
    > > + return;
    > > + }

    > Just for clarification, in what sequence will the page be mapped here?
    > mem_cgroup_uncharge_page checks whether the page is mapped.
    >

    Please think about the following situation.

    There is a SwapCache page which is referenced by 2 processes, A and B.
    A maps it.
    B doesn't map it.

    And now, process A exits.

    CPU0 (process A)                    CPU1 (process B)

    zap_pte_range()
     => page removed from rmap          charge()   (in do_swap_page)
     => set page->mapcount = 0
     => uncharge()                      set page->mapcount = 1

    This race is what patch 12/12 fixes.
    It only happens with the cursed SwapCache.


    > > + ClearPageCgroupUsed(pc);
    > > + unlock_page_cgroup(pc);
    > >
    > > - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    > > - && ((PageCgroupCache(pc) || page_mapped(page))))
    > > - goto unlock;
    > > -retry:
    > > mem = pc->mem_cgroup;
    > > mz = page_cgroup_zoneinfo(pc);
    > > +
    > > spin_lock_irqsave(&mz->lru_lock, flags);
    > > - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    > > - unlikely(mem != pc->mem_cgroup)) {
    > > - /* MAPPED account can be done without lock_page().
    > > - Check race with mem_cgroup_move_account() */
    > > - spin_unlock_irqrestore(&mz->lru_lock, flags);
    > > - goto retry;
    > > - }

    > By these changes, ctype becomes unnecessary so it can be removed.
    >

    Uh, maybe it can be removed.

    > > __mem_cgroup_remove_list(mz, pc);
    > > spin_unlock_irqrestore(&mz->lru_lock, flags);
    > > -
    > > - page_assign_page_cgroup(page, NULL);
    > > - unlock_page_cgroup(page);
    > > -
    > > -
    > > - res_counter_uncharge(&mem->res, PAGE_SIZE);
    > > + pc->mem_cgroup = NULL;
    > > css_put(&mem->css);
    > > + preempt_enable();
    > > + res_counter_uncharge(&mem->res, PAGE_SIZE);
    > >
    > > - kmem_cache_free(page_cgroup_cache, pc);
    > > return;
    > > -unlock:
    > > - unlock_page_cgroup(page);
    > > }
    > >


    Thank you for review.

    Regards,
    -Kame


  19. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Fri, 26 Sep 2008 10:43:36 +0900
    KAMEZAWA Hiroyuki wrote:
    > > > - /*
    > > > - * Check if our page_cgroup is valid
    > > > - */
    > > > - lock_page_cgroup(page);
    > > > - pc = page_get_page_cgroup(page);
    > > > - if (unlikely(!pc))
    > > > - goto unlock;
    > > > -
    > > > - VM_BUG_ON(pc->page != page);
    > > > + pc = lookup_page_cgroup(page);
    > > > + if (unlikely(!pc || !PageCgroupUsed(pc)))
    > > > + return;
    > > > + preempt_disable();
    > > > + lock_page_cgroup(pc);
    > > > + if (unlikely(page_mapped(page))) {
    > > > + unlock_page_cgroup(pc);
    > > > + preempt_enable();
    > > > + return;
    > > > + }

    > > Just for clarification, in what sequence will the page be mapped here?
    > > mem_cgroup_uncharge_page checks whether the page is mapped.
    > >

    > Please think about the following situation.
    >
    > There is a SwapCache page which is referenced by two processes, A and B.
    > A maps it.
    > B doesn't map it.
    >
    > And now, process A exits.
    >
    > CPU0 (process A)                   CPU1 (process B)
    >
    > zap_pte_range()
    >  => page remove from rmap          charge() (do_swap_page)
    >  => set page->mapcount to 0
    >  => uncharge()                     => set page->mapcount = 1
    >
    > This race is what patch 12/12 fixes.
    > This only happens with this cursed SwapCache.
    >

    Sorry, my brain seems to be sleeping... the above page_mapped() check
    doesn't help in this situation. Maybe the page_mapped() check should just
    be removed, since it's of no use.

    I think this kind of problem will not be fixed until we handle SwapCache.


    Thanks,
    -Kame

  20. [PATCH(fixed) 9/12] memcg allocate all page_cgroup at boot

    Because of a terrible compile error in FLATMEM/DISCONTIGMEM, I'm posting a
    fixed version here (and avoiding another patch bomb).
    I post replacements for 9 (the fix) and 10, 11, 12 (adjusted for the hunks).

    ==
    Allocate all page_cgroup structures at boot and remove the page_cgroup
    pointer from struct page. This patch adds an interface:

    struct page_cgroup *lookup_page_cgroup(struct page *)

    All of FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG are supported.
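
    For readers skimming the diff, the caller-side pattern this interface
    implies looks roughly like the following (condensed by me from the
    uncharge path in this patch; example_pc_user() itself is not a function
    being added anywhere):

    /* Illustration only: the lookup -> Used check -> lock pattern. */
    static void example_pc_user(struct page *page)
    {
        struct page_cgroup *pc;

        pc = lookup_page_cgroup(page);            /* exists for every pfn ...   */
        if (unlikely(!pc || !PageCgroupUsed(pc))) /* ... but NULL very early    */
            return;                               /* at boot; Used may be clear */

        lock_page_cgroup(pc);                     /* bit spinlock on PCG_LOCK   */
        /* ... read or update pc->mem_cgroup and pc->flags ... */
        unlock_page_cgroup(pc);
    }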

    Removing the page_cgroup pointer reduces the amount of memory by
    - 4 bytes per page (32-bit),
    - 8 bytes per page (64-bit),
    and this is saved even if the memory controller is disabled (but configured).

    On a usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
    On my x86-64 server with 48GB of memory, this saves 96MB of memory.
    I think this reduction makes sense. (A quick check of these numbers
    follows below.)
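
    A back-of-the-envelope check of those numbers (my own sketch, not part of
    the patch):

    #include <stdio.h>

    int main(void)
    {
        const unsigned long long page  = 4096;        /* 4KB pages          */
        const unsigned long long mem32 = 8ULL  << 30; /* 8GB x86-32 server  */
        const unsigned long long mem64 = 48ULL << 30; /* 48GB x86-64 server */

        printf("x86-32: %llu MB\n", mem32 / page * 4 >> 20); /* 4-byte pointer: 8  */
        printf("x86-64: %llu MB\n", mem64 / page * 8 >> 20); /* 8-byte pointer: 96 */
        return 0;
    }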

    By pre-allocating, the kmalloc/kfree calls in charge/uncharge are removed.
    This means
    - we no longer have to be afraid of kmalloc failure
      (which can happen depending on the gfp_mask),
    - we avoid calling kmalloc/kfree in the charge/uncharge paths,
    - we avoid allocating tons of small objects, which can cause fragmentation,
    - we know how much memory will be used for this extra LRU handling.

    I added printk messages:

    "allocated %ld bytes of page_cgroup"
    "please try cgroup_disable=memory option if you don't want"

    which should be informative enough for users.

    TODO:
    - a small race window still remains in do_swap_page(); it will be fixed by
      a later patch in this series.

    Changelog: v5 -> v6
    * removed "ctype" from uncharge.
    * improved the comment to show FLAT_NODE_MEM_MAP == !SPARSEMEM
    * fixed errors in the !SPARSEMEM code.
    * removed an unused function from the !SPARSEMEM code.
    (changelog starts from v5 because of the series.)

    Signed-off-by: KAMEZAWA Hiroyuki

    include/linux/memcontrol.h | 11 -
    include/linux/mm_types.h | 4
    include/linux/mmzone.h | 14 ++
    include/linux/page_cgroup.h | 90 +++++++++++++++
    mm/Makefile | 2
    mm/memcontrol.c | 264 ++++++++++++--------------------------------
    mm/page_alloc.c | 12 --
    mm/page_cgroup.c | 235 +++++++++++++++++++++++++++++++++++++++
    8 files changed, 416 insertions(+), 216 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/page_cgroup.c
    ===================================================================
    --- /dev/null
    +++ mmotm-2.6.27-rc7+/mm/page_cgroup.c
    @@ -0,0 +1,235 @@
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +
    +static void __meminit
    +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
    +{
    + pc->flags = 0;
    + pc->mem_cgroup = NULL;
    + pc->page = pfn_to_page(pfn);
    +}
    +static unsigned long total_usage = 0;
    +
    +#if !defined(CONFIG_SPARSEMEM)
    +
    +
    +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
    +{
    + pgdat->node_page_cgroup = NULL;
    +}
    +
    +struct page_cgroup *lookup_page_cgroup(struct page *page)
    +{
    + unsigned long pfn = page_to_pfn(page);
    + unsigned long offset;
    + struct page_cgroup *base;
    +
    + base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
    + if (unlikely(!base))
    + return NULL;
    +
    + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
    + return base + offset;
    +}
    +
    +static int __init alloc_node_page_cgroup(int nid)
    +{
    + struct page_cgroup *base, *pc;
    + unsigned long table_size;
    + unsigned long start_pfn, nr_pages, index;
    +
    + start_pfn = NODE_DATA(nid)->node_start_pfn;
    + nr_pages = NODE_DATA(nid)->node_spanned_pages;
    +
    + table_size = sizeof(struct page_cgroup) * nr_pages;
    +
    + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
    + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
    + if (!base)
    + return -ENOMEM;
    + for (index = 0; index < nr_pages; index++) {
    + pc = base + index;
    + __init_page_cgroup(pc, start_pfn + index);
    + }
    + NODE_DATA(nid)->node_page_cgroup = base;
    + total_usage += table_size;
    + return 0;
    +}
    +
    +void __init page_cgroup_init(void)
    +{
    +
    + int nid, fail;
    +
    + for_each_online_node(nid) {
    + fail = alloc_node_page_cgroup(nid);
    + if (fail)
    + goto fail;
    + }
    + printk("allocated %ld bytes of page_cgroup\n", total_usage);
    + printk("please try cgroup_disable=memory option if you don't want\n");
    + return;
    +fail:
    + printk("allocation of page_cgroup was failed.\n");
    + printk("please try cgroup_disable=memory boot option\n");
    + panic("Out of memory");
    +}
    +
    +#else /* CONFIG_FLAT_NODE_MEM_MAP */
    +
    +struct page_cgroup *lookup_page_cgroup(struct page *page)
    +{
    + unsigned long pfn = page_to_pfn(page);
    + struct mem_section *section = __pfn_to_section(pfn);
    +
    + return section->page_cgroup + pfn;
    +}
    +
    +int __meminit init_section_page_cgroup(unsigned long pfn)
    +{
    + struct mem_section *section;
    + struct page_cgroup *base, *pc;
    + unsigned long table_size;
    + int nid, index;
    +
    + section = __pfn_to_section(pfn);
    +
    + if (section->page_cgroup)
    + return 0;
    +
    + nid = page_to_nid(pfn_to_page(pfn));
    +
    + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
    + base = kmalloc_node(table_size, GFP_KERNEL, nid);
    + if (!base)
    + base = vmalloc_node(table_size, nid);
    +
    + if (!base) {
    + printk("page cgroup allocation failure\n");
    + return -ENOMEM;
    + }
    +
    + for (index = 0; index < PAGES_PER_SECTION; index++) {
    + pc = base + index;
    + __init_page_cgroup(pc, pfn + index);
    + }
    +
    + section = __pfn_to_section(pfn);
    + section->page_cgroup = base - pfn;
    + total_usage += table_size;
    + return 0;
    +}
    +#ifdef CONFIG_MEMORY_HOTPLUG
    +void __free_page_cgroup(unsigned long pfn)
    +{
    + struct mem_section *ms;
    + struct page_cgroup *base;
    +
    + ms = __pfn_to_section(pfn);
    + if (!ms || !ms->page_cgroup)
    + return;
    + base = ms->page_cgroup + pfn;
    + ms->page_cgroup = NULL;
    + if (is_vmalloc_addr(base))
    + vfree(base);
    + else
    + kfree(base);
    +}
    +
    +int online_page_cgroup(unsigned long start_pfn,
    + unsigned long nr_pages,
    + int nid)
    +{
    + unsigned long start, end, pfn;
    + int fail = 0;
    +
    + start = start_pfn & ~(PAGES_PER_SECTION - 1);
    + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
    +
    + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
    + if (!pfn_present(pfn))
    + continue;
    + fail = init_section_page_cgroup(pfn);
    + }
    + if (!fail)
    + return 0;
    +
    + /* rollback */
    + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
    + __free_page_cgroup(pfn);
    +
    + return -ENOMEM;
    +}
    +
    +int offline_page_cgroup(unsigned long start_pfn,
    + unsigned long nr_pages, int nid)
    +{
    + unsigned long start, end, pfn;
    +
    + start = start_pfn & ~(PAGES_PER_SECTION - 1);
    + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
    +
    + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
    + __free_page_cgroup(pfn);
    + return 0;
    +
    +}
    +
    +static int page_cgroup_callback(struct notifier_block *self,
    + unsigned long action, void *arg)
    +{
    + struct memory_notify *mn = arg;
    + int ret = 0;
    + switch (action) {
    + case MEM_GOING_ONLINE:
    + ret = online_page_cgroup(mn->start_pfn,
    + mn->nr_pages, mn->status_change_nid);
    + break;
    + case MEM_CANCEL_ONLINE:
    + case MEM_OFFLINE:
    + offline_page_cgroup(mn->start_pfn,
    + mn->nr_pages, mn->status_change_nid);
    + break;
    + case MEM_GOING_OFFLINE:
    + break;
    + case MEM_ONLINE:
    + case MEM_CANCEL_OFFLINE:
    + break;
    + }
    + ret = notifier_from_errno(ret);
    + return ret;
    +}
    +
    +#endif
    +
    +void __init page_cgroup_init(void)
    +{
    + unsigned long pfn;
    + int fail = 0;
    +
    + for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
    + if (!pfn_present(pfn))
    + continue;
    + fail = init_section_page_cgroup(pfn);
    + }
    + if (fail) {
    + printk("please try cgroup_disable=memory boot option\n");
    + panic("Out of memory");
    + } else {
    + hotplug_memory_notifier(page_cgroup_callback, 0);
    + }
    + printk("allocated %ld bytes of page_cgroup\n", total_usage);
    + printk("please try cgroup_disable=memory option if you don't want\n");
    +}
    +
    +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
    +{
    + return;
    +}
    +
    +#endif
    Index: mmotm-2.6.27-rc7+/include/linux/mm_types.h
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/include/linux/mm_types.h
    +++ mmotm-2.6.27-rc7+/include/linux/mm_types.h
    @@ -94,10 +94,6 @@ struct page {
    void *virtual; /* Kernel virtual address (NULL if
    not kmapped, ie. highmem) */
    #endif /* WANT_PAGE_VIRTUAL */
    -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    - unsigned long page_cgroup;
    -#endif
    -
    #ifdef CONFIG_KMEMCHECK
    void *shadow;
    #endif
    Index: mmotm-2.6.27-rc7+/mm/Makefile
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/Makefile
    +++ mmotm-2.6.27-rc7+/mm/Makefile
    @@ -34,6 +34,6 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
    obj-$(CONFIG_MIGRATION) += migrate.o
    obj-$(CONFIG_SMP) += allocpercpu.o
    obj-$(CONFIG_QUICKLIST) += quicklist.o
    -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
    +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
    obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
    obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
    Index: mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    ===================================================================
    --- /dev/null
    +++ mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    @@ -0,0 +1,90 @@
    +#ifndef __LINUX_PAGE_CGROUP_H
    +#define __LINUX_PAGE_CGROUP_H
    +#include
    +/*
    + * Page Cgroup can be considered as an extended mem_map.
    + * A page_cgroup page is associated with every page descriptor. The
    + * page_cgroup helps us identify information about the cgroup
    + * All page cgroups are allocated at boot or memory hotplug event,
    + * then the page cgroup for pfn always exists.
    + */
    +struct page_cgroup {
    + unsigned long flags;
    + struct mem_cgroup *mem_cgroup;
    + struct page *page;
    + struct list_head lru; /* per cgroup LRU list */
    +};
    +
    +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat);
    +void __init page_cgroup_init(void);
    +struct page_cgroup *lookup_page_cgroup(struct page *page);
    +
    +enum {
    + /* flags for mem_cgroup */
    + PCG_LOCK, /* page cgroup is locked */
    + PCG_CACHE, /* charged as cache */
    + PCG_USED, /* this object is in use. */
    + /* flags for LRU placement */
    + PCG_ACTIVE, /* page is active in this cgroup */
    + PCG_FILE, /* page is file system backed */
    + PCG_UNEVICTABLE, /* page is unevictableable */
    +};
    +
    +#define TESTPCGFLAG(uname, lname) \
    +static inline int PageCgroup##uname(struct page_cgroup *pc) \
    + { return test_bit(PCG_##lname, &pc->flags); }
    +
    +#define SETPCGFLAG(uname, lname) \
    +static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
    + { set_bit(PCG_##lname, &pc->flags); }
    +
    +#define CLEARPCGFLAG(uname, lname) \
    +static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    + { clear_bit(PCG_##lname, &pc->flags); }
    +
    +/* Cache flag is set only once (at allocation) */
    +TESTPCGFLAG(Cache, CACHE)
    +
    +TESTPCGFLAG(Used, USED)
    +CLEARPCGFLAG(Used, USED)
    +
    +/* LRU management flags (from global-lru definition) */
    +TESTPCGFLAG(File, FILE)
    +SETPCGFLAG(File, FILE)
    +CLEARPCGFLAG(File, FILE)
    +
    +TESTPCGFLAG(Active, ACTIVE)
    +SETPCGFLAG(Active, ACTIVE)
    +CLEARPCGFLAG(Active, ACTIVE)
    +
    +TESTPCGFLAG(Unevictable, UNEVICTABLE)
    +SETPCGFLAG(Unevictable, UNEVICTABLE)
    +CLEARPCGFLAG(Unevictable, UNEVICTABLE)
    +
    +static inline int page_cgroup_nid(struct page_cgroup *pc)
    +{
    + return page_to_nid(pc->page);
    +}
    +
    +static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
    +{
    + return page_zonenum(pc->page);
    +}
    +
    +static inline void lock_page_cgroup(struct page_cgroup *pc)
    +{
    + bit_spin_lock(PCG_LOCK, &pc->flags);
    +}
    +
    +static inline int trylock_page_cgroup(struct page_cgroup *pc)
    +{
    + return bit_spin_trylock(PCG_LOCK, &pc->flags);
    +}
    +
    +static inline void unlock_page_cgroup(struct page_cgroup *pc)
    +{
    + bit_spin_unlock(PCG_LOCK, &pc->flags);
    +}
    +
    +
    +#endif
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -35,11 +35,11 @@
    #include
    #include
    #include
    +#include

    #include

    struct cgroup_subsys mem_cgroup_subsys __read_mostly;
    -static struct kmem_cache *page_cgroup_cache __read_mostly;
    #define MEM_CGROUP_RECLAIM_RETRIES 5

    /*
    @@ -139,94 +139,18 @@ static struct mem_cgroup init_mem_cgroup

    #define is_root_cgroup(cgrp) ((cgrp) == &init_mem_cgroup)

    -
    -/*
    - * We use the lower bit of the page->page_cgroup pointer as a bit spin
    - * lock. We need to ensure that page->page_cgroup is at least two
    - * byte aligned (based on comments from Nick Piggin). But since
    - * bit_spin_lock doesn't actually set that lock bit in a non-debug
    - * uniprocessor kernel, we should avoid setting it here too.
    - */
    -#define PAGE_CGROUP_LOCK_BIT 0x0
    -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
    -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
    -#else
    -#define PAGE_CGROUP_LOCK 0x0
    -#endif
    -
    -/*
    - * A page_cgroup page is associated with every page descriptor. The
    - * page_cgroup helps us identify information about the cgroup
    - */
    -struct page_cgroup {
    - struct list_head lru; /* per cgroup LRU list */
    - struct page *page;
    - struct mem_cgroup *mem_cgroup;
    - unsigned long flags;
    -};
    -
    -enum {
    - /* flags for mem_cgroup */
    - PCG_CACHE, /* charged as cache */
    - /* flags for LRU placement */
    - PCG_ACTIVE, /* page is active in this cgroup */
    - PCG_FILE, /* page is file system backed */
    - PCG_UNEVICTABLE, /* page is unevictableable */
    -};
    -
    -#define TESTPCGFLAG(uname, lname) \
    -static inline int PageCgroup##uname(struct page_cgroup *pc) \
    - { return test_bit(PCG_##lname, &pc->flags); }
    -
    -#define SETPCGFLAG(uname, lname) \
    -static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
    - { set_bit(PCG_##lname, &pc->flags); }
    -
    -#define CLEARPCGFLAG(uname, lname) \
    -static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    - { clear_bit(PCG_##lname, &pc->flags); }
    -
    -
    -/* Cache flag is set only once (at allocation) */
    -TESTPCGFLAG(Cache, CACHE)
    -
    -/* LRU management flags (from global-lru definition) */
    -TESTPCGFLAG(File, FILE)
    -SETPCGFLAG(File, FILE)
    -CLEARPCGFLAG(File, FILE)
    -
    -TESTPCGFLAG(Active, ACTIVE)
    -SETPCGFLAG(Active, ACTIVE)
    -CLEARPCGFLAG(Active, ACTIVE)
    -
    -TESTPCGFLAG(Unevictable, UNEVICTABLE)
    -SETPCGFLAG(Unevictable, UNEVICTABLE)
    -CLEARPCGFLAG(Unevictable, UNEVICTABLE)
    -
    -static int page_cgroup_nid(struct page_cgroup *pc)
    -{
    - return page_to_nid(pc->page);
    -}
    -
    -static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
    -{
    - return page_zonenum(pc->page);
    -}
    -
    enum charge_type {
    MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
    MEM_CGROUP_CHARGE_TYPE_MAPPED,
    MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
    - MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
    NR_CHARGE_TYPE,
    };

    static const unsigned long
    pcg_default_flags[NR_CHARGE_TYPE] = {
    - ((1 << PCG_CACHE) | (1 << PCG_FILE)),
    - ((1 << PCG_ACTIVE)),
    - ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
    - 0,
    + (1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
    + (1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
    + (1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
    };

    /*
    @@ -308,37 +232,6 @@ struct mem_cgroup *mem_cgroup_from_task(
    struct mem_cgroup, css);
    }

    -static inline int page_cgroup_locked(struct page *page)
    -{
    - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
    -{
    - VM_BUG_ON(!page_cgroup_locked(page));
    - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
    -}
    -
    -struct page_cgroup *page_get_page_cgroup(struct page *page)
    -{
    - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
    -}
    -
    -static void lock_page_cgroup(struct page *page)
    -{
    - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    -static int try_lock_page_cgroup(struct page *page)
    -{
    - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    -static void unlock_page_cgroup(struct page *page)
    -{
    - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
    -}
    -
    static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
    struct page_cgroup *pc)
    {
    @@ -442,22 +335,19 @@ void mem_cgroup_move_lists(struct page *
    * safely get to page_cgroup without it, so just try_lock it:
    * mem_cgroup_isolate_pages allows for page left on wrong list.
    */
    - if (!try_lock_page_cgroup(page))
    + pc = lookup_page_cgroup(page);
    +
    + if (!trylock_page_cgroup(pc))
    return;

    - pc = page_get_page_cgroup(page);
    - if (pc) {
    + if (PageCgroupUsed(pc)) {
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    - /*
    - * check against the race with move_account.
    - */
    - if (likely(mem == pc->mem_cgroup))
    - __mem_cgroup_move_lists(pc, lru);
    + __mem_cgroup_move_lists(pc, lru);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    }
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);
    }

    /*
    @@ -544,6 +434,8 @@ unsigned long mem_cgroup_isolate_pages(u
    list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
    if (scan >= nr_to_scan)
    break;
    + if (unlikely(!PageCgroupUsed(pc)))
    + continue;
    page = pc->page;

    if (unlikely(!PageLRU(page)))
    @@ -611,12 +503,12 @@ int mem_cgroup_move_account(struct page
    /* Now, we assume no_limit...no failure here. */
    return ret;
    }
    - if (!try_lock_page_cgroup(page)) {
    + if (!trylock_page_cgroup(pc)) {
    res_counter_uncharge(&to->res, PAGE_SIZE);
    return ret;
    }

    - if (page_get_page_cgroup(page) != pc) {
    + if (!PageCgroupUsed(pc)) {
    res_counter_uncharge(&to->res, PAGE_SIZE);
    goto out;
    }
    @@ -634,7 +526,7 @@ int mem_cgroup_move_account(struct page
    res_counter_uncharge(&to->res, PAGE_SIZE);
    }
    out:
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);

    return ret;
    }
    @@ -651,26 +543,27 @@ static int mem_cgroup_charge_common(stru
    {
    struct mem_cgroup *mem;
    struct page_cgroup *pc;
    - unsigned long flags;
    unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
    struct mem_cgroup_per_zone *mz;
    + unsigned long flags;

    - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
    - if (unlikely(pc == NULL))
    - goto err;
    -
    + pc = lookup_page_cgroup(page);
    + /* can happen at boot */
    + if (unlikely(!pc))
    + return 0;
    + prefetchw(pc);
    /*
    * We always charge the cgroup the mm_struct belongs to.
    * The mm_struct's mem_cgroup changes on task migration if the
    * thread group leader migrates. It's possible that mm is not
    * set, if so charge the init_mm (happens for pagecache usage).
    */
    +
    if (likely(!memcg)) {
    rcu_read_lock();
    mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
    if (unlikely(!mem)) {
    rcu_read_unlock();
    - kmem_cache_free(page_cgroup_cache, pc);
    return 0;
    }
    /*
    @@ -706,36 +599,34 @@ static int mem_cgroup_charge_common(stru
    }
    }

    + preempt_disable();
    + lock_page_cgroup(pc);
    + if (unlikely(PageCgroupUsed(pc))) {
    + unlock_page_cgroup(pc);
    + res_counter_uncharge(&mem->res, PAGE_SIZE);
    + css_put(&mem->css);
    + preempt_enable();
    + goto done;
    + }
    pc->mem_cgroup = mem;
    - pc->page = page;
    /*
    * If a page is accounted as a page cache, insert to inactive list.
    * If anon, insert to active list.
    */
    pc->flags = pcg_default_flags[ctype];

    - lock_page_cgroup(page);
    - if (unlikely(page_get_page_cgroup(page))) {
    - unlock_page_cgroup(page);
    - res_counter_uncharge(&mem->res, PAGE_SIZE);
    - css_put(&mem->css);
    - kmem_cache_free(page_cgroup_cache, pc);
    - goto done;
    - }
    - page_assign_page_cgroup(page, pc);
    -
    mz = page_cgroup_zoneinfo(pc);
    +
    spin_lock_irqsave(&mz->lru_lock, flags);
    __mem_cgroup_add_list(mz, pc);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    + unlock_page_cgroup(pc);
    + preempt_enable();

    - unlock_page_cgroup(page);
    done:
    return 0;
    out:
    css_put(&mem->css);
    - kmem_cache_free(page_cgroup_cache, pc);
    -err:
    return -ENOMEM;
    }

    @@ -743,7 +634,8 @@ int mem_cgroup_charge(struct page *page,
    {
    if (mem_cgroup_subsys.disabled)
    return 0;
    -
    + if (PageCompound(page))
    + return 0;
    /*
    * If already mapped, we don't have to account.
    * If page cache, page->mapping has address_space.
    @@ -764,7 +656,8 @@ int mem_cgroup_cache_charge(struct page
    {
    if (mem_cgroup_subsys.disabled)
    return 0;
    -
    + if (PageCompound(page))
    + return 0;
    /*
    * Corner case handling. This is called from add_to_page_cache()
    * in usual. But some FS (shmem) precharges this page before calling it
    @@ -777,15 +670,16 @@ int mem_cgroup_cache_charge(struct page
    if (!(gfp_mask & __GFP_WAIT)) {
    struct page_cgroup *pc;

    - lock_page_cgroup(page);
    - pc = page_get_page_cgroup(page);
    - if (pc) {
    - VM_BUG_ON(pc->page != page);
    - VM_BUG_ON(!pc->mem_cgroup);
    - unlock_page_cgroup(page);
    +
    + pc = lookup_page_cgroup(page);
    + if (!pc)
    + return 0;
    + lock_page_cgroup(pc);
    + if (PageCgroupUsed(pc)) {
    + unlock_page_cgroup(pc);
    return 0;
    }
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);
    }

    if (unlikely(!mm))
    @@ -803,7 +697,7 @@ int mem_cgroup_cache_charge(struct page
    * uncharge if !page_mapped(page)
    */
    static void
    -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
    +__mem_cgroup_uncharge_common(struct page *page)
    {
    struct page_cgroup *pc;
    struct mem_cgroup *mem;
    @@ -812,57 +706,44 @@ __mem_cgroup_uncharge_common(struct page

    if (mem_cgroup_subsys.disabled)
    return;
    + /* check the condition we can know from page */

    - /*
    - * Check if our page_cgroup is valid
    - */
    - lock_page_cgroup(page);
    - pc = page_get_page_cgroup(page);
    - if (unlikely(!pc))
    - goto unlock;
    -
    - VM_BUG_ON(pc->page != page);
    + pc = lookup_page_cgroup(page);
    + if (unlikely(!pc || !PageCgroupUsed(pc)))
    + return;
    + preempt_disable();
    + lock_page_cgroup(pc);
    + ClearPageCgroupUsed(pc);
    + unlock_page_cgroup(pc);

    - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    - && ((PageCgroupCache(pc) || page_mapped(page))))
    - goto unlock;
    -retry:
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    +
    spin_lock_irqsave(&mz->lru_lock, flags);
    - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    - unlikely(mem != pc->mem_cgroup)) {
    - /* MAPPED account can be done without lock_page().
    - Check race with mem_cgroup_move_account() */
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - goto retry;
    - }
    __mem_cgroup_remove_list(mz, pc);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    -
    - page_assign_page_cgroup(page, NULL);
    - unlock_page_cgroup(page);
    -
    -
    - res_counter_uncharge(&mem->res, PAGE_SIZE);
    + pc->mem_cgroup = NULL;
    css_put(&mem->css);
    + preempt_enable();
    + res_counter_uncharge(&mem->res, PAGE_SIZE);

    - kmem_cache_free(page_cgroup_cache, pc);
    return;
    -unlock:
    - unlock_page_cgroup(page);
    }

    void mem_cgroup_uncharge_page(struct page *page)
    {
    - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
    + if (page_mapped(page))
    + return;
    + if (page->mapping && !PageAnon(page))
    + return;
    + __mem_cgroup_uncharge_common(page);
    }

    void mem_cgroup_uncharge_cache_page(struct page *page)
    {
    VM_BUG_ON(page_mapped(page));
    VM_BUG_ON(page->mapping);
    - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
    + __mem_cgroup_uncharge_common(page);
    }

    /*
    @@ -878,9 +759,9 @@ int mem_cgroup_prepare_migration(struct
    if (mem_cgroup_subsys.disabled)
    return 0;

    - lock_page_cgroup(page);
    - pc = page_get_page_cgroup(page);
    - if (pc) {
    + pc = lookup_page_cgroup(page);
    + lock_page_cgroup(pc);
    + if (PageCgroupUsed(pc)) {
    mem = pc->mem_cgroup;
    css_get(&mem->css);
    if (PageCgroupCache(pc)) {
    @@ -890,7 +771,7 @@ int mem_cgroup_prepare_migration(struct
    ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
    }
    }
    - unlock_page_cgroup(page);
    + unlock_page_cgroup(pc);
    if (mem) {
    ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
    ctype, mem);
    @@ -914,8 +795,7 @@ void mem_cgroup_end_migration(struct pag
    * care it.
    */
    if (!newpage->mapping)
    - __mem_cgroup_uncharge_common(newpage,
    - MEM_CGROUP_CHARGE_TYPE_FORCE);
    + __mem_cgroup_uncharge_common(newpage);
    else if (PageAnon(newpage))
    mem_cgroup_uncharge_page(newpage);
    }
    @@ -997,6 +877,8 @@ static void mem_cgroup_force_empty_list(
    spin_lock_irqsave(&mz->lru_lock, flags);
    list_for_each_entry_safe(pc, tmp, list, lru) {
    page = pc->page;
    + if (!PageCgroupUsed(pc))
    + continue;
    /* For avoiding race with speculative page cache handling. */
    if (!PageLRU(page) || !get_page_unless_zero(page)) {
    continue;
    @@ -1270,8 +1152,8 @@ mem_cgroup_create(struct cgroup_subsys *
    int node;

    if (unlikely((cont->parent) == NULL)) {
    + page_cgroup_init();
    mem = &init_mem_cgroup;
    - page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
    } else {
    mem = mem_cgroup_alloc();
    if (!mem)
    Index: mmotm-2.6.27-rc7+/mm/page_alloc.c
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/mm/page_alloc.c
    +++ mmotm-2.6.27-rc7+/mm/page_alloc.c
    @@ -44,7 +44,7 @@
    #include
    #include
    #include
    -#include
    +#include
    #include

    #include
    @@ -223,17 +223,12 @@ static inline int bad_range(struct zone

    static void bad_page(struct page *page)
    {
    - void *pc = page_get_page_cgroup(page);
    -
    printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
    "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
    current->comm, page, (int)(2*sizeof(unsigned long)),
    (unsigned long)page->flags, page->mapping,
    page_mapcount(page), page_count(page));
    - if (pc) {
    - printk(KERN_EMERG "cgroup:%p\n", pc);
    - page_reset_bad_cgroup(page);
    - }
    +
    printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
    KERN_EMERG "Backtrace:\n");
    dump_stack();
    @@ -472,7 +467,6 @@ static inline void free_pages_check(stru
    free_page_mlock(page);
    if (unlikely(page_mapcount(page) |
    (page->mapping != NULL) |
    - (page_get_page_cgroup(page) != NULL) |
    (page_count(page) != 0) |
    (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
    bad_page(page);
    @@ -609,7 +603,6 @@ static void prep_new_page(struct page *p
    {
    if (unlikely(page_mapcount(page) |
    (page->mapping != NULL) |
    - (page_get_page_cgroup(page) != NULL) |
    (page_count(page) != 0) |
    (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
    bad_page(page);
    @@ -3495,6 +3488,7 @@ static void __paginginit free_area_init_
    pgdat->nr_zones = 0;
    init_waitqueue_head(&pgdat->kswapd_wait);
    pgdat->kswapd_max_order = 0;
    + pgdat_page_cgroup_init(pgdat);

    for (j = 0; j < MAX_NR_ZONES; j++) {
    struct zone *zone = pgdat->node_zones + j;
    Index: mmotm-2.6.27-rc7+/include/linux/mmzone.h
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/include/linux/mmzone.h
    +++ mmotm-2.6.27-rc7+/include/linux/mmzone.h
    @@ -602,8 +602,11 @@ typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    int nr_zones;
    -#ifdef CONFIG_FLAT_NODE_MEM_MAP
    +#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    struct page *node_mem_map;
    +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    + struct page_cgroup *node_page_cgroup;
    +#endif
    #endif
    struct bootmem_data *bdata;
    #ifdef CONFIG_MEMORY_HOTPLUG
    @@ -932,6 +935,7 @@ static inline unsigned long early_pfn_to
    #endif

    struct page;
    +struct page_cgroup;
    struct mem_section {
    /*
    * This is, logically, a pointer to an array of struct
    @@ -949,6 +953,14 @@ struct mem_section {

    /* See declaration of similar field in struct zone */
    unsigned long *pageblock_flags;
    +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    + /*
    + * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
    + * section. (see memcontrol.h/page_cgroup.h about this.)
    + */
    + struct page_cgroup *page_cgroup;
    + unsigned long pad;
    +#endif
    };

    #ifdef CONFIG_SPARSEMEM_EXTREME
    Index: mmotm-2.6.27-rc7+/include/linux/memcontrol.h
    ===================================================================
    --- mmotm-2.6.27-rc7+.orig/include/linux/memcontrol.h
    +++ mmotm-2.6.27-rc7+/include/linux/memcontrol.h
    @@ -29,7 +29,6 @@ struct mm_struct;

    #define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0)

    -extern struct page_cgroup *page_get_page_cgroup(struct page *page);
    extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
    gfp_t gfp_mask);
    extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
    @@ -72,16 +71,8 @@ extern void mem_cgroup_record_reclaim_pr
    extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
    int priority, enum lru_list lru);

    -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
    -static inline void page_reset_bad_cgroup(struct page *page)
    -{
    -}
    -
    -static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
    -{
    - return NULL;
    -}

    +#else /* CONFIG_CGROUP_MEM_RES_CTLR */
    static inline int mem_cgroup_charge(struct page *page,
    struct mm_struct *mm, gfp_t gfp_mask)
    {

