[PATCH 0/12] memcg updates v5 - Kernel

This is a discussion on [PATCH 0/12] memcg updates v5 - Kernel ; adjustment for changes in 9/12(fixed) == Free page_cgroup from its LRU in batched manner. When uncharge() is called, the page is pushed onto a per-cpu vector and removed from the LRU later. This routine resembles the global LRU's pagevec. This patch is half ...

+ Reply to Thread
Page 2 of 4 FirstFirst 1 2 3 4 LastLast
Results 21 to 40 of 69

Thread: [PATCH 0/12] memcg updates v5

  1. [PATCH(fixed) 10/12] free page cgroup from LRU in lazy

    adjustment for changes in 9/12(fixed)
    ==
    Free page_cgroup from its LRU in batched manner.

    When uncharge() is called, the page is pushed onto a per-cpu vector and
    removed from the LRU later. This routine resembles the global LRU's pagevec.
    This patch is half of the whole change and forms a set with the following
    lazy LRU add patch.

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memcontrol.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++ +-----
    1 file changed, 152 insertions(+), 12 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -36,6 +36,7 @@
    #include
    #include
    #include
    +#include

    #include

    @@ -531,6 +532,116 @@ out:
    return ret;
    }

    +
    +#define MEMCG_PCPVEC_SIZE (14) /* size of pagevec */
    +struct memcg_percpu_vec {
    + int nr;
    + int limit;
    + struct page_cgroup *vec[MEMCG_PCPVEC_SIZE];
    +};
    +static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_free_vec);
    +
    +static void
    +__release_page_cgroup(struct memcg_percpu_vec *mpv)
    +{
    + unsigned long flags;
    + struct mem_cgroup_per_zone *mz, *prev_mz;
    + struct page_cgroup *pc;
    + int i, nr;
    +
    + local_irq_save(flags);
    + nr = mpv->nr;
    + mpv->nr = 0;
    + prev_mz = NULL;
    + for (i = nr - 1; i >= 0; i--) {
    + pc = mpv->vec[i];
    + VM_BUG_ON(PageCgroupUsed(pc));
    + mz = page_cgroup_zoneinfo(pc);
    + if (prev_mz != mz) {
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + prev_mz = mz;
    + spin_lock(&mz->lru_lock);
    + }
    + __mem_cgroup_remove_list(mz, pc);
    + css_put(&pc->mem_cgroup->css);
    + pc->mem_cgroup = NULL;
    + }
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + local_irq_restore(flags);
    +
    +}
    +
    +static void
    +release_page_cgroup(struct page_cgroup *pc)
    +{
    + struct memcg_percpu_vec *mpv;
    +
    + mpv = &get_cpu_var(memcg_free_vec);
    + mpv->vec[mpv->nr++] = pc;
    + if (mpv->nr >= mpv->limit)
    + __release_page_cgroup(mpv);
    + put_cpu_var(memcg_free_vec);
    +}
    +
    +static void page_cgroup_start_cache_cpu(int cpu)
    +{
    + struct memcg_percpu_vec *mpv;
    + mpv = &per_cpu(memcg_free_vec, cpu);
    + mpv->limit = MEMCG_PCPVEC_SIZE;
    +}
    +
    +#ifdef CONFIG_HOTPLUG_CPU
    +static void page_cgroup_stop_cache_cpu(int cpu)
    +{
    + struct memcg_percpu_vec *mpv;
    + mpv = &per_cpu(memcg_free_vec, cpu);
    + mpv->limit = 0;
    +}
    +#endif
    +
    +
    +/*
    + * Used when freeing memory resource controller to remove all
    + * page_cgroup (in obsolete list).
    + */
    +static DEFINE_MUTEX(memcg_force_drain_mutex);
    +
    +static void drain_page_cgroup_local(struct work_struct *work)
    +{
    + struct memcg_percpu_vec *mpv;
    + mpv = &get_cpu_var(memcg_free_vec);
    + __release_page_cgroup(mpv);
    + put_cpu_var(mpv);
    +}
    +
    +static void drain_page_cgroup_cpu(int cpu)
    +{
    + int local_cpu;
    + struct work_struct work;
    +
    + local_cpu = get_cpu();
    + if (local_cpu == cpu) {
    + drain_page_cgroup_local(NULL);
    + put_cpu();
    + return;
    + }
    + put_cpu();
    +
    + INIT_WORK(&work, drain_page_cgroup_local);
    + schedule_work_on(cpu, &work);
    + flush_work(&work);
    +}
    +
    +static void drain_page_cgroup_all(void)
    +{
    + mutex_lock(&memcg_force_drain_mutex);
    + schedule_on_each_cpu(drain_page_cgroup_local);
    + mutex_unlock(&memcg_force_drain_mutex);
    +}
    +
    +
    /*
    * Charge the memory controller for page usage.
    * Return
    @@ -701,8 +812,6 @@ __mem_cgroup_uncharge_common(struct page
    {
    struct page_cgroup *pc;
    struct mem_cgroup *mem;
    - struct mem_cgroup_per_zone *mz;
    - unsigned long flags;

    if (mem_cgroup_subsys.disabled)
    return;
    @@ -715,16 +824,10 @@ __mem_cgroup_uncharge_common(struct page
    lock_page_cgroup(pc);
    ClearPageCgroupUsed(pc);
    unlock_page_cgroup(pc);
    + preempt_enable();

    mem = pc->mem_cgroup;
    - mz = page_cgroup_zoneinfo(pc);
    -
    - spin_lock_irqsave(&mz->lru_lock, flags);
    - __mem_cgroup_remove_list(mz, pc);
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - pc->mem_cgroup = NULL;
    - css_put(&mem->css);
    - preempt_enable();
    + release_page_cgroup(pc);
    res_counter_uncharge(&mem->res, PAGE_SIZE);

    return;
    @@ -880,9 +983,8 @@ static void mem_cgroup_force_empty_list(
    if (!PageCgroupUsed(pc))
    continue;
    /* For avoiding race with speculative page cache handling. */
    - if (!PageLRU(page) || !get_page_unless_zero(page)) {
    + if (!PageLRU(page) || !get_page_unless_zero(page))
    continue;
    - }
    mem_cgroup_move_account(page, pc, mem, &init_mem_cgroup);
    put_page(page);
    if (atomic_read(&mem->css.cgroup->count) > 0)
    @@ -919,8 +1021,10 @@ static int mem_cgroup_force_empty(struct
    * While walking our own LRU, we also checks LRU bit on page.
    * If a page is on pagevec, it's not on LRU and we cannot
    * grab it. Calling lru_add_drain_all() here.
    + * memory cgroup's its own vector shold be flushed, too.
    */
    lru_add_drain_all();
    + drain_page_cgroup_all();
    for_each_node_state(node, N_HIGH_MEMORY) {
    for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    mz = mem_cgroup_zoneinfo(mem, node, zid);
    @@ -1144,6 +1248,38 @@ static void mem_cgroup_free(struct mem_c
    vfree(mem);
    }

    +static void mem_cgroup_init_pcp(int cpu)
    +{
    + page_cgroup_start_cache_cpu(cpu);
    +}
    +
    +static int cpu_memcgroup_callback(struct notifier_block *nb,
    + unsigned long action, void *hcpu)
    +{
    + int cpu = (long)hcpu;
    +
    + switch(action) {
    + case CPU_UP_PREPARE:
    + case CPU_UP_PREPARE_FROZEN:
    + mem_cgroup_init_pcp(cpu);
    + break;
    +#ifdef CONFIG_HOTPLUG_CPU
    + case CPU_DOWN_PREPARE:
    + case CPU_DOWN_PREPARE_FROZEN:
    + page_cgroup_stop_cache_cpu(cpu);
    + drain_page_cgroup_cpu(cpu);
    + break;
    +#endif
    + default:
    + break;
    + }
    + return NOTIFY_OK;
    +}
    +
    +static struct notifier_block __refdata memcgroup_nb =
    +{
    + .notifier_call = cpu_memcgroup_callback,
    +};

    static struct cgroup_subsys_state *
    mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
    @@ -1154,6 +1290,10 @@ mem_cgroup_create(struct cgroup_subsys *
    if (unlikely((cont->parent) == NULL)) {
    page_cgroup_init();
    mem = &init_mem_cgroup;
    + cpu_memcgroup_callback(&memcgroup_nb,
    + (unsigned long)CPU_UP_PREPARE,
    + (void *)(long)smp_processor_id());
    + register_hotcpu_notifier(&memcgroup_nb);
    } else {
    mem = mem_cgroup_alloc();
    if (!mem)

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  2. [PATCH(fixed) 11/12] free page cgroup from LRU in add

    Fixed HUNK with 9/12(fixed)
    ==
    Delay add_to_lru() and do it in a batched manner, like pagevec.
    For doing that, 2 flags are used: PCG_USED and PCG_LRU.

    If PCG_LRU is set, the page is on the LRU. It is safe to access the LRU via
    page_cgroup. (under some lock.)

    For avoiding race, this patch uses TestSetPageCgroupUsed().
    and checking PCG_USED bit and PCG_LRU bit in add/free vector.
    By this, lock_page_cgroup() in mem_cgroup_charge() is removed.

    (I don't want to call lock_page_cgroup() under mz->lru_lock when running the
    add/free vector core logic. So, TestSetPageCgroupUsed() logic is added.
    TestSet is an easy way to avoid unnecessary nesting of locks.)


    Changelog: v3 -> v5.
    - removed css_get/put per page_cgroup struct.
    Now, *new* force_empty checks there is page_cgroup on the memcg.
    We don't need to be afraid of leak.

    Changelog: v2 -> v3
    - added TRANSIT flag and removed lock from core logic.
    Changelog: v1 -> v2:
    - renamed function name from use_page_cgroup to set_page_cgroup_lru().

    Signed-off-by: KAMEZAWA Hiroyuki

    include/linux/page_cgroup.h | 10 +++
    mm/memcontrol.c | 121 +++++++++++++++++++++++++++++++-------------
    2 files changed, 96 insertions(+), 35 deletions(-)

    Index: mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/include/linux/page_cgroup.h
    +++ mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
    @@ -24,6 +24,7 @@ enum {
    PCG_LOCK, /* page cgroup is locked */
    PCG_CACHE, /* charged as cache */
    PCG_USED, /* this object is in use. */
    + PCG_LRU, /* this is on LRU */
    /* flags for LRU placement */
    PCG_ACTIVE, /* page is active in this cgroup */
    PCG_FILE, /* page is file system backed */
    @@ -42,11 +43,20 @@ static inline void SetPageCgroup##uname(
    static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
    { clear_bit(PCG_##lname, &pc->flags); }

    +#define TESTSETPCGFLAG(uname, lname)\
    +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \
    + { return test_and_set_bit(PCG_##lname, &pc->flags); }
    +
    /* Cache flag is set only once (at allocation) */
    TESTPCGFLAG(Cache, CACHE)

    TESTPCGFLAG(Used, USED)
    CLEARPCGFLAG(Used, USED)
    +TESTSETPCGFLAG(Used, USED)
    +
    +TESTPCGFLAG(LRU, LRU)
    +SETPCGFLAG(LRU, LRU)
    +CLEARPCGFLAG(LRU, LRU)

    /* LRU management flags (from global-lru definition) */
    TESTPCGFLAG(File, FILE)
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -149,9 +149,9 @@ enum charge_type {

    static const unsigned long
    pcg_default_flags[NR_CHARGE_TYPE] = {
    - (1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
    - (1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
    - (1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
    + (1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED),
    + (1 << PCG_ACTIVE) | (1 << PCG_USED),
    + (1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED),
    };

    /*
    @@ -193,7 +193,6 @@ page_cgroup_zoneinfo(struct page_cgroup
    struct mem_cgroup *mem = pc->mem_cgroup;
    int nid = page_cgroup_nid(pc);
    int zid = page_cgroup_zid(pc);
    -
    return mem_cgroup_zoneinfo(mem, nid, zid);
    }

    @@ -341,7 +340,7 @@ void mem_cgroup_move_lists(struct page *
    if (!trylock_page_cgroup(pc))
    return;

    - if (PageCgroupUsed(pc)) {
    + if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    @@ -500,6 +499,9 @@ int mem_cgroup_move_account(struct page
    from_mz = mem_cgroup_zoneinfo(from, nid, zid);
    to_mz = mem_cgroup_zoneinfo(to, nid, zid);

    + if (!PageCgroupLRU(pc))
    + return ret;
    +
    if (res_counter_charge(&to->res, PAGE_SIZE)) {
    /* Now, we assume no_limit...no failure here. */
    return ret;
    @@ -516,10 +518,8 @@ int mem_cgroup_move_account(struct page

    if (spin_trylock(&to_mz->lru_lock)) {
    __mem_cgroup_remove_list(from_mz, pc);
    - css_put(&from->css);
    res_counter_uncharge(&from->res, PAGE_SIZE);
    pc->mem_cgroup = to;
    - css_get(&to->css);
    __mem_cgroup_add_list(to_mz, pc);
    ret = 0;
    spin_unlock(&to_mz->lru_lock);
    @@ -540,6 +540,7 @@ struct memcg_percpu_vec {
    struct page_cgroup *vec[MEMCG_PCPVEC_SIZE];
    };
    static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_free_vec);
    +static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_add_vec);

    static void
    __release_page_cgroup(struct memcg_percpu_vec *mpv)
    @@ -555,7 +556,6 @@ __release_page_cgroup(struct memcg_percp
    prev_mz = NULL;
    for (i = nr - 1; i >= 0; i--) {
    pc = mpv->vec[i];
    - VM_BUG_ON(PageCgroupUsed(pc));
    mz = page_cgroup_zoneinfo(pc);
    if (prev_mz != mz) {
    if (prev_mz)
    @@ -563,9 +563,10 @@ __release_page_cgroup(struct memcg_percp
    prev_mz = mz;
    spin_lock(&mz->lru_lock);
    }
    - __mem_cgroup_remove_list(mz, pc);
    - css_put(&pc->mem_cgroup->css);
    - pc->mem_cgroup = NULL;
    + if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    + __mem_cgroup_remove_list(mz, pc);
    + ClearPageCgroupLRU(pc);
    + }
    }
    if (prev_mz)
    spin_unlock(&prev_mz->lru_lock);
    @@ -574,10 +575,43 @@ __release_page_cgroup(struct memcg_percp
    }

    static void
    +__set_page_cgroup_lru(struct memcg_percpu_vec *mpv)
    +{
    + unsigned long flags;
    + struct mem_cgroup_per_zone *mz, *prev_mz;
    + struct page_cgroup *pc;
    + int i, nr;
    +
    + local_irq_save(flags);
    + nr = mpv->nr;
    + mpv->nr = 0;
    + prev_mz = NULL;
    +
    + for (i = nr - 1; i >= 0; i--) {
    + pc = mpv->vec[i];
    + mz = page_cgroup_zoneinfo(pc);
    + if (prev_mz != mz) {
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + prev_mz = mz;
    + spin_lock(&mz->lru_lock);
    + }
    + if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    + SetPageCgroupLRU(pc);
    + __mem_cgroup_add_list(mz, pc);
    + }
    + }
    +
    + if (prev_mz)
    + spin_unlock(&prev_mz->lru_lock);
    + local_irq_restore(flags);
    +
    +}
    +
    +static void
    release_page_cgroup(struct page_cgroup *pc)
    {
    struct memcg_percpu_vec *mpv;
    -
    mpv = &get_cpu_var(memcg_free_vec);
    mpv->vec[mpv->nr++] = pc;
    if (mpv->nr >= mpv->limit)
    @@ -585,11 +619,25 @@ release_page_cgroup(struct page_cgroup *
    put_cpu_var(memcg_free_vec);
    }

    +static void
    +set_page_cgroup_lru(struct page_cgroup *pc)
    +{
    + struct memcg_percpu_vec *mpv;
    +
    + mpv = &get_cpu_var(memcg_add_vec);
    + mpv->vec[mpv->nr++] = pc;
    + if (mpv->nr >= mpv->limit)
    + __set_page_cgroup_lru(mpv);
    + put_cpu_var(memcg_add_vec);
    +}
    +
    static void page_cgroup_start_cache_cpu(int cpu)
    {
    struct memcg_percpu_vec *mpv;
    mpv = &per_cpu(memcg_free_vec, cpu);
    mpv->limit = MEMCG_PCPVEC_SIZE;
    + mpv = &per_cpu(memcg_add_vec, cpu);
    + mpv->limit = MEMCG_PCPVEC_SIZE;
    }

    #ifdef CONFIG_HOTPLUG_CPU
    @@ -598,6 +646,8 @@ static void page_cgroup_stop_cache_cpu(i
    struct memcg_percpu_vec *mpv;
    mpv = &per_cpu(memcg_free_vec, cpu);
    mpv->limit = 0;
    + mpv = &per_cpu(memcg_add_vec, cpu);
    + mpv->limit = 0;
    }
    #endif

    @@ -611,6 +661,9 @@ static DEFINE_MUTEX(memcg_force_drain_mu
    static void drain_page_cgroup_local(struct work_struct *work)
    {
    struct memcg_percpu_vec *mpv;
    + mpv = &get_cpu_var(memcg_add_vec);
    + __set_page_cgroup_lru(mpv);
    + put_cpu_var(mpv);
    mpv = &get_cpu_var(memcg_free_vec);
    __release_page_cgroup(mpv);
    put_cpu_var(mpv);
    @@ -677,14 +730,9 @@ static int mem_cgroup_charge_common(stru
    rcu_read_unlock();
    return 0;
    }
    - /*
    - * For every charge from the cgroup, increment reference count
    - */
    - css_get(&mem->css);
    rcu_read_unlock();
    } else {
    mem = memcg;
    - css_get(&memcg->css);
    }

    while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
    @@ -711,33 +759,36 @@ static int mem_cgroup_charge_common(stru
    }

    preempt_disable();
    - lock_page_cgroup(pc);
    - if (unlikely(PageCgroupUsed(pc))) {
    - unlock_page_cgroup(pc);
    + if (TestSetPageCgroupUsed(pc)) {
    res_counter_uncharge(&mem->res, PAGE_SIZE);
    - css_put(&mem->css);
    preempt_enable();
    goto done;
    }
    - pc->mem_cgroup = mem;
    /*
    - * If a page is accounted as a page cache, insert to inactive list.
    - * If anon, insert to active list.
    - */
    - pc->flags = pcg_default_flags[ctype];
    -
    - mz = page_cgroup_zoneinfo(pc);
    + * page cgroup is *unused* now....but....
    + * We can assume old mem_cgroup's metadata is still available
    + * because pc is not on stale LRU after force_empty() is called.
    + */
    + if (likely(!PageCgroupLRU(pc)))
    + pc->flags = pcg_default_flags[ctype];
    + else {
    + mz = page_cgroup_zoneinfo(pc);
    + spin_lock_irqsave(&mz->lru_lock, flags);
    + if (PageCgroupLRU(pc)) {
    + __mem_cgroup_remove_list(mz, pc);
    + ClearPageCgroupLRU(pc);
    + }
    + pc->flags = pcg_default_flags[ctype];
    + spin_unlock_irqrestore(&mz->lru_lock, flags);
    + }

    - spin_lock_irqsave(&mz->lru_lock, flags);
    - __mem_cgroup_add_list(mz, pc);
    - spin_unlock_irqrestore(&mz->lru_lock, flags);
    - unlock_page_cgroup(pc);
    + pc->mem_cgroup = mem;
    + set_page_cgroup_lru(pc);
    preempt_enable();

    done:
    return 0;
    out:
    - css_put(&mem->css);
    return -ENOMEM;
    }

    @@ -823,12 +874,12 @@ __mem_cgroup_uncharge_common(struct page
    preempt_disable();
    lock_page_cgroup(pc);
    ClearPageCgroupUsed(pc);
    + mem = pc->mem_cgroup;
    unlock_page_cgroup(pc);
    preempt_enable();
    + res_counter_uncharge(&mem->res, PAGE_SIZE);

    - mem = pc->mem_cgroup;
    release_page_cgroup(pc);
    - res_counter_uncharge(&mem->res, PAGE_SIZE);

    return;
    }

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  3. Re: [PATCH 0/12] memcg updates v5

    On Thu, 25 Sep 2008 15:11:24 +0900, KAMEZAWA Hiroyuki wrote:
    > Hi, I updated the stack and reflected comments.
    > Against the latest mmotm. (rc7-mm1)
    >
    > Major changes from previous one is
    > - page_cgroup allocation/lookup manner is changed.
    > all FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG is supported.
    > - force_empty is totally rewritten. and a problem that "force_empty takes long time"
    > in previous version is fixed (I think...)
    > - reordered patches.
    > - first half are easy ones.
    > - second half are big ones.
    >
    > I'm still testing with full debug option. No problem found yet.
    > (I'm afraid of race condition which have not been caught yet.)
    >
    > [1/12] avoid accounting special mappings not on LRU. (fix)
    > [2/12] move charege() call to swapped-in page under lock_page() (clean up)
    > [3/12] make root cgroup to be unlimited. (change semantics.)
    > [4/12] make page->mapping NULL before calling uncharge (clean up)
    > [5/12] make page->flags to use atomic ops. (changes in infrastructure)
    > [6/12] optimize stat. (clean up)
    > [7/12] add support function for moving account. (new function)
    > [8/12] rewrite force_empty to use move_account. (change semantics.)
    > [9/12] allocate all page_cgroup at boot. (changes in infrastructure)
    > [10/12] free page_cgroup from LRU in lazy way (optimize)
    > [11/12] add page_cgroup to LRU in lazy way (optimize)
    > [12/12] fix race at charging swap (fix by new logic.)
    >
    > *Any* comment is welcome.
    >
    > Thanks,
    > -Kame
    >


    I got general protection fault.

    (log from dump)
    general protection fault: 0000 [1] SMP
    last sysfs file: /sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_map
    CPU 0
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug serio_raw rtc_cmos parport_pc rtc_core parport rtc_lib i2c_i801 i2c_core pcspkr shpchp
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 8001, comm: shmem_test_02 Tainted: G W 2.6.27-rc7-mm1-7eacf5c9 #1
    RIP: 0010:[] [] __mem_cgroup_move_lists+0x8b/0xa2
    RSP: 0018:ffff8800bb4ad888 EFLAGS: 00010046
    RAX: ffff88010b253080 RBX: ffff88010c67d618 RCX: dead000000100100
    RDX: dead000000200200 RSI: ffff88010b253088 RDI: ffff88010c67d630
    RBP: 0000000000000000 R08: ffff88010fc020a3 R09: 000000000000000f
    R10: ffffffff802a204a R11: 00000000fffffffa R12: ffff88010b253080
    R13: 0000000000000000 R14: ffff8800bb4ad9c8 R15: 0000000000000000
    FS: 00007f4600faa6f0(0000) GS:ffffffff80638900(0000) knlGS:0000000000000000
    CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
    CR2: 00000033af86c027 CR3: 00000000c1549000 CR4: 00000000000006e0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
    Process shmem_test_02 (pid: 8001, threadinfo ffff8800bb4ac000, task ffff880107d21470)
    Stack: ffffe200028ef8b0 0000000000000082 ffff88010c67d618 ffffffff802a1cb9
    ffff880000016f80 0000000000000000 ffff880000016f80 ffffe200028ef888
    ffff8800bb4adb38 ffffffff8027dd09 ffffc20001859000 0000000000000000
    Call Trace:
    [] mem_cgroup_move_lists+0x50/0x74
    [] shrink_list+0x443/0x4ff
    [] shrink_zone+0x289/0x315
    [] congestion_wait+0x74/0x80
    [] autoremove_wake_function+0x0/0x2e
    [] do_try_to_free_pages+0x259/0x3e3
    [] try_to_free_mem_cgroup_pages+0x80/0x85
    [] mem_cgroup_isolate_pages+0x0/0x1d2
    [] mem_cgroup_shrink_usage+0x60/0xba
    [] shmem_getpage+0x455/0x7a0
    [] target_load+0x2a/0x58
    [] place_entity+0x85/0xb3
    [] enqueue_entity+0x16e/0x18f
    [] enqueue_task_fair+0x24/0x3a
    [] enqueue_task+0x50/0x5b
    [] try_to_wake_up+0x241/0x253
    [] autoremove_wake_function+0x9/0x2e
    [] __wake_up_common+0x41/0x74
    [] __wake_up+0x38/0x4f
    [] shmem_fault+0x3b/0x68
    [] __do_fault+0x51/0x3fb
    [] handle_mm_fault+0x1d6/0x791
    [] do_page_fault+0x39c/0x773
    [] do_page_fault+0x3db/0x773
    [] error_exit+0x0/0x51
    Code: 0b 10 eb 04 f0 80 23 ef f0 80 23 bf 89 e8 48 8d 7b 18 48 ff 44 c6 58 48 c1 e0 04 48
    8b 4b 18 48 8b 57 08 48 8d 04 06 48 8d 70 08 <48> 89 51 08 48 89 0a 48 8b 50 08 59 5b 5d e
    9 75 f4 09 00 58 5b
    RIP [] __mem_cgroup_move_lists+0x8b/0xa2
    RSP
    ---[ end trace 4eaa2a86a8e2da22 ]---

    I've not investigated deeply yet, but it seems that it is trying to
    handle an entry which has already been removed from the list.
    (I can see some "dead" poison pointers in the registers.)

    I was running some ltp tests (4 "page01" tests(8MB for each)
    and 1 "shmem_test02" test(16MB)) in a group with limit=32M.


    Anyway, I'll dig it more later.


    Thanks,
    Daisuke Nishimura.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  4. Re: [PATCH 0/12] memcg updates v5

    On Fri, 26 Sep 2008 11:32:28 +0900
    Daisuke Nishimura wrote:

    > I got general protection fault.
    >
    > (log from dump)
    > general protection fault: 0000 [1] SMP
    > last sysfs file: /sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_map


    > Pid: 8001, comm: shmem_test_02 Tainted: G W 2.6.27-rc7-mm1-7eacf5c9 #1
    > RIP: 0010:[] [] __mem_cgroup_move_lists+0x8b/0xa2
    > RSP: 0018:ffff8800bb4ad888 EFLAGS: 00010046
    > RAX: ffff88010b253080 RBX: ffff88010c67d618 RCX: dead000000100100
    > RDX: dead000000200200 RSI: ffff88010b253088 RDI: ffff88010c67d630
    > RBP: 0000000000000000 R08: ffff88010fc020a3 R09: 000000000000000f
    > R10: ffffffff802a204a R11: 00000000fffffffa R12: ffff88010b253080
    > R13: 0000000000000000 R14: ffff8800bb4ad9c8 R15: 0000000000000000
    > FS: 00007f4600faa6f0(0000) GS:ffffffff80638900(0000) knlGS:0000000000000000
    > CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
    > CR2: 00000033af86c027 CR3: 00000000c1549000 CR4: 00000000000006e0
    > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
    > Process shmem_test_02 (pid: 8001, threadinfo ffff8800bb4ac000, task ffff880107d21470)
    > Stack: ffffe200028ef8b0 0000000000000082 ffff88010c67d618 ffffffff802a1cb9
    > ffff880000016f80 0000000000000000 ffff880000016f80 ffffe200028ef888
    > ffff8800bb4adb38 ffffffff8027dd09 ffffc20001859000 0000000000000000
    > Call Trace:
    > [] mem_cgroup_move_lists+0x50/0x74
    > [] shrink_list+0x443/0x4ff
    > [] shrink_zone+0x289/0x315
    > [] congestion_wait+0x74/0x80
    > [] autoremove_wake_function+0x0/0x2e
    > [] do_try_to_free_pages+0x259/0x3e3
    > [] try_to_free_mem_cgroup_pages+0x80/0x85
    > [] mem_cgroup_isolate_pages+0x0/0x1d2


    > Code: 0b 10 eb 04 f0 80 23 ef f0 80 23 bf 89 e8 48 8d 7b 18 48 ff 44 c6 58 48 c1 e0 04 48
    > 8b 4b 18 48 8b 57 08 48 8d 04 06 48 8d 70 08 <48> 89 51 08 48 89 0a 48 8b 50 08 59 5b 5d e
    > 9 75 f4 09 00 58 5b
    > RIP [] __mem_cgroup_move_lists+0x8b/0xa2
    > RSP
    > ---[ end trace 4eaa2a86a8e2da22 ]---
    >
    > I've not investigated deeply yet, but it seems that it is trying to
    > handle an entry which has been already removed from list.
    > (I can see some "dead" pointer in registers.)
    >
    > I was running some ltp tests (4 "page01" tests(8MB for each)
    > and 1 "shmem_test02" test(16MB)) in a group with limit=32M.
    >
    >
    > Anyway, I'll dig it more later.
    >

    Thank you.

    How about following ?
    -Kame
    ==
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -597,8 +597,8 @@ __set_page_cgroup_lru(struct memcg_percp
    spin_lock(&mz->lru_lock);
    }
    if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    - SetPageCgroupLRU(pc);
    __mem_cgroup_add_list(mz, pc);
    + SetPageCgroupLRU(pc);
    }
    }







    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  5. Re: [PATCH 0/12] memcg updates v5

    On Fri, 26 Sep 2008 11:58:10 +0900
    KAMEZAWA Hiroyuki wrote:

    > Thank you.
    >
    > How about following ?
    > -Kame
    > ==
    > Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    > ================================================== =================
    > --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    > +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    > @@ -597,8 +597,8 @@ __set_page_cgroup_lru(struct memcg_percp
    > spin_lock(&mz->lru_lock);
    > }
    > if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    > - SetPageCgroupLRU(pc);
    > __mem_cgroup_add_list(mz, pc);
    > + SetPageCgroupLRU(pc);
    > }
    > }
    >

    Of course, remove side should be..
    -Kame
    ==
    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -564,8 +564,8 @@ __release_page_cgroup(struct memcg_percp
    spin_lock(&mz->lru_lock);
    }
    if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    - __mem_cgroup_remove_list(mz, pc);
    ClearPageCgroupLRU(pc);
    + __mem_cgroup_remove_list(mz, pc);
    }
    }
    if (prev_mz)
    @@ -597,8 +597,8 @@ __set_page_cgroup_lru(struct memcg_percp
    spin_lock(&mz->lru_lock);
    }
    if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    - SetPageCgroupLRU(pc);
    __mem_cgroup_add_list(mz, pc);
    + SetPageCgroupLRU(pc);
    }
    }


    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  6. Re: [PATCH 0/12] memcg updates v5

    On Fri, 26 Sep 2008 12:04:08 +0900, KAMEZAWA Hiroyuki wrote:
    > On Fri, 26 Sep 2008 11:58:10 +0900
    > KAMEZAWA Hiroyuki wrote:
    >
    > > Thank you.
    > >
    > > How about following ?
    > > -Kame
    > > ==
    > > Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    > > ================================================== =================
    > > --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    > > +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    > > @@ -597,8 +597,8 @@ __set_page_cgroup_lru(struct memcg_percp
    > > spin_lock(&mz->lru_lock);
    > > }
    > > if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    > > - SetPageCgroupLRU(pc);
    > > __mem_cgroup_add_list(mz, pc);
    > > + SetPageCgroupLRU(pc);
    > > }
    > > }
    > >

    > Of course, remove side should be..
    > -Kame
    > ==
    > Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    > ================================================== =================
    > --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    > +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    > @@ -564,8 +564,8 @@ __release_page_cgroup(struct memcg_percp
    > spin_lock(&mz->lru_lock);
    > }
    > if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    > - __mem_cgroup_remove_list(mz, pc);
    > ClearPageCgroupLRU(pc);
    > + __mem_cgroup_remove_list(mz, pc);
    > }
    > }
    > if (prev_mz)
    > @@ -597,8 +597,8 @@ __set_page_cgroup_lru(struct memcg_percp
    > spin_lock(&mz->lru_lock);
    > }
    > if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    > - SetPageCgroupLRU(pc);
    > __mem_cgroup_add_list(mz, pc);
    > + SetPageCgroupLRU(pc);
    > }
    > }
    >
    >


    I'll test it with updated version of 9-11 and report you back.


    Thanks,
    Daisuke Nishimura.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  7. Re: [PATCH 0/12] memcg updates v5

    On Fri, 26 Sep 2008 12:00:19 +0900
    Daisuke Nishimura wrote:

    > I'll test it with updated version of 9-11 and report you back.
    >

    Thank you. below is the new one...(Sorry!)

    -Kame
    ==
    Check LRU bit under lru_lock.

    Signed-off-by: KAMEZAWA Hiroyuki

    mm/memcontrol.c | 9 +++++----
    1 file changed, 5 insertions(+), 4 deletions(-)

    Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    ================================================== =================
    --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    @@ -340,11 +340,12 @@ void mem_cgroup_move_lists(struct page *
    if (!trylock_page_cgroup(pc))
    return;

    - if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    + if (PageCgroupUsed(pc)) {
    mem = pc->mem_cgroup;
    mz = page_cgroup_zoneinfo(pc);
    spin_lock_irqsave(&mz->lru_lock, flags);
    - __mem_cgroup_move_lists(pc, lru);
    + if (PageCgroupLRU(pc))
    + __mem_cgroup_move_lists(pc, lru);
    spin_unlock_irqrestore(&mz->lru_lock, flags);
    }
    unlock_page_cgroup(pc);
    @@ -564,8 +565,8 @@ __release_page_cgroup(struct memcg_percp
    spin_lock(&mz->lru_lock);
    }
    if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    - __mem_cgroup_remove_list(mz, pc);
    ClearPageCgroupLRU(pc);
    + __mem_cgroup_remove_list(mz, pc);
    }
    }
    if (prev_mz)
    @@ -597,8 +598,8 @@ __set_page_cgroup_lru(struct memcg_percp
    spin_lock(&mz->lru_lock);
    }
    if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    - SetPageCgroupLRU(pc);
    __mem_cgroup_add_list(mz, pc);
    + SetPageCgroupLRU(pc);
    }
    }




    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  8. Re: [PATCH 0/12] memcg updates v5

    On Fri, 26 Sep 2008 13:05:34 +0900, KAMEZAWA Hiroyuki wrote:
    > On Fri, 26 Sep 2008 12:00:19 +0900
    > Daisuke Nishimura wrote:
    >
    > > I'll test it with updated version of 9-11 and report you back.
    > >

    > Thank you. below is the new one...(Sorry!)
    >
    > -Kame
    > ==
    > Check LRU bit under lru_lock.
    >
    > Signed-off-by: KAMEZAWA Hiroyuki
    >
    > mm/memcontrol.c | 9 +++++----
    > 1 file changed, 5 insertions(+), 4 deletions(-)
    >
    > Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
    > ================================================== =================
    > --- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
    > +++ mmotm-2.6.27-rc7+/mm/memcontrol.c
    > @@ -340,11 +340,12 @@ void mem_cgroup_move_lists(struct page *
    > if (!trylock_page_cgroup(pc))
    > return;
    >
    > - if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    > + if (PageCgroupUsed(pc)) {
    > mem = pc->mem_cgroup;
    > mz = page_cgroup_zoneinfo(pc);
    > spin_lock_irqsave(&mz->lru_lock, flags);
    > - __mem_cgroup_move_lists(pc, lru);
    > + if (PageCgroupLRU(pc))
    > + __mem_cgroup_move_lists(pc, lru);
    > spin_unlock_irqrestore(&mz->lru_lock, flags);
    > }
    > unlock_page_cgroup(pc);
    > @@ -564,8 +565,8 @@ __release_page_cgroup(struct memcg_percp
    > spin_lock(&mz->lru_lock);
    > }
    > if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
    > - __mem_cgroup_remove_list(mz, pc);
    > ClearPageCgroupLRU(pc);
    > + __mem_cgroup_remove_list(mz, pc);
    > }
    > }
    > if (prev_mz)
    > @@ -597,8 +598,8 @@ __set_page_cgroup_lru(struct memcg_percp
    > spin_lock(&mz->lru_lock);
    > }
    > if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
    > - SetPageCgroupLRU(pc);
    > __mem_cgroup_add_list(mz, pc);
    > + SetPageCgroupLRU(pc);
    > }
    > }
    >
    >


    Unfortunately, some bugs still remain...

    ------------[ cut here ]------------
    WARNING: at lib/list_debug.c:51 list_del+0x5c/0x87()
    list_del corruption. next->prev should be ffff88010ca291e8, but was dead000000200200
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug parport_pc rtc_cmos rtc_core parport serio_raw rtc_lib i2c_i801 i2c_core shpchp pcspkr
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 3940, comm: bash Tainted: G W 2.6.27-rc7-mm1-dd8bf0fe #1
    Call Trace:
    [] warn_slowpath+0xb4/0xd2
    [] prepare_to_wait_exclusive+0x38/0x5a
    [] finish_wait+0x32/0x5d
    [] __wait_on_bit_lock+0x5b/0x66
    [] __lock_page+0x5e/0x64
    [] target_load+0x2a/0x58
    [] place_entity+0x85/0xb3
    [] enqueue_entity+0x16e/0x18f
    [] zone_statistics+0x3a/0x5d
    [] zone_statistics+0x3a/0x5d
    [] get_page_from_freelist+0x455/0x5bf
    [] list_del+0x5c/0x87
    [] mem_cgroup_commit_charge+0x6f/0xdd
    [] mem_cgroup_charge_common+0x4c/0x62
    [] handle_mm_fault+0x222/0x791
    [] zone_statistics+0x3a/0x5d
    [] follow_page+0x2d/0x2c2
    [] __get_user_pages+0x2f5/0x3f3
    [] get_arg_page+0x46/0xa5
    [] copy_strings+0xfc/0x1de
    [] copy_strings_kernel+0x21/0x33
    [] do_execve+0x140/0x256
    [] sys_execve+0x35/0x4c
    [] stub_execve+0x6a/0xc0
    ---[ end trace 4eaa2a86a8e2da22 ]---
    ------------[ cut here ]------------
    WARNING: at lib/list_debug.c:48 list_del+0x30/0x87()
    list_del corruption. prev->next should be ffff88010ca29210, but was dead000000100100
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug parport_pc rtc_cmos rtc_core parport serio_raw rtc_lib i2c_i801 i2c_core shpchp pcspkr
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 3940, comm: bash Tainted: G W 2.6.27-rc7-mm1-dd8bf0fe #1
    Call Trace:
    [] warn_slowpath+0xb4/0xd2
    [] __getblk+0x25/0x21f
    [] __ext3_journal_dirty_metadata+0x1e/0x46 [ext3]
    [] __wake_up+0x38/0x4f
    [] __mark_inode_dirty+0x15c/0x16b
    [] touch_atime+0x109/0x112
    [] mnt_drop_write+0x25/0xdc
    [] generic_file_aio_read+0x4b8/0x515
    [] list_del+0x30/0x87
    [] __release_page_cgroup+0x68/0x8a
    [] page_remove_rmap+0x10e/0x12e
    [] unmap_vmas+0x476/0x7f2
    [] exit_mmap+0xf0/0x176
    [] secure_ip_id+0x45/0x4a
    [] mmput+0x30/0x88
    [] flush_old_exec+0x487/0x77c
    [] vfs_read+0x11e/0x133
    [] load_elf_binary+0x338/0x16b6
    [] get_arg_page+0x46/0xa5
    [] copy_strings+0x1cd/0x1de
    [] search_binary_handler+0xb0/0x22e
    [] do_execve+0x1a8/0x256
    [] sys_execve+0x35/0x4c
    [] stub_execve+0x6a/0xc0
    ---[ end trace 4eaa2a86a8e2da22 ]---
    ------------[ cut here ]------------
    WARNING: at lib/list_debug.c:48 list_del+0x30/0x87()
    list_del corruption. prev->next should be ffff88010c937d50, but was ffff88010ca052e8
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug parport_pc rtc_cmos rtc_core parport serio_raw rtc_lib i2c_i801 i2c_core shpchp pcspkr
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 3943, comm: shmem_test_02 Tainted: G W 2.6.27-rc7-mm1-dd8bf0fe #1
    Call Trace:
    [] warn_slowpath+0xb4/0xd2
    [] shmem_getpage+0x75/0x7a0
    [] zone_statistics+0x3a/0x5d
    [] get_page_from_freelist+0x353/0x5bf
    [] zone_statistics+0x3a/0x5d
    [] get_page_from_freelist+0x353/0x5bf
    [] list_del+0x30/0x87
    [] mem_cgroup_commit_charge+0x6f/0xdd
    [] mem_cgroup_charge_common+0x4c/0x62
    [] do_wp_page+0x3ab/0x58a
    [] handle_mm_fault+0x735/0x791
    [] fcntl_setlk+0x233/0x263
    [] do_page_fault+0x39c/0x773
    [] error_exit+0x0/0x51
    ---[ end trace 4eaa2a86a8e2da22 ]---
    ------------[ cut here ]------------
    WARNING: at lib/list_debug.c:48 list_del+0x30/0x87()
    list_del corruption. prev->next should be ffff88010ca052e8, but was ffff88010c4068a0
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug parport_pc rtc_cmos rtc_core parport serio_raw rtc_lib i2c_i801 i2c_core shpchp pcspkr
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 3942, comm: shmem_test_02 Tainted: G W 2.6.27-rc7-mm1-dd8bf0fe #1
    Call Trace:
    [] warn_slowpath+0xb4/0xd2
    [] rmqueue_bulk+0x61/0x8b
    [] number+0x106/0x1f9
    [] zone_statistics+0x3a/0x5d
    [] get_page_from_freelist+0x353/0x5bf
    [] free_pages_bulk+0x198/0x20b
    [] __pagevec_free+0x21/0x2e
    [] release_pages+0x151/0x19f
    [] list_del+0x30/0x87
    [] __release_page_cgroup+0x68/0x8a
    [] __remove_from_page_cache+0x45/0x8f
    [] remove_from_page_cache+0x27/0x2f
    [] truncate_complete_page+0x49/0x59
    [] truncate_inode_pages_range+0xbd/0x2ff
    [] shmem_delete_inode+0x33/0xc4
    [] shmem_delete_inode+0x0/0xc4
    [] generic_delete_inode+0xb0/0x124
    [] d_kill+0x21/0x43
    [] dput+0x111/0x11f
    [] __fput+0x14f/0x17e
    [] remove_vma+0x3d/0x72
    [] exit_mmap+0x157/0x176
    [] mmput+0x30/0x88
    [] exit_mm+0xff/0x10a
    [] do_exit+0x210/0x7a5
    [] audit_syscall_entry+0x12d/0x160
    [] do_group_exit+0x66/0x96
    [] system_call_fastpath+0x16/0x1b
    ---[ end trace 4eaa2a86a8e2da22 ]---
    ------------[ cut here ]------------
    WARNING: at lib/list_debug.c:51 list_del+0x5c/0x87()
    list_del corruption. next->prev should be ffff88010caf6b20, but was dead000000200200
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug parport_pc rtc_cmos rtc_core parport serio_raw rtc_lib i2c_i801 i2c_core shpchp pcspkr
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 3932, comm: page01 Tainted: G W 2.6.27-rc7-mm1-dd8bf0fe #1
    Call Trace:
    [] warn_slowpath+0xb4/0xd2
    [] free_pages_bulk+0x198/0x20b
    [] release_pages+0x18d/0x19f
    [] list_del+0x5c/0x87
    [] __release_page_cgroup+0x68/0x8a
    [] page_remove_rmap+0x10e/0x12e
    [] unmap_vmas+0x476/0x7f2
    [] exit_mmap+0xf0/0x176
    [] mmput+0x30/0x88
    [] exit_mm+0xff/0x10a
    [] do_exit+0x210/0x7a5
    [] audit_syscall_entry+0x12d/0x160
    [] do_group_exit+0x66/0x96
    [] system_call_fastpath+0x16/0x1b
    ---[ end trace 4eaa2a86a8e2da22 ]---
    ------------[ cut here ]------------
    WARNING: at lib/list_debug.c:48 list_del+0x30/0x87()
    list_del corruption. prev->next should be ffff88010c4068a0, but was dead000000100100
    Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6
    autofs4 hidp rfcomm l2cap bluetooth sunrpc microcode dm_mirror dm_log dm_multipath dm_mod
    rfkill input_polldev sbs sbshc battery ac lp sg e1000 ide_cd_mod cdrom button acpi_memhotp
    lug parport_pc rtc_cmos rtc_core parport serio_raw rtc_lib i2c_i801 i2c_core shpchp pcspkr
    ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci
    _hcd
    Pid: 3934, comm: page01 Tainted: G W 2.6.27-rc7-mm1-dd8bf0fe #1
    Call Trace:
    [] warn_slowpath+0xb4/0xd2
    [] do_get_write_access+0x37d/0x3c3 [jbd]
    [] __getblk+0x25/0x21f
    [] bit_waitqueue+0x10/0xa0
    [] do_get_write_access+0x37d/0x3c3 [jbd]
    [] bit_waitqueue+0x10/0xa0
    [] find_get_page+0x18/0xc4
    [] bit_waitqueue+0x10/0xa0
    [] list_del+0x30/0x87
    [] __release_page_cgroup+0x68/0x8a
    [] page_remove_rmap+0x10e/0x12e
    [] unmap_vmas+0x476/0x7f2
    [] exit_mmap+0xf0/0x176
    [] mmput+0x30/0x88
    [] exit_mm+0xff/0x10a
    [] do_exit+0x210/0x7a5
    [] audit_syscall_entry+0x12d/0x160
    [] do_group_exit+0x66/0x96
    [] system_call_fastpath+0x16/0x1b
    ---[ end trace 4eaa2a86a8e2da22 ]---
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  9. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Fri, 26 Sep 2008 11:05:50 +0900, KAMEZAWA Hiroyuki wrote:
    > On Fri, 26 Sep 2008 10:43:36 +0900
    > KAMEZAWA Hiroyuki wrote:
    > > > > - /*
    > > > > - * Check if our page_cgroup is valid
    > > > > - */
    > > > > - lock_page_cgroup(page);
    > > > > - pc = page_get_page_cgroup(page);
    > > > > - if (unlikely(!pc))
    > > > > - goto unlock;
    > > > > -
    > > > > - VM_BUG_ON(pc->page != page);
    > > > > + pc = lookup_page_cgroup(page);
    > > > > + if (unlikely(!pc || !PageCgroupUsed(pc)))
    > > > > + return;
    > > > > + preempt_disable();
    > > > > + lock_page_cgroup(pc);
    > > > > + if (unlikely(page_mapped(page))) {
    > > > > + unlock_page_cgroup(pc);
    > > > > + preempt_enable();
    > > > > + return;
    > > > > + }
    > > > Just for clarification, in what sequence will the page be mapped here?
    > > > mem_cgroup_uncharge_page checks whether the page is mapped.
    > > >

    > > Please think about folloing situation.
    > >
    > > There is a SwapCache which is referred from 2 process, A, B.
    > > A maps it.
    > > B doesn't maps it.
    > >
    > > And now, process A exits.
    > >
    > > CPU0(process A) CPU1 (process B)
    > >
    > > zap_pte_range()
    > > => page remove from rmap => charge() (do_swap_page)
    > > => set page->mapcount->0
    > > => uncharge() => set page->mapcount=1
    > >
    > > This race is what patch 12/12 is fixed.
    > > This only happens on cursed SwapCache.
    > >

    > Sorry, my brain seems to be sleeping.. above page_mapped() check doesn't
    > help this situation. Maybe this page_mapped() check is not necessary
    > because it's of no use.
    >
    > I think this kind of problem will not be fixed until we handle SwapCache.
    >

    I've not fully understood yet what [12/12] does, but if we handle
    swapcache properly, [12/12] would become unnecessary?

    If so, how about handling swapcache instead of adding new interface?
    I think it can be done independent of mem+swap.


    Thanks,
    Daisuke Nishimura.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  10. Re: [PATCH 9/12] memcg allocate all page_cgroup at boot

    On Fri, 26 Sep 2008 14:54:22 +0900
    Daisuke Nishimura wrote:
    > > > There is a SwapCache which is referred from 2 process, A, B.
    > > > A maps it.
    > > > B doesn't maps it.
    > > >
    > > > And now, process A exits.
    > > >
    > > > CPU0(process A) CPU1 (process B)
    > > >
    > > > zap_pte_range()
    > > > => page remove from rmap => charge() (do_swap_page)
    > > > => set page->mapcount->0
    > > > => uncharge() => set page->mapcount=1
    > > >
    > > > This race is what patch 12/12 is fixed.
    > > > This only happens on cursed SwapCache.
    > > >

    > > Sorry, my brain seems to be sleeping.. above page_mapped() check doesn't
    > > help this situation. Maybe this page_mapped() check is not necessary
    > > because it's of no use.
    > >
    > > I think this kind of problem will not be fixed until we handle SwapCache.
    > >

    > I've not fully understood yet what [12/12] does, but if we handle
    > swapcache properly, [12/12] would become unnecessary?
    >

    Maybe yes. we treat swapcache under lock_page().

    > If so, how about handling swapcache instead of adding new interface?
    > I think it can be done independent of mem+swap.
    >

    Hmm, worth considering. But I'll reuse the interface itself for others
    (shmem, migrate, move_account etc.)
    But, in previous trial of SwapCache handling, we saw many troubles.
    Then, I'd like to go carefully step by step to handle that.

    Thanks,
    -Kame

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  11. Re: [PATCH 7/12] memcg add function to move account

    > @@ -444,9 +445,14 @@ void mem_cgroup_move_lists(struct page *
    >
    > pc = page_get_page_cgroup(page);
    > if (pc) {
    > + mem = pc->mem_cgroup;
    > mz = page_cgroup_zoneinfo(pc);
    > spin_lock_irqsave(&mz->lru_lock, flags);
    > - __mem_cgroup_move_lists(pc, lru);
    > + /*
    > + * check against the race with move_account.
    > + */
    > + if (likely(mem == pc->mem_cgroup))
    > + __mem_cgroup_move_lists(pc, lru);


    (snip)

    > @@ -754,16 +824,24 @@ __mem_cgroup_uncharge_common(struct page
    > if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    > && ((PageCgroupCache(pc) || page_mapped(page))))
    > goto unlock;
    > -
    > +retry:
    > + mem = pc->mem_cgroup;
    > mz = page_cgroup_zoneinfo(pc);
    > spin_lock_irqsave(&mz->lru_lock, flags);
    > + if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    > + unlikely(mem != pc->mem_cgroup)) {
    > + /* MAPPED account can be done without lock_page().
    > + Check race with mem_cgroup_move_account() */
    > + spin_unlock_irqrestore(&mz->lru_lock, flags);
    > + goto retry;
    > + }


    I'm sorry, but I've not yet been convinced why these checks are needed here.
    (Those checks are removed by [9/12] anyway.)

    IIUC, pc->mem_cgroup is moved to another group only by mem_cgroup_move_account
    under lock_page_cgroup( and mz->lru_lock).
    And those two above(mem_cgroup_move_lists and __mem_cgroup_uncharge_common) sets
    mem = pc->mem_cgroup under lock_page_cgroup, so I don't think those checks
    (mem != pc->mem_cgroup) is needed.


    Thanks,
    Daisuke Nishimura.

    > +/**
    > + * mem_cgroup_move_account - move account of the page
    > + * @page ... the target page of being moved.
    > + * @pc ... page_cgroup of the page.
    > + * @from ... mem_cgroup which the page is moved from.
    > + * @to ... mem_cgroup which the page is moved to.
    > + *
    > + * The caller must confirm following.
    > + * 1. disable irq.
    > + * 2. lru_lock of old mem_cgroup should be held.
    > + * 3. pc is guaranteed to be valid and on mem_cgroup's LRU.
    > + *
    > + * Because we cannot call try_to_free_page() here, the caller must guarantee
    > + * this moving of charge never fails. (if charge fails, this call fails.)
    > + * Currently this is called only against root cgroup.
    > + * which has no limitation of resource.
    > + * Returns 0 at success, returns 1 at failure.
    > + */
    > +int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc,
    > + struct mem_cgroup *from, struct mem_cgroup *to)
    > +{
    > + struct mem_cgroup_per_zone *from_mz, *to_mz;
    > + int nid, zid;
    > + int ret = 1;
    > +
    > + VM_BUG_ON(!irqs_disabled());
    > +
    > + nid = page_to_nid(page);
    > + zid = page_zonenum(page);
    > + from_mz = mem_cgroup_zoneinfo(from, nid, zid);
    > + to_mz = mem_cgroup_zoneinfo(to, nid, zid);
    > +
    > + if (res_counter_charge(&to->res, PAGE_SIZE)) {
    > + /* Now, we assume no_limit...no failure here. */
    > + return ret;
    > + }
    > + if (!try_lock_page_cgroup(page)) {
    > + res_counter_uncharge(&to->res, PAGE_SIZE);
    > + return ret;
    > + }
    > +
    > + if (page_get_page_cgroup(page) != pc) {
    > + res_counter_uncharge(&to->res, PAGE_SIZE);
    > + goto out;
    > + }
    > +
    > + if (spin_trylock(&to_mz->lru_lock)) {
    > + __mem_cgroup_remove_list(from_mz, pc);
    > + css_put(&from->css);
    > + res_counter_uncharge(&from->res, PAGE_SIZE);
    > + pc->mem_cgroup = to;
    > + css_get(&to->css);
    > + __mem_cgroup_add_list(to_mz, pc);
    > + ret = 0;
    > + spin_unlock(&to_mz->lru_lock);
    > + } else {
    > + res_counter_uncharge(&to->res, PAGE_SIZE);
    > + }
    > +out:
    > + unlock_page_cgroup(page);
    > +
    > + return ret;
    > +}
    > +
    > /*
    > * Charge the memory controller for page usage.
    > * Return
    > __mem_cgroup_remove_list(mz, pc);
    > spin_unlock_irqrestore(&mz->lru_lock, flags);
    >
    > page_assign_page_cgroup(page, NULL);
    > unlock_page_cgroup(page);
    >
    > - mem = pc->mem_cgroup;
    > +
    > res_counter_uncharge(&mem->res, PAGE_SIZE);
    > css_put(&mem->css);
    >
    >

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  12. Re: [PATCH 1/12] memcg avoid accounting special mappings not on LRU

    KAMEZAWA Hiroyuki wrote:
    > There are not-on-LRU pages which can be mapped and they are not worth
    > being accounted. (because we can't shrink them and would need dirty code to
    > handle the special case) We'd like to make use of the usual
    > objrmap/radix-tree protocol and don't want to account out-of-vm's control pages.
    >
    > When special_mapping_fault() is called, page->mapping tends to be NULL
    > and it's charged as an Anonymous page.
    > insert_page() also handles some special pages from drivers.
    >
    > This patch is for avoiding to account special pages.
    >


    Hmm... I am a little concerned that with these changes actual usage will be much
    more than what we report in memory.usage_in_bytes. Why not move them to the
    non-reclaimable LRU list as unevictable pages (once those patches go in, we can
    push this as well)? I suspect the total size of special pages is too small to
    affect anything — or are you seeing something very different?
    --
    Balbir
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  13. Re: [PATCH 0/12] memcg updates v5

    KAMEZAWA Hiroyuki wrote:
    > Hi, I updated the stack and reflected comments.
    > Against the latest mmotm. (rc7-mm1)
    >
    > Major changes from previous one is
    > - page_cgroup allocation/lookup manner is changed.
    > all FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG is supported.
    > - force_empty is totally rewritten. and a problem that "force_empty takes long time"
    > in previous version is fixed (I think...)
    > - reordered patches.
    > - first half are easy ones.
    > - second half are big ones.
    >
    > I'm still testing with full debug option. No problem found yet.
    > (I'm afraid of race condition which have not been caught yet.)
    >
    > [1/12] avoid accounting special mappings not on LRU. (fix)
    > [2/12] move charege() call to swapped-in page under lock_page() (clean up)
    > [3/12] make root cgroup to be unlimited. (change semantics.)
    > [4/12] make page->mapping NULL before calling uncharge (clean up)
    > [5/12] make page->flags to use atomic ops. (changes in infrastructure)
    > [6/12] optimize stat. (clean up)
    > [7/12] add support function for moving account. (new function)
    > [8/12] rewrite force_empty to use move_account. (change semantics.)
    > [9/12] allocate all page_cgroup at boot. (changes in infrastructure)
    > [10/12] free page_cgroup from LRU in lazy way (optimize)
    > [11/12] add page_cgroup to LRU in lazy way (optimize)
    > [12/12] fix race at charging swap (fix by new logic.)
    >
    > *Any* comment is welcome.


    Kame,

    I'm beginning to review and test the patches now. It would be really nice to split
    the development patches from the maintenance ones. I think the full patchset has
    too many things and is confusing to look at.


    --
    Balbir
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  14. Re: [PATCH 2/12] memcg move charege() call to swapped-in page under lock_page()

    KAMEZAWA Hiroyuki wrote:
    > While page-cache's charge/uncharge is done under page_lock(), swap-cache
    > isn't. (anonymous page is charged when it's newly allocated.)
    >
    > This patch moves do_swap_page()'s charge() call under the lock. This helps
    > us avoid charging an already-mapped page, i.e. unnecessary calls.
    >
    > Signed-off-by: KAMEZAWA Hiroyuki


    Seems reasonable to me

    Just one quick comment though, as a result of this change, mark_page_accessed is
    now called with PageLock held, I suspect you would want to move that call prior
    to lock_page().



    --
    Balbir
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  15. Re: [PATCH 3/12] memcg make root cgroup unlimited.

    KAMEZAWA Hiroyuki wrote:
    > Make root cgroup of memory resource controller to have no limit.
    >
    > By this, users cannot set limit to root group. This is for making root cgroup
    > as a kind of trash-can.
    >
    > For accounting pages which have no owner, which are created by force_empty,
    > we need some cgroup with no limit. A patch rewriting force_empty will
    > follow this one.
    >
    > Signed-off-by: KAMEZAWA Hiroyuki


    This is an ABI change (although not too many people might be using it, I wonder
    if we should add memory.features (a set of flags and let users enable them and
    provide good defaults), like sched features.

    --
    Balbir
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  16. Re: [PATCH 0/12] memcg updates v5

    On Fri, 26 Sep 2008 13:48:58 +0530
    Balbir Singh wrote:

    > KAMEZAWA Hiroyuki wrote:
    > > Hi, I updated the stack and reflected comments.
    > > Against the latest mmotm. (rc7-mm1)
    > >
    > > Major changes from previous one is
    > > - page_cgroup allocation/lookup manner is changed.
    > > all FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG is supported.
    > > - force_empty is totally rewritten. and a problem that "force_empty takes long time"
    > > in previous version is fixed (I think...)
    > > - reordered patches.
    > > - first half are easy ones.
    > > - second half are big ones.
    > >
    > > I'm still testing with full debug option. No problem found yet.
    > > (I'm afraid of race condition which have not been caught yet.)
    > >
    > > [1/12] avoid accounting special mappings not on LRU. (fix)
    > > [2/12] move charege() call to swapped-in page under lock_page() (clean up)
    > > [3/12] make root cgroup to be unlimited. (change semantics.)
    > > [4/12] make page->mapping NULL before calling uncharge (clean up)
    > > [5/12] make page->flags to use atomic ops. (changes in infrastructure)
    > > [6/12] optimize stat. (clean up)
    > > [7/12] add support function for moving account. (new function)
    > > [8/12] rewrite force_empty to use move_account. (change semantics.)
    > > [9/12] allocate all page_cgroup at boot. (changes in infrastructure)
    > > [10/12] free page_cgroup from LRU in lazy way (optimize)
    > > [11/12] add page_cgroup to LRU in lazy way (optimize)
    > > [12/12] fix race at charging swap (fix by new logic.)
    > >
    > > *Any* comment is welcome.

    >
    > Kame,
    >
    > I'm beginning to review test the patches now. It would be really nice to split
    > the development patches from the maintenance ones. I think the full patchset has
    > too many things and is confusing to look at.
    >

    I hope I can do that... but it may be difficult.
    If you give me your ack, 1, 2, 4 and 6 can be pushed at an early stage.

    Thanks,
    -Kame




    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  17. Re: [PATCH 1/12] memcg avoid accounting special mappings not on LRU

    On Fri, 26 Sep 2008 13:55:54 +0530
    Balbir Singh wrote:

    > KAMEZAWA Hiroyuki wrote:
    > > There are not-on-LRU pages which can be mapped and they are not worth
    > > being accounted. (because we can't shrink them and need dirty code to handle
    > > the special case) We'd like to make use of the usual objrmap/radix-tree protocol
    > > and don't want to account out-of-vm's control pages.
    > >
    > > When special_mapping_fault() is called, page->mapping tends to be NULL
    > > and it's charged as an Anonymous page.
    > > insert_page() also handles some special pages from drivers.
    > >
    > > This patch is for avoiding to account special pages.
    > >

    >
    > Hmm... I am a little concerned that with these changes actual usage will be much
    > more than what we report in memory.usage_in_bytes. Why not move them to the
    > non-reclaimable LRU list as unevictable pages (once those patches go in, we can
    > push this as well)?


    Because they are not on LRU ...i.e. !PageLRU(page)

    > I suspect the size of special pages is too short to affect
    > anything or are you seeing something very different?


    I don't want to put pages that never go onto the LRU onto memcgroup's LRU.

    Thanks,
    -Kame

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  18. Re: [PATCH 2/12] memcg move charege() call to swapped-in page under lock_page()

    On Fri, 26 Sep 2008 14:06:02 +0530
    Balbir Singh wrote:

    > KAMEZAWA Hiroyuki wrote:
    > > While page-cache's charge/uncharge is done under page_lock(), swap-cache
    > > isn't. (anonymous page is charged when it's newly allocated.)
    > >
    > > This patch moves do_swap_page()'s charge() call under lock. This helps
    > > us to avoid to charge already mapped one, unnecessary calls.
    > >
    > > Signed-off-by: KAMEZAWA Hiroyuki

    >
    > Seems reasonable to me
    >
    > Just one quick comment though, as a result of this change, mark_page_accessed is
    > now called with PageLock held, I suspect you would want to move that call prior
    > to lock_page().
    >

    Ok. I'll check it

    Thanks,
    -Kame
    >
    >
    > --
    > Balbir
    >


    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  19. Re: [PATCH 7/12] memcg add function to move account

    On Fri, 26 Sep 2008 16:30:50 +0900
    Daisuke Nishimura wrote:

    > > @@ -444,9 +445,14 @@ void mem_cgroup_move_lists(struct page *
    > >
    > > pc = page_get_page_cgroup(page);
    > > if (pc) {
    > > + mem = pc->mem_cgroup;
    > > mz = page_cgroup_zoneinfo(pc);
    > > spin_lock_irqsave(&mz->lru_lock, flags);
    > > - __mem_cgroup_move_lists(pc, lru);
    > > + /*
    > > + * check against the race with move_account.
    > > + */
    > > + if (likely(mem == pc->mem_cgroup))
    > > + __mem_cgroup_move_lists(pc, lru);

    >
    > (snip)
    >
    > > @@ -754,16 +824,24 @@ __mem_cgroup_uncharge_common(struct page
    > > if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    > > && ((PageCgroupCache(pc) || page_mapped(page))))
    > > goto unlock;
    > > -
    > > +retry:
    > > + mem = pc->mem_cgroup;
    > > mz = page_cgroup_zoneinfo(pc);
    > > spin_lock_irqsave(&mz->lru_lock, flags);
    > > + if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED &&
    > > + unlikely(mem != pc->mem_cgroup)) {
    > > + /* MAPPED account can be done without lock_page().
    > > + Check race with mem_cgroup_move_account() */
    > > + spin_unlock_irqrestore(&mz->lru_lock, flags);
    > > + goto retry;
    > > + }

    >
    > I'm sorry, but I've not been convinced yet why these checks are needed here.
    > (Those checks are removed by [9/12] anyway.)
    >
    > IIUC, pc->mem_cgroup is moved to another group only by mem_cgroup_move_account
    > under lock_page_cgroup( and mz->lru_lock).
    > And those two above(mem_cgroup_move_lists and __mem_cgroup_uncharge_common) sets
    > mem = pc->mem_cgroup under lock_page_cgroup, so I don't think those checks
    > (mem != pc->mem_cgroup) are needed.
    >

    you're right.

    Thanks,
    -Kame



    >
    > Thanks,
    > Daisuke Nishimura.
    >
    > > +/**
    > > + * mem_cgroup_move_account - move account of the page
    > > + * @page ... the target page of being moved.
    > > + * @pc ... page_cgroup of the page.
    > > + * @from ... mem_cgroup which the page is moved from.
    > > + * @to ... mem_cgroup which the page is moved to.
    > > + *
    > > + * The caller must confirm following.
    > > + * 1. disable irq.
    > > + * 2. lru_lock of old mem_cgroup should be held.
    > > + * 3. pc is guaranteed to be valid and on mem_cgroup's LRU.
    > > + *
    > > + * Because we cannot call try_to_free_page() here, the caller must guarantee
    > > + * this moving of charge never fails. (if charge fails, this call fails.)
    > > + * Currently this is called only against the root cgroup,
    > > + * which has no limitation of resources.
    > > + * Returns 0 at success, returns 1 at failure.
    > > + */
    > > +int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc,
    > > + struct mem_cgroup *from, struct mem_cgroup *to)
    > > +{
    > > + struct mem_cgroup_per_zone *from_mz, *to_mz;
    > > + int nid, zid;
    > > + int ret = 1;
    > > +
    > > + VM_BUG_ON(!irqs_disabled());
    > > +
    > > + nid = page_to_nid(page);
    > > + zid = page_zonenum(page);
    > > + from_mz = mem_cgroup_zoneinfo(from, nid, zid);
    > > + to_mz = mem_cgroup_zoneinfo(to, nid, zid);
    > > +
    > > + if (res_counter_charge(&to->res, PAGE_SIZE)) {
    > > + /* Now, we assume no_limit...no failure here. */
    > > + return ret;
    > > + }
    > > + if (!try_lock_page_cgroup(page)) {
    > > + res_counter_uncharge(&to->res, PAGE_SIZE);
    > > + return ret;
    > > + }
    > > +
    > > + if (page_get_page_cgroup(page) != pc) {
    > > + res_counter_uncharge(&to->res, PAGE_SIZE);
    > > + goto out;
    > > + }
    > > +
    > > + if (spin_trylock(&to_mz->lru_lock)) {
    > > + __mem_cgroup_remove_list(from_mz, pc);
    > > + css_put(&from->css);
    > > + res_counter_uncharge(&from->res, PAGE_SIZE);
    > > + pc->mem_cgroup = to;
    > > + css_get(&to->css);
    > > + __mem_cgroup_add_list(to_mz, pc);
    > > + ret = 0;
    > > + spin_unlock(&to_mz->lru_lock);
    > > + } else {
    > > + res_counter_uncharge(&to->res, PAGE_SIZE);
    > > + }
    > > +out:
    > > + unlock_page_cgroup(page);
    > > +
    > > + return ret;
    > > +}
    > > +
    > > /*
    > > * Charge the memory controller for page usage.
    > > * Return
    > > __mem_cgroup_remove_list(mz, pc);
    > > spin_unlock_irqrestore(&mz->lru_lock, flags);
    > >
    > > page_assign_page_cgroup(page, NULL);
    > > unlock_page_cgroup(page);
    > >
    > > - mem = pc->mem_cgroup;
    > > +
    > > res_counter_uncharge(&mem->res, PAGE_SIZE);
    > > css_put(&mem->css);
    > >
    > >

    >


    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  20. Re: [PATCH 3/12] memcg make root cgroup unlimited.

    On Fri, 26 Sep 2008 14:11:00 +0530
    Balbir Singh wrote:

    > KAMEZAWA Hiroyuki wrote:
    > > Make root cgroup of memory resource controller to have no limit.
    > >
    > > By this, users cannot set limit to root group. This is for making root cgroup
    > > as a kind of trash-can.
    > >
    > > For accounting pages which have no owner, which are created by force_empty,
    > > we need some cgroup with no_limit. A patch for rewriting force_empty
    > > will follow this one.
    > >
    > > Signed-off-by: KAMEZAWA Hiroyuki

    >
    > This is an ABI change (although not too many people might be using it, I wonder
    > if we should add memory.features (a set of flags and let users enable them and
    > provide good defaults), like sched features.
    >

    I think a "feature" flag would be too complicated at this stage.
    We'll add more features, and things are not settled yet.

    Hmm, if you don't like this, how about
    calling try_to_free_page() at force_empty() instead of move_account()?


    Thanks,
    -Kame

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

+ Reply to Thread
Page 2 of 4 FirstFirst 1 2 3 4 LastLast