[PATCH -mm 2/2] cgroup: use multibuf for tasks file - Kernel

This is a discussion on [PATCH -mm 2/2] cgroup: use multibuf for tasks file - Kernel ; when we open a really large cgroup for read, the open may fail because kmalloc() is not reliable for allocating a big buffer. The patch uses a multibuf for the tasks file; every buf is a page, except when we need only a ...

+ Reply to Thread
Results 1 to 6 of 6

Thread: [PATCH -mm 2/2] cgroup: use multibuf for tasks file

  1. [PATCH -mm 2/2] cgroup: use multibuf for tasks file


    when we open a really large cgroup for read, the open may fail
    because kmalloc() is not reliable for allocating a big buffer.

    the patch uses a multibuf for the tasks file; every buf is a page,
    except when we need only a small buffer.

    we use obj_sort() to sort these pids, so we don't need to map these
    pages to a contiguous memory region.

    Signed-off-by: Lai Jiangshan
    ---
    diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
    index bb298de..3d3c3bb 100644
    --- a/include/linux/cgroup.h
    +++ b/include/linux/cgroup.h
    @@ -141,8 +141,8 @@ struct cgroup {

    /* pids_mutex protects the fields below */
    struct rw_semaphore pids_mutex;
    - /* Array of process ids in the cgroup */
    - pid_t *tasks_pids;
    + /* Multi-array of process ids in the cgroup */
    + const pid_t *const *tasks_pids;
    /* How many files are using the current tasks_pids array */
    int pids_use_count;
    /* Length of the current tasks_pids array */
    diff --git a/kernel/cgroup.c b/kernel/cgroup.c
    index 996865a..f61b152 100644
    --- a/kernel/cgroup.c
    +++ b/kernel/cgroup.c
    @@ -2004,6 +2004,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
    *
    */

    +const static int pid_per_page = PAGE_SIZE / sizeof(pid_t);
    +
    /*
    * Load into 'pidarray' up to 'npids' of the tasks using cgroup
    * 'cgrp'. Return actual number of pids loaded. No need to
    @@ -2011,16 +2013,22 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
    * read section, so the css_set can't go away, and is
    * immutable after creation.
    */
    -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
    +static int pid_array_load(pid_t **pidarray, int npids, struct cgroup *cgrp)
    {
    - int n = 0;
    + int n = 0, i = 0, j = 0;
    struct cgroup_iter it;
    struct task_struct *tsk;
    cgroup_iter_start(cgrp, &it);
    while ((tsk = cgroup_iter_next(cgrp, &it))) {
    if (unlikely(n == npids))
    break;
    - pidarray[n++] = task_pid_vnr(tsk);
    + pidarray[i][j] = task_pid_vnr(tsk);
    + n++;
    + j++;
    + if (j == pid_per_page) {
    + i++;
    + j = 0;
    + }
    }
    cgroup_iter_end(cgrp, &it);
    return n;
    @@ -2079,11 +2087,27 @@ err:
    return ret;
    }

    -static int cmppid(const void *a, const void *b)
    +static inline pid_t getpidofmbuf(const pid_t *const *multibuf, int index)
    +{
    + return multibuf[index / pid_per_page][index % pid_per_page];
    +}
    +
    +static int cmppid(const void *c, size_t left, size_t right)
    {
    - return *(pid_t *)a - *(pid_t *)b;
    + return getpidofmbuf(c, left) - getpidofmbuf(c, right);
    }

    +static inline pid_t *getpidptr(pid_t *const *multibuf, int index)
    +{
    + return &multibuf[index / pid_per_page][index % pid_per_page];
    +}
    +
    +static void swappid(void *c, size_t left, size_t right)
    +{
    + pid_t rpid = getpidofmbuf(c, right);
    + *getpidptr(c, right) = getpidofmbuf(c, left);
    + *getpidptr(c, left) = rpid;
    +}

    /*
    * seq_file methods for the "tasks" file. The seq_file position is the
    @@ -2100,19 +2124,19 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
    * next pid to display, if any
    */
    struct cgroup *cgrp = s->private;
    - int index = 0, pid = *pos;
    - int *iter;
    + int index = 0;
    + pid_t pid = *pos;

    down_read(&cgrp->pids_mutex);
    if (pid) {
    int end = cgrp->pids_length;
    - int i;
    while (index < end) {
    int mid = (index + end) / 2;
    - if (cgrp->tasks_pids[mid] == pid) {
    + pid_t mpid = getpidofmbuf(cgrp->tasks_pids, mid);
    + if (mpid == pid) {
    index = mid;
    break;
    - } else if (cgrp->tasks_pids[mid] <= pid)
    + } else if (mpid <= pid)
    index = mid + 1;
    else
    end = mid;
    @@ -2122,9 +2146,8 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
    if (index >= cgrp->pids_length)
    return NULL;
    /* Update the abstract position to be the actual pid that we found */
    - iter = cgrp->tasks_pids + index;
    - *pos = *iter;
    - return iter;
    + *pos = getpidofmbuf(cgrp->tasks_pids, index);
    + return (void *)(index ^ -0x10000); /* we cannot return 0 */
    }

    static void cgroup_tasks_stop(struct seq_file *s, void *v)
    @@ -2136,25 +2159,26 @@ static void cgroup_tasks_stop(struct seq_file *s, void *v)
    static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
    {
    struct cgroup *cgrp = s->private;
    - int *p = v;
    - int *end = cgrp->tasks_pids + cgrp->pids_length;
    + int index = (int)v ^ -0x10000;

    /*
    * Advance to the next pid in the array. If this goes off the
    * end, we're done
    */
    - p++;
    - if (p >= end) {
    + index++;
    + if (index >= cgrp->pids_length) {
    return NULL;
    } else {
    - *pos = *p;
    - return p;
    + *pos = getpidofmbuf(cgrp->tasks_pids, index);
    + return (void *)(index ^ -0x10000); /* we cannot return 0 */
    }
    }

    static int cgroup_tasks_show(struct seq_file *s, void *v)
    {
    - return seq_printf(s, "%d\n", *(int *)v);
    + struct cgroup *cgrp = s->private;
    + int index = (int)v ^ -0x10000;
    + return seq_printf(s, "%d\n", getpidofmbuf(cgrp->tasks_pids, index));
    }

    static struct seq_operations cgroup_tasks_seq_operations = {
    @@ -2164,12 +2188,60 @@ static struct seq_operations cgroup_tasks_seq_operations = {
    .show = cgroup_tasks_show,
    };

    +static void *alloc_mutibufs(size_t npids)
    +{
    + int i, j, npages = (npids + pid_per_page - 1) / pid_per_page;
    + unsigned long *pages;
    +
    + if (npids <= pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
    + void *pids = kmalloc(sizeof(pid_t *) + sizeof(pid_t) * npids,
    + GFP_KERNEL);
    + if (!pids)
    + return NULL;
    + /* make single buf fake multi-buf */
    + *(void **)pids = pids + sizeof(pid_t *);
    + return pids;
    + }
    +
    + pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
    + if (!pages)
    + return NULL;
    +
    + for (i = 0; i < npages; i++) {
    + pages[i] = __get_free_page(GFP_KERNEL);
    + if (unlikely(!pages[i]))
    + goto depopulate;
    + }
    + return pages;
    +
    +depopulate:
    + for (j = 0; j < i; j++)
    + free_page(pages[j]);
    + kfree(pages);
    + return NULL;
    +}
    +
    +static void free_multibufs(void *ptr, size_t npids)
    +{
    + if (!ptr)
    + return;
    +
    + if (npids > pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
    + int i, npages = (npids + pid_per_page - 1) / pid_per_page;
    + unsigned long *pages = ptr;
    + for (i = 0; i < npages; i++)
    + free_page(pages[i]);
    + }
    +
    + kfree(ptr);
    +}
    +
    static void release_cgroup_pid_array(struct cgroup *cgrp)
    {
    down_write(&cgrp->pids_mutex);
    BUG_ON(!cgrp->pids_use_count);
    if (!--cgrp->pids_use_count) {
    - kfree(cgrp->tasks_pids);
    + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
    cgrp->tasks_pids = NULL;
    cgrp->pids_length = 0;
    }
    @@ -2202,7 +2274,7 @@ static struct file_operations cgroup_tasks_operations = {
    static int cgroup_tasks_open(struct inode *unused, struct file *file)
    {
    struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
    - pid_t *pidarray;
    + pid_t **pidarray;
    int npids;
    int retval;

    @@ -2217,19 +2289,19 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
    * show up until sometime later on.
    */
    npids = cgroup_task_count(cgrp);
    - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
    + pidarray = alloc_mutibufs(npids);
    if (!pidarray)
    return -ENOMEM;
    npids = pid_array_load(pidarray, npids, cgrp);
    - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
    + obj_sort(pidarray, 0, npids, cmppid, swappid);

    /*
    * Store the array in the cgroup, freeing the old
    * array if necessary
    */
    down_write(&cgrp->pids_mutex);
    - kfree(cgrp->tasks_pids);
    - cgrp->tasks_pids = pidarray;
    + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
    + cgrp->tasks_pids = (const pid_t *const *)pidarray;
    cgrp->pids_length = npids;
    cgrp->pids_use_count++;
    up_write(&cgrp->pids_mutex);


    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  2. Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file

    On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan wrote:
    >
    > when we open a really large cgroup for read, we may failed
    > for kmalloc() is not reliable for allocate a big buffer.


    This still depends on an answer to whether using plain vmalloc is too
    much overhead.

    Balbir pointed out to me that most cgroups are likely to be pretty
    small - so the solution of just doing a kmalloc() for 8K or less, and
    a vmalloc() for more than 8K (which is >2000 threads) will avoid the
    vmalloc overhead in almost all cases; the question is whether
    eliminating the remaining overhead is worth the extra complexity.

    Paul

    >
    > the patch use multibuf for tasks file, every buf is a page
    > apart from we need only a small buffer.
    >
    > we use obj_sort() to sort this pids, so we don't need to map this
    > pages to an continuous memory region.
    >
    > Signed-off-by: Lai Jiangshan
    > ---
    > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
    > index bb298de..3d3c3bb 100644
    > --- a/include/linux/cgroup.h
    > +++ b/include/linux/cgroup.h
    > @@ -141,8 +141,8 @@ struct cgroup {
    >
    > /* pids_mutex protects the fields below */
    > struct rw_semaphore pids_mutex;
    > - /* Array of process ids in the cgroup */
    > - pid_t *tasks_pids;
    > + /* Multi-array of process ids in the cgroup */
    > + const pid_t *const *tasks_pids;
    > /* How many files are using the current tasks_pids array */
    > int pids_use_count;
    > /* Length of the current tasks_pids array */
    > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
    > index 996865a..f61b152 100644
    > --- a/kernel/cgroup.c
    > +++ b/kernel/cgroup.c
    > @@ -2004,6 +2004,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
    > *
    > */
    >
    > +const static int pid_per_page = PAGE_SIZE / sizeof(pid_t);
    > +
    > /*
    > * Load into 'pidarray' up to 'npids' of the tasks using cgroup
    > * 'cgrp'. Return actual number of pids loaded. No need to
    > @@ -2011,16 +2013,22 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
    > * read section, so the css_set can't go away, and is
    > * immutable after creation.
    > */
    > -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
    > +static int pid_array_load(pid_t **pidarray, int npids, struct cgroup *cgrp)
    > {
    > - int n = 0;
    > + int n = 0, i = 0, j = 0;
    > struct cgroup_iter it;
    > struct task_struct *tsk;
    > cgroup_iter_start(cgrp, &it);
    > while ((tsk = cgroup_iter_next(cgrp, &it))) {
    > if (unlikely(n == npids))
    > break;
    > - pidarray[n++] = task_pid_vnr(tsk);
    > + pidarray[i][j] = task_pid_vnr(tsk);
    > + n++;
    > + j++;
    > + if (j == pid_per_page) {
    > + i++;
    > + j = 0;
    > + }
    > }
    > cgroup_iter_end(cgrp, &it);
    > return n;
    > @@ -2079,11 +2087,27 @@ err:
    > return ret;
    > }
    >
    > -static int cmppid(const void *a, const void *b)
    > +static inline pid_t getpidofmbuf(const pid_t *const *multibuf, int index)
    > +{
    > + return multibuf[index / pid_per_page][index % pid_per_page];
    > +}
    > +
    > +static int cmppid(const void *c, size_t left, size_t right)
    > {
    > - return *(pid_t *)a - *(pid_t *)b;
    > + return getpidofmbuf(c, left) - getpidofmbuf(c, right);
    > }
    >
    > +static inline pid_t *getpidptr(pid_t *const *multibuf, int index)
    > +{
    > + return &multibuf[index / pid_per_page][index % pid_per_page];
    > +}
    > +
    > +static void swappid(void *c, size_t left, size_t right)
    > +{
    > + pid_t rpid = getpidofmbuf(c, right);
    > + *getpidptr(c, right) = getpidofmbuf(c, left);
    > + *getpidptr(c, left) = rpid;
    > +}
    >
    > /*
    > * seq_file methods for the "tasks" file. The seq_file position is the
    > @@ -2100,19 +2124,19 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
    > * next pid to display, if any
    > */
    > struct cgroup *cgrp = s->private;
    > - int index = 0, pid = *pos;
    > - int *iter;
    > + int index = 0;
    > + pid_t pid = *pos;
    >
    > down_read(&cgrp->pids_mutex);
    > if (pid) {
    > int end = cgrp->pids_length;
    > - int i;
    > while (index < end) {
    > int mid = (index + end) / 2;
    > - if (cgrp->tasks_pids[mid] == pid) {
    > + pid_t mpid = getpidofmbuf(cgrp->tasks_pids, mid);
    > + if (mpid == pid) {
    > index = mid;
    > break;
    > - } else if (cgrp->tasks_pids[mid] <= pid)
    > + } else if (mpid <= pid)
    > index = mid + 1;
    > else
    > end = mid;
    > @@ -2122,9 +2146,8 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
    > if (index >= cgrp->pids_length)
    > return NULL;
    > /* Update the abstract position to be the actual pid that we found */
    > - iter = cgrp->tasks_pids + index;
    > - *pos = *iter;
    > - return iter;
    > + *pos = getpidofmbuf(cgrp->tasks_pids, index);
    > + return (void *)(index ^ -0x10000); /* we cannot return 0 */
    > }
    >
    > static void cgroup_tasks_stop(struct seq_file *s, void *v)
    > @@ -2136,25 +2159,26 @@ static void cgroup_tasks_stop(struct seq_file *s, void *v)
    > static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
    > {
    > struct cgroup *cgrp = s->private;
    > - int *p = v;
    > - int *end = cgrp->tasks_pids + cgrp->pids_length;
    > + int index = (int)v ^ -0x10000;
    >
    > /*
    > * Advance to the next pid in the array. If this goes off the
    > * end, we're done
    > */
    > - p++;
    > - if (p >= end) {
    > + index++;
    > + if (index >= cgrp->pids_length) {
    > return NULL;
    > } else {
    > - *pos = *p;
    > - return p;
    > + *pos = getpidofmbuf(cgrp->tasks_pids, index);
    > + return (void *)(index ^ -0x10000); /* we cannot return 0 */
    > }
    > }
    >
    > static int cgroup_tasks_show(struct seq_file *s, void *v)
    > {
    > - return seq_printf(s, "%d\n", *(int *)v);
    > + struct cgroup *cgrp = s->private;
    > + int index = (int)v ^ -0x10000;
    > + return seq_printf(s, "%d\n", getpidofmbuf(cgrp->tasks_pids, index));
    > }
    >
    > static struct seq_operations cgroup_tasks_seq_operations = {
    > @@ -2164,12 +2188,60 @@ static struct seq_operations cgroup_tasks_seq_operations = {
    > .show = cgroup_tasks_show,
    > };
    >
    > +static void *alloc_mutibufs(size_t npids)
    > +{
    > + int i, j, npages = (npids + pid_per_page - 1) / pid_per_page;
    > + unsigned long *pages;
    > +
    > + if (npids <= pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
    > + void *pids = kmalloc(sizeof(pid_t *) + sizeof(pid_t) * npids,
    > + GFP_KERNEL);
    > + if (!pids)
    > + return NULL;
    > + /* make single buf fake multi-buf */
    > + *(void **)pids = pids + sizeof(pid_t *);
    > + return pids;
    > + }
    > +
    > + pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
    > + if (!pages)
    > + return NULL;
    > +
    > + for (i = 0; i < npages; i++) {
    > + pages[i] = __get_free_page(GFP_KERNEL);
    > + if (unlikely(!pages[i]))
    > + goto depopulate;
    > + }
    > + return pages;
    > +
    > +depopulate:
    > + for (j = 0; j < i; j++)
    > + free_page(pages[j]);
    > + kfree(pages);
    > + return NULL;
    > +}
    > +
    > +static void free_multibufs(void *ptr, size_t npids)
    > +{
    > + if (!ptr)
    > + return;
    > +
    > + if (npids > pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
    > + int i, npages = (npids + pid_per_page - 1) / pid_per_page;
    > + unsigned long *pages = ptr;
    > + for (i = 0; i < npages; i++)
    > + free_page(pages[i]);
    > + }
    > +
    > + kfree(ptr);
    > +}
    > +
    > static void release_cgroup_pid_array(struct cgroup *cgrp)
    > {
    > down_write(&cgrp->pids_mutex);
    > BUG_ON(!cgrp->pids_use_count);
    > if (!--cgrp->pids_use_count) {
    > - kfree(cgrp->tasks_pids);
    > + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
    > cgrp->tasks_pids = NULL;
    > cgrp->pids_length = 0;
    > }
    > @@ -2202,7 +2274,7 @@ static struct file_operations cgroup_tasks_operations = {
    > static int cgroup_tasks_open(struct inode *unused, struct file *file)
    > {
    > struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
    > - pid_t *pidarray;
    > + pid_t **pidarray;
    > int npids;
    > int retval;
    >
    > @@ -2217,19 +2289,19 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
    > * show up until sometime later on.
    > */
    > npids = cgroup_task_count(cgrp);
    > - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
    > + pidarray = alloc_mutibufs(npids);
    > if (!pidarray)
    > return -ENOMEM;
    > npids = pid_array_load(pidarray, npids, cgrp);
    > - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
    > + obj_sort(pidarray, 0, npids, cmppid, swappid);
    >
    > /*
    > * Store the array in the cgroup, freeing the old
    > * array if necessary
    > */
    > down_write(&cgrp->pids_mutex);
    > - kfree(cgrp->tasks_pids);
    > - cgrp->tasks_pids = pidarray;
    > + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
    > + cgrp->tasks_pids = (const pid_t *const *)pidarray;
    > cgrp->pids_length = npids;
    > cgrp->pids_use_count++;
    > up_write(&cgrp->pids_mutex);
    >
    >
    >

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  3. Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file

    Paul Menage wrote:
    > On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan wrote:
    >> when we open a really large cgroup for read, we may failed
    >> for kmalloc() is not reliable for allocate a big buffer.

    >
    > This still depends on an answer to whether using plain vmalloc is too
    > much overhead.
    >
    > Balbir pointed out to me that most cgroups are likely to be pretty
    > small - so the solution of just doing a kmalloc() for 8K or less, and
    > a vmalloc() for more than 8K (which is >2000 threads) will avoid the
    > vmalloc overhead in almost all cases; the question is whether
    > eliminating the remaining overhead is worth the extra complexity.
    >


    I think opening cgroup.tasks for read is not a critical path,
    so using plain vmalloc (or even higher-overhead functions) is worthwhile.

    >
    >> the patch use multibuf for tasks file, every buf is a page
    >> apart from we need only a small buffer.
    >>
    >> we use obj_sort() to sort this pids, so we don't need to map this
    >> pages to an continuous memory region.
    >>
    >> Signed-off-by: Lai Jiangshan
    >> ---
    >> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
    >> index bb298de..3d3c3bb 100644
    >> --- a/include/linux/cgroup.h
    >> +++ b/include/linux/cgroup.h
    >> @@ -141,8 +141,8 @@ struct cgroup {
    >>
    >> /* pids_mutex protects the fields below */
    >> struct rw_semaphore pids_mutex;
    >> - /* Array of process ids in the cgroup */
    >> - pid_t *tasks_pids;
    >> + /* Multi-array of process ids in the cgroup */
    >> + const pid_t *const *tasks_pids;
    >> /* How many files are using the current tasks_pids array */
    >> int pids_use_count;
    >> /* Length of the current tasks_pids array */
    >> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
    >> index 996865a..f61b152 100644
    >> --- a/kernel/cgroup.c
    >> +++ b/kernel/cgroup.c
    >> @@ -2004,6 +2004,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
    >> *
    >> */
    >>
    >> +const static int pid_per_page = PAGE_SIZE / sizeof(pid_t);
    >> +
    >> /*
    >> * Load into 'pidarray' up to 'npids' of the tasks using cgroup
    >> * 'cgrp'. Return actual number of pids loaded. No need to
    >> @@ -2011,16 +2013,22 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
    >> * read section, so the css_set can't go away, and is
    >> * immutable after creation.
    >> */
    >> -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
    >> +static int pid_array_load(pid_t **pidarray, int npids, struct cgroup *cgrp)
    >> {
    >> - int n = 0;
    >> + int n = 0, i = 0, j = 0;
    >> struct cgroup_iter it;
    >> struct task_struct *tsk;
    >> cgroup_iter_start(cgrp, &it);
    >> while ((tsk = cgroup_iter_next(cgrp, &it))) {
    >> if (unlikely(n == npids))
    >> break;
    >> - pidarray[n++] = task_pid_vnr(tsk);
    >> + pidarray[i][j] = task_pid_vnr(tsk);
    >> + n++;
    >> + j++;
    >> + if (j == pid_per_page) {
    >> + i++;
    >> + j = 0;
    >> + }
    >> }
    >> cgroup_iter_end(cgrp, &it);
    >> return n;
    >> @@ -2079,11 +2087,27 @@ err:
    >> return ret;
    >> }
    >>
    >> -static int cmppid(const void *a, const void *b)
    >> +static inline pid_t getpidofmbuf(const pid_t *const *multibuf, int index)
    >> +{
    >> + return multibuf[index / pid_per_page][index % pid_per_page];
    >> +}
    >> +
    >> +static int cmppid(const void *c, size_t left, size_t right)
    >> {
    >> - return *(pid_t *)a - *(pid_t *)b;
    >> + return getpidofmbuf(c, left) - getpidofmbuf(c, right);
    >> }
    >>
    >> +static inline pid_t *getpidptr(pid_t *const *multibuf, int index)
    >> +{
    >> + return &multibuf[index / pid_per_page][index % pid_per_page];
    >> +}
    >> +
    >> +static void swappid(void *c, size_t left, size_t right)
    >> +{
    >> + pid_t rpid = getpidofmbuf(c, right);
    >> + *getpidptr(c, right) = getpidofmbuf(c, left);
    >> + *getpidptr(c, left) = rpid;
    >> +}
    >>
    >> /*
    >> * seq_file methods for the "tasks" file. The seq_file position is the
    >> @@ -2100,19 +2124,19 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
    >> * next pid to display, if any
    >> */
    >> struct cgroup *cgrp = s->private;
    >> - int index = 0, pid = *pos;
    >> - int *iter;
    >> + int index = 0;
    >> + pid_t pid = *pos;
    >>
    >> down_read(&cgrp->pids_mutex);
    >> if (pid) {
    >> int end = cgrp->pids_length;
    >> - int i;
    >> while (index < end) {
    >> int mid = (index + end) / 2;
    >> - if (cgrp->tasks_pids[mid] == pid) {
    >> + pid_t mpid = getpidofmbuf(cgrp->tasks_pids, mid);
    >> + if (mpid == pid) {
    >> index = mid;
    >> break;
    >> - } else if (cgrp->tasks_pids[mid] <= pid)
    >> + } else if (mpid <= pid)
    >> index = mid + 1;
    >> else
    >> end = mid;
    >> @@ -2122,9 +2146,8 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
    >> if (index >= cgrp->pids_length)
    >> return NULL;
    >> /* Update the abstract position to be the actual pid that we found */
    >> - iter = cgrp->tasks_pids + index;
    >> - *pos = *iter;
    >> - return iter;
    >> + *pos = getpidofmbuf(cgrp->tasks_pids, index);
    >> + return (void *)(index ^ -0x10000); /* we cannot return 0 */
    >> }
    >>
    >> static void cgroup_tasks_stop(struct seq_file *s, void *v)
    >> @@ -2136,25 +2159,26 @@ static void cgroup_tasks_stop(struct seq_file *s, void *v)
    >> static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
    >> {
    >> struct cgroup *cgrp = s->private;
    >> - int *p = v;
    >> - int *end = cgrp->tasks_pids + cgrp->pids_length;
    >> + int index = (int)v ^ -0x10000;
    >>
    >> /*
    >> * Advance to the next pid in the array. If this goes off the
    >> * end, we're done
    >> */
    >> - p++;
    >> - if (p >= end) {
    >> + index++;
    >> + if (index >= cgrp->pids_length) {
    >> return NULL;
    >> } else {
    >> - *pos = *p;
    >> - return p;
    >> + *pos = getpidofmbuf(cgrp->tasks_pids, index);
    >> + return (void *)(index ^ -0x10000); /* we cannot return 0 */
    >> }
    >> }
    >>
    >> static int cgroup_tasks_show(struct seq_file *s, void *v)
    >> {
    >> - return seq_printf(s, "%d\n", *(int *)v);
    >> + struct cgroup *cgrp = s->private;
    >> + int index = (int)v ^ -0x10000;
    >> + return seq_printf(s, "%d\n", getpidofmbuf(cgrp->tasks_pids, index));
    >> }
    >>
    >> static struct seq_operations cgroup_tasks_seq_operations = {
    >> @@ -2164,12 +2188,60 @@ static struct seq_operations cgroup_tasks_seq_operations = {
    >> .show = cgroup_tasks_show,
    >> };
    >>
    >> +static void *alloc_mutibufs(size_t npids)
    >> +{
    >> + int i, j, npages = (npids + pid_per_page - 1) / pid_per_page;
    >> + unsigned long *pages;
    >> +
    >> + if (npids <= pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
    >> + void *pids = kmalloc(sizeof(pid_t *) + sizeof(pid_t) * npids,
    >> + GFP_KERNEL);
    >> + if (!pids)
    >> + return NULL;
    >> + /* make single buf fake multi-buf */
    >> + *(void **)pids = pids + sizeof(pid_t *);
    >> + return pids;
    >> + }
    >> +
    >> + pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
    >> + if (!pages)
    >> + return NULL;
    >> +
    >> + for (i = 0; i < npages; i++) {
    >> + pages[i] = __get_free_page(GFP_KERNEL);
    >> + if (unlikely(!pages[i]))
    >> + goto depopulate;
    >> + }
    >> + return pages;
    >> +
    >> +depopulate:
    >> + for (j = 0; j < i; j++)
    >> + free_page(pages[j]);
    >> + kfree(pages);
    >> + return NULL;
    >> +}
    >> +
    >> +static void free_multibufs(void *ptr, size_t npids)
    >> +{
    >> + if (!ptr)
    >> + return;
    >> +
    >> + if (npids > pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
    >> + int i, npages = (npids + pid_per_page - 1) / pid_per_page;
    >> + unsigned long *pages = ptr;
    >> + for (i = 0; i < npages; i++)
    >> + free_page(pages[i]);
    >> + }
    >> +
    >> + kfree(ptr);
    >> +}
    >> +
    >> static void release_cgroup_pid_array(struct cgroup *cgrp)
    >> {
    >> down_write(&cgrp->pids_mutex);
    >> BUG_ON(!cgrp->pids_use_count);
    >> if (!--cgrp->pids_use_count) {
    >> - kfree(cgrp->tasks_pids);
    >> + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
    >> cgrp->tasks_pids = NULL;
    >> cgrp->pids_length = 0;
    >> }
    >> @@ -2202,7 +2274,7 @@ static struct file_operations cgroup_tasks_operations = {
    >> static int cgroup_tasks_open(struct inode *unused, struct file *file)
    >> {
    >> struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
    >> - pid_t *pidarray;
    >> + pid_t **pidarray;
    >> int npids;
    >> int retval;
    >>
    >> @@ -2217,19 +2289,19 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
    >> * show up until sometime later on.
    >> */
    >> npids = cgroup_task_count(cgrp);
    >> - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
    >> + pidarray = alloc_mutibufs(npids);
    >> if (!pidarray)
    >> return -ENOMEM;
    >> npids = pid_array_load(pidarray, npids, cgrp);
    >> - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
    >> + obj_sort(pidarray, 0, npids, cmppid, swappid);
    >>
    >> /*
    >> * Store the array in the cgroup, freeing the old
    >> * array if necessary
    >> */
    >> down_write(&cgrp->pids_mutex);
    >> - kfree(cgrp->tasks_pids);
    >> - cgrp->tasks_pids = pidarray;
    >> + free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
    >> + cgrp->tasks_pids = (const pid_t *const *)pidarray;
    >> cgrp->pids_length = npids;
    >> cgrp->pids_use_count++;
    >> up_write(&cgrp->pids_mutex);
    >>
    >>
    >>

    >
    >
    >



    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  4. Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file

    Lai Jiangshan wrote:
    > Paul Menage wrote:
    >> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan wrote:
    >>> when we open a really large cgroup for read, we may failed
    >>> for kmalloc() is not reliable for allocate a big buffer.

    >> This still depends on an answer to whether using plain vmalloc is too
    >> much overhead.
    >>
    >> Balbir pointed out to me that most cgroups are likely to be pretty
    >> small - so the solution of just doing a kmalloc() for 8K or less, and
    >> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
    >> vmalloc overhead in almost all cases; the question is whether
    >> eliminating the remaining overhead is worth the extra complexity.
    >>

    >
    > I think open cgroup.tasks to read is not a critical path.
    > so using plain vmalloc(even more overhead functions) is worth.
    >


    This patch not only adds runtime overhead, but also makes the code much more
    complex, so the code is harder to read and harder to maintain, and the object size
    is increased, which means an increased memory footprint.

    And is there any reason not to use plain vmalloc? Don't bloat the kernel without
    good reasons IMO...
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  5. Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file

    Li Zefan wrote:
    > Lai Jiangshan wrote:
    >> Paul Menage wrote:
    >>> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan wrote:
    >>>> when we open a really large cgroup for read, we may failed
    >>>> for kmalloc() is not reliable for allocate a big buffer.
    >>> This still depends on an answer to whether using plain vmalloc is too
    >>> much overhead.
    >>>
    >>> Balbir pointed out to me that most cgroups are likely to be pretty
    >>> small - so the solution of just doing a kmalloc() for 8K or less, and
    >>> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
    >>> vmalloc overhead in almost all cases; the question is whether
    >>> eliminating the remaining overhead is worth the extra complexity.
    >>>

    >> I think open cgroup.tasks to read is not a critical path.
    >> so using plain vmalloc(even more overhead functions) is worth.
    >>

    >
    > This patch does not only add runtime overhead, but also make code much more
    > complex, so the code is harder to read and harder to maintain, and object size
    > is increased, which means increased memory footprint.
    >
    > And is there any reason not using plain vmalloc? Don't bloat the kernel without
    > good reasons IMO...
    >


    I said that vmalloc is worthwhile.
    vmalloc was the first choice in my opinion. ^_^

    >
    >



    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  6. Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file

    > Li Zefan wrote:
    > > Lai Jiangshan wrote:
    > >> Paul Menage wrote:
    > >>> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan wrote:
    > >>>> when we open a really large cgroup for read, we may failed
    > >>>> for kmalloc() is not reliable for allocate a big buffer.
    > >>> This still depends on an answer to whether using plain vmalloc is too
    > >>> much overhead.
    > >>>
    > >>> Balbir pointed out to me that most cgroups are likely to be pretty
    > >>> small - so the solution of just doing a kmalloc() for 8K or less, and
    > >>> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
    > >>> vmalloc overhead in almost all cases; the question is whether
    > >>> eliminating the remaining overhead is worth the extra complexity.
    > >>>
    > >> I think open cgroup.tasks to read is not a critical path.
    > >> so using plain vmalloc(even more overhead functions) is worth.
    > >>

    > >
    > > This patch does not only add runtime overhead, but also make code much more
    > > complex, so the code is harder to read and harder to maintain, and object size
    > > is increased, which means increased memory footprint.
    > >
    > > And is there any reason not using plain vmalloc? Don't bloat the kernel without
    > > good reasons IMO...
    > >

    >
    > I said that vmalloc is worth.
    > vmalloc was the fist choice of my opinion. ^_^


    I agreed with Paul Menage's opinion because ..

    - plain vmalloc causes unnecessary overhead.
    - vmalloc shouldn't be used for small allocations,
    because virtual address space is a valuable resource on 32-bit machines.



    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

+ Reply to Thread