Thread: [PATCH] sched: Include group statistics in /proc/sched_debug

  1. [PATCH] sched: Include group statistics in /proc/sched_debug

    Include group statistics in /proc/sched_debug.

    Since the statistics of a group entity aren't exported directly from the
    kernel, it is difficult to obtain some of the group statistics. For
    example, the current method of obtaining the exec time of a group entity
    is not always accurate: one has to read the exec times of all the tasks
    (/proc/<pid>/sched) in the group and add them up. This method fails (or
    becomes difficult) if we want to collect stats of a group over a duration
    during which tasks get created and terminated.
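
    For concreteness, the summation just described might look roughly like
    the sketch below. It assumes the cpu cgroup hierarchy is mounted at
    /cgroup (a hypothetical mount point) and that /proc/<pid>/sched exposes
    an "se.sum_exec_runtime" line; tasks that exit between reads are simply
    missed, which is exactly the problem described above.

    #include <stdio.h>

    /* Read one task's se.sum_exec_runtime (in ms) from /proc/<pid>/sched. */
    static double task_exec_ms(int pid)
    {
            char path[64], line[256];
            double ms = 0.0;
            FILE *f;

            snprintf(path, sizeof(path), "/proc/%d/sched", pid);
            f = fopen(path, "r");
            if (!f)
                    return 0.0;     /* task may already have exited */
            while (fgets(line, sizeof(line), f))
                    if (sscanf(line, " se.sum_exec_runtime : %lf", &ms) == 1)
                            break;
            fclose(f);
            return ms;
    }

    int main(void)
    {
            /* assumed cgroup mount point and group path */
            FILE *tasks = fopen("/cgroup/3/a/1/tasks", "r");
            double total = 0.0;
            int pid;

            if (!tasks)
                    return 1;
            while (fscanf(tasks, "%d", &pid) == 1)
                    total += task_exec_ms(pid);
            fclose(tasks);
            printf("group exec time: %.6f ms\n", total);
            return 0;
    }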

    This patch makes it easier to obtain group stats by directly including
    them in /proc/sched_debug. Stats like group exec time would help user
    programs (like LTP) to accurately measure the group fairness.

    Signed-off-by: Bharata B Rao
    CC: Peter Zijlstra
    CC: Ingo Molnar
    CC: Srivatsa Vaddagiri
    ---
    kernel/sched_debug.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-
    1 file changed, 66 insertions(+), 1 deletion(-)

    --- a/kernel/sched_debug.c
    +++ b/kernel/sched_debug.c
    @@ -53,6 +53,70 @@ static unsigned long nsec_low(unsigned l

     #define SPLIT_NS(x) nsec_high(x), nsec_low(x)

    +#ifdef CONFIG_GROUP_SCHED
    +
    +static void print_group_stats(struct seq_file *m, struct task_group *tg,
    +                int cpu)
    +{
    +        struct sched_entity *se = tg->se[cpu];
    +
    +#if defined(CONFIG_CGROUP_SCHED)
    +        char path[128] = "";
    +        struct cgroup *cgroup = NULL;
    +
    +        if (tg)
    +                cgroup = tg->css.cgroup;
    +
    +        if (cgroup)
    +                cgroup_path(cgroup, path, sizeof(path));
    +
    +        SEQ_printf(m, "\ngroup[%d]:%s\n", cpu, path);
    +#else
    +        SEQ_printf(m, "\ngroup[%d]:\n", cpu);
    +#endif
    +#define P(F) \
    +        SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
    +#define PN(F) \
    +        SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
    +
    +        PN(se->exec_start);
    +        PN(se->vruntime);
    +        PN(se->sum_exec_runtime);
    +#ifdef CONFIG_SCHEDSTATS
    +        PN(se->wait_start);
    +        PN(se->sleep_start);
    +        PN(se->block_start);
    +        PN(se->sleep_max);
    +        PN(se->block_max);
    +        PN(se->exec_max);
    +        PN(se->slice_max);
    +        PN(se->wait_max);
    +        PN(se->wait_sum);
    +        P(se->wait_count);
    +#endif
    +        P(se->load.weight);
    +#undef PN
    +#undef P
    +}
    +
    +static void print_tg_stats(struct seq_file *m, int cpu)
    +{
    +        struct task_group *tg;
    +
    +        rcu_read_lock();
    +        list_for_each_entry(tg, &task_groups, list) {
    +                if (tg->se[cpu])
    +                        print_group_stats(m, tg, cpu);
    +        }
    +        rcu_read_unlock();
    +}
    +#else
    +static void print_tg_stats(struct seq_file *m, int cpu)
    +{
    +        return;
    +}
    +#endif
    +
     static void
     print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
     {
    @@ -264,6 +328,7 @@ static void print_cpu(struct seq_file *m
             print_rt_stats(m, cpu);

             print_rq(m, rq, cpu);
    +        print_tg_stats(m, cpu);
     }

     static int sched_debug_show(struct seq_file *m, void *v)
    @@ -271,7 +336,7 @@ static int sched_debug_show(struct seq_f
             u64 now = ktime_to_ns(ktime_get());
             int cpu;

    -        SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
    +        SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
                     init_utsname()->release,
                     (int)strcspn(init_utsname()->version, " "),
                     init_utsname()->version);
    --

    An extract of /proc/sched_debug showing group stats obtained from
    this patch:

    group[1]:/3/a/1
    .se->exec_start : 256484.781577
    .se->vruntime : 12868.176994
    .se->sum_exec_runtime : 3243.669709
    .se->wait_start : 0.000000
    .se->sleep_start : 0.000000
    .se->block_start : 0.000000
    .se->sleep_max : 0.000000
    .se->block_max : 0.000000
    .se->exec_max : 1.002095
    .se->slice_max : 13.997073
    .se->wait_max : 67.978322
    .se->wait_sum : 7141.676906
    .se->wait_count : 203
    .se->load.weight : 255

    group[1]:/3/a
    .se->exec_start : 256484.781577
    .se->vruntime : 18649.700858
    .se->sum_exec_runtime : 3302.827997
    .se->wait_start : 0.000000
    .se->sleep_start : 0.000000
    .se->block_start : 0.000000
    .se->sleep_max : 0.000000
    .se->block_max : 0.000000
    .se->exec_max : 1.009411
    .se->slice_max : 7.015594
    .se->wait_max : 67.978322
    .se->wait_sum : 7082.536617
    .se->wait_count : 214
    .se->load.weight : 255

    group[1]:/3
    .se->exec_start : 256484.781577
    .se->vruntime : 224861.311011
    .se->sum_exec_runtime : 3302.827997
    .se->wait_start : 0.000000
    .se->sleep_start : 0.000000
    .se->block_start : 0.000000
    .se->sleep_max : 0.000000
    .se->block_max : 0.000000
    .se->exec_max : 1.002095
    .se->slice_max : 19.007546
    .se->wait_max : 67.978322
    .se->wait_sum : 7082.536617
    .se->wait_count : 214
    .se->load.weight : 24
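
    A user program could consume these sections with a small parser along
    the lines of the sketch below. It is only a sketch against the format
    shown above (group[<cpu>]:<path> headers followed by ".se->..." lines)
    and sums se->sum_exec_runtime for one group path across all CPUs:

    #include <stdio.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
            const char *want = argc > 1 ? argv[1] : "/3/a/1";
            char line[256], path[128];
            int in_group = 0;
            double total = 0.0, ms;
            FILE *f = fopen("/proc/sched_debug", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    /* a group[] header starts a new section; track whether
                     * it is the group we were asked about */
                    if (sscanf(line, "group[%*d]:%127s", path) == 1) {
                            in_group = !strcmp(path, want);
                            continue;
                    }
                    if (in_group &&
                        sscanf(line, " .se->sum_exec_runtime : %lf", &ms) == 1)
                            total += ms;    /* one contribution per CPU */
            }
            fclose(f);
            printf("%s: %.6f ms total exec time\n", want, total);
            return 0;
    }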

    Regards,
    Bharata.

  2. Re: [PATCH] sched: Include group statistics in /proc/sched_debug

    On Mon, 2008-11-10 at 14:53 +0530, Bharata B Rao wrote:
    > Include group statistics in /proc/sched_debug.
    >
    > Since the statistics of a group entity aren't exported directly from the
    > kernel, it is difficult to obtain some of the group statistics. For
    > example, the current method of obtaining the exec time of a group entity
    > is not always accurate: one has to read the exec times of all the tasks
    > (/proc/<pid>/sched) in the group and add them up. This method fails (or
    > becomes difficult) if we want to collect stats of a group over a duration
    > during which tasks get created and terminated.
    >
    > This patch makes it easier to obtain group stats by directly including
    > them in /proc/sched_debug. Stats like group exec time would help user
    > programs (like LTP) to accurately measure the group fairness.
    >
    > Signed-off-by: Bharata B Rao
    > CC: Peter Zijlstra
    > CC: Ingo Molnar
    > CC: Srivatsa Vaddagiri
    > ---




    > An extract of /proc/sched_debug showing group stats obtained from
    > this patch:
    >
    > group[1]:/3/a/1
    > .se->exec_start : 256484.781577
    > .se->vruntime : 12868.176994
    > .se->sum_exec_runtime : 3243.669709
    > .se->wait_start : 0.000000
    > .se->sleep_start : 0.000000
    > .se->block_start : 0.000000
    > .se->sleep_max : 0.000000
    > .se->block_max : 0.000000
    > .se->exec_max : 1.002095
    > .se->slice_max : 13.997073
    > .se->wait_max : 67.978322
    > .se->wait_sum : 7141.676906
    > .se->wait_count : 203
    > .se->load.weight : 255


    Why not simply add them to the cfs_rq[n]:/path sections we already have?


  3. [PATCH] sched: cleanup debug info

    Subject: sched: cleanup debug info
    From: Peter Zijlstra
    Date: Thu Oct 30 23:57:11 CET 2008

    While looking over the sched_debug code I noticed that we printed the rq
    schedstats for every cfs_rq; amend this.

    Also change nr_spread_over into an int, and fix a little buglet in
    min_vruntime printing.

    Signed-off-by: Peter Zijlstra
    ---
    kernel/sched.c | 2 +-
    kernel/sched_debug.c | 41 +++++++++++++++++++++--------------------
    2 files changed, 22 insertions(+), 21 deletions(-)

    Index: linux-2.6/kernel/sched.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched.c
    +++ linux-2.6/kernel/sched.c
    @@ -399,7 +399,7 @@ struct cfs_rq {
              */
             struct sched_entity *curr, *next, *last;

    -        unsigned long nr_spread_over;
    +        unsigned int nr_spread_over;

     #ifdef CONFIG_FAIR_GROUP_SCHED
             struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
    Index: linux-2.6/kernel/sched_debug.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched_debug.c
    +++ linux-2.6/kernel/sched_debug.c
    @@ -139,7 +139,7 @@ void print_cfs_rq(struct seq_file *m, in
             last = __pick_last_entity(cfs_rq);
             if (last)
                     max_vruntime = last->vruntime;
    -        min_vruntime = rq->cfs.min_vruntime;
    +        min_vruntime = cfs_rq->min_vruntime;
             rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
             spin_unlock_irqrestore(&rq->lock, flags);
             SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
    @@ -156,26 +156,8 @@ void print_cfs_rq(struct seq_file *m, in
                     SPLIT_NS(spread0));
             SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
             SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
    -#ifdef CONFIG_SCHEDSTATS
    -#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
    -
    -        P(yld_exp_empty);
    -        P(yld_act_empty);
    -        P(yld_both_empty);
    -        P(yld_count);
    -
    -        P(sched_switch);
    -        P(sched_count);
    -        P(sched_goidle);
    -
    -        P(ttwu_count);
    -        P(ttwu_local);
    -
    -        P(bkl_count);
    -
    -#undef P
    -#endif
    -        SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
    +        SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
                     cfs_rq->nr_spread_over);
     #ifdef CONFIG_FAIR_GROUP_SCHED
     #ifdef CONFIG_SMP
    @@ -250,6 +232,25 @@ static void print_cpu(struct seq_file *m
     #undef P
     #undef PN

    +#ifdef CONFIG_SCHEDSTATS
    +#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
    +
    +        P(yld_exp_empty);
    +        P(yld_act_empty);
    +        P(yld_both_empty);
    +        P(yld_count);
    +
    +        P(sched_switch);
    +        P(sched_count);
    +        P(sched_goidle);
    +
    +        P(ttwu_count);
    +        P(ttwu_local);
    +
    +        P(bkl_count);
    +
    +#undef P
    +#endif
             print_cfs_stats(m, cpu);
             print_rt_stats(m, cpu);




  4. Re: [PATCH] sched: cleanup debug info


    * Peter Zijlstra wrote:

    > Subject: sched: cleanup debug info
    > From: Peter Zijlstra
    > Date: Thu Oct 30 23:57:11 CET 2008
    >
    > While looking over the sched_debug code I noticed that we printed the rq
    > schedstats for every cfs_rq; amend this.
    >
    > Also change nr_spread_over into an int, and fix a little buglet in
    > min_vruntime printing.
    >
    > Signed-off-by: Peter Zijlstra
    > ---
    > kernel/sched.c | 2 +-
    > kernel/sched_debug.c | 41 +++++++++++++++++++++--------------------
    > 2 files changed, 22 insertions(+), 21 deletions(-)


    applied to tip/sched/urgent, thanks Peter!

    Ingo

  5. Re: [PATCH -v2] sched: Include group statistics in /proc/sched_debug

    On Mon, Nov 10, 2008 at 10:30:07AM +0100, Peter Zijlstra wrote:
    > On Mon, 2008-11-10 at 14:53 +0530, Bharata B Rao wrote:
    >
    > > An extract of /proc/sched_debug showing group stats obtained from
    > > this patch:
    > >
    > > group[1]:/3/a/1
    > > .se->exec_start : 256484.781577
    > > .se->vruntime : 12868.176994
    > > .se->sum_exec_runtime : 3243.669709
    > > .se->wait_start : 0.000000
    > > .se->sleep_start : 0.000000
    > > .se->block_start : 0.000000
    > > .se->sleep_max : 0.000000
    > > .se->block_max : 0.000000
    > > .se->exec_max : 1.002095
    > > .se->slice_max : 13.997073
    > > .se->wait_max : 67.978322
    > > .se->wait_sum : 7141.676906
    > > .se->wait_count : 203
    > > .se->load.weight : 255

    >
    > Why not simply add them to the cfs_rq[n]:/path sections we already have?


    Makes sense. Here is the updated patch which puts group statistics
    under cfs_rq[n]:/path section as you suggest.

    Include group statistics in /proc/sched_debug.

    Since the statistics of a group entity aren't exported directly from the
    kernel, it is difficult to obtain some of the group statistics. For
    example, the current method of obtaining the exec time of a group entity
    is not always accurate: one has to read the exec times of all the tasks
    (/proc/<pid>/sched) in the group and add them up. This method fails (or
    becomes difficult) if we want to collect stats of a group over a duration
    during which tasks get created and terminated.

    This patch makes it easier to obtain group stats by directly including
    them in /proc/sched_debug. Stats like group exec time would help user
    programs (like LTP) to accurately measure the group fairness.

    Signed-off-by: Bharata B Rao
    CC: Peter Zijlstra
    CC: Ingo Molnar
    CC: Srivatsa Vaddagiri
    ---
    kernel/sched_debug.c | 37 ++++++++++++++++++++++++++++++++++++-
    1 file changed, 36 insertions(+), 1 deletion(-)

    --- a/kernel/sched_debug.c
    +++ b/kernel/sched_debug.c
    @@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned l

     #define SPLIT_NS(x) nsec_high(x), nsec_low(x)

    +#ifdef CONFIG_FAIR_GROUP_SCHED
    +static void print_cfs_group_stats(struct seq_file *m, int cpu,
    +                struct task_group *tg)
    +{
    +        struct sched_entity *se = tg->se[cpu];
    +        if (!se)
    +                return;
    +
    +#define P(F) \
    +        SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
    +#define PN(F) \
    +        SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
    +
    +        PN(se->exec_start);
    +        PN(se->vruntime);
    +        PN(se->sum_exec_runtime);
    +#ifdef CONFIG_SCHEDSTATS
    +        PN(se->wait_start);
    +        PN(se->sleep_start);
    +        PN(se->block_start);
    +        PN(se->sleep_max);
    +        PN(se->block_max);
    +        PN(se->exec_max);
    +        PN(se->slice_max);
    +        PN(se->wait_max);
    +        PN(se->wait_sum);
    +        P(se->wait_count);
    +#endif
    +        P(se->load.weight);
    +#undef PN
    +#undef P
    +}
    +#endif
    +
     static void
     print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
     {
    @@ -186,6 +220,7 @@ void print_cfs_rq(struct seq_file *m, in
     #ifdef CONFIG_SMP
             SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
     #endif
    +        print_cfs_group_stats(m, cpu, cfs_rq->tg);
     #endif
     }

    @@ -271,7 +306,7 @@ static int sched_debug_show(struct seq_f
             u64 now = ktime_to_ns(ktime_get());
             int cpu;

    -        SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
    +        SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
                     init_utsname()->release,
                     (int)strcspn(init_utsname()->version, " "),
                     init_utsname()->version);

    --
    An example output of group stats from /proc/sched_debug:

    cfs_rq[3]:/3/a/1
    .exec_clock : 89.598007
    .MIN_vruntime : 0.000001
    .min_vruntime : 256300.970506
    .max_vruntime : 0.000001
    .spread : 0.000000
    .spread0 : -25373.372248
    .nr_running : 0
    .load : 0
    .yld_exp_empty : 0
    .yld_act_empty : 0
    .yld_both_empty : 0
    .yld_count : 4474
    .sched_switch : 0
    .sched_count : 40507
    .sched_goidle : 12686
    .ttwu_count : 15114
    .ttwu_local : 11950
    .bkl_count : 67
    .nr_spread_over : 0
    .shares : 0
    .se->exec_start : 113676.727170
    .se->vruntime : 1592.612714
    .se->sum_exec_runtime : 89.598007
    .se->wait_start : 0.000000
    .se->sleep_start : 0.000000
    .se->block_start : 0.000000
    .se->sleep_max : 0.000000
    .se->block_max : 0.000000
    .se->exec_max : 1.000282
    .se->slice_max : 1.999750
    .se->wait_max : 54.981093
    .se->wait_sum : 217.610521
    .se->wait_count : 50
    .se->load.weight : 2
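
    As a rough sketch of the fairness measurement the changelog alludes to:
    sample a group's se->sum_exec_runtime from its cfs_rq[n]:/path sections
    before and after a busy interval, then compare the ratio of the deltas
    of two groups with the ratio of their cpu.shares. The helper and the
    numbers below are illustrative only:

    #include <math.h>
    #include <stdio.h>

    /*
     * Relative deviation of the observed exec-time split from the split
     * implied by the groups' shares.  delta_a/delta_b are exec-time deltas
     * (in ms) over the same interval; shares_a/shares_b are the cpu.shares
     * values of the two groups.
     */
    static double fairness_error(double delta_a, double delta_b,
                                 unsigned long shares_a, unsigned long shares_b)
    {
            double observed = delta_a / delta_b;
            double expected = (double)shares_a / (double)shares_b;

            return fabs(observed - expected) / expected;
    }

    int main(void)
    {
            /* illustrative numbers: group A ran 2000 ms, group B 1040 ms */
            double err = fairness_error(2000.0, 1040.0, 2048, 1024);

            printf("deviation from fair share: %.1f%%\n", err * 100.0);
            return 0;
    }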

    Regards,
    Bharata.

  6. Re: [PATCH -v2] sched: Include group statistics in /proc/sched_debug


    * Bharata B Rao wrote:

    > > Why not simply add them to the cfs_rq[n]:/path sections we already
    > > have?

    >
    > Makes sense. Here is the updated patch which puts group statistics
    > under cfs_rq[n]:/path section as you suggest.
    >
    > Include group statistics in /proc/sched_debug.


    applied to tip/sched/core, thanks Bharata!

    Ingo
