[PATCH 0/8] scheduler patches - Kernel


Thread: [PATCH 0/8] scheduler patches

  1. [PATCH 0/8] scheduler patches


    1-3 definite .28 patches
    4-5 wanna-be .28 patches
    6-8 .29 work



  2. [PATCH 4/8] sched: re-instate vruntime based wakeup preemption

    The advantage is that vruntime based wakeup preemption has a better
    conceptual model. Here wakeup_gran = 0 means: preempt when 'fair'.
    Therefore wakeup_gran is the granularity of unfairness we allow in order
    to make progress.
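
    As a rough userspace restatement of this rule (the struct and function names
    below are made up for illustration, not the kernel code in the diff): with a
    granularity of 0 the test degenerates to preempting whenever the waking task
    is behind in vruntime, i.e. whenever that is 'fair'.

    #include <stdio.h>

    /* Illustration only: a stand-in entity with just a vruntime, not the
     * kernel's struct sched_entity. */
    struct ent { long long vruntime; };

    /* -1: don't preempt, 0: within the granularity, 1: preempt */
    static int should_preempt(const struct ent *curr, const struct ent *wakee,
                              long long gran)
    {
        long long vdiff = curr->vruntime - wakee->vruntime;

        if (vdiff <= 0)
            return -1;      /* wakee has already had at least its fair share */
        if (vdiff > gran)
            return 1;       /* lag exceeds the unfairness we allow */
        return 0;
    }

    int main(void)
    {
        struct ent curr = { .vruntime = 2000000 };
        struct ent wakee = { .vruntime = 1200000 };

        /* gran = 0: preempt whenever the wakee is behind, i.e. when 'fair' */
        printf("gran=0      -> %d\n", should_preempt(&curr, &wakee, 0));
        /* gran = 1ms: tolerate up to 1ms worth of unfairness */
        printf("gran=1e6 ns -> %d\n", should_preempt(&curr, &wakee, 1000000));
        return 0;
    }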

    Signed-off-by: Peter Zijlstra
    ---
    kernel/sched_fair.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++----
    1 file changed, 92 insertions(+), 6 deletions(-)

    Index: linux-2.6/kernel/sched_fair.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched_fair.c
    +++ linux-2.6/kernel/sched_fair.c
    @@ -143,6 +143,49 @@ static inline struct sched_entity *paren
     	return se->parent;
     }
     
    +/* return depth at which a sched entity is present in the hierarchy */
    +static inline int depth_se(struct sched_entity *se)
    +{
    +	int depth = 0;
    +
    +	for_each_sched_entity(se)
    +		depth++;
    +
    +	return depth;
    +}
    +
    +static void
    +find_matching_se(struct sched_entity **se, struct sched_entity **pse)
    +{
    +	int se_depth, pse_depth;
    +
    +	/*
    +	 * preemption test can be made between sibling entities who are in the
    +	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
    +	 * both tasks until we find their ancestors who are siblings of common
    +	 * parent.
    +	 */
    +
    +	/* First walk up until both entities are at same depth */
    +	se_depth = depth_se(*se);
    +	pse_depth = depth_se(*pse);
    +
    +	while (se_depth > pse_depth) {
    +		se_depth--;
    +		*se = parent_entity(*se);
    +	}
    +
    +	while (pse_depth > se_depth) {
    +		pse_depth--;
    +		*pse = parent_entity(*pse);
    +	}
    +
    +	while (!is_same_group(*se, *pse)) {
    +		*se = parent_entity(*se);
    +		*pse = parent_entity(*pse);
    +	}
    +}
    +
     #else /* CONFIG_FAIR_GROUP_SCHED */
     
     static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
    @@ -193,6 +236,11 @@ static inline struct sched_entity *paren
     	return NULL;
     }
     
    +static inline void
    +find_matching_se(struct sched_entity **se, struct sched_entity **pse)
    +{
    +}
    +
     #endif /* CONFIG_FAIR_GROUP_SCHED */
     
     
    @@ -1244,13 +1292,42 @@ static unsigned long wakeup_gran(struct
     	 * More easily preempt - nice tasks, while not making it harder for
     	 * + nice tasks.
     	 */
    -	if (sched_feat(ASYM_GRAN))
    -		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
    +	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
    +		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
     
     	return gran;
     }
     
     /*
    + * Should 'se' preempt 'curr'.
    + *
    + *             |s1
    + *        |s2
    + *   |s3
    + *         g
    + *      |<--->|c
    + *
    + *  w(c, s1) = -1
    + *  w(c, s2) =  0
    + *  w(c, s3) =  1
    + *
    + */
    +static int
    +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
    +{
    +	s64 gran, vdiff = curr->vruntime - se->vruntime;
    +
    +	if (vdiff <= 0)
    +		return -1;
    +
    +	gran = wakeup_gran(curr);
    +	if (vdiff > gran)
    +		return 1;
    +
    +	return 0;
    +}
    +
    +/*
      * Preempt the current task with a newly woken task if needed:
      */
     static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
    @@ -1258,7 +1335,6 @@ static void check_preempt_wakeup(struct
     	struct task_struct *curr = rq->curr;
     	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
     	struct sched_entity *se = &curr->se, *pse = &p->se;
    -	s64 delta_exec;
     
     	if (unlikely(rt_prio(p->prio))) {
     		update_rq_clock(rq);
    @@ -1296,9 +1372,19 @@ static void check_preempt_wakeup(struct
     		return;
     	}
     
    -	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
    -	if (delta_exec > wakeup_gran(pse))
    -		resched_task(curr);
    +	find_matching_se(&se, &pse);
    +
    +	while (se) {
    +		BUG_ON(!pse);
    +
    +		if (wakeup_preempt_entity(se, pse) == 1) {
    +			resched_task(curr);
    +			break;
    +		}
    +
    +		se = parent_entity(se);
    +		pse = parent_entity(pse);
    +	}
     }
     
     static struct task_struct *pick_next_task_fair(struct rq *rq)

    --


  3. [PATCH 8/8] use avg_vruntime for task placement

    Allow the use of avg_vruntime for task placement.

    The pro: it's the 'fair' place to insert tasks.
    The con: it's an extra division.

    Signed-off-by: Peter Zijlstra
    ---
    kernel/sched.c | 9 +++++++--
    kernel/sched_fair.c | 7 ++++++-
    kernel/sched_features.h | 1 +
    3 files changed, 14 insertions(+), 3 deletions(-)

    Index: linux-2.6/kernel/sched.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched.c
    +++ linux-2.6/kernel/sched.c
    @@ -1847,8 +1847,13 @@ void set_task_cpu(struct task_struct *p,
     		schedstat_inc(p, se.nr_forced2_migrations);
     	}
     #endif
    -	p->se.vruntime -= old_cfsrq->min_vruntime -
    -			 new_cfsrq->min_vruntime;
    +	if (sched_feat(AVG_VRUNTIME)) {
    +		p->se.vruntime -=
    +			avg_vruntime(old_cfsrq) - avg_vruntime(new_cfsrq);
    +	} else {
    +		p->se.vruntime -=
    +			old_cfsrq->min_vruntime - new_cfsrq->min_vruntime;
    +	}
     
     	__set_task_cpu(p, new_cpu);
     }
    Index: linux-2.6/kernel/sched_fair.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched_fair.c
    +++ linux-2.6/kernel/sched_fair.c
    @@ -719,7 +719,12 @@ static void check_spread(struct cfs_rq *
     static void
     place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
     {
    -	u64 vruntime = cfs_rq->min_vruntime;
    +	u64 vruntime;
    +
    +	if (sched_feat(AVG_VRUNTIME))
    +		vruntime = avg_vruntime(cfs_rq);
    +	else
    +		vruntime = cfs_rq->min_vruntime;
     
     	/*
     	 * The 'current' period is already promised to the current tasks,
    Index: linux-2.6/kernel/sched_features.h
    ===================================================================
    --- linux-2.6.orig/kernel/sched_features.h
    +++ linux-2.6/kernel/sched_features.h
    @@ -11,4 +11,5 @@ SCHED_FEAT(ASYM_GRAN, 1)
     SCHED_FEAT(LB_BIAS, 1)
     SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
     SCHED_FEAT(ASYM_EFF_LOAD, 1)
    +SCHED_FEAT(AVG_VRUNTIME, 0)
     SCHED_FEAT(WAKEUP_OVERLAP, 0)

    --


  4. Re: [PATCH 0/8] scheduler patches


    * Peter Zijlstra wrote:

    > 1-3 definite .28 patches
    > 4-5 wanna-be .28 patches


    ok, applied them to tip/sched/urgent, thanks Peter!

    > 6-8 .29 work


    Could we have a #9 that uses a no-fastpath-overhead min() method instead
    of avg_vruntime? Maybe we can get away without that overhead?

    Ingo

  5. Re: [PATCH 0/8] scheduler patches

    On Fri, 2008-10-24 at 12:26 +0200, Ingo Molnar wrote:
    > * Peter Zijlstra wrote:


    > > 6-8 .29 work

    >
    > Could we have a #9 that uses a no-fastpath-overhead min() method instead
    > of avg_vruntime? Maybe we can get away without that overhead?


    Looking at that suggestion, maybe I'll redo 7/8 so as to not depend on
    6/8, using min from the get-go and fixing that XXX as well.

    Will have to poke at it a little.


  6. Re: [PATCH 7/8] sched: non-zero lag renice

    Peter Zijlstra wrote:

    > When renicing, especially when lowering the nice value (getting heavier), it's
    > possible to get into a starvation scenario. If you got too much runtime as a
    > very light task, you get shot way off to the right, which means you'll have to
    > wait for a long time in order to run again.
    >
    > If during that wait you get reniced down, fairness would suggest you get run
    > earlier, because you deserve more time.
    >
    > This can be solved by scaling the vruntime so that we keep the real-time
    > lag invariant.


    If we've already been shot way out to the right, presumably that would give us
    a large real-time lag. If we renice to a lower nice level, wouldn't we want
    to reduce the real-time lag rather than make it constant?

    Chris

  7. Re: [PATCH 7/8] sched: non-zero lag renice

    On Fri, 2008-10-24 at 11:47 -0600, Chris Friesen wrote:
    > Peter Zijlstra wrote:
    >
    > > When renicing, especially when lowering the nice value (getting heavier), it's
    > > possible to get into a starvation scenario. If you got too much runtime as a
    > > very light task, you get shot way off to the right, which means you'll have to
    > > wait for a long time in order to run again.
    > >
    > > If during that wait you get reniced down, fairness would suggest you get run
    > > earlier, because you deserve more time.
    > >
    > > This can be solved by scaling the vruntime so that we keep the real-time
    > > lag invariant.

    >
    > If we've already been shot way out to the right, presumably that would give us
    > a large real-time lag. If we renice to a lower nice level, wouldn't we want
    > to reduce the real-time lag rather than make it constant?


    Ah, see but a 1ms real-time lag might be gigantic on weight=15, but
    nearly nothing on weight=88761.

    1e6 * 1024/15 is massively larger than 1e6 * 1024/88761.

    1000000*1024/15 = 68266666
    1000000*1024/88761 = 11536
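
    A quick standalone check of the arithmetic above; plain C, nothing
    kernel-specific, using the CFS load weights 15 (nice 19), 1024 (nice 0)
    and 88761 (nice -20):

    #include <stdio.h>

    int main(void)
    {
        long long lag_ns = 1000000;            /* 1ms of real-time lag */
        long long nice_0_load = 1024;
        long long weights[] = { 15, 88761 };   /* nice 19 and nice -20 */

        /* convert the wall-clock lag into vruntime units for each weight */
        for (int i = 0; i < 2; i++)
            printf("weight %lld: vruntime lag = %lld ns\n",
                   weights[i], lag_ns * nice_0_load / weights[i]);
        return 0;
    }

    The same 1 ms of real time is worth roughly 68 ms of vruntime at weight 15
    but only about 11.5 us at weight 88761, which is why keeping the real-time
    lag invariant means scaling the vruntime when the weight changes.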





  8. Re: [PATCH 7/8] sched: non-zero lag renice

    Peter Zijlstra wrote:
    > On Fri, 2008-10-24 at 11:47 -0600, Chris Friesen wrote:
    >> Peter Zijlstra wrote:
    >>
    >> > When renicing, especially when lowering the nice value (getting heavier), it's
    >> > possible to get into a starvation scenario. If you got too much runtime as a
    >> > very light task, you get shot way off to the right, which means you'll have to
    >> > wait for a long time in order to run again.
    >> >
    >> > If during that wait you get reniced down, fairness would suggest you get run
    >> > earlier, because you deserve more time.
    >> >
    >> > This can be solved by scaling the vruntime so that we keep the real-time
    >> > lag invariant.

    >>
    >> If we've already been shot way out to the right, presumably that would give us
    >> a large real-time lag. If we renice to a lower nice level, wouldn't we want
    >> to reduce the real-time lag rather than make it constant?


    Sorry, the patches arrived out of order and my comments were sent out before
    reading your definition of "lag" as "the difference between the service time
    received from the ideal model and the actual scheduler".

    I was using a definition of lag as "amount of time before the process gets
    scheduled again".

    Given your definition, I think the patch makes sense.

    Chris

  9. Re: [PATCH 6/8] sched: avg_vruntime

    On Fri, 2008-10-24 at 11:06 +0200, Peter Zijlstra wrote:
    > plain text document attachment (sched-avg-vruntime.patch)
    > Renicing requires scaling the lag. Therefore we need a way to compute it.
    > Lag is defined as the difference between the service time received from the
    > ideal model and the actual scheduler.
    >
    > The defining property of a fair scheduler is that the sum of all lags is zero;
    > which can be seen to be trivially true for the ideal case, as all lags are zero.
    >
    > Therefore, the average of all virtual runtimes will be the point of zero lag.
    >
    > We cannot prove fairness for CFS due to sleeper fairness (without it we can).
    > However, since we can observe that it does converge to fairness in stable operation,
    > we can say the zero lag point converges to the average.
    >
    > We can't just take the average of vruntime - as it will use the full range
    > of its u64 and will wrap around. Instead we'll use the average of
    > (vruntime - min_vruntime)
    >
    > \Sum_{i}^{n} 1/n (v_{i} - v) = 1/n (\Sum_{i}^{n} v_{i}) - vn
    >
    > By factoring out the 1/n (never storing that) we avoid rounding, which
    > would bring an accumulating error.


    Hi Fabio,

    you were right, this is wrong.

    How about this..

    The fluid model would, for each task t_i, generate an execution time e_i

    de_i = w_i / w_sum * dt

    However, any real scheduler will be imperfect and have an error eps_i

    dE_i = de_i + eps_i,

    But since only dt of actual time has passed, we can state that

    \Sum_i dE_i = dt, therefore \Sum_i eps_i = 0.

    This will be reflected in a virtual runtime skew of

    dv_i = eps_i / w_i

    If we now wish to obtain the zero lag point, where all tasks would be in
    the fluid model, we get

    eps_i = dv_i * w_i, which yields: \Sum dv_i * w_i = 0

    IOW avg(v_i*w_i) = v_fluid

    1/n \Sum_i v_i*w_i, [v_i -> v_i-x] ->
    1/n \sum_i (v_i-x)*w_i =
    1/n \Sum v_i*w_i - \Sum x*w_i =
    1/n \Sum v_i*w_i - x \Sum w_i

    which in turn would yield a patch like below..

    I'll also try and quantify the error and effect of using min_vruntime as
    zero lag point as Ingo suggested.
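
    As a rough userspace sketch of the 'keep the sums, divide only when reading'
    bookkeeping described above; the names and the normalisation by total weight
    are choices of this sketch only, whereas the patch below shifts by
    NICE_0_SHIFT and divides by nr_queued:

    /* Standalone model (not the kernel code): keep the weighted sum of keys
     * (vruntime - min_vruntime) and the total weight, and only divide when the
     * zero-lag point is actually read.  Dequeue would subtract symmetrically. */
    #include <stdio.h>
    #include <stdint.h>

    struct rq_model {
        uint64_t min_vruntime;
        int64_t key_weight_sum;     /* \Sum (v_i - min_vruntime) * w_i */
        int64_t weight_sum;         /* \Sum w_i */
    };

    static void model_enqueue(struct rq_model *rq, uint64_t v, long w)
    {
        rq->key_weight_sum += (int64_t)(v - rq->min_vruntime) * w;
        rq->weight_sum += w;
    }

    /* when min_vruntime advances by delta, every key shrinks by delta */
    static void model_update_min(struct rq_model *rq, uint64_t new_min)
    {
        int64_t delta = (int64_t)(new_min - rq->min_vruntime);

        if (delta > 0) {
            rq->key_weight_sum -= delta * rq->weight_sum;
            rq->min_vruntime = new_min;
        }
    }

    static uint64_t model_zero_lag(const struct rq_model *rq)
    {
        if (!rq->weight_sum)
            return rq->min_vruntime;
        return rq->min_vruntime + rq->key_weight_sum / rq->weight_sum;
    }

    int main(void)
    {
        struct rq_model rq = { .min_vruntime = 1000 };

        model_enqueue(&rq, 1000, 1024);   /* nice  0 task at min_vruntime */
        model_enqueue(&rq, 4000, 15);     /* nice 19 task far to the right */
        printf("zero-lag: %llu\n", (unsigned long long)model_zero_lag(&rq));

        model_update_min(&rq, 1500);      /* min_vruntime advances */
        printf("zero-lag: %llu\n", (unsigned long long)model_zero_lag(&rq));
        return 0;
    }

    Keying everything off min_vruntime keeps the accumulators small and signed,
    so the u64 wrap of absolute vruntime never enters the sums.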

    ---
    Index: linux-2.6/kernel/sched.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched.c	2008-10-29 16:43:16.000000000 +0100
    +++ linux-2.6/kernel/sched.c	2008-10-29 16:43:27.000000000 +0100
    @@ -384,6 +384,10 @@ struct cfs_rq {
     	struct load_weight load;
     	unsigned long nr_running;
     
    +	long nr_queued;
    +	long avg_load;
    +	s64 avg_vruntime;
    +
     	u64 exec_clock;
     	u64 min_vruntime;
     
    Index: linux-2.6/kernel/sched_debug.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched_debug.c	2008-10-29 16:43:04.000000000 +0100
    +++ linux-2.6/kernel/sched_debug.c	2008-10-29 16:43:37.000000000 +0100
    @@ -161,6 +161,9 @@ void print_cfs_rq(struct seq_file *m, in
     			SPLIT_NS(spread0));
     	SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
     	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
    +	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
    +			SPLIT_NS(avg_vruntime(cfs_rq)));
    +
     #ifdef CONFIG_SCHEDSTATS
     #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
     
    Index: linux-2.6/kernel/sched_fair.c
    ===================================================================
    --- linux-2.6.orig/kernel/sched_fair.c	2008-10-29 16:43:17.000000000 +0100
    +++ linux-2.6/kernel/sched_fair.c	2008-10-29 16:46:41.000000000 +0100
    @@ -271,6 +271,60 @@ static inline s64 entity_key(struct cfs_
     	return se->vruntime - cfs_rq->min_vruntime;
     }
     
    +static void
    +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
    +{
    +	s64 key = entity_key(cfs_rq, se);
    +	cfs_rq->avg_load += se->load.weight;
    +	cfs_rq->avg_vruntime += key * se->load.weight;
    +	cfs_rq->nr_queued++;
    +}
    +
    +static void
    +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
    +{
    +	s64 key = entity_key(cfs_rq, se);
    +	cfs_rq->avg_load -= se->load.weight;
    +	cfs_rq->avg_vruntime -= key * se->load.weight;
    +	cfs_rq->nr_queued--;
    +}
    +
    +static inline
    +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
    +{
    +	cfs_rq->avg_vruntime -= cfs_rq->nr_queued * cfs_rq->avg_load * delta;
    +}
    +
    +static u64 avg_vruntime(struct cfs_rq *cfs_rq)
    +{
    +	s64 avg = cfs_rq->avg_vruntime;
    +	long nr_queued = cfs_rq->nr_queued;
    +
    +	if (cfs_rq->curr) {
    +		nr_queued++;
    +		avg += entity_key(cfs_rq, cfs_rq->curr) * cfs_rq->curr->load.weight;
    +	}
    +
    +	avg >>= NICE_0_SHIFT;
    +
    +	if (nr_queued)
    +		avg = div_s64(avg, nr_queued);
    +
    +	return cfs_rq->min_vruntime + avg;
    +}
    +
    +static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
    +{
    +	/*
    +	 * open coded max_vruntime() to allow updating avg_vruntime
    +	 */
    +	s64 delta = (s64)(vruntime - cfs_rq->min_vruntime);
    +	if (delta > 0) {
    +		avg_vruntime_update(cfs_rq, delta);
    +		cfs_rq->min_vruntime = vruntime;
    +	}
    +}
    +
     static void update_min_vruntime(struct cfs_rq *cfs_rq)
     {
     	u64 vruntime = cfs_rq->min_vruntime;
    @@ -289,7 +343,7 @@ static void update_min_vruntime(struct c
     		vruntime = min_vruntime(vruntime, se->vruntime);
     	}
     
    -	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
    +	__update_min_vruntime(cfs_rq, vruntime);
     }
     
     /*
    @@ -303,6 +357,8 @@ static void __enqueue_entity(struct cfs_
     	s64 key = entity_key(cfs_rq, se);
     	int leftmost = 1;
     
    +	avg_vruntime_add(cfs_rq, se);
    +
     	/*
     	 * Find the right place in the rbtree:
     	 */
    @@ -345,6 +401,7 @@ static void __dequeue_entity(struct cfs_
     		cfs_rq->next = NULL;
     
     	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
    +	avg_vruntime_sub(cfs_rq, se);
     }
     
     static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)



  10. Re: [PATCH 6/8] sched: avg_vruntime

    Hi,

    > From: Peter Zijlstra
    > Date: Wed, Oct 29, 2008 04:48:34PM +0100
    >
    > On Fri, 2008-10-24 at 11:06 +0200, Peter Zijlstra wrote:

    ....
    > How about this..
    >
    > The fluid model would, for each task t_i, generate an execution time e_i
    >
    > de_i = w_i / w_sum * dt
    >
    > However, any real scheduler will be imperfect and have an error eps_i
    >
    > dE_i = de_i + eps_i,
    >


    ...according to this equation...


    > But since only dt of actual time has passed, we can state that
    >
    > \Sum_i dE_i = dt, therefore \Sum_i eps_i = 0.
    >
    > This will be reflected in a virtual runtime skew of
    >
    > dv_i = eps_i / w_i
    >


    ....and to this one, what you call ``virtual runtime skew'' is:

    dv_i = (dE_i - de_i) / w_i.


    > If we now wish to obtain the zero lag point, where all tasks would be in
    > the fluid model, we get
    >
    > eps_i = dv_i * w_i, which yields: \Sum dv_i * w_i = 0
    >
    > IOW avg(v_i*w_i) = v_fluid
    >


    Looking at the code, it seems that you use the vruntime values of the
    entities when you do the average, which are different from what you
    previously called ``virtual runtime skew.'' I don't understand the
    connection between the previous dv_i and the v_i there. Calling dVR_i
    the vruntime increment for the i-th entity, dVR_i = dE_i / w_i, which
    clearly differs from dv_i.

    Moreover, v_fluid (considering all the flows backlogged from the beginning
    and the set of active flows constant) is defined as:

    v_fluid = 1 / w_sum * \sum w_i * VR_i,

    so, unless w_sum == N, this differs from your expression for v_fluid.
    Am I missing something there?


    > 1/n \Sum_i v_i*w_i, [v_i -> v_i-x] ->
    > 1/n \sum_i (v_i-x)*w_i =
    > 1/n \Sum v_i*w_i - \Sum x*w_i =
    > 1/n \Sum v_i*w_i - x \Sum w_i
    >
    > which in turn would yield a patch like below..
    >
    > I'll also try and quantify the error and effect of using min_vruntime as
    > zero lag point as Ingo suggested.
    >


    min_vruntime, given its definition, is very likely to be near to the
    maximum lag point...


