[PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu - Kernel



Thread: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

  1. [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    RCU can only control the lifetime of allocated memory blocks, which
    forces all the call structures to be allocated. This is expensive
    compared to allocating them on the stack, which is the common case for
    synchronous calls.

    This patch takes a different approach. Rather than using RCU, the
    queues are managed under rwlocks. Adding or removing from the queue
    requires holding the lock for writing, but multiple CPUs can walk the
    queues to process function calls under read locks. In the common
    case, where the structures are stack allocated, the calling CPU need
    only wait for its call to be done, take the lock for writing and
    remove the call structure.

    Lock contention - particularly write vs read - is reduced by using
    multiple queues.
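
    In outline, the new locking scheme looks like this (a simplified sketch
    distilled from the diff below; interrupt handling, the multi-queue
    selection and the heap-allocated async case are omitted, so this is not
    the literal patch code):

        /* Caller side, synchronous call with on-stack data: */
        write_lock(&queue->rwlock);
        list_add_tail(&data->csd.list, &queue->list);   /* publish the call */
        write_unlock(&queue->rwlock);

        arch_send_call_function_ipi(mask);
        csd_flag_wait(&data->csd);                      /* wait for all targets to run it */

        write_lock(&queue->rwlock);
        list_del(&data->csd.list);                      /* caller removes its own entry */
        write_unlock(&queue->rwlock);

        /* IPI handler side, on each target CPU: */
        read_lock(&queue->rwlock);                      /* many CPUs may walk concurrently */
        list_for_each_entry(data, &queue->list, csd.list) {
                if (cpu_isset(cpu, data->cpumask))
                        data->csd.func(data->csd.info); /* refcount/wait handling omitted */
        }
        read_unlock(&queue->rwlock);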

    Signed-off-by: Jeremy Fitzhardinge
    ---
    kernel/smp.c | 140 +++++++++++++++++++++++-----------------------------------
    1 file changed, 56 insertions(+), 84 deletions(-)

    ===================================================================
    --- a/kernel/smp.c
    +++ b/kernel/smp.c
    @@ -7,8 +7,6 @@
    #include
    #include
    #include
    -#include
    -#include
    #include
    #include

    @@ -26,7 +24,7 @@
    static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
    struct ____cacheline_aligned queue {
    struct list_head list;
    - spinlock_t lock;
    + rwlock_t rwlock;
    };

    static __cacheline_aligned struct queue call_function_queues[NQUEUES];
    @@ -40,7 +38,7 @@
    struct call_single_data csd;
    atomic_t refs;
    cpumask_t cpumask;
    - struct rcu_head rcu_head;
    + struct list_head cull_list;
    };

    struct call_single_queue {
    @@ -98,15 +96,6 @@
    csd_flag_wait(data);
    }

    -static void rcu_free_call_data(struct rcu_head *head)
    -{
    - struct call_function_data *data;
    -
    - data = container_of(head, struct call_function_data, rcu_head);
    -
    - kfree(data);
    -}
    -
    /*
    * Invoked by arch to handle an IPI for call function. Must be called with
    * interrupts disabled.
    @@ -115,17 +104,14 @@
    {
    struct call_function_data *data;
    struct queue *queue;
    + LIST_HEAD(cull_list);
    int cpu = get_cpu();

    queue = &call_function_queues[queue_no];
    -
    - /*
    - * It's ok to use list_for_each_rcu() here even though we may delete
    - * 'pos', since list_del_rcu() doesn't clear ->next
    - */
    - rcu_read_lock();
    - list_for_each_entry_rcu(data, &queue->list, csd.list) {
    - if (!cpu_isset(cpu, data->cpumask))
    +
    + read_lock(&queue->rwlock);
    + list_for_each_entry(data, &queue->list, csd.list) {
    + if (!cpu_isset(cpu, data->cpumask))
    continue;

    data->csd.func(data->csd.info);
    @@ -134,22 +120,36 @@
    if (!atomic_dec_and_test(&data->refs))
    continue;

    - spin_lock(&queue->lock);
    - list_del_rcu(&data->csd.list);
    - spin_unlock(&queue->lock);
    -
    if (data->csd.flags & CSD_FLAG_WAIT) {
    /*
    - * serialize stores to data with the flag clear
    - * and wakeup
    + * Serialize stores to data with the flag
    + * clear and wakeup. Waiter will remove us
    + * from the list.
    */
    smp_wmb();
    data->csd.flags &= ~CSD_FLAG_WAIT;
    + } else {
    + /*
    + * If there's no waiter, then the data must
    + * have been heap-allocated.
    + */
    + BUG_ON(!(data->csd.flags & CSD_FLAG_ALLOC));
    +
    + list_add_tail(&data->cull_list, &cull_list);
    }
    - if (data->csd.flags & CSD_FLAG_ALLOC)
    - call_rcu(&data->rcu_head, rcu_free_call_data);
    }
    - rcu_read_unlock();
    + read_unlock(&queue->rwlock);
    +
    + if (!list_empty(&cull_list)) {
    + struct call_function_data *next;
    +
    + write_lock(&queue->rwlock);
    + list_for_each_entry_safe(data, next, &cull_list, cull_list) {
    + list_del(&data->csd.list);
    + kfree(data);
    + }
    + write_unlock(&queue->rwlock);
    + }

    put_cpu();
    }
    @@ -271,42 +271,6 @@
    generic_exec_single(cpu, data);
    }

    -/* Dummy function */
    -static void quiesce_dummy(void *unused)
    -{
    -}
    -
    -/*
    - * Ensure stack based data used in call function mask is safe to free.
    - *
    - * This is needed by smp_call_function_mask when using on-stack data, because
    - * a single call function queue is shared by all CPUs, and any CPU may pick up
    - * the data item on the queue at any time before it is deleted. So we need to
    - * ensure that all CPUs have transitioned through a quiescent state after
    - * this call.
    - *
    - * This is a very slow function, implemented by sending synchronous IPIs to
    - * all possible CPUs. For this reason, we have to alloc data rather than use
    - * stack based data even in the case of synchronous calls. The stack based
    - * data is then just used for deadlock/oom fallback which will be very rare.
    - *
    - * If a faster scheme can be made, we could go back to preferring stack based
    - * data -- the data allocation/free is non-zero cost.
    - */
    -static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
    -{
    - struct call_single_data data;
    - int cpu;
    -
    - data.func = quiesce_dummy;
    - data.info = NULL;
    -
    - for_each_cpu_mask(cpu, mask) {
    - data.flags = CSD_FLAG_WAIT;
    - generic_exec_single(cpu, &data);
    - }
    -}
    -
    /**
    * smp_call_function_mask(): Run a function on a set of other CPUs.
    * @mask: The set of cpus to run on.
    @@ -332,7 +296,6 @@
    cpumask_t allbutself;
    unsigned long flags;
    int cpu, num_cpus;
    - int slowpath = 0;
    unsigned queue_no;
    struct queue *queue;

    @@ -359,16 +322,20 @@
    return smp_call_function_single(cpu, func, info, wait);
    }

    - data = kmalloc(sizeof(*data), GFP_ATOMIC);
    - if (data) {
    - data->csd.flags = CSD_FLAG_ALLOC;
    - if (wait)
    - data->csd.flags |= CSD_FLAG_WAIT;
    - } else {
    + /*
    + * Allocate data if it's an async call, otherwise use stack.
    + * If the allocation fails, then convert it to a sync call and
    + * use the stack anyway.
    + */
    + if (!wait) {
    + data = kmalloc(sizeof(*data), GFP_ATOMIC);
    + if (data)
    + data->csd.flags = CSD_FLAG_ALLOC;
    + }
    + if (!data) {
    data = &d;
    data->csd.flags = CSD_FLAG_WAIT;
    wait = 1;
    - slowpath = 1;
    }

    data->csd.func = func;
    @@ -376,9 +343,9 @@
    atomic_set(&data->refs, num_cpus);
    data->cpumask = mask;

    - spin_lock_irqsave(&queue->lock, flags);
    - list_add_tail_rcu(&data->csd.list, &queue->list);
    - spin_unlock_irqrestore(&queue->lock, flags);
    + write_lock_irqsave(&queue->rwlock, flags);
    + list_add_tail(&data->csd.list, &queue->list);
    + write_unlock_irqrestore(&queue->rwlock, flags);

    /* Send a message to all CPUs in the map */
    arch_send_call_function_ipi(mask);
    @@ -386,8 +353,13 @@
    /* optionally wait for the CPUs to complete */
    if (wait) {
    csd_flag_wait(&data->csd);
    - if (unlikely(slowpath))
    - smp_call_function_mask_quiesce_stack(mask);
    +
    + write_lock_irqsave(&queue->rwlock, flags);
    + list_del(&data->csd.list);
    + write_unlock_irqrestore(&queue->rwlock, flags);
    +
    + /* We should never wait for allocated data. */
    + BUG_ON(data->csd.flags & CSD_FLAG_ALLOC);
    }

    return 0;
    @@ -425,7 +397,7 @@
    int i;

    for(i = 0; i < NQUEUES; i++)
    - spin_lock(&call_function_queues[i].lock);
    + write_lock(&call_function_queues[i].rwlock);
    }

    void ipi_call_unlock(void)
    @@ -433,7 +405,7 @@
    int i;

    for(i = 0; i < NQUEUES; i++)
    - spin_unlock(&call_function_queues[i].lock);
    + write_unlock(&call_function_queues[i].rwlock);
    }

    void ipi_call_lock_irq(void)
    @@ -443,7 +415,7 @@
    spin_lock_irq(&queues_lock);

    for(i = 0; i < NQUEUES; i++)
    - spin_lock_nest_lock(&call_function_queues[i].lock, &queues_lock);
    + write_lock(&call_function_queues[i].rwlock);
    }

    void ipi_call_unlock_irq(void)
    @@ -451,7 +423,7 @@
    int i;

    for(i = 0; i < NQUEUES; i++)
    - spin_unlock(&call_function_queues[i].lock);
    + write_unlock(&call_function_queues[i].rwlock);

    spin_unlock_irq(&queues_lock);
    }
    @@ -463,7 +435,7 @@

    for(i = 0; i < NQUEUES; i++) {
    INIT_LIST_HEAD(&call_function_queues[i].list);
    - spin_lock_init(&call_function_queues[i].lock);
    + rwlock_init(&call_function_queues[i].rwlock);
    }

    printk(KERN_INFO "smp function calls: using %d/%d queues\n",



  2. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Friday 22 August 2008 10:29, Jeremy Fitzhardinge wrote:
    > RCU can only control the lifetime of allocated memory blocks, which
    > forces all the call structures to be allocated. This is expensive
    > compared to allocating them on the stack, which is the common case for
    > synchronous calls.
    >
    > This patch takes a different approach. Rather than using RCU, the
    > queues are managed under rwlocks. Adding or removing from the queue
    > requires holding the lock for writing, but multiple CPUs can walk the
    > queues to process function calls under read locks. In the common
    > case, where the structures are stack allocated, the calling CPU need
    > only wait for its call to be done, take the lock for writing and
    > remove the call structure.
    >
    > Lock contention - particularly write vs read - is reduced by using
    > multiple queues.


    Could be reasonable. Still, it's adding something like 4 or 5 more atomic
    operations, which will be approaching the cost of a slab allocation.

    Another approach to reducing the slab allocation cost would be to do the
    call_rcu at the caller in the wait case, so we hit the CPU-local freeing
    path in the slab allocator.
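
    Roughly what that would look like in smp_call_function_mask(), keeping the
    existing RCU-managed queue rather than this patch's rwlocks (a hypothetical
    sketch; the IPI handler would still do the list_del_rcu, but would skip the
    call_rcu when a waiter is present):

        data = kmalloc(sizeof(*data), GFP_ATOMIC);
        data->csd.flags = CSD_FLAG_ALLOC;
        if (wait)
                data->csd.flags |= CSD_FLAG_WAIT;
        /* ...fill in func/info/refs/cpumask, queue with list_add_tail_rcu()... */

        arch_send_call_function_ipi(mask);

        if (wait) {
                csd_flag_wait(&data->csd);
                /*
                 * All targets are done; queue the free from the allocating CPU
                 * so the eventual kfree() hits the CPU-local slab freeing path.
                 */
                call_rcu(&data->rcu_head, rcu_free_call_data);
        }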


    > [quoted patch snipped]

  3. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu


    * Jeremy Fitzhardinge wrote:

    > RCU can only control the lifetime of allocated memory blocks, which
    > forces all the call structures to be allocated. This is expensive
    > compared to allocating them on the stack, which is the common case for
    > synchronous calls.
    >
    > This patch takes a different approach. Rather than using RCU, the
    > queues are managed under rwlocks. Adding or removing from the queue
    > requires holding the lock for writing, but multiple CPUs can walk the
    > queues to process function calls under read locks. In the common
    > case, where the structures are stack allocated, the calling CPU need
    > only wait for its call to be done, take the lock for writing and
    > remove the call structure.
    >
    > Lock contention - particularly write vs read - is reduced by using
    > multiple queues.


    hm, is there any authoritative data on what is cheaper on a big box, a
    full-blown MESI cache miss that occurs for every reader in this new
    fastpath, or a local SLAB/SLUB allocation+free that occurs with the
    current RCU approach?

    Ingo

  4. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Hi Ingo,

    On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar wrote:
    >
    > * Jeremy Fitzhardinge wrote:
    >
    >> RCU can only control the lifetime of allocated memory blocks, which
    >> forces all the call structures to be allocated. This is expensive
    >> compared to allocating them on the stack, which is the common case for
    >> synchronous calls.
    >>
    >> This patch takes a different approach. Rather than using RCU, the
    >> queues are managed under rwlocks. Adding or removing from the queue
    >> requires holding the lock for writing, but multiple CPUs can walk the
    >> queues to process function calls under read locks. In the common
    >> case, where the structures are stack allocated, the calling CPU need
    >> only wait for its call to be done, take the lock for writing and
    >> remove the call structure.
    >>
    >> Lock contention - particularly write vs read - is reduced by using
    >> multiple queues.

    >
    > hm, is there any authorative data on what is cheaper on a big box, a
    > full-blown MESI cache miss that occurs for every reader in this new
    > fastpath, or a local SLAB/SLUB allocation+free that occurs with the
    > current RCU approach?


    Christoph might have an idea about it.

  5. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu


    * Pekka Enberg wrote:

    > Hi Ingo,
    >
    > On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar wrote:
    > >
    > > * Jeremy Fitzhardinge wrote:
    > >
    > >> RCU can only control the lifetime of allocated memory blocks, which
    > >> forces all the call structures to be allocated. This is expensive
    > >> compared to allocating them on the stack, which is the common case for
    > >> synchronous calls.
    > >>
    > >> This patch takes a different approach. Rather than using RCU, the
    > >> queues are managed under rwlocks. Adding or removing from the queue
    > >> requires holding the lock for writing, but multiple CPUs can walk the
    > >> queues to process function calls under read locks. In the common
    > >> case, where the structures are stack allocated, the calling CPU need
    > >> only wait for its call to be done, take the lock for writing and
    > >> remove the call structure.
    > >>
    > >> Lock contention - particularly write vs read - is reduced by using
    > >> multiple queues.

    > >
    > > hm, is there any authorative data on what is cheaper on a big box, a
    > > full-blown MESI cache miss that occurs for every reader in this new
    > > fastpath, or a local SLAB/SLUB allocation+free that occurs with the
    > > current RCU approach?

    >
    > Christoph might have an idea about it.


    .... thought of that missing Cc: line entry exactly 1.3 seconds after
    having sent the mail

    Christoph, any preferences/suggestions?

    Ingo

  6. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Friday 22 August 2008 17:12, Ingo Molnar wrote:
    > * Pekka Enberg wrote:
    > > Hi Ingo,
    > >
    > > On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar wrote:
    > > > * Jeremy Fitzhardinge wrote:
    > > >> RCU can only control the lifetime of allocated memory blocks, which
    > > >> forces all the call structures to be allocated. This is expensive
    > > >> compared to allocating them on the stack, which is the common case for
    > > >> synchronous calls.
    > > >>
    > > >> This patch takes a different approach. Rather than using RCU, the
    > > >> queues are managed under rwlocks. Adding or removing from the queue
    > > >> requires holding the lock for writing, but multiple CPUs can walk the
    > > >> queues to process function calls under read locks. In the common
    > > >> case, where the structures are stack allocated, the calling CPU need
    > > >> only wait for its call to be done, take the lock for writing and
    > > >> remove the call structure.
    > > >>
    > > >> Lock contention - particularly write vs read - is reduced by using
    > > >> multiple queues.
    > > >
    > > > hm, is there any authorative data on what is cheaper on a big box, a
    > > > full-blown MESI cache miss that occurs for every reader in this new
    > > > fastpath, or a local SLAB/SLUB allocation+free that occurs with the
    > > > current RCU approach?

    > >
    > > Christoph might have an idea about it.

    >
    > ... thought of that missing Cc: line entry exactly 1.3 seconds after
    > having sent the mail
    >
    > Christoph, any preferences/suggestions?


    I think it's just going to be a matter of benchmarking it and seeing.
    And small/medium systems are probably more important than huge ones
    unless there is a pathological scalability problem with one of the
    approaches (which there probably isn't seeing as there is already
    locking there).

  7. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Pekka Enberg wrote:
    > Hi Ingo,
    >
    > On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar wrote:
    >> * Jeremy Fitzhardinge wrote:
    >>
    >>> RCU can only control the lifetime of allocated memory blocks, which
    >>> forces all the call structures to be allocated. This is expensive
    >>> compared to allocating them on the stack, which is the common case for
    >>> synchronous calls.
    >>>
    >>> This patch takes a different approach. Rather than using RCU, the
    >>> queues are managed under rwlocks. Adding or removing from the queue
    >>> requires holding the lock for writing, but multiple CPUs can walk the
    >>> queues to process function calls under read locks. In the common
    >>> case, where the structures are stack allocated, the calling CPU need
    >>> only wait for its call to be done, take the lock for writing and
    >>> remove the call structure.
    >>>
    >>> Lock contention - particularly write vs read - is reduced by using
    >>> multiple queues.

    >> hm, is there any authorative data on what is cheaper on a big box, a
    >> full-blown MESI cache miss that occurs for every reader in this new
    >> fastpath, or a local SLAB/SLUB allocation+free that occurs with the
    >> current RCU approach?

    >
    > Christoph might have an idea about it.


    It's on the stack, which is presumably hot, so no cache miss? If it's async
    then presumably we do not need to wait, so it's okay to call an allocator.

    Generally: the larger the box (longer cacheline acquisition latencies) and the
    higher the contention (cannot get the cacheline because of contention), the
    better a slab allocation will look compared to a cacheline miss.

    RCU is problematic because it lets cachelines get cold. A hot cacheline that
    is frequently read and written by the same cpu is a very good thing for
    performance.


  8. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Fri, Aug 22, 2008 at 09:01:22AM -0500, Christoph Lameter wrote:
    > Pekka Enberg wrote:
    > > Hi Ingo,
    > >
    > > On Fri, Aug 22, 2008 at 9:28 AM, Ingo Molnar wrote:
    > >> * Jeremy Fitzhardinge wrote:
    > >>
    > >>> RCU can only control the lifetime of allocated memory blocks, which
    > >>> forces all the call structures to be allocated. This is expensive
    > >>> compared to allocating them on the stack, which is the common case for
    > >>> synchronous calls.
    > >>>
    > >>> This patch takes a different approach. Rather than using RCU, the
    > >>> queues are managed under rwlocks. Adding or removing from the queue
    > >>> requires holding the lock for writing, but multiple CPUs can walk the
    > >>> queues to process function calls under read locks. In the common
    > >>> case, where the structures are stack allocated, the calling CPU need
    > >>> only wait for its call to be done, take the lock for writing and
    > >>> remove the call structure.
    > >>>
    > >>> Lock contention - particularly write vs read - is reduced by using
    > >>> multiple queues.
    > >> hm, is there any authorative data on what is cheaper on a big box, a
    > >> full-blown MESI cache miss that occurs for every reader in this new
    > >> fastpath, or a local SLAB/SLUB allocation+free that occurs with the
    > >> current RCU approach?

    > >
    > > Christoph might have an idea about it.

    >
    > Its on the stack which is presumably hot so no cache miss? If its async then
    > presumably we do not need to wait so its okay to call an allocator.
    >
    > Generally: The larger the box (longer cacheline acquisition latencies) and the
    > higher the contention (cannot get cacheline because of contention) the better
    > a slab allocation will be compared to a cacheline miss.
    >
    > RCU is problematic because it lets cachelines get cold. A hot cacheline that
    > is used frequently read and written to by the same cpu is very good thing for
    > performace.


    So on these large boxes, read-only cachelines are preferentially
    ejected from the cache, so that one should write to per-CPU data
    occasionally to keep it resident? Or is the issue the long RCU grace
    periods, which allow the structure being freed to age out of all relevant
    caches? (My guess would be the second.)

    Thanx, Paul

  9. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Paul E. McKenney wrote:

    >> RCU is problematic because it lets cachelines get cold. A hot cacheline that
    >> is used frequently read and written to by the same cpu is very good thing for
    >> performace.

    >
    > So on your these large boxes, read-only cachelines are preferentially
    > ejected from the cache, so that one should write to per-CPU data
    > occasionally to keep it resident? Or is the issue the long RCU grace
    > periods which allow the structure being freed to age out of all relevant
    > caches? (My guess would be the second.)


    The issue is the RCU grace periods, which are generally long enough to make
    the cacheline fall out of all caches.

  10. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Fri, Aug 22, 2008 at 12:14:37PM -0500, Christoph Lameter wrote:
    > Paul E. McKenney wrote:
    >
    > >> RCU is problematic because it lets cachelines get cold. A hot cacheline that
    > >> is used frequently read and written to by the same cpu is very good thing for
    > >> performace.

    > >
    > > So on your these large boxes, read-only cachelines are preferentially
    > > ejected from the cache, so that one should write to per-CPU data
    > > occasionally to keep it resident? Or is the issue the long RCU grace
    > > periods which allow the structure being freed to age out of all relevant
    > > caches? (My guess would be the second.)

    >
    > The issue are the RCU grace period that are generally long enough to make the
    > cacheline fall out of all caches.


    Would it make sense to push the freed-by-RCU memory further up the
    hierarchy, so that such memory is not mistaken for recently freed
    hot-in-cache memory?

    Thanx, Paul

  11. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Andi Kleen wrote:
    > Right now my impression is that it is not well understood why
    > the kmalloc makes the IPI that much slower. In theory a kmalloc
    > shouldn't be all that slow, it's essentially just a
    > "disable interrupts; unlink object from cpu cache; enable interrupts"
    > with some window dressing. kfree() is similar.
    >
    > Does it bounce a cache line on freeing perhaps?


    I think it's just an assumption that it would be slower. Has anyone
    measured it?

    (Note: The measurements I posted do not cover this path, because it was
    on a two cpu system, and it was always using the call-single path.)

    J

  12. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Paul E. McKenney wrote:

    >>> So on your these large boxes, read-only cachelines are preferentially
    >>> ejected from the cache, so that one should write to per-CPU data
    >>> occasionally to keep it resident? Or is the issue the long RCU grace
    >>> periods which allow the structure being freed to age out of all relevant
    >>> caches? (My guess would be the second.)

    >> The issue are the RCU grace period that are generally long enough to make the
    >> cacheline fall out of all caches.

    >
    > Would it make sense to push the freed-by-RCU memory further up the
    > hierarchy, so that such memory is not mistaken for recently freed
    > hot-in-cache memory?


    That would mean passing a gfp flag like __GFP_COLD on free from RCU? Or how
    would that work at the higher levels?



  13. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    > Would it make sense to push the freed-by-RCU memory further up the
    > hierarchy, so that such memory is not mistaken for recently freed
    > hot-in-cache memory?


    Right now my impression is that it is not well understood why
    the kmalloc makes the IPI that much slower. In theory a kmalloc
    shouldn't be all that slow, it's essentially just a
    "disable interrupts; unlink object from cpu cache; enable interrupts"
    with some window dressing. kfree() is similar.

    Does it bounce a cache line on freeing perhaps?
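
    In pseudo-C, the fastpath being described is roughly the following (a
    conceptual sketch only; the struct and field names are made up for
    illustration and are not the real SLAB/SLUB internals):

        struct percpu_freelist {
                void **objects;                 /* per-CPU array of free objects */
                unsigned int avail;
        };

        static void *kmalloc_fastpath(struct percpu_freelist *c)
        {
                unsigned long flags;
                void *obj = NULL;

                local_irq_save(flags);          /* "disable interrupts" */
                if (c->avail)
                        obj = c->objects[--c->avail];   /* "unlink object from cpu cache" */
                local_irq_restore(flags);       /* "enable interrupts" */

                return obj;                     /* NULL means fall back to the slow path */
        }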

    -Andi

  14. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Fri, Aug 22, 2008 at 01:36:37PM -0500, Christoph Lameter wrote:
    > Paul E. McKenney wrote:
    >
    > >>> So on your these large boxes, read-only cachelines are preferentially
    > >>> ejected from the cache, so that one should write to per-CPU data
    > >>> occasionally to keep it resident? Or is the issue the long RCU grace
    > >>> periods which allow the structure being freed to age out of all relevant
    > >>> caches? (My guess would be the second.)
    > >> The issue are the RCU grace period that are generally long enough to make the
    > >> cacheline fall out of all caches.

    > >
    > > Would it make sense to push the freed-by-RCU memory further up the
    > > hierarchy, so that such memory is not mistaken for recently freed
    > > hot-in-cache memory?

    >
    > That would mean passing a gfp flag like __GFP_COLD on free from RCU? Or how
    > would that work at the higher levels?


    I was indeed thinking in terms of the free from RCU being specially marked.

    Thanx, Paul

  15. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Paul E. McKenney wrote:

    > I was indeed thinking in terms of the free from RCU being specially marked.


    Isn't there some way to shorten the RCU grace periods significantly? Critical
    sections do not take that long, after all.

    If the RCU grace periods were much shorter, the chance of the objects still
    being cache-hot would increase.


  16. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Fri, Aug 22, 2008 at 03:03:13PM -0500, Christoph Lameter wrote:
    > Paul E. McKenney wrote:
    >
    > > I was indeed thinking in terms of the free from RCU being specially marked.

    >
    > Isnt there some way to shorten the rcu periods significantly? Critical
    > sections do not take that long after all.


    In theory, yes. However, the shorter the grace period, the greater the
    per-update overhead of grace-period detection -- the general approach
    is to use a per-CPU high-resolution timer to force RCU grace period
    processing every 100 microseconds or so. Also, by definition, the RCU
    grace period can be no shorter than the longest active RCU read-side
    critical section. Nevertheless, I have designed my current hierarchical
    RCU patch with expedited grace periods in mind, though more for the
    purpose of reducing latency of long strings of operations that involve
    synchronize_rcu() than for cache locality.

    > If the RCU periods are much shorter then the chance of cache hotness of the
    > objects is increased.


    How short does the grace period need to be to significantly increase the
    chance of an RCU-protected data element remaining in cache across an RCU
    grace period? The last time I calculated this, the knee of the curve was
    at a few tens of milliseconds, but to give you an idea of how long ago
    that was, the workload I used was TPC/A. Which might no longer be very
    representative. ;-)

    Thanx, Paul

  17. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Fri, Aug 22, 2008 at 08:33:46PM +0200, Andi Kleen wrote:
    > > Would it make sense to push the freed-by-RCU memory further up the
    > > hierarchy, so that such memory is not mistaken for recently freed
    > > hot-in-cache memory?

    >
    > Right now my impression is that it is not well understood why
    > the kmalloc makes the IPI that much slower. In theory a kmalloc
    > shouldn't be all that slow, it's essentially just a
    > "disable interrupts; unlink object from cpu cache; enable interrupts"
    > with some window dressing. kfree() is similar.
    >
    > Does it bounce a cache line on freeing perhaps?


    It is indeed easy to focus on the wrong area when attempting to
    improve performance. Done it many times myself... :-/

    Thanx, Paul

  18. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    On Fri, Aug 22, 2008 at 11:35:46AM -0700, Jeremy Fitzhardinge wrote:
    > Andi Kleen wrote:
    > > Right now my impression is that it is not well understood why
    > > the kmalloc makes the IPI that much slower. In theory a kmalloc
    > > shouldn't be all that slow, it's essentially just a
    > > "disable interrupts; unlink object from cpu cache; enable interrupts"
    > > with some window dressing. kfree() is similar.
    > >
    > > Does it bounce a cache line on freeing perhaps?

    >
    > I think it's just an assumption that it would be slower. Has anyone
    > measured it?


    It's likely slower than no kmalloc, because more instructions will be
    executed; the question is just how much.

    >
    > (Note: The measurements I posted do not cover this path, because it was
    > on a two cpu system, and it was always using the call-single path.)


    Ah, so it was already 25% slower even without the kmalloc? I thought that
    figure already included it. That doesn't sound good. Any idea where that
    slowdown comes from?

    -Andi

  19. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    Andi Kleen wrote:
    > On Fri, Aug 22, 2008 at 11:35:46AM -0700, Jeremy Fitzhardinge wrote:
    >
    >> Andi Kleen wrote:
    >>
    >>> Right now my impression is that it is not well understood why
    >>> the kmalloc makes the IPI that much slower. In theory a kmalloc
    >>> shouldn't be all that slow, it's essentially just a
    >>> "disable interrupts; unlink object from cpu cache; enable interrupts"
    >>> with some window dressing. kfree() is similar.
    >>>
    >>> Does it bounce a cache line on freeing perhaps?
    >>>

    >> I think it's just an assumption that it would be slower. Has anyone
    >> measured it?
    >>

    >
    > It's likely slower than no kmalloc because
    > there will be more instructions executed, the question is just how much.
    >
    >
    >> (Note: The measurements I posted do not cover this path, because it was
    >> on a two cpu system, and it was always using the call-single path.)
    >>

    >
    > Ah so it was already 25% slower even without kmalloc? I thought
    > that was with already. That doesn't sound good. Any idea where that slowdown
    > comes from?


    Just longer code path, I think. It calls the generic
    smp_call_function_mask(), which then does a popcount on the cpu mask
    (which it needs to do anyway), sees only one bit set, and then punts to
    the smp_call_function_single() path. If we maintained a cpus_online
    count, then we could fast-path the call to smp_call_function_single() in
    the two core/cpu case more efficiently (would still need to scan the
    mask to extract the cpu number).

    Or alternatively, maybe it isn't actually worth special casing
    smp_call_function_single() with a multi-queue smp_call_function_mask()
    implementation?
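
    A sketch of the kind of fast path being suggested, at the top of
    smp_call_function_mask() (hypothetical; 'online_cpu_count' stands in for the
    maintained count mentioned above and does not exist under that name, while
    first_cpu()/next_cpu() are the existing cpumask helpers):

        if (online_cpu_count == 2) {
                /*
                 * Only one possible target besides ourselves: skip the popcount
                 * and the queue machinery and punt straight to the single-CPU
                 * path.  We still have to scan the mask for the target's number.
                 */
                int cpu = first_cpu(mask);

                if (cpu == smp_processor_id())
                        cpu = next_cpu(cpu, mask);
                if (cpu < NR_CPUS)
                        return smp_call_function_single(cpu, func, info, wait);
        }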

    J

  20. Re: [PATCH 2/2] smp_call_function: use rwlocks on queues rather than rcu

    > > Ah so it was already 25% slower even without kmalloc? I thought
    > > that was with already. That doesn't sound good. Any idea where that slowdown
    > > comes from?

    >
    > Just longer code path, I think. It calls the generic


    I did IPI measurements quite some time ago, and what I remember from them is
    that IPI latencies were in the low-thousands-of-cycles ballpark.

    > smp_call_function_mask(), which then does a popcount on the cpu mask
    > (which it needs to do anyway), sees only one bit set, and then punts to
    > the smp_call_function_single() path.


    But that is more in the range of a few tens of cycles (or maybe one to two
    hundred if you have an NR_CPUS==4096 kernel with a really large cpumask).

    Doesn't really explain a 25% slowdown, I would say.

    Are you sure there isn't a new cache miss in there or something? Actually,
    it would have to be multiple ones to account for such a slowdown.

    -Andi

    --
    ak@linux.intel.com
