
Thread: [PATCH][RFC] trace: profile likely and unlikely annotations

  1. [PATCH][RFC] trace: profile likely and unlikely annotations


    Andrew Morton recently suggested having an in-kernel way to profile
    likely and unlikely macros. This patch achieves that goal.

    When configured, every(*) likely and unlikely macro gets a counter attached
    to it. When the condition is tested, the hits and misses of that condition
    are recorded. These numbers can later be retrieved by:

    /debugfs/tracing/profile_likely - All likely markers
    /debugfs/tracing/profile_unlikely - All unlikely markers.

    # cat /debug/tracing/profile_unlikely | \
    awk '{ if ((($1 + 1.0)/ ($2 + 1.0)) > 0.5) { print $0; }}' |head -15
    Hit Missed Function
    --- ------ --------
    36278 44749 __switch_to+0x129/0x55d
    70064 10964 __switch_to+0xcd/0x55d
    70060 10964 __switch_to+0x63/0x55d
    2628 0 syscall_trace_leave+0x0/0x154
    1 0 native_smp_prepare_cpus+0x3e9/0x485
    287349 98809 place_entity+0xac/0x11c
    141308 236748 calc_delta_mine+0x8b/0x109
    55091 0 enqueue_task+0x0/0x9b
    71724 52117 pick_next_task_fair+0x0/0xc1
    11 5 yield_task_fair+0x0/0x13b
    1269 0 pre_schedule_rt+0x0/0x98
    46392 1265 pick_next_task_rt+0x0/0x12f
    51523 0 dequeue_task+0x57/0x114

    # cat /debug/tracing/profile_likely | \
    awk '{ if ((($2 + 1.0)/ ($1 + 1.0)) > 0.5) { print $0; }}'
    Hit Missed Function
    --- ------ --------
    43316 37700 schedule+0x6e7/0xdb2
    23032 12997 schedule+0xa9b/0xdb2
    0 7 __cancel_work_timer+0x4b/0x292
    885 1683 update_wall_time+0x27b/0x4fd
    0 369948 audit_syscall_exit+0x131/0x403
    19 370851 audit_free+0x0/0x27e
    0 368981 audit_syscall_entry+0x1a4/0x226
    19 900 audit_free+0xf0/0x27e
    0 973 audit_alloc+0xa6/0x13c
    0 6 move_masked_irq+0x2b/0x174
    12878 223023 page_evictable+0x9a/0x1f1
    9 7 flush_cpu_slab+0x0/0x143
    634634 324186 kmem_cache_free+0x79/0x139
    3142 61979 dentry_lru_del_init+0x0/0x89
    948 1166 load_elf_binary+0xa00/0x18ad
    102 82 disk_part_iter_next+0x0/0xf1
    0 11 tcp_mss_split_point+0x0/0xbd


    As you can see from the above, there's a bit of work to do in rethinking
    the use of some unlikelys and likelys. Note: the unlikely output had 60
    entries whose hit/miss ratio exceeded 50%.

    (*) Not every unlikely is recorded; those that are used by vsyscalls
    (a few of them) had to be marked as unlikely_notrace().
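
    For readers unfamiliar with gcc statement expressions, here is roughly
    what a call site such as "if (unlikely(err))" reduces to after
    preprocessing when the profiler is configured in. This is an
    illustrative sketch derived from the unlikely() definition in the
    compiler.h hunk below; handle_error() is just a placeholder:

    if (({
        int ______r;
        /* one static record per annotation site; the linker collects
         * them all into the _ftrace_unlikely section */
        static struct ftrace_likely_data ______f
            __attribute__((__aligned__(4)))
            __attribute__((section("_ftrace_unlikely")));
        /* the first execution records the site's address */
        if (__builtin_expect(!!(!______f.ip), 0))
            ______f.ip = ({ __label__ __here; __here: (unsigned long)&&__here; });
        ______r = __builtin_expect(!!(err), 0);  /* the original hint */
        ftrace_likely_update(&______f, ______r); /* count hit or miss */
        ______r;
    }))
        handle_error();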

    Signed-off-by: Steven Rostedt
    ---
    arch/x86/kernel/vsyscall_64.c | 5 -
    include/asm-generic/vmlinux.lds.h | 14 +++
    include/linux/compiler.h | 38 ++++++++
    include/linux/seqlock.h | 3
    kernel/trace/Kconfig | 16 +++
    kernel/trace/Makefile | 2
    kernel/trace/trace_unlikely.c | 163 ++++++++++++++++++++++++++++++++++++++
    7 files changed, 237 insertions(+), 4 deletions(-)

    Index: linux-tip.git/kernel/trace/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Makefile 2008-10-27 23:11:52.000000000 -0400
    +++ linux-tip.git/kernel/trace/Makefile 2008-10-27 23:12:08.000000000 -0400
    @@ -26,4 +26,6 @@ obj-$(CONFIG_FUNCTION_PROFILER) += trace
    obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
    obj-$(CONFIG_BOOT_TRACER) += trace_boot.o

    +obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
    +
    libftrace-y := ftrace.o
    Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
    ===================================================================
    --- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c 2008-10-27 23:11:52.000000000 -0400
    +++ linux-tip.git/arch/x86/kernel/vsyscall_64.c 2008-10-27 23:12:44.000000000 -0400
    @@ -124,7 +124,8 @@ static __always_inline void do_vgettimeo
    seq = read_seqbegin(&__vsyscall_gtod_data.lock);

    vread = __vsyscall_gtod_data.clock.vread;
    - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
    + if (unlikely_notrace(!__vsyscall_gtod_data.sysctl_enabled ||
    + !vread)) {
    gettimeofday(tv,NULL);
    return;
    }
    @@ -165,7 +166,7 @@ time_t __vsyscall(1) vtime(time_t *t)
    {
    struct timeval tv;
    time_t result;
    - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
    + if (unlikely_notrace(!__vsyscall_gtod_data.sysctl_enabled))
    return time_syscall(t);

    vgettimeofday(&tv, NULL);
    Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
    ===================================================================
    --- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h 2008-10-27 23:11:52.000000000 -0400
    +++ linux-tip.git/include/asm-generic/vmlinux.lds.h 2008-10-27 23:12:08.000000000 -0400
    @@ -45,6 +45,17 @@
    #define MCOUNT_REC()
    #endif

    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \
    + *(_ftrace_likely) \
    + VMLINUX_SYMBOL(__stop_likely_profile) = .; \
    + VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
    + *(_ftrace_unlikely) \
    + VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
    +#else
    +#define LIKELY_PROFILE()
    +#endif
    +
    /* .data section */
    #define DATA_DATA \
    *(.data) \
    @@ -62,7 +73,8 @@
    VMLINUX_SYMBOL(__stop___markers) = .; \
    VMLINUX_SYMBOL(__start___tracepoints) = .; \
    *(__tracepoints) \
    - VMLINUX_SYMBOL(__stop___tracepoints) = .;
    + VMLINUX_SYMBOL(__stop___tracepoints) = .; \
    + LIKELY_PROFILE()

    #define RO_DATA(align) \
    . = ALIGN((align)); \
    Index: linux-tip.git/include/linux/compiler.h
    ===================================================================
    --- linux-tip.git.orig/include/linux/compiler.h 2008-10-27 23:11:52.000000000 -0400
    +++ linux-tip.git/include/linux/compiler.h 2008-10-28 00:00:50.000000000 -0400
    @@ -59,8 +59,46 @@ extern void __chk_io_ptr(const volatile
    * specific implementations come from the above header files
    */

    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +struct ftrace_likely_data {
    + unsigned long ip;
    + unsigned long hit;
    + unsigned long missed;
    +};
    +void ftrace_likely_update(struct ftrace_likely_data *f, int val);
    +
    +#define likely_notrace(x) __builtin_expect(!!(x), 1)
    +#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
    +
    +/* Can't use _THIS_IP_ because it screws up header dependencies */
    +#define __THIS_IP__ ({ __label__ __here; __here: (unsigned long)&&__here; })
    +
    +#define likely(x) ({ \
    + int ______r; \
    + static struct ftrace_likely_data ______f \
    + __attribute__((__aligned__(4))) \
    + __attribute__((section("_ftrace_likely"))); \
    + if (unlikely_notrace(!______f.ip)) \
    + ______f.ip = __THIS_IP__; \
    + ______r = likely_notrace(x); \
    + ftrace_likely_update(&______f, ______r); \
    + ______r; \
    + })
    +#define unlikely(x) ({ \
    + int ______r; \
    + static struct ftrace_likely_data ______f \
    + __attribute__((__aligned__(4))) \
    + __attribute__((section("_ftrace_unlikely"))); \
    + if (unlikely_notrace(!______f.ip)) \
    + ______f.ip = __THIS_IP__; \
    + ______r = unlikely_notrace(x); \
    + ftrace_likely_update(&______f, ______r); \
    + ______r; \
    + })
    +#else
    #define likely(x) __builtin_expect(!!(x), 1)
    #define unlikely(x) __builtin_expect(!!(x), 0)
    +#endif

    /* Optimization barrier */
    #ifndef barrier
    Index: linux-tip.git/kernel/trace/Kconfig
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-27 23:11:52.000000000 -0400
    +++ linux-tip.git/kernel/trace/Kconfig 2008-10-27 23:12:08.000000000 -0400
    @@ -138,6 +138,22 @@ config BOOT_TRACER
    selected, because the self-tests are an initcall as well and that
    would invalidate the boot trace. )

    +config TRACE_UNLIKELY_PROFILE
    + bool "Trace likely/unlikely profiler"
    + depends on DEBUG_KERNEL
    + select TRACING
    + help
    + This tracer profiles all the likely and unlikely macros
    + in the kernel. It will display the results in:
    +
    + /debugfs/tracing/profile_likely
    + /debugfs/tracing/profile_unlikely
    +
    + Note: this will add a significant overhead, only turn this
    + on if you need to profile the system's use of these macros.
    +
    + Say N if unsure.
    +
    config STACK_TRACER
    bool "Trace max stack"
    depends on HAVE_FUNCTION_TRACER
    Index: linux-tip.git/kernel/trace/trace_unlikely.c
    ===================================================================
    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-27 23:13:53.000000000 -0400
    @@ -0,0 +1,163 @@
    +/*
    + * unlikely profiler
    + *
    + * Copyright (C) 2008 Steven Rostedt
    + */
    +#include <linux/kallsyms.h>
    +#include <linux/seq_file.h>
    +#include <linux/spinlock.h>
    +#include <linux/debugfs.h>
    +#include <linux/uaccess.h>
    +#include <linux/module.h>
    +#include <linux/ftrace.h>
    +#include <linux/hash.h>
    +#include <linux/fs.h>
    +#include <asm/local.h>
    +#include "trace.h"
    +
    +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
    +{
    + /* FIXME: Make this atomic! */
    + if (val)
    + f->hit++;
    + else
    + f->missed++;
    +}
    +EXPORT_SYMBOL(ftrace_likely_update);
    +
    +struct ftrace_pointer {
    + void *start;
    + void *stop;
    +};
    +
    +static void *
    +t_next(struct seq_file *m, void *v, loff_t *pos)
    +{
    + struct ftrace_pointer *f = m->private;
    + struct ftrace_likely_data *p = v;
    +
    + (*pos)++;
    +
    + if (v == (void *)1)
    + return f->start;
    +
    + ++p;
    +
    + if ((void *)p >= (void *)f->stop)
    + return NULL;
    +
    + return p;
    +}
    +
    +static void *t_start(struct seq_file *m, loff_t *pos)
    +{
    + void *t = (void *)1;
    + loff_t l = 0;
    +
    + for (; t && l < *pos; t = t_next(m, t, &l))
    + ;
    +
    + return t;
    +}
    +
    +static void t_stop(struct seq_file *m, void *p)
    +{
    +}
    +
    +static int trace_lookup_sym(struct seq_file *m, unsigned long addr)
    +{
    +#ifdef CONFIG_KALLSYMS
    + char str[KSYM_SYMBOL_LEN];
    +
    + sprint_symbol(str, addr);
    +
    + return seq_printf(m, "%s\n", str);
    +#else
    + return seq_printf(m, "%p\n", (void *)addr);
    +#endif
    +}
    +
    +static int t_show(struct seq_file *m, void *v)
    +{
    + struct ftrace_likely_data *p = v;
    +
    + if (v == (void *)1) {
    + seq_printf(m, " Hit Missed Function\n"
    + " --- ------ --------\n");
    + return 0;
    + }
    +
    + /* skip null records */
    + if (!p->ip)
    + return 0;
    +
    + seq_printf(m, " %8lu %8lu ", p->hit, p->missed);
    + trace_lookup_sym(m, p->ip);
    + return 0;
    +}
    +
    +static struct seq_operations tracing_likely_seq_ops = {
    + .start = t_start,
    + .next = t_next,
    + .stop = t_stop,
    + .show = t_show,
    +};
    +
    +static int tracing_likely_open(struct inode *inode, struct file *file)
    +{
    + int ret;
    +
    + ret = seq_open(file, &tracing_likely_seq_ops);
    + if (!ret) {
    + struct seq_file *m = file->private_data;
    + m->private = (void *)inode->i_private;
    + }
    +
    + return ret;
    +}
    +
    +static struct file_operations tracing_likely_fops = {
    + .open = tracing_likely_open,
    + .read = seq_read,
    + .llseek = seq_lseek,
    +};
    +
    +extern unsigned long __start_likely_profile[];
    +extern unsigned long __stop_likely_profile[];
    +extern unsigned long __start_unlikely_profile[];
    +extern unsigned long __stop_unlikely_profile[];
    +
    +static struct ftrace_pointer ftrace_likely_pos = {
    + .start = __start_likely_profile,
    + .stop = __stop_likely_profile,
    +};
    +
    +static struct ftrace_pointer ftrace_unlikely_pos = {
    + .start = __start_unlikely_profile,
    + .stop = __stop_unlikely_profile,
    +};
    +
    +static __init int ftrace_unlikely_init(void)
    +{
    + struct dentry *d_tracer;
    + struct dentry *entry;
    +
    + d_tracer = tracing_init_dentry();
    +
    + entry = debugfs_create_file("profile_likely", 0444, d_tracer,
    + &ftrace_likely_pos,
    + &tracing_likely_fops);
    + if (!entry)
    + pr_warning("Could not create debugfs 'profile_likely' entry\n");
    +
    + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
    + &ftrace_unlikely_pos,
    + &tracing_likely_fops);
    + if (!entry)
    + pr_warning("Could not create debugfs"
    + " 'profile_unlikely' entry\n");
    +
    + return 0;
    +}
    +
    +device_initcall(ftrace_unlikely_init);
    Index: linux-tip.git/include/linux/seqlock.h
    ===================================================================
    --- linux-tip.git.orig/include/linux/seqlock.h 2008-10-27 23:11:52.000000000 -0400
    +++ linux-tip.git/include/linux/seqlock.h 2008-10-27 23:12:08.000000000 -0400
    @@ -90,7 +90,8 @@ static __always_inline unsigned read_seq
    repeat:
    ret = sl->sequence;
    smp_rmb();
    - if (unlikely(ret & 1)) {
    + /* Used in vsyscall, need notrace version */
    + if (unlikely_notrace(ret & 1)) {
    cpu_relax();
    goto repeat;
    }

  2. Re: [PATCH][RFC] trace: profile likely and unlikely annotations

    On Tue, 28 Oct 2008 00:12:48 -0400 (EDT)
    Steven Rostedt wrote:

    >
    > Andrew Morton recently suggested having an in-kernel way to profile
    > likely and unlikely macros. This patch achieves that goal.



    alternatively we remove likely/unlikely

    or maybe only ban it from drivers or something.


    --
    Arjan van de Ven Intel Open Source Technology Centre
    For development, discussion and tips for power savings,
    visit http://www.lesswatts.org

  3. Re: [PATCH][RFC] trace: profile likely and unlikely annotations

    On Tue, 28 Oct 2008 00:12:48 -0400 (EDT) Steven Rostedt wrote:

    > (*) Not every unlikely is recorded; those that are used by vsyscalls
    > (a few of them) had to be marked as unlikely_notrace().


    Yes, it took continuous maintenance to weed this stuff out of
    profile-likely-unlikely-macros.patch.

    I'd suggest that you take a peek at this patch and make sure that
    you've addressed all the sites which had to be fixed up while
    maintaining this one.

    (I've been maintaining this for 2-1/2 years. Sniff)


    From: Daniel Walker

    -ENOCHANGELOG!

    Creates /proc/likely_prof.

    [randy.dunlap@oracle.com: profile_likely: export do_check_likely]
    [akpm@linux-foundation.org: fixes and general maintenance]
    [dwalker@mvista.com: likely_prof changed to use proc_create]
    [12o3l@tiscali.nl: likeliness accounting change and cleanup]
    [adobriyan@gmail.com: proc: remove proc_root from drivers]
    [12o3l@tiscali.nl: update to test_and_set_bit_lock / clear_bit_unlock]
    [dwalker@mvista.com: likely-profiling: disable ftrace]
    Signed-off-by: Daniel Walker
    Signed-off-by: Hua Zhong
    Cc: Andi Kleen
    Signed-off-by: Roel Kluin <12o3l@tiscali.nl>
    Signed-off-by: Alexey Dobriyan
    Signed-off-by: Andrew Morton
    ---

    arch/x86/kernel/vsyscall_64.c | 2
    arch/x86/vdso/vclock_gettime.c | 2
    include/linux/compiler.h | 36 +++++++
    lib/Kconfig.debug | 8 +
    lib/Makefile | 5 +
    lib/likely_prof.c | 150 +++++++++++++++++++++++++++++++
    6 files changed, 203 insertions(+)

    diff -puN arch/x86/kernel/vsyscall_64.c~profile-likely-unlikely-macros arch/x86/kernel/vsyscall_64.c
    --- a/arch/x86/kernel/vsyscall_64.c~profile-likely-unlikely-macros
    +++ a/arch/x86/kernel/vsyscall_64.c
    @@ -17,6 +17,8 @@
    * want per guest time just set the kernel.vsyscall64 sysctl to 0.
    */

    +#define SUPPRESS_LIKELY_PROFILING
    +
    #include <linux/time.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    diff -puN arch/x86/vdso/vclock_gettime.c~profile-likely-unlikely-macros arch/x86/vdso/vclock_gettime.c
    --- a/arch/x86/vdso/vclock_gettime.c~profile-likely-unlikely-macros
    +++ a/arch/x86/vdso/vclock_gettime.c
    @@ -9,6 +9,8 @@
    * Also alternative() doesn't work.
    */

    +#define SUPPRESS_LIKELY_PROFILING
    +
    #include <linux/kernel.h>
    #include <linux/posix-timers.h>
    #include <linux/time.h>
    diff -puN include/linux/compiler.h~profile-likely-unlikely-macros include/linux/compiler.h
    --- a/include/linux/compiler.h~profile-likely-unlikely-macros
    +++ a/include/linux/compiler.h
    @@ -53,6 +53,41 @@ extern void __chk_io_ptr(const volatile
    # include <linux/compiler-intel.h>
    #endif

    +#if defined(CONFIG_PROFILE_LIKELY) && !defined(SUPPRESS_LIKELY_PROFILING) && \
    + !(defined(CONFIG_MODULE_UNLOAD) && defined(MODULE))
    +struct likeliness {
    + char *const file;
    + unsigned long caller;
    + unsigned int count[2];
    + struct likeliness *next;
    + unsigned int label;
    +};
    +
    +extern int do_check_likely(struct likeliness *likeliness, unsigned int exp);
    +
    +#define LP_IS_EXPECTED 1
    +#define LP_UNSEEN 4
    +#define LP_LINE_SHIFT 3
    +
    +#define __check_likely(exp, is_likely) \
    + ({ \
    + static struct likeliness likeliness = { \
    + .file = __FILE__, \
    + .label = __LINE__ << LP_LINE_SHIFT | \
    + LP_UNSEEN | is_likely, \
    + }; \
    + do_check_likely(&likeliness, !!(exp)); \
    + })
    +
    +/*
    + * We check for constant values with __builtin_constant_p() since
    + * it's not interesting to profile them, and there is a compiler
    + * bug in gcc 3.x which blows up during constant evaluation when
    + * CONFIG_PROFILE_LIKELY is turned on.
    + */
    +#define likely(x) (__builtin_constant_p(x) ? (!!(x)) : __check_likely((x), 1))
    +#define unlikely(x) (__builtin_constant_p(x) ? (!!(x)) : __check_likely((x), 0))
    +#else
    /*
    * Generic compiler-dependent macros required for kernel
    * build go below this comment. Actual compiler/compiler version
    @@ -61,6 +96,7 @@ extern void __chk_io_ptr(const volatile

    #define likely(x) __builtin_expect(!!(x), 1)
    #define unlikely(x) __builtin_expect(!!(x), 0)
    +#endif

    /* Optimization barrier */
    #ifndef barrier
    diff -puN lib/Kconfig.debug~profile-likely-unlikely-macros lib/Kconfig.debug
    --- a/lib/Kconfig.debug~profile-likely-unlikely-macros
    +++ a/lib/Kconfig.debug
    @@ -568,6 +568,14 @@ config DEBUG_SYNCHRO_TEST

    See Documentation/synchro-test.txt.

    +config PROFILE_LIKELY
    + bool "Record return values from likely/unlikely macros"
    + default n
    + help
    + Adds profiling to likely/unlikely macros. To see the
    + results of the profiling you can view the following:
    + /proc/likely_prof
    +
    config BOOT_PRINTK_DELAY
    bool "Delay each boot printk message by N milliseconds"
    depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
    diff -puN lib/Makefile~profile-likely-unlikely-macros lib/Makefile
    --- a/lib/Makefile~profile-likely-unlikely-macros
    +++ a/lib/Makefile
    @@ -22,6 +22,9 @@ obj-y += bcd.o div64.o sort.o parser.o h
    bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
    string_helpers.o

    +# likely profiling can cause recursion in ftrace, so don't trace it.
    +CFLAGS_REMOVE_likely_prof.o = -pg
    +
    ifeq ($(CONFIG_DEBUG_KOBJECT),y)
    CFLAGS_kobject.o += -DDEBUG
    CFLAGS_kobject_uevent.o += -DDEBUG
    @@ -82,6 +85,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += sys

    obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o

    +obj-$(CONFIG_PROFILE_LIKELY) += likely_prof.o
    +
    hostprogs-y := gen_crc32table
    clean-files := crc32table.h

    diff -puN /dev/null lib/likely_prof.c
    --- /dev/null
    +++ a/lib/likely_prof.c
    @@ -0,0 +1,150 @@
    +/*
    + * This code should enable profiling the likely and unlikely macros.
    + *
    + * Output goes in /proc/likely_prof
    + *
    + * Authors:
    + * Daniel Walker
    + * Hua Zhong
    + * Andrew Morton
    + */
    +
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +
    +#include
    +#include
    +
    +static struct likeliness *likeliness_head;
    +
    +int do_check_likely(struct likeliness *likeliness, unsigned int ret)
    +{
    + static unsigned long likely_lock;
    +
    + likeliness->count[ret]++;
    +
    + if (likeliness->label & LP_UNSEEN) {
    + /*
    + * We don't simply use a spinlock because internally to the
    + * spinlock there is a call to unlikely which causes recursion.
    + * We opted for this method because we didn't need a preempt/irq
    + * disable and it was a bit cleaner than using internal __raw
    + * spinlock calls.
    + */
    + if (!test_and_set_bit_lock(0, &likely_lock)) {
    + if (likeliness->label & LP_UNSEEN) {
    + likeliness->label &= (~LP_UNSEEN);
    + likeliness->next = likeliness_head;
    + likeliness_head = likeliness;
    + likeliness->caller = (unsigned long)
    + __builtin_return_address(0);
    + }
    + clear_bit_unlock(0, &likely_lock);
    + }
    + }
    +
    + return ret;
    +}
    +EXPORT_SYMBOL(do_check_likely);
    +
    +static void * lp_seq_start(struct seq_file *out, loff_t *pos)
    +{
    +
    + if (!*pos) {
    +
    + seq_printf(out, "Likely Profiling Results\n");
    + seq_printf(out, " --------------------------------------------"
    + "------------------------\n");
    + seq_printf(out, "[+- ]Type | # True | # False | Function@"
    + "Filename:Line\n");
    +
    + out->private = likeliness_head;
    + }
    +
    + return out->private;
    +}
    +
    +static void *lp_seq_next(struct seq_file *out, void *p, loff_t *pos)
    +{
    + struct likeliness *entry = p;
    +
    + if (entry->next) {
    + ++(*pos);
    + out->private = entry->next;
    + } else
    + out->private = NULL;
    +
    + return out->private;
    +}
    +
    +static int lp_seq_show(struct seq_file *out, void *p)
    +{
    + struct likeliness *entry = p;
    + unsigned int pos = entry->count[1];
    + unsigned int neg = entry->count[0];
    + char function[KSYM_SYMBOL_LEN];
    +
    + /*
    + * Balanced if the suggestion was false in less than 5% of the tests
    + */
    + if (!(entry->label & LP_IS_EXPECTED)) {
    + if (pos + neg < 20 * pos)
    + seq_printf(out, "+");
    + else
    + seq_printf(out, " ");
    +
    + seq_printf(out, "unlikely ");
    + } else {
    + if (pos + neg < 20 * neg)
    + seq_printf(out, "-");
    + else
    + seq_printf(out, " ");
    +
    + seq_printf(out, "likely ");
    + }
    +
    + sprint_symbol(function, entry->caller);
    + seq_printf(out, "|%9u|%9u|\t%s@%s:%u\n", pos, neg, function,
    + entry->file, entry->label >> LP_LINE_SHIFT);
    +
    + return 0;
    +}
    +
    +static void lp_seq_stop(struct seq_file *m, void *p)
    +{
    +}
    +
    +static struct seq_operations likely_profiling_ops = {
    + .start = lp_seq_start,
    + .next = lp_seq_next,
    + .stop = lp_seq_stop,
    + .show = lp_seq_show
    +};
    +
    +static int lp_results_open(struct inode *inode, struct file *file)
    +{
    + return seq_open(file, &likely_profiling_ops);
    +}
    +
    +static struct file_operations proc_likely_operations = {
    + .open = lp_results_open,
    + .read = seq_read,
    + .llseek = seq_lseek,
    + .release = seq_release,
    +};
    +
    +static int __init init_likely(void)
    +{
    + struct proc_dir_entry *entry =
    + proc_create("likely_prof", 0, NULL, &proc_likely_operations);
    + if (!entry)
    + return 1;
    +
    + return 0;
    +}
    +__initcall(init_likely);
    _


  4. Re: [PATCH][RFC] trace: profile likely and unlikely annotations

    On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
    >
    > Andrew Morton recently suggested having an in-kernel way to profile
    > likely and unlikely macros. This patch achieves that goal.


    Maybe I'm confused, but when I read through the patch, it looks like
    that 'hit' is incremented whenever the condition is true, and 'missed'
    is incremented whenever the condition is false, correct?

    Is that what you intended? So for profile_unlikely, "missed" is good,
    and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
    is bad. That seems horribly confusing.

    If that wasn't what you intended, the meaning of "hit" and "missed"
    seems to be highly confusing, either way. Can we perhaps use some
    other terminology? Simply using "True" and "False" would be better,
    since there's no possible confusion what the labels mean.

    > +#define unlikely(x) ({ \
    > + int ______r; \
    > + static struct ftrace_likely_data ______f \
    > + __attribute__((__aligned__(4))) \
    > + __attribute__((section("_ftrace_unlikely"))); \
    > + if (unlikely_notrace(!______f.ip)) \
    > + ______f.ip = __THIS_IP__; \
    > + ______r = unlikely_notrace(x); \
    > + ftrace_likely_update(&______f, ______r); \
    > + ______r; \
    > + })


    Note that unlikely(x) calls ftrace_likely_update(), which does this:

    > +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
    > +{
    > + /* FIXME: Make this atomic! */
    > + if (val)
    > + f->hit++;
    > + else
    > + f->missed++;
    > +}
    > +EXPORT_SYMBOL(ftrace_likely_update);



    So that seems to mean that if unlikely(x) is false, then ______r is 0,
    which means we increment f->missed. Or am I missing something?

    I would have thought that if unlikely(x) is false, that's *good*,
    since it means the unlikely label was correct. And normally, when
    people think about cache hits vs cache misses, hits are good and
    misses are bad. Which is why I think the terminology is highly
    confusing...
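
    To make those semantics concrete, here is a tiny standalone program
    (a toy model of the patch's accounting, not kernel code; every name
    in it is made up) that shows why a well-placed unlikely() ends up
    with a large "missed" count:

    #include <stdio.h>

    /* hit counts condition-true, missed counts condition-false,
     * regardless of which annotation the site used */
    struct counters { unsigned long hit, missed; };

    static void update(struct counters *c, int val)
    {
        if (val)
            c->hit++;
        else
            c->missed++;
    }

    int main(void)
    {
        struct counters unlikely_site = { 0, 0 };
        int i;

        /* an unlikely() whose condition is true once per 100 tests */
        for (i = 0; i < 100; i++)
            update(&unlikely_site, i == 42);

        /* prints "hit=1 missed=99": a good unlikely is mostly "missed" */
        printf("hit=%lu missed=%lu\n", unlikely_site.hit, unlikely_site.missed);
        return 0;
    }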

    - Ted

  5. Re: [PATCH][RFC] trace: profile likely and unlikely annotations

    On Tue, 28 Oct 2008 10:37:20 -0400
    Theodore Tso wrote:

    > On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
    > >
    > > Andrew Morton recently suggested having an in-kernel way to profile
    > > likely and unlikely macros. This patch achieves that goal.

    >
    > Maybe I'm confused, but when I read through the patch, it looks like
    > that 'hit' is incremented whenever the condition is true, and 'missed'
    > is incremented whenever the condition is false, correct?
    >
    > Is that what you intended? So for profile_unlikely, "missed" is good,
    > and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
    > is bad. That seems horribly confusing.
    >
    > If that wasn't what you intended, the meaning of "hit" and "missed"
    > seems to be highly confusing, either way. Can we perhaps use some
    > other terminology? Simply using "True" and "False" would be better,
    > since there's no possible confusion what the labels mean.


    or "correct" and "incorrect"



    --
    Arjan van de Ven Intel Open Source Technology Centre
    For development, discussion and tips for power savings,
    visit http://www.lesswatts.org

  6. Re: [PATCH][RFC] trace: profile likely and unlikely annotations


    On Tue, 28 Oct 2008, Theodore Tso wrote:

    > On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
    > >
    > > Andrew Morton recently suggested having an in-kernel way to profile
    > > likely and unlikely macros. This patch achieves that goal.

    >
    > Maybe I'm confused, but when I read through the patch, it looks like
    > that 'hit' is incremented whenever the condition is true, and 'missed'
    > is incremented whenever the condition is false, correct?


    Correct.

    >
    > Is that what you intended? So for profile_unlikely, "missed" is good,
    > and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
    > is bad. That seems horribly confusing.


    Correct. Yeah, I figured I'd get complaints about this (hence the RFC).
    If you look at my awk example, you will also notice that I switched the
    $1 and $2 around when reading the other file.

    This can be confusing either way. I did this to reuse the code for both
    outputs.

    >
    > If that wasn't what you intended, the meaning of "hit" and "missed"
    > seems to be highly confusing, either way. Can we perhaps use some
    > other terminology? Simply using "True" and "False" would be better,
    > since there's no possible confusion what the labels mean.


    So renaming 'hit' and 'miss' to 'True' and 'False' would be good enough?
    That is, it will still mean that a 'True' is bad for unlikely but good for
    a likely?

    >
    > > +#define unlikely(x) ({ \
    > > + int ______r; \
    > > + static struct ftrace_likely_data ______f \
    > > + __attribute__((__aligned__(4))) \
    > > + __attribute__((section("_ftrace_unlikely"))); \
    > > + if (unlikely_notrace(!______f.ip)) \
    > > + ______f.ip = __THIS_IP__; \
    > > + ______r = unlikely_notrace(x); \
    > > + ftrace_likely_update(&______f, ______r); \
    > > + ______r; \
    > > + })

    >
    > Note that unlikely(x) calls ftrace_likely_update(), which does this:
    >
    > > +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
    > > +{
    > > + /* FIXME: Make this atomic! */
    > > + if (val)
    > > + f->hit++;
    > > + else
    > > + f->missed++;
    > > +}
    > > +EXPORT_SYMBOL(ftrace_likely_update);

    >
    >
    > So that seems to mean that if unlikely(x) is false, then _____r is 0,
    > which means we increment f->missed. Or am I missing something?
    >
    > I would have thought that if unlikely(x) is false, that's *good*,
    > since it means the unlikely label was correct. And normally, when
    > people think about cache hits vs cache misses, hits are good and
    > misses are bad. Which is why I think the terminology is highly
    > confusing...


    OK, I'm fine with changing the terminology. v2 will do:

    s/hit/True/
    s/missed/False/

    Fine with you?

    -- Steve


  7. Re: [PATCH][RFC] trace: profile likely and unlikely annotations

    On Tue, Oct 28, 2008 at 10:49:16AM -0400, Steven Rostedt wrote:
    >
    > OK, I'm fine with changing the terminology. v2 will do:
    >
    > s/hit/True/
    > s/missed/False/
    >
    > Fine with you?


    I'm OK with either that, or with Arjan's suggestion of "Correct" and
    "Incorrect" --- although that would changing a line in the definition
    of #define unlikely(x):

    ftrace_likely_update(&______f, !______r);

    Either "True" / "False" or "Correct" / "Incorrect" has the advantage
    of being unambiguous. "Correct" / "Incorrect" has the advantage that
    people don't have to think about the fact that for
    /proc/profile_unlikely, high numbers of "False" is a good thing, where
    as for /proc/profile_likely, high numbers of "True" is a good thing.
    With "Correct" / "Incorrect" it's easier to undersatnd that high
    numbers of "Correct" is good.

    So I can see why Arjan suggested Correct/Incorrect, although I can
    live with either.

    - Ted

  8. Re: [PATCH][RFC] trace: profile likely and unlikely annotations


    On Tue, 28 Oct 2008, Theodore Tso wrote:

    > On Tue, Oct 28, 2008 at 10:49:16AM -0400, Steven Rostedt wrote:
    > >
    > > OK, I'm fine with changing the terminology. v2 will do:
    > >
    > > s/hit/True/
    > > s/missed/False/
    > >
    > > Fine with you?

    >
    > I'm OK with either that, or with Arjan's suggestion of "Correct" and
    > "Incorrect" --- although that would changing a line in the definition
    > of #define unlikely(x):
    >
    > ftrace_likely_update(&______f, !______r);
    >
    > Either "True" / "False" or "Correct" / "Incorrect" has the advantage
    > of being unambiguous. "Correct" / "Incorrect" has the advantage that
    > people don't have to think about the fact that for
    > /proc/profile_unlikely, high numbers of "False" is a good thing, where
    > as for /proc/profile_likely, high numbers of "True" is a good thing.
    > With "Correct" / "Incorrect" it's easier to undersatnd that high
    > numbers of "Correct" is good.
    >
    > So I can see why Arjan suggested Correct/Incorrect, although I can
    > live with either.


    I'm almost done with v2. Using Arjan's version (Correct/Incorrect) was
    actually a trivial change.

    -- Steve


  9. Re: [PATCH][RFC] trace: profile likely and unlikely annotations


    On Tue, 28 Oct 2008, Arjan van de Ven wrote:

    > On Tue, 28 Oct 2008 10:37:20 -0400
    > Theodore Tso wrote:
    >
    > > On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
    > > >
    > > > Andrew Morton recently suggested having an in-kernel way to profile
    > > > likely and unlikely macros. This patch achieves that goal.

    > >
    > > Maybe I'm confused, but when I read through the patch, it looks like
    > > that 'hit' is incremented whenever the condition is true, and 'missed'
    > > is incremented whenever the condition is false, correct?
    > >
    > > Is that what you intended? So for profile_unlikely, "missed" is good,
    > > and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
    > > is bad. That seems horribly confusing.
    > >
    > > If that wasn't what you intended, the meaning of "hit" and "missed"
    > > seems to be highly confusing, either way. Can we perhaps use some
    > > other terminology? Simply using "True" and "False" would be better,
    > > since there's no possible confusion what the labels mean.

    >
    > or "correct" and "incorrect"


    This means that the code will need to be different for both. Or at least a
    way to differentiate between the two. Not that hard, but I wanted to make
    the code as trivial as possible.

    -- Steve


  10. [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer


    This patch adds a way to record the instances of the likely and unlikely
    condition annotations.

    When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
    will be added to any of the ftrace tracers.

    For example:

    irqbalance-2662 [002] 41127.102858: [correct] debug_mutex_free_waiter 51
    <idle>-0 [001] 41127.102858: [correct] add_preempt_count 4307
    irqbalance-2662 [002] 41127.102859: [INCORRECT] debug_smp_processor_id 15
    <idle>-0 [001] 41127.102860: [correct] add_preempt_count 4316
    irqbalance-2662 [002] 41127.102861: [correct] slab_alloc 1608

    Which shows the normal tracer heading, as well as whether the condition was
    correct "[correct]" or was mistaken "[INCORRECT]", followed by the
    function and line number.
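
    As a reading aid, here is a standalone snippet (not kernel code) that
    mirrors how one of those lines is rendered; the "[%s] %-30.30s %d"
    format string and the field sizes come from the trace.c and trace.h
    hunks below:

    #include <stdio.h>

    struct trace_unlikely_demo {
        unsigned line;
        char func[31];  /* TRACE_FUNC_SIZE + 1 in the patch */
        char correct;
    };

    int main(void)
    {
        struct trace_unlikely_demo e = { 15, "debug_smp_processor_id", 0 };

        /* prints: [INCORRECT] debug_smp_processor_id        15 */
        printf("[%s] %-30.30s %d\n",
               e.correct ? "correct" : "INCORRECT", e.func, e.line);
        return 0;
    }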


    Signed-off-by: Steven Rostedt
    ---
    kernel/Makefile | 5 +
    kernel/trace/Kconfig | 22 +++++++
    kernel/trace/Makefile | 5 +
    kernel/trace/ring_buffer.c | 11 +++
    kernel/trace/trace.c | 37 +++++++++++
    kernel/trace/trace.h | 43 +++++++++++++
    kernel/trace/trace_unlikely.c | 130 ++++++++++++++++++++++++++++++++++++++++++
    7 files changed, 253 insertions(+)

    Index: linux-tip.git/kernel/trace/Kconfig
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 12:32:35.000000000 -0400
    @@ -154,6 +154,28 @@ config TRACE_UNLIKELY_PROFILE

    Say N if unsure.

    +config TRACING_UNLIKELY
    + bool
    + help
    + Selected by tracers that will trace the likely and unlikely
    + conditions. This prevents the tracers themselves from being
    + profiled. Profiling the tracing infrastructure can only happen
    + when the likelys and unlikelys are not being traced.
    +
    +config UNLIKELY_TRACER
    + bool "Trace likely/unlikely instances"
    + depends on TRACE_UNLIKELY_PROFILE
    + select TRACING_UNLIKELY
    + help
    + This traces the events of likely and unlikely condition
    + calls in the kernel. The difference between this and the
    + "Trace likely/unlikely profiler" is that this is not a
    + histogram of the callers, but actually places the calling
    + events into a running buffer to see when and where the
    + events happened, as well as their results.
    +
    + Say N if unsure.
    +
    config STACK_TRACER
    bool "Trace max stack"
    depends on HAVE_FUNCTION_TRACER
    Index: linux-tip.git/kernel/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/Makefile 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/Makefile 2008-10-29 12:32:35.000000000 -0400
    @@ -24,6 +24,11 @@ CFLAGS_REMOVE_sched_clock.o = -pg
    CFLAGS_REMOVE_sched.o = -mno-spe -pg
    endif

    +ifdef CONFIG_TRACING_UNLIKELY
    +# Do not profile tracepoint if we are tracing the unlikely conditions
    +CFLAGS_tracepoint.o += '-Dunlikely(x)=unlikely_notrace(x)'
    +endif
    +
    obj-$(CONFIG_FREEZER) += freezer.o
    obj-$(CONFIG_PROFILING) += profile.o
    obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
    Index: linux-tip.git/kernel/trace/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/trace/Makefile 2008-10-29 12:32:35.000000000 -0400
    @@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
    obj-y += trace_selftest_dynamic.o
    endif

    +ifdef CONFIG_TRACING_UNLIKELY
    +KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
    +KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
    +endif
    +
    obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
    obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

    Index: linux-tip.git/kernel/trace/trace_unlikely.c
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/trace_unlikely.c 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 12:32:35.000000000 -0400
    @@ -15,8 +15,138 @@
    #include <asm/local.h>
    #include "trace.h"

    +#ifdef CONFIG_UNLIKELY_TRACER
    +
    +static int unlikely_tracing_enabled __read_mostly;
    +static DEFINE_MUTEX(unlikely_tracing_mutex);
    +static DEFINE_PER_CPU(atomic_t, unlikely_trace_disabled);
    +static struct trace_array *unlikely_tracer;
    +
    +void trace_unlikely_stop(void)
    +{
    + /* preemption should be disabled */
    + int cpu = smp_processor_id();
    +
    + atomic_inc(&per_cpu(unlikely_trace_disabled, cpu));
    +}
    +
    +void trace_unlikely_start(void)
    +{
    + /* preemption should be disabled */
    + int cpu = smp_processor_id();
    +
    + atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
    +}
    +
    +static void
    +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
    +{
    + struct trace_array *tr = unlikely_tracer;
    + struct ring_buffer_event *event;
    + struct trace_unlikely *entry;
    + unsigned long flags, irq_flags;
    + int cpu, resched, pc;
    +
    + if (unlikely(!tr))
    + return;
    +
    + pc = preempt_count();
    + /* Can be called within schedule */
    + resched = need_resched();
    + preempt_disable_notrace();
    + cpu = raw_smp_processor_id();
    + if (atomic_inc_return(&per_cpu(unlikely_trace_disabled, cpu)) != 1)
    + goto out;
    +
    + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
    + &irq_flags);
    + if (!event)
    + goto out;
    +
    + local_save_flags(flags);
    + entry = ring_buffer_event_data(event);
    + tracing_generic_entry_update(&entry->ent, flags, pc);
    + entry->ent.type = TRACE_UNLIKELY;
    +
    + /*
    + * I would love to save just the ftrace_likely_data pointer, but
    + * this code can also be used by modules. Ugly things can happen
    + * if the module is unloaded, and then we go and read the
    + * pointer. This is slower, but much safer.
    + */
    +
    + strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
    + strncpy(entry->file, f->file, TRACE_FILE_SIZE);
    + entry->func[TRACE_FUNC_SIZE] = 0;
    + entry->file[TRACE_FILE_SIZE] = 0;
    + entry->line = f->line;
    + entry->correct = val == expect;
    +
    + ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
    +
    + out:
    + atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
    + if (resched)
    + preempt_enable_no_resched_notrace();
    + else
    + preempt_enable_notrace();
    +}
    +
    +static inline
    +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
    +{
    + if (!unlikely_tracing_enabled)
    + return;
    +
    + probe_likely_condition(f, val, expect);
    +}
    +
    +int enable_unlikely_tracing(struct trace_array *tr)
    +{
    + int ret = 0;
    +
    + mutex_lock(&unlikely_tracing_mutex);
    + unlikely_tracer = tr;
    + /*
    + * Must be seen before enabling. The reader is a condition
    + * where we do not need a matching rmb()
    + */
    + smp_wmb();
    + unlikely_tracing_enabled++;
    + mutex_unlock(&unlikely_tracing_mutex);
    +
    + return ret;
    +}
    +
    +void disable_unlikely_tracing(void)
    +{
    + mutex_lock(&unlikely_tracing_mutex);
    +
    + if (!unlikely_tracing_enabled)
    + goto out_unlock;
    +
    + unlikely_tracing_enabled--;
    +
    + out_unlock:
    + mutex_unlock(&unlikely_tracing_mutex);
    +}
    +#else
    +static inline
    +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
    +{
    +}
    +#endif /* CONFIG_UNLIKELY_TRACER */
    +
    void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
    {
    + /*
    + * I would love to have a trace point here instead, but the
    + * trace point code is so inundated with unlikely and likely
    + * conditions that the recursive nightmare that exists is too
    + * much to try to get working. At least for now.
    + */
    + trace_likely_condition(f, val, expect);
    +
    /* FIXME: Make this atomic! */
    if (val == expect)
    f->correct++;
    Index: linux-tip.git/kernel/trace/trace.c
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/trace.c 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/trace/trace.c 2008-10-29 12:32:35.000000000 -0400
    @@ -235,6 +235,9 @@ static const char *trace_options[] = {
    "stacktrace",
    "sched-tree",
    "ftrace_printk",
    +#ifdef CONFIG_UNLIKELY_TRACER
    + "unlikely",
    +#endif
    NULL
    };

    @@ -1457,6 +1460,17 @@ print_lat_fmt(struct trace_iterator *ite
    trace_seq_print_cont(s, iter);
    break;
    }
    + case TRACE_UNLIKELY: {
    + struct trace_unlikely *field;
    +
    + trace_assign_type(field, entry);
    +
    + trace_seq_printf(s, "[%s] %-30.30s %d\n",
    + field->correct ? "correct" : "INCORRECT",
    + field->func,
    + field->line);
    + break;
    + }
    default:
    trace_seq_printf(s, "Unknown type %d\n", entry->type);
    }
    @@ -1593,6 +1607,17 @@ static enum print_line_t print_trace_fmt
    trace_seq_print_cont(s, iter);
    break;
    }
    + case TRACE_UNLIKELY: {
    + struct trace_unlikely *field;
    +
    + trace_assign_type(field, entry);
    +
    + trace_seq_printf(s, "[%s] %-30.30s %d\n",
    + field->correct ? "correct" : "INCORRECT",
    + field->func,
    + field->line);
    + break;
    + }
    }
    return TRACE_TYPE_HANDLED;
    }
    @@ -1675,6 +1700,10 @@ static enum print_line_t print_raw_fmt(s
    trace_seq_print_cont(s, iter);
    break;
    }
    + case TRACE_UNLIKELY: {
    + /* IGNORE */
    + break;
    + };
    }
    return TRACE_TYPE_HANDLED;
    }
    @@ -2362,8 +2391,14 @@ tracing_ctrl_write(struct file *filp, co

    tr->ctrl = val;

    + if (!val)
    + trace_unlikely_disable();
    +
    if (current_trace && current_trace->ctrl_update)
    current_trace->ctrl_update(tr);
    +
    + if (val)
    + trace_unlikely_enable(tr);
    }
    mutex_unlock(&trace_types_lock);

    @@ -2425,6 +2460,7 @@ tracing_set_trace_write(struct file *fil
    if (t == current_trace)
    goto out;

    + trace_unlikely_disable();
    if (current_trace && current_trace->reset)
    current_trace->reset(tr);

    @@ -2432,6 +2468,7 @@ tracing_set_trace_write(struct file *fil
    if (t->init)
    t->init(tr);

    + trace_unlikely_enable(tr);
    out:
    mutex_unlock(&trace_types_lock);

    Index: linux-tip.git/kernel/trace/trace.h
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/trace.h 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/trace/trace.h 2008-10-29 12:32:35.000000000 -0400
    @@ -21,6 +21,7 @@ enum trace_type {
    TRACE_SPECIAL,
    TRACE_MMIO_RW,
    TRACE_MMIO_MAP,
    + TRACE_UNLIKELY,
    TRACE_BOOT,

    __TRACE_LAST_TYPE
    @@ -117,6 +118,16 @@ struct trace_boot {
    struct boot_trace initcall;
    };

    +#define TRACE_FUNC_SIZE 30
    +#define TRACE_FILE_SIZE 20
    +struct trace_unlikely {
    + struct trace_entry ent;
    + unsigned line;
    + char func[TRACE_FUNC_SIZE+1];
    + char file[TRACE_FILE_SIZE+1];
    + char correct;
    +};
    +
    /*
    * trace_flag_type is an enumeration that holds different
    * states when a trace occurs. These are:
    @@ -217,6 +228,7 @@ extern void __ftrace_bad_type(void);
    IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
    TRACE_MMIO_MAP); \
    IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
    + IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
    __ftrace_bad_type(); \
    } while (0)

    @@ -413,8 +425,39 @@ enum trace_iterator_flags {
    TRACE_ITER_STACKTRACE = 0x100,
    TRACE_ITER_SCHED_TREE = 0x200,
    TRACE_ITER_PRINTK = 0x400,
    +#ifdef CONFIG_UNLIKELY_TRACER
    + TRACE_ITER_UNLIKELY = 0x800,
    +#endif
    };

    extern struct tracer nop_trace;

    +#ifdef CONFIG_UNLIKELY_TRACER
    +extern int enable_unlikely_tracing(struct trace_array *tr);
    +extern void disable_unlikely_tracing(void);
    +static inline int trace_unlikely_enable(struct trace_array *tr)
    +{
    + if (trace_flags & TRACE_ITER_UNLIKELY)
    + return enable_unlikely_tracing(tr);
    + return 0;
    +}
    +static inline void trace_unlikely_disable(void)
    +{
    + /* due to races, always disable */
    + disable_unlikely_tracing();
    +}
    +extern void trace_unlikely_stop(void);
    +extern void trace_unlikely_start(void);
    +#else
    +static inline int trace_unlikely_enable(struct trace_array *tr)
    +{
    + return 0;
    +}
    +static inline void trace_unlikely_disable(void)
    +{
    +}
    +static inline void trace_unlikely_stop(void) { }
    +static inline void trace_unlikely_start(void) { }
    +#endif /* CONFIG_UNLIKELY_TRACER */
    +
    #endif /* _LINUX_KERNEL_TRACE_H */
    Index: linux-tip.git/kernel/trace/ring_buffer.c
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-29 12:32:30.000000000 -0400
    +++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-29 12:32:35.000000000 -0400
    @@ -16,6 +16,8 @@
    #include <linux/list.h>
    #include <linux/fs.h>

    +#include "trace.h"
    +
    /* Up this if you want to test the TIME_EXTENTS and normalization */
    #define DEBUG_SHIFT 0

    @@ -1139,7 +1141,14 @@ ring_buffer_lock_reserve(struct ring_buf
    if (length > BUF_PAGE_SIZE)
    goto out;

    + /*
    + * Ugly, but the unlikely tracing will enter back into
    + * the ring buffer code. Namely through the timestamp, causing
    + * a triggering of the "again" loop, that will be infinite.
    + */
    + trace_unlikely_stop();
    event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
    + trace_unlikely_start();
    if (!event)
    goto out;

    @@ -1251,8 +1260,10 @@ int ring_buffer_write(struct ring_buffer
    goto out;

    event_length = rb_calculate_event_length(length);
    + trace_unlikely_stop();
    event = rb_reserve_next_event(cpu_buffer,
    RINGBUF_TYPE_DATA, event_length);
    + trace_unlikely_start();
    if (!event)
    goto out;




  11. [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations


    Andrew Morton recently suggested having an in-kernel way to profile
    likely and unlikely macros. This patch achieves that goal.

    When configured, every(*) likely and unlikely macro gets a counter attached
    to it. When the condition is tested, the hits and misses of that condition
    are recorded. These numbers can later be retrieved by:

    /debugfs/tracing/profile_likely - All likely markers
    /debugfs/tracing/profile_unlikely - All unlikely markers.

    # cat /debug/tracing/profile_unlikely | \
    awk '{if (($1 || $2) && (($2+0.1)/($1+0.1) > 0.25)) { print $0; }}' | \
    head -20
    correct incorrect Function File Line
    ------- --------- -------- ---- ----
    46894 38216 __switch_to process_64.c 624
    11951 73138 __switch_to process_64.c 594
    11951 73097 __switch_to process_64.c 590
    117873 35049 need_resched sched.h 2103
    1345 711 syscall_get_error syscall.h 51
    0 2510 syscall_trace_leave ptrace.c 1568
    0 1 native_smp_prepare_cpus smpboot.c 1231
    91635 276125 calc_delta_fair sched_fair.c 411
    219028 113449 calc_delta_mine sched.c 1279
    0 58048 sched_info_queued sched_stats.h 222
    55449 71591 pick_next_task_fair sched_fair.c 1394
    10 5 yield_task_fair sched_fair.c 983
    7117 2553 rt_policy sched.c 144
    0 1220 pre_schedule_rt sched_rt.c 1260
    1216 52402 pick_next_task_rt sched_rt.c 884
    0 48009 sched_info_dequeued sched_stats.h 177
    0 14 sched_move_task sched.c 8707
    0 14 sched_move_task sched.c 8697


    # cat /debug/tracing/profile_likely | \
    awk '{if (($1 || $2) && (($2+0.1)/($1+0.1) > 0.25)) { print $0; }}'
    correct incorrect Function File Line
    ------- --------- -------- ---- ----
    42987 40240 pick_next_task sched.c 4407
    21419 18103 switch_mm mmu_context_64.h 18
    0 7 __cancel_work_timer workqueue.c 560
    861 3960 clocksource_adjust timekeeping.c 456
    390 1913 clocksource_adjust timekeeping.c 450
    0 363683 audit_syscall_exit auditsc.c 1570
    30 364573 audit_get_context auditsc.c 732
    0 362744 audit_syscall_entry auditsc.c 1541
    30 879 audit_free auditsc.c 1446
    0 952 audit_alloc auditsc.c 862
    2606 952 audit_alloc auditsc.c 858
    0 6 move_masked_irq migration.c 20
    4465 2091 __grab_cache_page filemap.c 2212
    9495 2787 iov_iter_advance filemap.c 1884
    3121 899 __set_page_dirty page-writeback.c 1134
    12874 221371 mapping_unevictable pagemap.h 50
    4 10 __flush_cpu_slab slub.c 1466
    635239 320760 slab_free slub.c 1731
    3105 61055 dentry_lru_del_init dcache.c 153
    942 1156 load_elf_binary binfmt_elf.c 904
    102 82 disk_put_part genhd.h 206
    0 1 tcp_mss_split_point tcp_output.c 1126



    As you can see from the above, there's a bit of work to do in rethinking
    the use of some unlikelys and likelys. Note: the unlikely output had 76
    entries whose incorrect/correct ratio exceeded 25%.

    Note: After submitting my first version of this patch, Andrew Morton
    showed me a version written by Daniel Walker, from which I picked up
    the following ideas:

    1) Using __builtin_constant_p to avoid profiling fixed values (see the
    sketch after this list).
    2) Using __FILE__ instead of instruction pointers.
    3) Using the preprocessor to stop all profiling of likely
    annotations from vsyscall_64.c.
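
    Idea (1) deserves a quick illustration: __builtin_constant_p() lets
    the macro bypass profiling entirely when the condition folds to a
    compile-time constant. A standalone toy demo follows; unlikely_demo()
    and unlikely_check_demo() are made-up stand-ins for the real macros
    in the compiler.h hunk below:

    #include <stdio.h>

    #define unlikely_check_demo(x) (puts("profiled"), !!(x))
    #define unlikely_demo(x) \
        (__builtin_constant_p(x) ? !!(x) : unlikely_check_demo(x))

    int main(int argc, char **argv)
    {
        (void)argv;

        if (unlikely_demo(0))          /* constant: never profiled */
            puts("unreachable");
        if (unlikely_demo(argc > 1))   /* not constant: goes through the check */
            puts("got arguments");
        return 0;
    }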

    Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
    for their feedback on this patch.

    (*) Not every unlikely is recorded; those that are used by vsyscalls
    (a few of them) had to have profiling disabled.

    Signed-off-by: Steven Rostedt
    ---
    arch/x86/kernel/vsyscall_64.c | 8 +
    include/asm-generic/vmlinux.lds.h | 14 +++
    include/linux/compiler.h | 61 ++++++++++++++
    kernel/trace/Kconfig | 16 +++
    kernel/trace/Makefile | 2
    kernel/trace/trace_unlikely.c | 157 ++++++++++++++++++++++++++++++++++++++
    6 files changed, 255 insertions(+), 3 deletions(-)

    Index: linux-tip.git/kernel/trace/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 11:30:05.000000000 -0400
    +++ linux-tip.git/kernel/trace/Makefile 2008-10-29 12:32:40.000000000 -0400
    @@ -25,4 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stac
    obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
    obj-$(CONFIG_BOOT_TRACER) += trace_boot.o

    +obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
    +
    libftrace-y := ftrace.o
    Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
    ===================================================================
    --- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c 2008-10-29 11:30:05.000000000 -0400
    +++ linux-tip.git/arch/x86/kernel/vsyscall_64.c 2008-10-29 11:30:12.000000000 -0400
    @@ -17,6 +17,14 @@
    * want per guest time just set the kernel.vsyscall64 sysctl to 0.
    */

    +/* Protect userspace from profiling */
    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +# undef likely
    +# undef unlikely
    +# define likely(x) likely_notrace(x)
    +# define unlikely(x) unlikely_notrace(x)
    +#endif
    +
    #include <linux/time.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
    ===================================================================
    --- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h 2008-10-29 11:30:06.000000000 -0400
    +++ linux-tip.git/include/asm-generic/vmlinux.lds.h 2008-10-29 11:30:12.000000000 -0400
    @@ -45,6 +45,17 @@
    #define MCOUNT_REC()
    #endif

    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \
    + *(_ftrace_likely) \
    + VMLINUX_SYMBOL(__stop_likely_profile) = .; \
    + VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
    + *(_ftrace_unlikely) \
    + VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
    +#else
    +#define LIKELY_PROFILE()
    +#endif
    +
    /* .data section */
    #define DATA_DATA \
    *(.data) \
    @@ -62,7 +73,8 @@
    VMLINUX_SYMBOL(__stop___markers) = .; \
    VMLINUX_SYMBOL(__start___tracepoints) = .; \
    *(__tracepoints) \
    - VMLINUX_SYMBOL(__stop___tracepoints) = .;
    + VMLINUX_SYMBOL(__stop___tracepoints) = .; \
    + LIKELY_PROFILE()

    #define RO_DATA(align) \
    . = ALIGN((align)); \
    Index: linux-tip.git/include/linux/compiler.h
    ===================================================================
    --- linux-tip.git.orig/include/linux/compiler.h 2008-10-29 11:30:06.000000000 -0400
    +++ linux-tip.git/include/linux/compiler.h 2008-10-29 11:32:05.000000000 -0400
    @@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile
    * specific implementations come from the above header files
    */

    -#define likely(x) __builtin_expect(!!(x), 1)
    -#define unlikely(x) __builtin_expect(!!(x), 0)
    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +struct ftrace_likely_data {
    + const char *func;
    + const char *file;
    + unsigned line;
    + unsigned long correct;
    + unsigned long incorrect;
    +};
    +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
    +
    +#define likely_notrace(x) __builtin_expect(!!(x), 1)
    +#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
    +
    +#define likely_check(x) ({ \
    + int ______r; \
    + static struct ftrace_likely_data \
    + __attribute__((__aligned__(4))) \
    + __attribute__((section("_ftrace_likely"))) \
    + ______f = { \
    + .func = __func__, \
    + .file = __FILE__, \
    + .line = __LINE__, \
    + }; \
    + ______f.line = __LINE__; \
    + ______r = likely_notrace(x); \
    + ftrace_likely_update(&______f, ______r, 1); \
    + ______r; \
    + })
    +#define unlikely_check(x) ({ \
    + int ______r; \
    + static struct ftrace_likely_data \
    + __attribute__((__aligned__(4))) \
    + __attribute__((section("_ftrace_unlikely"))) \
    + ______f = { \
    + .func = __func__, \
    + .file = __FILE__, \
    + .line = __LINE__, \
    + }; \
    + ______f.line = __LINE__; \
    + ______r = unlikely_notrace(x); \
    + ftrace_likely_update(&______f, ______r, 0); \
    + ______r; \
    + })
    +
    +/*
    + * Using __builtin_constant_p(x) to ignore cases where the return
    + * value is always the same. This idea is taken from a similar patch
    + * written by Daniel Walker.
    + */
    +# ifndef likely
    +# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x))
    +# endif
    +# ifndef unlikely
    +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
    +# endif
    +#else
    +# define likely(x) __builtin_expect(!!(x), 1)
    +# define unlikely(x) __builtin_expect(!!(x), 0)
    +#endif

    /* Optimization barrier */
    #ifndef barrier
    Index: linux-tip.git/kernel/trace/Kconfig
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 11:30:10.000000000 -0400
    +++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 12:32:40.000000000 -0400
    @@ -138,6 +138,22 @@ config BOOT_TRACER
    selected, because the self-tests are an initcall as well and that
    would invalidate the boot trace. )

    +config TRACE_UNLIKELY_PROFILE
    + bool "Trace likely/unlikely profiler"
    + depends on DEBUG_KERNEL
    + select TRACING
    + help
    + This tracer profiles all the likely and unlikely macros
    + in the kernel. It will display the results in:
    +
    + /debugfs/tracing/profile_likely
    + /debugfs/tracing/profile_unlikely
    +
    + Note: this will add a significant overhead, only turn this
    + on if you need to profile the system's use of these macros.
    +
    + Say N if unsure.
    +
    config STACK_TRACER
    bool "Trace max stack"
    depends on HAVE_FUNCTION_TRACER
    Index: linux-tip.git/kernel/trace/trace_unlikely.c
    ===================================================================
    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 12:32:40.000000000 -0400
    @@ -0,0 +1,157 @@
    +/*
    + * unlikely profiler
    + *
    + * Copyright (C) 2008 Steven Rostedt
    + */
    +#include <linux/kallsyms.h>
    +#include <linux/seq_file.h>
    +#include <linux/spinlock.h>
    +#include <linux/debugfs.h>
    +#include <linux/uaccess.h>
    +#include <linux/module.h>
    +#include <linux/ftrace.h>
    +#include <linux/hash.h>
    +#include <linux/fs.h>
    +#include <asm/local.h>
    +#include "trace.h"
    +
    +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
    +{
    + /* FIXME: Make this atomic! */
    + if (val == expect)
    + f->correct++;
    + else
    + f->incorrect++;
    +}
    +EXPORT_SYMBOL(ftrace_likely_update);
    +
    +struct ftrace_pointer {
    + void *start;
    + void *stop;
    +};
    +
    +static void *
    +t_next(struct seq_file *m, void *v, loff_t *pos)
    +{
    + struct ftrace_pointer *f = m->private;
    + struct ftrace_likely_data *p = v;
    +
    + (*pos)++;
    +
    + if (v == (void *)1)
    + return f->start;
    +
    + ++p;
    +
    + if ((void *)p >= (void *)f->stop)
    + return NULL;
    +
    + return p;
    +}
    +
    +static void *t_start(struct seq_file *m, loff_t *pos)
    +{
    + void *t = (void *)1;
    + loff_t l = 0;
    +
    + for (; t && l < *pos; t = t_next(m, t, &l))
    + ;
    +
    + return t;
    +}
    +
    +static void t_stop(struct seq_file *m, void *p)
    +{
    +}
    +
    +static int t_show(struct seq_file *m, void *v)
    +{
    + struct ftrace_likely_data *p = v;
    + const char *f;
    +
    + if (v == (void *)1) {
    + seq_printf(m, " correct incorrect"
    + " Function "
    + " File Line\n"
    + " ------- ---------"
    + " -------- "
    + " ---- ----\n");
    + return 0;
    + }
    +
    + /* Only print the file, not the path */
    + f = p->file + strlen(p->file);
    + while (f >= p->file && *f != '/')
    + f--;
    + f++;
    +
    + seq_printf(m, " %8lu %8lu ", p->correct, p->incorrect);
    + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
    + return 0;
    +}
    +
    +static struct seq_operations tracing_likely_seq_ops = {
    + .start = t_start,
    + .next = t_next,
    + .stop = t_stop,
    + .show = t_show,
    +};
    +
    +static int tracing_likely_open(struct inode *inode, struct file *file)
    +{
    + int ret;
    +
    + ret = seq_open(file, &tracing_likely_seq_ops);
    + if (!ret) {
    + struct seq_file *m = file->private_data;
    + m->private = (void *)inode->i_private;
    + }
    +
    + return ret;
    +}
    +
    +static struct file_operations tracing_likely_fops = {
    + .open = tracing_likely_open,
    + .read = seq_read,
    + .llseek = seq_lseek,
    +};
    +
    +extern unsigned long __start_likely_profile[];
    +extern unsigned long __stop_likely_profile[];
    +extern unsigned long __start_unlikely_profile[];
    +extern unsigned long __stop_unlikely_profile[];
    +
    +static struct ftrace_pointer ftrace_likely_pos = {
    + .start = __start_likely_profile,
    + .stop = __stop_likely_profile,
    +};
    +
    +static struct ftrace_pointer ftrace_unlikely_pos = {
    + .start = __start_unlikely_profile,
    + .stop = __stop_unlikely_profile,
    +};
    +
    +static __init int ftrace_unlikely_init(void)
    +{
    + struct dentry *d_tracer;
    + struct dentry *entry;
    +
    + d_tracer = tracing_init_dentry();
    +
    + entry = debugfs_create_file("profile_likely", 0444, d_tracer,
    + &ftrace_likely_pos,
    + &tracing_likely_fops);
    + if (!entry)
    + pr_warning("Could not create debugfs 'profile_likely' entry\n");
    +
    + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
    + &ftrace_unlikely_pos,
    + &tracing_likely_fops);
    + if (!entry)
    + pr_warning("Could not create debugfs"
    + " 'profile_unlikely' entry\n");
    +
    + return 0;
    +}
    +
    +device_initcall(ftrace_unlikely_init);



  12. Re: [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations

    On Wed, 29 Oct 2008 12:35:44 -0400 (EDT)
    Steven Rostedt wrote:

    >
    > Andrew Morton recently suggested having an in-kernel way to profile
    > likely and unlikely macros. This patch achieves that goal.
    >
    > When configured, every(*) likely and unlikely macro gets a counter
    > attached to it. When the condition is hit, the hit and misses of that
    > condition are recorded. These numbers can later be retrieved by:
    >
    > /debugfs/tracing/profile_likely - All likely markers
    > /debugfs/tracing/profile_unlikely - All unlikely markers.
    >


    I like the general approach. It'd be very convenient if the debugfs
    files also had a percentage-incorrect (or percentage-correct, either
    way) listed, so that we humans can focus straight on which ones are
    the outliers rather than having to do math in our heads first...
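
    Until the file grows a percentage column, one can be derived
    externally; for instance (a sketch against the current two-count
    format, where $1 is the correct count and $2 the incorrect count,
    skipping the two header lines):

        # cat /debug/tracing/profile_unlikely | \
          awk 'NR > 2 && $1 + $2 > 0 \
               { printf "%3d%% %s\n", 100 * $2 / ($1 + $2), $0 }' | \
          sort -rn | head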


    --
    Arjan van de Ven Intel Open Source Technology Centre
    For development, discussion and tips for power savings,
    visit http://www.lesswatts.org

  13. [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations


    [
    changes since v2:

    Added Arjan's recommendation about calculating a percentage of
    failures in the output.

    Question: Do people prefer the "func file line" notation or the
    kallsyms "func+0x" notation?
    ]

    Andrew Morton recently suggested having an in-kernel way to profile
    likely and unlikely macros. This patch achieves that goal.

    When configured, every(*) likely and unlikely macro gets a counter attached
    to it. When the condition is hit, the hit and misses of that condition
    are recorded. These numbers can later be retrieved by:

    /debugfs/tracing/profile_likely - All likely markers
    /debugfs/tracing/profile_unlikely - All unlikely markers.

    # cat /debug/tracing/profile_unlikely | head
    correct incorrect % Function File Line
    ------- --------- - -------- ---- ----
    2494 0 0 do_arch_prctl process_64.c 832
    0 0 0 do_arch_prctl process_64.c 804
    2988 0 0 IS_ERR err.h 34
    124668 6133 4 __switch_to process_64.c 673
    130811 0 0 __switch_to process_64.c 639
    69768 61064 87 __switch_to process_64.c 624
    15557 115251 100 __switch_to process_64.c 594
    15555 115227 100 __switch_to process_64.c 590

    # cat /debug/tracing/profile_unlikely | \
    awk '{ if ($3 > 25) print $0; }' |head -20
    59851 49675 82 __switch_to process_64.c 624
    12688 96800 100 __switch_to process_64.c 594
    12688 96778 100 __switch_to process_64.c 590
    152290 49467 32 need_resched sched.h 2103
    1385 1034 74 syscall_get_error syscall.h 51
    0 2967 100 syscall_trace_leave ptrace.c 1568
    0 1 100 native_smp_prepare_cpus smpboot.c 1231
    304770 92199 30 update_curr sched_fair.c 489
    113195 297065 100 calc_delta_fair sched_fair.c 411
    227018 123309 54 calc_delta_mine sched.c 1279
    0 70299 100 sched_info_queued sched_stats.h 222
    67434 100268 100 pick_next_task_fair sched_fair.c 1394
    7 9 100 yield_task_fair sched_fair.c 983
    7215 3321 46 rt_policy sched.c 144
    0 1611 100 pre_schedule_rt sched_rt.c 1260
    1607 66013 100 pick_next_task_rt sched_rt.c 884
    0 60198 100 sched_info_dequeued sched_stats.h 177
    0 15 100 sched_move_task sched.c 8707
    0 15 100 sched_move_task sched.c 8697
    68373 50822 74 schedule sched.c 4467


    # cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }'
    54665 53649 98 pick_next_task sched.c 4407
    25344 23614 93 switch_mm mmu_context_64.h 18
    0 7 100 __cancel_work_timer workqueue.c 560
    1202 1354 100 clocksource_adjust timekeeping.c 456
    0 441888 100 audit_syscall_exit auditsc.c 1570
    22 442776 100 audit_get_context auditsc.c 732
    0 440930 100 audit_syscall_entry auditsc.c 1541
    22 889 100 audit_free auditsc.c 1446
    0 962 100 audit_alloc auditsc.c 862
    2614 962 36 audit_alloc auditsc.c 858
    0 6 100 move_masked_irq migration.c 20
    4482 2110 47 __grab_cache_page filemap.c 2212
    9575 2787 29 iov_iter_advance filemap.c 1884
    3121 905 28 __set_page_dirty page-writeback.c 1134
    12882 222991 100 mapping_unevictable pagemap.h 50
    4 11 100 __flush_cpu_slab slub.c 1466
    664688 320446 48 slab_free slub.c 1731
    2962 61920 100 dentry_lru_del_init dcache.c 153
    950 1157 100 load_elf_binary binfmt_elf.c 904
    102 82 80 disk_put_part genhd.h 206
    0 4 100 tcp_mss_split_point tcp_output.c 1126


    As you can see by the above, there's a bit of work to do in rethinking
    the use of some unlikelys and likelys. Note: the unlikely case had 77
    entries that were incorrect more than 25% of the time.

    Note: After submitting my first version of this patch, Andrew Morton
    showed me a version written by Daniel Walker, from which I picked up
    the following ideas:

    1) Using __builtin_constant_p to avoid profiling fixed values.
    2) Using __FILE__ instead of instruction pointers.
    3) Using the preprocessor to stop all profiling of likely
    annotations from vsyscall_64.c.

    Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
    for their feedback on this patch.

    (*) Not every unlikely is recorded; those that are used by vsyscalls
    (a few of them) had to have profiling disabled.

    Signed-off-by: Steven Rostedt
    ---
    arch/x86/kernel/vsyscall_64.c | 8 +
    include/asm-generic/vmlinux.lds.h | 14 ++-
    include/linux/compiler.h | 61 +++++++++++++
    kernel/trace/Kconfig | 16 +++
    kernel/trace/Makefile | 2
    kernel/trace/trace_unlikely.c | 167 ++++++++++++++++++++++++++++++++++++++
    6 files changed, 265 insertions(+), 3 deletions(-)

    Index: linux-tip.git/kernel/trace/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 12:38:54.000000000 -0400
    +++ linux-tip.git/kernel/trace/Makefile 2008-10-29 18:31:26.000000000 -0400
    @@ -25,4 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stac
    obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
    obj-$(CONFIG_BOOT_TRACER) += trace_boot.o

    +obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
    +
    libftrace-y := ftrace.o
    Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
    ===================================================================
    --- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c 2008-10-29 12:38:54.000000000 -0400
    +++ linux-tip.git/arch/x86/kernel/vsyscall_64.c 2008-10-29 16:13:00.000000000 -0400
    @@ -17,6 +17,14 @@
    * want per guest time just set the kernel.vsyscall64 sysctl to 0.
    */

    +/* Protect userspace from profiling */
    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +# undef likely
    +# undef unlikely
    +# define likely(x) likely_notrace(x)
    +# define unlikely(x) unlikely_notrace(x)
    +#endif
    +
    #include <linux/time.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
    ===================================================================
    --- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h 2008-10-29 12:38:54.000000000 -0400
    +++ linux-tip.git/include/asm-generic/vmlinux.lds.h 2008-10-29 16:13:00.000000000 -0400
    @@ -45,6 +45,17 @@
    #define MCOUNT_REC()
    #endif

    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \
    + *(_ftrace_likely) \
    + VMLINUX_SYMBOL(__stop_likely_profile) = .; \
    + VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
    + *(_ftrace_unlikely) \
    + VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
    +#else
    +#define LIKELY_PROFILE()
    +#endif
    +
    /* .data section */
    #define DATA_DATA \
    *(.data) \
    @@ -62,7 +73,8 @@
    VMLINUX_SYMBOL(__stop___markers) = .; \
    VMLINUX_SYMBOL(__start___tracepoints) = .; \
    *(__tracepoints) \
    - VMLINUX_SYMBOL(__stop___tracepoints) = .;
    + VMLINUX_SYMBOL(__stop___tracepoints) = .; \
    + LIKELY_PROFILE()

    #define RO_DATA(align) \
    . = ALIGN((align)); \
    Index: linux-tip.git/include/linux/compiler.h
    ===================================================================
    --- linux-tip.git.orig/include/linux/compiler.h 2008-10-29 12:38:54.000000000 -0400
    +++ linux-tip.git/include/linux/compiler.h 2008-10-29 16:13:00.000000000 -0400
    @@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile
    * specific implementations come from the above header files
    */

    -#define likely(x) __builtin_expect(!!(x), 1)
    -#define unlikely(x) __builtin_expect(!!(x), 0)
    +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
    +struct ftrace_likely_data {
    + const char *func;
    + const char *file;
    + unsigned line;
    + unsigned long correct;
    + unsigned long incorrect;
    +};
    +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
    +
    +#define likely_notrace(x) __builtin_expect(!!(x), 1)
    +#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
    +
    +#define likely_check(x) ({ \
    + int ______r; \
    + static struct ftrace_likely_data \
    + __attribute__((__aligned__(4))) \
    + __attribute__((section("_ftrace_likely"))) \
    + ______f = { \
    + .func = __func__, \
    + .file = __FILE__, \
    + .line = __LINE__, \
    + }; \
    + ______f.line = __LINE__; \
    + ______r = likely_notrace(x); \
    + ftrace_likely_update(&______f, ______r, 1); \
    + ______r; \
    + })
    +#define unlikely_check(x) ({ \
    + int ______r; \
    + static struct ftrace_likely_data \
    + __attribute__((__aligned__(4))) \
    + __attribute__((section("_ftrace_unlikely"))) \
    + ______f = { \
    + .func = __func__, \
    + .file = __FILE__, \
    + .line = __LINE__, \
    + }; \
    + ______f.line = __LINE__; \
    + ______r = unlikely_notrace(x); \
    + ftrace_likely_update(&______f, ______r, 0); \
    + ______r; \
    + })
    +
    +/*
    + * Using __builtin_constant_p(x) to ignore cases where the return
    + * value is always the same. This idea is taken from a similar patch
    + * written by Daniel Walker.
    + */
    +# ifndef likely
    +# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x))
    +# endif
    +# ifndef unlikely
    +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
    +# endif
    +#else
    +# define likely(x) __builtin_expect(!!(x), 1)
    +# define unlikely(x) __builtin_expect(!!(x), 0)
    +#endif

    /* Optimization barrier */
    #ifndef barrier
    Index: linux-tip.git/kernel/trace/Kconfig
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 16:12:53.000000000 -0400
    +++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 18:31:26.000000000 -0400
    @@ -138,6 +138,22 @@ config BOOT_TRACER
    selected, because the self-tests are an initcall as well and that
    would invalidate the boot trace. )

    +config TRACE_UNLIKELY_PROFILE
    + bool "Trace likely/unlikely profiler"
    + depends on DEBUG_KERNEL
    + select TRACING
    + help
    + This tracer profiles all the likely and unlikely macros
    + in the kernel. It will display the results in:
    +
    + /debugfs/tracing/profile_likely
    + /debugfs/tracing/profile_unlikely
    +
    + Note: this will add a significant overhead, only turn this
    + on if you need to profile the system's use of these macros.
    +
    + Say N if unsure.
    +
    config STACK_TRACER
    bool "Trace max stack"
    depends on HAVE_FUNCTION_TRACER
    Index: linux-tip.git/kernel/trace/trace_unlikely.c
    ===================================================================
    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 18:31:26.000000000 -0400
    @@ -0,0 +1,167 @@
    +/*
    + * unlikely profiler
    + *
    + * Copyright (C) 2008 Steven Rostedt
    + */
    +#include <linux/kallsyms.h>
    +#include <linux/seq_file.h>
    +#include <linux/spinlock.h>
    +#include <linux/debugfs.h>
    +#include <linux/uaccess.h>
    +#include <linux/module.h>
    +#include <linux/ftrace.h>
    +#include <linux/hash.h>
    +#include <linux/fs.h>
    +#include <asm/local.h>
    +#include "trace.h"
    +
    +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
    +{
    + /* FIXME: Make this atomic! */
    + if (val == expect)
    + f->correct++;
    + else
    + f->incorrect++;
    +}
    +EXPORT_SYMBOL(ftrace_likely_update);
    +
    +struct ftrace_pointer {
    + void *start;
    + void *stop;
    +};
    +
    +static void *
    +t_next(struct seq_file *m, void *v, loff_t *pos)
    +{
    + struct ftrace_pointer *f = m->private;
    + struct ftrace_likely_data *p = v;
    +
    + (*pos)++;
    +
    + if (v == (void *)1)
    + return f->start;
    +
    + ++p;
    +
    + if ((void *)p >= (void *)f->stop)
    + return NULL;
    +
    + return p;
    +}
    +
    +static void *t_start(struct seq_file *m, loff_t *pos)
    +{
    + void *t = (void *)1;
    + loff_t l = 0;
    +
    + for (; t && l < *pos; t = t_next(m, t, &l))
    + ;
    +
    + return t;
    +}
    +
    +static void t_stop(struct seq_file *m, void *p)
    +{
    +}
    +
    +static int t_show(struct seq_file *m, void *v)
    +{
    + struct ftrace_likely_data *p = v;
    + const char *f;
    + unsigned long percent;
    +
    + if (v == (void *)1) {
    + seq_printf(m, " correct incorrect %% "
    + " Function "
    + " File Line\n"
    + " ------- --------- - "
    + " -------- "
    + " ---- ----\n");
    + return 0;
    + }
    +
    + /* Only print the file, not the path */
    + f = p->file + strlen(p->file);
    + while (f >= p->file && *f != '/')
    + f--;
    + f++;
    +
    + if (p->correct) {
    + percent = p->incorrect * 100;
    + percent /= p->correct;
    + /* No need to see huge numbers */
    + if (percent > 100)
    + percent = 100;
    + } else
    + percent = p->incorrect ? 100 : 0;
    +
    + seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
    + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
    + return 0;
    +}
    +
    +static struct seq_operations tracing_likely_seq_ops = {
    + .start = t_start,
    + .next = t_next,
    + .stop = t_stop,
    + .show = t_show,
    +};
    +
    +static int tracing_likely_open(struct inode *inode, struct file *file)
    +{
    + int ret;
    +
    + ret = seq_open(file, &tracing_likely_seq_ops);
    + if (!ret) {
    + struct seq_file *m = file->private_data;
    + m->private = (void *)inode->i_private;
    + }
    +
    + return ret;
    +}
    +
    +static struct file_operations tracing_likely_fops = {
    + .open = tracing_likely_open,
    + .read = seq_read,
    + .llseek = seq_lseek,
    +};
    +
    +extern unsigned long __start_likely_profile[];
    +extern unsigned long __stop_likely_profile[];
    +extern unsigned long __start_unlikely_profile[];
    +extern unsigned long __stop_unlikely_profile[];
    +
    +static struct ftrace_pointer ftrace_likely_pos = {
    + .start = __start_likely_profile,
    + .stop = __stop_likely_profile,
    +};
    +
    +static struct ftrace_pointer ftrace_unlikely_pos = {
    + .start = __start_unlikely_profile,
    + .stop = __stop_unlikely_profile,
    +};
    +
    +static __init int ftrace_unlikely_init(void)
    +{
    + struct dentry *d_tracer;
    + struct dentry *entry;
    +
    + d_tracer = tracing_init_dentry();
    +
    + entry = debugfs_create_file("profile_likely", 0444, d_tracer,
    + &ftrace_likely_pos,
    + &tracing_likely_fops);
    + if (!entry)
    + pr_warning("Could not create debugfs 'profile_likely' entry\n");
    +
    + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
    + &ftrace_unlikely_pos,
    + &tracing_likely_fops);
    + if (!entry)
    + pr_warning("Could not create debugfs"
    + " 'profile_unlikely' entry\n");
    +
    + return 0;
    +}
    +
    +device_initcall(ftrace_unlikely_init);

  14. [PATCH 2/2 v3][RFC] ftrace: unlikely annotation tracer


    [
    Changes since v2:

    Changed to a "func:file:line-num" format.
    ]

    This patch adds a way to record the instances of the likely and unlikely
    condition annotations.

    When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
    will be added to any of the ftrace tracers.
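
    For instance, assuming debugfs is mounted at /debug as in the earlier
    examples (the choice of tracer here is arbitrary):

        # echo unlikely > /debug/tracing/iter_ctrl
        # echo sched_switch > /debug/tracing/current_tracer
        # cat /debug/tracing/trace | head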

    For example:

    bash-3471 [003] 357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177
    bash-3471 [003] 357.014756: [correct] update_curr:sched_fair.c:489
    bash-3471 [003] 357.014758: [correct] calc_delta_fair:sched_fair.c:411
    bash-3471 [003] 357.014759: [correct] account_group_exec_runtime:sched_stats.h:356
    bash-3471 [003] 357.014761: [correct] update_curr:sched_fair.c:489
    bash-3471 [003] 357.014763: [INCORRECT] calc_delta_fair:sched_fair.c:411
    bash-3471 [003] 357.014765: [correct] calc_delta_mine:sched.c:1279


    This shows the normal tracer heading, as well as whether the condition
    was correct ("[correct]") or mistaken ("[INCORRECT]"), followed by the
    function, file name and line number.
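
    The worst offenders can then be pulled out of the running trace with
    ordinary text tools; for instance (a sketch, keying off the
    func:file:line field at the end of each line):

        # awk '/\[INCORRECT\]/ { print $NF }' /debug/tracing/trace | \
          sort | uniq -c | sort -rn | head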


    Signed-off-by: Steven Rostedt
    ---
    kernel/Makefile | 5 +
    kernel/trace/Kconfig | 22 ++++++
    kernel/trace/Makefile | 5 +
    kernel/trace/ring_buffer.c | 11 +++
    kernel/trace/trace.c | 39 +++++++++++
    kernel/trace/trace.h | 43 +++++++++++++
    kernel/trace/trace_unlikely.c | 137 ++++++++++++++++++++++++++++++++++++++++++
    7 files changed, 262 insertions(+)

    Index: linux-tip.git/kernel/trace/Kconfig
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 17:01:11.000000000 -0400
    +++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 17:46:37.000000000 -0400
    @@ -154,6 +154,28 @@ config TRACE_UNLIKELY_PROFILE

    Say N if unsure.

    +config TRACING_UNLIKELY
    + bool
    + help
    + Selected by tracers that will trace the likely and unlikely
    + conditions. This prevents the tracers themselves from being
    + profiled. Profiling the tracing infrastructure can only happen
    + when the likelys and unlikelys are not being traced.
    +
    +config UNLIKELY_TRACER
    + bool "Trace likely/unlikely instances"
    + depends on TRACE_UNLIKELY_PROFILE
    + select TRACING_UNLIKELY
    + help
    + This traces the events of likely and unlikely condition
    + calls in the kernel. The difference between this and the
    + "Trace likely/unlikely profiler" is that this is not a
    + histogram of the callers, but actually places the calling
    + events into a running buffer to see when and where the
    + events happened, as well as their results.
    +
    + Say N if unsure.
    +
    config STACK_TRACER
    bool "Trace max stack"
    depends on HAVE_FUNCTION_TRACER
    Index: linux-tip.git/kernel/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/Makefile 2008-10-29 17:01:11.000000000 -0400
    +++ linux-tip.git/kernel/Makefile 2008-10-29 17:46:37.000000000 -0400
    @@ -24,6 +24,11 @@ CFLAGS_REMOVE_sched_clock.o = -pg
    CFLAGS_REMOVE_sched.o = -mno-spe -pg
    endif

    +ifdef CONFIG_TRACING_UNLIKELY
    +# Do not profile tracepoint if we are tracing the unlikely conditions
    +CFLAGS_tracepoint.o += '-Dunlikely(x)=unlikely_notrace(x)'
    +endif
    +
    obj-$(CONFIG_FREEZER) += freezer.o
    obj-$(CONFIG_PROFILING) += profile.o
    obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
    Index: linux-tip.git/kernel/trace/Makefile
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 17:01:11.000000000 -0400
    +++ linux-tip.git/kernel/trace/Makefile 2008-10-29 17:46:37.000000000 -0400
    @@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
    obj-y += trace_selftest_dynamic.o
    endif

    +ifdef CONFIG_TRACING_UNLIKELY
    +KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
    +KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
    +endif
    +
    obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
    obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

    Index: linux-tip.git/kernel/trace/trace_unlikely.c
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/trace_unlikely.c 2008-10-29 17:25:31.000000000 -0400
    +++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 18:03:30.000000000 -0400
    @@ -15,8 +15,145 @@
    #include <asm/local.h>
    #include "trace.h"

    +#ifdef CONFIG_UNLIKELY_TRACER
    +
    +static int unlikely_tracing_enabled __read_mostly;
    +static DEFINE_MUTEX(unlikely_tracing_mutex);
    +static DEFINE_PER_CPU(atomic_t, unlikely_trace_disabled);
    +static struct trace_array *unlikely_tracer;
    +
    +void trace_unlikely_stop(void)
    +{
    + /* preemption should be disabled */
    + int cpu = smp_processor_id();
    +
    + atomic_inc(&per_cpu(unlikely_trace_disabled, cpu));
    +}
    +
    +void trace_unlikely_start(void)
    +{
    + /* preemption should be disabled */
    + int cpu = smp_processor_id();
    +
    + atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
    +}
    +
    +static void
    +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
    +{
    + struct trace_array *tr = unlikely_tracer;
    + struct ring_buffer_event *event;
    + struct trace_unlikely *entry;
    + unsigned long flags, irq_flags;
    + int cpu, resched, pc;
    + const char *p;
    +
    + /*
    + * I would love to save just the ftrace_likely_data pointer, but
    + * this code can also be used by modules. Ugly things can happen
    + * if the module is unloaded, and then we go and read the
    + * pointer. This is slower, but much safer.
    + */
    +
    + if (unlikely(!tr))
    + return;
    +
    + pc = preempt_count();
    + /* Can be called within schedule */
    + resched = need_resched();
    + preempt_disable_notrace();
    + cpu = raw_smp_processor_id();
    + if (atomic_inc_return(&per_cpu(unlikely_trace_disabled, cpu)) != 1)
    + goto out;
    +
    + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
    + &irq_flags);
    + if (!event)
    + goto out;
    +
    + local_save_flags(flags);
    + entry = ring_buffer_event_data(event);
    + tracing_generic_entry_update(&entry->ent, flags, pc);
    + entry->ent.type = TRACE_UNLIKELY;
    +
    + /* Strip off the path, only save the file */
    + p = f->file + strlen(f->file);
    + while (p >= f->file && *p != '/')
    + p--;
    + p++;
    +
    + strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
    + strncpy(entry->file, p, TRACE_FILE_SIZE);
    + entry->func[TRACE_FUNC_SIZE] = 0;
    + entry->file[TRACE_FILE_SIZE] = 0;
    + entry->line = f->line;
    + entry->correct = val == expect;
    +
    + ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
    +
    + out:
    + atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
    + if (resched)
    + preempt_enable_no_resched_notrace();
    + else
    + preempt_enable_notrace();
    +}
    +
    +static inline
    +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
    +{
    + if (!unlikely_tracing_enabled)
    + return;
    +
    + probe_likely_condition(f, val, expect);
    +}
    +
    +int enable_unlikely_tracing(struct trace_array *tr)
    +{
    + int ret = 0;
    +
    + mutex_lock(&unlikely_tracing_mutex);
    + unlikely_tracer = tr;
    + /*
    + * Must be seen before enabling. The reader is a condition
    + * where we do not need a matching rmb()
    + */
    + smp_wmb();
    + unlikely_tracing_enabled++;
    + mutex_unlock(&unlikely_tracing_mutex);
    +
    + return ret;
    +}
    +
    +void disable_unlikely_tracing(void)
    +{
    + mutex_lock(&unlikely_tracing_mutex);
    +
    + if (!unlikely_tracing_enabled)
    + goto out_unlock;
    +
    + unlikely_tracing_enabled--;
    +
    + out_unlock:
    + mutex_unlock(&unlikely_tracing_mutex);
    +}
    +#else
    +static inline
    +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
    +{
    +}
    +#endif /* CONFIG_UNLIKELY_TRACER */
    +
    void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
    {
    + /*
    + * I would love to have a trace point here instead, but the
    + * trace point code is so inundated with unlikely and likely
    + * conditions that the recursive nightmare that exists is too
    + * much to try to get working. At least for now.
    + */
    + trace_likely_condition(f, val, expect);
    +
    /* FIXME: Make this atomic! */
    if (val == expect)
    f->correct++;
    Index: linux-tip.git/kernel/trace/trace.c
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/trace.c 2008-10-29 17:01:11.000000000 -0400
    +++ linux-tip.git/kernel/trace/trace.c 2008-10-29 18:04:04.000000000 -0400
    @@ -235,6 +235,9 @@ static const char *trace_options[] = {
    "stacktrace",
    "sched-tree",
    "ftrace_printk",
    +#ifdef CONFIG_UNLIKELY_TRACER
    + "unlikely",
    +#endif
    NULL
    };

    @@ -1457,6 +1460,18 @@ print_lat_fmt(struct trace_iterator *ite
    trace_seq_print_cont(s, iter);
    break;
    }
    + case TRACE_UNLIKELY: {
    + struct trace_unlikely *field;
    +
    + trace_assign_type(field, entry);
    +
    + trace_seq_printf(s, "[%s] %s:%s:%d\n",
    + field->correct ? "correct" : "INCORRECT",
    + field->func,
    + field->file,
    + field->line);
    + break;
    + }
    default:
    trace_seq_printf(s, "Unknown type %d\n", entry->type);
    }
    @@ -1593,6 +1608,18 @@ static enum print_line_t print_trace_fmt
    trace_seq_print_cont(s, iter);
    break;
    }
    + case TRACE_UNLIKELY: {
    + struct trace_unlikely *field;
    +
    + trace_assign_type(field, entry);
    +
    + trace_seq_printf(s, "[%s] %s:%s:%d\n",
    + field->correct ? "correct" : "INCORRECT",
    + field->func,
    + field->file,
    + field->line);
    + break;
    + }
    }
    return TRACE_TYPE_HANDLED;
    }
    @@ -1675,6 +1702,10 @@ static enum print_line_t print_raw_fmt(s
    trace_seq_print_cont(s, iter);
    break;
    }
    + case TRACE_UNLIKELY: {
    + /* IGNORE */
    + break;
    + };
    }
    return TRACE_TYPE_HANDLED;
    }
    @@ -2362,8 +2393,14 @@ tracing_ctrl_write(struct file *filp, co

    tr->ctrl = val;

    + if (!val)
    + trace_unlikely_disable();
    +
    if (current_trace && current_trace->ctrl_update)
    current_trace->ctrl_update(tr);
    +
    + if (val)
    + trace_unlikely_enable(tr);
    }
    mutex_unlock(&trace_types_lock);

    @@ -2425,6 +2462,7 @@ tracing_set_trace_write(struct file *fil
    if (t == current_trace)
    goto out;

    + trace_unlikely_disable();
    if (current_trace && current_trace->reset)
    current_trace->reset(tr);

    @@ -2432,6 +2470,7 @@ tracing_set_trace_write(struct file *fil
    if (t->init)
    t->init(tr);

    + trace_unlikely_enable(tr);
    out:
    mutex_unlock(&trace_types_lock);

    Index: linux-tip.git/kernel/trace/trace.h
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/trace.h 2008-10-29 17:01:11.000000000 -0400
    +++ linux-tip.git/kernel/trace/trace.h 2008-10-29 17:46:37.000000000 -0400
    @@ -21,6 +21,7 @@ enum trace_type {
    TRACE_SPECIAL,
    TRACE_MMIO_RW,
    TRACE_MMIO_MAP,
    + TRACE_UNLIKELY,
    TRACE_BOOT,

    __TRACE_LAST_TYPE
    @@ -117,6 +118,16 @@ struct trace_boot {
    struct boot_trace initcall;
    };

    +#define TRACE_FUNC_SIZE 30
    +#define TRACE_FILE_SIZE 20
    +struct trace_unlikely {
    + struct trace_entry ent;
    + unsigned line;
    + char func[TRACE_FUNC_SIZE+1];
    + char file[TRACE_FILE_SIZE+1];
    + char correct;
    +};
    +
    /*
    * trace_flag_type is an enumeration that holds different
    * states when a trace occurs. These are:
    @@ -217,6 +228,7 @@ extern void __ftrace_bad_type(void);
    IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
    TRACE_MMIO_MAP); \
    IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
    + IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
    __ftrace_bad_type(); \
    } while (0)

    @@ -413,8 +425,39 @@ enum trace_iterator_flags {
    TRACE_ITER_STACKTRACE = 0x100,
    TRACE_ITER_SCHED_TREE = 0x200,
    TRACE_ITER_PRINTK = 0x400,
    +#ifdef CONFIG_UNLIKELY_TRACER
    + TRACE_ITER_UNLIKELY = 0x800,
    +#endif
    };

    extern struct tracer nop_trace;

    +#ifdef CONFIG_UNLIKELY_TRACER
    +extern int enable_unlikely_tracing(struct trace_array *tr);
    +extern void disable_unlikely_tracing(void);
    +static inline int trace_unlikely_enable(struct trace_array *tr)
    +{
    + if (trace_flags & TRACE_ITER_UNLIKELY)
    + return enable_unlikely_tracing(tr);
    + return 0;
    +}
    +static inline void trace_unlikely_disable(void)
    +{
    + /* due to races, always disable */
    + disable_unlikely_tracing();
    +}
    +extern void trace_unlikely_stop(void);
    +extern void trace_unlikely_start(void);
    +#else
    +static inline int trace_unlikely_enable(struct trace_array *tr)
    +{
    + return 0;
    +}
    +static inline void trace_unlikely_disable(void)
    +{
    +}
    +static inline void trace_unlikely_stop(void) { }
    +static inline void trace_unlikely_start(void) { }
    +#endif /* CONFIG_UNLIKELY_TRACER */
    +
    #endif /* _LINUX_KERNEL_TRACE_H */
    Index: linux-tip.git/kernel/trace/ring_buffer.c
    ===================================================================
    --- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-29 17:01:11.000000000 -0400
    +++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-29 17:46:37.000000000 -0400
    @@ -16,6 +16,8 @@
    #include <linux/list.h>
    #include <linux/fs.h>

    +#include "trace.h"
    +
    /* Up this if you want to test the TIME_EXTENTS and normalization */
    #define DEBUG_SHIFT 0

    @@ -1151,7 +1153,14 @@ ring_buffer_lock_reserve(struct ring_buf
    if (length > BUF_PAGE_SIZE)
    goto out;

    + /*
    + * Ugly, but the unlikely tracing will enter back into
    + * the ring buffer code. Namely through the timestamp, causing
    + * a triggering of the "again" loop, that will be infinite.
    + */
    + trace_unlikely_stop();
    event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
    + trace_unlikely_start();
    if (!event)
    goto out;

    @@ -1263,8 +1272,10 @@ int ring_buffer_write(struct ring_buffer
    goto out;

    event_length = rb_calculate_event_length(length);
    + trace_unlikely_stop();
    event = rb_reserve_next_event(cpu_buffer,
    RINGBUF_TYPE_DATA, event_length);
    + trace_unlikely_start();
    if (!event)
    goto out;




  15. Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations

    On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
    >
    > 69768 61064 87 __switch_to process_64.c 624
    > 15557 115251 100 __switch_to process_64.c 594
    > 15555 115227 100 __switch_to process_64.c 590


    I may be out of school for a while, but that math sure looks odd.

    > + if (p->correct) {
    > + percent = p->incorrect * 100;
    > + percent /= p->correct;

    percent /= p->correct + p->incorrect;
    > + /* No need to see huge numbers */
    > + if (percent > 100)
    > + percent = 100;
    > + } else
    > + percent = p->incorrect ? 100 : 0;


    Jörn

    --
    To my face you have the audacity to advise me to become a thief - the worst
    kind of thief that is conceivable, a thief of spiritual things, a thief of
    ideas! It is insufferable, intolerable!
    -- M. Binet in Scarabouche

  16. Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations

    On Thu, Oct 30, 2008 at 03:32:39PM +0100, Jörn Engel wrote:
    > On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
    > >
    > > 69768 61064 87 __switch_to process_64.c 624
    > > 15557 115251 100 __switch_to process_64.c 594
    > > 15555 115227 100 __switch_to process_64.c 590

    >
    > I may be out of school for a while, but that math sure looks odd.
    >
    > > + if (p->correct) {
    > > + percent = p->incorrect * 100;
    > > + percent /= p->correct;

    > percent /= p->correct + p->incorrect;


    And once you do the above fix, I don't think what is below is
    necessary any more. :-)

    > > + /* No need to see huge numbers */
    > > + if (percent > 100)
    > > + percent = 100;


    I would also calculate the percent correct rather than the percent
    incorrect, on the general theory that 100% good, 0% bad is easier for
    my little brain to understand, but that's just a minor thing...

    - Ted

  17. Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations


    On Thu, 30 Oct 2008, Theodore Tso wrote:

    > On Thu, Oct 30, 2008 at 03:32:39PM +0100, Jörn Engel wrote:
    > > On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
    > > >
    > > > 69768 61064 87 __switch_to process_64.c 624
    > > > 15557 115251 100 __switch_to process_64.c 594
    > > > 15555 115227 100 __switch_to process_64.c 590

    > >
    > > I may be out of school for a while, but that math sure looks odd.
    > >
    > > > + if (p->correct) {
    > > > + percent = p->incorrect * 100;
    > > > + percent /= p->correct;

    > > percent /= p->correct + p->incorrect;

    >
    > And once you do the above fix, I don't think what is below is
    > necessary any more. :-)


    Bah! Total brain-fart. Thanks ;-)

    >
    > > > + /* No need to see huge numbers */
    > > > + if (percent > 100)
    > > > + percent = 100;

    >
    > I would also calculate the percent correct rather than the percent
    > incorrect, on the general theory that 100% good, 0% bad is easier for
    > my little brain to understand, but that's just a minor thing...


    I thought about whether or not to show percent correct or percent
    incorrect. I chose incorrect just because it is easier to spot the
    trouble makers. You can say "let's allow 25% incorrect" more easily
    than "let's require a minimum of 75% correct". I think the mind can
    pick out the entries that go over a number better than it can see
    those that fall under one.
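
    For reference, folding Jörn's fix in, the computation would look
    something like this (and, as Ted notes, the clamp then becomes
    unnecessary, since incorrect / (correct + incorrect) can never
    exceed 100%):

        if (p->correct + p->incorrect) {
                percent = p->incorrect * 100;
                percent /= p->correct + p->incorrect;
        } else
                percent = 0;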

    Thanks,

    -- Steve

