[RFC patch 00/27] Jump-based NMI-safe immediate values and markers for sched-devel.git - Kernel

This is a discussion on [RFC patch 00/27] Jump-based NMI-safe immediate values and markers for sched-devel.git - Kernel ; Since it will not be used by other kernel objects, it makes sense to declare it static. Signed-off-by: Mathieu Desnoyers Acked-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu CC: hch@infradead.org CC: anil.s.keshavamurthy@intel.com CC: davem@davemloft.net --- kernel/kprobes.c | 2 +- 1 file ...

+ Reply to Thread
Page 2 of 2 FirstFirst 1 2
Results 21 to 36 of 36

Thread: [RFC patch 00/27] Jump-based NMI-safe immediate values and markers for sched-devel.git

  1. [RFC patch 06/27] Kprobes - declare kprobe_mutex static

    Since it will not be used by other kernel objects, it makes sense to declare it
    static.

    Signed-off-by: Mathieu Desnoyers
    Acked-by: Ananth N Mavinakayanahalli
    Acked-by: Masami Hiramatsu
    CC: hch@infradead.org
    CC: anil.s.keshavamurthy@intel.com
    CC: davem@davemloft.net
    ---
    kernel/kprobes.c | 2 +-
    1 file changed, 1 insertion(+), 1 deletion(-)

    Index: linux-2.6-lttng/kernel/kprobes.c
    ===================================================================
    --- linux-2.6-lttng.orig/kernel/kprobes.c 2007-08-19 09:09:15.000000000 -0400
    +++ linux-2.6-lttng/kernel/kprobes.c 2007-08-19 17:18:07.000000000 -0400
    @@ -68,7 +68,7 @@ static struct hlist_head kretprobe_inst_
    /* NOTE: change this value only with kprobe_mutex held */
    static bool kprobe_enabled;

    -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
    +static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
    DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
    static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;


    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  2. [RFC patch 13/27] Add text_poke and sync_core to powerpc

    - Needed on architectures where we must surround live instruction modification
    with "WP flag disable".
    - Turns into a memcpy on powerpc since there is no WP flag activated for
    instruction pages (yet..).
    - Add empty sync_core to powerpc so it can be used in architecture independent
    code.

    Signed-off-by: Mathieu Desnoyers
    CC: Rusty Russell
    CC: Christoph Hellwig
    CC: Paul Mackerras
    CC: Adrian Bunk
    CC: Andi Kleen
    CC: mingo@elte.hu
    CC: akpm@osdl.org
    ---
    include/asm-powerpc/cacheflush.h | 4 +++-
    1 file changed, 3 insertions(+), 1 deletion(-)

    Index: linux-2.6-lttng/include/asm-powerpc/cacheflush.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/asm-powerpc/cacheflush.h 2007-11-19 12:05:50.000000000 -0500
    +++ linux-2.6-lttng/include/asm-powerpc/cacheflush.h 2007-11-19 13:27:36.000000000 -0500
    @@ -63,7 +63,9 @@ extern void flush_dcache_phys_range(unsi
    #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
    memcpy(dst, src, len)

    -
    +#define text_poke memcpy
    +#define text_poke_early text_poke
    +#define sync_core()

    #ifdef CONFIG_DEBUG_PAGEALLOC
    /* internal debugging function */

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  3. [RFC patch 20/27] Immediate Values - Move Kprobes x86 restore_interrupt to kdebug.h

    Since the breakpoint handler is useful both to kprobes and immediate values, it
    makes sense to make the required restore_interrupt() available through
    asm-i386/kdebug.h.

    Signed-off-by: Mathieu Desnoyers
    Acked-by: Ananth N Mavinakayanahalli
    CC: Christoph Hellwig
    CC: anil.s.keshavamurthy@intel.com
    CC: davem@davemloft.net
    CC: Thomas Gleixner
    CC: Ingo Molnar
    CC: H. Peter Anvin
    ---
    include/asm-x86/kdebug.h | 12 ++++++++++++
    include/asm-x86/kprobes.h | 9 ---------
    2 files changed, 12 insertions(+), 9 deletions(-)

    Index: linux-2.6-lttng/include/asm-x86/kdebug.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/asm-x86/kdebug.h 2008-03-25 08:56:54.000000000 -0400
    +++ linux-2.6-lttng/include/asm-x86/kdebug.h 2008-03-25 09:00:17.000000000 -0400
    @@ -3,6 +3,9 @@

    #include

    +#include
    +#include
    +
    struct pt_regs;

    /* Grossly misnamed. */
    @@ -34,4 +37,13 @@ extern void show_regs(struct pt_regs *re
    extern unsigned long oops_begin(void);
    extern void oops_end(unsigned long, struct pt_regs *, int signr);

    +/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
    + * if necessary, before executing the original int3/1 (trap) handler.
    + */
    +static inline void restore_interrupts(struct pt_regs *regs)
    +{
    + if (regs->flags & X86_EFLAGS_IF)
    + local_irq_enable();
    +}
    +
    #endif
    Index: linux-2.6-lttng/include/asm-x86/kprobes.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/asm-x86/kprobes.h 2008-03-25 08:56:54.000000000 -0400
    +++ linux-2.6-lttng/include/asm-x86/kprobes.h 2008-03-25 09:00:17.000000000 -0400
    @@ -82,15 +82,6 @@ struct kprobe_ctlblk {
    struct prev_kprobe prev_kprobe;
    };

    -/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
    - * if necessary, before executing the original int3/1 (trap) handler.
    - */
    -static inline void restore_interrupts(struct pt_regs *regs)
    -{
    - if (regs->flags & X86_EFLAGS_IF)
    - local_irq_enable();
    -}
    -
    extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
    extern int kprobe_exceptions_notify(struct notifier_block *self,
    unsigned long val, void *data);

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  4. [RFC patch 09/27] Add all cpus option to stop machine run

    - Allow stop_machine_run() to call a function on all cpus. Calling
    stop_machine_run() with 'ALL_CPUS' invokes this new behavior.
    stop_machine_run() proceeds as normal until the calling cpu has
    invoked 'fn'. Then, we tell all the other cpus to call 'fn'.

    Signed-off-by: Jason Baron
    Signed-off-by: Mathieu Desnoyers
    CC: Rusty Russell
    CC: Adrian Bunk
    CC: Andi Kleen
    CC: Christoph Hellwig
    CC: mingo@elte.hu
    CC: akpm@osdl.org
    ---

    include/linux/stop_machine.h | 8 +++++++-
    kernel/stop_machine.c | 31 ++++++++++++++++++++++++-------
    2 files changed, 31 insertions(+), 8 deletions(-)


    Index: linux-2.6-sched-devel/include/linux/stop_machine.h
    ===================================================================
    --- linux-2.6-sched-devel.orig/include/linux/stop_machine.h 2008-04-16 11:07:24.000000000 -0400
    +++ linux-2.6-sched-devel/include/linux/stop_machine.h 2008-04-16 11:13:48.000000000 -0400
    @@ -8,11 +8,17 @@
    #include

    #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
    +
    +#define ALL_CPUS ~0U
    +
    /**
    * stop_machine_run: freeze the machine on all CPUs and run this function
    * @fn: the function to run
    * @data: the data ptr for the @fn()
    - * @cpu: the cpu to run @fn() on (or any, if @cpu == NR_CPUS.
    + * @cpu: if @cpu == n, run @fn() on cpu n
    + * if @cpu == NR_CPUS, run @fn() on any cpu
    + * if @cpu == ALL_CPUS, run @fn() first on the calling cpu, and then
    + * concurrently on all the other cpus
    *
    * Description: This causes a thread to be scheduled on every other cpu,
    * each of which disables interrupts, and finally interrupts are disabled
    Index: linux-2.6-sched-devel/kernel/stop_machine.c
    ===================================================================
    --- linux-2.6-sched-devel.orig/kernel/stop_machine.c 2008-04-16 11:07:24.000000000 -0400
    +++ linux-2.6-sched-devel/kernel/stop_machine.c 2008-04-16 11:13:48.000000000 -0400
    @@ -23,9 +23,17 @@ enum stopmachine_state {
    STOPMACHINE_WAIT,
    STOPMACHINE_PREPARE,
    STOPMACHINE_DISABLE_IRQ,
    + STOPMACHINE_RUN,
    STOPMACHINE_EXIT,
    };

    +struct stop_machine_data {
    + int (*fn)(void *);
    + void *data;
    + struct completion done;
    + int run_all;
    +} smdata;
    +
    static enum stopmachine_state stopmachine_state;
    static unsigned int stopmachine_num_threads;
    static atomic_t stopmachine_thread_ack;
    @@ -34,6 +42,7 @@ static int stopmachine(void *cpu)
    {
    int irqs_disabled = 0;
    int prepared = 0;
    + int ran = 0;

    set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));

    @@ -58,6 +67,11 @@ static int stopmachine(void *cpu)
    prepared = 1;
    smp_mb(); /* Must read state first. */
    atomic_inc(&stopmachine_thread_ack);
    + } else if (stopmachine_state == STOPMACHINE_RUN && !ran) {
    + smdata.fn(smdata.data);
    + ran = 1;
    + smp_mb(); /* Must read state first. */
    + atomic_inc(&stopmachine_thread_ack);
    }
    /* Yield in first stage: migration threads need to
    * help our sisters onto their CPUs. */
    @@ -135,12 +149,10 @@ static void restart_machine(void)
    preempt_enable_no_resched();
    }

    -struct stop_machine_data
    +static void run_other_cpus(void)
    {
    - int (*fn)(void *);
    - void *data;
    - struct completion done;
    -};
    + stopmachine_set_state(STOPMACHINE_RUN);
    +}

    static int do_stop(void *_smdata)
    {
    @@ -150,6 +162,8 @@ static int do_stop(void *_smdata)
    ret = stop_machine();
    if (ret == 0) {
    ret = smdata->fn(smdata->data);
    + if (smdata->run_all)
    + run_other_cpus();
    restart_machine();
    }

    @@ -173,14 +187,17 @@ struct task_struct *__stop_machine_run(i
    struct stop_machine_data smdata;
    struct task_struct *p;

    + mutex_lock(&stopmachine_mutex);
    +
    smdata.fn = fn;
    smdata.data = data;
    + smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0;
    init_completion(&smdata.done);

    - mutex_lock(&stopmachine_mutex);
    + smp_wmb(); /* make sure other cpus see smdata updates */

    /* If they don't care which CPU fn runs on, bind to any online one. */
    - if (cpu == NR_CPUS)
    + if (cpu == NR_CPUS || cpu == ALL_CPUS)
    cpu = raw_smp_processor_id();

    p = kthread_create(do_stop, &smdata, "kstopmachine");

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  5. [RFC patch 22/27] Immediate Values - x86 Optimization NMI and MCE support

    x86 optimization of the immediate values which uses a movl with code patching
    to set/unset the value used to populate the register used as variable source.
    It uses a breakpoint to bypass the instruction being changed, which lessens the
    interrupt latency of the operation and protects against NMIs and MCE.

    - More reentrant immediate value : uses a breakpoint. Needs to know the
    instruction's first byte. This is why we keep the "instruction size"
    variable, so we can support the REX prefixed instructions too.

    Changelog:
    - Change the immediate.c update code to support variable length opcodes.
    - Use text_poke_early with cr0 WP save/restore to patch the bypass. We are doing
    non atomic writes to a code region only touched by us (nobody can execute it
    since we are protected by the imv_mutex).
    - Add x86_64 support, ready for i386+x86_64 -> x86 merge.
    - Use asm-x86/asm.h.
    - Change the immediate.c update code to support variable length opcodes.
    - Use imv_* instead of immediate_*.
    - Use kernel_wp_disable/enable instead of save/restore.
    - Fix 1 byte immediate value so it declares its instruction size.

    Signed-off-by: Mathieu Desnoyers
    CC: Andi Kleen
    CC: "H. Peter Anvin"
    CC: Chuck Ebbert
    CC: Christoph Hellwig
    CC: Jeremy Fitzhardinge
    CC: Thomas Gleixner
    CC: Ingo Molnar
    ---
    arch/x86/kernel/Makefile | 1
    arch/x86/kernel/immediate.c | 291 ++++++++++++++++++++++++++++++++++++++++++++
    arch/x86/kernel/traps_32.c | 9 -
    include/asm-x86/immediate.h | 48 ++++++-
    4 files changed, 339 insertions(+), 10 deletions(-)

    Index: linux-2.6-sched-devel/include/asm-x86/immediate.h
    ===================================================================
    --- linux-2.6-sched-devel.orig/include/asm-x86/immediate.h 2008-04-16 12:01:00.000000000 -0400
    +++ linux-2.6-sched-devel/include/asm-x86/immediate.h 2008-04-16 12:01:01.000000000 -0400
    @@ -12,6 +12,18 @@

    #include

    +struct __imv {
    + unsigned long var; /* Pointer to the identifier variable of the
    + * immediate value
    + */
    + unsigned long imv; /*
    + * Pointer to the memory location of the
    + * immediate value within the instruction.
    + */
    + unsigned char size; /* Type size. */
    + unsigned char insn_size;/* Instruction size. */
    +} __attribute__ ((packed));
    +
    /**
    * imv_read - read immediate variable
    * @name: immediate value name
    @@ -26,6 +38,11 @@
    * what will generate an instruction with 8 bytes immediate value (not the REX.W
    * prefixed one that loads a sign extended 32 bits immediate value in a r64
    * register).
    + *
    + * Create the instruction in a discarded section to calculate its size. This is
    + * how we can align the beginning of the instruction on an address that will
    + * permit atomic modification of the immediate value without knowing the size of
    + * the opcode used by the compiler. The operand size is known in advance.
    */
    #define imv_read(name) \
    ({ \
    @@ -33,9 +50,14 @@
    BUILD_BUG_ON(sizeof(value) > 8); \
    switch (sizeof(value)) { \
    case 1: \
    - asm(".section __imv,\"aw\",@progbits\n\t" \
    + asm(".section __discard,\"\",@progbits\n\t" \
    + "1:\n\t" \
    + "mov $0,%0\n\t" \
    + "2:\n\t" \
    + ".previous\n\t" \
    + ".section __imv,\"aw\",@progbits\n\t" \
    _ASM_PTR "%c1, (3f)-%c2\n\t" \
    - ".byte %c2\n\t" \
    + ".byte %c2, (2b-1b)\n\t" \
    ".previous\n\t" \
    "mov $0,%0\n\t" \
    "3:\n\t" \
    @@ -45,10 +67,16 @@
    break; \
    case 2: \
    case 4: \
    - asm(".section __imv,\"aw\",@progbits\n\t" \
    + asm(".section __discard,\"\",@progbits\n\t" \
    + "1:\n\t" \
    + "mov $0,%0\n\t" \
    + "2:\n\t" \
    + ".previous\n\t" \
    + ".section __imv,\"aw\",@progbits\n\t" \
    _ASM_PTR "%c1, (3f)-%c2\n\t" \
    - ".byte %c2\n\t" \
    + ".byte %c2, (2b-1b)\n\t" \
    ".previous\n\t" \
    + ".org . + ((-.-(2b-1b)) & (%c2-1)), 0x90\n\t" \
    "mov $0,%0\n\t" \
    "3:\n\t" \
    : "=r" (value) \
    @@ -60,10 +88,16 @@
    value = name##__imv; \
    break; \
    } \
    - asm(".section __imv,\"aw\",@progbits\n\t" \
    + asm(".section __discard,\"\",@progbits\n\t" \
    + "1:\n\t" \
    + "mov $0xFEFEFEFE01010101,%0\n\t" \
    + "2:\n\t" \
    + ".previous\n\t" \
    + ".section __imv,\"aw\",@progbits\n\t" \
    _ASM_PTR "%c1, (3f)-%c2\n\t" \
    - ".byte %c2\n\t" \
    + ".byte %c2, (2b-1b)\n\t" \
    ".previous\n\t" \
    + ".org . + ((-.-(2b-1b)) & (%c2-1)), 0x90\n\t" \
    "mov $0xFEFEFEFE01010101,%0\n\t" \
    "3:\n\t" \
    : "=r" (value) \
    @@ -74,4 +108,6 @@
    value; \
    })

    +extern int arch_imv_update(const struct __imv *imv, int early);
    +
    #endif /* _ASM_X86_IMMEDIATE_H */
    Index: linux-2.6-sched-devel/arch/x86/kernel/traps_32.c
    ===================================================================
    --- linux-2.6-sched-devel.orig/arch/x86/kernel/traps_32.c 2008-04-16 12:01:00.000000000 -0400
    +++ linux-2.6-sched-devel/arch/x86/kernel/traps_32.c 2008-04-16 12:01:01.000000000 -0400
    @@ -592,7 +592,7 @@ void do_##name(struct pt_regs *regs, lon
    }

    DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
    -#ifndef CONFIG_KPROBES
    +#if !defined(CONFIG_KPROBES) && !defined(CONFIG_IMMEDIATE)
    DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
    #endif
    DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
    @@ -857,7 +857,7 @@ void restart_nmi(void)
    acpi_nmi_enable();
    }

    -#ifdef CONFIG_KPROBES
    +#if defined(CONFIG_KPROBES) || defined(CONFIG_IMMEDIATE)
    void __kprobes do_int3(struct pt_regs *regs, long error_code)
    {
    trace_hardirqs_fixup();
    @@ -866,9 +866,10 @@ void __kprobes do_int3(struct pt_regs *r
    == NOTIFY_STOP)
    return;
    /*
    - * This is an interrupt gate, because kprobes wants interrupts
    - * disabled. Normal trap handlers don't.
    + * This is an interrupt gate, because kprobes and immediate values wants
    + * interrupts disabled. Normal trap handlers don't.
    */
    +
    restore_interrupts(regs);

    do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
    Index: linux-2.6-sched-devel/arch/x86/kernel/Makefile
    ===================================================================
    --- linux-2.6-sched-devel.orig/arch/x86/kernel/Makefile 2008-04-16 12:01:00.000000000 -0400
    +++ linux-2.6-sched-devel/arch/x86/kernel/Makefile 2008-04-16 12:01:01.000000000 -0400
    @@ -68,6 +68,7 @@ obj-y += vsmp_64.o
    obj-$(CONFIG_KPROBES) += kprobes.o
    obj-$(CONFIG_MODULES) += module_$(BITS).o
    obj-$(CONFIG_ACPI_SRAT) += srat_32.o
    +obj-$(CONFIG_IMMEDIATE) += immediate.o
    obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
    obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
    obj-$(CONFIG_KGDB) += kgdb.o
    Index: linux-2.6-sched-devel/arch/x86/kernel/immediate.c
    ===================================================================
    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-2.6-sched-devel/arch/x86/kernel/immediate.c 2008-04-16 12:01:20.000000000 -0400
    @@ -0,0 +1,291 @@
    +/*
    + * Immediate Value - x86 architecture specific code.
    + *
    + * Rationale
    + *
    + * Required because of :
    + * - Erratum 49 fix for Intel PIII.
    + * - Still present on newer processors : Intel Core 2 Duo Processor for Intel
    + * Centrino Duo Processor Technology Specification Update, AH33.
    + * Unsynchronized Cross-Modifying Code Operations Can Cause Unexpected
    + * Instruction Execution Results.
    + *
    + * Permits immediate value modification by XMC with correct serialization.
    + *
    + * Reentrant for NMI and trap handler instrumentation. Permits XMC to a
    + * location that has preemption enabled because it involves no temporary or
    + * reused data structure.
    + *
    + * Quoting Richard J Moore, source of the information motivating this
    + * implementation which differs from the one proposed by Intel which is not
    + * suitable for kernel context (does not support NMI and would require disabling
    + * interrupts on every CPU for a long period) :
    + *
    + * "There is another issue to consider when looking into using probes other
    + * then int3:
    + *
    + * Intel erratum 54 - Unsynchronized Cross-modifying code - refers to the
    + * practice of modifying code on one processor where another has prefetched
    + * the unmodified version of the code. Intel states that unpredictable general
    + * protection faults may result if a synchronizing instruction (iret, int,
    + * int3, cpuid, etc ) is not executed on the second processor before it
    + * executes the pre-fetched out-of-date copy of the instruction.
    + *
    + * When we became aware of this I had a long discussion with Intel's
    + * microarchitecture guys. It turns out that the reason for this erratum
    + * (which incidentally Intel does not intend to fix) is because the trace
    + * cache - the stream of micro-ops resulting from instruction interpretation -
    + * cannot be guaranteed to be valid. Reading between the lines I assume this
    + * issue arises because of optimization done in the trace cache, where it is
    + * no longer possible to identify the original instruction boundaries. If the
    + * CPU discoverers that the trace cache has been invalidated because of
    + * unsynchronized cross-modification then instruction execution will be
    + * aborted with a GPF. Further discussion with Intel revealed that replacing
    + * the first opcode byte with an int3 would not be subject to this erratum.
    + *
    + * So, is cmpxchg reliable? One has to guarantee more than mere atomicity."
    + *
    + * Overall design
    + *
    + * The algorithm proposed by Intel applies not so well in kernel context: it
    + * would imply disabling interrupts and looping on every CPUs while modifying
    + * the code and would not support instrumentation of code called from interrupt
    + * sources that cannot be disabled.
    + *
    + * Therefore, we use a different algorithm to respect Intel's erratum (see the
    + * quoted discussion above). We make sure that no CPU sees an out-of-date copy
    + * of a pre-fetched instruction by 1 - using a breakpoint, which skips the
    + * instruction that is going to be modified, 2 - issuing an IPI to every CPU to
    + * execute a sync_core(), to make sure that even when the breakpoint is removed,
    + * no cpu could possibly still have the out-of-date copy of the instruction,
    + * modify the now unused 2nd byte of the instruction, and then put back the
    + * original 1st byte of the instruction.
    + *
    + * It has exactly the same intent as the algorithm proposed by Intel, but
    + * it has less side-effects, scales better and supports NMI, SMI and MCE.
    + *
    + * Mathieu Desnoyers
    + */
    +
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +
    +#include
    +
    +#define BREAKPOINT_INSTRUCTION 0xcc
    +#define BREAKPOINT_INS_LEN 1
    +#define NR_NOPS 10
    +
    +static unsigned long target_after_int3; /* EIP of the target after the int3 */
    +static unsigned long bypass_eip; /* EIP of the bypass. */
    +static unsigned long bypass_after_int3; /* EIP after the end-of-bypass int3 */
    +static unsigned long after_imv; /*
    + * EIP where to resume after the
    + * single-stepping.
    + */
    +
    +/*
    + * Internal bypass used during value update. The bypass is skipped by the
    + * function in which it is inserted.
    + * No need to be aligned because we exclude readers from the site during
    + * update.
    + * Layout is:
    + * (10x nop) int3
    + * (maximum size is 2 bytes opcode + 8 bytes immediate value for long on x86_64)
    + * The nops are the target replaced by the instruction to single-step.
    + * Align on 16 bytes to make sure the nops fit within a single page so remapping
    + * it can be done easily.
    + */
    +static inline void _imv_bypass(unsigned long *bypassaddr,
    + unsigned long *breaknextaddr)
    +{
    + asm volatile("jmp 2f;\n\t"
    + ".align 16;\n\t"
    + "0:\n\t"
    + ".space 10, 0x90;\n\t"
    + "1:\n\t"
    + "int3;\n\t"
    + "2:\n\t"
    + "mov $(0b),%0;\n\t"
    + "mov $((1b)+1),%1;\n\t"
    + : "=r" (*bypassaddr),
    + "=r" (*breaknextaddr));
    +}
    +
    +static void imv_synchronize_core(void *info)
    +{
    + sync_core(); /* use cpuid to stop speculative execution */
    +}
    +
    +/*
    + * The eip value points right after the breakpoint instruction, in the second
    + * byte of the movl.
    + * Disable preemption in the bypass to make sure no thread will be preempted in
    + * it. We can then use synchronize_sched() to make sure every bypass users have
    + * ended.
    + */
    +static int imv_notifier(struct notifier_block *nb,
    + unsigned long val, void *data)
    +{
    + enum die_val die_val = (enum die_val) val;
    + struct die_args *args = data;
    +
    + if (!args->regs || user_mode_vm(args->regs))
    + return NOTIFY_DONE;
    +
    + if (die_val == DIE_INT3) {
    + if (args->regs->ip == target_after_int3) {
    + preempt_disable();
    + args->regs->ip = bypass_eip;
    + return NOTIFY_STOP;
    + } else if (args->regs->ip == bypass_after_int3) {
    + args->regs->ip = after_imv;
    + preempt_enable();
    + return NOTIFY_STOP;
    + }
    + }
    + return NOTIFY_DONE;
    +}
    +
    +static struct notifier_block imv_notify = {
    + .notifier_call = imv_notifier,
    + .priority = 0x7fffffff, /* we need to be notified first */
    +};
    +
    +/**
    + * arch_imv_update - update one immediate value
    + * @imv: pointer of type const struct __imv to update
    + * @early: early boot (1) or normal (0)
    + *
    + * Update one immediate value. Must be called with imv_mutex held.
    + */
    +__kprobes int arch_imv_update(const struct __imv *imv, int early)
    +{
    + int ret;
    + unsigned char opcode_size = imv->insn_size - imv->size;
    + unsigned long insn = imv->imv - opcode_size;
    + unsigned long len;
    + char *vaddr;
    + struct page *pages[1];
    +
    +#ifdef CONFIG_KPROBES
    + /*
    + * Fail if a kprobe has been set on this instruction.
    + * (TODO: we could eventually do better and modify all the (possibly
    + * nested) kprobes for this site if kprobes had an API for this.
    + */
    + if (unlikely(!early
    + && *(unsigned char *)insn == BREAKPOINT_INSTRUCTION)) {
    + printk(KERN_WARNING "Immediate value in conflict with kprobe. "
    + "Variable at %p, "
    + "instruction at %p, size %hu\n",
    + (void *)imv->imv,
    + (void *)imv->var, imv->size);
    + return -EBUSY;
    + }
    +#endif
    +
    + /*
    + * If the variable and the instruction have the same value, there is
    + * nothing to do.
    + */
    + switch (imv->size) {
    + case 1: if (*(uint8_t *)imv->imv
    + == *(uint8_t *)imv->var)
    + return 0;
    + break;
    + case 2: if (*(uint16_t *)imv->imv
    + == *(uint16_t *)imv->var)
    + return 0;
    + break;
    + case 4: if (*(uint32_t *)imv->imv
    + == *(uint32_t *)imv->var)
    + return 0;
    + break;
    +#ifdef CONFIG_X86_64
    + case 8: if (*(uint64_t *)imv->imv
    + == *(uint64_t *)imv->var)
    + return 0;
    + break;
    +#endif
    + default:return -EINVAL;
    + }
    +
    + if (!early) {
    + /* bypass is 10 bytes long for x86_64 long */
    + WARN_ON(imv->insn_size > 10);
    + _imv_bypass(&bypass_eip, &bypass_after_int3);
    +
    + after_imv = imv->imv + imv->size;
    +
    + /*
    + * Using the _early variants because nobody is executing the
    + * bypass code while we patch it. It is protected by the
    + * imv_mutex. Since we modify the instructions non atomically
    + * (for nops), we have to use the _early variant.
    + * We must however deal with RO pages.
    + * Use a single page : 10 bytes are aligned on 16 bytes
    + * boundaries.
    + */
    + pages[0] = virt_to_page((void *)bypass_eip);
    + vaddr = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
    + BUG_ON(!vaddr);
    + text_poke_early(&vaddr[bypass_eip & ~PAGE_MASK],
    + (void *)insn, imv->insn_size);
    + /*
    + * Fill the rest with nops.
    + */
    + len = NR_NOPS - imv->insn_size;
    + add_nops((void *)
    + &vaddr[(bypass_eip & ~PAGE_MASK) + imv->insn_size],
    + len);
    + vunmap(vaddr);
    +
    + target_after_int3 = insn + BREAKPOINT_INS_LEN;
    + /* register_die_notifier has memory barriers */
    + register_die_notifier(&imv_notify);
    + /* The breakpoint will single-step the bypass */
    + text_poke((void *)insn,
    + ((unsigned char[]){BREAKPOINT_INSTRUCTION}), 1);
    + /*
    + * Make sure the breakpoint is set before we continue (visible
    + * to other CPUs and interrupts).
    + */
    + wmb();
    + /*
    + * Execute serializing instruction on each CPU.
    + */
    + ret = on_each_cpu(imv_synchronize_core, NULL, 1, 1);
    + BUG_ON(ret != 0);
    +
    + text_poke((void *)(insn + opcode_size), (void *)imv->var,
    + imv->size);
    + /*
    + * Make sure the value can be seen from other CPUs and
    + * interrupts.
    + */
    + wmb();
    + text_poke((void *)insn, (unsigned char *)bypass_eip, 1);
    + /*
    + * Wait for all int3 handlers to end (interrupts are disabled in
    + * int3). This CPU is clearly not in a int3 handler, because
    + * int3 handler is not preemptible and there cannot be any more
    + * int3 handler called for this site, because we placed the
    + * original instruction back. synchronize_sched has memory
    + * barriers.
    + */
    + synchronize_sched();
    + unregister_die_notifier(&imv_notify);
    + /* unregister_die_notifier has memory barriers */
    + } else
    + text_poke_early((void *)imv->imv, (void *)imv->var,
    + imv->size);
    + return 0;
    +}

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  6. [RFC patch 02/27] x86 NMI-safe INT3 and Page Fault

    Implements an alternative iret with popf and return so trap and exception
    handlers can return to the NMI handler without issuing iret. iret would cause
    NMIs to be reenabled prematurely. x86_32 uses popf and far return. x86_64 has to
    copy the return instruction pointer to the top of the previous stack, issue a
    popf, loads the previous esp and issue a near return (ret).

    It allows placing immediate values (and therefore optimized trace_marks) in NMI
    code since returning from a breakpoint would be valid. Accessing vmalloc'd
    memory, which allows executing module code or accessing vmapped or vmalloc'd
    areas from NMI context, would also be valid. This is very useful to tracers like
    LTTng.

    This patch makes all faults, traps and exception safe to be called from NMI
    context *except* single-stepping, which requires iret to restore the TF (trap
    flag) and jump to the return address in a single instruction. Sorry, no kprobes
    support in NMI handlers because of this limitation. We cannot single-step an
    NMI handler, because iret must set the TF flag and return back to the
    instruction to single-step in a single instruction. This cannot be emulated with
    popf/lret, because lret would be single-stepped. It does not apply to immediate
    values because they do not use single-stepping. This code detects if the TF
    flag is set and uses the iret path for single-stepping, even if it reactivates
    NMIs prematurely.

    alpha and avr32 use the active count bit 31. This patch moves them to 28.

    TODO : test alpha and avr32 active count modification

    tested on x86_32 (tests implemented in a separate patch) :
    - instrumented the return path to export the EIP, CS and EFLAGS values when
    taken so we know the return path code has been executed.
    - trace_mark, using immediate values, with 10ms delay with the breakpoint
    activated. Runs well through the return path.
    - tested vmalloc faults in NMI handler by placing a non-optimized marker in the
    NMI handler (so no breakpoint is executed) and connecting a probe which
    touches every page of a 20MB vmalloc'd buffer. It executes through the return
    path without problem.
    - Tested with and without preemption

    tested on x86_64
    - instrumented the return path to export the EIP, CS and EFLAGS values when
    taken so we know the return path code has been executed.
    - trace_mark, using immediate values, with 10ms delay with the breakpoint
    activated. Runs well through the return path.

    To test on x86_64 :
    - Test without preemption
    - Test vmalloc faults
    - Test on Intel 64 bits CPUs.

    "This way lies madness. Don't go there."
    - Andi

    Changelog since v1 :
    - x86_64 fixes.
    Changelog since v2 :
    - paravirt support

    Signed-off-by: Mathieu Desnoyers
    CC: Andi Kleen
    CC: akpm@osdl.org
    CC: mingo@elte.hu
    CC: "H. Peter Anvin"
    CC: Jeremy Fitzhardinge
    CC: Steven Rostedt
    CC: "Frank Ch. Eigler"
    ---
    arch/x86/kernel/entry_32.S | 25 +++++++++++++++-
    arch/x86/kernel/entry_64.S | 31 ++++++++++++++++++++
    include/asm-alpha/thread_info.h | 2 -
    include/asm-avr32/thread_info.h | 2 -
    include/asm-x86/irqflags.h | 61 ++++++++++++++++++++++++++++++++++++++++
    include/asm-x86/paravirt.h | 2 +
    include/linux/hardirq.h | 24 ++++++++++++++-
    7 files changed, 142 insertions(+), 5 deletions(-)

    Index: linux-2.6-sched-devel/include/linux/hardirq.h
    ================================================== =================
    --- linux-2.6-sched-devel.orig/include/linux/hardirq.h 2008-04-16 12:29:24.000000000 -0400
    +++ linux-2.6-sched-devel/include/linux/hardirq.h 2008-04-16 12:29:42.000000000 -0400
    @@ -22,10 +22,13 @@
    * PREEMPT_MASK: 0x000000ff
    * SOFTIRQ_MASK: 0x0000ff00
    * HARDIRQ_MASK: 0x0fff0000
    + * HARDNMI_MASK: 0x40000000
    */
    #define PREEMPT_BITS 8
    #define SOFTIRQ_BITS 8

    +#define HARDNMI_BITS 1
    +
    #ifndef HARDIRQ_BITS
    #define HARDIRQ_BITS 12

    @@ -45,16 +48,19 @@
    #define PREEMPT_SHIFT 0
    #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
    #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
    +#define HARDNMI_SHIFT (30)

    #define __IRQ_MASK(x) ((1UL << (x))-1)

    #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
    #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
    #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
    +#define HARDNMI_MASK (__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)

    #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
    #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
    #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
    +#define HARDNMI_OFFSET (1UL << HARDNMI_SHIFT)

    #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
    #error PREEMPT_ACTIVE is too low!
    @@ -63,6 +69,7 @@
    #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
    #define softirq_count() (preempt_count() & SOFTIRQ_MASK)
    #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
    +#define hardnmi_count() (preempt_count() & HARDNMI_MASK)

    /*
    * Are we doing bottom half or hardware interrupt processing?
    @@ -71,6 +78,7 @@
    #define in_irq() (hardirq_count())
    #define in_softirq() (softirq_count())
    #define in_interrupt() (irq_count())
    +#define in_nmi() (hardnmi_count())

    /*
    * Are we running in atomic context? WARNING: this macro cannot
    @@ -159,7 +167,19 @@ extern void irq_enter(void);
    */
    extern void irq_exit(void);

    -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0)
    -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0)
    +#define nmi_enter() \
    + do { \
    + lockdep_off(); \
    + BUG_ON(hardnmi_count()); \
    + add_preempt_count(HARDNMI_OFFSET); \
    + __irq_enter(); \
    + } while (0)
    +
    +#define nmi_exit() \
    + do { \
    + __irq_exit(); \
    + sub_preempt_count(HARDNMI_OFFSET); \
    + lockdep_on(); \
    + } while (0)

    #endif /* LINUX_HARDIRQ_H */
    Index: linux-2.6-sched-devel/arch/x86/kernel/entry_32.S
    ================================================== =================
    --- linux-2.6-sched-devel.orig/arch/x86/kernel/entry_32.S 2008-04-16 12:29:25.000000000 -0400
    +++ linux-2.6-sched-devel/arch/x86/kernel/entry_32.S 2008-04-16 12:29:42.000000000 -0400
    @@ -72,7 +72,6 @@
    #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
    #else
    #define preempt_stop(clobbers)
    -#define resume_kernel restore_nocheck
    #endif

    .macro TRACE_IRQS_IRET
    @@ -258,6 +257,8 @@ END(ret_from_exception)
    #ifdef CONFIG_PREEMPT
    ENTRY(resume_kernel)
    DISABLE_INTERRUPTS(CLBR_ANY)
    + testl $0x40000000,TI_preempt_count(%ebp) # nested over NMI ?
    + jnz return_to_nmi
    cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
    jnz restore_nocheck
    need_resched:
    @@ -269,6 +270,12 @@ need_resched:
    call preempt_schedule_irq
    jmp need_resched
    END(resume_kernel)
    +#else
    +ENTRY(resume_kernel)
    + testl $0x40000000,TI_preempt_count(%ebp) # nested over NMI ?
    + jnz return_to_nmi
    + jmp restore_nocheck
    +END(resume_kernel)
    #endif
    CFI_ENDPROC

    @@ -408,6 +415,22 @@ restore_nocheck_notrace:
    CFI_ADJUST_CFA_OFFSET -4
    irq_return:
    INTERRUPT_RETURN
    +return_to_nmi:
    + testl $X86_EFLAGS_TF, PT_EFLAGS(%esp)
    + jnz restore_nocheck /*
    + * If single-stepping an NMI handler,
    + * use the normal iret path instead of
    + * the popf/lret because lret would be
    + * single-stepped. It should not
    + * happen : it will reactivate NMIs
    + * prematurely.
    + */
    + TRACE_IRQS_IRET
    + RESTORE_REGS
    + addl $4, %esp # skip orig_eax/error_code
    + CFI_ADJUST_CFA_OFFSET -4
    + INTERRUPT_RETURN_NMI_SAFE
    +
    .section .fixup,"ax"
    ENTRY(iret_exc)
    pushl $0 # no error code
    Index: linux-2.6-sched-devel/arch/x86/kernel/entry_64.S
    ================================================== =================
    --- linux-2.6-sched-devel.orig/arch/x86/kernel/entry_64.S 2008-04-16 12:29:25.000000000 -0400
    +++ linux-2.6-sched-devel/arch/x86/kernel/entry_64.S 2008-04-16 12:29:42.000000000 -0400
    @@ -681,12 +681,27 @@ retint_restore_args: /* return to kernel
    * The iretq could re-enable interrupts:
    */
    TRACE_IRQS_IRETQ
    + testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
    + jnz return_to_nmi
    restore_args:
    RESTORE_ARGS 0,8,0

    irq_return:
    INTERRUPT_RETURN

    +return_to_nmi: /*
    + * If single-stepping an NMI handler,
    + * use the normal iret path instead of
    + * the popf/lret because lret would be
    + * single-stepped. It should not
    + * happen : it will reactivate NMIs
    + * prematurely.
    + */
    + bt $8,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */
    + jc restore_args
    + RESTORE_ARGS 0,8,0
    + INTERRUPT_RETURN_NMI_SAFE
    +
    .section __ex_table, "a"
    .quad irq_return, bad_iret
    .previous
    @@ -902,6 +917,10 @@ END(spurious_interrupt)
    .macro paranoidexit trace=1
    /* ebx: no swapgs flag */
    paranoid_exit\trace:
    + GET_THREAD_INFO(%rcx)
    + testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
    + jnz paranoid_return_to_nmi\trace
    +paranoid_exit_no_nmi\trace:
    testl %ebx,%ebx /* swapgs needed? */
    jnz paranoid_restore\trace
    testl $3,CS(%rsp)
    @@ -914,6 +933,18 @@ paranoid_swapgs\trace:
    paranoid_restore\trace:
    RESTORE_ALL 8
    jmp irq_return
    +paranoid_return_to_nmi\trace: /*
    + * If single-stepping an NMI handler,
    + * use the normal iret path instead of
    + * the popf/lret because lret would be
    + * single-stepped. It should not
    + * happen : it will reactivate NMIs
    + * prematurely.
    + */
    + bt $8,EFLAGS-0(%rsp) /* trap flag? */
    + jc paranoid_exit_no_nmi\trace
    + RESTORE_ALL 8
    + INTERRUPT_RETURN_NMI_SAFE
    paranoid_userspace\trace:
    GET_THREAD_INFO(%rcx)
    movl threadinfo_flags(%rcx),%ebx
    Index: linux-2.6-sched-devel/include/asm-x86/irqflags.h
    ================================================== =================
    --- linux-2.6-sched-devel.orig/include/asm-x86/irqflags.h 2008-04-16 12:29:24.000000000 -0400
    +++ linux-2.6-sched-devel/include/asm-x86/irqflags.h 2008-04-16 12:29:42.000000000 -0400
    @@ -112,12 +112,73 @@ static inline unsigned long __raw_local_

    #ifdef CONFIG_X86_64
    #define INTERRUPT_RETURN iretq
    +
    +/*
    + * Only returns from a trap or exception to a NMI context (intra-privilege
    + * level near return) to the same SS and CS segments. Should be used
    + * upon trap or exception return when nested over a NMI context so no iret is
    + * issued. It takes care of modifying the eflags, rsp and returning to the
    + * previous function.
    + *
    + * The stack, at that point, looks like :
    + *
    + * 0(rsp) RIP
    + * 8(rsp) CS
    + * 16(rsp) EFLAGS
    + * 24(rsp) RSP
    + * 32(rsp) SS
    + *
    + * Upon execution :
    + * Copy EIP to the top of the return stack
    + * Update top of return stack address
    + * Pop eflags into the eflags register
    + * Make the return stack current
    + * Near return (popping the return address from the return stack)
    + */
    +#define INTERRUPT_RETURN_NMI_SAFE pushq %rax; \
    + pushq %rbx; \
    + movq 40(%rsp), %rax; \
    + movq 16(%rsp), %rbx; \
    + subq $8, %rax; \
    + movq %rbx, (%rax); \
    + movq %rax, 40(%rsp); \
    + popq %rbx; \
    + popq %rax; \
    + addq $16, %rsp; \
    + popfq; \
    + movq (%rsp), %rsp; \
    + ret; \
    +
    #define ENABLE_INTERRUPTS_SYSCALL_RET \
    movq %gs:pda_oldrsp, %rsp; \
    swapgs; \
    sysretq;
    #else
    #define INTERRUPT_RETURN iret
    +
    +/*
    + * Protected mode only, no V8086. Implies that protected mode must
    + * be entered before NMIs or MCEs are enabled. Only returns from a trap or
    + * exception to a NMI context (intra-privilege level far return). Should be used
    + * upon trap or exception return when nested over a NMI context so no iret is
    + * issued.
    + *
    + * The stack, at that point, looks like :
    + *
    + * 0(esp) EIP
    + * 4(esp) CS
    + * 8(esp) EFLAGS
    + *
    + * Upon execution :
    + * Copy the stack eflags to top of stack
    + * Pop eflags into the eflags register
    + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
    + */
    +#define INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \
    + popfl; \
    + .byte 0xCA; \
    + .word 4;
    +
    #define ENABLE_INTERRUPTS_SYSCALL_RET sti; sysexit
    #define GET_CR0_INTO_EAX movl %cr0, %eax
    #endif
    Index: linux-2.6-sched-devel/include/asm-alpha/thread_info.h
    ================================================== =================
    --- linux-2.6-sched-devel.orig/include/asm-alpha/thread_info.h 2008-04-16 12:29:24.000000000 -0400
    +++ linux-2.6-sched-devel/include/asm-alpha/thread_info.h 2008-04-16 12:29:42.000000000 -0400
    @@ -57,7 +57,7 @@ register struct thread_info *__current_t

    #endif /* __ASSEMBLY__ */

    -#define PREEMPT_ACTIVE 0x40000000
    +#define PREEMPT_ACTIVE 0x10000000

    /*
    * Thread information flags:
    Index: linux-2.6-sched-devel/include/asm-avr32/thread_info.h
    ================================================== =================
    --- linux-2.6-sched-devel.orig/include/asm-avr32/thread_info.h 2008-04-16 12:29:24.000000000 -0400
    +++ linux-2.6-sched-devel/include/asm-avr32/thread_info.h 2008-04-16 12:29:42.000000000 -0400
    @@ -70,7 +70,7 @@ static inline struct thread_info *curren

    #endif /* !__ASSEMBLY__ */

    -#define PREEMPT_ACTIVE 0x40000000
    +#define PREEMPT_ACTIVE 0x10000000

    /*
    * Thread information flags
    Index: linux-2.6-sched-devel/include/asm-x86/paravirt.h
    ================================================== =================
    --- linux-2.6-sched-devel.orig/include/asm-x86/paravirt.h 2008-04-16 11:07:24.000000000 -0400
    +++ linux-2.6-sched-devel/include/asm-x86/paravirt.h 2008-04-16 12:29:42.000000000 -0400
    @@ -1385,6 +1385,8 @@ static inline unsigned long __raw_local_
    PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
    jmp *%cs:pv_cpu_ops+PV_CPU_iret)

    +#define INTERRUPT_RETURN_NMI_SAFE INTERRUPT_RETURN
    +
    #define DISABLE_INTERRUPTS(clobbers) \
    PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
    PV_SAVE_REGS; \

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  7. Re: [RFC patch 23/27] Immediate Values - Powerpc Optimization NMI MCE support

    Mathieu Desnoyers writes:

    > Use an atomic update for immediate values.


    What is meant by an "atomic" update in this context? AFAICS you are
    using memcpy, which is not in any way guaranteed to be atomic.

    Paul.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  8. Re: [RFC patch 23/27] Immediate Values - Powerpc Optimization NMI MCE support

    * Paul Mackerras (paulus@samba.org) wrote:
    > Mathieu Desnoyers writes:
    >
    > > Use an atomic update for immediate values.

    >
    > What is meant by an "atomic" update in this context? AFAICS you are
    > using memcpy, which is not in any way guaranteed to be atomic.
    >
    > Paul.


    I expect memcpy to perform the copy in one memory access, given I put a

    .align 2

    before the 2 bytes instruction. It makes sure the instruction modified
    fits in a single, aligned, memory write.

    Or maybe am I expecting too much from memcpy ?

    Mathieu

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  9. Re: [RFC patch 23/27] Immediate Values - Powerpc Optimization NMI MCE support

    Mathieu Desnoyers writes:

    > * Paul Mackerras (paulus@samba.org) wrote:
    > > Mathieu Desnoyers writes:
    > >
    > > > Use an atomic update for immediate values.

    > >
    > > What is meant by an "atomic" update in this context? AFAICS you are
    > > using memcpy, which is not in any way guaranteed to be atomic.
    > >
    > > Paul.

    >
    > I expect memcpy to perform the copy in one memory access, given I put a
    >
    > .align 2
    >
    > before the 2 bytes instruction. It makes sure the instruction modified
    > fits in a single, aligned, memory write.


    My original question was in the context of the powerpc architecture,
    where instructions are always 4 bytes long and aligned. So that's not
    an issue.

    > Or maybe am I expecting too much from memcpy ?


    I don't think memcpy gives you any such guarantees. It would be quite
    within its rights to say "it's only a few bytes, I'll do it byte by
    byte".

    If you really want it to be atomic (which I agree is probably a good
    idea), I think the best way to do it is to use an asm to generate a
    sth (store halfword) instruction to the immediate field (instruction
    address + 2). That's on powerpc of course; I don't know what you
    would do on other architectures.

    Paul.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  10. Re: [RFC patch 23/27] Immediate Values - Powerpc Optimization NMI MCE support

    * Paul Mackerras (paulus@samba.org) wrote:
    > Mathieu Desnoyers writes:
    >
    > > * Paul Mackerras (paulus@samba.org) wrote:
    > > > Mathieu Desnoyers writes:
    > > >
    > > > > Use an atomic update for immediate values.
    > > >
    > > > What is meant by an "atomic" update in this context? AFAICS you are
    > > > using memcpy, which is not in any way guaranteed to be atomic.
    > > >
    > > > Paul.

    > >
    > > I expect memcpy to perform the copy in one memory access, given I put a
    > >
    > > .align 2
    > >
    > > before the 2 bytes instruction. It makes sure the instruction modified
    > > fits in a single, aligned, memory write.

    >
    > My original question was in the context of the powerpc architecture,
    > where instructions are always 4 bytes long and aligned. So that's not
    > an issue.
    >


    Sorry, I meant 4 byte instruction with 2 bytes immediate value, but we
    both understand it would be a memory write aligned on 2 bytes since we
    only change the immediate value.

    > > Or maybe am I expecting too much from memcpy ?

    >
    > I don't think memcpy gives you any such guarantees. It would be quite
    > within its rights to say "it's only a few bytes, I'll do it byte by
    > byte".
    >
    > If you really want it to be atomic (which I agree is probably a good
    > idea), I think the best way to do it is to use an asm to generate a
    > sth (store halfword) instruction to the immediate field (instruction
    > address + 2). That's on powerpc of course; I don't know what you
    > would do on other architectures.
    >


    A simple

    *(uint16_t *)destptr = newvalue;

    seems to generate the "sth" instruction.

    Do you see any reason why the compiler could choose a different, non
    atomic assembler primitive ?

    quoting Documentation/RCU/whatisRCU.txt :

    "In contrast, RCU-based updaters typically take advantage of the fact
    that writes to single aligned pointers are atomic on modern CPUs"

    Paul E. McKenney could say if I am wrong if I assume that any object
    smaller or equal to the architecture pointer size, aligned on a multiple
    of its own size, will be read or written atomically.

    Therefore, I would suggest the following replacement patch :


    Immediate Values - Powerpc Optimization NMI MCE support

    Use an atomic update for immediate values.

    - Changelog :
    Use a direct assignment instead of memcpy to be sure the update is
    atomic.

    Signed-off-by: Mathieu Desnoyers
    CC: Rusty Russell
    CC: Christoph Hellwig
    CC: Paul Mackerras
    ---
    arch/powerpc/kernel/Makefile | 1
    arch/powerpc/kernel/immediate.c | 70 ++++++++++++++++++++++++++++++++++++++++
    include/asm-powerpc/immediate.h | 18 ++++++++++
    3 files changed, 89 insertions(+)

    Index: linux-2.6-lttng/arch/powerpc/kernel/immediate.c
    ================================================== =================
    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-2.6-lttng/arch/powerpc/kernel/immediate.c 2008-04-16 21:22:29.000000000 -0400
    @@ -0,0 +1,70 @@
    +/*
    + * Powerpc optimized immediate values enabling/disabling.
    + *
    + * Mathieu Desnoyers
    + */
    +
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +
    +#define LI_OPCODE_LEN 2
    +
    +/**
    + * arch_imv_update - update one immediate value
    + * @imv: pointer of type const struct __imv to update
    + * @early: early boot (1), normal (0)
    + *
    + * Update one immediate value. Must be called with imv_mutex held.
    + */
    +int arch_imv_update(const struct __imv *imv, int early)
    +{
    +#ifdef CONFIG_KPROBES
    + kprobe_opcode_t *insn;
    + /*
    + * Fail if a kprobe has been set on this instruction.
    + * (TODO: we could eventually do better and modify all the (possibly
    + * nested) kprobes for this site if kprobes had an API for this.
    + */
    + switch (imv->size) {
    + case 1: /* The uint8_t points to the 3rd byte of the
    + * instruction */
    + insn = (void *)(imv->imv - 1 - LI_OPCODE_LEN);
    + break;
    + case 2: insn = (void *)(imv->imv - LI_OPCODE_LEN);
    + break;
    + default:
    + return -EINVAL;
    + }
    +
    + if (unlikely(!early && *insn == BREAKPOINT_INSTRUCTION)) {
    + printk(KERN_WARNING "Immediate value in conflict with kprobe. "
    + "Variable at %p, "
    + "instruction at %p, size %lu\n",
    + (void *)imv->imv,
    + (void *)imv->var, imv->size);
    + return -EBUSY;
    + }
    +#endif
    +
    + /*
    + * If the variable and the instruction have the same value, there is
    + * nothing to do.
    + */
    + switch (imv->size) {
    + case 1: if (*(uint8_t *)imv->imv == *(uint8_t *)imv->var)
    + return 0;
    + *(uint8_t *)imv->imv = *(uint8_t *)imv->var;
    + break;
    + case 2: if (*(uint16_t *)imv->imv == *(uint16_t *)imv->var)
    + return 0;
    + *(uint16_t *)imv->imv = *(uint16_t *)imv->var;
    + break;
    + default:return -EINVAL;
    + }
    + flush_icache_range(imv->imv, imv->imv + imv->size);
    + return 0;
    +}
    Index: linux-2.6-lttng/include/asm-powerpc/immediate.h
    ================================================== =================
    --- linux-2.6-lttng.orig/include/asm-powerpc/immediate.h 2008-04-16 12:25:42.000000000 -0400
    +++ linux-2.6-lttng/include/asm-powerpc/immediate.h 2008-04-16 20:49:48.000000000 -0400
    @@ -12,6 +12,16 @@

    #include

    +struct __imv {
    + unsigned long var; /* Identifier variable of the immediate value */
    + unsigned long imv; /*
    + * Pointer to the memory location that holds
    + * the immediate value within the load immediate
    + * instruction.
    + */
    + unsigned char size; /* Type size. */
    +} __attribute__ ((packed));
    +
    /**
    * imv_read - read immediate variable
    * @name: immediate value name
    @@ -19,6 +29,11 @@
    * Reads the value of @name.
    * Optimized version of the immediate.
    * Do not use in __init and __exit functions. Use _imv_read() instead.
    + * Makes sure the 2 bytes update will be atomic by aligning the immediate
    + * value. Use a normal memory read for the 4 bytes immediate because there is no
    + * way to atomically update it without using a seqlock read side, which would
    + * cost more in term of total i-cache and d-cache space than a simple memory
    + * read.
    */
    #define imv_read(name) \
    ({ \
    @@ -40,6 +55,7 @@
    PPC_LONG "%c1, ((1f)-2)\n\t" \
    ".byte 2\n\t" \
    ".previous\n\t" \
    + ".align 2\n\t" \
    "li %0,0\n\t" \
    "1:\n\t" \
    : "=r" (value) \
    @@ -52,4 +68,6 @@
    value; \
    })

    +extern int arch_imv_update(const struct __imv *imv, int early);
    +
    #endif /* _ASM_POWERPC_IMMEDIATE_H */
    Index: linux-2.6-lttng/arch/powerpc/kernel/Makefile
    ================================================== =================
    --- linux-2.6-lttng.orig/arch/powerpc/kernel/Makefile 2008-04-16 12:23:07.000000000 -0400
    +++ linux-2.6-lttng/arch/powerpc/kernel/Makefile 2008-04-16 12:25:44.000000000 -0400
    @@ -45,6 +45,7 @@ obj-$(CONFIG_HIBERNATION) += swsusp.o su
    obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
    obj-$(CONFIG_MODULES) += module_$(CONFIG_WORD_SIZE).o
    obj-$(CONFIG_44x) += cpu_setup_44x.o
    +obj-$(CONFIG_IMMEDIATE) += immediate.o

    ifeq ($(CONFIG_PPC_MERGE),y)


    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  11. Re: [RFC patch 16/27] Immediate Values Support init

    > #else
    >
    > @@ -73,7 +76,9 @@ extern void imv_update_range(const struc
    >
    > static inline void core_imv_update(void) { }
    > static inline void module_imv_update(void) { }
    > -
    > +static inline void imv_unref_core_init(void) { }
    > +static inline void imv_unref_init(struct __imv *begin, struct __imv *end,
    > + void *init, unsigned long init_size) { }
    > #endif


    err.
    When CONFIG_IMMEDIATE is turned off, "struct __imv" is not defined,
    which causes the following warnings.

    include/linux/immediate.h:81: warning: 'struct __imv' declared inside parameter list
    include/linux/immediate.h:81: warning: its scope is only this definition or declaration, \
    which is probably not what you want


    and

    > +extern void imv_unref(struct __imv *begin, struct __imv *end, void *start,
    > + unsigned long size);
    >
    > #else
    >
    > (snip)
    > +static inline void imv_unref_init(struct __imv *begin, struct __imv *end,
    > + void *init, unsigned long init_size) { }
    > #endif


    if CONFIG_IMMEDIATE is on, imv_unref() is declared.
    but if CONFIG_IMMEDIATE is off, imv_unref_init() is declared instead of imv_unref(),
    which causes the following error.


    CC kernel/module.o
    kernel/module.c: In function 'sys_init_module':
    kernel/module.c:2211: error: implicit declaration of function 'imv_unref'
    kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    kernel/module.c:2211: error: 'struct module' has no member named 'num_immediate'
    make[1]: *** [kernel/module.o] Error 1


    and,

    in kernel/module.c#sys_init_module(),
    the immediate member of struct module is used even though CONFIG_IMMEDIATE is off.

    > imv_unref(mod->immediate, mod->immediate + mod->num_immediate,
    > mod->module_init, mod->init_size);


    it cause following error.

    CC kernel/module.o
    kernel/module.c: In function 'sys_init_module':
    kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    kernel/module.c:2211: error: 'struct module' has no member named 'num_immediate'
    make[1]: *** [kernel/module.o] Error 1


    The patch below fixes these.


    Signed-off-by: KOSAKI Motohiro

    ---
    include/linux/immediate.h | 8 ++++++--
    include/linux/module.h | 21 +++++++++++++++++++++
    kernel/module.c | 3 ++-
    3 files changed, 29 insertions(+), 3 deletions(-)

    Index: b/include/linux/immediate.h
    ================================================== =================
    --- a/include/linux/immediate.h 2008-04-19 19:53:03.000000000 +0900
    +++ b/include/linux/immediate.h 2008-04-19 20:04:58.000000000 +0900
    @@ -56,6 +56,10 @@ extern void imv_unref(struct __imv *begi
    * Generic immediate values: a simple, standard, memory load.
    */

    +/* empty declaration for avoid warning */
    +struct __imv {
    +};
    +
    /**
    * imv_read - read immediate variable
    * @name: immediate value name
    @@ -77,8 +81,8 @@ extern void imv_unref(struct __imv *begi
    static inline void core_imv_update(void) { }
    static inline void module_imv_update(void) { }
    static inline void imv_unref_core_init(void) { }
    -static inline void imv_unref_init(struct __imv *begin, struct __imv *end,
    - void *init, unsigned long init_size) { }
    +static inline void imv_unref(struct __imv *begin, struct __imv *end,
    + void *start, unsigned long size) { }
    #endif

    #define DECLARE_IMV(type, name) extern __typeof__(type) name##__imv
    Index: b/include/linux/module.h
    ================================================== =================
    --- a/include/linux/module.h 2008-04-19 19:53:03.000000000 +0900
    +++ b/include/linux/module.h 2008-04-19 20:22:14.000000000 +0900
    @@ -634,4 +634,25 @@ static inline void module_remove_modinfo

    #define __MODULE_STRING(x) __stringify(x)

    +#ifdef CONFIG_IMMEDIATE
    +static inline struct __imv* mod_immediate_address(struct module* mod)
    +{
    + return mod->immediate;
    +}
    +static inline unsigned int mod_num_immediate(struct module* mod)
    +{
    + return mod->num_immediate;
    +}
    +#else
    +static inline struct __imv* mod_immediate_address(struct module* mod)
    +{
    + return NULL;
    +}
    +static inline unsigned int mod_num_immediate(struct module* mod)
    +{
    + return 0;
    +}
    +#endif
    +
    +
    #endif /* _LINUX_MODULE_H */
    Index: b/kernel/module.c
    ================================================== =================
    --- a/kernel/module.c 2008-04-19 19:53:03.000000000 +0900
    +++ b/kernel/module.c 2008-04-19 20:23:51.000000000 +0900
    @@ -2208,7 +2208,8 @@ sys_init_module(void __user *umod,
    /* Drop initial reference. */
    module_put(mod);
    unwind_remove_table(mod->unwind_info, 1);
    - imv_unref(mod->immediate, mod->immediate + mod->num_immediate,
    + imv_unref(mod_immediate_address(mod),
    + mod_immediate_address(mod) + mod_num_immediate(mod),
    mod->module_init, mod->init_size);
    module_free(mod, mod->module_init);
    mod->module_init = NULL;



    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  12. Re: [RFC patch 26/27] Immediate Values - Jump

    > Index: linux-2.6-lttng/include/linux/immediate.h
    > ================================================== =================
    > --- linux-2.6-lttng.orig/include/linux/immediate.h 2008-04-16 14:04:47.000000000 -0400
    > +++ linux-2.6-lttng/include/linux/immediate.h 2008-04-16 14:04:48.000000000 -0400
    > @@ -33,8 +33,7 @@
    > * Internal update functions.
    > */
    > extern void core_imv_update(void);
    > -extern void imv_update_range(const struct __imv *begin,
    > - const struct __imv *end);
    > +extern void imv_update_range(struct __imv *begin, struct __imv *end);
    > extern void imv_unref_core_init(void);
    > extern void imv_unref(struct __imv *begin, struct __imv *end, void *start,
    > unsigned long size);
    > @@ -54,6 +53,14 @@ extern void imv_unref(struct __imv *begi
    > #define imv_read(name) _imv_read(name)
    >
    > /**
    > + * imv_cond - read immediate variable use as condition for if()
    > + * @name: immediate value name
    > + *
    > + * Reads the value of @name.
    > + */
    > +#define imv_cond _imv_read(name)
    > +
    > +/**


    err, missing name argument.



    Signed-off-by: KOSAKI Motohiro

    ---
    include/linux/immediate.h | 2 +-
    1 file changed, 1 insertion(+), 1 deletion(-)

    Index: b/include/linux/immediate.h
    ================================================== =================
    --- a/include/linux/immediate.h 2008-04-19 20:57:19.000000000 +0900
    +++ b/include/linux/immediate.h 2008-04-19 21:09:01.000000000 +0900
    @@ -62,7 +62,7 @@ struct __imv {
    *
    * Reads the value of @name.
    */
    -#define imv_cond _imv_read(name)
    +#define imv_cond(name) _imv_read(name)

    /**
    * imv_set - set immediate variable (with locking)



    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  13. Re: [RFC patch 16/27] Immediate Values Support init

    * KOSAKI Motohiro (kosaki.motohiro@jp.fujitsu.com) wrote:
    > > #else
    > >
    > > @@ -73,7 +76,9 @@ extern void imv_update_range(const struc
    > >
    > > static inline void core_imv_update(void) { }
    > > static inline void module_imv_update(void) { }
    > > -
    > > +static inline void imv_unref_core_init(void) { }
    > > +static inline void imv_unref_init(struct __imv *begin, struct __imv *end,
    > > + void *init, unsigned long init_size) { }
    > > #endif

    >
    > err.
    > When CONFIG_IMMEDIATE is turned off, "struct __imv" is not defined.
    > This causes the following warnings.
    >
    > include/linux/immediate.h:81: warning: 'struct __imv' declared inside parameter list
    > include/linux/immediate.h:81: warning: its scope is only this definition or declaration, \
    > which is probably not what you want
    >
    >
    > and
    >
    > > +extern void imv_unref(struct __imv *begin, struct __imv *end, void *start,
    > > + unsigned long size);
    > >
    > > #else
    > >
    > > (snip)
    > > +static inline void imv_unref_init(struct __imv *begin, struct __imv *end,
    > > + void *init, unsigned long init_size) { }
    > > #endif

    >
    > If CONFIG_IMMEDIATE is on, imv_unref() is declared,
    > but if CONFIG_IMMEDIATE is off, imv_unref_init() is declared instead of imv_unref().
    > This causes the following error.
    >
    >
    > CC kernel/module.o
    > kernel/module.c: In function 'sys_init_module':
    > kernel/module.c:2211: error: implicit declaration of function 'imv_unref'
    > kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    > kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    > kernel/module.c:2211: error: 'struct module' has no member named 'num_immediate'
    > make[1]: *** [kernel/module.o] Error 1
    >
    >
    > and,
    >
    > in kernel/module.c#sys_init_module(),
    > the immediate member of struct module is used even though CONFIG_IMMEDIATE is off.
    >
    > > imv_unref(mod->immediate, mod->immediate + mod->num_immediate,
    > > mod->module_init, mod->init_size);

    >
    > This causes the following error.
    >
    > CC kernel/module.o
    > kernel/module.c: In function 'sys_init_module':
    > kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    > kernel/module.c:2211: error: 'struct module' has no member named 'immediate'
    > kernel/module.c:2211: error: 'struct module' has no member named 'num_immediate'
    > make[1]: *** [kernel/module.o] Error 1
    >
    >
    > The patch below fixes these.
    >
    >
    > Signed-off-by: KOSAKI Motohiro
    >
    > ---
    > include/linux/immediate.h | 8 ++++++--
    > include/linux/module.h | 21 +++++++++++++++++++++
    > kernel/module.c | 3 ++-
    > 3 files changed, 29 insertions(+), 3 deletions(-)
    >
    > Index: b/include/linux/immediate.h
    > ===================================================================
    > --- a/include/linux/immediate.h 2008-04-19 19:53:03.000000000 +0900
    > +++ b/include/linux/immediate.h 2008-04-19 20:04:58.000000000 +0900
    > @@ -56,6 +56,10 @@ extern void imv_unref(struct __imv *begi
    > * Generic immediate values: a simple, standard, memory load.
    > */
    >
    > +/* empty declaration for avoid warning */
    > +struct __imv {
    > +};
    > +


    I prefer to add an ifdef CONFIG_IMMEDIATE to module.c to follow what I
    have already done previously. Defining this empty structure is a bit
    odd. Here is the updated patch.

    Thanks for testing/reporting this.

    Mathieu


    Immediate Values Support init

    Supports placing immediate values in init code

    We need to put the immediate values in RW data section so we can edit them
    before init section unload.

    This code puts NULL pointers in lieu of original pointer referencing init code
    before the init sections are freed, both in the core kernel and in modules.

    TODO : support __exit section.

    Changelog:
    - Fix !CONFIG_IMMEDIATE

    Signed-off-by: Mathieu Desnoyers
    CC: Rusty Russell
    CC: "Frank Ch. Eigler"
    CC: KOSAKI Motohiro
    ---
    Documentation/immediate.txt | 8 ++++----
    include/asm-generic/vmlinux.lds.h | 8 ++++----
    include/asm-powerpc/immediate.h | 4 ++--
    include/asm-x86/immediate.h | 6 +++---
    include/linux/immediate.h | 4 ++++
    include/linux/module.h | 2 +-
    init/main.c | 1 +
    kernel/immediate.c | 31 +++++++++++++++++++++++++++++--
    kernel/module.c | 4 ++++
    9 files changed, 52 insertions(+), 16 deletions(-)

    Index: linux-2.6-lttng/kernel/immediate.c
    ===================================================================
    --- linux-2.6-lttng.orig/kernel/immediate.c 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/kernel/immediate.c 2008-04-19 09:20:53.000000000 -0400
    @@ -22,6 +22,7 @@
    #include
    #include

    +#include
    #include

    /*
    @@ -30,8 +31,8 @@
    static int imv_early_boot_complete;
    static int wrote_text;

    -extern const struct __imv __start___imv[];
    -extern const struct __imv __stop___imv[];
    +extern struct __imv __start___imv[];
    +extern struct __imv __stop___imv[];

    static int stop_machine_imv_update(void *imv_ptr)
    {
    @@ -118,6 +119,8 @@ void imv_update_range(const struct __imv
    int ret;
    for (iter = begin; iter < end; iter++) {
    mutex_lock(&imv_mutex);
    + if (!iter->imv) /* Skip removed __init immediate values */
    + goto skip;
    ret = apply_imv_update(iter);
    if (imv_early_boot_complete && ret)
    printk(KERN_WARNING
    @@ -126,6 +129,7 @@ void imv_update_range(const struct __imv
    "instruction at %p, size %hu\n",
    (void *)iter->imv,
    (void *)iter->var, iter->size);
    +skip:
    mutex_unlock(&imv_mutex);
    }
    }
    @@ -143,6 +147,29 @@ void core_imv_update(void)
    }
    EXPORT_SYMBOL_GPL(core_imv_update);

    +/**
    + * imv_unref
    + *
    + * Deactivate any immediate value reference pointing into the code region in the
    + * range start to start + size.
    + */
    +void imv_unref(struct __imv *begin, struct __imv *end, void *start,
    + unsigned long size)
    +{
    + struct __imv *iter;
    +
    + for (iter = begin; iter < end; iter++)
    + if (iter->imv >= (unsigned long)start
    + && iter->imv < (unsigned long)start + size)
    + iter->imv = 0UL;
    +}
    +
    +void imv_unref_core_init(void)
    +{
    + imv_unref(__start___imv, __stop___imv, __init_begin,
    + (unsigned long)__init_end - (unsigned long)__init_begin);
    +}
    +
    void __init imv_init_complete(void)
    {
    imv_early_boot_complete = 1;
    Index: linux-2.6-lttng/kernel/module.c
    ===================================================================
    --- linux-2.6-lttng.orig/kernel/module.c 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/kernel/module.c 2008-04-19 09:20:55.000000000 -0400
    @@ -2208,6 +2208,10 @@ sys_init_module(void __user *umod,
    /* Drop initial reference. */
    module_put(mod);
    unwind_remove_table(mod->unwind_info, 1);
    +#ifdef CONFIG_IMMEDIATE
    + imv_unref(mod->immediate, mod->immediate + mod->num_immediate,
    + mod->module_init, mod->init_size);
    +#endif
    module_free(mod, mod->module_init);
    mod->module_init = NULL;
    mod->init_size = 0;
    Index: linux-2.6-lttng/include/linux/module.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/linux/module.h 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/include/linux/module.h 2008-04-19 09:20:46.000000000 -0400
    @@ -357,7 +357,7 @@ struct module
    keeping pointers to this stuff */
    char *args;
    #ifdef CONFIG_IMMEDIATE
    - const struct __imv *immediate;
    + struct __imv *immediate;
    unsigned int num_immediate;
    #endif
    #ifdef CONFIG_MARKERS
    Index: linux-2.6-lttng/include/asm-generic/vmlinux.lds.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/asm-generic/vmlinux.lds.h 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/include/asm-generic/vmlinux.lds.h 2008-04-19 09:10:20.000000000 -0400
    @@ -52,7 +52,10 @@
    . = ALIGN(8); \
    VMLINUX_SYMBOL(__start___markers) = .; \
    *(__markers) \
    - VMLINUX_SYMBOL(__stop___markers) = .;
    + VMLINUX_SYMBOL(__stop___markers) = .; \
    + VMLINUX_SYMBOL(__start___imv) = .; \
    + *(__imv) /* Immediate values: pointers */ \
    + VMLINUX_SYMBOL(__stop___imv) = .;

    #define RO_DATA(align) \
    . = ALIGN((align)); \
    @@ -61,9 +64,6 @@
    *(.rodata) *(.rodata.*) \
    *(__vermagic) /* Kernel version magic */ \
    *(__markers_strings) /* Markers: strings */ \
    - VMLINUX_SYMBOL(__start___imv) = .; \
    - *(__imv) /* Immediate values: pointers */ \
    - VMLINUX_SYMBOL(__stop___imv) = .; \
    } \
    \
    .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \
    Index: linux-2.6-lttng/include/linux/immediate.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/linux/immediate.h 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/include/linux/immediate.h 2008-04-19 09:21:34.000000000 -0400
    @@ -46,6 +46,9 @@ struct __imv {
    extern void core_imv_update(void);
    extern void imv_update_range(const struct __imv *begin,
    const struct __imv *end);
    +extern void imv_unref_core_init(void);
    +extern void imv_unref(struct __imv *begin, struct __imv *end, void *start,
    + unsigned long size);

    #else

    @@ -73,6 +76,7 @@ extern void imv_update_range(const struc

    static inline void core_imv_update(void) { }
    static inline void module_imv_update(void) { }
    +static inline void imv_unref_core_init(void) { }

    #endif

    Index: linux-2.6-lttng/init/main.c
    ===================================================================
    --- linux-2.6-lttng.orig/init/main.c 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/init/main.c 2008-04-19 09:10:20.000000000 -0400
    @@ -776,6 +776,7 @@ static void run_init_process(char *init_
    */
    static int noinline init_post(void)
    {
    + imv_unref_core_init();
    free_initmem();
    unlock_kernel();
    mark_rodata_ro();
    Index: linux-2.6-lttng/include/asm-x86/immediate.h
    ===================================================================
    --- linux-2.6-lttng.orig/include/asm-x86/immediate.h 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/include/asm-x86/immediate.h 2008-04-19 09:20:54.000000000 -0400
    @@ -33,7 +33,7 @@
    BUILD_BUG_ON(sizeof(value) > 8); \
    switch (sizeof(value)) { \
    case 1: \
    - asm(".section __imv,\"a\",@progbits\n\t" \
    + asm(".section __imv,\"aw\",@progbits\n\t" \
    _ASM_PTR "%c1, (3f)-%c2\n\t" \
    ".byte %c2\n\t" \
    ".previous\n\t" \
    @@ -45,7 +45,7 @@
    break; \
    case 2: \
    case 4: \
    - asm(".section __imv,\"a\",@progbits\n\t" \
    + asm(".section __imv,\"aw\",@progbits\n\t" \
    _ASM_PTR "%c1, (3f)-%c2\n\t" \
    ".byte %c2\n\t" \
    ".previous\n\t" \
    @@ -60,7 +60,7 @@
    value = name##__imv; \
    break; \
    } \
    - asm(".section __imv,\"a\",@progbits\n\t" \
    + asm(".section __imv,\"aw\",@progbits\n\t" \
    _ASM_PTR "%c1, (3f)-%c2\n\t" \
    ".byte %c2\n\t" \
    ".previous\n\t" \
    Index: linux-2.6-lttng/include/asm-powerpc/immediate.h
    ================================================== =================
    --- linux-2.6-lttng.orig/include/asm-powerpc/immediate.h 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/include/asm-powerpc/immediate.h 2008-04-19 09:20:54.000000000 -0400
    @@ -26,7 +26,7 @@
    BUILD_BUG_ON(sizeof(value) > 8); \
    switch (sizeof(value)) { \
    case 1: \
    - asm(".section __imv,\"a\",@progbits\n\t" \
    + asm(".section __imv,\"aw\",@progbits\n\t" \
    PPC_LONG "%c1, ((1f)-1)\n\t" \
    ".byte 1\n\t" \
    ".previous\n\t" \
    @@ -36,7 +36,7 @@
    : "i" (&name##__imv)); \
    break; \
    case 2: \
    - asm(".section __imv,\"a\",@progbits\n\t" \
    + asm(".section __imv,\"aw\",@progbits\n\t" \
    PPC_LONG "%c1, ((1f)-2)\n\t" \
    ".byte 2\n\t" \
    ".previous\n\t" \
    Index: linux-2.6-lttng/Documentation/immediate.txt
    ===================================================================
    --- linux-2.6-lttng.orig/Documentation/immediate.txt 2008-04-19 09:10:20.000000000 -0400
    +++ linux-2.6-lttng/Documentation/immediate.txt 2008-04-19 09:10:20.000000000 -0400
    @@ -42,10 +42,10 @@ The immediate mechanism supports inserti
    immediate. Immediate values can be put in inline functions, inlined static
    functions, and unrolled loops.

    -If you have to read the immediate values from a function declared as __init or
    -__exit, you should explicitly use _imv_read(), which will fall back on a
    -global variable read. Failing to do so will leave a reference to the __init
    -section after it is freed (it would generate a modpost warning).
    +If you have to read the immediate values from a function declared as __exit, you
    +should explicitly use _imv_read(), which will fall back on a global variable
    +read. Failing to do so will leave a reference to the __exit section in kernel
    +without module unload support. imv_read() in the __init section is supported.

    You can choose to set an initial static value to the immediate by using, for
    instance:

    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  14. Re: [RFC patch 26/27] Immediate Values - Jump

    * KOSAKI Motohiro (kosaki.motohiro@jp.fujitsu.com) wrote:
    > > Index: linux-2.6-lttng/include/linux/immediate.h
    > > ===================================================================
    > > --- linux-2.6-lttng.orig/include/linux/immediate.h 2008-04-16 14:04:47.000000000 -0400
    > > +++ linux-2.6-lttng/include/linux/immediate.h 2008-04-16 14:04:48.000000000 -0400
    > > @@ -33,8 +33,7 @@
    > > * Internal update functions.
    > > */
    > > extern void core_imv_update(void);
    > > -extern void imv_update_range(const struct __imv *begin,
    > > - const struct __imv *end);
    > > +extern void imv_update_range(struct __imv *begin, struct __imv *end);
    > > extern void imv_unref_core_init(void);
    > > extern void imv_unref(struct __imv *begin, struct __imv *end, void *start,
    > > unsigned long size);
    > > @@ -54,6 +53,14 @@ extern void imv_unref(struct __imv *begi
    > > #define imv_read(name) _imv_read(name)
    > >
    > > /**
    > > + * imv_cond - read immediate variable use as condition for if()
    > > + * @name: immediate value name
    > > + *
    > > + * Reads the value of @name.
    > > + */
    > > +#define imv_cond _imv_read(name)
    > > +
    > > +/**

    >
    > err, missing name argument.
    >


    Thanks, I merged it into my patchset.

    Mathieu

    >
    >
    > Signed-off-by: KOSAKI Motohiro
    >
    > ---
    > include/linux/immediate.h | 2 +-
    > 1 file changed, 1 insertion(+), 1 deletion(-)
    >
    > Index: b/include/linux/immediate.h
    > ===================================================================
    > --- a/include/linux/immediate.h 2008-04-19 20:57:19.000000000 +0900
    > +++ b/include/linux/immediate.h 2008-04-19 21:09:01.000000000 +0900
    > @@ -62,7 +62,7 @@ struct __imv {
    > *
    > * Reads the value of @name.
    > */
    > -#define imv_cond _imv_read(name)
    > +#define imv_cond(name) _imv_read(name)
    >
    > /**
    > * imv_set - set immediate variable (with locking)
    >
    >
    >


    --
    Mathieu Desnoyers
    Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  15. Re: [RFC patch 16/27] Immediate Values Support init

    > I prefer to add an ifdef CONFIG_IMMEDIATE to module.c to follow what I
    > have already done previously. Defining this empty structure is a bit
    > odd. Here is the updated patch.
    >
    > Thanks for testing/reporting this.


    OK.
    I tested and confirmed that your latest patch solves the problem I reported.

    Thanks.


    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  16. Re: [RFC patch 23/27] Immediate Values - Powerpc Optimization NMI MCE support

    On Wed, Apr 16, 2008 at 09:24:20PM -0400, Mathieu Desnoyers wrote:
    > * Paul Mackerras (paulus@samba.org) wrote:
    > > Mathieu Desnoyers writes:
    > >
    > > > * Paul Mackerras (paulus@samba.org) wrote:
    > > > > Mathieu Desnoyers writes:
    > > > >
    > > > > > Use an atomic update for immediate values.
    > > > >
    > > > > What is meant by an "atomic" update in this context? AFAICS you are
    > > > > using memcpy, which is not in any way guaranteed to be atomic.
    > > > >
    > > > > Paul.
    > > >
    > > > I expect memcpy to perform the copy in one memory access, given I put a
    > > >
    > > > .align 2
    > > >
    > > > before the 2 bytes instruction. It makes sure the instruction modified
    > > > fits in a single, aligned, memory write.

    > >
    > > My original question was in the context of the powerpc architecture,
    > > where instructions are always 4 bytes long and aligned. So that's not
    > > an issue.
    > >

    >
    > Sorry, I meant 4 byte instruction with 2 bytes immediate value, but we
    > both understand it would be a memory write aligned on 2 bytes since we
    > only change the immediate value.
    >
    > > > Or maybe am I expecting too much from memcpy ?

    > >
    > > I don't think memcpy gives you any such guarantees. It would be quite
    > > within its rights to say "it's only a few bytes, I'll do it byte by
    > > byte".
    > >
    > > If you really want it to be atomic (which I agree is probably a good
    > > idea), I think the best way to do it is to use an asm to generate a
    > > sth (store halfword) instruction to the immediate field (instruction
    > > address + 2). That's on powerpc of course; I don't know what you
    > > would do on other architectures.
    > >

    >
    > A simple
    >
    > *(uint16_t* )destptr = newvalue;
    >
    > seems to generate the "sth" instruction.
    >
    > Do you see any reason why the compiler could choose a different, non
    > atomic assembler primitive ?
    >
    > quoting Documentation/RCU/whatisRCU.txt :
    >
    > "In contrast, RCU-based updaters typically take advantage of the fact
    > that writes to single aligned pointers are atomic on modern CPUs"
    >
    > Paul E. McKenney could say if I am wrong if I assume that any object
    > smaller or equal to the architecture pointer size, aligned on a multiple
    > of its own size, will be read or written atomically.


    There have been CPUs in the past for which this was false. I am not aware
    of any these days, but I would need to ask the architecture maintainers.

    A lot depends on the compiler as well as the CPU, of course. :-(

    Thanx, Paul

    > Therefore, I would suggest the following replacement patch :
    >
    >
    > Immediate Values - Powerpc Optimization NMI MCE support
    >
    > Use an atomic update for immediate values.
    >
    > - Changelog :
    > Use a direct assignment instead of memcpy to be sure the update is
    > atomic.
    >
    > Signed-off-by: Mathieu Desnoyers
    > CC: Rusty Russell
    > CC: Christoph Hellwig
    > CC: Paul Mackerras
    > ---
    > arch/powerpc/kernel/Makefile | 1
    > arch/powerpc/kernel/immediate.c | 70 ++++++++++++++++++++++++++++++++++++++++
    > include/asm-powerpc/immediate.h | 18 ++++++++++
    > 3 files changed, 89 insertions(+)
    >
    > Index: linux-2.6-lttng/arch/powerpc/kernel/immediate.c
    > ===================================================================
    > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    > +++ linux-2.6-lttng/arch/powerpc/kernel/immediate.c 2008-04-16 21:22:29.000000000 -0400
    > @@ -0,0 +1,70 @@
    > +/*
    > + * Powerpc optimized immediate values enabling/disabling.
    > + *
    > + * Mathieu Desnoyers
    > + */
    > +
    > +#include
    > +#include
    > +#include
    > +#include
    > +#include
    > +#include
    > +
    > +#define LI_OPCODE_LEN 2
    > +
    > +/**
    > + * arch_imv_update - update one immediate value
    > + * @imv: pointer of type const struct __imv to update
    > + * @early: early boot (1), normal (0)
    > + *
    > + * Update one immediate value. Must be called with imv_mutex held.
    > + */
    > +int arch_imv_update(const struct __imv *imv, int early)
    > +{
    > +#ifdef CONFIG_KPROBES
    > + kprobe_opcode_t *insn;
    > + /*
    > + * Fail if a kprobe has been set on this instruction.
    > + * (TODO: we could eventually do better and modify all the (possibly
    > + * nested) kprobes for this site if kprobes had an API for this.
    > + */
    > + switch (imv->size) {
    > + case 1: /* The uint8_t points to the 3rd byte of the
    > + * instruction */
    > + insn = (void *)(imv->imv - 1 - LI_OPCODE_LEN);
    > + break;
    > + case 2: insn = (void *)(imv->imv - LI_OPCODE_LEN);
    > + break;
    > + default:
    > + return -EINVAL;
    > + }
    > +
    > + if (unlikely(!early && *insn == BREAKPOINT_INSTRUCTION)) {
    > + printk(KERN_WARNING "Immediate value in conflict with kprobe. "
    > + "Variable at %p, "
    > + "instruction at %p, size %lu\n",
    > + (void *)imv->imv,
    > + (void *)imv->var, imv->size);
    > + return -EBUSY;
    > + }
    > +#endif
    > +
    > + /*
    > + * If the variable and the instruction have the same value, there is
    > + * nothing to do.
    > + */
    > + switch (imv->size) {
    > + case 1: if (*(uint8_t *)imv->imv == *(uint8_t *)imv->var)
    > + return 0;
    > + *(uint8_t *)imv->imv = *(uint8_t *)imv->var;
    > + break;
    > + case 2: if (*(uint16_t *)imv->imv == *(uint16_t *)imv->var)
    > + return 0;
    > + *(uint16_t *)imv->imv = *(uint16_t *)imv->var;
    > + break;
    > + default:return -EINVAL;
    > + }
    > + flush_icache_range(imv->imv, imv->imv + imv->size);
    > + return 0;
    > +}
    > Index: linux-2.6-lttng/include/asm-powerpc/immediate.h
    > ===================================================================
    > --- linux-2.6-lttng.orig/include/asm-powerpc/immediate.h 2008-04-16 12:25:42.000000000 -0400
    > +++ linux-2.6-lttng/include/asm-powerpc/immediate.h 2008-04-16 20:49:48.000000000 -0400
    > @@ -12,6 +12,16 @@
    >
    > #include
    >
    > +struct __imv {
    > + unsigned long var; /* Identifier variable of the immediate value */
    > + unsigned long imv; /*
    > + * Pointer to the memory location that holds
    > + * the immediate value within the load immediate
    > + * instruction.
    > + */
    > + unsigned char size; /* Type size. */
    > +} __attribute__ ((packed));
    > +
    > /**
    > * imv_read - read immediate variable
    > * @name: immediate value name
    > @@ -19,6 +29,11 @@
    > * Reads the value of @name.
    > * Optimized version of the immediate.
    > * Do not use in __init and __exit functions. Use _imv_read() instead.
    > + * Makes sure the 2 bytes update will be atomic by aligning the immediate
    > + * value. Use a normal memory read for the 4 bytes immediate because there is no
    > + * way to atomically update it without using a seqlock read side, which would
    > + * cost more in term of total i-cache and d-cache space than a simple memory
    > + * read.
    > */
    > #define imv_read(name) \
    > ({ \
    > @@ -40,6 +55,7 @@
    > PPC_LONG "%c1, ((1f)-2)\n\t" \
    > ".byte 2\n\t" \
    > ".previous\n\t" \
    > + ".align 2\n\t" \
    > "li %0,0\n\t" \
    > "1:\n\t" \
    > : "=r" (value) \
    > @@ -52,4 +68,6 @@
    > value; \
    > })
    >
    > +extern int arch_imv_update(const struct __imv *imv, int early);
    > +
    > #endif /* _ASM_POWERPC_IMMEDIATE_H */
    > Index: linux-2.6-lttng/arch/powerpc/kernel/Makefile
    > ===================================================================
    > --- linux-2.6-lttng.orig/arch/powerpc/kernel/Makefile 2008-04-16 12:23:07.000000000 -0400
    > +++ linux-2.6-lttng/arch/powerpc/kernel/Makefile 2008-04-16 12:25:44.000000000 -0400
    > @@ -45,6 +45,7 @@ obj-$(CONFIG_HIBERNATION) += swsusp.o su
    > obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
    > obj-$(CONFIG_MODULES) += module_$(CONFIG_WORD_SIZE).o
    > obj-$(CONFIG_44x) += cpu_setup_44x.o
    > +obj-$(CONFIG_IMMEDIATE) += immediate.o
    >
    > ifeq ($(CONFIG_PPC_MERGE),y)
    >
    >
    > --
    > Mathieu Desnoyers
    > Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
    > OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68

    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

+ Reply to Thread
Page 2 of 2 FirstFirst 1 2