[RFC][PATCH 1/1] MAZE: Mazed processes monitor - Kernel


  1. [RFC][PATCH 1/1] MAZE: Mazed processes monitor

    This patch is for linux-2.6.25-mm1 and has been tested on x86_64 hardware.
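
    Mazed (runaway) processes are registered and inspected through
    /proc/maze/entries.  Example usage (values illustrative; a write takes
    "<pid> <soft_limit> <hard_limit> <soft_signal> <hard_signal>", with the
    limits counted in cputime ticks, e.g. SIGXCPU at the soft limit and
    SIGKILL at the hard limit):

        # echo "1234 100 200 24 9" > /proc/maze/entries
        # cat /proc/maze/entries
        pid: 1234 count:     3 soft limit:   100 hard limit:   200 soft signal:24 hard signal: 9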

    Signed-off-by: Hirofumi Nakagawa
    ---
    include/linux/maze.h | 92 +++++++++
    include/linux/sched.h | 3
    init/Kconfig | 10 +
    init/main.c | 2
    kernel/Makefile | 1
    kernel/exit.c | 2
    kernel/fork.c | 2
    kernel/maze.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++
    kernel/sched.c | 3
    9 files changed, 581 insertions(+)

    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-2.6.25-mm1-maze/include/linux/maze.h 2008-05-13 17:52:23.000000000 +0900
    @@ -0,0 +1,92 @@
    +/*
    + * Copyright (C) 2007-2008 MIRACLE LINUX Corp.
    + *
    + * Written by Hirofumi Nakagawa
    + *
    + * This program is free software; you can redistribute it and/or
    + * modify it under the terms of the GNU General Public License as
    + * published by the Free Software Foundation, version 2.
    + */
    +#ifndef _LINUX_MAZE_H
    +#define _LINUX_MAZE_H
    +
    +#ifdef __KERNEL__
    +
    +#include <linux/list.h>
    +#include <linux/preempt.h>
    +
    +struct maze_context {
    + struct task_struct *task;
    + int pid;
    +
    + /* The stime of the task when the maze count was last reset */
    + cputime_t reset_stime;
    + /* The utime of the task when the maze count was last reset */
    + cputime_t reset_utime;
    +
    + /* The soft limit of the maze count */
    + unsigned long soft_limit;
    + /* The hard limit of the maze count */
    + unsigned long hard_limit;
    +
    + /* Signal sent when the maze count reaches the soft limit */
    + int soft_signal;
    + /* Signal sent when the maze count reaches the hard limit */
    + int hard_signal;
    +
    +#define MAZE_SENT_SOFT_SIG 1
    +#define MAZE_SENT_HARD_SIG 2
    + /* Flags recording which of the above signals have been sent */
    + unsigned long flags;
    +
    + /* Set to 1 while this context is linked on a per-cpu maze_queue */
    + int enqueue;
    + /* The cpu whose maze_queue this context is linked on */
    + int enqueue_cpu;
    +
    + /* Set to 1 once the preempt_notifier has been registered */
    + atomic_t registered_notifier;
    +
    + struct preempt_notifier notifier;
    +
    + /* Linked on a per-cpu maze_queue */
    + struct list_head queue;
    + /* Linked on the global maze_list */
    + struct list_head list;
    +};
    +
    +#ifdef CONFIG_MAZE
    +
    +/* Called from start_kernel() */
    +extern void maze_init_early(void);
    +
    +/* Called from do_exit() */
    +extern void maze_exit(struct task_struct *task);
    +
    +/* Called from copy_process() */
    +extern void maze_fork(struct task_struct *task);
    +
    +/* Called from sys_sched_yield() */
    +extern void maze_sched_yield(struct task_struct *task);
    +
    +#else /* !CONFIG_MAZE */
    +
    +static inline void maze_init_early(void)
    +{
    +}
    +
    +static inline void maze_exit(struct task_struct *task)
    +{
    +}
    +
    +static inline void maze_fork(struct task_struct *task)
    +{
    +}
    +
    +static inline void maze_sched_yield(struct task_struct *task)
    +{
    +}
    +#endif /* CONFIG_MAZE */
    +
    +#endif /* __KERNEL__ */
    +#endif /* _LINUX_MAZE_H */
    --- linux-2.6.25-mm1-maze.orig/include/linux/sched.h 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/include/linux/sched.h 2008-05-13 17:52:23.000000000 +0900
    @@ -1254,6 +1254,9 @@ struct task_struct {
    /* cg_list protected by css_set_lock and tsk->alloc_lock */
    struct list_head cg_list;
    #endif
    +#ifdef CONFIG_MAZE
    + struct maze_context *maze_context;
    +#endif
    #ifdef CONFIG_FUTEX
    struct robust_list_head __user *robust_list;
    #ifdef CONFIG_COMPAT
    --- linux-2.6.25-mm1-maze.orig/init/Kconfig 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/init/Kconfig 2008-05-13 17:52:23.000000000 +0900
    @@ -371,6 +371,16 @@ config RESOURCE_COUNTERS
    infrastructure that works with cgroups
    depends on CGROUPS

    +config MAZE
    + bool "MAZE monitor support"
    + select PREEMPT_NOTIFIERS
    + help
    + MAZE monitors "mazed" (runaway) processes that consume excessive
    + CPU cycles, a CGL (Carrier Grade Linux) requirement (AVL.14.0).
    + MAZE detects such processes and sends them the configured signals
    + so that they can be terminated.
    +
    + Say N if unsure.
    +
    config MM_OWNER
    bool

    --- linux-2.6.25-mm1-maze.orig/init/main.c 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/init/main.c 2008-05-13 17:52:23.000000000 +0900
    @@ -61,6 +61,7 @@
    #include
    #include
    #include
    +#include <linux/maze.h>

    #include
    #include
    @@ -543,6 +544,7 @@ asmlinkage void __init start_kernel(void
    boot_init_stack_canary();
    debug_objects_early_init();
    cgroup_init_early();
    + maze_init_early();

    local_irq_disable();
    early_boot_irqs_off();
    --- linux-2.6.25-mm1-maze.orig/kernel/Makefile 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/kernel/Makefile 2008-05-13 17:52:23.000000000 +0900
    @@ -80,6 +80,7 @@ obj-$(CONFIG_MARKERS) += marker.o
    obj-$(CONFIG_LATENCYTOP) += latencytop.o
    obj-$(CONFIG_FTRACE) += trace/
    obj-$(CONFIG_TRACING) += trace/
    +obj-$(CONFIG_MAZE) += maze.o

    ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
    # According to Alan Modra , the -fno-omit-frame-pointer is
    --- linux-2.6.25-mm1-maze.orig/kernel/exit.c 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/kernel/exit.c 2008-05-13 17:52:23.000000000 +0900
    @@ -44,6 +44,7 @@
    #include
    #include
    #include
    +#include <linux/maze.h>

    #include
    #include
    @@ -1058,6 +1059,7 @@ NORET_TYPE void do_exit(long code)
    __exit_fs(tsk);
    check_stack_usage();
    exit_thread();
    + maze_exit(tsk);
    cgroup_exit(tsk, 1);
    exit_keys(tsk);

    --- linux-2.6.25-mm1-maze.orig/kernel/fork.c 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/kernel/fork.c 2008-05-13 17:52:23.000000000 +0900
    @@ -53,6 +53,7 @@
    #include
    #include
    #include
    +#include <linux/maze.h>

    #include
    #include
    @@ -1363,6 +1364,7 @@ static struct task_struct *copy_process(
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    cgroup_post_fork(p);
    + maze_fork(p);
    return p;

    bad_fork_free_pid:
    --- /dev/null 1970-01-01 00:00:00.000000000 +0000
    +++ linux-2.6.25-mm1-maze/kernel/maze.c 2008-05-13 20:07:55.000000000 +0900
    @@ -0,0 +1,466 @@
    +/*
    + * Copyright (C) 2007-2008 MIRACLE LINUX Corp.
    + *
    + * Written by Hirofumi Nakagawa
    + *
    + * This program is free software; you can redistribute it and/or
    + * modify it under the terms of the GNU General Public License as
    + * published by the Free Software Foundation, version 2.
    + */
    +#include <linux/module.h>
    +#include <linux/kernel.h>
    +#include <linux/init.h>
    +#include <linux/sched.h>
    +#include <linux/list.h>
    +#include <linux/spinlock.h>
    +#include <linux/percpu.h>
    +#include <linux/timer.h>
    +#include <linux/slab.h>
    +#include <linux/proc_fs.h>
    +#include <linux/seq_file.h>
    +#include <linux/capability.h>
    +#include <linux/uaccess.h>
    +#include <linux/maze.h>
    +
    +#define ERRPRINT(str, ...) printk(KERN_ERR "ERROR [%s] " str, __func__, ##__VA_ARGS__)
    +
    +/* The maze directory in procfs */
    +static struct proc_dir_entry *maze_proc_dir;
    +
    +static int maze_initialized;
    +
    +static struct list_head maze_list;
    +static spinlock_t maze_list_lock;
    +
    +static int maze_timer_interval = 10;
    +
    +DEFINE_PER_CPU(struct list_head, maze_queue);
    +DEFINE_PER_CPU(spinlock_t, maze_queue_lock);
    +
    +DEFINE_PER_CPU(struct timer_list, maze_timer);
    +
    +static inline void reset_maze_count(struct maze_context *context)
    +{
    + context->reset_utime = context->task->utime;
    + context->reset_stime = context->task->stime;
    + context->flags = 0;
    +}
    +
    +static inline cputime_t get_maze_count(struct maze_context *context)
    +{
    + return (context->task->utime - context->reset_utime) +
    + (context->task->stime - context->reset_stime);
    +}
    +
    +/*
    + * Called from start_kernel()
    + */
    +void maze_init_early(void)
    +{
    + init_task.maze_context = NULL;
    +}
    +
    +/*
    + * Called from do_exit()
    + */
    +void maze_exit(struct task_struct *task)
    +{
    + unsigned long flags;
    +
    + task_lock(task);
    + if (task->maze_context) {
    + spin_lock_irqsave(&per_cpu(maze_queue_lock,
    + task->maze_context->enqueue_cpu), flags);
    + if (task->maze_context->enqueue)
    + list_del(&task->maze_context->queue);
    + spin_unlock_irqrestore(&per_cpu(maze_queue_lock,
    + task->maze_context->enqueue_cpu), flags);
    +
    + spin_lock(&maze_list_lock);
    + list_del(&task->maze_context->list);
    + spin_unlock(&maze_list_lock);
    +
    + kfree(task->maze_context);
    + task->maze_context = NULL;
    + }
    + task_unlock(task);
    +}
    +
    +static inline void copy_limit_and_signal(struct maze_context *to,
    + struct maze_context *from)
    +{
    + to->soft_limit = from->soft_limit;
    + to->hard_limit = from->hard_limit;
    + to->soft_signal = from->soft_signal;
    + to->hard_signal = from->hard_signal;
    +}
    +
    +/*
    + * Must be called under task_lock().
    + */
    +static inline void enqueue(struct maze_context *context)
    +{
    + unsigned long flags;
    +
    + spin_lock_irqsave(&__get_cpu_var(maze_queue_lock), flags);
    + if (!context->enqueue) {
    + list_add(&context->queue, &__get_cpu_var(maze_queue));
    + context->enqueue_cpu = smp_processor_id();
    + context->enqueue = 1;
    + }
    + spin_unlock_irqrestore(&__get_cpu_var(maze_queue_lock), flags);
    +}
    +
    +static void sched_in_event(struct preempt_notifier *notifier, int cpu)
    +{
    + task_lock(current);
    + if (current->maze_context) {
    + if (current->state != TASK_RUNNING)
    + reset_maze_count(current->maze_context);
    + enqueue(current->maze_context);
    + }
    + task_unlock(current);
    +}
    +
    +static void sched_out_event(struct preempt_notifier *notifier,
    + struct task_struct *next)
    +{
    + task_lock(next);
    + if (next->maze_context)
    + enqueue(next->maze_context);
    + task_unlock(next);
    +}
    +
    +static struct preempt_ops preempt_ops = {
    + .sched_in = sched_in_event,
    + .sched_out = sched_out_event,
    +};
    +
    +/*
    + * Called from copy_process()
    + */
    +void maze_fork(struct task_struct *task)
    +{
    + task->maze_context = NULL;
    +
    + if (!current->maze_context)
    + return;
    +
    + task_lock(task);
    +
    + task->maze_context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);
    + if (unlikely(!task->maze_context)) {
    + ERRPRINT("fail to alloc maze_context.\n");
    + goto task_unlock;
    + }
    + task->maze_context->task = task;
    + task->maze_context->pid = task->pid;
    +
    + copy_limit_and_signal(task->maze_context, current->maze_context);
    + preempt_notifier_init(&task->maze_context->notifier, &preempt_ops);
    +
    + spin_lock(&maze_list_lock);
    + list_add(&task->maze_context->list, &maze_list);
    + spin_unlock(&maze_list_lock);
    +
    +task_unlock:
    + task_unlock(task);
    +}
    +
    +/*
    + * Called from sys_sched_yield()
    + */
    +void maze_sched_yield(struct task_struct *task)
    +{
    + if (task->maze_context)
    + reset_maze_count(task->maze_context);
    +}
    +
    +static void soft_limit_event(struct task_struct *task)
    +{
    + send_sig(task->maze_context->soft_signal, task, 1);
    + task->maze_context->flags |= MAZE_SENT_SOFT_SIG;
    +}
    +
    +static void hard_limit_event(struct task_struct *task)
    +{
    + send_sig(task->maze_context->hard_signal, task, 1);
    + task->maze_context->flags |= MAZE_SENT_HARD_SIG;
    +}
    +
    +static inline void set_preempt_notifir(struct maze_context *context)
    +{
    + if (!atomic_xchg(&context->registered_notifier, 1))
    + preempt_notifier_register(&context->notifier);
    +}
    +
    +static int add_maze_context(struct maze_context *context)
    +{
    + struct task_struct *task;
    + int ret = 0;
    +
    + rcu_read_lock();
    +
    + task = find_task_by_pid(context->pid);
    + if (unlikely(!task))
    + goto read_unlock;
    +
    + task_lock(task);
    +
    + if (!task->maze_context) {
    + task->maze_context = context;
    + context->task = task;
    +
    + spin_lock(&maze_list_lock);
    + list_add(&context->list, &maze_list);
    + spin_unlock(&maze_list_lock);
    +
    + reset_maze_count(context);
    +
    + preempt_notifier_init(&context->notifier, &preempt_ops);
    + if (current == task)
    + set_preempt_notifir(context);
    + } else {
    + copy_limit_and_signal(task->maze_context, context);
    + ret = 1;
    + }
    +
    + task_unlock(task);
    +read_unlock:
    + rcu_read_unlock();
    + return ret;
    +}
    +
    +static int build_maze_context(const char *read_line, struct maze_context *context)
    +{
    + long soft_limit, hard_limit;
    + int soft_signal, hard_signal;
    + int pid, res;
    +
    + res = sscanf(read_line, "%d %ld %ld %d %d", &pid, &soft_limit,
    + &hard_limit, &soft_signal, &hard_signal);
    +
    + if (res != 5 || pid < 0 ||
    + soft_limit < 0 || hard_limit < 0 ||
    + soft_signal < 0 || hard_signal < 0)
    + return -EINVAL;
    +
    + context->pid = pid;
    + context->soft_limit = soft_limit;
    + context->hard_limit = hard_limit;
    + context->soft_signal = soft_signal;
    + context->hard_signal = hard_signal;
    +
    + return 0;
    +}
    +
    +static void timer_handler(unsigned long data);
    +
    +/*
    + * Setup softirq timer
    + */
    +static void continue_timer(int cpu)
    +{
    + struct timer_list *t;
    +
    + t = &per_cpu(maze_timer, cpu);
    + t->function = timer_handler;
    + t->expires = jiffies + maze_timer_interval;
    + init_timer(t);
    + add_timer_on(t, cpu);
    +}
    +
    +static inline void check_limit(struct maze_context *context)
    +{
    + cputime_t t = get_maze_count(context);
    +
    + if (!(context->flags & MAZE_SENT_SOFT_SIG)) {
    + if (t >= context->soft_limit)
    + soft_limit_event(context->task);
    + } else if (!(context->flags & MAZE_SENT_HARD_SIG)) {
    + if (t >= context->hard_limit)
    + hard_limit_event(context->task);
    + }
    +}
    +
    +static void timer_handler(unsigned long data)
    +{
    + struct maze_context *context, *next;
    +
    + spin_lock(&__get_cpu_var(maze_queue_lock));
    +
    + if (current->maze_context) {
    + set_preempt_notifir(current->maze_context);
    + if (!current->maze_context->enqueue)
    + check_limit(current->maze_context);
    + }
    +
    + if (!list_empty(&__get_cpu_var(maze_queue))) {
    + list_for_each_entry_safe (context, next,
    + &__get_cpu_var(maze_queue), queue) {
    + check_limit(context);
    + list_del(&context->queue);
    + context->enqueue = 0;
    + }
    + }
    +
    + spin_unlock(&__get_cpu_var(maze_queue_lock));
    +
    + continue_timer(smp_processor_id());
    +}
    +
    +
    +static int maze_entries_file_show(struct seq_file *seq, void *nouse)
    +{
    + struct maze_context *context;
    +
    + spin_lock(&maze_list_lock);
    +
    + list_for_each_entry(context, &maze_list, list) {
    + seq_printf(seq, "pid:%5d ", context->pid);
    + seq_printf(seq, "count:%6ld ", get_maze_count(context));
    + seq_printf(seq, "soft limit:%6ld ", context->soft_limit);
    + seq_printf(seq, "hard limit:%6ld ", context->hard_limit);
    + seq_printf(seq, "soft signal:%2d ", context->soft_signal);
    + seq_printf(seq, "hard signal:%2d ", context->hard_signal);
    + seq_printf(seq, "\n");
    + }
    +
    + spin_unlock(&maze_list_lock);
    +
    + return 0;
    +}
    +
    +/*
    + * Open operation of /proc/maze/entries
    + */
    +static int maze_entries_file_open(struct inode *inode, struct file *file)
    +{
    + return single_open(file, maze_entries_file_show, NULL);
    +}
    +
    +/*
    + * Release operation of /proc/maze/entries
    + */
    +static int maze_entries_file_release(struct inode *inode, struct file *file)
    +{
    + return single_release(inode, file);
    +}
    +
    +/*
    + * Write operation of /proc/maze/entries
    + */
    +static ssize_t maze_entries_file_write(struct file *file,
    + const char __user *buffer,
    + size_t count, loff_t *ppos)
    +{
    + struct maze_context *context;
    + char read_line[32];
    + int ret;
    +
    + if (!capable(CAP_SYS_ADMIN))
    + return -EPERM;
    +
    + if (count > sizeof(read_line) - 1)
    + return -EINVAL;
    +
    + if (copy_from_user(read_line, buffer, count))
    + return -EFAULT;
    +
    + read_line[count] = '\0';
    +
    + context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);
    + if (unlikely(!context)) {
    + ERRPRINT("fail to alloc maze_context.\n");
    + return -ENOMEM;
    + }
    +
    + ret = build_maze_context(read_line, context);
    + if (ret) {
    + kfree(context);
    + return ret;
    + }
    +
    + if (add_maze_context(context))
    + kfree(context);
    +
    + return count;
    +}
    +
    +static struct file_operations maze_entries_file_ops = {
    + .owner = THIS_MODULE,
    + .open = maze_entries_file_open,
    + .read = seq_read,
    + .write = maze_entries_file_write,
    + .llseek = seq_lseek,
    + .release = maze_entries_file_release,
    +};
    +
    +/*
    + * Creating /proc/maze/
    + */
    +static inline int init_dir(void)
    +{
    + maze_proc_dir =
    + create_proc_entry("maze",
    + S_IFDIR | S_IRUGO | S_IXUGO,
    + NULL);
    +
    + if (!maze_proc_dir)
    + panic("fail to create /proc/maze\n");
    +
    + return 0;
    +}
    +
    +
    +/*
    + * Creating /proc/maze/entries
    + */
    +static inline int init_entries(void)
    +{
    + struct proc_dir_entry *p;
    +
    + p = create_proc_entry("entries", S_IRUGO, maze_proc_dir);
    + if (p == NULL)
    + panic("fail to create /proc/maze/entries\n");
    +
    + p->proc_fops = &maze_entries_file_ops;
    +
    + return 0;
    +}
    +
    +/*
    + * Initializing maze procfs
    + */
    +static void __init init_proc(void)
    +{
    + init_dir();
    + init_entries();
    +}
    +
    +static void __init init_cpu(int cpu)
    +{
    + spin_lock_init(&per_cpu(maze_queue_lock, cpu));
    + INIT_LIST_HEAD(&per_cpu(maze_queue, cpu));
    +
    + /* Setup softirq timer */
    + continue_timer(cpu);
    +}
    +
    +static int __init maze_init(void)
    +{
    + int cpu;
    +
    + printk(KERN_INFO "Maze: Initializing\n");
    + maze_initialized = 1;
    + init_proc();
    +
    + spin_lock_init(&maze_list_lock);
    + INIT_LIST_HEAD(&maze_list);
    +
    + for_each_online_cpu(cpu)
    + init_cpu(cpu);
    +
    + return 0;
    +}
    +late_initcall(maze_init);
    --- linux-2.6.25-mm1-maze.orig/kernel/sched.c 2008-05-13 17:40:43.000000000 +0900
    +++ linux-2.6.25-mm1-maze/kernel/sched.c 2008-05-13 17:52:23.000000000 +0900
    @@ -68,6 +68,7 @@
    #include
    #include
    #include
    +#include <linux/maze.h>

    #include
    #include
    @@ -5157,6 +5158,8 @@ asmlinkage long sys_sched_yield(void)
    _raw_spin_unlock(&rq->lock);
    preempt_enable_no_resched();

    + maze_sched_yield(current);
    +
    schedule();

    return 0;




  2. Re: [RFC][PATCH 1/1] MAZE: Mazed processes monitor

    On Tue, 13 May 2008 20:47:32 +0900 Hirofumi Nakagawa wrote:

    > This patch is for linux-2.6.25-mm1 and has been tested on x86_64 hardware.
    >
    > ...
    >
    > +static struct proc_dir_entry *maze_proc_dir;
    > +
    > +static int maze_initialized;
    > +
    > +static struct list_head maze_list;
    > +static spinlock_t maze_list_lock;
    > +
    > +static int maze_timer_interval = 10;
    > +
    > +DEFINE_PER_CPU(struct list_head, maze_queue);
    > +DEFINE_PER_CPU(spinlock_t, maze_queue_lock);
    > +
    > +DEFINE_PER_CPU(struct timer_list, maze_timer);


    These three can be static?

    Please document the data structures here at their definition sites with
    code comments: what they are used for, what their locking rules are.
    Also, a per-cpu spinlock is somewhat unusual and a few words describing
    why it is needed would be useful.
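
    For the queue, e.g. something along these lines (wording illustrative):

        /*
         * Per-cpu list of contexts whose tasks have run on this cpu since
         * the last maze_timer tick.  Protected by the per-cpu
         * maze_queue_lock; a lock is needed even though the list is
         * per-cpu because maze_exit() can unlink a context from another
         * cpu's queue.
         */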

    > +static inline void reset_maze_count(struct maze_context *context)
    > +{
    > + context->reset_utime = context->task->utime;
    > + context->reset_stime = context->task->stime;
    > + context->flags = 0;
    > +}
    > +
    > +static inline cputime_t get_maze_count(struct maze_context *context)
    > +{
    > + return (context->task->utime - context->reset_utime) +
    > + (context->task->stime - context->reset_stime);
    > +}
    > +
    > +/*
    > + * Called from start_kernel()
    > + */
    > +void maze_init_early(void)
    > +{
    > + init_task.maze_context = NULL;
    > +}
    > +
    > +/*
    > + * Called from do_exit()
    > + */
    > +void maze_exit(struct task_struct *task)
    > +{
    > + unsigned long flags;
    > +
    > + task_lock(task);
    > + if (task->maze_context) {
    > + spin_lock_irqsave(&per_cpu(maze_queue_lock,
    > + task->maze_context->enqueue_cpu), flags);


    There's a moderate amount of whitespace brokenness in here.
    scripts/checkpatch.pl detects some of it.

    > + if (task->maze_context->enqueue)
    > + list_del(&task->maze_context->queue);
    > + spin_unlock_irqrestore(&per_cpu(maze_queue_lock,
    > + task->maze_context->enqueue_cpu), flags);
    > +
    > + spin_lock(&maze_list_lock);
    > + list_del(&task->maze_context->list);
    > + spin_unlock(&maze_list_lock);
    > +
    > + kfree(task->maze_context);
    > + task->maze_context = NULL;
    > + }
    > + task_unlock(task);
    > +}
    >
    > ...
    >
    > +/*
    > + * Must be called under task_lock().
    > + */
    > +static inline void enqueue(struct maze_context *context)
    > +{
    > + unsigned long flags;
    > +
    > + spin_lock_irqsave(&__get_cpu_var(maze_queue_lock), flags);
    > + if (!context->enqueue) {
    > + list_add(&context->queue, &__get_cpu_var(maze_queue));
    > + context->enqueue_cpu = smp_processor_id();
    > + context->enqueue = 1;
    > + }
    > + spin_unlock_irqrestore(&__get_cpu_var(maze_queue_lock), flags);
    > +}


    The code uses `inline' too much, and is probably slower as a result.
    Most or all of them should be removed, please.

    > +static void sched_in_event(struct preempt_notifier *notifier, int cpu)
    > +{
    > + task_lock(current);
    > + if (current->maze_context) {
    > + if (current->state != TASK_RUNNING)
    > + reset_maze_count(current->maze_context);
    > + enqueue(current->maze_context);
    > + }
    > + task_unlock(current);
    > +}


    Please update the description of task_lock() in sched.h when adding new
    rules to it. I assume that it protects task_struct.maze_context.
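
    E.g. appending one line to the locking comment above task_lock()
    would do (wording illustrative):

         * Also protects ->maze_context.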

    I'm a little surprised to see a global lock being taken under
    task_lock(): generally I think of task_lock() as a low-level thing
    which protects only task internals. I assume this code was runtime
    tested with lockdep enabled?

    But that doesn't mean that the chosen lock ranking is desirable. From
    a quick look the locking does seem logical.

    > +static void sched_out_event(struct preempt_notifier *notifier,
    > + struct task_struct *next)
    > +{
    > + task_lock(next);
    > + if (next->maze_context)
    > + enqueue(next->maze_context);
    > + task_unlock(next);
    > +}
    > +
    > +static struct preempt_ops preempt_ops = {
    > + .sched_in = sched_in_event,
    > + .sched_out = sched_out_event,
    > +};
    > +
    > +/*
    > + * Called from copy_process()
    > + */


    Some additional description here would be useful. What does the function do?

    > +void maze_fork(struct task_struct *task)
    > +{
    > + task->maze_context = NULL;
    > +
    > + if (!current->maze_context)
    > + return;
    > +
    > + task_lock(task);
    > +
    > + task->maze_context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);


    GFP_ATOMIC is weak, and can fail. With some reorganisation this could
    be moved outside the lock and switched to GFP_KERNEL.
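
    Something like this (untested sketch):

        void maze_fork(struct task_struct *task)
        {
                struct maze_context *context;

                task->maze_context = NULL;
                if (!current->maze_context)
                        return;

                /* copy_process() has dropped tasklist_lock by this point,
                 * so the allocation may sleep. */
                context = kzalloc(sizeof(*context), GFP_KERNEL);
                if (!context) {
                        ERRPRINT("fail to alloc maze_context.\n");
                        return;
                }
                context->task = task;
                context->pid = task->pid;
                preempt_notifier_init(&context->notifier, &preempt_ops);

                task_lock(task);
                copy_limit_and_signal(context, current->maze_context);
                task->maze_context = context;
                task_unlock(task);

                spin_lock(&maze_list_lock);
                list_add(&context->list, &maze_list);
                spin_unlock(&maze_list_lock);
        }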

    > + if (unlikely(!task->maze_context)) {
    > + ERRPRINT("fail to alloc maze_context.\n");
    > + goto task_unlock;
    > + }
    > + task->maze_context->task = task;
    > + task->maze_context->pid = task->pid;
    > +
    > + copy_limit_and_signal(task->maze_context, current->maze_context);
    > + preempt_notifier_init(&task->maze_context->notifier, &preempt_ops);
    > +
    > + spin_lock(&maze_list_lock);
    > + list_add(&task->maze_context->list, &maze_list);
    > + spin_unlock(&maze_list_lock);
    > +
    > +task_unlock:
    > + task_unlock(task);
    > +}
    >
    > ...
    >
    > +static inline void set_preempt_notifir(struct maze_context *context)


    "notifier"

    > +{
    > + if (!atomic_xchg(&context->registered_notifier, 1))
    > + preempt_notifier_register(&context->notifier);
    > +}
    > +
    > +static int add_maze_context(struct maze_context *context)
    > +{
    > + struct task_struct *task;
    > + int ret = 0;
    > +
    > + rcu_read_lock();
    > +
    > + task = find_task_by_pid(context->pid);
    > + if (unlikely(!task))
    > + goto read_unlock;
    > +
    > + task_lock(task);
    > +
    > + if (!task->maze_context) {
    > + task->maze_context = context;
    > + context->task = task;
    > +
    > + spin_lock(&maze_list_lock);
    > + list_add(&context->list, &maze_list);
    > + spin_unlock(&maze_list_lock);
    > +
    > + reset_maze_count(context);
    > +
    > + preempt_notifier_init(&context->notifier, &preempt_ops);
    > + if (current == task)
    > + set_preempt_notifir(context);


    Please use only hard tabs for code indenting.

    > + } else {
    > + copy_limit_and_signal(task->maze_context, context);
    > + ret = 1;
    > + }
    > +
    > + task_unlock(task);
    > +read_unlock:
    > + rcu_read_unlock();
    > + return ret;
    > +}
    > +
    > +static int build_maze_context(const char *read_line, struct maze_context *context)
    > +{
    > + long soft_limit, hard_limit;
    > + int soft_signal, hard_signal;
    > + int pid, res;
    > +
    > + res = sscanf(read_line, "%d %ld %ld %d %d", &pid, &soft_limit,
    > + &hard_limit, &soft_signal, &hard_signal);
    > +
    > + if (res != 5 || pid < 0 ||
    > + soft_limit < 0 || hard_limit < 0 ||
    > + soft_signal < 0 || hard_signal < 0)
    > + return -EINVAL;
    > +
    > + context->pid = pid;
    > + context->soft_limit = soft_limit;
    > + context->hard_limit = hard_limit;
    > + context->soft_signal = soft_signal;
    > + context->hard_signal = hard_signal;
    > +
    > + return 0;
    > +}


    pids are not unique in a containerised system. What are the
    implications of this?
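
    (Switching to find_task_by_vpid(), which resolves the pid in the
    caller's pid namespace, would be one step, e.g.:

        task = find_task_by_vpid(context->pid);

    though the pid stored in the context, and displayed through
    /proc/maze/entries, would still be ambiguous across namespaces.)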

    > +static void timer_handler(unsigned long data);
    > +
    > +/*
    > + * Setup softirq timer
    > + */
    > +static void continue_timer(int cpu)
    > +{
    > + struct timer_list *t;
    > +
    > + t = &per_cpu(maze_timer, cpu);
    > + t->function = timer_handler;
    > + t->expires = jiffies + maze_timer_interval;
    > + init_timer(t);


    setup_timer()
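
    I.e., something like (untested):

        t = &per_cpu(maze_timer, cpu);
        setup_timer(t, timer_handler, 0);
        t->expires = jiffies + maze_timer_interval;
        add_timer_on(t, cpu);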

    > + add_timer_on(t, cpu);
    > +}
    >
    > ...
    >
    > +static void timer_handler(unsigned long data)
    > +{
    > + struct maze_context *context, *next;
    > +
    > + spin_lock(&__get_cpu_var(maze_queue_lock));
    > +
    > + if (current->maze_context) {
    > + set_preempt_notifir(current->maze_context);
    > + if (!current->maze_context->enqueue)
    > + check_limit(current->maze_context);
    > + }
    > +
    > + if (!list_empty(&__get_cpu_var(maze_queue))) {
    > + list_for_each_entry_safe (context, next,
    > + &__get_cpu_var(maze_queue), queue) {
    > + check_limit(context);
    > + list_del(&context->queue);
    > + context->enqueue = 0;
    > + }
    > + }
    > +
    > + spin_unlock(&__get_cpu_var(maze_queue_lock));
    > +
    > + continue_timer(smp_processor_id());
    > +}


    It is odd that the timer runs every ten jiffies, because the length of
    a jiffy can vary by a factor of ten, depending upon Kconfig settings,
    architecture, etc.

    Having numerous timers expire once per ten jiffies will be a power
    consumption problem for some people. Can this be fixed?
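
    At the least, rounding the expiry would let these timers batch with
    other wakeups, e.g. (untested, using the per-cpu variant):

        t->expires = __round_jiffies(jiffies + maze_timer_interval, cpu);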

    There's not really enough information in either code comments or the
    changelog to permit me to understand what this timer is here for.

    > +
    > +static int maze_entries_file_show(struct seq_file *seq, void *nouse)
    > +{
    > + struct maze_context *context;
    > +
    > + spin_lock(&maze_list_lock);
    > +
    > + list_for_each_entry(context, &maze_list, list) {
    > + seq_printf(seq, "pid:%5d ", context->pid);
    > + seq_printf(seq, "count:%6ld ", get_maze_count(context));
    > + seq_printf(seq, "soft limit:%6ld ", context->soft_limit);
    > + seq_printf(seq, "hard limit:%6ld ", context->hard_limit);
    > + seq_printf(seq, "soft signal:%2d ", context->soft_signal);
    > + seq_printf(seq, "hard signal:%2d ", context->hard_signal);
    > + seq_printf(seq, "\n");
    > + }
    > +
    > + spin_unlock(&maze_list_lock);
    > +
    > + return 0;
    > +}
    >
    > ...
    >
    > +/*
    > + * Write operation of /proc/maze/entries
    > + */
    > +static ssize_t maze_entries_file_write(struct file *file,
    > + const char __user *buffer,
    > + size_t count, loff_t *ppos)
    > +{
    > + struct maze_context *context;
    > + char read_line[32];
    > + int ret;
    > +
    > + if (!capable(CAP_SYS_ADMIN))
    > + return -EPERM;
    > +
    > + if (count > sizeof(read_line) - 1)
    > + return -EINVAL;
    > +
    > + if (copy_from_user(read_line, buffer, count))
    > + return -EFAULT;
    > +
    > + read_line[count] = '\0';
    > +
    > + context = kzalloc(sizeof(struct maze_context), GFP_ATOMIC);


    Use GFP_KERNEL.

    > + if (unlikely(!context)) {
    > + ERRPRINT("fail to alloc maze_context.\n");
    > + return -ENOMEM;
    > + }
    > +
    > + ret = build_maze_context(read_line, context);
    > + if (ret) {
    > + kfree(context);
    > + return ret;
    > + }
    > +
    > + if (add_maze_context(context))
    > + kfree(context);
    > +
    > + return count;
    > +}


    It looks like this can leak the memory at *context?
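
    (It can: when find_task_by_pid() fails, add_maze_context() returns 0
    and nobody frees the context.  One fix (untested) is to have
    add_maze_context() return -ESRCH in that case and check for it here:

        ret = add_maze_context(context);
        if (ret)
                kfree(context);
        return ret < 0 ? ret : count;

    with 1 still meaning "merged into an existing context" and 0 meaning
    the context was installed.)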

    > +static struct file_operations maze_entries_file_ops = {
    > + .owner = THIS_MODULE,
    > + .open = maze_entries_file_open,
    > + .read = seq_read,
    > + .write = maze_entries_file_write,
    > + .llseek = seq_lseek,
    > + .release = maze_entries_file_release,
    > +};


    Please document newly-added /proc files in Documentation/filesystems/proc.txt

    > +/*
    > + * Creating /proc/maze/
    > + */
    > +static inline int init_dir(void)
    > +{
    > + maze_proc_dir =
    > + create_proc_entry("maze",
    > + S_IFDIR | S_IRUGO | S_IXUGO,
    > + NULL);
    > +
    > + if (!maze_proc_dir)
    > + panic("fail to create /proc/maze\n");
    > +
    > + return 0;
    > +}


    uninline, fix whitespace...

    > +
    > +/*
    > + * Creating /proc/maze/entries
    > + */
    > +static inline int init_entries(void)
    > +{
    > + struct proc_dir_entry *p;
    > +
    > + p = create_proc_entry("entries", S_IRUGO, maze_proc_dir);
    > + if (p == NULL)
    > + panic("fail to create /proc/maze/entries\n");
    > +
    > + p->proc_fops = &maze_entries_file_ops;
    > +
    > + return 0;
    > +}
    >
    > ...
    >
    > +static int __init maze_init(void)
    > +{
    > + int cpu;
    > +
    > + printk(KERN_INFO "Maze: Initializing\n");
    > + maze_initialized = 1;
    > + init_proc();
    > +
    > + spin_lock_init(&maze_list_lock);


    Remove this, use DEFINE_SPINLOCK().

    > + INIT_LIST_HEAD(&maze_list);


    Remove this, use LIST_HEAD().
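
    I.e.:

        static LIST_HEAD(maze_list);
        static DEFINE_SPINLOCK(maze_list_lock);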

    > + for_each_online_cpu(cpu)
    > + init_cpu(cpu);
    > +
    > + return 0;
    > +}
    > +late_initcall(maze_init);


    I am unable to tell from the implementation why late_initcall() was
    chosen instead of plain old __initcall() (or module_init()). Please
    add a comment.

    > --- linux-2.6.25-mm1-maze.orig/kernel/sched.c 2008-05-13 17:40:43.000000000 +0900
    > +++ linux-2.6.25-mm1-maze/kernel/sched.c 2008-05-13 17:52:23.000000000 +0900
    > @@ -68,6 +68,7 @@
    > #include
    > #include
    > #include
    > +#include <linux/maze.h>
    >
    > #include
    > #include
    > @@ -5157,6 +5158,8 @@ asmlinkage long sys_sched_yield(void)
    > _raw_spin_unlock(&rq->lock);
    > preempt_enable_no_resched();
    >
    > + maze_sched_yield(current);
    > +
    > schedule();
    >
    > return 0;
    >
    >

