Introduce res_counter_ratelimit, a generic structure for implementing
throttling-based cgroup subsystems. Two rate-limiting policies are provided:
leaky bucket and token bucket.

[ Only the interfaces needed by the IO controller are implemented right now ]

Signed-off-by: Andrea Righi
---
include/linux/res_counter.h | 70 +++++++++++++++++++++++++
kernel/res_counter.c | 118 ++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 187 insertions(+), 1 deletions(-)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 626f8aa..8c44746 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -14,6 +14,7 @@
*/

#include
+#include

/*
* The core object. the cgroup that wishes to account for some
@@ -45,6 +46,38 @@ struct res_counter {
spinlock_t lock;
};

+/* The throttling policies: the valid values of res_counter_ratelimit.policy */
+#define RATELIMIT_LEAKY_BUCKET 0
+#define RATELIMIT_TOKEN_BUCKET 1
+
+struct res_counter_ratelimit {
+ /*
+ * leaky bucket: usage charged since timestamp; token bucket: tokens left
+ */
+ unsigned long long usage;
+ /*
+ * token bucket: the most tokens that usage may accumulate (bucket size)
+ */
+ unsigned long long max_usage;
+ /*
+ * the rate limit that cannot be exceeded, in usage units per second
+ */
+ unsigned long long limit;
+ /*
+ * the limiting policy / algorithm: one of the RATELIMIT_* values above
+ */
+ unsigned long long policy;
+ /*
+ * jiffies value (get_jiffies_64()) of the last accounting update
+ */
+ unsigned long long timestamp;
+ /*
+ * the lock to protect all of the above.
+ * the routines below consider this to be IRQ-safe
+ */
+ spinlock_t lock;
+};
+
/**
* Helpers to interact with userspace
* res_counter_read_u64() - returns the value of the specified member.
@@ -60,10 +93,17 @@ struct res_counter {

u64 res_counter_read_u64(struct res_counter *counter, int member);

+u64 res_counter_ratelimit_read_u64(struct res_counter_ratelimit *counter,
+ int member);
+
ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *s));

+ssize_t res_counter_ratelimit_read(struct res_counter_ratelimit *counter,
+ int member, const char __user *buf, size_t nbytes, loff_t *pos,
+ int (*read_strategy)(unsigned long long val, char *s));
+
typedef int (*write_strategy_fn)(const char *buf, unsigned long long *val);

int res_counter_memparse_write_strategy(const char *buf,
@@ -80,6 +120,8 @@ enum {
RES_USAGE,
RES_MAX_USAGE,
RES_LIMIT,
+ RES_POLICY,
+ RES_TIMESTAMP,
RES_FAILCNT,
};

@@ -89,6 +131,8 @@ enum {

void res_counter_init(struct res_counter *counter);

+void res_counter_ratelimit_init(struct res_counter_ratelimit *counter);
+
/*
* charge - try to consume more resource.
*
@@ -126,6 +170,15 @@ static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
return false;
}

+/*
+ * Return how many jiffies have passed since the timestamp recorded in @res.
+ * The subtraction is carried out on signed 64-bit values and the result is
+ * converted back to unsigned on return.
+ */
+static inline unsigned long long
+res_counter_ratelimit_delta_t(struct res_counter_ratelimit *res)
+{
+	long long now = (long long)get_jiffies_64();
+	long long then = (long long)res->timestamp;
+
+	return now - then;
+}
+
+unsigned long long
+res_counter_ratelimit_sleep(struct res_counter_ratelimit *res, ssize_t val);
+
/*
* Helper function to detect if the cgroup is within it's limit or
* not. It's currently called from cgroup_rss_prepare()
@@ -174,6 +227,23 @@ static inline int res_counter_set_limit(struct res_counter *cnt,
return ret;
}

+/*
+ * Reconfigure a rate-limit counter.
+ *
+ * With the counter lock held and interrupts disabled, store the new @policy,
+ * @limit and @max (as max_usage), clear the accumulated usage and restart
+ * the accounting period from the current jiffies value.
+ *
+ * Always returns 0.
+ */
+static inline int
+res_counter_ratelimit_set_limit(struct res_counter_ratelimit *cnt,
+				unsigned long long policy,
+				unsigned long long limit, unsigned long long max)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&cnt->lock, irq_flags);
+	cnt->policy = policy;
+	cnt->limit = limit;
+	cnt->max_usage = max;
+	cnt->usage = 0;
+	cnt->timestamp = get_jiffies_64();
+	spin_unlock_irqrestore(&cnt->lock, irq_flags);
+
+	return 0;
+}
+
/*
* Add the value val to the resource counter and check if we are
* still under the limit.
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8e..cf23205 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -9,6 +9,7 @@

#include
#include
+#include
#include
#include
#include
@@ -21,6 +22,15 @@ void res_counter_init(struct res_counter *counter)
counter->limit = (unsigned long long)LLONG_MAX;
}

+/*
+ * Initialize a rate-limit counter.
+ *
+ * The lock is initialized, limit and max_usage are set to LLONG_MAX, usage
+ * is cleared and the timestamp is set to the current jiffies value. The
+ * policy is set to RATELIMIT_LEAKY_BUCKET (0), which is also what a
+ * zero-filled structure would contain.
+ */
+void res_counter_ratelimit_init(struct res_counter_ratelimit *counter)
+{
+	spin_lock_init(&counter->lock);
+	counter->limit = (unsigned long long)LLONG_MAX;
+	counter->max_usage = (unsigned long long)LLONG_MAX;
+	counter->usage = 0;
+	/*
+	 * Every other field is set explicitly here, so do not depend on the
+	 * caller having zeroed the structure: an indeterminate policy would
+	 * end up in the WARN_ON() of res_counter_ratelimit_sleep().
+	 */
+	counter->policy = RATELIMIT_LEAKY_BUCKET;
+	counter->timestamp = get_jiffies_64();
+}
+
int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
{
if (counter->usage + val > counter->limit) {
@@ -62,7 +72,6 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
spin_unlock_irqrestore(&counter->lock, flags);
}

-
static inline unsigned long long *
res_counter_member(struct res_counter *counter, int member)
{
@@ -81,6 +90,26 @@ res_counter_member(struct res_counter *counter, int member)
return NULL;
}

+/*
+ * Translate a RES_* member id into the address of the matching field of
+ * @counter. An id that has no field in this structure triggers BUG().
+ */
+static inline unsigned long long *
+res_counter_ratelimit_member(struct res_counter_ratelimit *counter, int member)
+{
+	unsigned long long *field = NULL;
+
+	switch (member) {
+	case RES_USAGE:
+		field = &counter->usage;
+		break;
+	case RES_MAX_USAGE:
+		field = &counter->max_usage;
+		break;
+	case RES_LIMIT:
+		field = &counter->limit;
+		break;
+	case RES_POLICY:
+		field = &counter->policy;
+		break;
+	case RES_TIMESTAMP:
+		field = &counter->timestamp;
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	return field;
+}
+
ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *userbuf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *st_buf))
@@ -98,11 +127,35 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
pos, buf, s - buf);
}

+/*
+ * Format one member of a rate-limit counter and copy it to user space.
+ *
+ * @member selects the field (RES_* id). When @read_strategy is supplied it
+ * is called to format the value into a local buffer and its return value is
+ * taken as the length of the formatted text; otherwise the value is printed
+ * as an unsigned decimal number followed by a newline.
+ *
+ * Returns the result of simple_read_from_buffer().
+ */
+ssize_t res_counter_ratelimit_read(struct res_counter_ratelimit *counter,
+		int member, const char __user *userbuf, size_t nbytes,
+		loff_t *pos,
+		int (*read_strategy)(unsigned long long val, char *st_buf))
+{
+	unsigned long long *field;
+	char text[64];
+	int len;
+
+	field = res_counter_ratelimit_member(counter, member);
+	len = read_strategy ? read_strategy(*field, text)
+			    : sprintf(text, "%llu\n", *field);
+
+	return simple_read_from_buffer((void __user *)userbuf, nbytes,
+				       pos, text, len);
+}
+
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
return *res_counter_member(counter, member);
}

+/*
+ * Return the current value of the field of @counter selected by @member
+ * (a RES_* id). The value is read without taking the counter lock.
+ */
+u64 res_counter_ratelimit_read_u64(struct res_counter_ratelimit *counter,
+		int member)
+{
+	unsigned long long *field;
+
+	field = res_counter_ratelimit_member(counter, member);
+	return *field;
+}
+
int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
{
@@ -137,3 +190,66 @@ int res_counter_write(struct res_counter *counter, int member,
spin_unlock_irqrestore(&counter->lock, flags);
return 0;
}
+
+/*
+ * Leaky bucket policy.
+ *
+ * Charge @val to the usage accumulated since res->timestamp, then compare
+ * the time this usage should take at the configured rate (res->limit units
+ * per second) with the time that has really elapsed.
+ *
+ * Returns the number of jiffies the caller should sleep to stay within the
+ * rate, or 0 when no throttling is needed. When the counter is found to be
+ * within the limit, usage and timestamp are reset and a new accounting
+ * period starts. Nothing is computed while no jiffy has elapsed yet.
+ *
+ * The caller, res_counter_ratelimit_sleep(), holds res->lock and only calls
+ * this with a non-zero res->limit.
+ */
+static unsigned long long
+ratelimit_leaky_bucket(struct res_counter_ratelimit *res, ssize_t val)
+{
+	unsigned long long delta, t;
+
+	res->usage += val;
+	delta = res_counter_ratelimit_delta_t(res);
+	if (!delta)
+		return 0;
+	/*
+	 * res->limit is 64 bits wide. div_u64() takes a 32-bit divisor, so it
+	 * would silently truncate the limit (to zero for any multiple of
+	 * 2^32, i.e. a division by zero): use the 64-by-64 bit division.
+	 */
+	t = div64_u64(res->usage * USEC_PER_SEC, res->limit);
+	/*
+	 * NOTE(review): usecs_to_jiffies() takes an unsigned int, so a result
+	 * above UINT_MAX microseconds is still truncated here.
+	 */
+	t = usecs_to_jiffies(t);
+	if (t > delta)
+		return t - delta;
+	/* Reset i/o statistics */
+	res->usage = 0;
+	res->timestamp = get_jiffies_64();
+	return 0;
+}
+
+/*
+ * Token bucket policy.
+ *
+ * res->usage holds the available tokens and is interpreted as a signed
+ * value: @val tokens are consumed first, which may drive it negative. The
+ * bucket is then refilled with res->limit tokens per second for the
+ * milliseconds elapsed since res->timestamp, never beyond res->max_usage,
+ * and the timestamp is moved to the current jiffies value. Internally the
+ * arithmetic is done in units of 1/MSEC_PER_SEC of a token.
+ *
+ * Returns the number of jiffies needed to refill a negative balance back to
+ * zero, or 0 when the balance is not negative.
+ *
+ * The caller, res_counter_ratelimit_sleep(), holds res->lock and only calls
+ * this with a non-zero res->limit.
+ */
+static unsigned long long
+ratelimit_token_bucket(struct res_counter_ratelimit *res, ssize_t val)
+{
+	unsigned long long delta;
+	long long tok;
+
+	res->usage -= val;
+	delta = jiffies_to_msecs(res_counter_ratelimit_delta_t(res));
+	res->timestamp = get_jiffies_64();
+	/*
+	 * NOTE(review): the products below (usage, max_usage and
+	 * delta * limit scaled by MSEC_PER_SEC) are not checked for overflow.
+	 */
+	tok = (long long)res->usage * MSEC_PER_SEC;
+	if (delta) {
+		long long max = (long long)res->max_usage * MSEC_PER_SEC;
+
+		tok += delta * res->limit;
+		if (tok > max)
+			tok = max;
+		res->usage = (unsigned long long)div_s64(tok, MSEC_PER_SEC);
+	}
+	if (tok >= 0)
+		return 0;
+	/*
+	 * res->limit is 64 bits wide. div_u64() takes a 32-bit divisor, so it
+	 * would silently truncate the limit (to zero for any multiple of
+	 * 2^32, i.e. a division by zero): use the 64-by-64 bit division.
+	 */
+	return msecs_to_jiffies(div64_u64(-tok, res->limit));
+}
+
+/*
+ * Account @val against the rate-limit counter @res and report for how long
+ * the caller should be throttled.
+ *
+ * The counter lock is taken with interrupts disabled and the request is
+ * handed to the algorithm selected by res->policy. A zero res->limit means
+ * no accounting is done at all; an unknown policy triggers WARN_ON().
+ *
+ * Returns the number of jiffies to sleep, 0 if no throttling is required.
+ */
+unsigned long long
+res_counter_ratelimit_sleep(struct res_counter_ratelimit *res, ssize_t val)
+{
+	unsigned long long timeout = 0;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&res->lock, irq_flags);
+	if (!res->limit)
+		goto unlock;
+
+	if (res->policy == RATELIMIT_LEAKY_BUCKET)
+		timeout = ratelimit_leaky_bucket(res, val);
+	else if (res->policy == RATELIMIT_TOKEN_BUCKET)
+		timeout = ratelimit_token_bucket(res, val);
+	else
+		WARN_ON(1);
+unlock:
+	spin_unlock_irqrestore(&res->lock, irq_flags);
+	return timeout;
+}
--
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/