191a712090
Pull cgroup updates from Tejun Heo: - Fixes and a lot of cleanups. Locking cleanup is finally complete. cgroup_mutex is no longer exposed to individual controllers which used to cause nasty deadlock issues. Li fixed and cleaned up quite a bit including long standing ones like racy cgroup_path(). - device cgroup now supports proper hierarchy thanks to Aristeu. - perf_event cgroup now supports proper hierarchy. - A new mount option "__DEVEL__sane_behavior" is added. As indicated by the name, this option is to be used for development only at this point and generates a warning message when used. Unfortunately, cgroup interface currently has too many breakages and inconsistencies to implement a consistent and unified hierarchy on top. The new flag is used to collect the behavior changes which are necessary to implement consistent unified hierarchy. It's likely that this flag won't be used verbatim when it becomes ready but will be enabled implicitly along with unified hierarchy. The option currently disables some of broken behaviors in cgroup core and also .use_hierarchy switch in memcg (will be routed through -mm), which can be used to make very unusual hierarchy where nesting is partially honored. It will also be used to implement hierarchy support for blk-throttle which would be impossible otherwise without introducing a full separate set of control knobs. This is essentially versioning of interface which isn't very nice but at this point I can't see any other options which would allow keeping the interface the same while moving towards hierarchy behavior which is at least somewhat sane. The planned unified hierarchy is likely to require some level of adaptation from userland anyway, so I think it'd be best to take the chance and update the interface such that it's supportable in the long term. 
Maintaining the existing interface does complicate cgroup core but shouldn't put too much strain on individual controllers and I think it'd be manageable for the foreseeable future. Maybe we'll be able to drop it in a decade. Fix up conflicts (including a semantic one adding a new #include to ppc that was uncovered by the header file changes) as per Tejun. * 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (45 commits) cpuset: fix compile warning when CONFIG_SMP=n cpuset: fix cpu hotplug vs rebuild_sched_domains() race cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn() cgroup: restore the call to eventfd->poll() cgroup: fix use-after-free when umounting cgroupfs cgroup: fix broken file xattrs devcg: remove parent_cgroup. memcg: force use_hierarchy if sane_behavior cgroup: remove cgrp->top_cgroup cgroup: introduce sane_behavior mount option move cgroupfs_root to include/linux/cgroup.h cgroup: convert cgroupfs_root flag bits to masks and add CGRP_ prefix cgroup: make cgroup_path() not print double slashes Revert "cgroup: remove bind() method from cgroup_subsys." perf: make perf_event cgroup hierarchical cgroup: implement cgroup_is_descendant() cgroup: make sure parent won't be destroyed before its children cgroup: remove bind() method from cgroup_subsys. devcg: remove broken_hierarchy tag cgroup: remove cgroup_lock_is_held() ...
227 lines
5.7 KiB
C
227 lines
5.7 KiB
C
#ifndef __RES_COUNTER_H__
|
|
#define __RES_COUNTER_H__
|
|
|
|
/*
|
|
* Resource Counters
|
|
* Contain common data types and routines for resource accounting
|
|
*
|
|
* Copyright 2007 OpenVZ SWsoft Inc
|
|
*
|
|
* Author: Pavel Emelianov <xemul@openvz.org>
|
|
*
|
|
* See Documentation/cgroups/resource_counter.txt for more
|
|
* info about what this counter is.
|
|
*/
|
|
|
|
#include <linux/spinlock.h>
|
|
#include <linux/errno.h>
|
|
|
|
/*
|
|
* The core object. the cgroup that wishes to account for some
|
|
* resource may include this counter into its structures and use
|
|
* the helpers described beyond
|
|
*/
|
|
|
|
struct res_counter {
|
|
/*
|
|
* the current resource consumption level
|
|
*/
|
|
unsigned long long usage;
|
|
/*
|
|
* the maximal value of the usage from the counter creation
|
|
*/
|
|
unsigned long long max_usage;
|
|
/*
|
|
* the limit that usage cannot exceed
|
|
*/
|
|
unsigned long long limit;
|
|
/*
|
|
* the limit that usage can be exceed
|
|
*/
|
|
unsigned long long soft_limit;
|
|
/*
|
|
* the number of unsuccessful attempts to consume the resource
|
|
*/
|
|
unsigned long long failcnt;
|
|
/*
|
|
* the lock to protect all of the above.
|
|
* the routines below consider this to be IRQ-safe
|
|
*/
|
|
spinlock_t lock;
|
|
/*
|
|
* Parent counter, used for hierarchial resource accounting
|
|
*/
|
|
struct res_counter *parent;
|
|
};
|
|
|
|
/*
 * LLONG_MAX expressed in the counters' unsigned long long type.
 * NOTE(review): presumably used as the "no limit" value — confirm against users.
 *
 * The expansion is wrapped in parentheses so the macro always acts as a
 * single operand (e.g. "sizeof RESOURCE_MAX" would otherwise fail to parse).
 */
#define RESOURCE_MAX ((unsigned long long)LLONG_MAX)
|
|
|
|
/**
|
|
* Helpers to interact with userspace
|
|
* res_counter_read_u64() - returns the value of the specified member.
|
|
* res_counter_read/_write - put/get the specified fields from the
|
|
* res_counter struct to/from the user
|
|
*
|
|
* @counter: the counter in question
|
|
* @member: the field to work with (see RES_xxx below)
|
|
* @buf: the buffer to opeate on,...
|
|
* @nbytes: its size...
|
|
* @pos: and the offset.
|
|
*/
|
|
|
|
u64 res_counter_read_u64(struct res_counter *counter, int member);
|
|
|
|
ssize_t res_counter_read(struct res_counter *counter, int member,
|
|
const char __user *buf, size_t nbytes, loff_t *pos,
|
|
int (*read_strategy)(unsigned long long val, char *s));
|
|
|
|
int res_counter_memparse_write_strategy(const char *buf,
|
|
unsigned long long *res);
|
|
|
|
/*
 * Field descriptors: one selector per member of struct res_counter.
 * The numeric values are spelled out; they match the implicit numbering
 * (0, 1, 2, ...) of a plain enumerator list.
 */
enum {
	RES_USAGE	= 0,
	RES_MAX_USAGE	= 1,
	RES_LIMIT	= 2,
	RES_FAILCNT	= 3,
	RES_SOFT_LIMIT	= 4,
};
|
|
|
|
/*
|
|
* helpers for accounting
|
|
*/
|
|
|
|
/*
 * res_counter_init - initialise @counter.
 *
 * NOTE(review): the definition is not visible from this header; presumably
 * @parent is stored as counter->parent for hierarchical accounting — confirm.
 */
void res_counter_init(struct res_counter *counter, struct res_counter *parent);
|
|
|
|
/*
|
|
* charge - try to consume more resource.
|
|
*
|
|
* @counter: the counter
|
|
* @val: the amount of the resource. each controller defines its own
|
|
* units, e.g. numbers, bytes, Kbytes, etc
|
|
*
|
|
* returns 0 on success and <0 if the counter->usage will exceed the
|
|
* counter->limit _locked call expects the counter->lock to be taken
|
|
*
|
|
* charge_nofail works the same, except that it charges the resource
|
|
* counter unconditionally, and returns < 0 if the after the current
|
|
* charge we are over limit.
|
|
*/
|
|
|
|
int __must_check res_counter_charge_locked(struct res_counter *counter,
|
|
unsigned long val, bool force);
|
|
int __must_check res_counter_charge(struct res_counter *counter,
|
|
unsigned long val, struct res_counter **limit_fail_at);
|
|
int res_counter_charge_nofail(struct res_counter *counter,
|
|
unsigned long val, struct res_counter **limit_fail_at);
|
|
|
|
/*
|
|
* uncharge - tell that some portion of the resource is released
|
|
*
|
|
* @counter: the counter
|
|
* @val: the amount of the resource
|
|
*
|
|
* these calls check for usage underflow and show a warning on the console
|
|
* _locked call expects the counter->lock to be taken
|
|
*
|
|
* returns the total charges still present in @counter.
|
|
*/
|
|
|
|
u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
|
|
u64 res_counter_uncharge(struct res_counter *counter, unsigned long val);
|
|
|
|
u64 res_counter_uncharge_until(struct res_counter *counter,
|
|
struct res_counter *top,
|
|
unsigned long val);
|
|
/**
|
|
* res_counter_margin - calculate chargeable space of a counter
|
|
* @cnt: the counter
|
|
*
|
|
* Returns the difference between the hard limit and the current usage
|
|
* of resource counter @cnt.
|
|
*/
|
|
static inline unsigned long long res_counter_margin(struct res_counter *cnt)
|
|
{
|
|
unsigned long long margin;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&cnt->lock, flags);
|
|
if (cnt->limit > cnt->usage)
|
|
margin = cnt->limit - cnt->usage;
|
|
else
|
|
margin = 0;
|
|
spin_unlock_irqrestore(&cnt->lock, flags);
|
|
return margin;
|
|
}
|
|
|
|
/**
|
|
* Get the difference between the usage and the soft limit
|
|
* @cnt: The counter
|
|
*
|
|
* Returns 0 if usage is less than or equal to soft limit
|
|
* The difference between usage and soft limit, otherwise.
|
|
*/
|
|
static inline unsigned long long
|
|
res_counter_soft_limit_excess(struct res_counter *cnt)
|
|
{
|
|
unsigned long long excess;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&cnt->lock, flags);
|
|
if (cnt->usage <= cnt->soft_limit)
|
|
excess = 0;
|
|
else
|
|
excess = cnt->usage - cnt->soft_limit;
|
|
spin_unlock_irqrestore(&cnt->lock, flags);
|
|
return excess;
|
|
}
|
|
|
|
static inline void res_counter_reset_max(struct res_counter *cnt)
|
|
{
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&cnt->lock, flags);
|
|
cnt->max_usage = cnt->usage;
|
|
spin_unlock_irqrestore(&cnt->lock, flags);
|
|
}
|
|
|
|
static inline void res_counter_reset_failcnt(struct res_counter *cnt)
|
|
{
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&cnt->lock, flags);
|
|
cnt->failcnt = 0;
|
|
spin_unlock_irqrestore(&cnt->lock, flags);
|
|
}
|
|
|
|
static inline int res_counter_set_limit(struct res_counter *cnt,
|
|
unsigned long long limit)
|
|
{
|
|
unsigned long flags;
|
|
int ret = -EBUSY;
|
|
|
|
spin_lock_irqsave(&cnt->lock, flags);
|
|
if (cnt->usage <= limit) {
|
|
cnt->limit = limit;
|
|
ret = 0;
|
|
}
|
|
spin_unlock_irqrestore(&cnt->lock, flags);
|
|
return ret;
|
|
}
|
|
|
|
static inline int
|
|
res_counter_set_soft_limit(struct res_counter *cnt,
|
|
unsigned long long soft_limit)
|
|
{
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&cnt->lock, flags);
|
|
cnt->soft_limit = soft_limit;
|
|
spin_unlock_irqrestore(&cnt->lock, flags);
|
|
return 0;
|
|
}
|
|
|
|
#endif
|