#include <linux/cgroup.h>
#include <linux/cgroup_dmem.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
#include <linux/slab.h>
/* A registered device-memory region accounted by the dmem controller. */
struct dmem_cgroup_region {
	/* Refcount; the final put defers freeing through dmemcg_free_region(). */
	struct kref ref;
	/* Used by dmemcg_free_region() to free after an RCU grace period. */
	struct rcu_head rcu;
	/* Entry in the global dmem_cgroup_regions list. */
	struct list_head region_node;
	/* All dmem_cgroup_pool_state instances charging against this region. */
	struct list_head pools;
	/* Total capacity, reported by the "capacity" file. */
	u64 size;
	/* Region name shown in cgroup files; owned and freed with the region. */
	char *name;
	/* Set under dmemcg_lock when tearing down; blocks new pool creation. */
	bool unregistered;
};
/* Per-cgroup subsystem state: holds one pool per region this cgroup uses. */
struct dmemcg_state {
	struct cgroup_subsys_state css;

	/* List of dmem_cgroup_pool_state, linked through their css_node. */
	struct list_head pools;
};
/* Per-(cgroup, region) accounting state — a "pool". */
struct dmem_cgroup_pool_state {
	/* Region this pool charges against. */
	struct dmem_cgroup_region *region;
	/* Owning cgroup's dmem state. */
	struct dmemcg_state *cs;
	/* Entry in cs->pools. */
	struct list_head css_node;
	/* Entry in region->pools. */
	struct list_head region_node;
	struct rcu_head rcu;
	/* Hierarchical counter holding usage plus the min/low/max knobs. */
	struct page_counter cnt;
	/* True once the counter's parent linkage is complete; see
	 * get_cg_pool_locked(). */
	bool inited;
};
/* Protects the region list and all pool lists; also serves as the lockdep
 * condition for the RCU list walks below. */
static DEFINE_SPINLOCK(dmemcg_lock);
/* All registered regions, iterated under RCU or dmemcg_lock. */
static LIST_HEAD(dmem_cgroup_regions);
/* Convert a generic css pointer into the enclosing dmem cgroup state. */
static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *state = container_of(css, struct dmemcg_state, css);

	return state;
}
/* Return the current task's dmem cgroup state with a css reference held
 * (via task_get_css); the caller is responsible for the matching css_put(). */
static inline struct dmemcg_state *get_current_dmemcs(void)
{
	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
}
/* Return the parent cgroup's dmem state, or NULL for the root cgroup. */
static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
{
	struct cgroup_subsys_state *parent_css = cg->css.parent;

	if (!parent_css)
		return NULL;

	return css_to_dmemcs(parent_css);
}
/* Unlink @pool from its region's pool list and release its memory. */
static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
	struct list_head *node = &pool->region_node;

	list_del(node);
	kfree(pool);
}
/* Set the hard protection (min) on a pool's counter. */
static void
set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_min(&pool->cnt, val);
}
/* Set the best-effort protection (low) on a pool's counter. */
static void
set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_low(&pool->cnt, val);
}
/* Set the hard limit (max) on a pool's counter. */
static void
set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_max(&pool->cnt, val);
}
/* Read a pool's low watermark; a missing pool reads as 0 (no protection). */
static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.low) : 0;
}
/* Read a pool's min watermark; a missing pool reads as 0 (no protection). */
static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.min) : 0;
}
/* Read a pool's max limit; a missing pool reads as unlimited. */
static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
}
/* Read a pool's current usage; a missing pool reads as 0. */
static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
{
	return pool ? page_counter_read(&pool->cnt) : 0;
}
/* Restore a pool to its defaults: no protection (min/low = 0), no limit. */
static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
{
	set_resource_min(rpool, 0);
	set_resource_low(rpool, 0);
	set_resource_max(rpool, PAGE_COUNTER_MAX);
}
/* cgroup callback: when a cgroup goes offline, clear min/low/max on every
 * pool of this css.  The pools themselves stay alive until dmemcs_free(). */
static void dmemcs_offline(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
		reset_all_resource_limits(pool);
	rcu_read_unlock();
}
/* cgroup callback: free the per-cgroup state and all of its pools.
 * Pools are unlinked from both the css list and the region list (the latter
 * inside free_cg_pool()) under dmemcg_lock. */
static void dmemcs_free(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool, *next;

	spin_lock(&dmemcg_lock);
	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
		list_del(&pool->css_node);
		free_cg_pool(pool);
	}
	spin_unlock(&dmemcg_lock);

	kfree(dmemcs);
}
/* cgroup callback: allocate fresh per-cgroup dmem state for a new css. */
static struct cgroup_subsys_state *
dmemcs_alloc(struct cgroup_subsys_state *parent_css)
{
	struct dmemcg_state *state;

	state = kzalloc(sizeof(*state), GFP_KERNEL);
	if (!state)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&state->pools);

	return &state->css;
}
/* Find the pool for @region inside @dmemcs, or NULL if none exists yet.
 * Callers must hold dmemcg_lock or rcu_read_lock(); the list walk passes
 * spin_is_locked(&dmemcg_lock) as the lockdep condition for the former. */
static struct dmem_cgroup_pool_state *
find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool;

	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
		if (pool->region == region)
			return pool;

	return NULL;
}
/* Return the parent pool in the hierarchy via the page_counter linkage,
 * or NULL when @pool is at the root. */
static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
{
	struct page_counter *parent_cnt = pool->cnt.parent;

	return parent_cnt ? container_of(parent_cnt, typeof(*pool), cnt) : NULL;
}
/*
 * Walk the css hierarchy below @limit_pool's cgroup in pre-order and, for
 * every descendant that has a pool for the same region, compute that pool's
 * effective protection (emin/elow) against @limit_pool's counter.  The walk
 * stops once @test_pool has been processed, so its effective values are
 * valid afterwards.
 */
static void
dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
				 struct dmem_cgroup_pool_state *test_pool)
{
	struct page_counter *climit;
	struct cgroup_subsys_state *css;
	struct dmemcg_state *dmemcg_iter;
	struct dmem_cgroup_pool_state *pool, *found_pool;

	climit = &limit_pool->cnt;

	rcu_read_lock();

	css_for_each_descendant_pre(css, &limit_pool->cs->css) {
		dmemcg_iter = container_of(css, struct dmemcg_state, css);
		found_pool = NULL;

		/* Find this cgroup's pool for the region under test. */
		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool->region == limit_pool->region) {
				found_pool = pool;
				break;
			}
		}
		if (!found_pool)
			continue;

		page_counter_calculate_protection(
			climit, &found_pool->cnt, true);

		if (found_pool == test_pool)
			break;
	}
	rcu_read_unlock();
}
/**
 * dmem_cgroup_state_evict_valuable() - Check whether eviction from @test_pool
 * is worthwhile when @limit_pool hit its limit.
 * @limit_pool: Pool that hit its limit, or NULL to evaluate against the root.
 * @test_pool: Pool that is a candidate for eviction.
 * @ignore_low: If true, only the min watermark protects @test_pool.
 * @ret_hit_low: Set to true when eviction was refused solely due to low.
 *
 * Return: true if evicting from @test_pool may relieve pressure on
 * @limit_pool, false if @test_pool is protected (below its effective
 * min/low) or not a descendant of @limit_pool.
 */
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low)
{
	struct dmem_cgroup_pool_state *pool = test_pool;
	struct page_counter *ctest;
	u64 used, min, low;

	/* Can always evict from the pool that hit the limit itself. */
	if (limit_pool == test_pool)
		return true;

	if (limit_pool) {
		/* A root-level limit pool constrains everything. */
		if (!parent_dmemcs(limit_pool->cs))
			return true;

		/* Walk up from test_pool; only descendants of limit_pool
		 * are eligible. */
		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
			{}

		if (!pool)
			return false;
	} else {
		/* No limit pool given: measure protection against the root. */
		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
			{}
	}

	ctest = &test_pool->cnt;

	dmem_cgroup_calculate_protection(limit_pool, test_pool);

	used = page_counter_read(ctest);
	min = READ_ONCE(ctest->emin);

	/* Usage within the effective min is fully protected. */
	if (used <= min)
		return false;

	if (!ignore_low) {
		low = READ_ONCE(ctest->elow);
		if (used > low)
			return true;

		/* Protected by low only; caller may retry with ignore_low. */
		*ret_hit_low = true;
		return false;
	}
	return true;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
static struct dmem_cgroup_pool_state *
alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
struct dmem_cgroup_pool_state **allocpool)
{
struct dmemcg_state *parent = parent_dmemcs(dmemcs);
struct dmem_cgroup_pool_state *pool, *ppool = NULL;
if (!*allocpool) {
pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
if (!pool)
return ERR_PTR(-ENOMEM);
} else {
pool = *allocpool;
*allocpool = NULL;
}
pool->region = region;
pool->cs = dmemcs;
if (parent)
ppool = find_cg_pool_locked(parent, region);
page_counter_init(&pool->cnt,
ppool ? &ppool->cnt : NULL, true);
reset_all_resource_limits(pool);
list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
list_add_tail(&pool->region_node, ®ion->pools);
if (!parent)
pool->inited = true;
else
pool->inited = ppool ? ppool->inited : false;
return pool;
}
/*
 * Get (creating if needed) the fully initialised pool for (@dmemcs, @region).
 * Called under dmemcg_lock.
 *
 * Phase 1 walks up the hierarchy allocating any missing pools until an
 * already-inited ancestor pool (or the root) is reached.  Phase 2 walks up
 * again, fixing each pool's counter parent pointer and marking it inited,
 * so the hierarchy becomes visible to lockless readers bottom-up.
 */
static struct dmem_cgroup_pool_state *
get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		   struct dmem_cgroup_pool_state **allocpool)
{
	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
	struct dmemcg_state *p, *pp;

	/* Phase 1: ensure a pool object exists at every level up to an
	 * inited ancestor. */
	for (p = dmemcs; p; p = parent_dmemcs(p)) {
		pool = find_cg_pool_locked(p, region);
		if (!pool)
			pool = alloc_pool_single(p, region, allocpool);

		if (IS_ERR(pool))
			return pool;

		/* Fast path: the requested pool already fully set up. */
		if (p == dmemcs && pool->inited)
			return pool;

		if (pool->inited)
			break;
	}

	retpool = pool = find_cg_pool_locked(dmemcs, region);
	/* Phase 2: link counters to their parents and publish inited. */
	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
		if (pool->inited)
			break;

		ppool = find_cg_pool_locked(pp, region);

		pool->cnt.parent = &ppool->cnt;
		pool->inited = true;

		pool = ppool;
	}

	return retpool;
}
/*
 * RCU callback for region teardown: all readers are gone, so free the
 * region's remaining pools, its name, and the region itself.
 * (Fixes the "&region" text that had been mis-encoded as a ® glyph.)
 */
static void dmemcg_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
	struct dmem_cgroup_pool_state *pool, *next;

	list_for_each_entry_safe(pool, next, &region->pools, region_node)
		free_cg_pool(pool);
	kfree(region->name);
	kfree(region);
}
/* kref release callback: defer the actual free until after an RCU grace
 * period, since lockless readers may still hold references to the lists. */
static void dmemcg_free_region(struct kref *ref)
{
	struct dmem_cgroup_region *region = container_of(ref, typeof(*region), ref);

	call_rcu(&region->rcu, dmemcg_free_rcu);
}
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
struct list_head *entry;
if (!region)
return;
spin_lock(&dmemcg_lock);
list_del_rcu(®ion->region_node);
list_for_each_rcu(entry, ®ion->pools) {
struct dmem_cgroup_pool_state *pool =
container_of(entry, typeof(*pool), region_node);
list_del_rcu(&pool->css_node);
}
region->unregistered = true;
spin_unlock(&dmemcg_lock);
kref_put(®ion->ref, dmemcg_free_region);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
/**
 * dmem_cgroup_register_region() - Register a memory region for accounting.
 * @size: Total capacity of the region in bytes; 0 skips registration.
 * @fmt: printf-style format for the region name.
 *
 * Return: the new region, NULL when @size is 0, or ERR_PTR(-ENOMEM).
 * The caller later releases it with dmem_cgroup_unregister_region().
 */
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
{
	struct dmem_cgroup_region *region;
	char *name;
	va_list ap;

	if (!size)
		return NULL;

	va_start(ap, fmt);
	name = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!name)
		return ERR_PTR(-ENOMEM);

	region = kzalloc(sizeof(*region), GFP_KERNEL);
	if (!region) {
		kfree(name);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&region->ref);
	INIT_LIST_HEAD(&region->pools);
	region->name = name;
	region->size = size;

	/* Publish the region for RCU readers. */
	spin_lock(&dmemcg_lock);
	list_add_tail_rcu(&region->region_node, &dmem_cgroup_regions);
	spin_unlock(&dmemcg_lock);

	return region;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
/*
 * Look up a region by name and take a reference on it, skipping regions
 * whose refcount already dropped to zero.  Callers hold rcu_read_lock() or
 * dmemcg_lock (the latter is the list walk's lockdep condition).
 * Returns NULL when no live region matches.
 * (Fixes the "&region" text that had been mis-encoded as a ® glyph.)
 */
static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
{
	struct dmem_cgroup_region *region;

	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
		if (!strcmp(name, region->name) &&
		    kref_get_unless_zero(&region->ref))
			return region;

	return NULL;
}
/**
 * dmem_cgroup_pool_state_put() - Release a reference taken on a pool.
 * @pool: Pool to release, may be NULL.
 *
 * Drops the css reference pinning @pool's cgroup.
 */
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
	if (pool)
		css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
/*
 * Get the initialised pool for (@cg, @region) without the caller holding
 * dmemcg_lock.  Fast path is a lockless RCU lookup; on a miss the pool is
 * created under dmemcg_lock.  A GFP_NOWAIT allocation failure inside
 * get_cg_pool_locked() is retried after preallocating with GFP_KERNEL
 * outside the lock.  Returns ERR_PTR(-ENODEV) once the region is
 * unregistered.
 */
static struct dmem_cgroup_pool_state *
get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;

	/* Lockless fast path; only fully-published pools count. */
	rcu_read_lock();
	pool = find_cg_pool_locked(cg, region);
	if (pool && !READ_ONCE(pool->inited))
		pool = NULL;
	rcu_read_unlock();

	while (!pool) {
		spin_lock(&dmemcg_lock);
		if (!region->unregistered)
			pool = get_cg_pool_locked(cg, region, &allocpool);
		else
			pool = ERR_PTR(-ENODEV);
		spin_unlock(&dmemcg_lock);

		if (pool == ERR_PTR(-ENOMEM)) {
			pool = NULL;
			/* The prealloc should have been consumed on ENOMEM. */
			if (WARN_ON(allocpool))
				continue;

			/* Preallocate sleepably, then retry under the lock. */
			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool) {
				pool = NULL;
				continue;
			}
		}
	}

	/* Free the preallocation if it ended up unused. */
	kfree(allocpool);
	return pool;
}
/**
 * dmem_cgroup_uncharge() - Undo a charge made by dmem_cgroup_try_charge().
 * @pool: Pool returned from a successful try_charge, may be NULL.
 * @size: Same size that was charged.
 *
 * Uncharges the counter and drops the css reference try_charge took.
 */
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	if (!pool)
		return;

	page_counter_uncharge(&pool->cnt, size);
	css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
/**
 * dmem_cgroup_try_charge() - Try charging an allocation against a region.
 * @region: Region to charge.
 * @size: Size of the allocation in bytes.
 * @ret_pool: On success, set to the pool the charge was made against; the
 *            caller later passes it to dmem_cgroup_uncharge().
 * @ret_limit_pool: Optional; on -EAGAIN set to the pool whose limit was hit,
 *                  with an extra css reference the caller must release
 *                  (e.g. via dmem_cgroup_pool_state_put()).
 *
 * Return: 0 on success (a css reference on the current cgroup is retained),
 * -EAGAIN when a limit was hit, or a negative errno from pool lookup.
 */
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			   struct dmem_cgroup_pool_state **ret_pool,
			   struct dmem_cgroup_pool_state **ret_limit_pool)
{
	struct dmemcg_state *cg;
	struct dmem_cgroup_pool_state *pool;
	struct page_counter *fail;
	int ret;

	*ret_pool = NULL;
	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	/* Takes a css reference; released in uncharge or on the error path. */
	cg = get_current_dmemcs();

	pool = get_cg_pool_unlocked(cg, region);
	if (IS_ERR(pool)) {
		ret = PTR_ERR(pool);
		goto err;
	}

	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
		if (ret_limit_pool) {
			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
			css_get(&(*ret_limit_pool)->cs->css);
		}
		ret = -EAGAIN;
		goto err;
	}

	*ret_pool = pool;
	return 0;

err:
	css_put(&cg->css);
	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
{
struct dmem_cgroup_region *region;
rcu_read_lock();
list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
seq_puts(sf, region->name);
seq_printf(sf, " %llu\n", region->size);
}
rcu_read_unlock();
return 0;
}
/*
 * Parse the value half of a "<region> <value>" line: either the literal
 * "max" (no limit) or a size accepted by memparse() (K/M/G suffixes).
 *
 * Robustness fix: the caller obtains @options via strsep(), which leaves it
 * NULL when the user wrote a region name with no value — previously that
 * reached strcmp(NULL, ...), a NULL dereference.  Reject it with -EINVAL.
 *
 * Returns 0 on success and stores the parsed value in *@new_limit.
 */
static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
			      u64 *new_limit)
{
	char *end;

	if (!options)
		return -EINVAL;

	if (!strcmp(options, "max")) {
		*new_limit = PAGE_COUNTER_MAX;
		return 0;
	}

	*new_limit = memparse(options, &end);
	if (*end != '\0')
		return -EINVAL;

	return 0;
}
/*
 * Common write handler for the min/low/max files.  @buf holds one
 * "<region-name> <value>" pair per line; @apply commits each parsed value
 * to the matching pool.  Processing stops at the first error, which is
 * returned; otherwise @nbytes is returned.
 * (Fixes the "&region" text that had been mis-encoded as a ® glyph.)
 */
static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off,
				  void (*apply)(struct dmem_cgroup_pool_state *, u64))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
	int err = 0;

	while (buf && !err) {
		struct dmem_cgroup_pool_state *pool = NULL;
		char *options, *region_name;
		struct dmem_cgroup_region *region;
		u64 new_limit;

		options = buf;
		buf = strchr(buf, '\n');
		if (buf)
			*buf++ = '\0';

		options = strstrip(options);

		/* Skip empty lines. */
		if (!options[0])
			continue;

		region_name = strsep(&options, " \t");
		if (!region_name[0])
			continue;

		/* Takes a region reference; dropped at out_put below. */
		rcu_read_lock();
		region = dmemcg_get_region_by_name(region_name);
		rcu_read_unlock();

		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		if (err < 0)
			goto out_put;

		pool = get_cg_pool_unlocked(dmemcs, region);
		if (IS_ERR(pool)) {
			err = PTR_ERR(pool);
			goto out_put;
		}

		/* Commit the new limit. */
		apply(pool, new_limit);

out_put:
		kref_put(&region->ref, dmemcg_free_region);
	}

	return err ?: nbytes;
}
/*
 * Common show handler for the current/min/low/max files: print one
 * "<name> <value>" line per region, where @fn extracts the value from this
 * cgroup's pool (NULL pool handled by the getters).  Values at
 * PAGE_COUNTER_MAX print as "max".
 */
static int dmemcg_limit_show(struct seq_file *sf, void *v,
			     u64 (*fn)(struct dmem_cgroup_pool_state *))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		/* May be NULL if this cgroup never charged the region. */
		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
		u64 val;

		seq_puts(sf, region->name);

		val = fn(pool);
		if (val < PAGE_COUNTER_MAX)
			seq_printf(sf, " %lld\n", val);
		else
			seq_puts(sf, " max\n");
	}
	rcu_read_unlock();

	return 0;
}
/* Show handler for the "current" file: per-region usage. */
static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_current);
}

/* Show handler for the "min" file. */
static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_min);
}

/* Write handler for the "min" file. */
static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
}

/* Show handler for the "low" file. */
static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_low);
}

/* Write handler for the "low" file. */
static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
}

/* Show handler for the "max" file. */
static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_max);
}

/* Write handler for the "max" file. */
static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}
/* Control files exposed by this controller; "capacity" only on the root
 * cgroup, the tunables only on non-root cgroups. */
static struct cftype files[] = {
	{
		.name = "capacity",
		.seq_show = dmem_cgroup_region_capacity_show,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = dmem_cgroup_region_current_show,
	},
	{
		.name = "min",
		.write = dmem_cgroup_region_min_write,
		.seq_show = dmem_cgroup_region_min_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "low",
		.write = dmem_cgroup_region_low_write,
		.seq_show = dmem_cgroup_region_low_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "max",
		.write = dmem_cgroup_region_max_write,
		.seq_show = dmem_cgroup_region_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* Zero entry terminates. */
};
/* Subsystem descriptor; same file set on both cgroup v1 and v2 hierarchies. */
struct cgroup_subsys dmem_cgrp_subsys = {
	.css_alloc	= dmemcs_alloc,
	.css_free	= dmemcs_free,
	.css_offline	= dmemcs_offline,
	.legacy_cftypes	= files,
	.dfl_cftypes	= files,
};