Release 4.15 kernel/cgroup/cpuset.c

Directory: kernel/cgroup
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>


/*
 * Static key defined in the "false" state.
 * NOTE(review): no user of this key is visible in this part of the file;
 * confirm its purpose against the callers in <linux/cpuset.h>.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);

/*
 * Static key defined in the "false" state.  nr_cpusets() reads its
 * reference count and adds one for the top-level cpuset.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */


/*
 * State of one frequency meter.  The first three fields are only read or
 * written with @lock held.
 */
struct fmeter {
	int cnt;		/* events counted but not yet processed */
	int val;		/* latest computed output value */
	time64_t time;		/* clock, in seconds, at which val was computed */
	spinlock_t lock;	/* protects cnt, val and time */
};


/*
 * A cpuset: the CPU and memory-node placement constraints attached to one
 * cgroup_subsys_state.
 */
struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * Default hierarchy:
	 *
	 * The user-configured masks change only when cpuset.cpus or
	 * cpuset.mems is written, and are not limited by the parent's
	 * masks.
	 *
	 * The effective masks are the ones that really apply to the tasks
	 * in the cpuset.  They may change when the configured masks change
	 * or when hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask;
	 * if that turns out empty, the parent's mask is inherited.
	 *
	 * Legacy hierarchy:
	 *
	 * The user-configured masks are always identical to the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * The Memory Nodes that tasks previously ran on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some task
	 *   is moved into it.
	 * - old_mems_allowed is consumed by cpuset_migrate_mm() when
	 *   cpuset.mems_allowed changes and the tasks' nodemasks are
	 *   updated; afterwards old_mems_allowed is set to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Nonzero while tasks are being attached to this cpuset.  Keeps
	 * cpus/mems_allowed from being zeroed between ->can_attach() and
	 * ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};



/*
 * css_cs - convert a cgroup_subsys_state into the cpuset that embeds it.
 * A NULL @css yields NULL.
 */
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	if (!css)
		return NULL;

	return container_of(css, struct cpuset, css);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 21 72.41% 1 50.00%
Tejun Heo 8 27.59% 1 50.00%
Total 29 100.00% 2 100.00%

/* task_cs - look up the cpuset that @task's cpuset css belongs to. */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	struct cgroup_subsys_state *css = task_css(task, cpuset_cgrp_id);

	return css_cs(css);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 22 88.00% 1 25.00%
Tejun Heo 3 12.00% 3 75.00%
Total 25 100.00% 4 100.00%



/*
 * parent_cs - cpuset owning the parent css of @cs.
 * Yields NULL when @cs->css.parent is NULL (see css_cs()).
 */
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	struct cgroup_subsys_state *parent_css = cs->css.parent;

	return css_cs(parent_css);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 24 100.00% 3 100.00%
Total 24 100.00% 3 100.00%

#ifdef CONFIG_NUMA


static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
Contributors
Person Tokens Prop Commits CommitProp
David Rientjes 17 100.00% 1 100.00%
Total 17 100.00% 1 100.00%

#else


/* !CONFIG_NUMA variant: always reports that @task has no mempolicy. */
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
Contributors
Person Tokens Prop Commits CommitProp
David Rientjes 15 100.00% 1 100.00%
Total 15 100.00% 1 100.00%

#endif


/*
 * Bit numbers used in the flags word of struct cpuset.  The values are
 * implicit and consecutive from zero, so the order must not change.
 */
typedef enum {
	CS_ONLINE,		/* tested by is_cpuset_online() */
	CS_CPU_EXCLUSIVE,	/* tested by is_cpu_exclusive() */
	CS_MEM_EXCLUSIVE,	/* tested by is_mem_exclusive() */
	CS_MEM_HARDWALL,	/* tested by is_mem_hardwall() */
	CS_MEMORY_MIGRATE,	/* tested by is_memory_migrate() */
	CS_SCHED_LOAD_BALANCE,	/* tested by is_sched_load_balance() */
	CS_SPREAD_PAGE,		/* tested by is_spread_page() */
	CS_SPREAD_SLAB,		/* tested by is_spread_slab() */
} cpuset_flagbits_t;

/* convenient tests for these bits */


static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 31 96.88% 2 66.67%
Srivatsa S. Bhat 1 3.12% 1 33.33%
Total 32 100.00% 3 100.00%



/* Report whether the CS_CPU_EXCLUSIVE bit is set in @cs->flags. */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_CPU_EXCLUSIVE, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%



/* Report whether the CS_MEM_EXCLUSIVE bit is set in @cs->flags. */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_MEM_EXCLUSIVE, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%



/* Report whether the CS_MEM_HARDWALL bit is set in @cs->flags. */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_MEM_HARDWALL, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%



/* Report whether the CS_SCHED_LOAD_BALANCE bit is set in @cs->flags. */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_SCHED_LOAD_BALANCE, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%



/* Report whether the CS_MEMORY_MIGRATE bit is set in @cs->flags. */
static inline int is_memory_migrate(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_MEMORY_MIGRATE, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%



/* Report whether the CS_SPREAD_PAGE bit is set in @cs->flags. */
static inline int is_spread_page(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_SPREAD_PAGE, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%



/* Report whether the CS_SPREAD_SLAB bit is set in @cs->flags. */
static inline int is_spread_slab(const struct cpuset *cs)
{
	const unsigned long *flagword = &cs->flags;

	return test_bit(CS_SPREAD_SLAB, flagword);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 24 100.00% 1 100.00%
Total 24 100.00% 1 100.00%


/*
 * The root cpuset.  It is statically marked online, CPU-exclusive and
 * memory-exclusive; every other field starts out zeroed.
 */
static struct cpuset top_cpuset = {
	.flags = (1 << CS_MEM_EXCLUSIVE) |
		 (1 << CS_CPU_EXCLUSIVE) |
		 (1 << CS_ONLINE),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 *
 * The macro expands to a loop header followed by an unbraced "if", so the
 * statement that follows it is the body of that "if": children that are not
 * online are skipped, and an "else" placed directly after the body would
 * bind to the macro's hidden "if".
 */

#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)                \
                if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The macro expands to a loop header followed by an unbraced "if", so the
 * statement that follows it is the body of that "if" and only runs for
 * online cpusets.
 */

#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)         \
                if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed by
 * another task, so they are protected by the alloc_lock in the
 * task_struct.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup/cgroup.c
 */

/* Held by any task that intends to modify cpusets; may be held across allocations. */
static DEFINE_MUTEX(cpuset_mutex);
/* Taken around the actual update; holding it alone grants read-only access. */
static DEFINE_SPINLOCK(callback_lock);


/*
 * NOTE(review): presumably the workqueue that runs cpuset_migrate_mm()
 * work; its users are outside this part of the file - confirm.
 */
static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously: cpuset_hotplug_work is a
 * statically declared work item whose handler is cpuset_hotplug_workfn().
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

/*
 * NOTE(review): wait queue presumably tied to attach_in_progress; the
 * waiters and wakers are outside this part of the file - confirm.
 */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Cgroup v2 behavior is used when on default hierarchy or the
 * cgroup_v2_mode flag is set.
 */


static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

Contributors
Person Tokens Prop Commits CommitProp
Waiman Long 25 100.00% 1 100.00%
Total 25 100.00% 1 100.00%

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */


/*
 * cpuset_mount - mount handler for the "cpuset" filesystem type.
 *
 * Forwards the request to the "cgroup" filesystem's ->mount() with a fixed
 * option string; @fs_type and @data are ignored.  Returns whatever the
 * cgroup mount returns, or ERR_PTR(-ENODEV) when the "cgroup" filesystem
 * type cannot be found.
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	char mountopts[] =
		"cpuset,noprefix,"
		"release_agent=/sbin/cpuset_release_agent";
	struct file_system_type *cgroup_fs;
	struct dentry *root;

	cgroup_fs = get_fs_type("cgroup");
	if (!cgroup_fs)
		return ERR_PTR(-ENODEV);

	root = cgroup_fs->mount(cgroup_fs, flags, unused_dev_name, mountopts);
	/* drop the reference obtained by get_fs_type() */
	put_filesystem(cgroup_fs);

	return root;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 62 75.61% 1 33.33%
Al Viro 11 13.41% 1 33.33%
Paul Jackson 9 10.98% 1 33.33%
Total 82 100.00% 3 100.00%


/* The legacy "cpuset" filesystem type; mounting it goes through cpuset_mount(). */
static struct file_system_type cpuset_fs_type = {
	.mount	= cpuset_mount,
	.name	= "cpuset",
};

/*
 * Return in pmask the portion of a cpusets's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */


/*
 * Starting at @cs and climbing through its ancestors, find the first
 * cpuset whose effective_cpus intersects cpu_online_mask and store that
 * intersection in @pmask.  If no cpuset up to and including the root
 * qualifies, @pmask receives cpu_online_mask itself.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	for (;;) {
		if (cpumask_intersects(cs->effective_cpus, cpu_online_mask))
			break;

		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * Even the top cpuset has no online cpu, which is the
			 * result of a race between cpuset_hotplug_work and the
			 * cpu hotplug notifier.  The top cpuset's
			 * effective_cpus is on its way to becoming identical
			 * to cpu_online_mask, so use that.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}

	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 22 32.84% 1 14.29%
Joonwoo Park 21 31.34% 1 14.29%
Paul Menage 13 19.40% 1 14.29%
Li Zefan 8 11.94% 3 42.86%
Tejun Heo 3 4.48% 1 14.29%
Total 67 100.00% 7 100.00%

/*
 * Return in *pmask the portion of a cpusets's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */


/*
 * Starting at @cs and climbing through its ancestors, find the first
 * cpuset whose effective_mems intersects node_states[N_MEMORY] and store
 * that intersection in *@pmask.  There is no NULL check on the climb: it
 * relies on some ancestor (ultimately the top cpuset) having memory.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	struct cpuset *cur = cs;

	for (;;) {
		if (nodes_intersects(cur->effective_mems, node_states[N_MEMORY]))
			break;
		cur = parent_cs(cur);
	}

	nodes_and(*pmask, cur->effective_mems, node_states[N_MEMORY]);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 24 46.15% 1 20.00%
Paul Jackson 21 40.38% 1 20.00%
Tejun Heo 3 5.77% 1 20.00%
Li Zefan 2 3.85% 1 20.00%
Lai Jiangshan 2 3.85% 1 20.00%
Total 52 100.00% 5 100.00%

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */


/*
 * Make @tsk's page-spread and slab-spread flags mirror the corresponding
 * flags of @cs: each one is set when the cpuset flag is set and cleared
 * otherwise.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (!is_spread_page(cs))
		task_clear_spread_page(tsk);
	else
		task_set_spread_page(tsk);

	if (!is_spread_slab(cs))
		task_clear_spread_slab(tsk);
	else
		task_set_spread_slab(tsk);
}

Contributors
Person Tokens Prop Commits CommitProp
Miao Xie 40 76.92% 1 50.00%
Li Zefan 12 23.08% 1 50.00%
Total 52 100.00% 2 100.00%

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */



/*
 * Returns 1 when @p is a subset of @q, 0 otherwise.  All four conditions
 * must hold: p's cpus_allowed and mems_allowed are subsets of q's, and
 * neither exclusive flag is set in p unless it is also set in q.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	if (!cpumask_subset(p->cpus_allowed, q->cpus_allowed))
		return 0;
	if (!nodes_subset(p->mems_allowed, q->mems_allowed))
		return 0;
	if (is_cpu_exclusive(p) > is_cpu_exclusive(q))
		return 0;

	return is_mem_exclusive(p) <= is_mem_exclusive(q);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 57 93.44% 1 33.33%
Paul Jackson 3 4.92% 1 33.33%
Li Zefan 1 1.64% 1 33.33%
Total 61 100.00% 3 100.00%

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */


/*
 * Duplicate @cs byte for byte, then give the copy its own cpus_allowed and
 * effective_cpus masks initialised from @cs.  Returns the copy, or NULL if
 * any allocation fails (everything allocated so far is released first).
 * Release the result with free_trial_cpuset().
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *copy = kmemdup(cs, sizeof(*cs), GFP_KERNEL);

	if (!copy)
		return NULL;

	if (!alloc_cpumask_var(&copy->cpus_allowed, GFP_KERNEL)) {
		kfree(copy);
		return NULL;
	}

	if (!alloc_cpumask_var(&copy->effective_cpus, GFP_KERNEL)) {
		free_cpumask_var(copy->cpus_allowed);
		kfree(copy);
		return NULL;
	}

	/* the masks copied by kmemdup() were replaced above; refill them */
	cpumask_copy(copy->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(copy->effective_cpus, cs->effective_cpus);

	return copy;
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 117 100.00% 3 100.00%
Total 117 100.00% 3 100.00%

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */


/* Release both cpumasks owned by @trial and then @trial itself. */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	free_cpumask_var(trial->effective_cpus);
	kfree(trial);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 30 100.00% 3 100.00%
Total 30 100.00% 3 100.00%

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *                     follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */



static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	/*
	 * Each check below first loads the errno it would fail with into
	 * @ret and then jumps to "out" on failure:
	 *   -EBUSY   a child is not a subset of @trial, or an exclusive
	 *            cpuset's CPUs cannot shrink to the trial mask
	 *   -EACCES  legacy hierarchy and @trial is not a subset of the parent
	 *   -EINVAL  overlap with a sibling where either side is exclusive
	 *   -ENOSPC  a cpuset with tasks would lose all CPUs or all mems
	 */

	/* the child iterators must be used with RCU read locked */
	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 * Only a transition from non-empty to empty is rejected; a mask
	 * that is already empty in @cur may stay empty.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 102 34.93% 3 17.65%
Tejun Heo 80 27.40% 5 29.41%
Paul Menage 46 15.75% 2 11.76%
Li Zefan 32 10.96% 4 23.53%
Juri Lelli 28 9.59% 1 5.88%
Dave Hansen 2 0.68% 1 5.88%
Waiman Long 2 0.68% 1 5.88%
Total 292 100.00% 17 100.00%

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */


/* Nonzero when the effective_cpus masks of @a and @b share at least one CPU. */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	const struct cpumask *cpus_a = a->effective_cpus;
	const struct cpumask *cpus_b = b->effective_cpus;

	return cpumask_intersects(cpus_a, cpus_b);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 25 89.29% 1 33.33%
Li Zefan 3 10.71% 2 66.67%
Total 28 100.00% 3 100.00%


/*
 * Raise @dattr->relax_domain_level to @c's level when @c's is larger;
 * otherwise leave @dattr untouched.
 */
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (c->relax_domain_level > dattr->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
}

Contributors
Person Tokens Prop Commits CommitProp
Hidetoshi Seto 35 100.00% 1 100.00%
Total 35 100.00% 1 100.00%



/*
 * Fold the relax_domain_level of every online, load-balanced cpuset in the
 * subtree rooted at @root_cs (root included) into @dattr.  A cpuset with an
 * empty cpus_allowed is skipped together with its whole subtree.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cgroup_subsys_state *pos;
	struct cpuset *desc;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(desc, pos, root_cs) {
		if (!cpumask_empty(desc->cpus_allowed)) {
			if (is_sched_load_balance(desc))
				update_domain_attr(dattr, desc);
		} else {
			/* no CPUs here: jump past everything below @desc */
			pos = css_rightmost_descendant(pos);
		}
	}
	rcu_read_unlock();
}

Contributors
Person Tokens Prop Commits CommitProp
Lai Jiangshan 51 67.11% 1 20.00%
Tejun Heo 24 31.58% 3 60.00%
Li Zefan 1 1.32% 1 20.00%
Total 76 100.00% 5 100.00%

/* Must be called with cpuset_mutex held.  */


/*
 * nr_cpusets - the reference count of cpusets_enabled_key, plus one for
 * the top-level cpuset.
 */
static inline int nr_cpusets(void)
{
	int key_refs = static_key_count(&cpusets_enabled_key.key);

	return key_refs + 1;
}

Contributors
Person Tokens Prop Commits CommitProp
Paolo Bonzini 21 100.00% 1 100.00%
Total 21 100.00% 1 100.00%

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *         top-down scan of all cpusets.  This scan loads a pointer
 *         to each cpuset marked is_sched_load_balance into the
 *         array 'csa'.  For our purposes, rebuilding the schedulers
 *         sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *         that need to be load balanced, for convenient iterative
 *         access by the subsequent code that finds the best partition,
 *         i.e the set of domains (subsets) of CPUs such that the
 *         cpus_allowed of every cpuset marked is_sched_load_balance
 *         is a subset of one of these domains, while there are as
 *         many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *         the kernel/sched/core.c routine partition_sched_domains() in a
 *         convenient format, that can be easily compared to the prior
 *         value to determine what partition elements (sched domains)
 *         were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *      The triple nested loops below over i, j, k scan over the
 *      load balanced cpusets (using the array of cpuset pointers in
 *      csa[]) looking for pairs of cpusets that have overlapping
 *      cpus_allowed, but which don't have the same 'pn' partition
 *      number, and puts them in the same partition number.  It keeps
 *      looping on the 'restart' label until it can no longer find
 *      any such pairs.
 *
 *      The union of the cpus_allowed masks from the set of
 *      all cpusets having the same 'pn' value then form the one
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */


/*
 * Return value and outputs:
 *   Returns the number of sched domains.  *@domains receives the array of
 *   domain cpumasks, or NULL when it could not be built, in which case 1 is
 *   returned so the caller falls back to the default domain.  *@attributes
 *   receives the matching attribute array and may be NULL even when
 *   *@domains is not.
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));

		goto done;
	}

	/* kmalloc_array() rejects a count * size product that overflows */
	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
			continue;

		if (is_sched_load_balance(cp))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	/* Start with every collected cpuset in a partition of its own. */
	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				/* merge b's whole partition into a's */
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		/*
		 * Only index doms[] once nslot is known to be in range;
		 * doms[ndoms] lies past the end of the array.
		 */
		dp = doms[nslot];

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 423 61.39% 1 4.35%
Hidetoshi Seto 75 10.89% 1 4.35%
Maksim Krasnyanskiy 58 8.42% 1 4.35%
Tejun Heo 35 5.08% 4 17.39%
Li Zefan 27 3.92% 6 26.09%
Rik Van Riel 20 2.90% 1 4.35%
Rusty Russell 14 2.03% 1 4.35%
Frédéric Weisbecker 12 1.74% 1 4.35%
Lai Jiangshan 10 1.45% 2 8.70%
Miao Xie 6 0.87% 1 4.35%
Paul Menage 3 0.44% 1 4.35%
Mel Gorman 2 0.29% 1 4.35%
Ingo Molnar 2 0.29% 1 4.35%
Fabian Frederick 2 0.29% 1 4.35%
Total 689 100.00% 23 100.00%

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */


/*
 * CONFIG_SMP variant.  With cpuset_mutex held and CPU hotplug blocked,
 * regenerate the domain masks and attributes and hand them to the
 * scheduler - unless top_cpuset's effective_cpus is out of step with
 * cpu_active_mask, in which case nothing is done.
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *dattr;
	cpumask_var_t *masks;
	int count;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/*
	 * A mismatch means we raced with CPU hotplug.  Do nothing then, so
	 * that no domain containing an offlined cpu is passed to
	 * partition_sched_domains(); the hotplug work item will rebuild the
	 * sched domains anyway.
	 */
	if (cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) {
		/* Generate domain masks and attrs */
		count = generate_sched_domains(&masks, &dattr);

		/* Have scheduler rebuild the domains */
		partition_sched_domains(count, masks, dattr);
	}

	put_online_cpus();
}
Contributors
Person Tokens Prop Commits CommitProp
Maksim Krasnyanskiy 31 43.06% 1 10.00%
Li Zefan 18 25.00% 2 20.00%
Paul Jackson 9 12.50% 1 10.00%
Tejun Heo 7 9.72% 2 20.00%
Paul Menage 3 4.17% 1 10.00%
Gautham R. Shenoy 2 2.78% 1 10.00%
Hidetoshi Seto 1 1.39% 1 10.00%
Rusty Russell 1 1.39% 1 10.00%
Total 72 100.00% 10 100.00%

#else /* !CONFIG_SMP */


/* !CONFIG_SMP variant: intentionally does nothing. */
static void rebuild_sched_domains_locked(void)
{
}
Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 5 71.43% 1 50.00%
Tejun Heo 2 28.57% 1 50.00%
Total 7 100.00% 2 100.00%

#endif /* CONFIG_SMP */



/*
 * rebuild_sched_domains - rebuild the scheduler domains for callers that
 * do not already hold cpuset_mutex.  Takes the mutex, calls
 * rebuild_sched_domains_locked(), and releases it again.
 */
void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Maksim Krasnyanskiy 11 50.00% 1 16.67%
Tejun Heo 8 36.36% 2 33.33%
Paul Jackson 2 9.09% 2 33.33%
Paul Menage 1 4.55% 1 16.67%
Total 22 100.00% 6 100.00%

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */


static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 44 73.33% 4 40.00%
Li Zefan 6 10.00% 3 30.00%
Miao Xie 6 10.00% 1 10.00%
Cliff Wickman 3 5.00% 1 10.00%
Adrian Bunk 1 1.67% 1 10.00%
Total 60 100.00% 10 100.00%

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */


static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
		if (is_in_v2_mode() && cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
                 * If the effective cpumask of any non-empty cpuset is changed,
                 * we need to rebuild sched domains.
                 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 180 85.71% 5 35.71%
Tejun Heo 11 5.24% 3 21.43%
Paul Jackson 7 3.33% 1 7.14%
Vladimir Davydov 4 1.90% 1 7.14%
Waiman Long 4 1.90% 1 7.14%
Paul Menage 3 1.43% 2 14.29%
Cliff Wickman 1 0.48% 1 7.14%
Total 210 100.00% 14 100.00%

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * Return: 0 on success (including when the mask is unchanged), -EACCES for
 * the top cpuset, -EINVAL when the requested CPUs are not a subset of the
 * top cpuset's, or the error reported by cpulist_parse() or
 * validate_change().
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int err;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * cpulist_parse() fails on an empty mask, so an empty string is
	 * handled separately as "no CPUs".  An empty cpus_allowed is only
	 * acceptable for a cpuset without tasks; validate_change() below
	 * enforces that.
	 */
	if (*buf) {
		err = cpulist_parse(buf, trialcs->cpus_allowed);
		if (err < 0)
			return err;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	} else {
		cpumask_clear(trialcs->cpus_allowed);
	}

	/* An unchanged mask needs no further work */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	err = validate_change(cs, trialcs);
	if (err < 0)
		return err;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	spin_unlock_irq(&callback_lock);

	/* trialcs->cpus_allowed doubles as scratch space from here on */
	update_cpumasks_hier(cs, trialcs->cpus_allowed);
	return 0;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 55 33.54% 3 14.29%
Paul Jackson 43 26.22% 5 23.81%
Li Zefan 41 25.00% 6 28.57%
Lai Jiangshan 9 5.49% 1 4.76%
David P. Quigley 6 3.66% 1 4.76%
Vladimir Davydov 4 2.44% 1 4.76%
Cliff Wickman 3 1.83% 1 4.76%
David Howells 1 0.61% 1 4.76%
Miao Xie 1 0.61% 1 4.76%
Rusty Russell 1 0.61% 1 4.76%
Total 164 100.00% 21 100.00%

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 *
 * One instance of this structure describes a single queued migration
 * request.
 */
struct cpuset_migrate_mm_work {
	struct work_struct	work;	/* work item queued on cpuset_migrate_mm_wq */
	struct mm_struct	*mm;	/* mm to migrate; mmput() by the work function */
	nodemask_t		from;	/* nodes to migrate pages away from */
	nodemask_t		to;	/* nodes to migrate pages to */
};



/*
 * Work function for a queued struct cpuset_migrate_mm_work: perform the
 * page migration described by the request, then drop the mm reference it
 * carries and free the request.
 */
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *req;

	req = container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(req->mm, &req->from, &req->to, MPOL_MF_MOVE_ALL);
	mmput(req->mm);
	kfree(req);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 58 100.00% 1 100.00%
Total 58 100.00% 1 100.00%



/*
 * Queue an asynchronous migration of @mm's pages from the nodes in @from to
 * the nodes in @to on cpuset_migrate_mm_wq.
 *
 * The caller's reference on @mm is consumed either way: it travels with the
 * queued request and is dropped by cpuset_migrate_mm_workfn(), or it is
 * dropped right here when the request cannot be allocated (in which case no
 * migration takes place).
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (!req) {
		mmput(mm);
		return;
	}

	/* the nodemasks are copied, so the caller's may change afterwards */
	req->mm = mm;
	req->from = *from;
	req->to = *to;
	INIT_WORK(&req->work, cpuset_migrate_mm_workfn);
	queue_work(cpuset_migrate_mm_wq, &req->work);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 55 59.14% 1 20.00%
Paul Menage 28 30.11% 1 20.00%
Paul Jackson 10 10.75% 3 60.00%
Total 93 100.00% 5 100.00%



/*
 * Wait until every mm migration queued on cpuset_migrate_mm_wq has
 * completed.
 */
static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 11 84.62% 2 50.00%
Li Zefan 1 7.69% 1 25.00%
Miao Xie 1 7.69% 1 25.00%
Total 13 100.00% 4 100.00%

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	/* the seqcount write section runs with local interrupts disabled */
	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	/*
	 * Three steps, in this order: widen mems_allowed to (old | new),
	 * rebind the task's mempolicy to the new nodes, then narrow
	 * mems_allowed down to exactly the new set.
	 */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}

Contributors
Person Tokens Prop Commits CommitProp
Miao Xie 51 68.00% 2 40.00%
Mel Gorman 16 21.33% 1 20.00%
Peter Zijlstra 6 8.00% 1 20.00%
Paul Jackson 2 2.67% 1 20.00%
Total 75 100.00% 5 100.00%


/*
 * The cpuset whose tasks are currently being rebound by
 * update_tasks_nodemask(), or NULL when no rebind is in progress.  Read by
 * current_cpuset_is_being_rebound(), which compares it against current's
 * cpuset.
 */
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 *
 * For every task that has an mm, the mm's mempolicies are rebound as well
 * and, if @cs has memory migration enabled, a page migration from
 * @cs->old_mems_allowed to the new nodes is queued.  @cs->old_mems_allowed
 * is updated to the new nodes once all tasks have been processed.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_mutex */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		/* tasks without an mm have nothing further to rebind */
		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		/*
		 * cpuset_migrate_mm() takes over the mm reference obtained
		 * above; without migration it is dropped here.
		 */
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 64 41.83% 5 41.67%
Paul Menage 33 21.57% 1 8.33%
Paul Jackson 29 18.95% 1 8.33%
Li Zefan 21 13.73% 3 25.00%
Miao Xie 6 3.92% 2 16.67%
Total 153 100.00% 12 100.00%

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		/* candidate mask: configured nodes limited to the parent's effective set */
		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (is_in_v2_mode() && nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		/*
		 * Pin @cp with a css reference before leaving the RCU
		 * read-side section; the reference is dropped once the RCU
		 * read lock has been re-taken at the bottom of the loop.
		 */
		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		/* effective_mems is published under callback_lock */
		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		/* outside v2 mode the effective mask must equal the configured one */
		WARN_ON(!is_in_v2_mode() &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 154 87.50% 6 46.15%
Tejun Heo 11 6.25% 3 23.08%
Vladimir Davydov 4 2.27% 1 7.69%
Waiman Long 4 2.27% 1 7.69%
Paul Menage 2 1.14% 1 7.69%
Paul Jackson 1 0.57% 1 7.69%
Total 176 100.00% 13 100.00%

/*
 * update_nodemask - handle a user request to change the 'mems' memory
 * placement of a cpuset.
 * @cs: the cpuset to change
 * @trialcs: trial cpuset used to parse and validate the request
 * @buf: node list written by the user; an empty string means "no nodes"
 *
 * Validates the request, installs the new mems_allowed in @cs and then
 * propagates it through the subtree via update_nodemasks_hier(), which
 * updates each task's mems_allowed, rebinds task and vma mempolicies and,
 * for cpusets marked 'memory_migrate', migrates the tasks' pages to the new
 * memory.
 *
 * Call with cpuset_mutex held.  Takes callback_lock while installing the
 * new mask.
 *
 * Returns 0 on success (including when the mask is unchanged), -EACCES for
 * the top cpuset, -EINVAL when the requested nodes are not a subset of the
 * top cpuset's, or the error reported by nodelist_parse() or
 * validate_change().
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int err;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * nodelist_parse() fails on an empty mask, so an empty string is
	 * handled separately as "no nodes".  An empty mems_allowed is ok
	 * iff there are no tasks in the cpuset; validate_change() below
	 * ensures that cpusets with tasks have memory.
	 */
	if (*buf) {
		err = nodelist_parse(buf, trialcs->mems_allowed);
		if (err < 0)
			return err;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed))
			return -EINVAL;
	} else {
		nodes_clear(trialcs->mems_allowed);
	}

	/* Too easy - nothing to do */
	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
		return 0;

	err = validate_change(cs, trialcs);
	if (err < 0)
		return err;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/* trialcs->mems_allowed doubles as scratch space from here on */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
	return err;
}

Contributors
Person Tokens Prop Commits CommitProp
Miao Xie 155 85.16% 2 22.22%
Li Zefan 21 11.54% 4 44.44%
Vladimir Davydov 4 2.20% 1 11.11%
Lai Jiangshan 1 0.55% 1 11.11%
Alban Crequy 1 0.55% 1 11.11%
Total 182 100.00% 9 100.00%



/*
 * current_cpuset_is_being_rebound - is current's cpuset being rebound?
 *
 * Returns non-zero when the cpuset of the current task is the one recorded
 * in cpuset_being_rebound, zero otherwise.  The cpuset lookup is done
 * inside an RCU read-side critical section.
 */
int current_cpuset_is_being_rebound(void)
{
	int rebinding;

	rcu_read_lock();
	rebinding = (task_cs(current) == cpuset_being_rebound);
	rcu_read_unlock();

	return rebinding;
}

Contributors
Person Tokens Prop Commits CommitProp
Gu Zheng 14 50.00% 1 33.33%
Paul Jackson 8 28.57% 1 33.33%
Paul Menage 6 21.43% 1 33.33%
Total 28 100.00% 3 100.00%



/*
 * update_relax_domain_level - set the relax_domain_level of a cpuset.
 * @cs: the cpuset to update
 * @val: requested level
 *
 * With CONFIG_SMP, @val must lie in [-1, sched_domain_level_max); anything
 * else yields -EINVAL.  When the level actually changes and @cs has CPUs
 * and load balancing enabled, the scheduler domains are rebuilt.
 *
 * Returns 0 on success, -EINVAL on an out-of-range value.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	/* writing the current value again is a no-op */
	if (val == cs->relax_domain_level)
		return 0;

	cs->relax_domain_level = val;
	if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
		rebuild_sched_domains_locked();

	return 0;
}

Contributors
Person Tokens Prop Commits CommitProp
Hidetoshi Seto 39 54.93% 1 12.50%
Li Zefan 23 32.39% 3 37.50%
Paul Menage 7 9.86% 2 25.00%
Peter Zijlstra 1 1.41% 1 12.50%
Tejun Heo 1 1.41% 1 12.50%
Total 71 100.00% 8 100.00%

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */


static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 42 72.41% 4 80.00%
Miao Xie 16 27.59% 1 20.00%
Total 58 100.00% 5 100.00%

/*
 * update_flag - set or clear one flag bit of a cpuset
 * @bit: the bit to update (see cpuset_flagbits_t)
 * @cs: the cpuset to update
 * @turning_on: non-zero to set the flag, zero to clear it
 *
 * The change is first applied to a trial copy of @cs and validated; only
 * then are the flags installed in @cs.  A change of the load-balance flag
 * on a cpuset with CPUs triggers a sched-domain rebuild, and a change of
 * either spread flag is pushed out to the cpuset's tasks.
 *
 * Call with cpuset_mutex held.
 *
 * Returns 0 on success, -ENOMEM if the trial cpuset cannot be allocated,
 * or the error reported by validate_change().
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trial;
	bool balance_changed;
	bool spread_changed;
	int ret;

	trial = alloc_trial_cpuset(cs);
	if (!trial)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trial->flags);
	else
		clear_bit(bit, &trial->flags);

	ret = validate_change(cs, trial);
	if (ret < 0)
		goto free_trial;

	/* work out what changed before the new flags are installed */
	balance_changed = is_sched_load_balance(cs) !=
			  is_sched_load_balance(trial);
	spread_changed = is_spread_slab(cs) != is_spread_slab(trial) ||
			 is_spread_page(cs) != is_spread_page(trial);

	spin_lock_irq(&callback_lock);
	cs->flags = trial->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trial->cpus_allowed) && balance_changed)
		rebuild_sched_domains_locked();

	if (spread_changed)
		update_tasks_flags(cs);
free_trial:
	free_trial_cpuset(trial);
	return ret;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 54 29.03% 2 16.67%
Paul Jackson 51 27.42% 4 33.33%
Miao Xie 42 22.58% 1 8.33%
Li Zefan 29 15.59% 2 16.67%
Rakib Mullick 5 2.69% 1 8.33%
Vladimir Davydov 4 2.15% 1 8.33%
Tejun Heo 1 0.54% 1 8.33%
Total 186 100.00% 12 100.00%

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */


#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/*
 * Initialize a frequency meter: set up its lock and reset the timestamp,
 * the pending event count and the filtered value to zero.
 */
static void fmeter_init(struct fmeter *fmp)
{
	spin_lock_init(&fmp->lock);
	fmp->time = 0;
	fmp->val = 0;
	fmp->cnt = 0;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 26 70.27% 1 33.33%
Paul Jackson 11 29.73% 2 66.67%
Total 37 100.00% 3 100.00%

/*
 * Internal meter update - process cnt events and update value.
 *
 * Does nothing unless at least one second has passed since the previous
 * update.  Otherwise the value is decayed once per elapsed second (capped
 * at FM_MAXTICKS), the pending event count is folded in, and the count is
 * reset.  Both callers in this file invoke it with fmp->lock held.
 */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now = ktime_get_seconds();
	u32 elapsed = now - fmp->time;
	u32 tick;

	if (!elapsed)
		return;

	/* decay the old value once for every elapsed tick */
	elapsed = min(FM_MAXTICKS, elapsed);
	for (tick = 0; tick < elapsed; tick++)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	/* fold in the events gathered since the last update */
	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 67 69.07% 1 33.33%
Paul Jackson 19 19.59% 1 33.33%
Arnd Bergmann 11 11.34% 1 33.33%
Total 97 100.00% 3 100.00%

/*
 * Process any previous ticks, then bump cnt by one (times scale).
 *
 * The pending count is clamped to FM_MAXCNT.  Runs under fmp->lock.
 */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	/* one event is worth FM_SCALE in the fixed-point count */
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 27 57.45% 1 50.00%
Paul Jackson 20 42.55% 1 50.00%
Total 47 100.00% 2 100.00%

/*
 * Process any previous ticks, then return current value.
 *
 * The meter is brought up to date and sampled under fmp->lock.
 */
static int fmeter_getrate(struct fmeter *fmp)
{
	int rate;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	rate = fmp->val;
	spin_unlock(&fmp->lock);

	return rate;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 28 63.64% 1 50.00%
Paul Jackson 16 36.36% 1 50.00%
Total 44 100.00% 2 100.00%


/*
 * Cpuset of the first task in the taskset being migrated, recorded by
 * cpuset_can_attach() and read back by cpuset_attach().
 */
static struct cpuset *cpuset_attach_old_cs;

/*
 * Called by cgroups to determine if a cpuset is usable as the destination
 * of a task migration.  Takes cpuset_mutex for the duration of the checks.
 *
 * Fails with -ENOSPC when not in v2 mode and the destination has no CPUs or
 * no memory nodes, or with the error from task_can_attach() or
 * security_task_setscheduler() for the first task that is refused.  On
 * success the destination's attach_in_progress count is raised and 0 is
 * returned.
 */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!is_in_v2_mode() &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	/* every task in the set must pass both checks */
	cgroup_taskset_for_each(task, css, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 96 64.00% 8 47.06%
Paul Menage 14 9.33% 1 5.88%
Paul Jackson 14 9.33% 1 5.88%
Juri Lelli 9 6.00% 1 5.88%
Ben Blum 7 4.67% 2 11.76%
Li Zefan 5 3.33% 2 11.76%
David Rientjes 3 2.00% 1 5.88%
Waiman Long 2 1.33% 1 5.88%
Total 150 100.00% 17 100.00%



static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct cpuset *cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);
	css_cs(css)->attach_in_progress--;
	mutex_unlock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 45 80.36% 4 66.67%
Ben Blum 11 19.64% 2 33.33%
Total 56 100.00% 6 100.00%

/*
 * Scratch cpumask holding the CPUs that tasks being attached are moved
 * onto.  Protected by cpuset_mutex.  cpus_attach is used only by
 * cpuset_attach() but we can't allocate it dynamically there.  Define it
 * global and allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;



/*
 * cpuset_attach - move the tasks of @tset into their destination cpuset.
 * @tset: the taskset being migrated
 *
 * Under cpuset_mutex: gives every task in the set the destination's CPUs,
 * memory nodes and spread flags, then rebinds the mm of every threadgroup
 * leader and, if the destination has memory migration enabled, queues a
 * page migration away from the old cpuset's old_mems_allowed.  Finally
 * drops the attach_in_progress count raised by cpuset_can_attach() and
 * wakes waiters on cpuset_attach_wq when it reaches zero.
 */
static void cpuset_attach(struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_to;
	struct task_struct *task;
	struct task_struct *leader;
	struct cgroup_subsys_state *css;
	struct cpuset *cs;
	/* source cpuset, recorded earlier by cpuset_can_attach() */
	struct cpuset *oldcs = cpuset_attach_old_cs;

	cgroup_taskset_first(tset, &css);
	cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, css, tset) {
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm for all threadgroup leaders. This is expensive and may
	 * sleep and should be moved outside migration path proper.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (mm) {
			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

			/*
			 * old_mems_allowed is the same with mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug.  In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodesets that we
			 * migrate mm from.
			 *
			 * cpuset_migrate_mm() takes over the mm reference;
			 * without migration it is dropped here.
			 */
			if (is_memory_migrate(cs))
				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
						  &cpuset_attach_nodemask_to);
			else
				mmput(mm);
		}
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	/* the last in-flight attach wakes up waiters on cpuset_attach_wq */
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	mutex_unlock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 126 55.02% 11 50.00%
Paul Menage 37 16.16% 1 4.55%
Ben Blum 34 14.85% 2 9.09%
Li Zefan 21 9.17% 7 31.82%
Paul Jackson 11 4.80% 1 4.55%
Total 229 100.00% 22 100.00%

/*
 * The various types of files and directories in a cpuset file system.
 *
 * The value is stored in a control file's cftype 'private' field and is
 * what the common read/write/show handlers switch on.
 */
typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;



/*
 * cpuset_write_u64 - write handler for the u64-valued cpuset control files.
 * @css: css of the cpuset being written to
 * @cft: control file descriptor; its 'private' field selects the file type
 * @val: value written by the user
 *
 * Runs under cpuset_mutex.  All supported files except
 * memory_pressure_enabled map onto a cpuset flag bit that is updated
 * through update_flag(); memory_pressure_enabled sets the global
 * cpuset_memory_pressure_enabled switch instead.
 *
 * Returns 0 on success, -ENODEV if the cpuset is no longer online, -EINVAL
 * for a file type this handler does not serve, or the error reported by
 * update_flag().
 */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int ret = -ENODEV;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto unlock;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		ret = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		ret = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		ret = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		ret = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		ret = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		/* global switch, not a per-cpuset flag */
		cpuset_memory_pressure_enabled = !!val;
		ret = 0;
		break;
	case FILE_SPREAD_PAGE:
		ret = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		ret = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		ret = -EINVAL;
		break;
	}
unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 127 62.25% 4 40.00%
Paul Jackson 33 16.18% 2 20.00%
Tejun Heo 26 12.75% 2 20.00%
Hidetoshi Seto 10 4.90% 1 10.00%
Li Zefan 8 3.92% 1 10.00%
Total 204 100.00% 10 100.00%



/*
 * cpuset_write_s64 - write handler for the s64-valued cpuset control files.
 * @css: css of the cpuset being written to
 * @cft: control file descriptor; its 'private' field selects the file type
 * @val: value written by the user
 *
 * Runs under cpuset_mutex.  The only file served is the sched relax domain
 * level.
 *
 * Returns -ENODEV if the cpuset is no longer online, -EINVAL for any other
 * file type, or the result of update_relax_domain_level().
 */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int ret;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		ret = -ENODEV;
	else if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
		ret = update_relax_domain_level(cs, val);
	else
		ret = -EINVAL;
	mutex_unlock(&cpuset_mutex);

	return ret;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 70 72.92% 2 50.00%
Tejun Heo 26 27.08% 2 50.00%
Total 96 100.00% 4 100.00%

/*
 * Common handling for a write to a "cpus" or "mems" file.
 *
 * Strips surrounding whitespace from @buf, then, under cpuset_mutex, applies
 * it through update_cpumask() or update_nodemask() depending on the file's
 * 'private' type, using a freshly allocated trial cpuset.
 *
 * Returns @nbytes on success, -ENODEV if the cpuset is no longer online,
 * -ENOMEM if the trial cpuset cannot be allocated, -EINVAL for any other
 * file type, or the error reported by the update function.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;	/* reported when @cs turns out to be offline */

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't keep removing tasks added after
	 * execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	/* undo the css_get() and active-protection break from above */
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	/* wait for any mm migrations queued on cpuset_migrate_mm_wq */
	flush_workqueue(cpuset_migrate_mm_wq);
	/* a zero retval means success: report the whole write as consumed */
	return retval ?: nbytes;
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 94 47.00% 6 66.67%
Paul Menage 60 30.00% 1 11.11%
Li Zefan 46 23.00% 2 22.22%
Total 200 100.00% 9 100.00%

/*
 * cpuset_common_seq_show - print a cpuset's CPU or memory-node list.
 *
 * Emits, in list format followed by a newline, the configured or effective
 * cpus/mems of the cpuset, selected by the file's 'private' type.  The mask
 * is read under callback_lock.  Returns 0 on success or -EINVAL for a file
 * type this handler does not serve.
 *
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 61 42.36% 4 50.00%
Paul Jackson 33 22.92% 1 12.50%
Li Zefan 24 16.67% 1 12.50%
Paul Menage 22 15.28% 1 12.50%
Vladimir Davydov 4 2.78% 1 12.50%
Total 144 100.00% 8 100.00%



/*
 * cpuset_read_u64 - read handler for the u64-valued cpuset control files.
 * @css: css of the cpuset being read
 * @cft: control file descriptor; its 'private' field selects the file type
 *
 * Returns the state of the flag the file stands for, the global
 * memory-pressure-enabled switch, or the current memory pressure rate.
 * Being called for any other file type is a bug and hits BUG().
 */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	u64 val = 0;	/* only reached with this value if BUG() returns */

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		val = is_cpu_exclusive(cs);
		break;
	case FILE_MEM_EXCLUSIVE:
		val = is_mem_exclusive(cs);
		break;
	case FILE_MEM_HARDWALL:
		val = is_mem_hardwall(cs);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		val = is_sched_load_balance(cs);
		break;
	case FILE_MEMORY_MIGRATE:
		val = is_memory_migrate(cs);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		val = cpuset_memory_pressure_enabled;
		break;
	case FILE_MEMORY_PRESSURE:
		val = fmeter_getrate(&cs->fmeter);
		break;
	case FILE_SPREAD_PAGE:
		val = is_spread_page(cs);
		break;
	case FILE_SPREAD_SLAB:
		val = is_spread_slab(cs);
		break;
	default:
		BUG();
	}

	return val;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 120 93.75% 2 50.00%
Maksim Krasnyanskiy 4 3.12% 1 25.00%
Tejun Heo 4 3.12% 1 25.00%
Total 128 100.00% 4 100.00%



/*
 * cpuset_read_s64 - read handler for the s64-valued cpuset control files.
 * @css: css of the cpuset being read
 * @cft: control file descriptor; its 'private' field selects the file type
 *
 * The only file served is the sched relax domain level, whose current
 * value is returned.  Being called for any other file type is a bug and
 * hits BUG().
 */
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
		return cs->relax_domain_level;

	BUG();

	/* Unreachable but makes gcc happy */
	return 0;
}


Contributors
Person Tokens Prop Commits CommitProp
Paul Menage 47 85.45% 1 33.33%
Maksim Krasnyanskiy 4 7.27% 1 33.33%
Tejun Heo 4 7.27% 1 33.33%
Total 55 100.00% 3 100.00%

/*
 * for the common functions, 'private' gives the type of file
 */


static struct cftype files[] = {
	/*
	 * Configured cpu and memory-node lists: shown through
	 * cpuset_common_seq_show and written through cpuset_write_resmask.
	 * The write length limit scales with NR_CPUS / MAX_NUMNODES.
	 */
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
        },

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
        },

	/* Effective masks: no write handler, so these are read-only. */
	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
        },

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
        },

	/* Per-cpuset flags, read and written as u64 values. */
	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
        },

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
        },

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
        },

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
        },

	/* The only signed file: uses the s64 read/write handlers. */
	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
        },

	/* No write handler: memory_pressure can only be read. */
	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
        },

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
        },

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
        },

	/* Flagged CFTYPE_ONLY_ON_ROOT; backed by a single global variable. */
	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
        },

	{ }	/* terminate */
};

/*
 *      cpuset_css_alloc - allocate a cpuset css
 *      parent_css:     css of the parent cpuset; when NULL, the css of
 *                      the preallocated top_cpuset is returned instead
 */


static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *new_cs;

	/* No parent means the root: hand back the global top_cpuset. */
	if (!parent_css)
		return &top_cpuset.css;

	new_cs = kzalloc(sizeof(*new_cs), GFP_KERNEL);
	if (!new_cs)
		return ERR_PTR(-ENOMEM);

	if (!alloc_cpumask_var(&new_cs->cpus_allowed, GFP_KERNEL))
		goto err_free_cs;
	if (!alloc_cpumask_var(&new_cs->effective_cpus, GFP_KERNEL))
		goto err_free_cpus_allowed;

	/* A fresh cpuset starts out with no cpus and no memory nodes. */
	cpumask_clear(new_cs->cpus_allowed);
	cpumask_clear(new_cs->effective_cpus);
	nodes_clear(new_cs->mems_allowed);
	nodes_clear(new_cs->effective_mems);

	/* Defaults: load balancing on, no relax-domain override. */
	set_bit(CS_SCHED_LOAD_BALANCE, &new_cs->flags);
	new_cs->relax_domain_level = -1;
	fmeter_init(&new_cs->fmeter);

	return &new_cs->css;

err_free_cpus_allowed:
	free_cpumask_var(new_cs->cpus_allowed);
err_free_cs:
	kfree(new_cs);
	return ERR_PTR(-ENOMEM);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 70 41.67% 2 16.67%
Paul Jackson 41 24.40% 3 25.00%
Paul Menage 39 23.21% 1 8.33%
Tejun Heo 10 5.95% 4 33.33%
Mike Travis 5 2.98% 1 8.33%
Hidetoshi Seto 3 1.79% 1 8.33%
Total 168 100.00% 12 100.00%



/*
 * Bring the cpuset behind @css online.
 *
 * Nothing is done for a cpuset without a parent.  Otherwise, under
 * cpuset_mutex, the cpuset is marked CS_ONLINE and inherits the parent's
 * spread_page/spread_slab flags.  In v2 mode the effective masks are
 * copied from the parent.  If CGRP_CPUSET_CLONE_CHILDREN is set on the
 * cgroup and no sibling is cpu- or mem-exclusive, the parent's configured
 * masks are cloned as well.
 *
 * Always returns 0.
 */
static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	if (!parent)
		return 0;

	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	cpuset_inc();

	/* mask updates are published under callback_lock */
	spin_lock_irq(&callback_lock);
	if (is_in_v2_mode()) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
	}
	spin_unlock_irq(&callback_lock);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			/* leave the RCU section before bailing out */
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	/* both configured and effective masks take the parent's *_allowed */
	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return 0;
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 174 67.97% 8 47.06%
Li Zefan 51 19.92% 2 11.76%
Paul Menage 15 5.86% 1 5.88%
Vladimir Davydov 7 2.73% 1 5.88%
Paul Jackson 4 1.56% 2 11.76%
Mel Gorman 2 0.78% 1 5.88%
Waiman Long 2 0.78% 1 5.88%
Dan Carpenter 1 0.39% 1 5.88%
Total 256 100.00% 17 100.00%

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().
 */



static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* switch load balancing off first, while the cpuset is still online */
	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	/* undo the cpuset_inc() and CS_ONLINE set by cpuset_css_online() */
	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	mutex_unlock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 28 45.16% 4 40.00%
Paul Jackson 18 29.03% 3 30.00%
Paul Menage 14 22.58% 2 20.00%
Mel Gorman 2 3.23% 1 10.00%
Total 62 100.00% 10 100.00%



/*
 * Release the memory of the cpuset behind @css: both cpumasks obtained
 * with alloc_cpumask_var() and then the cpuset structure itself.
 */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *dead = css_cs(css);

	free_cpumask_var(dead->cpus_allowed);
	free_cpumask_var(dead->effective_cpus);
	kfree(dead);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 20 50.00% 2 28.57%
Li Zefan 14 35.00% 2 28.57%
Paul Jackson 4 10.00% 2 28.57%
Paul Menage 2 5.00% 1 14.29%
Total 40 100.00% 7 100.00%



/*
 * Bind callback: reset top_cpuset's configured masks to suit the current
 * mode.  In v2 mode they become the possible cpus/nodes; otherwise they
 * mirror top_cpuset's effective masks.  @root_css is not used.
 */
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
	mutex_lock(&cpuset_mutex);
	spin_lock_irq(&callback_lock);

	if (!is_in_v2_mode()) {
		/* legacy: configured masks follow the effective ones */
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
		cpumask_copy(top_cpuset.cpus_allowed,
			     top_cpuset.effective_cpus);
	} else {
		top_cpuset.mems_allowed = node_possible_map;
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
	}

	spin_unlock_irq(&callback_lock);
	mutex_unlock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 73 92.41% 1 33.33%
Vladimir Davydov 4 5.06% 1 33.33%
Waiman Long 2 2.53% 1 33.33%
Total 79 100.00% 3 100.00%

/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */


static void cpuset_fork(struct task_struct *task)
{
	/* Tasks in the root cpuset are left alone. */
	if (!task_css_is_root(task, cpuset_cgrp_id)) {
		/* Re-sync the new task with the forking task's masks. */
		set_cpus_allowed_ptr(task, &current->cpus_allowed);
		task->mems_allowed = current->mems_allowed;
	}
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 38 97.44% 1 50.00%
Wei Yongjun 1 2.56% 1 50.00%
Total 39 100.00% 2 100.00%


/*
 * cgroup subsystem descriptor for cpusets: lifecycle, attach, bind and
 * fork callbacks, plus the control-file table.
 */
struct cgroup_subsys cpuset_cgrp_subsys = {
	/* css lifecycle */
	.css_alloc	= cpuset_css_alloc,
	.css_online	= cpuset_css_online,
	.css_offline	= cpuset_css_offline,
	.css_free	= cpuset_css_free,
	/* task migration */
	.can_attach	= cpuset_can_attach,
	.cancel_attach	= cpuset_cancel_attach,
	.attach		= cpuset_attach,
	.post_attach	= cpuset_post_attach,
	.bind		= cpuset_bind,
	.fork		= cpuset_fork,
	/* control files are registered only as legacy cftypes */
	.legacy_cftypes	= files,
	.early_init	= true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 **/



int __init cpuset_init(void)
{
	int ret;

	/* Failing to allocate the root cpumasks at boot is fatal. */
	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));

	/* The root cpuset starts out allowing every cpu and every node. */
	cpumask_setall(top_cpuset.cpus_allowed);
	cpumask_setall(top_cpuset.effective_cpus);
	nodes_setall(top_cpuset.mems_allowed);
	nodes_setall(top_cpuset.effective_mems);

	top_cpuset.relax_domain_level = -1;
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	fmeter_init(&top_cpuset.fmeter);

	ret = register_filesystem(&cpuset_fs_type);
	if (ret < 0)
		return ret;

	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

	return 0;
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 42 33.33% 3 21.43%
Paul Jackson 36 28.57% 4 28.57%
Paul Menage 15 11.90% 1 7.14%
Nicholas Mc Guire 9 7.14% 1 7.14%
Miao Xie 8 6.35% 1 7.14%
Hidetoshi Seto 7 5.56% 1 7.14%
Mike Travis 5 3.97% 1 7.14%
Dave Hansen 2 1.59% 1 7.14%
Yinghai Lu 2 1.59% 1 7.14%
Total 126 100.00% 14 100.00%

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */


static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *dest = cs;

	/*
	 * Climb until an ancestor with both cpus and memory nodes is
	 * found (the top cpuset has online cpus, so the walk terminates).
	 */
	do {
		dest = parent_cs(dest);
	} while (cpumask_empty(dest->cpus_allowed) ||
		 nodes_empty(dest->mems_allowed));

	/* A zero return from the transfer means there is nothing to report. */
	if (!cgroup_transfer_tasks(dest->css.cgroup, cs->css.cgroup))
		return;

	pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
	pr_cont_cgroup_name(cs->css.cgroup);
	pr_cont("\n");
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 40 47.06% 3 37.50%
Cliff Wickman 34 40.00% 1 12.50%
Paul Jackson 7 8.24% 1 12.50%
Paul Menage 2 2.35% 1 12.50%
Li Zefan 1 1.18% 1 12.50%
Fabian Frederick 1 1.18% 1 12.50%
Total 85 100.00% 8 100.00%


/*
 * Legacy-hierarchy half of cpuset_hotplug_update_tasks().
 *
 * Installs @new_cpus / @new_mems as both the configured and the effective
 * masks of @cs, updates the tasks for whichever mask changed and is still
 * non-empty, and, if @cs ended up without cpus or without memory nodes,
 * moves its tasks to an ancestor.
 *
 * Called with cpuset_mutex held; the mutex is dropped around the task
 * transfer and re-taken before returning.
 */
static void

hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	/*
	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		update_tasks_cpumask(cs);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		update_tasks_nodemask(cs);

	/* sample emptiness while cpuset_mutex is still held */
	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	mutex_unlock(&cpuset_mutex);

	/*
	 * Move tasks to the nearest ancestor with execution resources,
	 * This is full cgroup operation which will also call back into
	 * cpuset. Should be done outside any lock.
	 */
	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);

	/* the caller expects to still own cpuset_mutex on return */
	mutex_lock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 86 58.90% 8 50.00%
Srivatsa S. Bhat 31 21.23% 2 12.50%
Tejun Heo 15 10.27% 2 12.50%
Cliff Wickman 5 3.42% 1 6.25%
Paul Jackson 4 2.74% 1 6.25%
Vladimir Davydov 4 2.74% 1 6.25%
Miao Xie 1 0.68% 1 6.25%
Total 146 100.00% 16 100.00%


/*
 * v2-mode half of cpuset_hotplug_update_tasks().
 *
 * Only the effective masks of @cs are rewritten.  An empty @new_cpus or
 * @new_mems is first replaced (in place) by the parent's effective mask.
 * Tasks are then updated for whichever mask the caller flagged as changed.
 */
static void
hotplug_update_tasks(struct cpuset *cs,
		     struct cpumask *new_cpus, nodemask_t *new_mems,
		     bool cpus_updated, bool mems_updated)
{
	struct cpuset *parent = parent_cs(cs);

	/* An empty result falls back to what the parent effectively has. */
	if (cpumask_empty(new_cpus))
		cpumask_copy(new_cpus, parent->effective_cpus);
	if (nodes_empty(*new_mems))
		*new_mems = parent->effective_mems;

	spin_lock_irq(&callback_lock);
	cs->effective_mems = *new_mems;
	cpumask_copy(cs->effective_cpus, new_cpus);
	spin_unlock_irq(&callback_lock);

	if (cpus_updated)
		update_tasks_cpumask(cs);
	if (mems_updated)
		update_tasks_nodemask(cs);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 81 74.31% 5 45.45%
Cliff Wickman 9 8.26% 1 9.09%
Paul Jackson 7 6.42% 2 18.18%
Tejun Heo 6 5.50% 1 9.09%
Vladimir Davydov 4 3.67% 1 9.09%
Miao Xie 2 1.83% 1 9.09%
Total 109 100.00% 11 100.00%

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */


static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
	/*
	 * Scratch masks; they are only written below with cpuset_mutex held.
	 * NOTE(review): presumably static to keep them off the stack.
	 */
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching. We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	/* what @cs may still use: its configuration limited by the parent */
	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);

	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		/* may drop and re-take cpuset_mutex internally */
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

	mutex_unlock(&cpuset_mutex);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 127 78.40% 3 30.00%
Tejun Heo 20 12.35% 2 20.00%
Paul Jackson 8 4.94% 1 10.00%
Paul Menage 3 1.85% 1 10.00%
Waiman Long 2 1.23% 1 10.00%
Cliff Wickman 1 0.62% 1 10.00%
Miao Xie 1 0.62% 1 10.00%
Total 162 100.00% 10 100.00%


/*
 * Set by cpuset_force_rebuild(); checked and cleared by
 * cpuset_hotplug_workfn(), which then rebuilds the sched domains even if
 * the cpu mask did not change.
 */
static bool force_rebuild;



/* Request a sched-domain rebuild on the next run of the hotplug work. */
void cpuset_force_rebuild(void)
{
	force_rebuild = true;
}

Contributors
Person Tokens Prop Commits CommitProp
Peter Zijlstra 11 100.00% 1 100.00%
Total 11 100.00% 1 100.00%

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */


static void cpuset_hotplug_workfn(struct work_struct *work)
{
	/* scratch masks, filled in below with cpuset_mutex held */
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	/* in v2 mode only the effective masks of top_cpuset are rewritten */
	bool on_dfl = is_in_v2_mode();

	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	mutex_unlock(&cpuset_mutex);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			/* skip the root itself and anything we cannot pin */
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			/*
			 * Leave the RCU section for the update, which takes
			 * cpuset_mutex; the reference taken above is only
			 * dropped once RCU has been re-entered.
			 */
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
	}
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 92 35.80% 9 36.00%
Li Zefan 81 31.52% 6 24.00%
Srivatsa S. Bhat 26 10.12% 1 4.00%
Paul Jackson 18 7.00% 2 8.00%
Maksim Krasnyanskiy 14 5.45% 1 4.00%
Vladimir Davydov 8 3.11% 1 4.00%
Peter Zijlstra 8 3.11% 1 4.00%
Miao Xie 5 1.95% 1 4.00%
Li Zhong 2 0.78% 1 4.00%
Waiman Long 2 0.78% 1 4.00%
Cliff Wickman 1 0.39% 1 4.00%
Total 257 100.00% 25 100.00%



/* Queue the cpuset hotplug work item; the real processing happens there. */
void cpuset_update_active_cpus(void)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 */
	schedule_work(&cpuset_hotplug_work);
}

Contributors
Person Tokens Prop Commits CommitProp
Tejun Heo 12 85.71% 2 50.00%
Rakib Mullick 1 7.14% 1 25.00%
Peter Zijlstra 1 7.14% 1 25.00%
Total 14 100.00% 4 100.00%



/* Flush cpuset_hotplug_work, the item queued by cpuset_update_active_cpus(). */
void cpuset_wait_for_hotplug(void)
{
	flush_work(&cpuset_hotplug_work);
}

Contributors
Person Tokens Prop Commits CommitProp
Peter Zijlstra 13 100.00% 1 100.00%
Total 13 100.00% 1 100.00%

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */


static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	/*
	 * @self, @action and @arg are not examined: every notification
	 * simply queues the shared hotplug work item.
	 */
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}

Contributors
Person Tokens Prop Commits CommitProp
Miao Xie 16 57.14% 1 16.67%
Paul Jackson 4 14.29% 1 16.67%
Al Viro 3 10.71% 1 16.67%
Dmitry Adamushko 2 7.14% 1 16.67%
Tejun Heo 2 7.14% 1 16.67%
Maksim Krasnyanskiy 1 3.57% 1 16.67%
Total 28 100.00% 6 100.00%


/* Notifier block passed to register_hotmemory_notifier() in cpuset_init_smp(). */
static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,		/* ??! - the reason for this value is not recorded here */
};

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */


void __init cpuset_init_smp(void)
{
	/* cpus: configured and effective masks both start at the active set */
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);

	/* mems: likewise, seeded from the nodes that have memory */
	top_cpuset.mems_allowed = node_states[N_MEMORY];
	top_cpuset.effective_mems = node_states[N_MEMORY];
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

	/* Boot cannot continue without the mm-migration workqueue. */
	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}

Contributors
Person Tokens Prop Commits CommitProp
Li Zefan 30 41.10% 3 30.00%
Paul Jackson 17 23.29% 1 10.00%
Tejun Heo 15 20.55% 1 10.00%
Miao Xie 3 4.11% 1 10.00%
Andrew Morton 3 4.11% 1 10.00%
Christoph Lameter 3 4.11% 1 10.00%
Peter Zijlstra 1 1.37% 1 10.00%
Lai Jiangshan 1 1.37% 1 10.00%
Total 73 100.00% 10 100.00%

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/



void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	unsigned long irq_state;
	struct cpuset *cs;

	spin_lock_irqsave(&callback_lock, irq_state);
	rcu_read_lock();

	/* the task's cpuset is looked up inside the RCU read section */
	cs = task_cs(tsk);
	guarantee_online_cpus(cs, pmask);

	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, irq_state);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 20 39.22% 1 12.50%
Vladimir Davydov 12 23.53% 1 12.50%
Li Zefan 7 13.73% 3 37.50%
Mike Travis 5 9.80% 1 12.50%
Oleg Nesterov 4 7.84% 1 12.50%
Paul Menage 3 5.88% 1 12.50%
Total 51 100.00% 8 100.00%



/*
 * Set @tsk's allowed cpus to the effective cpus of its cpuset.  The
 * cpuset is read under RCU only, without callback_lock; see the comment
 * in the body for why that is acceptable.
 */
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	rcu_read_lock();
	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
}

Contributors
Person Tokens Prop Commits CommitProp
Oleg Nesterov 22 75.86% 1 25.00%
Li Zefan 5 17.24% 2 50.00%
Peter Zijlstra 2 6.90% 1 25.00%
Total 29 100.00% 4 100.00%



/* Boot-time helper: allow the current task to use every memory node. */
void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 11 73.33% 1 33.33%
Mike Travis 3 20.00% 1 33.33%
Rasmus Villemoes 1 6.67% 1 33.33%
Total 15 100.00% 3 100.00%

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/



nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	unsigned long irq_state;
	struct cpuset *cs;
	nodemask_t online;

	spin_lock_irqsave(&callback_lock, irq_state);
	rcu_read_lock();

	/* the task's cpuset is looked up inside the RCU read section */
	cs = task_cs(tsk);
	guarantee_online_mems(cs, &online);

	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, irq_state);

	/* returned by value */
	return online;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 33 62.26% 1 20.00%
Vladimir Davydov 12 22.64% 1 20.00%
Li Zefan 5 9.43% 2 40.00%
Paul Menage 3 5.66% 1 20.00%
Total 53 100.00% 5 100.00%

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */


int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	/* current->mems_allowed is read directly; no lock is taken here */
	return nodes_intersects(*nodemask, current->mems_allowed);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 13 65.00% 1 50.00%
Mel Gorman 7 35.00% 1 50.00%
Total 20 100.00% 2 100.00%

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */


static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	for (;;) {
		struct cpuset *up;

		/* Stop at the first mem_exclusive or mem_hardwall cpuset. */
		if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
			break;

		/* No parent left: @cs is the root, so settle for it. */
		up = parent_cs(cs);
		if (!up)
			break;

		cs = up;
	}

	return cs;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 29 67.44% 1 33.33%
Paul Menage 8 18.60% 1 33.33%
Tejun Heo 6 13.95% 1 33.33%
Total 43 100.00% 3 100.00%

/**
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes.  If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *      in_interrupt - any node ok (current task context irrelevant)
 *      GFP_ATOMIC   - any node ok
 *      tsk_is_oom_victim   - any node ok
 *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *      GFP_USER     - only nodes in current tasks mems allowed ok.
 */


bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;		/* current cpuset ancestors */
	bool allowed;			/* is allocation on @node allowed? */
	unsigned long flags;

	/* Fast paths that need no lock. */
	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	/* bool matches the return type, so any non-zero test result is true */
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 81 62.31% 3 18.75%
Vladimir Davydov 13 10.00% 2 12.50%
David Rientjes 13 10.00% 2 12.50%
Vlastimil Babka 6 4.62% 1 6.25%
Bob Picco 5 3.85% 1 6.25%
Paul Menage 4 3.08% 2 12.50%
Li Zefan 3 2.31% 2 12.50%
Michal Hocko 2 1.54% 1 6.25%
Linus Torvalds 2 1.54% 1 6.25%
Al Viro 1 0.77% 1 6.25%
Total 130 100.00% 16 100.00%

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */



static int cpuset_spread_node(int *rotor)
{
	/* Step the rotor to the node after it in current's mems_allowed. */
	int next = next_node_in(*rotor, current->mems_allowed);

	*rotor = next;
	return next;
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 13 54.17% 1 33.33%
Jack Steiner 7 29.17% 1 33.33%
Andrew Morton 4 16.67% 1 33.33%
Total 24 100.00% 3 100.00%



/*
 * Advance current's cpuset_mem_spread_rotor and return the resulting
 * node.  A rotor still at NUMA_NO_NODE is first seeded with a random
 * node from current->mems_allowed.
 */
int cpuset_mem_spread_node(void)
{
	int *rotor = &current->cpuset_mem_spread_rotor;

	if (*rotor == NUMA_NO_NODE)
		*rotor = node_random(&current->mems_allowed);

	return cpuset_spread_node(rotor);
}

Contributors
Person Tokens Prop Commits CommitProp
Michal Hocko 20 55.56% 1 50.00%
Jack Steiner 16 44.44% 1 50.00%
Total 36 100.00% 2 100.00%



/*
 * Advance current's cpuset_slab_spread_rotor and return the resulting
 * node.  A rotor still at NUMA_NO_NODE is first seeded with a random
 * node from current->mems_allowed.
 */
int cpuset_slab_spread_node(void)
{
	int *rotor = &current->cpuset_slab_spread_rotor;

	if (*rotor == NUMA_NO_NODE)
		*rotor = node_random(&current->mems_allowed);

	return cpuset_spread_node(rotor);
}

Contributors
Person Tokens Prop Commits CommitProp
Michal Hocko 20 55.56% 1 50.00%
Jack Steiner 16 44.44% 1 50.00%
Total 36 100.00% 2 100.00%


EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/



int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	/* compares the two tasks' own mems_allowed fields; no lock is taken */
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 20 68.97% 2 66.67%
David Rientjes 9 31.03% 1 33.33%
Total 29 100.00% 3 100.00%

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */


void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	/* the cpuset lookup and the use of its cgroup stay inside one RCU section */
	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	/* one log line built from three pieces: comm, cgroup name, node list */
	pr_info("%s cpuset=", current->comm);
	pr_cont_cgroup_name(cgrp);
	pr_cont(" mems_allowed=%*pbl\n",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

Contributors
Person Tokens Prop Commits CommitProp
David Rientjes 27 48.21% 2 28.57%
Li Zefan 14 25.00% 2 28.57%
Tejun Heo 14 25.00% 2 28.57%
Fabian Frederick 1 1.79% 1 14.29%
Total 56 100.00% 7 100.00%

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */


/* Value reported by cpuset_read_u64() for FILE_MEMORY_PRESSURE_ENABLED. */
int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/



void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 16 66.67% 1 33.33%
Li Zefan 4 16.67% 1 33.33%
Paul Menage 4 16.67% 1 33.33%
Total 24 100.00% 3 100.00%

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print the task's cpuset path into a seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - The task's cpuset css is pinned with task_get_css() while its
 *    cgroup path is formatted, and released with css_put() afterwards.
 *  - The path is rendered relative to current's cgroup namespace.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -ENAMETOOLONG if the path does not fit in PATH_MAX bytes, or the
 * negative value returned by cgroup_path_ns().
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	struct cgroup_subsys_state *css;
	char *path;
	int ret;

	path = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!path)
		return -ENOMEM;

	css = task_get_css(tsk, cpuset_cgrp_id);
	ret = cgroup_path_ns(css->cgroup, path, PATH_MAX,
			     current->nsproxy->cgroup_ns);
	css_put(css);

	/* A result of PATH_MAX or more means the path was truncated. */
	if (ret >= PATH_MAX)
		ret = -ENAMETOOLONG;

	if (ret >= 0) {
		seq_puts(m, path);
		seq_putc(m, '\n');
		ret = 0;
	}

	kfree(path);
	return ret;
}
Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 72 50.70% 2 16.67%
Tejun Heo 22 15.49% 4 33.33%
Eric W. Biedermann 16 11.27% 2 16.67%
Aditya Kali 12 8.45% 1 8.33%
Paul Menage 11 7.75% 1 8.33%
Li Zefan 9 6.34% 2 16.67%
Total 142 100.00% 12 100.00%

#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */


void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}
Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 18 40.00% 1 20.00%
Mike Travis 10 22.22% 1 20.00%
Tejun Heo 8 17.78% 1 20.00%
Eric W. Biedermann 7 15.56% 1 20.00%
Lai Jiangshan 2 4.44% 1 20.00%
Total 45 100.00% 5 100.00%

Overall Contributors
Person Tokens Prop Commits CommitProp
Paul Jackson 1887 22.48% 22 9.24%
Tejun Heo 1654 19.70% 54 22.69%
Li Zefan 1652 19.68% 51 21.43%
Paul Menage 1485 17.69% 9 3.78%
Miao Xie 364 4.34% 9 3.78%
Hidetoshi Seto 192 2.29% 1 0.42%
Maksim Krasnyanskiy 125 1.49% 1 0.42%
Vladimir Davydov 95 1.13% 2 0.84%
David Rientjes 93 1.11% 7 2.94%
Lai Jiangshan 78 0.93% 5 2.10%
Cliff Wickman 58 0.69% 2 0.84%
Srivatsa S. Bhat 58 0.69% 2 0.84%
Ben Blum 52 0.62% 2 0.84%
Waiman Long 51 0.61% 2 0.84%
Peter Zijlstra 47 0.56% 6 2.52%
Michal Hocko 46 0.55% 2 0.84%
Jack Steiner 40 0.48% 1 0.42%
Juri Lelli 37 0.44% 2 0.84%
Mel Gorman 31 0.37% 3 1.26%
Mike Travis 28 0.33% 2 0.84%
Andrew Morton 27 0.32% 2 0.84%
Oleg Nesterov 26 0.31% 2 0.84%
Eric W. Biedermann 23 0.27% 3 1.26%
Joonwoo Park 22 0.26% 1 0.42%
Paolo Bonzini 22 0.26% 1 0.42%
Rik Van Riel 20 0.24% 1 0.42%
Rusty Russell 17 0.20% 2 0.84%
Al Viro 17 0.20% 3 1.26%
Arnd Bergmann 16 0.19% 1 0.42%
Frédéric Weisbecker 15 0.18% 1 0.42%
Gu Zheng 14 0.17% 1 0.42%
Aditya Kali 12 0.14% 1 0.42%
Vlastimil Babka 11 0.13% 2 0.84%
Ingo Molnar 10 0.12% 4 1.68%
David P. Quigley 9 0.11% 1 0.42%
Nicholas Mc Guire 9 0.11% 1 0.42%
Rakib Mullick 6 0.07% 2 0.84%
Bob Picco 5 0.06% 1 0.42%
Dima Zavin 5 0.06% 1 0.42%
Fabian Frederick 5 0.06% 2 0.84%
Dave Hansen 4 0.05% 1 0.42%
Linus Torvalds 3 0.04% 2 0.84%
Christoph Lameter 3 0.04% 1 0.42%
Gautham R. Shenoy 2 0.02% 1 0.42%
Dmitry Adamushko 2 0.02% 1 0.42%
Yinghai Lu 2 0.02% 1 0.42%
David Howells 2 0.02% 2 0.84%
Li Zhong 2 0.02% 1 0.42%
Arun Sharma 1 0.01% 1 0.42%
Alban Crequy 1 0.01% 1 0.42%
Rasmus Villemoes 1 0.01% 1 0.42%
Viresh Kumar 1 0.01% 1 0.42%
Wei Yongjun 1 0.01% 1 0.42%
Adrian Bunk 1 0.01% 1 0.42%
Heiko Carstens 1 0.01% 1 0.42%
Zhao Hongjiang 1 0.01% 1 0.42%
Paul Gortmaker 1 0.01% 1 0.42%
Dan Carpenter 1 0.01% 1 0.42%
Total 8394 100.00% 238 100.00%
Directory: kernel/cgroup

Information contained on this website is for historical information purposes only and does not indicate or represent copyright ownership.
Created with cregit.
Person	Tokens	Prop	Commits	CommitProp
Paul Menage	21	72.41%	1	50.00%
Tejun Heo	8	27.59%	1	50.00%
Total	29	100.00%	2	100.00%
cregit-Linux how code gets into the kernel

Release 4.15 kernel/cgroup/cpuset.c

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors

Contributors