Release 4.15 kernel/cgroup/cpuset.c
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>


DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);

DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */


struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};


struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * These are the old Memory Nodes that tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have the tasks' nodemasks updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};
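The effective-mask rule documented in the struct above can be illustrated with a small standalone sketch. This is not kernel code: cpumasks are reduced to 64-bit words, and compute_effective_cpus() is a hypothetical name used only to show effective = configured & parent's effective, with the documented fallback to the parent's effective mask when the intersection is empty.

/*
 * Illustrative sketch only (not part of cpuset.c): the effective-mask
 * rule above, modeled with cpumasks reduced to 64-bit words.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t compute_effective_cpus(uint64_t configured,
				       uint64_t parent_effective)
{
	uint64_t effective = configured & parent_effective;

	/* if the intersection is empty, inherit the parent's mask */
	return effective ? effective : parent_effective;
}

int main(void)
{
	uint64_t parent_effective = 0x0f;	/* parent limited to CPUs 0-3 */
	uint64_t child_configured = 0x3c;	/* child asks for CPUs 2-5 */

	printf("effective: %#llx\n",		/* prints 0xc (CPUs 2-3) */
	       (unsigned long long)compute_effective_cpus(child_configured,
							  parent_effective));
	printf("fallback:  %#llx\n",		/* prints 0xf (inherited) */
	       (unsigned long long)compute_effective_cpus(0xf00,
							  parent_effective));
	return 0;
}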


static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          21      72.41%   1        50.00%
Tejun Heo            8       27.59%   1        50.00%
Total                29      100.00%  2        100.00%

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task) { return css_cs(task_css(task, cpuset_cgrp_id)); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          22      88.00%   1        25.00%
Tejun Heo            3       12.00%   3        75.00%
Total                25      100.00%  4        100.00%


static inline struct cpuset *parent_cs(struct cpuset *cs) { return css_cs(cs->css.parent); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Tejun Heo            24      100.00%  3        100.00%
Total                24      100.00%  3        100.00%

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task) { return task->mempolicy; }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
David Rientjes       17      100.00%  1        100.00%
Total                17      100.00%  1        100.00%

#else
static inline bool task_has_mempolicy(struct task_struct *task) { return false; }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
David Rientjes       15      100.00%  1        100.00%
Total                15      100.00%  1        100.00%

#endif /* CONFIG_NUMA */

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs) { return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Tejun Heo            31      96.88%   2        66.67%
Srivatsa S. Bhat     1       3.12%    1        33.33%
Total                32      100.00%  3        100.00%


static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%


static inline int is_mem_exclusive(const struct cpuset *cs) { return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%


static inline int is_mem_hardwall(const struct cpuset *cs) { return test_bit(CS_MEM_HARDWALL, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%


static inline int is_sched_load_balance(const struct cpuset *cs) { return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%


static inline int is_memory_migrate(const struct cpuset *cs) { return test_bit(CS_MEMORY_MIGRATE, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%


static inline int is_spread_page(const struct cpuset *cs) { return test_bit(CS_SPREAD_PAGE, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%


static inline int is_spread_slab(const struct cpuset *cs) { return test_bit(CS_SPREAD_SLAB, &cs->flags); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         24      100.00%  1        100.00%
Total                24      100.00%  1        100.00%

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in
 * the iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Cgroup v2 behavior is used when on default hierarchy or the
 * cgroup_v2_mode flag is set.
 */
static inline bool is_in_v2_mode(void) { return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Waiman Long          25      100.00%  1        100.00%
Total                25      100.00%  1        100.00%
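A minimal usage sketch of the iteration macros defined above, assuming the same pattern that validate_change() uses further down: the walk must run under rcu_read_lock(). count_online_children() is a hypothetical helper, not part of cpuset.c.

/*
 * Usage sketch (hypothetical helper): count the online children of a
 * cpuset with cpuset_for_each_child() under RCU read lock.
 */
static int count_online_children(struct cpuset *parent)
{
	struct cgroup_subsys_state *pos_css;
	struct cpuset *child;
	int n = 0;

	rcu_read_lock();
	cpuset_for_each_child(child, pos_css, parent)
		n++;
	rcu_read_unlock();

	return n;
}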

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);

	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
				       unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          62      75.61%   1        33.33%
Al Viro              11      13.41%   1        33.33%
Paul Jackson         9       10.98%   1        33.33%
Total                82      100.00%  3        100.00%

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			cpumask_copy(pmask, cpu_online_mask);
			return;
		}
	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         22      32.84%   1        14.29%
Joonwoo Park         21      31.34%   1        14.29%
Paul Menage          13      19.40%   1        14.29%
Li Zefan             8       11.94%   3        42.86%
Tejun Heo            3       4.48%    1        14.29%
Total                67      100.00%  7        100.00%
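The upward walk in guarantee_online_cpus() can also be modeled outside the kernel. In this sketch cpumasks are 64-bit words, and struct cs_node and online_fallback() are hypothetical names used only to illustrate "climb toward the root until the effective mask intersects the online mask, else fall back to the online mask itself".

/*
 * Illustrative sketch only (not kernel code): the fallback walk of
 * guarantee_online_cpus() over a parent chain of simplified nodes.
 */
#include <stdint.h>
#include <stdio.h>

struct cs_node {
	struct cs_node *parent;
	uint64_t effective_cpus;	/* cpumask reduced to a 64-bit word */
};

static uint64_t online_fallback(const struct cs_node *cs, uint64_t online)
{
	while (cs && !(cs->effective_cpus & online))
		cs = cs->parent;
	return cs ? (cs->effective_cpus & online) : online;
}

int main(void)
{
	struct cs_node root = { NULL, 0x0f };
	struct cs_node child = { &root, 0x30 };	/* none of its CPUs online */

	/* child has no online CPUs, so the root's mask is used: prints 0xf */
	printf("%#llx\n", (unsigned long long)online_fallback(&child, 0x0f));
	return 0;
}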

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          24      46.15%   1        20.00%
Paul Jackson         21      40.38%   1        20.00%
Tejun Heo            3       5.77%    1        20.00%
Li Zefan             2       3.85%    1        20.00%
Lai Jiangshan        2       3.85%    1        20.00%
Total                52      100.00%  5        100.00%

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Miao Xie             40      76.92%   1        50.00%
Li Zefan             12      23.08%   1        50.00%
Total                52      100.00%  2        100.00%

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          57      93.44%   1        33.33%
Paul Jackson         3       4.92%    1        33.33%
Li Zefan             1       1.64%    1        33.33%
Total                61      100.00%  3        100.00%
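A standalone model of the subset test, assuming masks reduced to 64-bit words, shows why comparing the exclusive flags with <= expresses "p's flag may be set only if q's is". struct cpuset_model and is_subset_model() are illustrative names, not kernel code.

/*
 * Illustrative model only (not kernel code) of is_cpuset_subset():
 * mask containment plus the 0/1 flag comparison p <= q.
 */
#include <stdbool.h>
#include <stdint.h>

struct cpuset_model {
	uint64_t cpus;
	uint64_t mems;
	int cpu_exclusive;	/* 0 or 1 */
	int mem_exclusive;	/* 0 or 1 */
};

static bool is_subset_model(const struct cpuset_model *p,
			    const struct cpuset_model *q)
{
	return (p->cpus & ~q->cpus) == 0 &&
	       (p->mems & ~q->mems) == 0 &&
	       p->cpu_exclusive <= q->cpu_exclusive &&
	       p->mem_exclusive <= q->mem_exclusive;
}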

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;

free_cpus:
	free_cpumask_var(trial->cpus_allowed);
free_cs:
	kfree(trial);
	return NULL;
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Li Zefan             117     100.00%  3        100.00%
Total                117     100.00%  3        100.00%

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->effective_cpus);
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Li Zefan             30      100.00%  3        100.00%
Total                30      100.00%  3        100.00%

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         102     34.93%   3        17.65%
Tejun Heo            80      27.40%   5        29.41%
Paul Menage          46      15.75%   2        11.76%
Li Zefan             32      10.96%   4        23.53%
Juri Lelli           28      9.59%    1        5.88%
Dave Hansen          2       0.68%    1        5.88%
Waiman Long          2       0.68%    1        5.88%
Total                292     100.00%  17       100.00%

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpumask_intersects(a->effective_cpus, b->effective_cpus); }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         25      89.29%   1        33.33%
Li Zefan             3       10.71%   2        66.67%
Total                28      100.00%  3        100.00%


static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Hidetoshi Seto       35      100.00%  1        100.00%
Total                35      100.00%  1        100.00%


static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Lai Jiangshan        51      67.11%   1        20.00%
Tejun Heo            24      31.58%   3        60.00%
Li Zefan             1       1.32%    1        20.00%
Total                76      100.00%  5        100.00%

/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paolo Bonzini        21      100.00%  1        100.00%
Total                21      100.00%  1        100.00%

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to the kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *	   top-down scan of all cpusets.  This scan loads a pointer
 *	   to each cpuset marked is_sched_load_balance into the
 *	   array 'csa'.  For our purposes, rebuilding the scheduler's
 *	   sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed but don't have the same 'pn' partition number,
 *	and merges them into the same partition number.  It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then form the one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));

		goto done;
	}

	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
			continue;

		if (is_sched_load_balance(cp))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Jackson         423     61.39%   1        4.35%
Hidetoshi Seto       75      10.89%   1        4.35%
Maksim Krasnyanskiy  58      8.42%    1        4.35%
Tejun Heo            35      5.08%    4        17.39%
Li Zefan             27      3.92%    6        26.09%
Rik Van Riel         20      2.90%    1        4.35%
Rusty Russell        14      2.03%    1        4.35%
Frédéric Weisbecker  12      1.74%    1        4.35%
Lai Jiangshan        10      1.45%    2        8.70%
Miao Xie             6       0.87%    1        4.35%
Paul Menage          3       0.44%    1        4.35%
Mel Gorman           2       0.29%    1        4.35%
Ingo Molnar          2       0.29%    1        4.35%
Fabian Frederick     2       0.29%    1        4.35%
Total                689     100.00%  23       100.00%
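The 'restart' partition-merging loop described in the comment before generate_sched_domains() can be modeled in isolation. In this sketch cpusets are reduced to a 64-bit mask plus a partition number, and struct cs_model and merge_partitions() are hypothetical names; it illustrates the algorithm, not the kernel implementation.

/*
 * Standalone model (not kernel code) of the partition-merging loop:
 * any two overlapping entries are forced into the same partition
 * number, repeating until no more merges happen.
 */
#include <stdint.h>
#include <stdio.h>

struct cs_model {
	uint64_t cpus;		/* cpumask reduced to a 64-bit word */
	int pn;			/* partition number */
};

static int merge_partitions(struct cs_model *csa, int csn)
{
	int ndoms = csn;
	int i, j, k;

restart:
	for (i = 0; i < csn; i++) {
		for (j = 0; j < csn; j++) {
			if (csa[i].pn != csa[j].pn &&
			    (csa[i].cpus & csa[j].cpus)) {
				int bpn = csa[j].pn;

				for (k = 0; k < csn; k++)
					if (csa[k].pn == bpn)
						csa[k].pn = csa[i].pn;
				ndoms--;	/* one less partition */
				goto restart;
			}
		}
	}
	return ndoms;
}

int main(void)
{
	/* {0,1} and {1,2} overlap; {4,5} stands alone: expect 2 domains */
	struct cs_model csa[] = { { 0x3, 0 }, { 0x6, 1 }, { 0x30, 2 } };

	printf("ndoms = %d\n", merge_partitions(csa, 3));
	return 0;
}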

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/*
	 * We have raced with CPU hotplug. Don't do anything to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, hotplug work item will rebuild sched domains.
	 */
	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		goto out;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
out:
	put_online_cpus();
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Maksim Krasnyanskiy  31      43.06%   1        10.00%
Li Zefan             18      25.00%   2        20.00%
Paul Jackson         9       12.50%   1        10.00%
Tejun Heo            7       9.72%    2        20.00%
Paul Menage          3       4.17%    1        10.00%
Gautham R. Shenoy    2       2.78%    1        10.00%
Hidetoshi Seto       1       1.39%    1        10.00%
Rusty Russell        1       1.39%    1        10.00%
Total                72      100.00%  10       100.00%

#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void) { }

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Paul Menage          5       71.43%   1        50.00%
Tejun Heo            2       28.57%   1        50.00%
Total                7       100.00%  2        100.00%

#endif /* CONFIG_SMP */
void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Maksim Krasnyanskiy  11      50.00%   1        16.67%
Tejun Heo            8       36.36%   2        33.33%
Paul Jackson         2       9.09%    2        33.33%
Paul Menage          1       4.55%    1        16.67%
Total                22      100.00%  6        100.00%

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Tejun Heo            44      73.33%   4        40.00%
Li Zefan             6       10.00%   3        30.00%
Miao Xie             6       10.00%   1        10.00%
Cliff Wickman        3       5.00%    1        10.00%
Adrian Bunk          1       1.67%    1        10.00%
Total                60      100.00%  10       100.00%

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (is_in_v2_mode() && cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		WARN_ON(!is_in_v2_mode() &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is changed,
		 * we need to rebuild sched domains.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

Contributors

Person               Tokens  Prop     Commits  Commit Prop
Li Zefan             180     85.71%   5        35.71%
Tejun Heo            11      5.24%    3        21.43%
Paul Jackson         7       3.33%    1        7.14%
Vladimir Davydov     4       1.90%    1        7.14%
Waiman Long          4       1.90%    1        7.14%
Paul Menage          3       1.43%    2        14.29%
Cliff Wickman        1       0.48%    1        7.14%
Total                210     100.00%  14       100.00%

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs,