cregit-Linux how code gets into the kernel

Release 4.15 kernel/sched/topology.c

Directory: kernel/sched
// SPDX-License-Identifier: GPL-2.0
/*
 * Scheduler topology setup/handling methods
 */
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/sched/isolation.h>

#include "sched.h"


/* Mutex guarding the scratch cpumasks declared below. */
DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */

/*
 * Scratch mask; within this file it is used by sched_domain_debug() and
 * build_overlap_sched_groups().
 */
cpumask_var_t sched_domains_tmpmask;

/* Second scratch mask; within this file it is used by init_overlap_sched_group(). */
cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG


/*
 * Handler for the "sched_debug" early boot parameter: switches
 * sched_debug_enabled on. The parameter value in @str is not examined.
 *
 * Always returns 0.
 */
static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = true;

	return 0;
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar1794.44%150.00%
Peter Zijlstra15.56%150.00%
Total18100.00%2100.00%

early_param("sched_debug", sched_debug_setup);

/* Report whether scheduler-domain debugging has been switched on. */
static inline bool sched_debug(void)
{
	return sched_debug_enabled;
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar12100.00%1100.00%
Total12100.00%1100.00%


static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" " has parent"); return -1; } printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { printk("\n"); printk(KERN_ERR "ERROR: group is NULL\n"); break; } if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } cpumask_or(groupmask, groupmask, sched_group_span(group)); printk(KERN_CONT " %d:{ span=%*pbl", group->sgc->id, cpumask_pr_args(sched_group_span(group))); if ((sd->flags & SD_OVERLAP) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); } if (group->sgc->capacity != SCHED_CAPACITY_SCALE) printk(KERN_CONT " cap=%lu", group->sgc->capacity); if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } printk(KERN_CONT " }"); group = group->next; if (group != sd->groups) printk(KERN_CONT ","); } while (group != sd->groups); printk(KERN_CONT "\n"); if (!cpumask_equal(sched_domain_span(sd), 
groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar35376.08%114.29%
Peter Zijlstra11123.92%685.71%
Total464100.00%7100.00%


/*
 * Dump the sched-domain hierarchy being attached to @cpu, starting at @sd
 * and following ->parent links. Does nothing unless sched_debug_enabled is
 * set; the walk stops early if sched_domain_debug_one() returns non-zero.
 */
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int depth;

	if (!sched_debug_enabled)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);

	for (depth = 0; sd; sd = sd->parent, depth++) {
		if (sched_domain_debug_one(sd, cpu, depth, sched_domains_tmpmask))
			break;
	}
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar8498.82%150.00%
Peter Zijlstra11.18%150.00%
Total85100.00%2100.00%

#else /* !CONFIG_SCHED_DEBUG */

/*
 * Without CONFIG_SCHED_DEBUG the debug switch is a constant 0 and the
 * domain dump expands to an empty statement. Each directive must sit on
 * its own line to take effect.
 */
# define sched_debug_enabled 0
# define sched_domain_debug(sd, cpu) do { } while (0)

/* Scheduler-domain debugging is never enabled in this configuration. */
static inline bool sched_debug(void)
{
	return false;
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar12100.00%1100.00%
Total12100.00%1100.00%

#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ if (sd->flags & (SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar85100.00%1100.00%
Total85100.00%1100.00%


static int sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) { unsigned long cflags = sd->flags, pflags = parent->flags; if (sd_degenerate(parent)) return 1; if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; return 1; }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar121100.00%1100.00%
Total121100.00%1100.00%


/*
 * RCU callback that tears down a root domain: cleans up its cpupri and
 * cpudl state, frees its four cpumasks and finally the structure itself.
 *
 * @rcu: the rcu_head embedded in the root_domain being freed
 */
static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *dead = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&dead->cpupri);
	cpudl_cleanup(&dead->cpudl);

	free_cpumask_var(dead->dlo_mask);
	free_cpumask_var(dead->rto_mask);
	free_cpumask_var(dead->online);
	free_cpumask_var(dead->span);

	kfree(dead);
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar75100.00%1100.00%
Total75100.00%1100.00%


void rq_attach_root(struct rq *rq, struct root_domain *rd) { struct root_domain *old_rd = NULL; unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); cpumask_clear_cpu(rq->cpu, old_rd->span); /* * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ if (!atomic_dec_and_test(&old_rd->refcount)) old_rd = NULL; } atomic_inc(&rd->refcount); rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar160100.00%1100.00%
Total160100.00%1100.00%


static int init_rootdomain(struct root_domain *rd) { if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); #endif init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_cpudl; return 0; free_cpudl: cpudl_cleanup(&rd->cpudl); free_rto_mask: free_cpumask_var(rd->rto_mask); free_dlo_mask: free_cpumask_var(rd->dlo_mask); free_online: free_cpumask_var(rd->online); free_span: free_cpumask_var(rd->span); out: return -ENOMEM; }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar16884.85%150.00%
Steven Rostedt3015.15%150.00%
Total198100.00%2100.00%

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

/*
 * Set up def_root_domain and give it an initial reference count of 1.
 * The result of init_rootdomain() is not checked here.
 */
void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar23100.00%1100.00%
Total23100.00%1100.00%


/*
 * Allocate a zeroed root domain and initialise it.
 *
 * Returns the new root domain, or NULL if either the allocation or
 * init_rootdomain() fails (in the latter case the structure is freed).
 */
static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd = kzalloc(sizeof(*rd), GFP_KERNEL);

	if (rd && init_rootdomain(rd)) {
		kfree(rd);
		rd = NULL;
	}

	return rd;
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar5798.28%150.00%
Viresh Kumar11.72%150.00%
Total58100.00%2100.00%


/*
 * Walk the circular list of groups starting at @sg and drop one reference
 * on each group, freeing every group whose count reaches zero.
 *
 * @sg:       any group on the ring; NULL is accepted and ignored
 * @free_sgc: when non-zero, also drop a reference on each group's ->sgc
 *            and free it when that count reaches zero
 */
static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
	struct sched_group *const head = sg;
	struct sched_group *cur = sg;

	if (!head)
		return;

	do {
		/* Fetch the successor first: @cur may be freed below. */
		struct sched_group *next = cur->next;

		if (free_sgc && atomic_dec_and_test(&cur->sgc->ref))
			kfree(cur->sgc);

		if (atomic_dec_and_test(&cur->ref))
			kfree(cur);

		cur = next;
	} while (cur != head);
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar7888.64%150.00%
Shu Wang1011.36%150.00%
Total88100.00%2100.00%


static void destroy_sched_domain(struct sched_domain *sd) { /* * A normal sched domain may have multiple group references, an * overlapping domain, having private groups, only one. Iterate, * dropping group/capacity references, freeing where none remain. */ free_sched_groups(sd->groups, 1); if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); kfree(sd); }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar4897.96%150.00%
Peter Zijlstra12.04%150.00%
Total49100.00%2100.00%


/*
 * RCU callback: destroy the domain embedding @rcu and every ancestor
 * reachable through its ->parent links.
 */
static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
	struct sched_domain *cur = container_of(rcu, struct sched_domain, rcu);
	struct sched_domain *up;

	for (; cur; cur = up) {
		/* Read the parent link before the domain is freed. */
		up = cur->parent;
		destroy_sched_domain(cur);
	}
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar50100.00%1100.00%
Total50100.00%1100.00%


/*
 * Queue the hierarchy rooted at @sd for destruction through call_rcu().
 * A NULL @sd is ignored.
 */
static void destroy_sched_domains(struct sched_domain *sd)
{
	if (!sd)
		return;

	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar25100.00%1100.00%
Total25100.00%1100.00%

/* * Keep a special pointer to the highest sched_domain that has * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * * Also keep a unique ID per domain (we use the first CPU number in * the cpumask of the domain), this allows us to quickly tell if * two CPUs are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu) { struct sched_domain_shared *sds = NULL; struct sched_domain *sd; int id = cpu; int size = 1; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar156100.00%1100.00%
Total156100.00%1100.00%

/* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */
static void cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; if (!parent) break; if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this * so the property transfers. */ if (parent->flags & SD_PREFER_SIBLING) tmp->flags |= SD_PREFER_SIBLING; destroy_sched_domain(parent); } else tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); if (sd) sd->child = NULL; } sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); dirty_sched_domain_sysctl(cpu); destroy_sched_domains(tmp); update_top_cache_domain(cpu); }

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar19697.51%150.00%
Peter Zijlstra52.49%150.00%
Total201100.00%2100.00%

/* Per-CPU domain pointers plus the root domain. */
struct s_data {
	struct sched_domain ** __percpu sd;
	struct root_domain	*rd;
};

/* Allocation stages. */
enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask holds all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
	const struct cpumask *balance_mask = group_balance_mask(sg);

	return cpumask_first(balance_mask);
}

Contributors

PersonTokensPropCommitsCommitProp
Peter Zijlstra19100.00%3100.00%
Total19100.00%3100.00%

/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table, for example:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which represents a 4 node ring topology like:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * We want to construct domains and groups to represent this. The way we go
 * about doing this is to build the domains on 'hops'. For each NUMA level we
 * construct the mask of all nodes reachable in @level hops.
 *
 * For the above NUMA topology that gives 3 levels:
 *
 * NUMA-2       0-3             0-3             0-3             0-3
 *  groups:     {0-1,3},{1-3}   {0-2},{0,2-3}   {1-3},{0-1,3}   {0,2-3},{0-2}
 *
 * NUMA-1       0-1,3           0-2             1-3             0,2-3
 *  groups:     {0},{1},{3}     {0},{1},{2}     {1},{2},{3}     {0},{2},{3}
 *
 * NUMA-0       0               1               2               3
 *
 *
 * As can be seen; things don't nicely line up as with the regular topology.
 * When we iterate a domain in child domain chunks some nodes can be
 * represented multiple times -- hence the "overlap" naming for this part of
 * the topology.
 *
 * In order to minimize this overlap, we only build enough groups to cover the
 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
 *
 * Because:
 *
 *  - the first group of each domain is its child domain; this
 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
 *
 * However, because of the overlap, computing a unique CPU for each group is
 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group balance mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where the sched_domain's are
 * not of the same number for each CPU. Consider:
 *
 * NUMA-2       0-3                                             0-3
 *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
 *
 * NUMA-1       0-2             0-3             0-3             1-3
 *
 * NUMA-0       0               1               2               3
 *
 */


/*
 * Build the balance mask; it contains only those CPUs that can arrive at this
 * group and should be considered to continue balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
 */
/*
 * @sd:   domain whose private per-CPU data is consulted
 * @sg:   group being set up
 * @mask: output; cleared first, then receives every CPU of @sg's span
 *        whose sibling domain has a child spanning exactly @sg's span
 */
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
	const struct cpumask *group_span = sched_group_span(sg);
	struct sd_data *sdd = sd->private;
	int cpu;

	cpumask_clear(mask);

	for_each_cpu(cpu, group_span) {
		struct sched_domain *sibling = *per_cpu_ptr(sdd->sd, cpu);

		/*
		 * A missing child can happen in the asymmetric case, where
		 * these siblings are unused. The mask will not be empty
		 * because those CPUs that do have the top domain _should_
		 * span the domain.
		 *
		 * If the child span differs from the group span we would not
		 * end up here, so we can't continue from here either.
		 */
		if (sibling->child &&
		    cpumask_equal(group_span, sched_domain_span(sibling->child)))
			cpumask_set_cpu(cpu, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
}

Contributors

PersonTokensPropCommitsCommitProp
Ingo Molnar7766.38%116.67%
Peter Zijlstra3631.03%466.67%
Lauro Ramos Venancio32.59%116.67%
Total116100.00%6100.00%

/* * XXX: This creates per-node group entries; since the load-balancer will * immediately access remote memory to construct this group's load-balance * statistics having the groups node local is of dubious benefit. */
static struct sched_group * build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { struct sched_group *sg; struct cpumask *sg_span; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); if (!sg) return NULL; sg_span = sched_group_span(sg); if (sd->child) cpumask_copy(sg_span, sched_domain_span(sd->child)); else cpumask_copy(sg_span, sched_domain_span(sd)); atomic_inc(&sg->ref); return sg; }

Contributors

PersonTokensPropCommitsCommitProp
Lauro Ramos Venancio9290.20%125.00%
Shu Wang87.84%125.00%
Peter Zijlstra10.98%125.00%
Ingo Molnar10.98%125.00%
Total102100.00%4100.00%


static void init_overlap_sched_group(struct sched_domain *sd, struct sched_group *sg) { struct cpumask *mask = sched_domains_tmpmask2; struct sd_data *sdd = sd->private; struct cpumask *sg_span; int cpu; build_balance_mask(sd, sg, mask); cpu = cpumask_first_and(sched_group_span(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) cpumask_copy(group_balance_mask(sg), mask); else WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); /* * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; }

Contributors

PersonTokensPropCommitsCommitProp
Lauro Ramos Venancio9062.94%120.00%
Peter Zijlstra5337.06%480.00%
Total143100.00%5100.00%


static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; cpumask_clear(covered); for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) continue; sibling = *per_cpu_ptr(sdd->sd, i); /* * Asymmetric node setups can result in situations where the * domain tree is of unequal depth, make sure to skip domains * that already cover the entire range. * * In that case build_sched_domains() will have terminated the * iteration early and our sibling sd spans will be empty. * Domains should always include the CPU they're built on, so * check that. */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); init_overlap_sched_group(sd, sg); if (!first) first = sg; if (last) last->next = sg; last = sg; last->next = first; } sd->groups = first; return 0; fail: free_sched_groups(first, 0); return -ENOMEM; }

Contributors

Pe