// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU/APIC topology
 *
 * The APIC IDs describe the system topology in multiple domain levels.
 * The CPUID topology parser provides the information about which part of
 * the APIC ID is associated with the individual levels:
 *
 * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
 *
 * The root space contains the package (socket) IDs.
 *
 * Levels which are not enumerated consume 0 bits of space, but
 * conceptually they are always represented. If e.g. only the CORE and
 * THREAD levels are enumerated, then the DIEGRP, DIE, TILE and MODULE
 * levels have the same physical ID as the PACKAGE.
 *
 * If SMT is not supported, then the THREAD domain is still used. It then
 * has the same physical ID as the CORE domain and is the only child of
 * the CORE domain.
 *
 * This allows a unified view of the system independent of the enumerated
 * domain levels, without requiring any conditionals in the code.
 */
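/*
 * Worked example with hypothetical, not enumerated, shift values: assume
 * the THREAD level spans bit 0 and the CORE level spans bits 1-4. Then
 * APIC ID 0x2b (0b101011) decomposes as THREAD = 1 (bit 0), CORE = 5
 * (bits 1-4) and PACKAGE = 1 (bit 5 and above), with the intermediate
 * levels consuming 0 bits.
 */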
#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>

#include <xen/xen.h>

#include <asm/apic.h>
#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/smp.h>

#include "cpu.h"

/*
 * Map cpu index to physical APIC ID
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);

/* Bitmap of physically present CPUs. */
DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;

/* Used for CPU number allocation and parallel CPU bringup */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };

/* Bitmaps to mark registered APICs at each topology domain */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;

/*
 * Keep track of assigned, disabled and rejected CPUs. nr_assigned_cpus
 * is preset to 1 because CPU #0 is reserved for the boot CPU.
 */
static struct {
	unsigned int		nr_assigned_cpus;
	unsigned int		nr_disabled_cpus;
	unsigned int		nr_rejected_cpus;
	u32			boot_cpu_apic_id;
	u32			real_bsp_apic_id;
} topo_info __ro_after_init = {
	.nr_assigned_cpus	= 1,
	.boot_cpu_apic_id	= BAD_APICID,
	.real_bsp_apic_id	= BAD_APICID,
};

#define domain_weight(_dom)	bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)

bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
	return phys_id == (u64)cpuid_to_apicid[cpu];
}

#ifdef CONFIG_SMP
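/*
 * The primary (first) thread of a core is the one whose SMT bits in the
 * APIC ID are all zero. E.g. with a hypothetical __max_threads_per_core
 * of 2, APIC ID 0x10 denotes a primary thread while 0x11 is its SMT
 * sibling.
 */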
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
	if (!(apicid & (__max_threads_per_core - 1)))
		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
#else
static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
#endif

/*
 * Convert the APIC ID to a domain level ID by masking out the low bits
 * below the domain level @dom.
 */
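/*
 * Example, assuming a hypothetical SMT shift of 1:
 * topo_apicid(0x2b, TOPO_CORE_DOMAIN) == 0x2a, i.e. the thread bit is
 * masked out while all upper bits are preserved.
 */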
static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
{
	if (dom == TOPO_SMT_DOMAIN)
		return apicid;
	return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
}

static int topo_lookup_cpuid(u32 apic_id)
{
	int i;

	/* CPU# to APICID mapping is persistent once it is established */
	for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
		if (cpuid_to_apicid[i] == apic_id)
			return i;
	}
	return -ENODEV;
}

static __init int topo_get_cpunr(u32 apic_id)
{
	int cpu = topo_lookup_cpuid(apic_id);

	if (cpu >= 0)
		return cpu;

	return topo_info.nr_assigned_cpus++;
}

static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
	early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
	early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
#endif
	set_cpu_present(cpu, true);
}

static __init bool check_for_real_bsp(u32 apic_id)
{
	bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
	u64 msr;

	/*
	 * There is no really good way to detect whether this is a kdump
	 * kernel, but except on the Voyager SMP monstrosity, which is no
	 * longer supported, the real BSP APIC ID is the first one which is
	 * enumerated by firmware. That allows detecting whether the boot
	 * CPU is the real BSP. If it is not, then do not register the APIC
	 * because sending INIT to the real BSP would reset the whole
	 * system.
	 *
	 * The first APIC ID which is enumerated by firmware is detectable
	 * because the boot CPU APIC ID is registered before that without
	 * invoking this code.
	 */
	if (topo_info.real_bsp_apic_id != BAD_APICID)
		return false;

	/*
	 * Check whether the enumeration order is broken by evaluating the
	 * BSP bit in the APICBASE MSR. If the CPU does not have the
	 * APICBASE MSR then the BSP detection is not possible and the
	 * kernel must rely on the firmware enumeration order.
	 */
	if (has_apic_base) {
		rdmsrl(MSR_IA32_APICBASE, msr);
		is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
	}

	if (apic_id == topo_info.boot_cpu_apic_id) {
		/*
		 * If the boot CPU has the APIC BSP bit set then the
		 * firmware enumeration agrees. If the CPU does not have
		 * the APICBASE MSR then the only choice is to trust the
		 * enumeration order.
		 */
		if (is_bsp || !has_apic_base) {
			topo_info.real_bsp_apic_id = apic_id;
			return false;
		}
		/*
		 * If the boot APIC is enumerated first, but the APICBASE
		 * MSR does not have the BSP bit set, then there is no way
		 * to discover the real BSP here. Assume a crash kernel and
		 * limit the number of CPUs to 1 as an INIT to the real BSP
		 * would reset the machine.
		 */
		pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
		pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
		set_nr_cpu_ids(1);
		goto fwbug;
	}

	pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
		topo_info.boot_cpu_apic_id, apic_id);

	if (is_bsp) {
		/*
		 * The boot CPU has the APIC BSP bit set. Use it and complain
		 * about the broken firmware enumeration.
		 */
		topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
		goto fwbug;
	}

	pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");

	topo_info.real_bsp_apic_id = apic_id;
	return true;

fwbug:
	pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
	return false;
}

static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
				    unsigned long *map)
{
	unsigned int id, end, cnt = 0;

	/* Calculate the exclusive end */
	end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);

	/* Unfortunately there is no bitmap_weight_range() */
	for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
		cnt++;
	return cnt;
}
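/*
 * Example, assuming a hypothetical shift of 6 at the package level: the
 * package level ID 0x40 covers the APIC ID range [0x40, 0x80) and
 * topo_unit_count() returns the number of bits set in @map within it.
 */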

static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	int cpu, dom;

	if (present) {
		set_bit(apic_id, phys_cpu_present_map);

		/*
		 * Double registration is valid in case of the boot CPU
		 * APIC because that is registered before the enumeration
		 * of the APICs via firmware parsers or VM guest
		 * mechanisms.
		 */
		if (apic_id == topo_info.boot_cpu_apic_id)
			cpu = 0;
		else
			cpu = topo_get_cpunr(apic_id);

		cpuid_to_apicid[cpu] = apic_id;
		topo_set_cpuids(cpu, apic_id, acpi_id);
	} else {
		u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);

		/*
		 * Check for present APICs in the same package when running
		 * on bare metal. Allow the bogosity in a guest.
		 */
		if (hypervisor_is_type(X86_HYPER_NATIVE) &&
		    topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
			pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
				     apic_id);
			topo_info.nr_rejected_cpus++;
			return;
		}

		topo_info.nr_disabled_cpus++;
	}

	/*
	 * Register present and possible CPUs in the domain
	 * maps. cpu_possible_map will be updated in
	 * topology_init_possible_cpus() after enumeration is done.
	 */
	for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
		set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
}

/**
 * topology_register_apic - Register an APIC in early topology maps
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated with the APIC
 * @present:	True if the corresponding CPU is present
 */
void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	if (apic_id >= MAX_LOCAL_APIC) {
		pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
		topo_info.nr_rejected_cpus++;
		return;
	}

	if (check_for_real_bsp(apic_id)) {
		topo_info.nr_rejected_cpus++;
		return;
	}

	/* CPU numbers exhausted? */
	if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
		pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
		topo_info.nr_rejected_cpus++;
		return;
	}

	topo_register_apic(apic_id, acpi_id, present);
}

/**
 * topology_register_boot_apic - Register the boot CPU APIC
 * @apic_id:	The APIC ID to set up
 *
 * Separate function so that CPU #0 can be assigned to the boot CPU.
 */
void __init topology_register_boot_apic(u32 apic_id)
{
	WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);

	topo_info.boot_cpu_apic_id = apic_id;
	topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
}

/**
 * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
 * @apicid:		The APIC ID for which to lookup the logical ID
 * @at_level:		The topology domain level to use
 *
 * @apicid must be a full APIC ID, not the normalized variant. It's valid
 * for all bits below the domain level specified by @at_level to be clear.
 * So both real APIC IDs and backshifted normalized APIC IDs work correctly.
 *
 * Returns:
 *  - >= 0:	The requested logical ID
 *  - -ERANGE:	@apicid is out of range
 *  - -ENODEV:	@apicid is not registered
 */
int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
{
	/* Remove the bits below @at_level to get the proper level ID of @apicid */
	unsigned int lvlid = topo_apicid(apicid, at_level);

	if (lvlid >= MAX_LOCAL_APIC)
		return -ERANGE;
	if (!test_bit(lvlid, apic_maps[at_level].map))
		return -ENODEV;
	/* Get the number of set bits before @lvlid. */
	return bitmap_weight(apic_maps[at_level].map, lvlid);
}
EXPORT_SYMBOL_GPL(topology_get_logical_id);
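/*
 * Example, assuming the bits below the package level are bits 0-5: if
 * only the package level IDs 0x00 and 0x40 are registered, then
 * topology_get_logical_id(0x47, TOPO_PKG_DOMAIN) masks @apicid down to
 * 0x40 and returns 1, the number of registered packages preceding it.
 */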

/**
 * topology_unit_count - Retrieve the count of specified units at a given topology domain level
 * @apicid:		The APIC ID which specifies the search range
 * @which_units:	The domain level specifying the units to count
 * @at_level:		The domain level at which @which_units have to be counted
 *
 * This returns the number of possible units according to the enumerated
 * information.
 *
 * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
 * counts the number of possible cores in the package to which @apicid
 * belongs.
 *
 * @at_level must obviously be greater than @which_units to produce useful
 * results. If @at_level is equal to @which_units the result is
 * unsurprisingly 1. If @at_level is less than @which_units the result
 * is by definition undefined and the function returns 0.
 */
unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
				 enum x86_topology_domains at_level)
{
	/* Remove the bits below @at_level to get the proper level ID of @apicid */
	unsigned int lvlid = topo_apicid(apicid, at_level);

	if (lvlid >= MAX_LOCAL_APIC)
		return 0;
	if (!test_bit(lvlid, apic_maps[at_level].map))
		return 0;
	if (which_units > at_level)
		return 0;
	if (which_units == at_level)
		return 1;
	return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
}
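/*
 * Example: with 2 threads per core and 4 cores per package enumerated,
 * topology_unit_count(apicid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN) yields
 * up to 8 possible threads in the package to which @apicid belongs.
 */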

#ifdef CONFIG_ACPI_HOTPLUG_CPU
/**
 * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated with the APIC
 */
int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
{
	int cpu;

	if (apic_id >= MAX_LOCAL_APIC)
		return -EINVAL;

	/* Reject if the APIC ID was not registered during enumeration. */
	if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
		return -ENODEV;

	cpu = topo_lookup_cpuid(apic_id);
	if (cpu < 0)
		return -ENOSPC;

	set_bit(apic_id, phys_cpu_present_map);
	topo_set_cpuids(cpu, apic_id, acpi_id);
	cpu_mark_primary_thread(cpu, apic_id);
	return cpu;
}

/**
 * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
 * @cpu:	The CPU number for which the APIC ID is removed
 */
void topology_hotunplug_apic(unsigned int cpu)
{
	u32 apic_id = cpuid_to_apicid[cpu];

	if (apic_id == BAD_APICID)
		return;

	per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
	clear_bit(apic_id, phys_cpu_present_map);
	set_cpu_present(cpu, false);
}
#endif

#ifdef CONFIG_X86_LOCAL_APIC
static unsigned int max_possible_cpus __initdata = NR_CPUS;

/**
 * topology_apply_cmdline_limits_early - Apply topology command line limits early
 *
 * Ensure that command line limits are in effect before firmware parsing
 * takes place.
 */
void __init topology_apply_cmdline_limits_early(void)
{
	unsigned int possible = nr_cpu_ids;

	/* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */
	if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled)
		possible = 1;

	/* 'possible_cpus=N' */
	possible = min_t(unsigned int, max_possible_cpus, possible);

	if (possible < nr_cpu_ids) {
		pr_info("Limiting to %u possible CPUs\n", possible);
		set_nr_cpu_ids(possible);
	}
}

static __init bool restrict_to_up(void)
{
	if (!smp_found_config || ioapic_is_disabled)
		return true;
	/*
	 * XEN PV is special as it does not advertise the local APIC
	 * properly, but provides a fake topology for it so that the
	 * infrastructure works. So don't apply the APIC related
	 * restrictions here.
	 */
	if (xen_pv_domain())
		return false;

	return apic_is_disabled;
}

void __init topology_init_possible_cpus(void)
{
	unsigned int assigned = topo_info.nr_assigned_cpus;
	unsigned int disabled = topo_info.nr_disabled_cpus;
	unsigned int cnta, cntb, cpu, allowed = 1;
	unsigned int total = assigned + disabled;
	u32 apicid, firstid;

	/*
	 * If there was no APIC registered, then fake one so that the
	 * topology bitmap is populated. That ensures that the code below
	 * is valid and the various query interfaces can be used
	 * unconditionally. This does not affect the actual APIC code in
	 * any way because either the local APIC address has not been
	 * registered or the local APIC was disabled on the command line.
	 */
	if (topo_info.boot_cpu_apic_id == BAD_APICID)
		topology_register_boot_apic(0);

	if (!restrict_to_up()) {
		if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
			disabled += assigned - nr_cpu_ids;
			assigned = nr_cpu_ids;
		}
		allowed = min_t(unsigned int, total, nr_cpu_ids);
	}

	if (total > allowed)
		pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);

	assigned = min_t(unsigned int, allowed, assigned);
	disabled = allowed - assigned;

	topo_info.nr_assigned_cpus = assigned;
	topo_info.nr_disabled_cpus = disabled;

	total_cpus = allowed;
	set_nr_cpu_ids(allowed);

	cnta = domain_weight(TOPO_PKG_DOMAIN);
	cntb = domain_weight(TOPO_DIE_DOMAIN);
	__max_logical_packages = cnta;
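	/*
	 * Example: 2 packages (order 1) and 8 dies (order 3) yield
	 * 1U << (3 - 1) == 4 dies per package.
	 */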
	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));

	pr_info("Max. logical packages: %3u\n", cnta);
	pr_info("Max. logical dies:     %3u\n", cntb);
	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);

	cnta = domain_weight(TOPO_CORE_DOMAIN);
	cntb = domain_weight(TOPO_SMT_DOMAIN);
	/*
	 * Can't use the order delta here as order(cnta) can be equal to
	 * order(cntb) even if cnta != cntb.
	 */
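	/*
	 * Example: 8 threads over 4 cores yields DIV_ROUND_UP(8, 4) == 2
	 * threads per core, as would 6 threads over 4 cores.
	 */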
	__max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
	pr_info("Max. threads per core: %3u\n", __max_threads_per_core);

	firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
	__num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. cores per package:   %3u\n", __num_cores_per_package);
	__num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
	pr_info("Num. threads per package: %3u\n", __num_threads_per_package);

	pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
	if (topo_info.nr_rejected_cpus)
		pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);

	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	/* Assign CPU numbers to non-present CPUs */
	for (apicid = 0; disabled; disabled--, apicid++) {
		apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
					      MAX_LOCAL_APIC, apicid);
		if (apicid >= MAX_LOCAL_APIC)
			break;
		cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
	}

	for (cpu = 0; cpu < allowed; cpu++) {
		apicid = cpuid_to_apicid[cpu];

		set_cpu_possible(cpu, true);

		if (apicid == BAD_APICID)
			continue;

		cpu_mark_primary_thread(cpu, apicid);
		set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
	}
}

/*
 * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
 */
void __init topology_reset_possible_cpus_up(void)
{
	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
	if (topo_info.boot_cpu_apic_id != BAD_APICID)
		set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
}

static int __init setup_possible_cpus(char *str)
{
	get_option(&str, &max_possible_cpus);
	return 0;
}
early_param("possible_cpus", setup_possible_cpus);
#endif