Contributors: 1
Author Tokens Token Proportion Commits Commit Proportion
Jason Gunthorpe 1716 100.00% 4 100.00%
Total 1716 4


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * AMD IOMMU v1 page table
 *
 * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
 * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
 *
 * Note the level numbering here matches the core code, so level 0 is the same
 * as mode 1.
 *
 */
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H

#include "defs_amdv1.h"
#include "../pt_defs.h"

#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>

enum {
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/*
	 * The IOMMUFD selftest uses the AMDv1 format with some alterations It
	 * uses a 2k page size to test cases where the CPU page size is not the
	 * same.
	 */
#ifdef AMDV1_IOMMUFD_SELFTEST
	PT_MAX_VA_ADDRESS_LG2 = 56,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 11,
#else
	PT_MAX_VA_ADDRESS_LG2 = 64,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_TOP_LEVEL = 5,
	PT_GRANULE_LG2SZ = 12,
#endif
	PT_TABLEMEM_LG2SZ = 12,

	/* The DTE only has these bits for the top phyiscal address */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};

/* PTE bits */
enum {
	AMDV1PT_FMT_PR = BIT(0),
	AMDV1PT_FMT_D = BIT(6),
	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
	AMDV1PT_FMT_FC = BIT_ULL(60),
	AMDV1PT_FMT_IR = BIT_ULL(61),
	AMDV1PT_FMT_IW = BIT_ULL(62),
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
 * these defines to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7

static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa

/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;
	pt_oaddr_t oa;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
		unsigned int sz_bits = oaffz(oa);

		oa = oalog2_set_mod(oa, 0, sz_bits);
	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
			      AMDV1PT_FMT_NL_DEFAULT))
		return 0;
	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa

static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
	/*
	 * Table 15: Page Table Level Parameters
	 * The top most level cannot have translation entries
	 */
	return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf

/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
	u32 code;

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
	    AMDV1PT_FMT_NL_DEFAULT)
		return ilog2(1);

	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
		   AMDV1PT_FMT_NL_SIZE);

	/*
	 * The contiguous size is encoded in the length of a string of 1's in
	 * the low bits of the OA. Reverse the equation:
	 *  code = log2_to_int(num_contig_lg2 + item_lg2sz -
	 *              PT_GRANULE_LG2SZ - 1) - 1
	 * Which can be expressed as:
	 *  num_contig_lg2 = oalog2_ffz(code) + 1 -
	 *              item_lg2sz - PT_GRANULE_LG2SZ
	 *
	 * Assume the bit layout is correct and remove the masking. Reorganize
	 * the equation to move all the arithmetic before the ffz.
	 */
	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
	return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2

static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
	/*
	 * Top entry covers bits [63:57] only, this is handled through
	 * max_vasz_lg2.
	 */
	if (PT_WARN_ON(pts->level == 5))
		return 7;
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2

static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!amdv1pt_can_have_leaf(pts))
		return 0;

	/*
	 * Table 14: Example Page Size Encodings
	 * Address bits 51:32 can be used to encode page sizes greater than 4
	 * Gbytes. Address bits 63:52 are zero-extended.
	 *
	 * 512GB Pages are not supported due to a hardware bug.
	 * Otherwise every power of two size is supported.
	 */
	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
			   isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes

static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	unsigned int next_level;
	u64 entry;

	pts->entry = entry = READ_ONCE(*tablep);
	if (!(entry & AMDV1PT_FMT_PR))
		return PT_ENTRY_EMPTY;

	next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
	if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
	    next_level == AMDV1PT_FMT_NL_SIZE)
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw

static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			   unsigned int oasz_lg2,
			   const struct pt_write_attrs *attrs)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (oasz_lg2 == isz_lg2) {
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_DEFAULT);
		WRITE_ONCE(*tablep, entry);
	} else {
		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
		u64 *end = tablep + log2_to_int(num_contig_lg2);

		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_SIZE) |
			 FIELD_PREP(AMDV1PT_FMT_OA,
				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
						  1) -
					    1);

		/* See amdv1pt_clear_entries() */
		if (num_contig_lg2 <= ilog2(32)) {
			for (; tablep != end; tablep++)
				WRITE_ONCE(*tablep, entry);
		} else {
			memset64(tablep, entry, log2_to_int(num_contig_lg2));
		}
	}
	pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry

static inline bool amdv1pt_install_table(struct pt_state *pts,
					 pt_oaddr_t table_pa,
					 const struct pt_write_attrs *attrs)
{
	u64 entry;

	/*
	 * IR and IW are ANDed from the table levels along with the PTE. We
	 * always control permissions from the PTE, so always set IR and IW for
	 * tables.
	 */
	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
		FIELD_PREP(AMDV1PT_FMT_OA,
			   log2_div(table_pa, PT_GRANULE_LG2SZ)) |
		AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_set(entry);
	return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table

static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
					   struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits =
		pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry

static inline void amdv1pt_clear_entries(struct pt_state *pts,
					 unsigned int num_contig_lg2)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	/*
	 * gcc generates rep stos for the io-pgtable code, and this difference
	 * can show in microbenchmarks with larger contiguous page sizes.
	 * rep is slower for small cases.
	 */
	if (num_contig_lg2 <= ilog2(32)) {
		for (; tablep != end; tablep++)
			WRITE_ONCE(*tablep, 0);
	} else {
		memset64(tablep, 0, log2_to_int(num_contig_lg2));
	}
}
#define pt_clear_entries amdv1pt_clear_entries

static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
			return true;
	return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty

static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean

static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | AMDV1PT_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_amdv1

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
			->amdpt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}

static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
					 struct pt_write_attrs *attrs,
					 unsigned int iommu_prot)
{
	u64 pte = 0;

	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
		pte |= AMDV1PT_FMT_FC;
	if (iommu_prot & IOMMU_READ)
		pte |= AMDV1PT_FMT_IR;
	if (iommu_prot & IOMMU_WRITE)
		pte |= AMDV1PT_FMT_IW;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now if the tables use sme_set then so do the ptes.
	 */
	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot

static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
					 const struct pt_iommu_amdv1_cfg *cfg)
{
	struct pt_amdv1 *table = &iommu_table->amdpt;
	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
		return -EINVAL;

	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
	    cfg->starting_level != PT_MAX_TOP_LEVEL)
		max_vasz_lg2 = PT_GRANULE_LG2SZ +
			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
				       (cfg->starting_level + 1);

	table->common.max_vasz_lg2 =
		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	pt_top_set_level(&table->common, cfg->starting_level);
	return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init

#ifndef PT_FMT_VARIANT
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
			  const struct pt_range *top_range,
			  struct pt_iommu_amdv1_hw_info *info)
{
	info->host_pt_root = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
	info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
	/* Matches what io_pgtable does */
	[0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif

#endif