Contributors: 4
Author Tokens Token Proportion Commits Commit Proportion
Jason Gunthorpe 2264 72.77% 8 57.14%
Steve Sistare 825 26.52% 3 21.43%
Yi L Liu 16 0.51% 1 7.14%
Nicolin Chen 6 0.19% 2 14.29%
Total 3111 14


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <uapi/linux/iommufd.h>

#include "io_pagetable.h"

void iommufd_ioas_destroy(struct iommufd_object *obj)
{
	struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
	int rc;

	rc = iopt_unmap_all(&ioas->iopt, NULL);
	WARN_ON(rc && rc != -ENOENT);
	iopt_destroy_table(&ioas->iopt);
	mutex_destroy(&ioas->mutex);
}

struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas;

	ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
	if (IS_ERR(ioas))
		return ioas;

	iopt_init_table(&ioas->iopt);
	INIT_LIST_HEAD(&ioas->hwpt_list);
	mutex_init(&ioas->mutex);
	return ioas;
}

int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_alloc *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;
	int rc;

	if (cmd->flags)
		return -EOPNOTSUPP;

	ioas = iommufd_ioas_alloc(ucmd->ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	cmd->out_ioas_id = ioas->obj.id;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc)
		goto out_table;

	down_read(&ucmd->ictx->ioas_creation_lock);
	iommufd_object_finalize(ucmd->ictx, &ioas->obj);
	up_read(&ucmd->ictx->ioas_creation_lock);
	return 0;

out_table:
	iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
	return rc;
}

int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
{
	struct iommu_iova_range __user *ranges;
	struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;
	struct interval_tree_span_iter span;
	u32 max_iovas;
	int rc;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	down_read(&ioas->iopt.iova_rwsem);
	max_iovas = cmd->num_iovas;
	ranges = u64_to_user_ptr(cmd->allowed_iovas);
	cmd->num_iovas = 0;
	cmd->out_iova_alignment = ioas->iopt.iova_alignment;
	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		if (!span.is_hole)
			continue;
		if (cmd->num_iovas < max_iovas) {
			struct iommu_iova_range elm = {
				.start = span.start_hole,
				.last = span.last_hole,
			};

			if (copy_to_user(&ranges[cmd->num_iovas], &elm,
					 sizeof(elm))) {
				rc = -EFAULT;
				goto out_put;
			}
		}
		cmd->num_iovas++;
	}
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc)
		goto out_put;
	if (cmd->num_iovas > max_iovas)
		rc = -EMSGSIZE;
out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(ucmd->ictx, &ioas->obj);
	return rc;
}

static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
				   struct iommu_iova_range __user *ranges,
				   u32 num)
{
	u32 i;

	for (i = 0; i != num; i++) {
		struct iommu_iova_range range;
		struct iopt_allowed *allowed;

		if (copy_from_user(&range, ranges + i, sizeof(range)))
			return -EFAULT;

		if (range.start >= range.last)
			return -EINVAL;

		if (interval_tree_iter_first(itree, range.start, range.last))
			return -EINVAL;

		allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
		if (!allowed)
			return -ENOMEM;
		allowed->node.start = range.start;
		allowed->node.last = range.last;

		interval_tree_insert(&allowed->node, itree);
	}
	return 0;
}

int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
	struct interval_tree_node *node;
	struct iommufd_ioas *ioas;
	struct io_pagetable *iopt;
	int rc = 0;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);
	iopt = &ioas->iopt;

	rc = iommufd_ioas_load_iovas(&allowed_iova,
				     u64_to_user_ptr(cmd->allowed_iovas),
				     cmd->num_iovas);
	if (rc)
		goto out_free;

	/*
	 * We want the allowed tree update to be atomic, so we have to keep the
	 * original nodes around, and keep track of the new nodes as we allocate
	 * memory for them. The simplest solution is to have a new/old tree and
	 * then swap new for old. On success we free the old tree, on failure we
	 * free the new tree.
	 */
	rc = iopt_set_allow_iova(iopt, &allowed_iova);
out_free:
	while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
		interval_tree_remove(node, &allowed_iova);
		kfree(container_of(node, struct iopt_allowed, node));
	}
	iommufd_put_object(ucmd->ictx, &ioas->obj);
	return rc;
}

static int conv_iommu_prot(u32 map_flags)
{
	/*
	 * We provide no manual cache coherency ioctls to userspace and most
	 * architectures make the CPU ops for cache flushing privileged.
	 * Therefore we require the underlying IOMMU to support CPU coherent
	 * operation. Support for IOMMU_CACHE is enforced by the
	 * IOMMU_CAP_CACHE_COHERENCY test during bind.
	 */
	int iommu_prot = IOMMU_CACHE;

	if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
		iommu_prot |= IOMMU_WRITE;
	if (map_flags & IOMMU_IOAS_MAP_READABLE)
		iommu_prot |= IOMMU_READ;
	return iommu_prot;
}

int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_map_file *cmd = ucmd->cmd;
	unsigned long iova = cmd->iova;
	struct iommufd_ioas *ioas;
	unsigned int flags = 0;
	struct file *file;
	int rc;

	if (cmd->flags &
	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
	       IOMMU_IOAS_MAP_READABLE))
		return -EOPNOTSUPP;

	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
		return -EOVERFLOW;

	if (!(cmd->flags &
	      (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
		return -EINVAL;

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
		flags = IOPT_ALLOC_IOVA;

	file = fget(cmd->fd);
	if (!file)
		return -EBADF;

	rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
				 cmd->start, cmd->length,
				 conv_iommu_prot(cmd->flags), flags);
	if (rc)
		goto out_put;

	cmd->iova = iova;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
	iommufd_put_object(ucmd->ictx, &ioas->obj);
	fput(file);
	return rc;
}

int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_map *cmd = ucmd->cmd;
	unsigned long iova = cmd->iova;
	struct iommufd_ioas *ioas;
	unsigned int flags = 0;
	int rc;

	if ((cmd->flags &
	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
	       IOMMU_IOAS_MAP_READABLE)) ||
	    cmd->__reserved)
		return -EOPNOTSUPP;
	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
		return -EOVERFLOW;

	if (!(cmd->flags &
	      (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
		return -EINVAL;

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
		flags = IOPT_ALLOC_IOVA;
	rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
				 u64_to_user_ptr(cmd->user_va), cmd->length,
				 conv_iommu_prot(cmd->flags), flags);
	if (rc)
		goto out_put;

	cmd->iova = iova;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
	iommufd_put_object(ucmd->ictx, &ioas->obj);
	return rc;
}

int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_copy *cmd = ucmd->cmd;
	struct iommufd_ioas *src_ioas;
	struct iommufd_ioas *dst_ioas;
	unsigned int flags = 0;
	LIST_HEAD(pages_list);
	unsigned long iova;
	int rc;

	iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova,
				      &cmd->flags);

	if ((cmd->flags &
	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
	       IOMMU_IOAS_MAP_READABLE)))
		return -EOPNOTSUPP;
	if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
	    cmd->dst_iova >= ULONG_MAX)
		return -EOVERFLOW;

	if (!(cmd->flags &
	      (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
		return -EINVAL;

	src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id);
	if (IS_ERR(src_ioas))
		return PTR_ERR(src_ioas);
	rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
			    &pages_list);
	iommufd_put_object(ucmd->ictx, &src_ioas->obj);
	if (rc)
		return rc;

	dst_ioas = iommufd_get_ioas(ucmd->ictx, cmd->dst_ioas_id);
	if (IS_ERR(dst_ioas)) {
		rc = PTR_ERR(dst_ioas);
		goto out_pages;
	}

	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
		flags = IOPT_ALLOC_IOVA;
	iova = cmd->dst_iova;
	rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
			    conv_iommu_prot(cmd->flags), flags);
	if (rc)
		goto out_put_dst;

	cmd->dst_iova = iova;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put_dst:
	iommufd_put_object(ucmd->ictx, &dst_ioas->obj);
out_pages:
	iopt_free_pages_list(&pages_list);
	return rc;
}

int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_unmap *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;
	unsigned long unmapped = 0;
	int rc;

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (cmd->iova == 0 && cmd->length == U64_MAX) {
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
		if (rc)
			goto out_put;
	} else {
		if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
			rc = -EOVERFLOW;
			goto out_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
				     &unmapped);
		if (rc)
			goto out_put;
	}

	cmd->length = unmapped;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));

out_put:
	iommufd_put_object(ucmd->ictx, &ioas->obj);
	return rc;
}

static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
					   struct xarray *ioas_list)
{
	struct iommufd_ioas *ioas;
	unsigned long index;

	xa_for_each(ioas_list, index, ioas) {
		up_write(&ioas->iopt.iova_rwsem);
		refcount_dec(&ioas->obj.users);
	}
	up_write(&ictx->ioas_creation_lock);
	xa_destroy(ioas_list);
}

static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
				       struct xarray *ioas_list)
{
	struct iommufd_object *obj;
	unsigned long index;
	int rc;

	/*
	 * This is very ugly, it is done instead of adding a lock around
	 * pages->source_mm, which is a performance path for mdev, we just
	 * obtain the write side of all the iova_rwsems which also protects the
	 * pages->source_*. Due to copies we can't know which IOAS could read
	 * from the pages, so we just lock everything. This is the only place
	 * locks are nested and they are uniformly taken in ID order.
	 *
	 * ioas_creation_lock prevents new IOAS from being installed in the
	 * xarray while we do this, and also prevents more than one thread from
	 * holding nested locks.
	 */
	down_write(&ictx->ioas_creation_lock);
	xa_lock(&ictx->objects);
	xa_for_each(&ictx->objects, index, obj) {
		struct iommufd_ioas *ioas;

		if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
			continue;

		if (!refcount_inc_not_zero(&obj->users))
			continue;

		xa_unlock(&ictx->objects);

		ioas = container_of(obj, struct iommufd_ioas, obj);
		down_write_nest_lock(&ioas->iopt.iova_rwsem,
				     &ictx->ioas_creation_lock);

		rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
		if (rc) {
			iommufd_release_all_iova_rwsem(ictx, ioas_list);
			return rc;
		}

		xa_lock(&ictx->objects);
	}
	xa_unlock(&ictx->objects);
	return 0;
}

static bool need_charge_update(struct iopt_pages *pages)
{
	switch (pages->account_mode) {
	case IOPT_PAGES_ACCOUNT_NONE:
		return false;
	case IOPT_PAGES_ACCOUNT_MM:
		return pages->source_mm != current->mm;
	case IOPT_PAGES_ACCOUNT_USER:
		/*
		 * Update when mm changes because it also accounts
		 * in mm->pinned_vm.
		 */
		return (pages->source_user != current_user()) ||
		       (pages->source_mm != current->mm);
	}
	return true;
}

static int charge_current(unsigned long *npinned)
{
	struct iopt_pages tmp = {
		.source_mm = current->mm,
		.source_task = current->group_leader,
		.source_user = current_user(),
	};
	unsigned int account_mode;
	int rc;

	for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
	     account_mode++) {
		if (!npinned[account_mode])
			continue;

		tmp.account_mode = account_mode;
		rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
					      NULL);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	while (account_mode != 0) {
		account_mode--;
		if (!npinned[account_mode])
			continue;
		tmp.account_mode = account_mode;
		iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
					 NULL);
	}
	return rc;
}

static void change_mm(struct iopt_pages *pages)
{
	struct task_struct *old_task = pages->source_task;
	struct user_struct *old_user = pages->source_user;
	struct mm_struct *old_mm = pages->source_mm;

	pages->source_mm = current->mm;
	mmgrab(pages->source_mm);
	mmdrop(old_mm);

	pages->source_task = current->group_leader;
	get_task_struct(pages->source_task);
	put_task_struct(old_task);

	pages->source_user = get_uid(current_user());
	free_uid(old_user);
}

#define for_each_ioas_area(_xa, _index, _ioas, _area) \
	xa_for_each((_xa), (_index), (_ioas)) \
		for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
		     _area; \
		     _area = iopt_area_iter_next(_area, 0, ULONG_MAX))

int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
{
	struct iommu_ioas_change_process *cmd = ucmd->cmd;
	struct iommufd_ctx *ictx = ucmd->ictx;
	unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
	struct iommufd_ioas *ioas;
	struct iopt_area *area;
	struct iopt_pages *pages;
	struct xarray ioas_list;
	unsigned long index;
	int rc;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	xa_init(&ioas_list);
	rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
	if (rc)
		return rc;

	for_each_ioas_area(&ioas_list, index, ioas, area)  {
		if (area->pages->type != IOPT_ADDRESS_FILE) {
			rc = -EINVAL;
			goto out;
		}
	}

	/*
	 * Count last_pinned pages, then clear it to avoid double counting
	 * if the same iopt_pages is visited multiple times in this loop.
	 * Since we are under all the locks, npinned == last_npinned, so we
	 * can easily restore last_npinned before we return.
	 */
	for_each_ioas_area(&ioas_list, index, ioas, area)  {
		pages = area->pages;

		if (need_charge_update(pages)) {
			all_npinned[pages->account_mode] += pages->last_npinned;
			pages->last_npinned = 0;
		}
	}

	rc = charge_current(all_npinned);

	if (rc) {
		/* Charge failed.  Fix last_npinned and bail. */
		for_each_ioas_area(&ioas_list, index, ioas, area)
			area->pages->last_npinned = area->pages->npinned;
		goto out;
	}

	for_each_ioas_area(&ioas_list, index, ioas, area) {
		pages = area->pages;

		/* Uncharge the old one (which also restores last_npinned) */
		if (need_charge_update(pages)) {
			int r = iopt_pages_update_pinned(pages, pages->npinned,
							 false, NULL);

			if (WARN_ON(r))
				rc = r;
		}
		change_mm(pages);
	}

out:
	iommufd_release_all_iova_rwsem(ictx, &ioas_list);
	return rc;
}

int iommufd_option_rlimit_mode(struct iommu_option *cmd,
			       struct iommufd_ctx *ictx)
{
	if (cmd->object_id)
		return -EOPNOTSUPP;

	if (cmd->op == IOMMU_OPTION_OP_GET) {
		cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
		return 0;
	}
	if (cmd->op == IOMMU_OPTION_OP_SET) {
		int rc = 0;

		if (!capable(CAP_SYS_RESOURCE))
			return -EPERM;

		xa_lock(&ictx->objects);
		if (!xa_empty(&ictx->objects)) {
			rc = -EBUSY;
		} else {
			if (cmd->val64 == 0)
				ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
			else if (cmd->val64 == 1)
				ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
			else
				rc = -EINVAL;
		}
		xa_unlock(&ictx->objects);

		return rc;
	}
	return -EOPNOTSUPP;
}

static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
					  struct iommufd_ioas *ioas)
{
	if (cmd->op == IOMMU_OPTION_OP_GET) {
		cmd->val64 = !ioas->iopt.disable_large_pages;
		return 0;
	}
	if (cmd->op == IOMMU_OPTION_OP_SET) {
		if (cmd->val64 == 0)
			return iopt_disable_large_pages(&ioas->iopt);
		if (cmd->val64 == 1) {
			iopt_enable_large_pages(&ioas->iopt);
			return 0;
		}
		return -EINVAL;
	}
	return -EOPNOTSUPP;
}

int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
{
	struct iommu_option *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;
	int rc = 0;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	ioas = iommufd_get_ioas(ucmd->ictx, cmd->object_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	switch (cmd->option_id) {
	case IOMMU_OPTION_HUGE_PAGES:
		rc = iommufd_ioas_option_huge_pages(cmd, ioas);
		break;
	default:
		rc = -EOPNOTSUPP;
	}

	iommufd_put_object(ucmd->ictx, &ioas->obj);
	return rc;
}