Contributors: 6
Author Tokens Token Proportion Commits Commit Proportion
Yishai Hadas 5284 97.40% 21 70.00%
Yi L Liu 68 1.25% 1 3.33%
Jason Gunthorpe 44 0.81% 4 13.33%
Shay Drory 16 0.29% 1 3.33%
Dan Carpenter 11 0.20% 2 6.67%
Shang XiaoJing 2 0.04% 1 3.33%
Total 5425 30


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Arbitrary to prevent userspace from consuming endless memory */
#define MAX_MIGRATION_SIZE (512*1024*1024)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As we use a stream based FD we may expect having the data always
	 * on first chunk
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
			       loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
	if (buf) {
		if (buf->start_pos == 0) {
			info.initial_bytes = buf->header_image_size - *pos;
		} else if (buf->start_pos ==
				sizeof(struct mlx5_vf_migration_header)) {
			/* First data buffer following the header */
			info.initial_bytes = buf->start_pos +
						buf->length - *pos;
		} else {
			info.dirty_bytes = buf->start_pos + buf->length - *pos;
		}
	} else {
		if (!end_of_data) {
			ret = -EINVAL;
			goto err_migf_unlock;
		}

		info.dirty_bytes = inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
	if (ret)
		goto out_pd;

	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_MIGRATION_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 flags;

		vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff);
		if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		flags = le64_to_cpup((__le64 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		if (flags) {
			ret = -EOPNOTSUPP;
			goto end;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
		migf->max_pos += vhca_buf->length;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = vhca_buf_header->header_image_size;

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf)) {
					ret = PTR_ERR(migf->buf);
					migf->buf = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf;
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						vhca_buf_header->header_image_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			vhca_buf_header->header_image_size = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &state_size, 0);
	if (!ret)
		*stop_copy_length = state_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");