Contributors: 8
Author Tokens Token Proportion Commits Commit Proportion
Sudeep Dutt 5949 97.56% 1 9.09%
Vincent Whitchurch 86 1.41% 4 36.36%
Ashutosh Dixit 26 0.43% 1 9.09%
Vasyl Gomonovych 16 0.26% 1 9.09%
Dan Carpenter 12 0.20% 1 9.09%
Linus Torvalds 5 0.08% 1 9.09%
Thomas Gleixner 2 0.03% 1 9.09%
Al Viro 2 0.03% 1 9.09%
Total 6098 11


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Intel MIC Platform Software Stack (MPSS)
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * Intel Virtio Over PCIe (VOP) driver.
 */
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/dma-mapping.h>

#include <linux/mic_common.h>
#include "../common/mic_dev.h"

#include <linux/mic_ioctl.h>
#include "vop_main.h"

/* Helper API to obtain the VOP PCIe device */
static inline struct device *vop_dev(struct vop_vdev *vdev)
{
	return vdev->vpdev->dev.parent;
}

/* Helper API to check if a virtio device is initialized */
static inline int vop_vdev_inited(struct vop_vdev *vdev)
{
	if (!vdev)
		return -EINVAL;
	/* Device has not been created yet */
	if (!vdev->dd || !vdev->dd->type) {
		dev_err(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, -EINVAL);
		return -EINVAL;
	}
	/* Device has been removed/deleted */
	if (vdev->dd->type == -1) {
		dev_dbg(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, -ENODEV);
		return -ENODEV;
	}
	return 0;
}

static void _vop_notify(struct vringh *vrh)
{
	struct vop_vringh *vvrh = container_of(vrh, struct vop_vringh, vrh);
	struct vop_vdev *vdev = vvrh->vdev;
	struct vop_device *vpdev = vdev->vpdev;
	s8 db = vdev->dc->h2c_vdev_db;

	if (db != -1)
		vpdev->hw_ops->send_intr(vpdev, db);
}

static void vop_virtio_init_post(struct vop_vdev *vdev)
{
	struct mic_vqconfig *vqconfig = mic_vq_config(vdev->dd);
	struct vop_device *vpdev = vdev->vpdev;
	int i, used_size;

	for (i = 0; i < vdev->dd->num_vq; i++) {
		used_size = PAGE_ALIGN(sizeof(u16) * 3 +
				sizeof(struct vring_used_elem) *
				le16_to_cpu(vqconfig->num));
		if (!le64_to_cpu(vqconfig[i].used_address)) {
			dev_warn(vop_dev(vdev), "used_address zero??\n");
			continue;
		}
		vdev->vvr[i].vrh.vring.used =
			(void __force *)vpdev->hw_ops->remap(
			vpdev,
			le64_to_cpu(vqconfig[i].used_address),
			used_size);
	}

	vdev->dc->used_address_updated = 0;

	dev_info(vop_dev(vdev), "%s: device type %d LINKUP\n",
		 __func__, vdev->virtio_id);
}

static inline void vop_virtio_device_reset(struct vop_vdev *vdev)
{
	int i;

	dev_dbg(vop_dev(vdev), "%s: status %d device type %d RESET\n",
		__func__, vdev->dd->status, vdev->virtio_id);

	for (i = 0; i < vdev->dd->num_vq; i++)
		/*
		 * Avoid lockdep false positive. The + 1 is for the vop
		 * mutex which is held in the reset devices code path.
		 */
		mutex_lock_nested(&vdev->vvr[i].vr_mutex, i + 1);

	/* 0 status means "reset" */
	vdev->dd->status = 0;
	vdev->dc->vdev_reset = 0;
	vdev->dc->host_ack = 1;

	for (i = 0; i < vdev->dd->num_vq; i++) {
		struct vringh *vrh = &vdev->vvr[i].vrh;

		vdev->vvr[i].vring.info->avail_idx = 0;
		vrh->completed = 0;
		vrh->last_avail_idx = 0;
		vrh->last_used_idx = 0;
	}

	for (i = 0; i < vdev->dd->num_vq; i++)
		mutex_unlock(&vdev->vvr[i].vr_mutex);
}

static void vop_virtio_reset_devices(struct vop_info *vi)
{
	struct list_head *pos, *tmp;
	struct vop_vdev *vdev;

	list_for_each_safe(pos, tmp, &vi->vdev_list) {
		vdev = list_entry(pos, struct vop_vdev, list);
		vop_virtio_device_reset(vdev);
		vdev->poll_wake = 1;
		wake_up(&vdev->waitq);
	}
}

static void vop_bh_handler(struct work_struct *work)
{
	struct vop_vdev *vdev = container_of(work, struct vop_vdev,
			virtio_bh_work);

	if (vdev->dc->used_address_updated)
		vop_virtio_init_post(vdev);

	if (vdev->dc->vdev_reset)
		vop_virtio_device_reset(vdev);

	vdev->poll_wake = 1;
	wake_up(&vdev->waitq);
}

static irqreturn_t _vop_virtio_intr_handler(int irq, void *data)
{
	struct vop_vdev *vdev = data;
	struct vop_device *vpdev = vdev->vpdev;

	vpdev->hw_ops->ack_interrupt(vpdev, vdev->virtio_db);
	schedule_work(&vdev->virtio_bh_work);
	return IRQ_HANDLED;
}

static int vop_virtio_config_change(struct vop_vdev *vdev, void *argp)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake);
	int ret = 0, retry, i;
	struct vop_device *vpdev = vdev->vpdev;
	struct vop_info *vi = dev_get_drvdata(&vpdev->dev);
	struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev);
	s8 db = bootparam->h2c_config_db;

	mutex_lock(&vi->vop_mutex);
	for (i = 0; i < vdev->dd->num_vq; i++)
		mutex_lock_nested(&vdev->vvr[i].vr_mutex, i + 1);

	if (db == -1 || vdev->dd->type == -1) {
		ret = -EIO;
		goto exit;
	}

	memcpy(mic_vq_configspace(vdev->dd), argp, vdev->dd->config_len);
	vdev->dc->config_change = MIC_VIRTIO_PARAM_CONFIG_CHANGED;
	vpdev->hw_ops->send_intr(vpdev, db);

	for (retry = 100; retry--;) {
		ret = wait_event_timeout(wake, vdev->dc->guest_ack,
					 msecs_to_jiffies(100));
		if (ret)
			break;
	}

	dev_dbg(vop_dev(vdev),
		"%s %d retry: %d\n", __func__, __LINE__, retry);
	vdev->dc->config_change = 0;
	vdev->dc->guest_ack = 0;
exit:
	for (i = 0; i < vdev->dd->num_vq; i++)
		mutex_unlock(&vdev->vvr[i].vr_mutex);
	mutex_unlock(&vi->vop_mutex);
	return ret;
}

static int vop_copy_dp_entry(struct vop_vdev *vdev,
			     struct mic_device_desc *argp, __u8 *type,
			     struct mic_device_desc **devpage)
{
	struct vop_device *vpdev = vdev->vpdev;
	struct mic_device_desc *devp;
	struct mic_vqconfig *vqconfig;
	int ret = 0, i;
	bool slot_found = false;

	vqconfig = mic_vq_config(argp);
	for (i = 0; i < argp->num_vq; i++) {
		if (le16_to_cpu(vqconfig[i].num) > MIC_MAX_VRING_ENTRIES) {
			ret =  -EINVAL;
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			goto exit;
		}
	}

	/* Find the first free device page entry */
	for (i = sizeof(struct mic_bootparam);
		i < MIC_DP_SIZE - mic_total_desc_size(argp);
		i += mic_total_desc_size(devp)) {
		devp = vpdev->hw_ops->get_dp(vpdev) + i;
		if (devp->type == 0 || devp->type == -1) {
			slot_found = true;
			break;
		}
	}
	if (!slot_found) {
		ret =  -EINVAL;
		dev_err(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, ret);
		goto exit;
	}
	/*
	 * Save off the type before doing the memcpy. Type will be set in the
	 * end after completing all initialization for the new device.
	 */
	*type = argp->type;
	argp->type = 0;
	memcpy(devp, argp, mic_desc_size(argp));

	*devpage = devp;
exit:
	return ret;
}

static void vop_init_device_ctrl(struct vop_vdev *vdev,
				 struct mic_device_desc *devpage)
{
	struct mic_device_ctrl *dc;

	dc = (void *)devpage + mic_aligned_desc_size(devpage);

	dc->config_change = 0;
	dc->guest_ack = 0;
	dc->vdev_reset = 0;
	dc->host_ack = 0;
	dc->used_address_updated = 0;
	dc->c2h_vdev_db = -1;
	dc->h2c_vdev_db = -1;
	vdev->dc = dc;
}

static int vop_virtio_add_device(struct vop_vdev *vdev,
				 struct mic_device_desc *argp)
{
	struct vop_info *vi = vdev->vi;
	struct vop_device *vpdev = vi->vpdev;
	struct mic_device_desc *dd = NULL;
	struct mic_vqconfig *vqconfig;
	int vr_size, i, j, ret;
	u8 type = 0;
	s8 db = -1;
	char irqname[16];
	struct mic_bootparam *bootparam;
	u16 num;
	dma_addr_t vr_addr;

	bootparam = vpdev->hw_ops->get_dp(vpdev);
	init_waitqueue_head(&vdev->waitq);
	INIT_LIST_HEAD(&vdev->list);
	vdev->vpdev = vpdev;

	ret = vop_copy_dp_entry(vdev, argp, &type, &dd);
	if (ret) {
		dev_err(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, ret);
		return ret;
	}

	vop_init_device_ctrl(vdev, dd);

	vdev->dd = dd;
	vdev->virtio_id = type;
	vqconfig = mic_vq_config(dd);
	INIT_WORK(&vdev->virtio_bh_work, vop_bh_handler);

	for (i = 0; i < dd->num_vq; i++) {
		struct vop_vringh *vvr = &vdev->vvr[i];
		struct mic_vring *vr = &vdev->vvr[i].vring;

		num = le16_to_cpu(vqconfig[i].num);
		mutex_init(&vvr->vr_mutex);
		vr_size = PAGE_ALIGN(vring_size(num, MIC_VIRTIO_RING_ALIGN) +
			sizeof(struct _mic_vring_info));
		vr->va = (void *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(vr_size));
		if (!vr->va) {
			ret = -ENOMEM;
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			goto err;
		}
		vr->len = vr_size;
		vr->info = vr->va + vring_size(num, MIC_VIRTIO_RING_ALIGN);
		vr->info->magic = cpu_to_le32(MIC_MAGIC + vdev->virtio_id + i);
		vr_addr = dma_map_single(&vpdev->dev, vr->va, vr_size,
					 DMA_BIDIRECTIONAL);
		if (dma_mapping_error(&vpdev->dev, vr_addr)) {
			free_pages((unsigned long)vr->va, get_order(vr_size));
			ret = -ENOMEM;
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			goto err;
		}
		vqconfig[i].address = cpu_to_le64(vr_addr);

		vring_init(&vr->vr, num, vr->va, MIC_VIRTIO_RING_ALIGN);
		ret = vringh_init_kern(&vvr->vrh,
				       *(u32 *)mic_vq_features(vdev->dd),
				       num, false, vr->vr.desc, vr->vr.avail,
				       vr->vr.used);
		if (ret) {
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			goto err;
		}
		vringh_kiov_init(&vvr->riov, NULL, 0);
		vringh_kiov_init(&vvr->wiov, NULL, 0);
		vvr->head = USHRT_MAX;
		vvr->vdev = vdev;
		vvr->vrh.notify = _vop_notify;
		dev_dbg(&vpdev->dev,
			"%s %d index %d va %p info %p vr_size 0x%x\n",
			__func__, __LINE__, i, vr->va, vr->info, vr_size);
		vvr->buf = (void *)__get_free_pages(GFP_KERNEL,
					get_order(VOP_INT_DMA_BUF_SIZE));
		vvr->buf_da = dma_map_single(&vpdev->dev,
					  vvr->buf, VOP_INT_DMA_BUF_SIZE,
					  DMA_BIDIRECTIONAL);
	}

	snprintf(irqname, sizeof(irqname), "vop%dvirtio%d", vpdev->index,
		 vdev->virtio_id);
	vdev->virtio_db = vpdev->hw_ops->next_db(vpdev);
	vdev->virtio_cookie = vpdev->hw_ops->request_irq(vpdev,
			_vop_virtio_intr_handler, irqname, vdev,
			vdev->virtio_db);
	if (IS_ERR(vdev->virtio_cookie)) {
		ret = PTR_ERR(vdev->virtio_cookie);
		dev_dbg(&vpdev->dev, "request irq failed\n");
		goto err;
	}

	vdev->dc->c2h_vdev_db = vdev->virtio_db;

	/*
	 * Order the type update with previous stores. This write barrier
	 * is paired with the corresponding read barrier before the uncached
	 * system memory read of the type, on the card while scanning the
	 * device page.
	 */
	smp_wmb();
	dd->type = type;
	argp->type = type;

	if (bootparam) {
		db = bootparam->h2c_config_db;
		if (db != -1)
			vpdev->hw_ops->send_intr(vpdev, db);
	}
	dev_dbg(&vpdev->dev, "Added virtio id %d db %d\n", dd->type, db);
	return 0;
err:
	vqconfig = mic_vq_config(dd);
	for (j = 0; j < i; j++) {
		struct vop_vringh *vvr = &vdev->vvr[j];

		dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[j].address),
				 vvr->vring.len, DMA_BIDIRECTIONAL);
		free_pages((unsigned long)vvr->vring.va,
			   get_order(vvr->vring.len));
	}
	return ret;
}

static void vop_dev_remove(struct vop_info *pvi, struct mic_device_ctrl *devp,
			   struct vop_device *vpdev)
{
	struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev);
	s8 db;
	int ret, retry;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake);

	devp->config_change = MIC_VIRTIO_PARAM_DEV_REMOVE;
	db = bootparam->h2c_config_db;
	if (db != -1)
		vpdev->hw_ops->send_intr(vpdev, db);
	else
		goto done;
	for (retry = 15; retry--;) {
		ret = wait_event_timeout(wake, devp->guest_ack,
					 msecs_to_jiffies(1000));
		if (ret)
			break;
	}
done:
	devp->config_change = 0;
	devp->guest_ack = 0;
}

static void vop_virtio_del_device(struct vop_vdev *vdev)
{
	struct vop_info *vi = vdev->vi;
	struct vop_device *vpdev = vdev->vpdev;
	int i;
	struct mic_vqconfig *vqconfig;
	struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev);

	if (!bootparam)
		goto skip_hot_remove;
	vop_dev_remove(vi, vdev->dc, vpdev);
skip_hot_remove:
	vpdev->hw_ops->free_irq(vpdev, vdev->virtio_cookie, vdev);
	flush_work(&vdev->virtio_bh_work);
	vqconfig = mic_vq_config(vdev->dd);
	for (i = 0; i < vdev->dd->num_vq; i++) {
		struct vop_vringh *vvr = &vdev->vvr[i];

		dma_unmap_single(&vpdev->dev,
				 vvr->buf_da, VOP_INT_DMA_BUF_SIZE,
				 DMA_BIDIRECTIONAL);
		free_pages((unsigned long)vvr->buf,
			   get_order(VOP_INT_DMA_BUF_SIZE));
		vringh_kiov_cleanup(&vvr->riov);
		vringh_kiov_cleanup(&vvr->wiov);
		dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[i].address),
				 vvr->vring.len, DMA_BIDIRECTIONAL);
		free_pages((unsigned long)vvr->vring.va,
			   get_order(vvr->vring.len));
	}
	/*
	 * Order the type update with previous stores. This write barrier
	 * is paired with the corresponding read barrier before the uncached
	 * system memory read of the type, on the card while scanning the
	 * device page.
	 */
	smp_wmb();
	vdev->dd->type = -1;
}

/*
 * vop_sync_dma - Wrapper for synchronous DMAs.
 *
 * @dev - The address of the pointer to the device instance used
 * for DMA registration.
 * @dst - destination DMA address.
 * @src - source DMA address.
 * @len - size of the transfer.
 *
 * Return DMA_SUCCESS on success
 */
static int vop_sync_dma(struct vop_vdev *vdev, dma_addr_t dst, dma_addr_t src,
			size_t len)
{
	int err = 0;
	struct dma_device *ddev;
	struct dma_async_tx_descriptor *tx;
	struct vop_info *vi = dev_get_drvdata(&vdev->vpdev->dev);
	struct dma_chan *vop_ch = vi->dma_ch;

	if (!vop_ch) {
		err = -EBUSY;
		goto error;
	}
	ddev = vop_ch->device;
	tx = ddev->device_prep_dma_memcpy(vop_ch, dst, src, len,
		DMA_PREP_FENCE);
	if (!tx) {
		err = -ENOMEM;
		goto error;
	} else {
		dma_cookie_t cookie;

		cookie = tx->tx_submit(tx);
		if (dma_submit_error(cookie)) {
			err = -ENOMEM;
			goto error;
		}
		dma_async_issue_pending(vop_ch);
		err = dma_sync_wait(vop_ch, cookie);
	}
error:
	if (err)
		dev_err(&vi->vpdev->dev, "%s %d err %d\n",
			__func__, __LINE__, err);
	return err;
}

#define VOP_USE_DMA true

/*
 * Initiates the copies across the PCIe bus from card memory to a user
 * space buffer. When transfers are done using DMA, source/destination
 * addresses and transfer length must follow the alignment requirements of
 * the MIC DMA engine.
 */
static int vop_virtio_copy_to_user(struct vop_vdev *vdev, void __user *ubuf,
				   size_t len, u64 daddr, size_t dlen,
				   int vr_idx)
{
	struct vop_device *vpdev = vdev->vpdev;
	void __iomem *dbuf = vpdev->hw_ops->remap(vpdev, daddr, len);
	struct vop_vringh *vvr = &vdev->vvr[vr_idx];
	struct vop_info *vi = dev_get_drvdata(&vpdev->dev);
	size_t dma_alignment;
	bool x200;
	size_t dma_offset, partlen;
	int err;

	if (!VOP_USE_DMA || !vi->dma_ch) {
		if (copy_to_user(ubuf, (void __force *)dbuf, len)) {
			err = -EFAULT;
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, err);
			goto err;
		}
		vdev->in_bytes += len;
		err = 0;
		goto err;
	}

	dma_alignment = 1 << vi->dma_ch->device->copy_align;
	x200 = is_dma_copy_aligned(vi->dma_ch->device, 1, 1, 1);

	dma_offset = daddr - round_down(daddr, dma_alignment);
	daddr -= dma_offset;
	len += dma_offset;
	/*
	 * X100 uses DMA addresses as seen by the card so adding
	 * the aperture base is not required for DMA. However x200
	 * requires DMA addresses to be an offset into the bar so
	 * add the aperture base for x200.
	 */
	if (x200)
		daddr += vpdev->aper->pa;
	while (len) {
		partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE);
		err = vop_sync_dma(vdev, vvr->buf_da, daddr,
				   ALIGN(partlen, dma_alignment));
		if (err) {
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, err);
			goto err;
		}
		if (copy_to_user(ubuf, vvr->buf + dma_offset,
				 partlen - dma_offset)) {
			err = -EFAULT;
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, err);
			goto err;
		}
		daddr += partlen;
		ubuf += partlen;
		dbuf += partlen;
		vdev->in_bytes_dma += partlen;
		vdev->in_bytes += partlen;
		len -= partlen;
		dma_offset = 0;
	}
	err = 0;
err:
	vpdev->hw_ops->unmap(vpdev, dbuf);
	dev_dbg(vop_dev(vdev),
		"%s: ubuf %p dbuf %p len 0x%zx vr_idx 0x%x\n",
		__func__, ubuf, dbuf, len, vr_idx);
	return err;
}

/*
 * Initiates copies across the PCIe bus from a user space buffer to card
 * memory. When transfers are done using DMA, source/destination addresses
 * and transfer length must follow the alignment requirements of the MIC
 * DMA engine.
 */
static int vop_virtio_copy_from_user(struct vop_vdev *vdev, void __user *ubuf,
				     size_t len, u64 daddr, size_t dlen,
				     int vr_idx)
{
	struct vop_device *vpdev = vdev->vpdev;
	void __iomem *dbuf = vpdev->hw_ops->remap(vpdev, daddr, len);
	struct vop_vringh *vvr = &vdev->vvr[vr_idx];
	struct vop_info *vi = dev_get_drvdata(&vdev->vpdev->dev);
	size_t dma_alignment;
	bool x200;
	size_t partlen;
	bool dma = VOP_USE_DMA && vi->dma_ch;
	int err = 0;

	if (dma) {
		dma_alignment = 1 << vi->dma_ch->device->copy_align;
		x200 = is_dma_copy_aligned(vi->dma_ch->device, 1, 1, 1);

		if (daddr & (dma_alignment - 1)) {
			vdev->tx_dst_unaligned += len;
			dma = false;
		} else if (ALIGN(len, dma_alignment) > dlen) {
			vdev->tx_len_unaligned += len;
			dma = false;
		}
	}

	if (!dma)
		goto memcpy;

	/*
	 * X100 uses DMA addresses as seen by the card so adding
	 * the aperture base is not required for DMA. However x200
	 * requires DMA addresses to be an offset into the bar so
	 * add the aperture base for x200.
	 */
	if (x200)
		daddr += vpdev->aper->pa;
	while (len) {
		partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE);

		if (copy_from_user(vvr->buf, ubuf, partlen)) {
			err = -EFAULT;
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, err);
			goto err;
		}
		err = vop_sync_dma(vdev, daddr, vvr->buf_da,
				   ALIGN(partlen, dma_alignment));
		if (err) {
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, err);
			goto err;
		}
		daddr += partlen;
		ubuf += partlen;
		dbuf += partlen;
		vdev->out_bytes_dma += partlen;
		vdev->out_bytes += partlen;
		len -= partlen;
	}
memcpy:
	/*
	 * We are copying to IO below and should ideally use something
	 * like copy_from_user_toio(..) if it existed.
	 */
	if (copy_from_user((void __force *)dbuf, ubuf, len)) {
		err = -EFAULT;
		dev_err(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, err);
		goto err;
	}
	vdev->out_bytes += len;
	err = 0;
err:
	vpdev->hw_ops->unmap(vpdev, dbuf);
	dev_dbg(vop_dev(vdev),
		"%s: ubuf %p dbuf %p len 0x%zx vr_idx 0x%x\n",
		__func__, ubuf, dbuf, len, vr_idx);
	return err;
}

#define MIC_VRINGH_READ true

/* Determine the total number of bytes consumed in a VRINGH KIOV */
static inline u32 vop_vringh_iov_consumed(struct vringh_kiov *iov)
{
	int i;
	u32 total = iov->consumed;

	for (i = 0; i < iov->i; i++)
		total += iov->iov[i].iov_len;
	return total;
}

/*
 * Traverse the VRINGH KIOV and issue the APIs to trigger the copies.
 * This API is heavily based on the vringh_iov_xfer(..) implementation
 * in vringh.c. The reason we cannot reuse vringh_iov_pull_kern(..)
 * and vringh_iov_push_kern(..) directly is because there is no
 * way to override the VRINGH xfer(..) routines as of v3.10.
 */
static int vop_vringh_copy(struct vop_vdev *vdev, struct vringh_kiov *iov,
			   void __user *ubuf, size_t len, bool read, int vr_idx,
			   size_t *out_len)
{
	int ret = 0;
	size_t partlen, tot_len = 0;

	while (len && iov->i < iov->used) {
		struct kvec *kiov = &iov->iov[iov->i];
		unsigned long daddr = (unsigned long)kiov->iov_base;

		partlen = min(kiov->iov_len, len);
		if (read)
			ret = vop_virtio_copy_to_user(vdev, ubuf, partlen,
						      daddr,
						      kiov->iov_len,
						      vr_idx);
		else
			ret = vop_virtio_copy_from_user(vdev, ubuf, partlen,
							daddr,
							kiov->iov_len,
							vr_idx);
		if (ret) {
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			break;
		}
		len -= partlen;
		ubuf += partlen;
		tot_len += partlen;
		iov->consumed += partlen;
		kiov->iov_len -= partlen;
		kiov->iov_base += partlen;
		if (!kiov->iov_len) {
			/* Fix up old iov element then increment. */
			kiov->iov_len = iov->consumed;
			kiov->iov_base -= iov->consumed;

			iov->consumed = 0;
			iov->i++;
		}
	}
	*out_len = tot_len;
	return ret;
}

/*
 * Use the standard VRINGH infrastructure in the kernel to fetch new
 * descriptors, initiate the copies and update the used ring.
 */
static int _vop_virtio_copy(struct vop_vdev *vdev, struct mic_copy_desc *copy)
{
	int ret = 0;
	u32 iovcnt = copy->iovcnt;
	struct iovec iov;
	struct iovec __user *u_iov = copy->iov;
	void __user *ubuf = NULL;
	struct vop_vringh *vvr = &vdev->vvr[copy->vr_idx];
	struct vringh_kiov *riov = &vvr->riov;
	struct vringh_kiov *wiov = &vvr->wiov;
	struct vringh *vrh = &vvr->vrh;
	u16 *head = &vvr->head;
	struct mic_vring *vr = &vvr->vring;
	size_t len = 0, out_len;

	copy->out_len = 0;
	/* Fetch a new IOVEC if all previous elements have been processed */
	if (riov->i == riov->used && wiov->i == wiov->used) {
		ret = vringh_getdesc_kern(vrh, riov, wiov,
					  head, GFP_KERNEL);
		/* Check if there are available descriptors */
		if (ret <= 0)
			return ret;
	}
	while (iovcnt) {
		if (!len) {
			/* Copy over a new iovec from user space. */
			ret = copy_from_user(&iov, u_iov, sizeof(*u_iov));
			if (ret) {
				ret = -EINVAL;
				dev_err(vop_dev(vdev), "%s %d err %d\n",
					__func__, __LINE__, ret);
				break;
			}
			len = iov.iov_len;
			ubuf = iov.iov_base;
		}
		/* Issue all the read descriptors first */
		ret = vop_vringh_copy(vdev, riov, ubuf, len,
				      MIC_VRINGH_READ, copy->vr_idx, &out_len);
		if (ret) {
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			break;
		}
		len -= out_len;
		ubuf += out_len;
		copy->out_len += out_len;
		/* Issue the write descriptors next */
		ret = vop_vringh_copy(vdev, wiov, ubuf, len,
				      !MIC_VRINGH_READ, copy->vr_idx, &out_len);
		if (ret) {
			dev_err(vop_dev(vdev), "%s %d err %d\n",
				__func__, __LINE__, ret);
			break;
		}
		len -= out_len;
		ubuf += out_len;
		copy->out_len += out_len;
		if (!len) {
			/* One user space iovec is now completed */
			iovcnt--;
			u_iov++;
		}
		/* Exit loop if all elements in KIOVs have been processed. */
		if (riov->i == riov->used && wiov->i == wiov->used)
			break;
	}
	/*
	 * Update the used ring if a descriptor was available and some data was
	 * copied in/out and the user asked for a used ring update.
	 */
	if (*head != USHRT_MAX && copy->out_len && copy->update_used) {
		u32 total = 0;

		/* Determine the total data consumed */
		total += vop_vringh_iov_consumed(riov);
		total += vop_vringh_iov_consumed(wiov);
		vringh_complete_kern(vrh, *head, total);
		*head = USHRT_MAX;
		if (vringh_need_notify_kern(vrh) > 0)
			vringh_notify(vrh);
		vringh_kiov_cleanup(riov);
		vringh_kiov_cleanup(wiov);
		/* Update avail idx for user space */
		vr->info->avail_idx = vrh->last_avail_idx;
	}
	return ret;
}

static inline int vop_verify_copy_args(struct vop_vdev *vdev,
				       struct mic_copy_desc *copy)
{
	if (!vdev || copy->vr_idx >= vdev->dd->num_vq)
		return -EINVAL;
	return 0;
}

/* Copy a specified number of virtio descriptors in a chain */
static int vop_virtio_copy_desc(struct vop_vdev *vdev,
				struct mic_copy_desc *copy)
{
	int err;
	struct vop_vringh *vvr;

	err = vop_verify_copy_args(vdev, copy);
	if (err)
		return err;

	vvr = &vdev->vvr[copy->vr_idx];
	mutex_lock(&vvr->vr_mutex);
	if (!vop_vdevup(vdev)) {
		err = -ENODEV;
		dev_err(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, err);
		goto err;
	}
	err = _vop_virtio_copy(vdev, copy);
	if (err) {
		dev_err(vop_dev(vdev), "%s %d err %d\n",
			__func__, __LINE__, err);
	}
err:
	mutex_unlock(&vvr->vr_mutex);
	return err;
}

static int vop_open(struct inode *inode, struct file *f)
{
	struct vop_vdev *vdev;
	struct vop_info *vi = container_of(f->private_data,
		struct vop_info, miscdev);

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;
	vdev->vi = vi;
	mutex_init(&vdev->vdev_mutex);
	f->private_data = vdev;
	init_completion(&vdev->destroy);
	complete(&vdev->destroy);
	return 0;
}

static int vop_release(struct inode *inode, struct file *f)
{
	struct vop_vdev *vdev = f->private_data, *vdev_tmp;
	struct vop_info *vi = vdev->vi;
	struct list_head *pos, *tmp;
	bool found = false;

	mutex_lock(&vdev->vdev_mutex);
	if (vdev->deleted)
		goto unlock;
	mutex_lock(&vi->vop_mutex);
	list_for_each_safe(pos, tmp, &vi->vdev_list) {
		vdev_tmp = list_entry(pos, struct vop_vdev, list);
		if (vdev == vdev_tmp) {
			vop_virtio_del_device(vdev);
			list_del(pos);
			found = true;
			break;
		}
	}
	mutex_unlock(&vi->vop_mutex);
unlock:
	mutex_unlock(&vdev->vdev_mutex);
	if (!found)
		wait_for_completion(&vdev->destroy);
	f->private_data = NULL;
	kfree(vdev);
	return 0;
}

static long vop_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct vop_vdev *vdev = f->private_data;
	struct vop_info *vi = vdev->vi;
	void __user *argp = (void __user *)arg;
	int ret;

	switch (cmd) {
	case MIC_VIRTIO_ADD_DEVICE:
	{
		struct mic_device_desc dd, *dd_config;

		if (copy_from_user(&dd, argp, sizeof(dd)))
			return -EFAULT;

		if (mic_aligned_desc_size(&dd) > MIC_MAX_DESC_BLK_SIZE ||
		    dd.num_vq > MIC_MAX_VRINGS)
			return -EINVAL;

		dd_config = memdup_user(argp, mic_desc_size(&dd));
		if (IS_ERR(dd_config))
			return PTR_ERR(dd_config);

		/* Ensure desc has not changed between the two reads */
		if (memcmp(&dd, dd_config, sizeof(dd))) {
			ret = -EINVAL;
			goto free_ret;
		}
		mutex_lock(&vdev->vdev_mutex);
		mutex_lock(&vi->vop_mutex);
		ret = vop_virtio_add_device(vdev, dd_config);
		if (ret)
			goto unlock_ret;
		list_add_tail(&vdev->list, &vi->vdev_list);
unlock_ret:
		mutex_unlock(&vi->vop_mutex);
		mutex_unlock(&vdev->vdev_mutex);
free_ret:
		kfree(dd_config);
		return ret;
	}
	case MIC_VIRTIO_COPY_DESC:
	{
		struct mic_copy_desc copy;

		mutex_lock(&vdev->vdev_mutex);
		ret = vop_vdev_inited(vdev);
		if (ret)
			goto _unlock_ret;

		if (copy_from_user(&copy, argp, sizeof(copy))) {
			ret = -EFAULT;
			goto _unlock_ret;
		}

		ret = vop_virtio_copy_desc(vdev, &copy);
		if (ret < 0)
			goto _unlock_ret;
		if (copy_to_user(
			&((struct mic_copy_desc __user *)argp)->out_len,
			&copy.out_len, sizeof(copy.out_len)))
			ret = -EFAULT;
_unlock_ret:
		mutex_unlock(&vdev->vdev_mutex);
		return ret;
	}
	case MIC_VIRTIO_CONFIG_CHANGE:
	{
		void *buf;

		mutex_lock(&vdev->vdev_mutex);
		ret = vop_vdev_inited(vdev);
		if (ret)
			goto __unlock_ret;
		buf = memdup_user(argp, vdev->dd->config_len);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto __unlock_ret;
		}
		ret = vop_virtio_config_change(vdev, buf);
		kfree(buf);
__unlock_ret:
		mutex_unlock(&vdev->vdev_mutex);
		return ret;
	}
	default:
		return -ENOIOCTLCMD;
	};
	return 0;
}

/*
 * We return EPOLLIN | EPOLLOUT from poll when new buffers are enqueued, and
 * not when previously enqueued buffers may be available. This means that
 * in the card->host (TX) path, when userspace is unblocked by poll it
 * must drain all available descriptors or it can stall.
 */
static __poll_t vop_poll(struct file *f, poll_table *wait)
{
	struct vop_vdev *vdev = f->private_data;
	__poll_t mask = 0;

	mutex_lock(&vdev->vdev_mutex);
	if (vop_vdev_inited(vdev)) {
		mask = EPOLLERR;
		goto done;
	}
	poll_wait(f, &vdev->waitq, wait);
	if (vop_vdev_inited(vdev)) {
		mask = EPOLLERR;
	} else if (vdev->poll_wake) {
		vdev->poll_wake = 0;
		mask = EPOLLIN | EPOLLOUT;
	}
done:
	mutex_unlock(&vdev->vdev_mutex);
	return mask;
}

static inline int
vop_query_offset(struct vop_vdev *vdev, unsigned long offset,
		 unsigned long *size, unsigned long *pa)
{
	struct vop_device *vpdev = vdev->vpdev;
	unsigned long start = MIC_DP_SIZE;
	int i;

	/*
	 * MMAP interface is as follows:
	 * offset				region
	 * 0x0					virtio device_page
	 * 0x1000				first vring
	 * 0x1000 + size of 1st vring		second vring
	 * ....
	 */
	if (!offset) {
		*pa = virt_to_phys(vpdev->hw_ops->get_dp(vpdev));
		*size = MIC_DP_SIZE;
		return 0;
	}

	for (i = 0; i < vdev->dd->num_vq; i++) {
		struct vop_vringh *vvr = &vdev->vvr[i];

		if (offset == start) {
			*pa = virt_to_phys(vvr->vring.va);
			*size = vvr->vring.len;
			return 0;
		}
		start += vvr->vring.len;
	}
	return -1;
}

/*
 * Maps the device page and virtio rings to user space for readonly access.
 */
static int vop_mmap(struct file *f, struct vm_area_struct *vma)
{
	struct vop_vdev *vdev = f->private_data;
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long pa, size = vma->vm_end - vma->vm_start, size_rem = size;
	int i, err;

	err = vop_vdev_inited(vdev);
	if (err)
		goto ret;
	if (vma->vm_flags & VM_WRITE) {
		err = -EACCES;
		goto ret;
	}
	while (size_rem) {
		i = vop_query_offset(vdev, offset, &size, &pa);
		if (i < 0) {
			err = -EINVAL;
			goto ret;
		}
		err = remap_pfn_range(vma, vma->vm_start + offset,
				      pa >> PAGE_SHIFT, size,
				      vma->vm_page_prot);
		if (err)
			goto ret;
		size_rem -= size;
		offset += size;
	}
ret:
	return err;
}

static const struct file_operations vop_fops = {
	.open = vop_open,
	.release = vop_release,
	.unlocked_ioctl = vop_ioctl,
	.poll = vop_poll,
	.mmap = vop_mmap,
	.owner = THIS_MODULE,
};

int vop_host_init(struct vop_info *vi)
{
	int rc;
	struct miscdevice *mdev;
	struct vop_device *vpdev = vi->vpdev;

	INIT_LIST_HEAD(&vi->vdev_list);
	vi->dma_ch = vpdev->dma_ch;
	mdev = &vi->miscdev;
	mdev->minor = MISC_DYNAMIC_MINOR;
	snprintf(vi->name, sizeof(vi->name), "vop_virtio%d", vpdev->index);
	mdev->name = vi->name;
	mdev->fops = &vop_fops;
	mdev->parent = &vpdev->dev;

	rc = misc_register(mdev);
	if (rc)
		dev_err(&vpdev->dev, "%s failed rc %d\n", __func__, rc);
	return rc;
}

void vop_host_uninit(struct vop_info *vi)
{
	struct list_head *pos, *tmp;
	struct vop_vdev *vdev;

	mutex_lock(&vi->vop_mutex);
	vop_virtio_reset_devices(vi);
	list_for_each_safe(pos, tmp, &vi->vdev_list) {
		vdev = list_entry(pos, struct vop_vdev, list);
		list_del(pos);
		reinit_completion(&vdev->destroy);
		mutex_unlock(&vi->vop_mutex);
		mutex_lock(&vdev->vdev_mutex);
		vop_virtio_del_device(vdev);
		vdev->deleted = true;
		mutex_unlock(&vdev->vdev_mutex);
		complete(&vdev->destroy);
		mutex_lock(&vi->vop_mutex);
	}
	mutex_unlock(&vi->vop_mutex);
	misc_deregister(&vi->miscdev);
}