Contributors: 11
Author Tokens Token Proportion Commits Commit Proportion
Johannes Berg 4059 88.35% 5 22.73%
Vincent Whitchurch 424 9.23% 6 27.27%
Benjamin Berg 71 1.55% 2 9.09%
Kees Cook 20 0.44% 1 4.55%
Jeff Dike 8 0.17% 2 9.09%
Anton Ivanov 4 0.09% 1 4.55%
Alexey Dobriyan 2 0.04% 1 4.55%
Al Viro 2 0.04% 1 4.55%
Xiu Jianfeng 2 0.04% 1 4.55%
Alex Dewar 1 0.02% 1 4.55%
Michael S. Tsirkin 1 0.02% 1 4.55%
Total 4594 22


// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020 Intel Corporation
 * Author: Johannes Berg <johannes@sipsolutions.net>
 */
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/logic_iomem.h>
#include <linux/of_platform.h>
#include <linux/irqdomain.h>
#include <linux/virtio_pcidev.h>
#include <linux/virtio-uml.h>
#include <linux/delay.h>
#include <linux/msi.h>
#include <asm/unaligned.h>
#include <irq_kern.h>

#define MAX_DEVICES 8
#define MAX_MSI_VECTORS 32
#define CFG_SPACE_SIZE 4096

/* for MSI-X we have a 32-bit payload */
#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32))
#define NUM_IRQ_MSGS	10

#define HANDLE_NO_FREE(ptr) ((void *)((unsigned long)(ptr) | 1))
#define HANDLE_IS_NO_FREE(ptr) ((unsigned long)(ptr) & 1)

struct um_pci_device {
	struct virtio_device *vdev;

	/* for now just standard BARs */
	u8 resptr[PCI_STD_NUM_BARS];

	struct virtqueue *cmd_vq, *irq_vq;

#define UM_PCI_STAT_WAITING	0
	unsigned long status;

	int irq;

	bool platform;
};

struct um_pci_device_reg {
	struct um_pci_device *dev;
	void __iomem *iomem;
};

static struct pci_host_bridge *bridge;
static DEFINE_MUTEX(um_pci_mtx);
static struct um_pci_device *um_pci_platform_device;
static struct um_pci_device_reg um_pci_devices[MAX_DEVICES];
static struct fwnode_handle *um_pci_fwnode;
static struct irq_domain *um_pci_inner_domain;
static struct irq_domain *um_pci_msi_domain;
static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)];

static unsigned int um_pci_max_delay_us = 40000;
module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644);

struct um_pci_message_buffer {
	struct virtio_pcidev_msg hdr;
	u8 data[8];
};

static struct um_pci_message_buffer __percpu *um_pci_msg_bufs;

static int um_pci_send_cmd(struct um_pci_device *dev,
			   struct virtio_pcidev_msg *cmd,
			   unsigned int cmd_size,
			   const void *extra, unsigned int extra_size,
			   void *out, unsigned int out_size)
{
	struct scatterlist out_sg, extra_sg, in_sg;
	struct scatterlist *sgs_list[] = {
		[0] = &out_sg,
		[1] = extra ? &extra_sg : &in_sg,
		[2] = extra ? &in_sg : NULL,
	};
	struct um_pci_message_buffer *buf;
	int delay_count = 0;
	int ret, len;
	bool posted;

	if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf)))
		return -EINVAL;

	switch (cmd->op) {
	case VIRTIO_PCIDEV_OP_CFG_WRITE:
	case VIRTIO_PCIDEV_OP_MMIO_WRITE:
	case VIRTIO_PCIDEV_OP_MMIO_MEMSET:
		/* in PCI, writes are posted, so don't wait */
		posted = !out;
		WARN_ON(!posted);
		break;
	default:
		posted = false;
		break;
	}

	buf = get_cpu_var(um_pci_msg_bufs);
	if (buf)
		memcpy(buf, cmd, cmd_size);

	if (posted) {
		u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC);

		if (ncmd) {
			memcpy(ncmd, cmd, cmd_size);
			if (extra)
				memcpy(ncmd + cmd_size, extra, extra_size);
			cmd = (void *)ncmd;
			cmd_size += extra_size;
			extra = NULL;
			extra_size = 0;
		} else {
			/* try without allocating memory */
			posted = false;
			cmd = (void *)buf;
		}
	} else {
		cmd = (void *)buf;
	}

	sg_init_one(&out_sg, cmd, cmd_size);
	if (extra)
		sg_init_one(&extra_sg, extra, extra_size);
	if (out)
		sg_init_one(&in_sg, out, out_size);

	/* add to internal virtio queue */
	ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list,
				extra ? 2 : 1,
				out ? 1 : 0,
				posted ? cmd : HANDLE_NO_FREE(cmd),
				GFP_ATOMIC);
	if (ret) {
		if (posted)
			kfree(cmd);
		goto out;
	}

	if (posted) {
		virtqueue_kick(dev->cmd_vq);
		ret = 0;
		goto out;
	}

	/* kick and poll for getting a response on the queue */
	set_bit(UM_PCI_STAT_WAITING, &dev->status);
	virtqueue_kick(dev->cmd_vq);

	while (1) {
		void *completed = virtqueue_get_buf(dev->cmd_vq, &len);

		if (completed == HANDLE_NO_FREE(cmd))
			break;

		if (completed && !HANDLE_IS_NO_FREE(completed))
			kfree(completed);

		if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) ||
			      ++delay_count > um_pci_max_delay_us,
			      "um virt-pci delay: %d", delay_count)) {
			ret = -EIO;
			break;
		}
		udelay(1);
	}
	clear_bit(UM_PCI_STAT_WAITING, &dev->status);

out:
	put_cpu_var(um_pci_msg_bufs);
	return ret;
}

static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset,
					  int size)
{
	struct um_pci_device_reg *reg = priv;
	struct um_pci_device *dev = reg->dev;
	struct virtio_pcidev_msg hdr = {
		.op = VIRTIO_PCIDEV_OP_CFG_READ,
		.size = size,
		.addr = offset,
	};
	/* buf->data is maximum size - we may only use parts of it */
	struct um_pci_message_buffer *buf;
	u8 *data;
	unsigned long ret = ULONG_MAX;
	size_t bytes = sizeof(buf->data);

	if (!dev)
		return ULONG_MAX;

	buf = get_cpu_var(um_pci_msg_bufs);
	data = buf->data;

	if (buf)
		memset(data, 0xff, bytes);

	switch (size) {
	case 1:
	case 2:
	case 4:
#ifdef CONFIG_64BIT
	case 8:
#endif
		break;
	default:
		WARN(1, "invalid config space read size %d\n", size);
		goto out;
	}

	if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes))
		goto out;

	switch (size) {
	case 1:
		ret = data[0];
		break;
	case 2:
		ret = le16_to_cpup((void *)data);
		break;
	case 4:
		ret = le32_to_cpup((void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		ret = le64_to_cpup((void *)data);
		break;
#endif
	default:
		break;
	}

out:
	put_cpu_var(um_pci_msg_bufs);
	return ret;
}

static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size,
				  unsigned long val)
{
	struct um_pci_device_reg *reg = priv;
	struct um_pci_device *dev = reg->dev;
	struct {
		struct virtio_pcidev_msg hdr;
		/* maximum size - we may only use parts of it */
		u8 data[8];
	} msg = {
		.hdr = {
			.op = VIRTIO_PCIDEV_OP_CFG_WRITE,
			.size = size,
			.addr = offset,
		},
	};

	if (!dev)
		return;

	switch (size) {
	case 1:
		msg.data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)msg.data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)msg.data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)msg.data);
		break;
#endif
	default:
		WARN(1, "invalid config space write size %d\n", size);
		return;
	}

	WARN_ON(um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0));
}

static const struct logic_iomem_ops um_pci_device_cfgspace_ops = {
	.read = um_pci_cfgspace_read,
	.write = um_pci_cfgspace_write,
};

static void um_pci_bar_copy_from(void *priv, void *buffer,
				 unsigned int offset, int size)
{
	u8 *resptr = priv;
	struct um_pci_device *dev = container_of(resptr - *resptr,
						 struct um_pci_device,
						 resptr[0]);
	struct virtio_pcidev_msg hdr = {
		.op = VIRTIO_PCIDEV_OP_MMIO_READ,
		.bar = *resptr,
		.size = size,
		.addr = offset,
	};

	memset(buffer, 0xff, size);

	um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size);
}

static unsigned long um_pci_bar_read(void *priv, unsigned int offset,
				     int size)
{
	/* buf->data is maximum size - we may only use parts of it */
	struct um_pci_message_buffer *buf;
	u8 *data;
	unsigned long ret = ULONG_MAX;

	buf = get_cpu_var(um_pci_msg_bufs);
	data = buf->data;

	switch (size) {
	case 1:
	case 2:
	case 4:
#ifdef CONFIG_64BIT
	case 8:
#endif
		break;
	default:
		WARN(1, "invalid config space read size %d\n", size);
		goto out;
	}

	um_pci_bar_copy_from(priv, data, offset, size);

	switch (size) {
	case 1:
		ret = data[0];
		break;
	case 2:
		ret = le16_to_cpup((void *)data);
		break;
	case 4:
		ret = le32_to_cpup((void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		ret = le64_to_cpup((void *)data);
		break;
#endif
	default:
		break;
	}

out:
	put_cpu_var(um_pci_msg_bufs);
	return ret;
}

static void um_pci_bar_copy_to(void *priv, unsigned int offset,
			       const void *buffer, int size)
{
	u8 *resptr = priv;
	struct um_pci_device *dev = container_of(resptr - *resptr,
						 struct um_pci_device,
						 resptr[0]);
	struct virtio_pcidev_msg hdr = {
		.op = VIRTIO_PCIDEV_OP_MMIO_WRITE,
		.bar = *resptr,
		.size = size,
		.addr = offset,
	};

	um_pci_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0);
}

static void um_pci_bar_write(void *priv, unsigned int offset, int size,
			     unsigned long val)
{
	/* maximum size - we may only use parts of it */
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	default:
		WARN(1, "invalid config space write size %d\n", size);
		return;
	}

	um_pci_bar_copy_to(priv, offset, data, size);
}

static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size)
{
	u8 *resptr = priv;
	struct um_pci_device *dev = container_of(resptr - *resptr,
						 struct um_pci_device,
						 resptr[0]);
	struct {
		struct virtio_pcidev_msg hdr;
		u8 data;
	} msg = {
		.hdr = {
			.op = VIRTIO_PCIDEV_OP_CFG_WRITE,
			.bar = *resptr,
			.size = size,
			.addr = offset,
		},
		.data = value,
	};

	um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0);
}

static const struct logic_iomem_ops um_pci_device_bar_ops = {
	.read = um_pci_bar_read,
	.write = um_pci_bar_write,
	.set = um_pci_bar_set,
	.copy_from = um_pci_bar_copy_from,
	.copy_to = um_pci_bar_copy_to,
};

static void __iomem *um_pci_map_bus(struct pci_bus *bus, unsigned int devfn,
				    int where)
{
	struct um_pci_device_reg *dev;
	unsigned int busn = bus->number;

	if (busn > 0)
		return NULL;

	/* not allowing functions for now ... */
	if (devfn % 8)
		return NULL;

	if (devfn / 8 >= ARRAY_SIZE(um_pci_devices))
		return NULL;

	dev = &um_pci_devices[devfn / 8];
	if (!dev)
		return NULL;

	return (void __iomem *)((unsigned long)dev->iomem + where);
}

static struct pci_ops um_pci_ops = {
	.map_bus = um_pci_map_bus,
	.read = pci_generic_config_read,
	.write = pci_generic_config_write,
};

static void um_pci_rescan(void)
{
	pci_lock_rescan_remove();
	pci_rescan_bus(bridge->bus);
	pci_unlock_rescan_remove();
}

static void um_pci_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick)
{
	struct scatterlist sg[1];

	sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE);
	if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC))
		kfree(buf);
	else if (kick)
		virtqueue_kick(vq);
}

static void um_pci_handle_irq_message(struct virtqueue *vq,
				      struct virtio_pcidev_msg *msg)
{
	struct virtio_device *vdev = vq->vdev;
	struct um_pci_device *dev = vdev->priv;

	if (!dev->irq)
		return;

	/* we should properly chain interrupts, but on ARCH=um we don't care */

	switch (msg->op) {
	case VIRTIO_PCIDEV_OP_INT:
		generic_handle_irq(dev->irq);
		break;
	case VIRTIO_PCIDEV_OP_MSI:
		/* our MSI message is just the interrupt number */
		if (msg->size == sizeof(u32))
			generic_handle_irq(le32_to_cpup((void *)msg->data));
		else
			generic_handle_irq(le16_to_cpup((void *)msg->data));
		break;
	case VIRTIO_PCIDEV_OP_PME:
		/* nothing to do - we already woke up due to the message */
		break;
	default:
		dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op);
		break;
	}
}

static void um_pci_cmd_vq_cb(struct virtqueue *vq)
{
	struct virtio_device *vdev = vq->vdev;
	struct um_pci_device *dev = vdev->priv;
	void *cmd;
	int len;

	if (test_bit(UM_PCI_STAT_WAITING, &dev->status))
		return;

	while ((cmd = virtqueue_get_buf(vq, &len))) {
		if (WARN_ON(HANDLE_IS_NO_FREE(cmd)))
			continue;
		kfree(cmd);
	}
}

static void um_pci_irq_vq_cb(struct virtqueue *vq)
{
	struct virtio_pcidev_msg *msg;
	int len;

	while ((msg = virtqueue_get_buf(vq, &len))) {
		if (len >= sizeof(*msg))
			um_pci_handle_irq_message(vq, msg);

		/* recycle the message buffer */
		um_pci_irq_vq_addbuf(vq, msg, true);
	}
}

#ifdef CONFIG_OF
/* Copied from arch/x86/kernel/devicetree.c */
struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
{
	struct device_node *np;

	for_each_node_by_type(np, "pci") {
		const void *prop;
		unsigned int bus_min;

		prop = of_get_property(np, "bus-range", NULL);
		if (!prop)
			continue;
		bus_min = be32_to_cpup(prop);
		if (bus->number == bus_min)
			return np;
	}
	return NULL;
}
#endif

static int um_pci_init_vqs(struct um_pci_device *dev)
{
	struct virtqueue *vqs[2];
	static const char *const names[2] = { "cmd", "irq" };
	vq_callback_t *cbs[2] = { um_pci_cmd_vq_cb, um_pci_irq_vq_cb };
	int err, i;

	err = virtio_find_vqs(dev->vdev, 2, vqs, cbs, names, NULL);
	if (err)
		return err;

	dev->cmd_vq = vqs[0];
	dev->irq_vq = vqs[1];

	virtio_device_ready(dev->vdev);

	for (i = 0; i < NUM_IRQ_MSGS; i++) {
		void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL);

		if (msg)
			um_pci_irq_vq_addbuf(dev->irq_vq, msg, false);
	}

	virtqueue_kick(dev->irq_vq);

	return 0;
}

static void __um_pci_virtio_platform_remove(struct virtio_device *vdev,
					    struct um_pci_device *dev)
{
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);

	mutex_lock(&um_pci_mtx);
	um_pci_platform_device = NULL;
	mutex_unlock(&um_pci_mtx);

	kfree(dev);
}

static int um_pci_virtio_platform_probe(struct virtio_device *vdev,
					struct um_pci_device *dev)
{
	int ret;

	dev->platform = true;

	mutex_lock(&um_pci_mtx);

	if (um_pci_platform_device) {
		mutex_unlock(&um_pci_mtx);
		ret = -EBUSY;
		goto out_free;
	}

	ret = um_pci_init_vqs(dev);
	if (ret) {
		mutex_unlock(&um_pci_mtx);
		goto out_free;
	}

	um_pci_platform_device = dev;

	mutex_unlock(&um_pci_mtx);

	ret = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev);
	if (ret)
		__um_pci_virtio_platform_remove(vdev, dev);

	return ret;

out_free:
	kfree(dev);
	return ret;
}

static int um_pci_virtio_probe(struct virtio_device *vdev)
{
	struct um_pci_device *dev;
	int i, free = -1;
	int err = -ENOSPC;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->vdev = vdev;
	vdev->priv = dev;

	if (of_device_is_compatible(vdev->dev.of_node, "simple-bus"))
		return um_pci_virtio_platform_probe(vdev, dev);

	mutex_lock(&um_pci_mtx);
	for (i = 0; i < MAX_DEVICES; i++) {
		if (um_pci_devices[i].dev)
			continue;
		free = i;
		break;
	}

	if (free < 0)
		goto error;

	err = um_pci_init_vqs(dev);
	if (err)
		goto error;

	dev->irq = irq_alloc_desc(numa_node_id());
	if (dev->irq < 0) {
		err = dev->irq;
		goto err_reset;
	}
	um_pci_devices[free].dev = dev;
	vdev->priv = dev;

	mutex_unlock(&um_pci_mtx);

	device_set_wakeup_enable(&vdev->dev, true);

	/*
	 * In order to do suspend-resume properly, don't allow VQs
	 * to be suspended.
	 */
	virtio_uml_set_no_vq_suspend(vdev, true);

	um_pci_rescan();
	return 0;
err_reset:
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);
error:
	mutex_unlock(&um_pci_mtx);
	kfree(dev);
	return err;
}

static void um_pci_virtio_remove(struct virtio_device *vdev)
{
	struct um_pci_device *dev = vdev->priv;
	int i;

	if (dev->platform) {
		of_platform_depopulate(&vdev->dev);
		__um_pci_virtio_platform_remove(vdev, dev);
		return;
	}

	device_set_wakeup_enable(&vdev->dev, false);

	mutex_lock(&um_pci_mtx);
	for (i = 0; i < MAX_DEVICES; i++) {
		if (um_pci_devices[i].dev != dev)
			continue;

		um_pci_devices[i].dev = NULL;
		irq_free_desc(dev->irq);

		break;
	}
	mutex_unlock(&um_pci_mtx);

	if (i < MAX_DEVICES) {
		struct pci_dev *pci_dev;

		pci_dev = pci_get_slot(bridge->bus, i);
		if (pci_dev)
			pci_stop_and_remove_bus_device_locked(pci_dev);
	}

	/* Stop all virtqueues */
	virtio_reset_device(vdev);
	dev->cmd_vq = NULL;
	dev->irq_vq = NULL;
	vdev->config->del_vqs(vdev);

	kfree(dev);
}

static struct virtio_device_id id_table[] = {
	{ CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
MODULE_DEVICE_TABLE(virtio, id_table);

static struct virtio_driver um_pci_virtio_driver = {
	.driver.name = "virtio-pci",
	.id_table = id_table,
	.probe = um_pci_virtio_probe,
	.remove = um_pci_virtio_remove,
};

static struct resource virt_cfgspace_resource = {
	.name = "PCI config space",
	.start = 0xf0000000 - MAX_DEVICES * CFG_SPACE_SIZE,
	.end = 0xf0000000 - 1,
	.flags = IORESOURCE_MEM,
};

static long um_pci_map_cfgspace(unsigned long offset, size_t size,
				const struct logic_iomem_ops **ops,
				void **priv)
{
	if (WARN_ON(size > CFG_SPACE_SIZE || offset % CFG_SPACE_SIZE))
		return -EINVAL;

	if (offset / CFG_SPACE_SIZE < MAX_DEVICES) {
		*ops = &um_pci_device_cfgspace_ops;
		*priv = &um_pci_devices[offset / CFG_SPACE_SIZE];
		return 0;
	}

	WARN(1, "cannot map offset 0x%lx/0x%zx\n", offset, size);
	return -ENOENT;
}

static const struct logic_iomem_region_ops um_pci_cfgspace_ops = {
	.map = um_pci_map_cfgspace,
};

static struct resource virt_iomem_resource = {
	.name = "PCI iomem",
	.start = 0xf0000000,
	.end = 0xffffffff,
	.flags = IORESOURCE_MEM,
};

struct um_pci_map_iomem_data {
	unsigned long offset;
	size_t size;
	const struct logic_iomem_ops **ops;
	void **priv;
	long ret;
};

static int um_pci_map_iomem_walk(struct pci_dev *pdev, void *_data)
{
	struct um_pci_map_iomem_data *data = _data;
	struct um_pci_device_reg *reg = &um_pci_devices[pdev->devfn / 8];
	struct um_pci_device *dev;
	int i;

	if (!reg->dev)
		return 0;

	for (i = 0; i < ARRAY_SIZE(dev->resptr); i++) {
		struct resource *r = &pdev->resource[i];

		if ((r->flags & IORESOURCE_TYPE_BITS) != IORESOURCE_MEM)
			continue;

		/*
		 * must be the whole or part of the resource,
		 * not allowed to only overlap
		 */
		if (data->offset < r->start || data->offset > r->end)
			continue;
		if (data->offset + data->size - 1 > r->end)
			continue;

		dev = reg->dev;
		*data->ops = &um_pci_device_bar_ops;
		dev->resptr[i] = i;
		*data->priv = &dev->resptr[i];
		data->ret = data->offset - r->start;

		/* no need to continue */
		return 1;
	}

	return 0;
}

static long um_pci_map_iomem(unsigned long offset, size_t size,
			     const struct logic_iomem_ops **ops,
			     void **priv)
{
	struct um_pci_map_iomem_data data = {
		/* we want the full address here */
		.offset = offset + virt_iomem_resource.start,
		.size = size,
		.ops = ops,
		.priv = priv,
		.ret = -ENOENT,
	};

	pci_walk_bus(bridge->bus, um_pci_map_iomem_walk, &data);
	return data.ret;
}

static const struct logic_iomem_region_ops um_pci_iomem_ops = {
	.map = um_pci_map_iomem,
};

static void um_pci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	/*
	 * This is a very low address and not actually valid 'physical' memory
	 * in UML, so we can simply map MSI(-X) vectors to there, it cannot be
	 * legitimately written to by the device in any other way.
	 * We use the (virtual) IRQ number here as the message to simplify the
	 * code that receives the message, where for now we simply trust the
	 * device to send the correct message.
	 */
	msg->address_hi = 0;
	msg->address_lo = 0xa0000;
	msg->data = data->irq;
}

static struct irq_chip um_pci_msi_bottom_irq_chip = {
	.name = "UM virtio MSI",
	.irq_compose_msi_msg = um_pci_compose_msi_msg,
};

static int um_pci_inner_domain_alloc(struct irq_domain *domain,
				     unsigned int virq, unsigned int nr_irqs,
				     void *args)
{
	unsigned long bit;

	WARN_ON(nr_irqs != 1);

	mutex_lock(&um_pci_mtx);
	bit = find_first_zero_bit(um_pci_msi_used, MAX_MSI_VECTORS);
	if (bit >= MAX_MSI_VECTORS) {
		mutex_unlock(&um_pci_mtx);
		return -ENOSPC;
	}

	set_bit(bit, um_pci_msi_used);
	mutex_unlock(&um_pci_mtx);

	irq_domain_set_info(domain, virq, bit, &um_pci_msi_bottom_irq_chip,
			    domain->host_data, handle_simple_irq,
			    NULL, NULL);

	return 0;
}

static void um_pci_inner_domain_free(struct irq_domain *domain,
				     unsigned int virq, unsigned int nr_irqs)
{
	struct irq_data *d = irq_domain_get_irq_data(domain, virq);

	mutex_lock(&um_pci_mtx);

	if (!test_bit(d->hwirq, um_pci_msi_used))
		pr_err("trying to free unused MSI#%lu\n", d->hwirq);
	else
		__clear_bit(d->hwirq, um_pci_msi_used);

	mutex_unlock(&um_pci_mtx);
}

static const struct irq_domain_ops um_pci_inner_domain_ops = {
	.alloc = um_pci_inner_domain_alloc,
	.free = um_pci_inner_domain_free,
};

static struct irq_chip um_pci_msi_irq_chip = {
	.name = "UM virtio PCIe MSI",
	.irq_mask = pci_msi_mask_irq,
	.irq_unmask = pci_msi_unmask_irq,
};

static struct msi_domain_info um_pci_msi_domain_info = {
	.flags	= MSI_FLAG_USE_DEF_DOM_OPS |
		  MSI_FLAG_USE_DEF_CHIP_OPS |
		  MSI_FLAG_PCI_MSIX,
	.chip	= &um_pci_msi_irq_chip,
};

static struct resource busn_resource = {
	.name	= "PCI busn",
	.start	= 0,
	.end	= 0,
	.flags	= IORESOURCE_BUS,
};

static int um_pci_map_irq(const struct pci_dev *pdev, u8 slot, u8 pin)
{
	struct um_pci_device_reg *reg = &um_pci_devices[pdev->devfn / 8];

	if (WARN_ON(!reg->dev))
		return -EINVAL;

	/* Yes, we map all pins to the same IRQ ... doesn't matter for now. */
	return reg->dev->irq;
}

void *pci_root_bus_fwnode(struct pci_bus *bus)
{
	return um_pci_fwnode;
}

static long um_pci_map_platform(unsigned long offset, size_t size,
				const struct logic_iomem_ops **ops,
				void **priv)
{
	if (!um_pci_platform_device)
		return -ENOENT;

	*ops = &um_pci_device_bar_ops;
	*priv = &um_pci_platform_device->resptr[0];

	return offset;
}

static const struct logic_iomem_region_ops um_pci_platform_ops = {
	.map = um_pci_map_platform,
};

static struct resource virt_platform_resource = {
	.name = "platform",
	.start = 0x10000000,
	.end = 0x1fffffff,
	.flags = IORESOURCE_MEM,
};

static int __init um_pci_init(void)
{
	int err, i;

	WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource,
				       &um_pci_cfgspace_ops));
	WARN_ON(logic_iomem_add_region(&virt_iomem_resource,
				       &um_pci_iomem_ops));
	WARN_ON(logic_iomem_add_region(&virt_platform_resource,
				       &um_pci_platform_ops));

	if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0,
		 "No virtio device ID configured for PCI - no PCI support\n"))
		return 0;

	um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer);
	if (!um_pci_msg_bufs)
		return -ENOMEM;

	bridge = pci_alloc_host_bridge(0);
	if (!bridge) {
		err = -ENOMEM;
		goto free;
	}

	um_pci_fwnode = irq_domain_alloc_named_fwnode("um-pci");
	if (!um_pci_fwnode) {
		err = -ENOMEM;
		goto free;
	}

	um_pci_inner_domain = __irq_domain_add(um_pci_fwnode, MAX_MSI_VECTORS,
					       MAX_MSI_VECTORS, 0,
					       &um_pci_inner_domain_ops, NULL);
	if (!um_pci_inner_domain) {
		err = -ENOMEM;
		goto free;
	}

	um_pci_msi_domain = pci_msi_create_irq_domain(um_pci_fwnode,
						      &um_pci_msi_domain_info,
						      um_pci_inner_domain);
	if (!um_pci_msi_domain) {
		err = -ENOMEM;
		goto free;
	}

	pci_add_resource(&bridge->windows, &virt_iomem_resource);
	pci_add_resource(&bridge->windows, &busn_resource);
	bridge->ops = &um_pci_ops;
	bridge->map_irq = um_pci_map_irq;

	for (i = 0; i < MAX_DEVICES; i++) {
		resource_size_t start;

		start = virt_cfgspace_resource.start + i * CFG_SPACE_SIZE;
		um_pci_devices[i].iomem = ioremap(start, CFG_SPACE_SIZE);
		if (WARN(!um_pci_devices[i].iomem, "failed to map %d\n", i)) {
			err = -ENOMEM;
			goto free;
		}
	}

	err = pci_host_probe(bridge);
	if (err)
		goto free;

	err = register_virtio_driver(&um_pci_virtio_driver);
	if (err)
		goto free;
	return 0;
free:
	if (um_pci_inner_domain)
		irq_domain_remove(um_pci_inner_domain);
	if (um_pci_fwnode)
		irq_domain_free_fwnode(um_pci_fwnode);
	if (bridge) {
		pci_free_resource_list(&bridge->windows);
		pci_free_host_bridge(bridge);
	}
	free_percpu(um_pci_msg_bufs);
	return err;
}
module_init(um_pci_init);

static void __exit um_pci_exit(void)
{
	unregister_virtio_driver(&um_pci_virtio_driver);
	irq_domain_remove(um_pci_msi_domain);
	irq_domain_remove(um_pci_inner_domain);
	pci_free_resource_list(&bridge->windows);
	pci_free_host_bridge(bridge);
	free_percpu(um_pci_msg_bufs);
}
module_exit(um_pci_exit);