Contributors: 12
Author Tokens Token Proportion Commits Commit Proportion
Matias Björling 5178 70.99% 30 43.48%
Javier González 1716 23.53% 16 23.19%
Hannes Reinecke 235 3.22% 1 1.45%
Christoph Hellwig 108 1.48% 12 17.39%
Igor Konopko 22 0.30% 3 4.35%
Simon A. F. Lund 16 0.22% 1 1.45%
Geert Uytterhoeven 5 0.07% 1 1.45%
Sagi Grimberg 5 0.07% 1 1.45%
Wenwei Tao 4 0.05% 1 1.45%
Chaitanya Kulkarni 3 0.04% 1 1.45%
Arnd Bergmann 1 0.01% 1 1.45%
Johannes Thumshirn 1 0.01% 1 1.45%
Total 7294 69


// SPDX-License-Identifier: GPL-2.0
/*
 * nvme-lightnvm.c - LightNVM NVMe device
 *
 * Copyright (C) 2014-2015 IT University of Copenhagen
 * Initial release: Matias Bjorling <mb@lightnvm.io>
 */

#include "nvme.h"

#include <linux/nvme.h>
#include <linux/bitops.h>
#include <linux/lightnvm.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
#include <uapi/linux/lightnvm.h>

enum nvme_nvm_admin_opcode {
	nvme_nvm_admin_identity		= 0xe2,
	nvme_nvm_admin_get_bb_tbl	= 0xf2,
	nvme_nvm_admin_set_bb_tbl	= 0xf1,
};

enum nvme_nvm_log_page {
	NVME_NVM_LOG_REPORT_CHUNK	= 0xca,
};

struct nvme_nvm_ph_rw {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd2;
	__le64			metadata;
	__le64			prp1;
	__le64			prp2;
	__le64			spba;
	__le16			length;
	__le16			control;
	__le32			dsmgmt;
	__le64			resv;
};

struct nvme_nvm_erase_blk {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd[2];
	__le64			prp1;
	__le64			prp2;
	__le64			spba;
	__le16			length;
	__le16			control;
	__le32			dsmgmt;
	__le64			resv;
};

struct nvme_nvm_identity {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd[2];
	__le64			prp1;
	__le64			prp2;
	__u32			rsvd11[6];
};

struct nvme_nvm_getbbtbl {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd[2];
	__le64			prp1;
	__le64			prp2;
	__le64			spba;
	__u32			rsvd4[4];
};

struct nvme_nvm_setbbtbl {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__le64			rsvd[2];
	__le64			prp1;
	__le64			prp2;
	__le64			spba;
	__le16			nlb;
	__u8			value;
	__u8			rsvd3;
	__u32			rsvd4[3];
};

struct nvme_nvm_command {
	union {
		struct nvme_common_command common;
		struct nvme_nvm_ph_rw ph_rw;
		struct nvme_nvm_erase_blk erase;
		struct nvme_nvm_identity identity;
		struct nvme_nvm_getbbtbl get_bb;
		struct nvme_nvm_setbbtbl set_bb;
	};
};

struct nvme_nvm_id12_grp {
	__u8			mtype;
	__u8			fmtype;
	__le16			res16;
	__u8			num_ch;
	__u8			num_lun;
	__u8			num_pln;
	__u8			rsvd1;
	__le16			num_chk;
	__le16			num_pg;
	__le16			fpg_sz;
	__le16			csecs;
	__le16			sos;
	__le16			rsvd2;
	__le32			trdt;
	__le32			trdm;
	__le32			tprt;
	__le32			tprm;
	__le32			tbet;
	__le32			tbem;
	__le32			mpos;
	__le32			mccap;
	__le16			cpar;
	__u8			reserved[906];
} __packed;

struct nvme_nvm_id12_addrf {
	__u8			ch_offset;
	__u8			ch_len;
	__u8			lun_offset;
	__u8			lun_len;
	__u8			pln_offset;
	__u8			pln_len;
	__u8			blk_offset;
	__u8			blk_len;
	__u8			pg_offset;
	__u8			pg_len;
	__u8			sec_offset;
	__u8			sec_len;
	__u8			res[4];
} __packed;

struct nvme_nvm_id12 {
	__u8			ver_id;
	__u8			vmnt;
	__u8			cgrps;
	__u8			res;
	__le32			cap;
	__le32			dom;
	struct nvme_nvm_id12_addrf ppaf;
	__u8			resv[228];
	struct nvme_nvm_id12_grp grp;
	__u8			resv2[2880];
} __packed;

struct nvme_nvm_bb_tbl {
	__u8	tblid[4];
	__le16	verid;
	__le16	revid;
	__le32	rvsd1;
	__le32	tblks;
	__le32	tfact;
	__le32	tgrown;
	__le32	tdresv;
	__le32	thresv;
	__le32	rsvd2[8];
	__u8	blk[0];
};

struct nvme_nvm_id20_addrf {
	__u8			grp_len;
	__u8			pu_len;
	__u8			chk_len;
	__u8			lba_len;
	__u8			resv[4];
};

struct nvme_nvm_id20 {
	__u8			mjr;
	__u8			mnr;
	__u8			resv[6];

	struct nvme_nvm_id20_addrf lbaf;

	__le32			mccap;
	__u8			resv2[12];

	__u8			wit;
	__u8			resv3[31];

	/* Geometry */
	__le16			num_grp;
	__le16			num_pu;
	__le32			num_chk;
	__le32			clba;
	__u8			resv4[52];

	/* Write data requirements */
	__le32			ws_min;
	__le32			ws_opt;
	__le32			mw_cunits;
	__le32			maxoc;
	__le32			maxocpu;
	__u8			resv5[44];

	/* Performance related metrics */
	__le32			trdt;
	__le32			trdm;
	__le32			twrt;
	__le32			twrm;
	__le32			tcrst;
	__le32			tcrsm;
	__u8			resv6[40];

	/* Reserved area */
	__u8			resv7[2816];

	/* Vendor specific */
	__u8			vs[1024];
};

struct nvme_nvm_chk_meta {
	__u8	state;
	__u8	type;
	__u8	wi;
	__u8	rsvd[5];
	__le64	slba;
	__le64	cnlb;
	__le64	wp;
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_nvm_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32);
	BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) !=
						sizeof(struct nvm_chk_meta));
}

static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst,
				 struct nvme_nvm_id12_addrf *src)
{
	dst->ch_len = src->ch_len;
	dst->lun_len = src->lun_len;
	dst->blk_len = src->blk_len;
	dst->pg_len = src->pg_len;
	dst->pln_len = src->pln_len;
	dst->sec_len = src->sec_len;

	dst->ch_offset = src->ch_offset;
	dst->lun_offset = src->lun_offset;
	dst->blk_offset = src->blk_offset;
	dst->pg_offset = src->pg_offset;
	dst->pln_offset = src->pln_offset;
	dst->sec_offset = src->sec_offset;

	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;
	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
}

static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id,
			     struct nvm_geo *geo)
{
	struct nvme_nvm_id12_grp *src;
	int sec_per_pg, sec_per_pl, pg_per_blk;

	if (id->cgrps != 1)
		return -EINVAL;

	src = &id->grp;

	if (src->mtype != 0) {
		pr_err("nvm: memory type not supported\n");
		return -EINVAL;
	}

	/* 1.2 spec. only reports a single version id - unfold */
	geo->major_ver_id = id->ver_id;
	geo->minor_ver_id = 2;

	/* Set compacted version for upper layers */
	geo->version = NVM_OCSSD_SPEC_12;

	geo->num_ch = src->num_ch;
	geo->num_lun = src->num_lun;
	geo->all_luns = geo->num_ch * geo->num_lun;

	geo->num_chk = le16_to_cpu(src->num_chk);

	geo->csecs = le16_to_cpu(src->csecs);
	geo->sos = le16_to_cpu(src->sos);

	pg_per_blk = le16_to_cpu(src->num_pg);
	sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs;
	sec_per_pl = sec_per_pg * src->num_pln;
	geo->clba = sec_per_pl * pg_per_blk;

	geo->all_chunks = geo->all_luns * geo->num_chk;
	geo->total_secs = geo->clba * geo->all_chunks;

	geo->ws_min = sec_per_pg;
	geo->ws_opt = sec_per_pg;
	geo->mw_cunits = geo->ws_opt << 3;	/* default to MLC safe values */

	/* Do not impose values for maximum number of open blocks as it is
	 * unspecified in 1.2. Users of 1.2 must be aware of this and eventually
	 * specify these values through a quirk if restrictions apply.
	 */
	geo->maxoc = geo->all_luns * geo->num_chk;
	geo->maxocpu = geo->num_chk;

	geo->mccap = le32_to_cpu(src->mccap);

	geo->trdt = le32_to_cpu(src->trdt);
	geo->trdm = le32_to_cpu(src->trdm);
	geo->tprt = le32_to_cpu(src->tprt);
	geo->tprm = le32_to_cpu(src->tprm);
	geo->tbet = le32_to_cpu(src->tbet);
	geo->tbem = le32_to_cpu(src->tbem);

	/* 1.2 compatibility */
	geo->vmnt = id->vmnt;
	geo->cap = le32_to_cpu(id->cap);
	geo->dom = le32_to_cpu(id->dom);

	geo->mtype = src->mtype;
	geo->fmtype = src->fmtype;

	geo->cpar = le16_to_cpu(src->cpar);
	geo->mpos = le32_to_cpu(src->mpos);

	geo->pln_mode = NVM_PLANE_SINGLE;

	if (geo->mpos & 0x020202) {
		geo->pln_mode = NVM_PLANE_DOUBLE;
		geo->ws_opt <<= 1;
	} else if (geo->mpos & 0x040404) {
		geo->pln_mode = NVM_PLANE_QUAD;
		geo->ws_opt <<= 2;
	}

	geo->num_pln = src->num_pln;
	geo->num_pg = le16_to_cpu(src->num_pg);
	geo->fpg_sz = le16_to_cpu(src->fpg_sz);

	nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf);

	return 0;
}

static void nvme_nvm_set_addr_20(struct nvm_addrf *dst,
				 struct nvme_nvm_id20_addrf *src)
{
	dst->ch_len = src->grp_len;
	dst->lun_len = src->pu_len;
	dst->chk_len = src->chk_len;
	dst->sec_len = src->lba_len;

	dst->sec_offset = 0;
	dst->chk_offset = dst->sec_len;
	dst->lun_offset = dst->chk_offset + dst->chk_len;
	dst->ch_offset = dst->lun_offset + dst->lun_len;

	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset;
	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
}

static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id,
			     struct nvm_geo *geo)
{
	geo->major_ver_id = id->mjr;
	geo->minor_ver_id = id->mnr;

	/* Set compacted version for upper layers */
	geo->version = NVM_OCSSD_SPEC_20;

	geo->num_ch = le16_to_cpu(id->num_grp);
	geo->num_lun = le16_to_cpu(id->num_pu);
	geo->all_luns = geo->num_ch * geo->num_lun;

	geo->num_chk = le32_to_cpu(id->num_chk);
	geo->clba = le32_to_cpu(id->clba);

	geo->all_chunks = geo->all_luns * geo->num_chk;
	geo->total_secs = geo->clba * geo->all_chunks;

	geo->ws_min = le32_to_cpu(id->ws_min);
	geo->ws_opt = le32_to_cpu(id->ws_opt);
	geo->mw_cunits = le32_to_cpu(id->mw_cunits);
	geo->maxoc = le32_to_cpu(id->maxoc);
	geo->maxocpu = le32_to_cpu(id->maxocpu);

	geo->trdt = le32_to_cpu(id->trdt);
	geo->trdm = le32_to_cpu(id->trdm);
	geo->tprt = le32_to_cpu(id->twrt);
	geo->tprm = le32_to_cpu(id->twrm);
	geo->tbet = le32_to_cpu(id->tcrst);
	geo->tbem = le32_to_cpu(id->tcrsm);

	nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf);

	return 0;
}

static int nvme_nvm_identity(struct nvm_dev *nvmdev)
{
	struct nvme_ns *ns = nvmdev->q->queuedata;
	struct nvme_nvm_id12 *id;
	struct nvme_nvm_command c = {};
	int ret;

	c.identity.opcode = nvme_nvm_admin_identity;
	c.identity.nsid = cpu_to_le32(ns->head->ns_id);

	id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL);
	if (!id)
		return -ENOMEM;

	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
				id, sizeof(struct nvme_nvm_id12));
	if (ret) {
		ret = -EIO;
		goto out;
	}

	/*
	 * The 1.2 and 2.0 specifications share the first byte in their geometry
	 * command to make it possible to know what version a device implements.
	 */
	switch (id->ver_id) {
	case 1:
		ret = nvme_nvm_setup_12(id, &nvmdev->geo);
		break;
	case 2:
		ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id,
							&nvmdev->geo);
		break;
	default:
		dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n",
							id->ver_id);
		ret = -EINVAL;
	}

out:
	kfree(id);
	return ret;
}

static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
								u8 *blks)
{
	struct request_queue *q = nvmdev->q;
	struct nvm_geo *geo = &nvmdev->geo;
	struct nvme_ns *ns = q->queuedata;
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct nvme_nvm_command c = {};
	struct nvme_nvm_bb_tbl *bb_tbl;
	int nr_blks = geo->num_chk * geo->num_pln;
	int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
	int ret = 0;

	c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl;
	c.get_bb.nsid = cpu_to_le32(ns->head->ns_id);
	c.get_bb.spba = cpu_to_le64(ppa.ppa);

	bb_tbl = kzalloc(tblsz, GFP_KERNEL);
	if (!bb_tbl)
		return -ENOMEM;

	ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c,
								bb_tbl, tblsz);
	if (ret) {
		dev_err(ctrl->device, "get bad block table failed (%d)\n", ret);
		ret = -EIO;
		goto out;
	}

	if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' ||
		bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') {
		dev_err(ctrl->device, "bbt format mismatch\n");
		ret = -EINVAL;
		goto out;
	}

	if (le16_to_cpu(bb_tbl->verid) != 1) {
		ret = -EINVAL;
		dev_err(ctrl->device, "bbt version not supported\n");
		goto out;
	}

	if (le32_to_cpu(bb_tbl->tblks) != nr_blks) {
		ret = -EINVAL;
		dev_err(ctrl->device,
				"bbt unsuspected blocks returned (%u!=%u)",
				le32_to_cpu(bb_tbl->tblks), nr_blks);
		goto out;
	}

	memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln);
out:
	kfree(bb_tbl);
	return ret;
}

static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas,
							int nr_ppas, int type)
{
	struct nvme_ns *ns = nvmdev->q->queuedata;
	struct nvme_nvm_command c = {};
	int ret = 0;

	c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl;
	c.set_bb.nsid = cpu_to_le32(ns->head->ns_id);
	c.set_bb.spba = cpu_to_le64(ppas->ppa);
	c.set_bb.nlb = cpu_to_le16(nr_ppas - 1);
	c.set_bb.value = type;

	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
								NULL, 0);
	if (ret)
		dev_err(ns->ctrl->device, "set bad block table failed (%d)\n",
									ret);
	return ret;
}

/*
 * Expect the lba in device format
 */
static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
				 sector_t slba, int nchks,
				 struct nvm_chk_meta *meta)
{
	struct nvm_geo *geo = &ndev->geo;
	struct nvme_ns *ns = ndev->q->queuedata;
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct nvme_nvm_chk_meta *dev_meta, *dev_meta_off;
	struct ppa_addr ppa;
	size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
	size_t log_pos, offset, len;
	int i, max_len;
	int ret = 0;

	/*
	 * limit requests to maximum 256K to avoid issuing arbitrary large
	 * requests when the device does not specific a maximum transfer size.
	 */
	max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024);

	dev_meta = kmalloc(max_len, GFP_KERNEL);
	if (!dev_meta)
		return -ENOMEM;

	/* Normalize lba address space to obtain log offset */
	ppa.ppa = slba;
	ppa = dev_to_generic_addr(ndev, ppa);

	log_pos = ppa.m.chk;
	log_pos += ppa.m.pu * geo->num_chk;
	log_pos += ppa.m.grp * geo->num_lun * geo->num_chk;

	offset = log_pos * sizeof(struct nvme_nvm_chk_meta);

	while (left) {
		len = min_t(unsigned int, left, max_len);

		memset(dev_meta, 0, max_len);
		dev_meta_off = dev_meta;

		ret = nvme_get_log(ctrl, ns->head->ns_id,
				NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
				offset);
		if (ret) {
			dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
			break;
		}

		for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) {
			meta->state = dev_meta_off->state;
			meta->type = dev_meta_off->type;
			meta->wi = dev_meta_off->wi;
			meta->slba = le64_to_cpu(dev_meta_off->slba);
			meta->cnlb = le64_to_cpu(dev_meta_off->cnlb);
			meta->wp = le64_to_cpu(dev_meta_off->wp);

			meta++;
			dev_meta_off++;
		}

		offset += len;
		left -= len;
	}

	kfree(dev_meta);

	return ret;
}

static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
				    struct nvme_nvm_command *c)
{
	c->ph_rw.opcode = rqd->opcode;
	c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id);
	c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa);
	c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
	c->ph_rw.control = cpu_to_le16(rqd->flags);
	c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
}

static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
{
	struct nvm_rq *rqd = rq->end_io_data;

	rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
	rqd->error = nvme_req(rq)->status;
	nvm_end_io(rqd);

	kfree(nvme_req(rq)->cmd);
	blk_mq_free_request(rq);
}

static struct request *nvme_nvm_alloc_request(struct request_queue *q,
					      struct nvm_rq *rqd,
					      struct nvme_nvm_command *cmd)
{
	struct nvme_ns *ns = q->queuedata;
	struct request *rq;

	nvme_nvm_rqtocmd(rqd, ns, cmd);

	rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY);
	if (IS_ERR(rq))
		return rq;

	rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;

	if (rqd->bio)
		blk_rq_append_bio(rq, &rqd->bio);
	else
		rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);

	return rq;
}

static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
{
	struct request_queue *q = dev->q;
	struct nvme_nvm_command *cmd;
	struct request *rq;

	cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
	if (!cmd)
		return -ENOMEM;

	rq = nvme_nvm_alloc_request(q, rqd, cmd);
	if (IS_ERR(rq)) {
		kfree(cmd);
		return PTR_ERR(rq);
	}

	rq->end_io_data = rqd;

	blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);

	return 0;
}

static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
{
	struct request_queue *q = dev->q;
	struct request *rq;
	struct nvme_nvm_command cmd;
	int ret = 0;

	memset(&cmd, 0, sizeof(struct nvme_nvm_command));

	rq = nvme_nvm_alloc_request(q, rqd, &cmd);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* I/Os can fail and the error is signaled through rqd. Callers must
	 * handle the error accordingly.
	 */
	blk_execute_rq(q, NULL, rq, 0);
	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;

	rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
	rqd->error = nvme_req(rq)->status;

	blk_mq_free_request(rq);

	return ret;
}

static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name,
					int size)
{
	struct nvme_ns *ns = nvmdev->q->queuedata;

	return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0);
}

static void nvme_nvm_destroy_dma_pool(void *pool)
{
	struct dma_pool *dma_pool = pool;

	dma_pool_destroy(dma_pool);
}

static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
				    gfp_t mem_flags, dma_addr_t *dma_handler)
{
	return dma_pool_alloc(pool, mem_flags, dma_handler);
}

static void nvme_nvm_dev_dma_free(void *pool, void *addr,
							dma_addr_t dma_handler)
{
	dma_pool_free(pool, addr, dma_handler);
}

static struct nvm_dev_ops nvme_nvm_dev_ops = {
	.identity		= nvme_nvm_identity,

	.get_bb_tbl		= nvme_nvm_get_bb_tbl,
	.set_bb_tbl		= nvme_nvm_set_bb_tbl,

	.get_chk_meta		= nvme_nvm_get_chk_meta,

	.submit_io		= nvme_nvm_submit_io,
	.submit_io_sync		= nvme_nvm_submit_io_sync,

	.create_dma_pool	= nvme_nvm_create_dma_pool,
	.destroy_dma_pool	= nvme_nvm_destroy_dma_pool,
	.dev_dma_alloc		= nvme_nvm_dev_dma_alloc,
	.dev_dma_free		= nvme_nvm_dev_dma_free,
};

static int nvme_nvm_submit_user_cmd(struct request_queue *q,
				struct nvme_ns *ns,
				struct nvme_nvm_command *vcmd,
				void __user *ubuf, unsigned int bufflen,
				void __user *meta_buf, unsigned int meta_len,
				void __user *ppa_buf, unsigned int ppa_len,
				u32 *result, u64 *status, unsigned int timeout)
{
	bool write = nvme_is_write((struct nvme_command *)vcmd);
	struct nvm_dev *dev = ns->ndev;
	struct gendisk *disk = ns->disk;
	struct request *rq;
	struct bio *bio = NULL;
	__le64 *ppa_list = NULL;
	dma_addr_t ppa_dma;
	__le64 *metadata = NULL;
	dma_addr_t metadata_dma;
	DECLARE_COMPLETION_ONSTACK(wait);
	int ret = 0;

	rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0,
			NVME_QID_ANY);
	if (IS_ERR(rq)) {
		ret = -ENOMEM;
		goto err_cmd;
	}

	rq->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ppa_buf && ppa_len) {
		ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma);
		if (!ppa_list) {
			ret = -ENOMEM;
			goto err_rq;
		}
		if (copy_from_user(ppa_list, (void __user *)ppa_buf,
						sizeof(u64) * (ppa_len + 1))) {
			ret = -EFAULT;
			goto err_ppa;
		}
		vcmd->ph_rw.spba = cpu_to_le64(ppa_dma);
	} else {
		vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf);
	}

	if (ubuf && bufflen) {
		ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL);
		if (ret)
			goto err_ppa;
		bio = rq->bio;

		if (meta_buf && meta_len) {
			metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL,
								&metadata_dma);
			if (!metadata) {
				ret = -ENOMEM;
				goto err_map;
			}

			if (write) {
				if (copy_from_user(metadata,
						(void __user *)meta_buf,
						meta_len)) {
					ret = -EFAULT;
					goto err_meta;
				}
			}
			vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma);
		}

		bio->bi_disk = disk;
	}

	blk_execute_rq(q, NULL, rq, 0);

	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;
	else if (nvme_req(rq)->status & 0x7ff)
		ret = -EIO;
	if (result)
		*result = nvme_req(rq)->status & 0x7ff;
	if (status)
		*status = le64_to_cpu(nvme_req(rq)->result.u64);

	if (metadata && !ret && !write) {
		if (copy_to_user(meta_buf, (void *)metadata, meta_len))
			ret = -EFAULT;
	}
err_meta:
	if (meta_buf && meta_len)
		dma_pool_free(dev->dma_pool, metadata, metadata_dma);
err_map:
	if (bio)
		blk_rq_unmap_user(bio);
err_ppa:
	if (ppa_buf && ppa_len)
		dma_pool_free(dev->dma_pool, ppa_list, ppa_dma);
err_rq:
	blk_mq_free_request(rq);
err_cmd:
	return ret;
}

static int nvme_nvm_submit_vio(struct nvme_ns *ns,
					struct nvm_user_vio __user *uvio)
{
	struct nvm_user_vio vio;
	struct nvme_nvm_command c;
	unsigned int length;
	int ret;

	if (copy_from_user(&vio, uvio, sizeof(vio)))
		return -EFAULT;
	if (vio.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.ph_rw.opcode = vio.opcode;
	c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id);
	c.ph_rw.control = cpu_to_le16(vio.control);
	c.ph_rw.length = cpu_to_le16(vio.nppas);

	length = (vio.nppas + 1) << ns->lba_shift;

	ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c,
			(void __user *)(uintptr_t)vio.addr, length,
			(void __user *)(uintptr_t)vio.metadata,
							vio.metadata_len,
			(void __user *)(uintptr_t)vio.ppa_list, vio.nppas,
			&vio.result, &vio.status, 0);

	if (ret && copy_to_user(uvio, &vio, sizeof(vio)))
		return -EFAULT;

	return ret;
}

static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
					struct nvm_passthru_vio __user *uvcmd)
{
	struct nvm_passthru_vio vcmd;
	struct nvme_nvm_command c;
	struct request_queue *q;
	unsigned int timeout = 0;
	int ret;

	if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd)))
		return -EFAULT;
	if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN)))
		return -EACCES;
	if (vcmd.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = vcmd.opcode;
	c.common.nsid = cpu_to_le32(ns->head->ns_id);
	c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
	/* cdw11-12 */
	c.ph_rw.length = cpu_to_le16(vcmd.nppas);
	c.ph_rw.control  = cpu_to_le16(vcmd.control);
	c.common.cdw13 = cpu_to_le32(vcmd.cdw13);
	c.common.cdw14 = cpu_to_le32(vcmd.cdw14);
	c.common.cdw15 = cpu_to_le32(vcmd.cdw15);

	if (vcmd.timeout_ms)
		timeout = msecs_to_jiffies(vcmd.timeout_ms);

	q = admin ? ns->ctrl->admin_q : ns->queue;

	ret = nvme_nvm_submit_user_cmd(q, ns,
			(struct nvme_nvm_command *)&c,
			(void __user *)(uintptr_t)vcmd.addr, vcmd.data_len,
			(void __user *)(uintptr_t)vcmd.metadata,
							vcmd.metadata_len,
			(void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas,
			&vcmd.result, &vcmd.status, timeout);

	if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd)))
		return -EFAULT;

	return ret;
}

int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case NVME_NVM_IOCTL_ADMIN_VIO:
		return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg);
	case NVME_NVM_IOCTL_IO_VIO:
		return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg);
	case NVME_NVM_IOCTL_SUBMIT_VIO:
		return nvme_nvm_submit_vio(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
{
	struct request_queue *q = ns->queue;
	struct nvm_dev *dev;
	struct nvm_geo *geo;

	_nvme_nvm_check_size();

	dev = nvm_alloc_dev(node);
	if (!dev)
		return -ENOMEM;

	/* Note that csecs and sos will be overridden if it is a 1.2 drive. */
	geo = &dev->geo;
	geo->csecs = 1 << ns->lba_shift;
	geo->sos = ns->ms;
	geo->ext = ns->ext;
	geo->mdts = ns->ctrl->max_hw_sectors;

	dev->q = q;
	memcpy(dev->name, disk_name, DISK_NAME_LEN);
	dev->ops = &nvme_nvm_dev_ops;
	dev->private_data = ns;
	ns->ndev = dev;

	return nvm_register(dev);
}

void nvme_nvm_unregister(struct nvme_ns *ns)
{
	nvm_unregister(ns->ndev);
}

static ssize_t nvm_dev_attr_show(struct device *dev,
		struct device_attribute *dattr, char *page)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvm_dev *ndev = ns->ndev;
	struct nvm_geo *geo = &ndev->geo;
	struct attribute *attr;

	if (!ndev)
		return 0;

	attr = &dattr->attr;

	if (strcmp(attr->name, "version") == 0) {
		if (geo->major_ver_id == 1)
			return scnprintf(page, PAGE_SIZE, "%u\n",
						geo->major_ver_id);
		else
			return scnprintf(page, PAGE_SIZE, "%u.%u\n",
						geo->major_ver_id,
						geo->minor_ver_id);
	} else if (strcmp(attr->name, "capabilities") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap);
	} else if (strcmp(attr->name, "read_typ") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt);
	} else if (strcmp(attr->name, "read_max") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm);
	} else {
		return scnprintf(page,
				 PAGE_SIZE,
				 "Unhandled attr(%s) in `%s`\n",
				 attr->name, __func__);
	}
}

static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page)
{
	return scnprintf(page, PAGE_SIZE,
		"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
				ppaf->ch_offset, ppaf->ch_len,
				ppaf->lun_offset, ppaf->lun_len,
				ppaf->pln_offset, ppaf->pln_len,
				ppaf->blk_offset, ppaf->blk_len,
				ppaf->pg_offset, ppaf->pg_len,
				ppaf->sec_offset, ppaf->sec_len);
}

static ssize_t nvm_dev_attr_show_12(struct device *dev,
		struct device_attribute *dattr, char *page)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvm_dev *ndev = ns->ndev;
	struct nvm_geo *geo = &ndev->geo;
	struct attribute *attr;

	if (!ndev)
		return 0;

	attr = &dattr->attr;

	if (strcmp(attr->name, "vendor_opcode") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt);
	} else if (strcmp(attr->name, "device_mode") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom);
	/* kept for compatibility */
	} else if (strcmp(attr->name, "media_manager") == 0) {
		return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm");
	} else if (strcmp(attr->name, "ppa_format") == 0) {
		return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page);
	} else if (strcmp(attr->name, "media_type") == 0) {	/* u8 */
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype);
	} else if (strcmp(attr->name, "flash_media_type") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype);
	} else if (strcmp(attr->name, "num_channels") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch);
	} else if (strcmp(attr->name, "num_luns") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun);
	} else if (strcmp(attr->name, "num_planes") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln);
	} else if (strcmp(attr->name, "num_blocks") == 0) {	/* u16 */
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk);
	} else if (strcmp(attr->name, "num_pages") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg);
	} else if (strcmp(attr->name, "page_size") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz);
	} else if (strcmp(attr->name, "hw_sector_size") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs);
	} else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos);
	} else if (strcmp(attr->name, "prog_typ") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt);
	} else if (strcmp(attr->name, "prog_max") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm);
	} else if (strcmp(attr->name, "erase_typ") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet);
	} else if (strcmp(attr->name, "erase_max") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem);
	} else if (strcmp(attr->name, "multiplane_modes") == 0) {
		return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos);
	} else if (strcmp(attr->name, "media_capabilities") == 0) {
		return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap);
	} else if (strcmp(attr->name, "max_phys_secs") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA);
	} else {
		return scnprintf(page, PAGE_SIZE,
			"Unhandled attr(%s) in `%s`\n",
			attr->name, __func__);
	}
}

static ssize_t nvm_dev_attr_show_20(struct device *dev,
		struct device_attribute *dattr, char *page)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvm_dev *ndev = ns->ndev;
	struct nvm_geo *geo = &ndev->geo;
	struct attribute *attr;

	if (!ndev)
		return 0;

	attr = &dattr->attr;

	if (strcmp(attr->name, "groups") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch);
	} else if (strcmp(attr->name, "punits") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun);
	} else if (strcmp(attr->name, "chunks") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk);
	} else if (strcmp(attr->name, "clba") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba);
	} else if (strcmp(attr->name, "ws_min") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min);
	} else if (strcmp(attr->name, "ws_opt") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt);
	} else if (strcmp(attr->name, "maxoc") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc);
	} else if (strcmp(attr->name, "maxocpu") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu);
	} else if (strcmp(attr->name, "mw_cunits") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits);
	} else if (strcmp(attr->name, "write_typ") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt);
	} else if (strcmp(attr->name, "write_max") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm);
	} else if (strcmp(attr->name, "reset_typ") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet);
	} else if (strcmp(attr->name, "reset_max") == 0) {
		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem);
	} else {
		return scnprintf(page, PAGE_SIZE,
			"Unhandled attr(%s) in `%s`\n",
			attr->name, __func__);
	}
}

#define NVM_DEV_ATTR_RO(_name)					\
	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
#define NVM_DEV_ATTR_12_RO(_name)					\
	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL)
#define NVM_DEV_ATTR_20_RO(_name)					\
	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL)

/* general attributes */
static NVM_DEV_ATTR_RO(version);
static NVM_DEV_ATTR_RO(capabilities);

static NVM_DEV_ATTR_RO(read_typ);
static NVM_DEV_ATTR_RO(read_max);

/* 1.2 values */
static NVM_DEV_ATTR_12_RO(vendor_opcode);
static NVM_DEV_ATTR_12_RO(device_mode);
static NVM_DEV_ATTR_12_RO(ppa_format);
static NVM_DEV_ATTR_12_RO(media_manager);
static NVM_DEV_ATTR_12_RO(media_type);
static NVM_DEV_ATTR_12_RO(flash_media_type);
static NVM_DEV_ATTR_12_RO(num_channels);
static NVM_DEV_ATTR_12_RO(num_luns);
static NVM_DEV_ATTR_12_RO(num_planes);
static NVM_DEV_ATTR_12_RO(num_blocks);
static NVM_DEV_ATTR_12_RO(num_pages);
static NVM_DEV_ATTR_12_RO(page_size);
static NVM_DEV_ATTR_12_RO(hw_sector_size);
static NVM_DEV_ATTR_12_RO(oob_sector_size);
static NVM_DEV_ATTR_12_RO(prog_typ);
static NVM_DEV_ATTR_12_RO(prog_max);
static NVM_DEV_ATTR_12_RO(erase_typ);
static NVM_DEV_ATTR_12_RO(erase_max);
static NVM_DEV_ATTR_12_RO(multiplane_modes);
static NVM_DEV_ATTR_12_RO(media_capabilities);
static NVM_DEV_ATTR_12_RO(max_phys_secs);

/* 2.0 values */
static NVM_DEV_ATTR_20_RO(groups);
static NVM_DEV_ATTR_20_RO(punits);
static NVM_DEV_ATTR_20_RO(chunks);
static NVM_DEV_ATTR_20_RO(clba);
static NVM_DEV_ATTR_20_RO(ws_min);
static NVM_DEV_ATTR_20_RO(ws_opt);
static NVM_DEV_ATTR_20_RO(maxoc);
static NVM_DEV_ATTR_20_RO(maxocpu);
static NVM_DEV_ATTR_20_RO(mw_cunits);
static NVM_DEV_ATTR_20_RO(write_typ);
static NVM_DEV_ATTR_20_RO(write_max);
static NVM_DEV_ATTR_20_RO(reset_typ);
static NVM_DEV_ATTR_20_RO(reset_max);

static struct attribute *nvm_dev_attrs[] = {
	/* version agnostic attrs */
	&dev_attr_version.attr,
	&dev_attr_capabilities.attr,
	&dev_attr_read_typ.attr,
	&dev_attr_read_max.attr,

	/* 1.2 attrs */
	&dev_attr_vendor_opcode.attr,
	&dev_attr_device_mode.attr,
	&dev_attr_media_manager.attr,
	&dev_attr_ppa_format.attr,
	&dev_attr_media_type.attr,
	&dev_attr_flash_media_type.attr,
	&dev_attr_num_channels.attr,
	&dev_attr_num_luns.attr,
	&dev_attr_num_planes.attr,
	&dev_attr_num_blocks.attr,
	&dev_attr_num_pages.attr,
	&dev_attr_page_size.attr,
	&dev_attr_hw_sector_size.attr,
	&dev_attr_oob_sector_size.attr,
	&dev_attr_prog_typ.attr,
	&dev_attr_prog_max.attr,
	&dev_attr_erase_typ.attr,
	&dev_attr_erase_max.attr,
	&dev_attr_multiplane_modes.attr,
	&dev_attr_media_capabilities.attr,
	&dev_attr_max_phys_secs.attr,

	/* 2.0 attrs */
	&dev_attr_groups.attr,
	&dev_attr_punits.attr,
	&dev_attr_chunks.attr,
	&dev_attr_clba.attr,
	&dev_attr_ws_min.attr,
	&dev_attr_ws_opt.attr,
	&dev_attr_maxoc.attr,
	&dev_attr_maxocpu.attr,
	&dev_attr_mw_cunits.attr,

	&dev_attr_write_typ.attr,
	&dev_attr_write_max.attr,
	&dev_attr_reset_typ.attr,
	&dev_attr_reset_max.attr,

	NULL,
};

static umode_t nvm_dev_attrs_visible(struct kobject *kobj,
				     struct attribute *attr, int index)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns *ns = disk->private_data;
	struct nvm_dev *ndev = ns->ndev;
	struct device_attribute *dev_attr =
		container_of(attr, typeof(*dev_attr), attr);

	if (!ndev)
		return 0;

	if (dev_attr->show == nvm_dev_attr_show)
		return attr->mode;

	switch (ndev->geo.major_ver_id) {
	case 1:
		if (dev_attr->show == nvm_dev_attr_show_12)
			return attr->mode;
		break;
	case 2:
		if (dev_attr->show == nvm_dev_attr_show_20)
			return attr->mode;
		break;
	}

	return 0;
}

const struct attribute_group nvme_nvm_attr_group = {
	.name		= "lightnvm",
	.attrs		= nvm_dev_attrs,
	.is_visible	= nvm_dev_attrs_visible,
};