nvme-pci: implement host memory buffer support

If a controller supports the host memory buffer we try to provide
it with the requested size up to an upper cap set as a module
parameter.  We try to give as few as possible descriptors, eventually
working our way down.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
This commit is contained in:
Christoph Hellwig 2017-05-12 17:02:58 +02:00
parent fe6d53c9c0
commit 87ad72a59a

View file

@ -66,6 +66,11 @@ static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
static struct workqueue_struct *nvme_workq;
struct nvme_dev;
@ -104,10 +109,18 @@ struct nvme_dev {
u32 cmbloc;
struct nvme_ctrl ctrl;
struct completion ioq_wait;
/* shadow doorbell buffer support: */
u32 *dbbuf_dbs;
dma_addr_t dbbuf_dbs_dma_addr;
u32 *dbbuf_eis;
dma_addr_t dbbuf_eis_dma_addr;
/* host memory buffer support: */
u64 host_mem_size;
u32 nr_host_mem_descs;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
};
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@ -1512,6 +1525,162 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
}
}
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs);
struct nvme_command c;
u64 dma_addr;
int ret;
dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len,
DMA_TO_DEVICE);
if (dma_mapping_error(dev->dev, dma_addr))
return -ENOMEM;
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_set_features;
c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
c.features.dword11 = cpu_to_le32(bits);
c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
ilog2(dev->ctrl.page_size));
c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
if (ret) {
dev_warn(dev->ctrl.device,
"failed to set host mem (err %d, flags %#x).\n",
ret, bits);
}
dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE);
return ret;
}
static void nvme_free_host_mem(struct nvme_dev *dev)
{
int i;
for (i = 0; i < dev->nr_host_mem_descs; i++) {
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
le64_to_cpu(desc->addr));
}
kfree(dev->host_mem_desc_bufs);
dev->host_mem_desc_bufs = NULL;
kfree(dev->host_mem_descs);
dev->host_mem_descs = NULL;
}
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
struct nvme_host_mem_buf_desc *descs;
u32 chunk_size, max_entries, i = 0;
void **bufs;
u64 size, tmp;
/* start big and work our way down */
chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER);
retry:
tmp = (preferred + chunk_size - 1);
do_div(tmp, chunk_size);
max_entries = tmp;
descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
if (!descs)
goto out;
bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
if (!bufs)
goto out_free_descs;
for (size = 0; size < preferred; size += chunk_size) {
u32 len = min_t(u64, chunk_size, preferred - size);
dma_addr_t dma_addr;
bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
if (!bufs[i])
break;
descs[i].addr = cpu_to_le64(dma_addr);
descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
i++;
}
if (!size || (min && size < min)) {
dev_warn(dev->ctrl.device,
"failed to allocate host memory buffer.\n");
goto out_free_bufs;
}
dev_info(dev->ctrl.device,
"allocated %lld MiB host memory buffer.\n",
size >> ilog2(SZ_1M));
dev->nr_host_mem_descs = i;
dev->host_mem_size = size;
dev->host_mem_descs = descs;
dev->host_mem_desc_bufs = bufs;
return 0;
out_free_bufs:
while (--i >= 0) {
size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
dma_free_coherent(dev->dev, size, bufs[i],
le64_to_cpu(descs[i].addr));
}
kfree(bufs);
out_free_descs:
kfree(descs);
out:
/* try a smaller chunk size if we failed early */
if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
chunk_size /= 2;
goto retry;
}
dev->host_mem_descs = NULL;
return -ENOMEM;
}
static void nvme_setup_host_mem(struct nvme_dev *dev)
{
u64 max = (u64)max_host_mem_size_mb * SZ_1M;
u64 preferred = (u64)dev->ctrl.hmpre * 4096;
u64 min = (u64)dev->ctrl.hmmin * 4096;
u32 enable_bits = NVME_HOST_MEM_ENABLE;
preferred = min(preferred, max);
if (min > max) {
dev_warn(dev->ctrl.device,
"min host memory (%lld MiB) above limit (%d MiB).\n",
min >> ilog2(SZ_1M), max_host_mem_size_mb);
nvme_free_host_mem(dev);
return;
}
/*
* If we already have a buffer allocated check if we can reuse it.
*/
if (dev->host_mem_descs) {
if (dev->host_mem_size >= min)
enable_bits |= NVME_HOST_MEM_RETURN;
else
nvme_free_host_mem(dev);
}
if (!dev->host_mem_descs) {
if (nvme_alloc_host_mem(dev, min, preferred))
return;
}
if (nvme_set_host_mem(dev, enable_bits))
nvme_free_host_mem(dev);
}
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
@ -1813,8 +1982,20 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* Give the controller a chance to complete all entered requests if
* doing a safe shutdown.
*/
if (!dead && shutdown)
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
if (!dead) {
if (shutdown)
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
/*
* If the controller is still alive tell it to stop using the
* host memory buffer. In theory the shutdown / reset should
* make sure that it doesn't access the host memoery anymore,
* but I'd rather be safe than sorry..
*/
if (dev->host_mem_descs)
nvme_set_host_mem(dev, 0);
}
nvme_stop_queues(&dev->ctrl);
queues = dev->online_queues - 1;
@ -1946,6 +2127,9 @@ static void nvme_reset_work(struct work_struct *work)
"unable to allocate dma for dbbuf\n");
}
if (dev->ctrl.hmpre)
nvme_setup_host_mem(dev);
result = nvme_setup_io_queues(dev);
if (result)
goto out;
@ -2186,6 +2370,7 @@ static void nvme_remove(struct pci_dev *pdev)
flush_work(&dev->reset_work);
nvme_uninit_ctrl(&dev->ctrl);
nvme_dev_disable(dev, true);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
nvme_release_prp_pools(dev);