nvme-pci: implement host memory buffer support
If a controller supports the host memory buffer we try to provide it with the requested size up to an upper cap set as a module parameter. We try to give as few as possible descriptors, eventually working our way down. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Keith Busch <keith.busch@intel.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Max Gurtovoy <maxg@mellanox.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
This commit is contained in:
parent
fe6d53c9c0
commit
87ad72a59a
1 changed files with 187 additions and 2 deletions
|
@ -66,6 +66,11 @@ static bool use_cmb_sqes = true;
|
|||
module_param(use_cmb_sqes, bool, 0644);
|
||||
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
|
||||
|
||||
static unsigned int max_host_mem_size_mb = 128;
|
||||
module_param(max_host_mem_size_mb, uint, 0444);
|
||||
MODULE_PARM_DESC(max_host_mem_size_mb,
|
||||
"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
|
||||
|
||||
static struct workqueue_struct *nvme_workq;
|
||||
|
||||
struct nvme_dev;
|
||||
|
@ -104,10 +109,18 @@ struct nvme_dev {
|
|||
u32 cmbloc;
|
||||
struct nvme_ctrl ctrl;
|
||||
struct completion ioq_wait;
|
||||
|
||||
/* shadow doorbell buffer support: */
|
||||
u32 *dbbuf_dbs;
|
||||
dma_addr_t dbbuf_dbs_dma_addr;
|
||||
u32 *dbbuf_eis;
|
||||
dma_addr_t dbbuf_eis_dma_addr;
|
||||
|
||||
/* host memory buffer support: */
|
||||
u64 host_mem_size;
|
||||
u32 nr_host_mem_descs;
|
||||
struct nvme_host_mem_buf_desc *host_mem_descs;
|
||||
void **host_mem_desc_bufs;
|
||||
};
|
||||
|
||||
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
|
||||
|
@ -1512,6 +1525,162 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
|
|||
}
|
||||
}
|
||||
|
||||
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
|
||||
{
|
||||
size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs);
|
||||
struct nvme_command c;
|
||||
u64 dma_addr;
|
||||
int ret;
|
||||
|
||||
dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len,
|
||||
DMA_TO_DEVICE);
|
||||
if (dma_mapping_error(dev->dev, dma_addr))
|
||||
return -ENOMEM;
|
||||
|
||||
memset(&c, 0, sizeof(c));
|
||||
c.features.opcode = nvme_admin_set_features;
|
||||
c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
|
||||
c.features.dword11 = cpu_to_le32(bits);
|
||||
c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
|
||||
ilog2(dev->ctrl.page_size));
|
||||
c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
|
||||
c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
|
||||
c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
|
||||
|
||||
ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
|
||||
if (ret) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"failed to set host mem (err %d, flags %#x).\n",
|
||||
ret, bits);
|
||||
}
|
||||
dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void nvme_free_host_mem(struct nvme_dev *dev)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < dev->nr_host_mem_descs; i++) {
|
||||
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
|
||||
size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
|
||||
|
||||
dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
|
||||
le64_to_cpu(desc->addr));
|
||||
}
|
||||
|
||||
kfree(dev->host_mem_desc_bufs);
|
||||
dev->host_mem_desc_bufs = NULL;
|
||||
kfree(dev->host_mem_descs);
|
||||
dev->host_mem_descs = NULL;
|
||||
}
|
||||
|
||||
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
|
||||
{
|
||||
struct nvme_host_mem_buf_desc *descs;
|
||||
u32 chunk_size, max_entries, i = 0;
|
||||
void **bufs;
|
||||
u64 size, tmp;
|
||||
|
||||
/* start big and work our way down */
|
||||
chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER);
|
||||
retry:
|
||||
tmp = (preferred + chunk_size - 1);
|
||||
do_div(tmp, chunk_size);
|
||||
max_entries = tmp;
|
||||
descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
|
||||
if (!descs)
|
||||
goto out;
|
||||
|
||||
bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
|
||||
if (!bufs)
|
||||
goto out_free_descs;
|
||||
|
||||
for (size = 0; size < preferred; size += chunk_size) {
|
||||
u32 len = min_t(u64, chunk_size, preferred - size);
|
||||
dma_addr_t dma_addr;
|
||||
|
||||
bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
|
||||
DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
|
||||
if (!bufs[i])
|
||||
break;
|
||||
|
||||
descs[i].addr = cpu_to_le64(dma_addr);
|
||||
descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
|
||||
i++;
|
||||
}
|
||||
|
||||
if (!size || (min && size < min)) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"failed to allocate host memory buffer.\n");
|
||||
goto out_free_bufs;
|
||||
}
|
||||
|
||||
dev_info(dev->ctrl.device,
|
||||
"allocated %lld MiB host memory buffer.\n",
|
||||
size >> ilog2(SZ_1M));
|
||||
dev->nr_host_mem_descs = i;
|
||||
dev->host_mem_size = size;
|
||||
dev->host_mem_descs = descs;
|
||||
dev->host_mem_desc_bufs = bufs;
|
||||
return 0;
|
||||
|
||||
out_free_bufs:
|
||||
while (--i >= 0) {
|
||||
size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
|
||||
|
||||
dma_free_coherent(dev->dev, size, bufs[i],
|
||||
le64_to_cpu(descs[i].addr));
|
||||
}
|
||||
|
||||
kfree(bufs);
|
||||
out_free_descs:
|
||||
kfree(descs);
|
||||
out:
|
||||
/* try a smaller chunk size if we failed early */
|
||||
if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
|
||||
chunk_size /= 2;
|
||||
goto retry;
|
||||
}
|
||||
dev->host_mem_descs = NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void nvme_setup_host_mem(struct nvme_dev *dev)
|
||||
{
|
||||
u64 max = (u64)max_host_mem_size_mb * SZ_1M;
|
||||
u64 preferred = (u64)dev->ctrl.hmpre * 4096;
|
||||
u64 min = (u64)dev->ctrl.hmmin * 4096;
|
||||
u32 enable_bits = NVME_HOST_MEM_ENABLE;
|
||||
|
||||
preferred = min(preferred, max);
|
||||
if (min > max) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"min host memory (%lld MiB) above limit (%d MiB).\n",
|
||||
min >> ilog2(SZ_1M), max_host_mem_size_mb);
|
||||
nvme_free_host_mem(dev);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we already have a buffer allocated check if we can reuse it.
|
||||
*/
|
||||
if (dev->host_mem_descs) {
|
||||
if (dev->host_mem_size >= min)
|
||||
enable_bits |= NVME_HOST_MEM_RETURN;
|
||||
else
|
||||
nvme_free_host_mem(dev);
|
||||
}
|
||||
|
||||
if (!dev->host_mem_descs) {
|
||||
if (nvme_alloc_host_mem(dev, min, preferred))
|
||||
return;
|
||||
}
|
||||
|
||||
if (nvme_set_host_mem(dev, enable_bits))
|
||||
nvme_free_host_mem(dev);
|
||||
}
|
||||
|
||||
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
|
||||
{
|
||||
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
|
||||
|
@ -1813,8 +1982,20 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
|
|||
* Give the controller a chance to complete all entered requests if
|
||||
* doing a safe shutdown.
|
||||
*/
|
||||
if (!dead && shutdown)
|
||||
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
|
||||
if (!dead) {
|
||||
if (shutdown)
|
||||
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
|
||||
|
||||
/*
|
||||
* If the controller is still alive tell it to stop using the
|
||||
* host memory buffer. In theory the shutdown / reset should
|
||||
* make sure that it doesn't access the host memoery anymore,
|
||||
* but I'd rather be safe than sorry..
|
||||
*/
|
||||
if (dev->host_mem_descs)
|
||||
nvme_set_host_mem(dev, 0);
|
||||
|
||||
}
|
||||
nvme_stop_queues(&dev->ctrl);
|
||||
|
||||
queues = dev->online_queues - 1;
|
||||
|
@ -1946,6 +2127,9 @@ static void nvme_reset_work(struct work_struct *work)
|
|||
"unable to allocate dma for dbbuf\n");
|
||||
}
|
||||
|
||||
if (dev->ctrl.hmpre)
|
||||
nvme_setup_host_mem(dev);
|
||||
|
||||
result = nvme_setup_io_queues(dev);
|
||||
if (result)
|
||||
goto out;
|
||||
|
@ -2186,6 +2370,7 @@ static void nvme_remove(struct pci_dev *pdev)
|
|||
flush_work(&dev->reset_work);
|
||||
nvme_uninit_ctrl(&dev->ctrl);
|
||||
nvme_dev_disable(dev, true);
|
||||
nvme_free_host_mem(dev);
|
||||
nvme_dev_remove_admin(dev);
|
||||
nvme_free_queues(dev, 0);
|
||||
nvme_release_prp_pools(dev);
|
||||
|
|
Loading…
Reference in a new issue