From 2800b8e7d9dfca1fd9d044dcf7a046b5de5a7239 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 May 2016 12:38:09 -0600 Subject: [PATCH 01/13] NVMe: Allocate queues only for online cpus The driver previously requested allocating queues for the total possible number of CPUs so that blk-mq could rebalance these if CPUs were added after initialization. The number of hardware contexts can now be changed at runtime, so we only need to allocate the number of online queues since we can add more later. Suggested-by: Jeff Lien Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0f093f14d348..3c7b625a5e56 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1394,7 +1394,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) struct pci_dev *pdev = to_pci_dev(dev->dev); int result, i, vecs, nr_io_queues, size; - nr_io_queues = num_possible_cpus(); + nr_io_queues = num_online_cpus(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; From 014a0d609eb4721d1e416cf10da2d5602f9b34d5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 May 2016 11:50:52 -0600 Subject: [PATCH 02/13] NVMe: Delete only created queues Use the online queue count instead of the number of allocated queues. The controller should just return an invalid queue identifier error to the commands if a queue wasn't created. While it's not harmful, it's still not correct. Reported-by: Saar Gross Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3c7b625a5e56..88ed43d0799c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1551,12 +1551,12 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) static void nvme_disable_io_queues(struct nvme_dev *dev) { - int pass; + int pass, queues = dev->online_queues - 1; unsigned long timeout; u8 opcode = nvme_admin_delete_sq; for (pass = 0; pass < 2; pass++) { - int sent = 0, i = dev->queue_count - 1; + int sent = 0, i = queues; reinit_completion(&dev->ioq_wait); retry: From 921920ab32f290dafdb0359024d4587897712728 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 28 Mar 2016 16:03:21 -0600 Subject: [PATCH 03/13] NVMe: Unbind driver on failure Instead of removing the PCI device from the kernel's topology on controller failure, this patch simply requests unbinding the device from the driver. This avoids concurrently running pci removal with the hot plug event, which has been reported to be problematic when multiple surprise events occur near simultaneously. The other benefit is that we will have PCI config and memory space available to poke around for debugging a failed controller, assuming the device was not physically removed. The down side occurs if the platform and/or kernel do not support any type of surprise hot removal. The device will remain visible through sysfs (and therefore lspci), and some manual work is necessary to get the logical topology corrected. But if your platform and/or kernel don't support surprise removal, you probably shouldn't be doing that anyway. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 88ed43d0799c..194e9014811b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1857,7 +1857,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device_locked(pdev); + device_release_driver(&pdev->dev); nvme_put_ctrl(&dev->ctrl); } From d011fb3164e8694d7839f10a497f8ab6c660149a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Apr 2016 15:07:41 -0600 Subject: [PATCH 04/13] NVMe: Reduce driver log spamming Reduce error logging when no corrective action is required. Suggessted-by: Chris Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 194e9014811b..4feaed591e83 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2060,14 +2060,17 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, * shutdown the controller to quiesce. The controller will be restarted * after the slot reset through driver's slot_reset callback. */ - dev_warn(dev->ctrl.device, "error detected: state:%d\n", state); switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; case pci_channel_io_frozen: + dev_warn(dev->ctrl.device, + "frozen state error detected, reset controller\n"); nvme_dev_disable(dev, false); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: + dev_warn(dev->ctrl.device, + "failure state error detected, request disconnect\n"); return PCI_ERS_RESULT_DISCONNECT; } return PCI_ERS_RESULT_NEED_RESET; From 9ec3bb2f994bda9c8817856fdcbfaebe8f62fbd3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Apr 2016 15:45:18 -0600 Subject: [PATCH 05/13] NVMe: Allow user initiated rescan This exposes ioctl and sysfs methods a user can invoke to request the driver rescan a controller and its namespaces. This is less harsh than doing a controller reset, which temporarilly halts all IO, just to surface a newly attached namespace. This is mainly useful for controllers that implement the namespace management command, but do not support the namespace notify change asynchronous event notification. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 15 +++++++++++++++ include/uapi/linux/nvme_ioctl.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2de248bd462b..acc05adaa0d4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1212,6 +1212,9 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, return ctrl->ops->reset_ctrl(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; default: return -ENOTTY; } @@ -1239,6 +1242,17 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static ssize_t nvme_sysfs_rescan(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + nvme_queue_scan(ctrl); + return count; +} +static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); + static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1342,6 +1356,7 @@ nvme_show_int_function(cntlid); static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, + &dev_attr_rescan_controller.attr, &dev_attr_model.attr, &dev_attr_serial.attr, &dev_attr_firmware_rev.attr, diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index c4b2a3f90829..50ff21f748b6 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -61,5 +61,6 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) #define NVME_IOCTL_RESET _IO('N', 0x44) #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) +#define NVME_IOCTL_RESCAN _IO('N', 0x46) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ From 0ff9d4e1a284a9282a049bf064f123e27f838907 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 May 2016 08:37:14 -0600 Subject: [PATCH 06/13] NVMe: Short-cut removal on surprise hot-unplug This patch adds a new state that when set has the core automatically kill request queues prior to removing namespaces. If PCI device is not present at the time the nvme driver's remove is called, we can kill all IO queues immediately instead of waiting for the watchdog thread to do that at its polling interval. This improves scenarios where multiple hot plug events occur at the same time since it doesn't block the pci enumeration for as long. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 18 ++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index acc05adaa0d4..beed3940786b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -95,6 +95,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; + case NVME_CTRL_DEAD: + switch (old_state) { + case NVME_CTRL_DELETING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; default: break; } @@ -1595,6 +1604,15 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; + /* + * The dead states indicates the controller was not gracefully + * disconnected. In that case, we won't be able to flush any data while + * removing the namespaces' disks; fail all the queues now to avoid + * potentially having to clean up the failed sync later. + */ + if (ctrl->state == NVME_CTRL_DEAD) + nvme_kill_queues(ctrl); + mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 114b92873894..1daa0482de0e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -72,6 +72,7 @@ enum nvme_ctrl_state { NVME_CTRL_LIVE, NVME_CTRL_RESETTING, NVME_CTRL_DELETING, + NVME_CTRL_DEAD, }; struct nvme_ctrl { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4feaed591e83..3bdcf0e34fd6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2017,6 +2017,10 @@ static void nvme_remove(struct pci_dev *pdev) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); pci_set_drvdata(pdev, NULL); + + if (!pci_device_is_present(pdev)) + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + flush_work(&dev->reset_work); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); From 99466e708ddce8904c8635c213f2deb523ef4fb9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 May 2016 15:14:24 -0600 Subject: [PATCH 07/13] NVMe: Add device ID's with stripe quirk Adds two Intel controllers that have the "stripe" quirk. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3bdcf0e34fd6..78dca3193ca4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2109,6 +2109,12 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0953), .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a53), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a54), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, From ba36c21b0cd8b55fd7f010e9052656c2c03d9e5e Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 9 Apr 2016 03:04:42 +0000 Subject: [PATCH 08/13] nvme/host: Add missing blk_integrity tag_size + flags assignments While doing recent bring-up of nvme/host with target-core T10-PI, I noticed /sys/block/nvme*/integrity/device_is_integrity_capable was false, and /sys/block/nvme*/integrity/tag_size contained a bogus value. AFAICT outside of blk_integrity_compare() for DM + MD these are informational values, but go ahead and add the missing assignments for nvme/host to match what SCSI does within sd_dif_config_host() for consistency's sake. Cc: Keith Busch Cc: Jay Freyensee Cc: Martin K. Petersen Cc: Sagi Grimberg Cc: Christoph Hellwig Signed-off-by: Nicholas Bellinger Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index beed3940786b..1a51584a382b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -729,10 +729,14 @@ static void nvme_init_integrity(struct nvme_ns *ns) switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; + integrity.tag_size = sizeof(u16) + sizeof(u32); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NS_DPS_PI_TYPE1: case NVME_NS_DPS_PI_TYPE2: integrity.profile = &t10_pi_type1_crc; + integrity.tag_size = sizeof(u16); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: integrity.profile = NULL; From 7c87df9c159aa1d228f0d77b37942216cff34922 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 24 May 2016 16:38:10 +0200 Subject: [PATCH 09/13] bcache: bch_writeback_thread() is not freezable bch_writeback_thread() is calling try_to_freeze(), but that's just an expensive no-op given the fact that the thread is not marked freezable. I/O helper kthreads, exactly such as the bcache writeback thread, actually shouldn't be freezable, because they are potentially necessary for finalizing the image write-out. Signed-off-by: Jiri Kosina Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b9346cd9cda1..60123677b382 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -12,7 +12,6 @@ #include "writeback.h" #include -#include #include #include @@ -228,7 +227,6 @@ static void read_dirty(struct cached_dev *dc) */ while (!kthread_should_stop()) { - try_to_freeze(); w = bch_keybuf_next(&dc->writeback_keys); if (!w) @@ -433,7 +431,6 @@ static int bch_writeback_thread(void *arg) if (kthread_should_stop()) return 0; - try_to_freeze(); schedule(); continue; } From 770b8ce400123af89ac469361d7912f458915547 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 24 May 2016 16:38:15 +0200 Subject: [PATCH 10/13] bcache: bch_allocator_thread() is not freezable bch_allocator_thread() is calling try_to_freeze(), but that's just an expensive no-op given the fact that the thread is not marked freezable. Bucket allocator has to be up and running to the very last stages of the suspend, as the bcache I/O that's in flight (think of writing an hibernation image to a swap device served by bcache). Signed-off-by: Jiri Kosina Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8eeab72b93e2..ca4abe1ccd8d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -64,7 +64,6 @@ #include "btree.h" #include -#include #include #include #include @@ -288,7 +287,6 @@ do { \ if (kthread_should_stop()) \ return 0; \ \ - try_to_freeze(); \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ From 29e6c57cc78eb7cb8302088061a0721a41ade658 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 24 May 2016 16:38:34 +0200 Subject: [PATCH 11/13] bcache: bch_gc_thread() is not freezable bch_gc_thread() doesn't mark itself freezable, so calling try_to_freeze() in its context is just an expensive no-op. Signed-off-by: Jiri Kosina Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 22b9e34ceb75..eab505ee0027 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include @@ -1787,7 +1786,6 @@ static int bch_gc_thread(void *arg) mutex_unlock(&c->bucket_lock); - try_to_freeze(); schedule(); } From 4d1034eb7c2f5e32d48ddc4dfce0f1a723d28667 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 24 May 2016 16:38:50 +0200 Subject: [PATCH 12/13] MAINTAINERS: mark bcache as orphan The submitted patches are not being reacted upon, and Jens is only picking up stable fixes on an rather ad-hoc basis. Link: lkml.kernel.org/r/574462C5.40307@kernel.dk Signed-off-by: Jiri Kosina Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 20e634627a10..f6f479549f4e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2222,7 +2222,7 @@ BCACHE (BLOCK LAYER CACHE) M: Kent Overstreet L: linux-bcache@vger.kernel.org W: http://bcache.evilpiepirate.org -S: Maintained +S: Orphan F: drivers/md/bcache/ BDISP ST MEDIA DRIVER From c7de5726307620711a4753b2a13d9e5daecc1081 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Wed, 25 May 2016 23:23:27 -0700 Subject: [PATCH 13/13] blk-mq: clear q->mq_ops if init fail blk_mq_init_queue() calls blk_mq_init_allocated_queue(), but q->mq_ops was not cleared when blk_mq_init_allocated_queue() fails. Then blk_cleanup_queue() calls blk_mq_free_queue() which will crash because: - q->all_q_node is not added to all_q_list yet - q->tag_set is NULL - hctx was not setup yet or already freed Fixed it by clearing q->mq_ops on error path. Signed-off-by: Ming Lin Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7df9c9263b21..29cbc1b5fbdb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2020,7 +2020,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) - return ERR_PTR(-ENOMEM); + goto err_exit; q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node); @@ -2084,6 +2084,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); +err_exit: + q->mq_ops = NULL; return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue);