From dc4a93078b8a6a10d2dcaba76ab488d6dbe73922 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Apr 2014 21:37:30 -0600 Subject: [PATCH 01/62] sd/skd: stuff discard page in request->completion_data Store the pointer to the page there, so we can always safely reference it from end_io context where ->bio may have been cleared. Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 4 ++-- drivers/scsi/sd.c | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 36bcedfd930c..c48d9084c965 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -743,6 +743,7 @@ static void skd_request_fn(struct request_queue *q) break; } skreq->discard_page = 1; + req->completion_data = page; skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { @@ -855,10 +856,9 @@ static void skd_end_request(struct skd_device *skdev, if ((io_flags & REQ_DISCARD) && (skreq->discard_page == 1)) { - struct bio *bio = req->bio; pr_debug("%s:%s:%d, free the page!", skdev->name, __func__, __LINE__); - __free_page(bio->bi_io_vec->bv_page); + __free_page(req->completion_data); } if (unlikely(error)) { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 06d154d20faa..96af195224f2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -737,6 +737,7 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) goto out; } + rq->completion_data = page; blk_add_request_payload(rq, page, len); ret = scsi_setup_blk_pc_cmnd(sdp, rq); rq->__data_len = nr_bytes; @@ -839,11 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq) { struct scsi_cmnd *SCpnt = rq->special; - if (rq->cmd_flags & REQ_DISCARD) { - struct bio *bio = rq->bio; + if (rq->cmd_flags & REQ_DISCARD) + __free_page(rq->completion_data); - __free_page(bio->bi_io_vec->bv_page); - } if (SCpnt->cmnd != rq->cmd) { mempool_free(SCpnt->cmnd, sd_cdb_pool); SCpnt->cmnd = NULL; From 11471e0d04f3762b9216d8421ce6e9d89b0bf450 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 19 Apr 2014 18:00:16 +0800 Subject: [PATCH 02/62] blk-mq: free hctx->ctx_map when init failed Avoid memory leak in the failure path. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index ee225cc312b8..5fbbb221d499 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1289,6 +1289,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); kfree(hctx->ctxs); + kfree(hctx->ctx_map); } return 1; From 4847900532c2dc0fc77b67f745c20e636d5b3125 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 19 Apr 2014 18:00:17 +0800 Subject: [PATCH 03/62] blk-mq: fix allocation of set->tags type of set->tags is struct blk_mq_tags **. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5fbbb221d499..c2777970f28b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1537,7 +1537,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) return -EINVAL; - set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags), + set->tags = kmalloc_node(set->nr_hw_queues * + sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out; From 4ca085009f447abc1c818f8802e908e9afdd77fa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 19 Apr 2014 18:00:18 +0800 Subject: [PATCH 04/62] blk-mq: user (1 << order) to implement order_to_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Jörg-Volker Peetz Cc: Max Filippov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c2777970f28b..3a560a4db0b4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1130,12 +1130,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, static size_t order_to_size(unsigned int order) { - size_t ret = PAGE_SIZE; - - while (order--) - ret *= 2; - - return ret; + return (size_t)PAGE_SIZE << order; } static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, From 6a3c8a3ac0e68dcfc2a01f4aa1ca0edd1a1701eb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 19 Apr 2014 18:00:19 +0800 Subject: [PATCH 05/62] blk-mq: initialize req->q in allocation The patch basically reverts the patch of(blk-mq: initialize request on allocation) in Jens's tree(already in -next), and only initialize req->q in allocation for two reasons: - presumed cache hotness on completion - blk_rq_tagged(rq) depends on reset of req->mq_ctx Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3a560a4db0b4..c26b3be1893c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -82,7 +82,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, tag = blk_mq_get_tag(hctx->tags, gfp, reserved); if (tag != BLK_MQ_TAG_FAIL) { rq = hctx->tags->rqs[tag]; - blk_rq_init(hctx->queue, rq); rq->tag = tag; return rq; @@ -187,6 +186,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; + rq->q = q; rq->mq_ctx = ctx; rq->cmd_flags = rw_flags; rq->start_time = jiffies; @@ -258,6 +258,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; + blk_rq_init(hctx->queue, rq); blk_mq_put_tag(hctx->tags, tag); blk_mq_queue_exit(q); } @@ -1194,6 +1195,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, left -= to_do * rq_size; for (j = 0; j < to_do; j++) { tags->rqs[i] = p; + blk_rq_init(NULL, tags->rqs[i]); if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, tags->rqs[i], hctx_idx, i, From 1051a902fe19322ac8f2123a0f4778e43e143aca Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 20 Apr 2014 16:03:31 -0700 Subject: [PATCH 06/62] fs: fix new kernel-doc warnings in fs/bio.c Fix new kernel-doc warnings in fs/bio.c: Warning(fs/bio.c:316): No description found for parameter 'bio' Warning(fs/bio.c:316): No description found for parameter 'parent' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- fs/bio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bio.c b/fs/bio.c index 6f0362b77806..4c9c5095bacb 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error) /** * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have From 8876e140ec7b9c57ab5a5dc39b7bb9815f96e3f5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 17 Apr 2014 21:41:16 +0200 Subject: [PATCH 07/62] block/blk-throttle.c: add static to blk_throtl_dispatch_work_fn blk_throtl_dispatch_work_fn is only used in blk-throttle.c Cc: Jens Axboe Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..680a0cc116dd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1258,7 +1258,7 @@ static void throtl_pending_timer_fn(unsigned long arg) * of throtl_data->service_queue. Those bio's are ready and issued by this * function. */ -void blk_throtl_dispatch_work_fn(struct work_struct *work) +static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); From a6c39cb4f71e61aff19d07e2d0b26bb6e3548fae Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 22 Apr 2014 15:09:05 -0600 Subject: [PATCH 08/62] fs/bio: remove bs paramater in biovec_create_pool bs is no longer used in biovec_create_pool since 9f060e2231ca96 ("block: Convert integrity to bvec_alloc_bs()") Signed-off-by: Fabian Frederick Cc: Jens Axboe Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- fs/bio.c | 4 ++-- include/linux/bio.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 1c2ce0c87711..9e241063a616 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) if (!bs->bio_integrity_pool) return -1; - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + bs->bvec_integrity_pool = biovec_create_pool(pool_size); if (!bs->bvec_integrity_pool) { mempool_destroy(bs->bio_integrity_pool); return -1; diff --git a/fs/bio.c b/fs/bio.c index 4c9c5095bacb..ca55d37436d6 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1861,7 +1861,7 @@ EXPORT_SYMBOL_GPL(bio_trim); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; @@ -1924,7 +1924,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - bs->bvec_pool = biovec_create_pool(bs, pool_size); + bs->bvec_pool = biovec_create_pool(pool_size); if (!bs->bvec_pool) goto bad; diff --git a/include/linux/bio.h b/include/linux/bio.h index bba550826921..5a645769f020 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); -extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); +extern mempool_t *biovec_create_pool(int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); From 7410b3c6c5c919d1ca432fb47ce75a8092729eae Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 22 Apr 2014 15:09:07 -0600 Subject: [PATCH 09/62] fs/bio.c: remove nr_segs (unused function parameter) nr_segs is no longer used in bio_alloc_map_data since c8db444820a1e3 ("block: Don't save/copy bvec array anymore") Signed-off-by: Fabian Frederick Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index ca55d37436d6..96d28eee8a1e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1013,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, bio->bi_private = bmd; } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { if (iov_count > UIO_MAXIOV) @@ -1156,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (offset) nr_pages++; - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); + bmd = bio_alloc_map_data(iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); From 981bd189f80f34c07ac0d87e4760d09bcda5f4cb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 24 Apr 2014 00:07:34 +0800 Subject: [PATCH 10/62] blk-mq: fix leak of set->tags set->tags should be freed in blk_mq_free_tag_set(). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index c26b3be1893c..cea1bd87a416 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1562,6 +1562,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_rq_map(set, set->tags[i], i); + kfree(set->tags); } EXPORT_SYMBOL(blk_mq_free_tag_set); From 70ab0b2d51f84fc7d9eb6ed81c3986595efaa33d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Apr 2014 08:50:38 -0600 Subject: [PATCH 11/62] Revert "blk-mq: initialize req->q in allocation" This reverts commit 6a3c8a3ac0e68dcfc2a01f4aa1ca0edd1a1701eb. We need selective clearing of the request to make the init-at-free time completely safe. Otherwise we end up stomping on rq->atomic_flags, which we don't want to do. --- block/blk-mq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index cea1bd87a416..7d5650d75aef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -82,6 +82,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, tag = blk_mq_get_tag(hctx->tags, gfp, reserved); if (tag != BLK_MQ_TAG_FAIL) { rq = hctx->tags->rqs[tag]; + blk_rq_init(hctx->queue, rq); rq->tag = tag; return rq; @@ -186,7 +187,6 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; - rq->q = q; rq->mq_ctx = ctx; rq->cmd_flags = rw_flags; rq->start_time = jiffies; @@ -258,7 +258,6 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - blk_rq_init(hctx->queue, rq); blk_mq_put_tag(hctx->tags, tag); blk_mq_queue_exit(q); } @@ -1195,7 +1194,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, left -= to_do * rq_size; for (j = 0; j < to_do; j++) { tags->rqs[i] = p; - blk_rq_init(NULL, tags->rqs[i]); if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, tags->rqs[i], hctx_idx, i, From 87ee7b112193bd081ba1a171fa5f6f39c429ef56 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Apr 2014 08:51:47 -0600 Subject: [PATCH 12/62] blk-mq: fix race with timeouts and requeue events If a requeue event races with a timeout, we can get into the situation where we attempt to complete a request from the timeout handler when it's not start anymore. This causes a crash. So have the timeout handler check that REQ_ATOM_STARTED is still set on the request - if not, we ignore the event. If this happens, the request has now been marked as complete. As a consequence, we need to ensure to clear REQ_ATOM_COMPLETE in blk_mq_start_request(), as to maintain proper request state. Signed-off-by: Jens Axboe --- block/blk-mq.c | 39 ++++++++++++++++++++++++++++++++------- block/blk-mq.h | 2 -- block/blk-timeout.c | 16 +++++++++------- block/blk.h | 3 +-- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7d5650d75aef..a84112c94e74 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -378,7 +378,15 @@ static void blk_mq_start_request(struct request *rq, bool last) * REQ_ATOMIC_STARTED is seen. */ rq->deadline = jiffies + q->rq_timeout; + + /* + * Mark us as started and clear complete. Complete might have been + * set if requeue raced with timeout, which then marked it as + * complete. So be sure to clear complete again when we start + * the request, otherwise we'll ignore the completion event. + */ set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -485,6 +493,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); } +static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +{ + struct request_queue *q = rq->q; + + /* + * We know that complete is set at this point. If STARTED isn't set + * anymore, then the request isn't active and the "timeout" should + * just be ignored. This can happen due to the bitflag ordering. + * Timeout first checks if STARTED is set, and if it is, assumes + * the request is active. But if we race with completion, then + * we both flags will get cleared. So check here again, and ignore + * a timeout event with a request that isn't active. + */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return BLK_EH_NOT_HANDLED; + + if (!q->mq_ops->timeout) + return BLK_EH_RESET_TIMER; + + return q->mq_ops->timeout(rq); +} + static void blk_mq_rq_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -538,11 +568,6 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } -void blk_mq_add_timer(struct request *rq) -{ - __blk_add_timer(rq, NULL); -} - /* * Run this hardware queue, pulling any software queues mapped to it in. * Note that this function currently has various problems around ordering @@ -799,7 +824,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, /* * We do this early, to ensure we are on the right CPU. */ - blk_mq_add_timer(rq); + blk_add_timer(rq); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -1400,7 +1425,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) q->sg_reserved_size = INT_MAX; blk_queue_make_request(q, blk_mq_make_request); - blk_queue_rq_timed_out(q, set->ops->timeout); + blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); if (set->timeout) blk_queue_rq_timeout(q, set->timeout); diff --git a/block/blk-mq.h b/block/blk-mq.h index 5fa14f19f752..b41a784de50d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -51,6 +51,4 @@ void blk_mq_disable_hotplug(void); extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); -void blk_mq_add_timer(struct request *rq); - #endif diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a09e8af8186c..49988a3ca85c 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req) __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - if (q->mq_ops) - blk_mq_add_timer(req); - else - blk_add_timer(req); - + blk_add_timer(req); blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: @@ -170,7 +166,8 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -void __blk_add_timer(struct request *req, struct list_head *timeout_list) +static void __blk_add_timer(struct request *req, + struct list_head *timeout_list) { struct request_queue *q = req->q; unsigned long expiry; @@ -225,6 +222,11 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) */ void blk_add_timer(struct request *req) { - __blk_add_timer(req, &req->q->timeout_list); + struct request_queue *q = req->q; + + if (q->mq_ops) + __blk_add_timer(req, NULL); + else + __blk_add_timer(req, &req->q->timeout_list); } diff --git a/block/blk.h b/block/blk.h index 1d880f1f957f..79be2cbce7fd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -37,9 +37,8 @@ bool __blk_end_bidi_request(struct request *rq, int error, void blk_rq_timed_out_timer(unsigned long data); void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set); -void __blk_add_timer(struct request *req, struct list_head *timeout_list); +void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); bool bio_attempt_front_merge(struct request_queue *q, struct request *req, From 38535201633077cbaf8b32886b5e3005b36c9024 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Apr 2014 02:32:53 -0700 Subject: [PATCH 13/62] blk-mq: respect rq_affinity The blk-mq code is using it's own version of the I/O completion affinity tunables, which causes a few issues: - the rq_affinity sysfs file doesn't work for blk-mq devices, even if it still is present, thus breaking existing tuning setups. - the rq_affinity = 1 mode, which is the defauly for legacy request based drivers isn't implemented at all. - blk-mq drivers don't implement any completion affinity with the default flag settings. This patches removes the blk-mq ipi_redirect flag and sysfs file, as well as the internal BLK_MQ_F_SHOULD_IPI flag and replaces it with code that respects the queue-wide rq_affinity flags and also implements the rq_affinity = 1 mode. This means I/O completion affinity can now only be tuned block-queue wide instead of per context, which seems more sensible to me anyway. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 42 ------------------------------------------ block/blk-mq.c | 8 ++++++-- block/blk-mq.h | 1 - include/linux/blk-mq.h | 1 - 4 files changed, 6 insertions(+), 46 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 9176a6984857..8145b5b25b4b 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -203,42 +203,6 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, return ret; } -static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t len) -{ - struct blk_mq_ctx *ctx; - unsigned long ret; - unsigned int i; - - if (kstrtoul(page, 10, &ret)) { - pr_err("blk-mq-sysfs: invalid input '%s'\n", page); - return -EINVAL; - } - - spin_lock(&hctx->lock); - if (ret) - hctx->flags |= BLK_MQ_F_SHOULD_IPI; - else - hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; - spin_unlock(&hctx->lock); - - hctx_for_each_ctx(hctx, ctx, i) - ctx->ipi_redirect = !!ret; - - return len; -} - static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return blk_mq_tag_sysfs_show(hctx->tags, page); @@ -307,11 +271,6 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { - .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, - .show = blk_mq_hw_sysfs_ipi_show, - .store = blk_mq_hw_sysfs_ipi_store, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { .attr = {.name = "tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_tags_show, @@ -326,7 +285,6 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_ipi.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, NULL, diff --git a/block/blk-mq.c b/block/blk-mq.c index a84112c94e74..f2e92eb92803 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -326,15 +326,19 @@ static void __blk_mq_complete_request_remote(void *data) void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + bool shared = false; int cpu; - if (!ctx->ipi_redirect) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); return; } cpu = get_cpu(); - if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + shared = cpus_share_cache(cpu, ctx->cpu); + + if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index b41a784de50d..1ae364ceaf8b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -11,7 +11,6 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int ipi_redirect; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ab469d525894..3b561d651a02 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -122,7 +122,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_SHOULD_SORT = 1 << 1, - BLK_MQ_F_SHOULD_IPI = 1 << 2, BLK_MQ_S_STOPPED = 0, From c4a634f432376a28c9639b35746e33168ab97709 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Apr 2014 14:14:48 +0200 Subject: [PATCH 14/62] block: fold __blk_add_timer into blk_add_timer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-timeout.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 49988a3ca85c..448745683d28 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -166,8 +166,15 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -static void __blk_add_timer(struct request *req, - struct list_head *timeout_list) +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + */ +void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; @@ -185,8 +192,8 @@ static void __blk_add_timer(struct request *req, req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - if (timeout_list) - list_add_tail(&req->timeout_list, timeout_list); + if (!q->mq_ops) + list_add_tail(&req->timeout_list, &req->q->timeout_list); /* * If the timer isn't already pending or this timeout is earlier @@ -211,22 +218,3 @@ static void __blk_add_timer(struct request *req, } } - -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. - */ -void blk_add_timer(struct request *req) -{ - struct request_queue *q = req->q; - - if (q->mq_ops) - __blk_add_timer(req, NULL); - else - __blk_add_timer(req, &req->q->timeout_list); -} - From bdcfa3e57c9d92b082d2378bc9a64a3a8750fa8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Apr 2014 00:36:37 -0700 Subject: [PATCH 15/62] random: export add_disk_randomness This will be needed for pending changes to the scsi midlayer that now calls lower level block APIs, as well as any blk-mq driver that wants to contribute to the random pool. Signed-off-by: Christoph Hellwig Acked-by: "Theodore Ts'o" Signed-off-by: Jens Axboe --- drivers/char/random.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index 6b75713d953a..0a19d866a153 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk) add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); } +EXPORT_SYMBOL_GPL(add_disk_randomness); #endif /********************************************************************* From 5810d903fa3459e703ce82a1d45136813c6afad8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 29 Apr 2014 20:49:48 -0600 Subject: [PATCH 16/62] blk-mq: fix waiting for reserved tags blk_mq_wait_for_tags() is only able to wait for "normal" tags, not reserved tags. Pass in which one we should attempt to get a tag for, so that waiting for reserved tags will work. Reserved tags are used for internal commands, which are usually serialized. Hence no waiting generally takes place, but we should ensure that it actually works if users need that functionality. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ++-- block/blk-mq-tag.h | 2 +- block/blk-mq.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 7a799c46c32d..1f43d6ee956f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -6,9 +6,9 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +void blk_mq_wait_for_tags(struct blk_mq_tags *tags, bool reserved) { - int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); + int tag = blk_mq_get_tag(tags, __GFP_WAIT, reserved); blk_mq_put_tag(tags, tag); } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index b602e3fa66ea..c8e0645ea331 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -24,7 +24,7 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); +extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, bool reserved); extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); diff --git a/block/blk-mq.c b/block/blk-mq.c index f2e92eb92803..0aee32568210 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -218,7 +218,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, break; } - blk_mq_wait_for_tags(hctx->tags); + blk_mq_wait_for_tags(hctx->tags, reserved); } while (1); return rq; From 98bc1f272aba620d4222120853011d0ef026cf56 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Apr 2014 13:43:08 -0600 Subject: [PATCH 17/62] blk-mq remove debug BUG_ON() when draining software queues It's never been of any use, lets get rid of it. Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0aee32568210..77308a84dfb2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -599,7 +599,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { clear_bit(bit, hctx->ctx_map); ctx = hctx->ctxs[bit]; - BUG_ON(bit != ctx->index_hw); spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_list, &rq_list); From c6d600c65ebfa10a2a10d3e9183a24527ebe2aa4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Apr 2014 13:43:56 -0600 Subject: [PATCH 18/62] blk-mq: refactor request insertion/merging Refactor the logic around adding a new bio to a software queue, so we nest the ctx->lock where we really need it (merge and insertion) and don't hold it when we don't (init and IO start accounting). Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 77308a84dfb2..ca51ee4aa485 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1037,17 +1037,25 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } } - spin_lock(&ctx->lock); + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + init_request_from_bio(rq, bio); - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - blk_mq_attempt_merge(q, ctx, bio)) - __blk_mq_free_request(hctx, ctx, rq); - else { - blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + blk_account_io_start(rq, 1); + } else { + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + init_request_from_bio(rq, bio); + goto insert_rq; + } + + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); } - spin_unlock(&ctx->lock); /* * For a SYNC request, send it to the hardware immediately. For an From 176167ad9e44df221454e65e2ea83c3079950073 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 28 Apr 2014 12:38:34 +0900 Subject: [PATCH 19/62] block: Fix format string mismatch in cfq-iosched.c Fix format string mismatch in cfq_var_show() Signed-off-by: Masanari Iida Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5063a0bd831a..22dffebc7c73 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4460,7 +4460,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) static ssize_t cfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t From 74814b1c5569f5503727cf3052a52e2349818409 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 May 2014 11:24:48 -0600 Subject: [PATCH 20/62] blk-mq: remove extra requeue trace We already issue a blktrace requeue event in __blk_mq_requeue_request(), don't do it from the original caller as well. Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ca51ee4aa485..0d379830a278 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -427,13 +427,9 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq) { - struct request_queue *q = rq->q; - __blk_mq_requeue_request(rq); blk_clear_rq_complete(rq); - trace_block_rq_requeue(q, rq); - BUG_ON(blk_queued_rq(rq)); blk_mq_insert_request(rq, true, true, false); } From 5214e33c8efee16502b8a0549a3c24e21af458f7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 2 May 2014 18:21:45 +0200 Subject: [PATCH 21/62] block/blk-iopoll.c: use iop instead of iopoll All blk_iopoll functions use iop for parent iopoll structure except blk_iopoll_complete.This also fixes one kernel-doc warning. Cc: Jens Axboe Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/blk-iopoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index c11d24e379e2..d828b44a404b 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete); * iopoll handler will not be invoked again before blk_iopoll_sched_prep() * is called. **/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) +void blk_iopoll_complete(struct blk_iopoll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iopoll); + __blk_iopoll_complete(iop); local_irq_restore(flags); } EXPORT_SYMBOL(blk_iopoll_complete); From 5cf8c2277576fcc48966b105bb42782d7929fc48 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 2 May 2014 18:28:17 +0200 Subject: [PATCH 22/62] block/blk-throttle.c: fix return of 0/1 with return type bool Fix 4 coccinelle warnings. Cc: Jens Axboe Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 680a0cc116dd..9353b4683359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) - return 0; + return false; return 1; } @@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (wait) *wait = 0; - return 1; + return true; } /* From 506e931f92defdc60c1dc4aa2ff4a19a5dcd8618 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2014 10:26:44 -0600 Subject: [PATCH 23/62] blk-mq: add basic round-robin of what CPU to queue workqueue work on Right now we just pick the first CPU in the mask, but that can easily overload that one. Add some basic batching and round-robin all the entries in the mask instead. Signed-off-by: Jens Axboe --- block/blk-mq.c | 45 +++++++++++++++++++++++++++++------------- include/linux/blk-mq.h | 4 ++++ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0d379830a278..2410e0cb7aef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -670,6 +670,30 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = hctx->next_cpu; + + if (--hctx->next_cpu_batch <= 0) { + int next_cpu; + + next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(hctx->cpumask); + + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + + return cpu; +} + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) @@ -682,13 +706,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) else { unsigned int cpu; - /* - * It'd be great if the workqueue API had a way to pass - * in a mask and had some smarts for more clever placement - * than the first CPU. Or we could round-robin here. For now, - * just queue on the first CPU. - */ - cpu = cpumask_first(hctx->cpumask); + cpu = blk_mq_hctx_next_cpu(hctx); kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); } } @@ -795,13 +813,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) else { unsigned int cpu; - /* - * It'd be great if the workqueue API had a way to pass - * in a mask and had some smarts for more clever placement - * than the first CPU. Or we could round-robin here. For now, - * just queue on the first CPU. - */ - cpu = cpumask_first(hctx->cpumask); + cpu = blk_mq_hctx_next_cpu(hctx); kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); } } @@ -1378,6 +1390,11 @@ static void blk_mq_map_swqueue(struct request_queue *q) ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3b561d651a02..5bd677e2dcb7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -21,6 +21,8 @@ struct blk_mq_hw_ctx { struct delayed_work run_work; struct delayed_work delay_work; cpumask_var_t cpumask; + int next_cpu; + int next_cpu_batch; unsigned long flags; /* BLK_MQ_F_* flags */ @@ -126,6 +128,8 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_MAX_DEPTH = 2048, + + BLK_MQ_CPU_WORK_BATCH = 8, }; struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); From 9fccfed8f0cad9b79575a87c45d6f5f6ee05bb66 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 May 2014 14:50:19 -0600 Subject: [PATCH 24/62] blk-mq: update a hotplug comment for grammar Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 2410e0cb7aef..3fdb097ebe5e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1548,10 +1548,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, struct request_queue *q; /* - * Before new mapping is established, hotadded cpu might already start - * handling requests. This doesn't break anything as we map offline - * CPUs to first hardware queue. We will re-init queue below to get - * optimal settings. + * Before new mappings are established, hotadded cpu might already + * start handling requests. This doesn't break anything as we map + * offline CPUs to first hardware queue. We will re-init the queue + * below to get optimal settings. */ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) From af76e555e5e29e08eb8ac1f7878e23dbf0d6741f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2014 12:12:45 +0200 Subject: [PATCH 25/62] blk-mq: initialize struct request fields individually This allows us to avoid a non-atomic memset over ->atomic_flags as well as killing lots of duplicate initializations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++++++++++++-- include/linux/blkdev.h | 7 ++++--- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3fdb097ebe5e..492f49f96459 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -82,9 +82,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, tag = blk_mq_get_tag(hctx->tags, gfp, reserved); if (tag != BLK_MQ_TAG_FAIL) { rq = hctx->tags->rqs[tag]; - blk_rq_init(hctx->queue, rq); rq->tag = tag; - return rq; } @@ -187,10 +185,54 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; + INIT_LIST_HEAD(&rq->queuelist); + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = q; rq->mq_ctx = ctx; rq->cmd_flags = rw_flags; + rq->cmd_type = 0; + /* do not touch atomic flags, it needs atomic ops against the timer */ + rq->cpu = -1; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = NULL; + rq->biotail = NULL; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv))); + rq->rq_disk = NULL; + rq->part = NULL; rq->start_time = jiffies; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + rq->ioprio = 0; + rq->special = NULL; + /* tag was already set */ + rq->errors = 0; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; + rq->cmd_len = BLK_MAX_CDB; + + rq->extra_len = 0; + rq->sense_len = 0; + rq->resid_len = 0; + rq->sense = NULL; + + rq->deadline = 0; + INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + rq->retries = 0; + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->next_rq = NULL; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; } @@ -258,6 +300,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx->tags, tag); blk_mq_queue_exit(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 20b26d4e53a2..94b27210641b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -90,9 +90,10 @@ enum rq_cmd_type_bits { #define BLK_MAX_CDB 16 /* - * try to put the fields that are referenced together in the same cacheline. - * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() - * as well! + * Try to put the fields that are referenced together in the same cacheline. + * + * If you modify this structure, make sure to update blk_rq_init() and + * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct list_head queuelist; From 4bb659b156996f2993dc16fad71fec9ee070153c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2014 09:36:49 -0600 Subject: [PATCH 26/62] blk-mq: implement new and more efficient tagging scheme blk-mq currently uses percpu_ida for tag allocation. But that only works well if the ratio between tag space and number of CPUs is sufficiently high. For most devices and systems, that is not the case. The end result if that we either only utilize the tag space partially, or we end up attempting to fully exhaust it and run into lots of lock contention with stealing between CPUs. This is not optimal. This new tagging scheme is a hybrid bitmap allocator. It uses two tricks to both be SMP friendly and allow full exhaustion of the space: 1) We cache the last allocated (or freed) tag on a per blk-mq software context basis. This allows us to limit the space we have to search. The key element here is not caching it in the shared tag structure, otherwise we end up dirtying more shared cache lines on each allocate/free operation. 2) The tag space is split into cache line sized groups, and each context will start off randomly in that space. Even up to full utilization of the space, this divides the tag users efficiently into cache line groups, avoiding dirtying the same one both between allocators and between allocator and freeer. This scheme shows drastically better behaviour, both on small tag spaces but on large ones as well. It has been tested extensively to show better performance for all the cases blk-mq cares about. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 415 +++++++++++++++++++++++++++++++++-------- block/blk-mq-tag.h | 42 ++++- block/blk-mq.c | 23 ++- block/blk-mq.h | 4 +- include/linux/blk-mq.h | 6 +- 5 files changed, 391 insertions(+), 99 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1f43d6ee956f..467f3a20b355 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,64 +1,257 @@ #include #include +#include #include #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -void blk_mq_wait_for_tags(struct blk_mq_tags *tags, bool reserved) +void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, + bool reserved) { - int tag = blk_mq_get_tag(tags, __GFP_WAIT, reserved); - blk_mq_put_tag(tags, tag); + int tag, zero = 0; + + tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved); + blk_mq_put_tag(tags, tag, &zero); +} + +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) +{ + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_mq_bitmap *bm = &bt->map[i]; + int ret; + + ret = find_first_zero_bit(&bm->word, bm->depth); + if (ret < bm->depth) + return true; + } + + return false; } bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { - return !tags || - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; + if (!tags) + return true; + + return bt_has_free_tags(&tags->bitmap_tags); } -static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) +{ + int tag, org_last_tag, end; + + org_last_tag = last_tag = TAG_TO_BIT(last_tag); + end = bm->depth; + do { +restart: + tag = find_next_zero_bit(&bm->word, end, last_tag); + if (unlikely(tag >= end)) { + /* + * We started with an offset, start from 0 to + * exhaust the map. + */ + if (org_last_tag && last_tag) { + end = last_tag; + last_tag = 0; + goto restart; + } + return -1; + } + last_tag = tag + 1; + } while (test_and_set_bit_lock(tag, &bm->word)); + + return tag; +} + +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache) +{ + unsigned int last_tag, org_last_tag; + int index, i, tag; + + last_tag = org_last_tag = *tag_cache; + index = TAG_TO_INDEX(last_tag); + + for (i = 0; i < bt->map_nr; i++) { + tag = __bt_get_word(&bt->map[index], last_tag); + if (tag != -1) { + tag += index * BITS_PER_LONG; + goto done; + } + + last_tag = 0; + if (++index >= bt->map_nr) + index = 0; + } + + *tag_cache = 0; + return -1; + + /* + * Only update the cache from the allocation path, if we ended + * up using the specific cached tag. + */ +done: + if (tag == org_last_tag) { + last_tag = tag + 1; + if (last_tag >= bt->depth - 1) + last_tag = 0; + + *tag_cache = last_tag; + } + + return tag; +} + +static inline void bt_index_inc(unsigned int *index) +{ + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx) +{ + struct bt_wait_state *bs; + + if (!hctx) + return &bt->bs[0]; + + bs = &bt->bs[hctx->wait_index]; + bt_index_inc(&hctx->wait_index); + return bs; +} + +static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) +{ + struct bt_wait_state *bs; + DEFINE_WAIT(wait); + int tag; + + tag = __bt_get(bt, last_tag); + if (tag != -1) + return tag; + + if (!(gfp & __GFP_WAIT)) + return -1; + + bs = bt_wait_ptr(bt, hctx); + do { + bool was_empty; + + was_empty = list_empty(&wait.task_list); + prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + + tag = __bt_get(bt, last_tag); + if (tag != -1) + break; + + if (was_empty) + atomic_set(&bs->wait_cnt, bt->wake_cnt); + + io_schedule(); + } while (1); + + finish_wait(&bs->wait, &wait); + return tag; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) { int tag; - tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - return tag + tags->nr_reserved_tags; + tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp); + if (tag >= 0) + return tag + tags->nr_reserved_tags; + + return BLK_MQ_TAG_FAIL; } static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, gfp_t gfp) { - int tag; + int tag, zero = 0; if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); + tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp); if (tag < 0) return BLK_MQ_TAG_FAIL; + return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, + struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, + gfp_t gfp, bool reserved) { if (!reserved) - return __blk_mq_get_tag(tags, gfp); + return __blk_mq_get_tag(tags, hctx, last_tag, gfp); return __blk_mq_get_reserved_tag(tags, gfp); } +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +{ + int i, wake_index; + + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) { + if (wake_index != bt->wake_index) + bt->wake_index = wake_index; + + return bs; + } + + bt_index_inc(&wake_index); + } + + return NULL; +} + +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +{ + const int index = TAG_TO_INDEX(tag); + struct bt_wait_state *bs; + + clear_bit(TAG_TO_BIT(tag), &bt->map[index].word); + + bs = bt_wake_ptr(bt); + if (bs && atomic_dec_and_test(&bs->wait_cnt)) { + smp_mb__after_clear_bit(); + atomic_set(&bs->wait_cnt, bt->wake_cnt); + bt_index_inc(&bt->wake_index); + wake_up(&bs->wait); + } +} + static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) { BUG_ON(tag >= tags->nr_tags); - percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); + bt_clear_tag(&tags->bitmap_tags, tag); } static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, @@ -66,22 +259,41 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, { BUG_ON(tag >= tags->nr_reserved_tags); - percpu_ida_free(&tags->reserved_tags, tag); + bt_clear_tag(&tags->breserved_tags, tag); } -void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, + unsigned int *last_tag) { - if (tag >= tags->nr_reserved_tags) - __blk_mq_put_tag(tags, tag); - else + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; + + __blk_mq_put_tag(tags, real_tag); + *last_tag = real_tag; + } else __blk_mq_put_reserved_tag(tags, tag); } -static int __blk_mq_tag_iter(unsigned id, void *data) +static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, + unsigned long *free_map, unsigned int off) { - unsigned long *tag_map = data; - __set_bit(id, tag_map); - return 0; + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_mq_bitmap *bm = &bt->map[i]; + int bit = 0; + + do { + bit = find_next_zero_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + __set_bit(bit + off, free_map); + bit++; + } while (1); + + off += BITS_PER_LONG; + } } void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, @@ -95,21 +307,98 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, if (!tag_map) return; - percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); if (tags->nr_reserved_tags) - percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, - tag_map); + bt_for_each_free(&tags->breserved_tags, tag_map, 0); fn(data, tag_map); kfree(tag_map); } +static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +{ + unsigned int i, used; + + for (i = 0, used = 0; i < bt->map_nr; i++) { + struct blk_mq_bitmap *bm = &bt->map[i]; + + used += bitmap_weight(&bm->word, bm->depth); + } + + return bt->depth - used; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, + int node, bool reserved) +{ + int i; + + /* + * Depth can be zero for reserved tags, that's not a failure + * condition. + */ + if (depth) { + int nr, i, map_depth; + + nr = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; + bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap), + GFP_KERNEL, node); + if (!bt->map) + return -ENOMEM; + + bt->map_nr = nr; + map_depth = depth; + for (i = 0; i < nr; i++) { + bt->map[i].depth = min(map_depth, BITS_PER_LONG); + map_depth -= BITS_PER_LONG; + } + } + + bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); + if (!bt->bs) { + kfree(bt->map); + return -ENOMEM; + } + + for (i = 0; i < BT_WAIT_QUEUES; i++) + init_waitqueue_head(&bt->bs[i].wait); + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / 4) + bt->wake_cnt = max(1U, depth / 4); + + bt->depth = depth; + return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ + kfree(bt->map); + kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) + goto enomem; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) + goto enomem; + + return tags; +enomem: + bt_free(&tags->bitmap_tags); + kfree(tags); + return NULL; +} + struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node) { unsigned int nr_tags, nr_cache; struct blk_mq_tags *tags; - int ret; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); @@ -121,72 +410,46 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, return NULL; nr_tags = total_tags - reserved_tags; - nr_cache = nr_tags / num_possible_cpus(); - - if (nr_cache < BLK_MQ_TAG_CACHE_MIN) - nr_cache = BLK_MQ_TAG_CACHE_MIN; - else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) - nr_cache = BLK_MQ_TAG_CACHE_MAX; + nr_cache = nr_tags / num_online_cpus(); tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - tags->nr_max_cache = nr_cache; - tags->nr_batch_move = max(1u, nr_cache / 2); - ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - - tags->nr_reserved_tags, - tags->nr_max_cache, - tags->nr_batch_move); - if (ret) - goto err_free_tags; - - if (reserved_tags) { - /* - * With max_cahe and batch set to 1, the allocator fallbacks to - * no cached. It's fine reserved tags allocation is slow. - */ - ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, - 1, 1); - if (ret) - goto err_reserved_tags; - } - - return tags; - -err_reserved_tags: - percpu_ida_destroy(&tags->free_tags); -err_free_tags: - kfree(tags); - return NULL; + return blk_mq_init_bitmap_tags(tags, node); } void blk_mq_free_tags(struct blk_mq_tags *tags) { - percpu_ida_destroy(&tags->free_tags); - percpu_ida_destroy(&tags->reserved_tags); + bt_free(&tags->bitmap_tags); + bt_free(&tags->breserved_tags); kfree(tags); } +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + if (depth > 1) + *tag = prandom_u32() % (depth - 1); + else + *tag = 0; +} + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - unsigned int cpu; + unsigned int free, res; if (!tags) return 0; - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," - " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->nr_batch_move, tags->nr_max_cache); + page += sprintf(page, "nr_tags=%u, reserved_tags=%u\n", + tags->nr_tags, tags->nr_reserved_tags); - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), - percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + free = bt_unused_tags(&tags->bitmap_tags); + res = bt_unused_tags(&tags->breserved_tags); - for_each_possible_cpu(cpu) { - page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, - percpu_ida_free_tags(&tags->free_tags, cpu)); - } + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); return page - orig_page; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index c8e0645ea331..06d4a2f0f7a0 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -1,7 +1,34 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -#include +enum { + BT_WAIT_QUEUES = 8, + BT_WAIT_BATCH = 8, +}; + +struct bt_wait_state { + atomic_t wait_cnt; + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(tag) ((tag) / BITS_PER_LONG) +#define TAG_TO_BIT(tag) ((tag) & (BITS_PER_LONG - 1)) + +struct blk_mq_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; + +struct blk_mq_bitmap_tags { + unsigned int depth; + unsigned int wake_cnt; + + struct blk_mq_bitmap *map; + unsigned int map_nr; + + unsigned int wake_index; + struct bt_wait_state *bs; +}; /* * Tag address space map. @@ -9,11 +36,9 @@ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; - unsigned int nr_batch_move; - unsigned int nr_max_cache; - struct percpu_ida free_tags; - struct percpu_ida reserved_tags; + struct blk_mq_bitmap_tags bitmap_tags; + struct blk_mq_bitmap_tags breserved_tags; struct request **rqs; struct list_head page_list; @@ -23,12 +48,13 @@ struct blk_mq_tags { extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, bool reserved); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); +extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); +extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved); +extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag); extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index 492f49f96459..9f07a266f7ab 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -74,12 +74,13 @@ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, gfp_t gfp, bool reserved) { struct request *rq; unsigned int tag; - tag = blk_mq_get_tag(hctx->tags, gfp, reserved); + tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved); if (tag != BLK_MQ_TAG_FAIL) { rq = hctx->tags->rqs[tag]; rq->tag = tag; @@ -246,7 +247,8 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); + rq = __blk_mq_alloc_request(hctx, ctx, gfp & ~__GFP_WAIT, + reserved); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -260,7 +262,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, break; } - blk_mq_wait_for_tags(hctx->tags, reserved); + blk_mq_wait_for_tags(hctx->tags, hctx, reserved); } while (1); return rq; @@ -278,6 +280,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) blk_mq_put_ctx(rq->mq_ctx); return rq; } +EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp) @@ -301,7 +304,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct request_queue *q = rq->q; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx->tags, tag); + blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag); blk_mq_queue_exit(q); } @@ -677,11 +680,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) queued++; continue; case BLK_MQ_RQ_QUEUE_BUSY: - /* - * FIXME: we should have a mechanism to stop the queue - * like blk_stop_queue, otherwise we will waste cpu - * time - */ list_add(&rq->queuelist, &rq_list); __blk_mq_requeue_request(rq); break; @@ -873,6 +871,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); + blk_mq_hctx_mark_pending(hctx, ctx); /* @@ -1046,7 +1045,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (is_sync) rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); + rq = __blk_mq_alloc_request(hctx, ctx, GFP_ATOMIC, false); if (likely(rq)) blk_mq_rq_ctx_init(q, ctx, rq, rw); else { @@ -1130,8 +1129,8 @@ EXPORT_SYMBOL(blk_mq_map_queue); struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set, unsigned int hctx_index) { - return kmalloc_node(sizeof(struct blk_mq_hw_ctx), - GFP_KERNEL | __GFP_ZERO, set->numa_node); + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, + set->numa_node); } EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); diff --git a/block/blk-mq.h b/block/blk-mq.h index 1ae364ceaf8b..97cfab9c092f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -12,6 +12,8 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; + unsigned int last_tag ____cacheline_aligned_in_smp; + /* incremented at dispatch time */ unsigned long rq_dispatched[2]; unsigned long rq_merged; @@ -21,7 +23,7 @@ struct blk_mq_ctx { struct request_queue *queue; struct kobject kobj; -}; +} ____cacheline_aligned_in_smp; void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5bd677e2dcb7..f83d15f6e1c1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -31,10 +31,12 @@ struct blk_mq_hw_ctx { void *driver_data; - unsigned int nr_ctx; - struct blk_mq_ctx **ctxs; unsigned int nr_ctx_map; unsigned long *ctx_map; + unsigned int nr_ctx; + struct blk_mq_ctx **ctxs; + + unsigned int wait_index; struct blk_mq_tags *tags; From 59d13bf5f57ded658c872fa22276f75ab8f12841 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2014 13:41:15 -0600 Subject: [PATCH 27/62] blk-mq: use sparser tag layout for lower queue depth For best performance, spreading tags over multiple cachelines makes the tagging more efficient on multicore systems. But since we have 8 * sizeof(unsigned long) tags per cacheline, we don't always get a nice spread. Attempt to spread the tags over at least 4 cachelines, using fewer number of bits per unsigned long if we have to. This improves tagging performance in setups with 32-128 tags. For higher depths, the spread is the same as before (BITS_PER_LONG tags per cacheline). Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 45 ++++++++++++++++++++++++++++++++------------- block/blk-mq-tag.h | 7 ++++--- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 467f3a20b355..6c78c08865e3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -44,7 +44,7 @@ static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; - org_last_tag = last_tag = TAG_TO_BIT(last_tag); + org_last_tag = last_tag; end = bm->depth; do { restart: @@ -84,12 +84,12 @@ static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache) int index, i, tag; last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(last_tag); + index = TAG_TO_INDEX(bt, last_tag); for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], last_tag); + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); if (tag != -1) { - tag += index * BITS_PER_LONG; + tag += (index << bt->bits_per_word); goto done; } @@ -233,10 +233,10 @@ static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) { - const int index = TAG_TO_INDEX(tag); + const int index = TAG_TO_INDEX(bt, tag); struct bt_wait_state *bs; - clear_bit(TAG_TO_BIT(tag), &bt->map[index].word); + clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); bs = bt_wake_ptr(bt); if (bs && atomic_dec_and_test(&bs->wait_cnt)) { @@ -292,7 +292,7 @@ static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, bit++; } while (1); - off += BITS_PER_LONG; + off += (1 << bt->bits_per_word); } } @@ -333,14 +333,31 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, { int i; + bt->bits_per_word = ilog2(BITS_PER_LONG); + /* * Depth can be zero for reserved tags, that's not a failure * condition. */ if (depth) { - int nr, i, map_depth; + unsigned int nr, i, map_depth, tags_per_word; - nr = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; + tags_per_word = (1 << bt->bits_per_word); + + /* + * If the tag space is small, shrink the number of tags + * per word so we spread over a few cachelines, at least. + * If less than 4 tags, just forget about it, it's not + * going to work optimally anyway. + */ + if (depth >= 4) { + while (tags_per_word * 4 > depth) { + bt->bits_per_word--; + tags_per_word = (1 << bt->bits_per_word); + } + } + + nr = ALIGN(depth, tags_per_word) / tags_per_word; bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap), GFP_KERNEL, node); if (!bt->map) @@ -349,8 +366,8 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, bt->map_nr = nr; map_depth = depth; for (i = 0; i < nr; i++) { - bt->map[i].depth = min(map_depth, BITS_PER_LONG); - map_depth -= BITS_PER_LONG; + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= tags_per_word; } } @@ -443,8 +460,10 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) if (!tags) return 0; - page += sprintf(page, "nr_tags=%u, reserved_tags=%u\n", - tags->nr_tags, tags->nr_reserved_tags); + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " + "bits_per_word=%u\n", + tags->nr_tags, tags->nr_reserved_tags, + tags->bitmap_tags.bits_per_word); free = bt_unused_tags(&tags->bitmap_tags); res = bt_unused_tags(&tags->breserved_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 06d4a2f0f7a0..7aa9f0665489 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -11,8 +11,8 @@ struct bt_wait_state { wait_queue_head_t wait; } ____cacheline_aligned_in_smp; -#define TAG_TO_INDEX(tag) ((tag) / BITS_PER_LONG) -#define TAG_TO_BIT(tag) ((tag) & (BITS_PER_LONG - 1)) +#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) struct blk_mq_bitmap { unsigned long word; @@ -22,9 +22,10 @@ struct blk_mq_bitmap { struct blk_mq_bitmap_tags { unsigned int depth; unsigned int wake_cnt; + unsigned int bits_per_word; - struct blk_mq_bitmap *map; unsigned int map_nr; + struct blk_mq_bitmap *map; unsigned int wake_index; struct bt_wait_state *bs; From cf4b50afc28cc4629f82f5591bef1db2ce129fdc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2014 14:54:08 -0600 Subject: [PATCH 28/62] blk-mq: fix race in IO start accounting Commit c6d600c6 opened up a small race where we could attempt to account IO completion on a request, racing with IO start accounting. Fix this up by ensuring that we've accounted for IO start before inserting the request. Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9f07a266f7ab..526feee31bff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1088,17 +1088,15 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { - init_request_from_bio(rq, bio); - + blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: __blk_mq_insert_request(hctx, rq, false); spin_unlock(&ctx->lock); - blk_account_io_start(rq, 1); } else { spin_lock(&ctx->lock); if (!blk_mq_attempt_merge(q, ctx, bio)) { - init_request_from_bio(rq, bio); + blk_mq_bio_to_request(rq, bio); goto insert_rq; } From 7276d02e241dc3595d4a18ac2456fa449e448de7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2014 15:48:23 -0600 Subject: [PATCH 29/62] block: only calculate part_in_flight() once We first check if we have inflight IO, then retrieve that same number again. Usually this isn't that costly since the chance of having the data dirtied in between is small, but there's no reason for calling part_in_flight() twice. Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c4269701cb4f..a6bd3e702201 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1233,12 +1233,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; From 0289b2e110b7824b2f76d194ad6f8f0844e270ad Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 11 May 2014 01:01:48 +0800 Subject: [PATCH 30/62] blk-mq: bitmap tag: use clear_bit_unlock in bt_clear_tag() The unlock memory barrier need to order access to req in free path and clearing tag bit, otherwise either request free path may see a allocated request, or initialized request in allocate path might be modified by the ongoing free path. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6c78c08865e3..a81b138e89fe 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -236,7 +236,11 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) const int index = TAG_TO_INDEX(bt, tag); struct bt_wait_state *bs; - clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); + /* + * The unlock memory barrier need to order access to req in free + * path and clearing tag bit + */ + clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); bs = bt_wake_ptr(bt); if (bs && atomic_dec_and_test(&bs->wait_cnt)) { From 60f2df8a29df5f2db2c87fd23122a1cebdf2011a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 11 May 2014 01:01:49 +0800 Subject: [PATCH 31/62] blk-mq: bitmap tag: remove barrier in bt_clear_tag() The barrier isn't necessary because both atomic_dec_and_test() and wake_up() implicate one barrier. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a81b138e89fe..5a83d8e587f7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -244,7 +244,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) bs = bt_wake_ptr(bt); if (bs && atomic_dec_and_test(&bs->wait_cnt)) { - smp_mb__after_clear_bit(); atomic_set(&bs->wait_cnt, bt->wake_cnt); bt_index_inc(&bt->wake_index); wake_up(&bs->wait); From 9d3d21aeb4f194cd7ac205abe68b14b47ae736a8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 10 May 2014 15:43:14 -0600 Subject: [PATCH 32/62] blk-mq: bitmap tag: select random tag betweet 0 and (depth - 1) The selected tag should be selected at random between 0 and (depth - 1) with probability 1/depth, instead between 0 and (depth - 2) with probability 1/(depth - 1). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5a83d8e587f7..f196e60178f4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -449,10 +449,7 @@ void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - if (depth > 1) - *tag = prandom_u32() % (depth - 1); - else - *tag = 0; + *tag = prandom_u32() % depth; } ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) From 1f236ab22ce3bc5d4f975aa116966c0ea7ec2013 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 11 May 2014 01:01:51 +0800 Subject: [PATCH 33/62] blk-mq: bitmap tag: cleanup blk_mq_init_tags Both nr_cache and nr_tags arn't needed for bitmap tag anymore. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f196e60178f4..8d526a3e02f6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -417,7 +417,6 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node) { - unsigned int nr_tags, nr_cache; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -429,9 +428,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - nr_tags = total_tags - reserved_tags; - nr_cache = nr_tags / num_online_cpus(); - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; From 0d2602ca30e410e84e8bdf05c84ed5688e0a5a44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 May 2014 15:10:52 -0600 Subject: [PATCH 34/62] blk-mq: improve support for shared tags maps This adds support for active queue tracking, meaning that the blk-mq tagging maintains a count of active users of a tag set. This allows us to maintain a notion of fairness between users, so that we can distribute the tag depth evenly without starving some users while allowing others to try unfair deep queues. If sharing of a tag set is detected, each hardware queue will track the depth of its own queue. And if this exceeds the total depth divided by the number of active queues, the user is actively throttled down. The active queue count is done lazily to avoid bouncing that data between submitter and completer. Each hardware queue gets marked active when it allocates its first tag, and gets marked inactive when 1) the last tag is cleared, and 2) the queue timeout grace period has passed. Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 10 ++++ block/blk-mq-tag.c | 112 ++++++++++++++++++++++++++++++++------ block/blk-mq-tag.h | 27 ++++++++- block/blk-mq.c | 85 +++++++++++++++++++++++++++-- block/blk-timeout.c | 13 ++++- block/blk.h | 4 ++ include/linux/blk-mq.h | 7 +++ include/linux/blk_types.h | 2 + include/linux/blkdev.h | 3 + 9 files changed, 236 insertions(+), 27 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 8145b5b25b4b..99a60a829e69 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -208,6 +208,11 @@ static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) return blk_mq_tag_sysfs_show(hctx->tags, page); } +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); +} + static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { unsigned int i, first = 1; @@ -267,6 +272,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_dispatched_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { + .attr = {.name = "active", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_active_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, @@ -287,6 +296,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_pending.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, + &blk_mq_hw_sysfs_active.attr, NULL, }; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8d526a3e02f6..c80086c9c064 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -7,13 +7,12 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, - bool reserved) +void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved) { int tag, zero = 0; - tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved); - blk_mq_put_tag(tags, tag, &zero); + tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved); + blk_mq_put_tag(hctx, tag, &zero); } static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) @@ -40,6 +39,84 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) return bt_has_free_tags(&tags->bitmap_tags); } +static inline void bt_index_inc(unsigned int *index) +{ + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); +} + +/* + * If a previously inactive queue goes active, bump the active user count. + */ +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); + + return true; +} + +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. + */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + struct blk_mq_bitmap_tags *bt; + int i, wake_index; + + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + /* + * Will only throttle depth on non-reserved tags + */ + bt = &tags->bitmap_tags; + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) + wake_up(&bs->wait); + + bt_index_inc(&wake_index); + } +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; +} + static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; @@ -78,11 +155,15 @@ static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) * multiple users will tend to stick to different cachelines, at least * until the map is exhausted. */ -static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache) +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, + unsigned int *tag_cache) { unsigned int last_tag, org_last_tag; int index, i, tag; + if (!hctx_may_queue(hctx, bt)) + return -1; + last_tag = org_last_tag = *tag_cache; index = TAG_TO_INDEX(bt, last_tag); @@ -117,11 +198,6 @@ static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache) return tag; } -static inline void bt_index_inc(unsigned int *index) -{ - *index = (*index + 1) & (BT_WAIT_QUEUES - 1); -} - static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx) { @@ -142,7 +218,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, DEFINE_WAIT(wait); int tag; - tag = __bt_get(bt, last_tag); + tag = __bt_get(hctx, bt, last_tag); if (tag != -1) return tag; @@ -156,7 +232,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, was_empty = list_empty(&wait.task_list); prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(bt, last_tag); + tag = __bt_get(hctx, bt, last_tag); if (tag != -1) break; @@ -200,14 +276,13 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, - struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, +unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved) { if (!reserved) - return __blk_mq_get_tag(tags, hctx, last_tag, gfp); + return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); - return __blk_mq_get_reserved_tag(tags, gfp); + return __blk_mq_get_reserved_tag(hctx->tags, gfp); } static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) @@ -265,9 +340,11 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, bt_clear_tag(&tags->breserved_tags, tag); } -void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { + struct blk_mq_tags *tags = hctx->tags; + if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; @@ -465,6 +542,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) res = bt_unused_tags(&tags->breserved_tags); page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); return page - orig_page; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 7aa9f0665489..0f5ec8b50ef3 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -38,6 +38,8 @@ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; + atomic_t active_queues; + struct blk_mq_bitmap_tags bitmap_tags; struct blk_mq_bitmap_tags breserved_tags; @@ -49,9 +51,9 @@ struct blk_mq_tags { extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag); +extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); +extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); @@ -68,4 +70,23 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 9f07a266f7ab..3c4f1fceef8e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -80,9 +80,16 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, struct request *rq; unsigned int tag; - tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved); + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); if (tag != BLK_MQ_TAG_FAIL) { rq = hctx->tags->rqs[tag]; + + rq->cmd_flags = 0; + if (blk_mq_tag_busy(hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); + } + rq->tag = tag; return rq; } @@ -190,7 +197,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - rq->cmd_flags = rw_flags; + rq->cmd_flags |= rw_flags; rq->cmd_type = 0; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; @@ -262,7 +269,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, break; } - blk_mq_wait_for_tags(hctx->tags, hctx, reserved); + blk_mq_wait_for_tags(hctx, reserved); } while (1); return rq; @@ -303,8 +310,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag); + blk_mq_put_tag(hctx, tag, &ctx->last_tag); blk_mq_queue_exit(q); } @@ -571,8 +581,13 @@ static void blk_mq_rq_timer(unsigned long data) queue_for_each_hw_ctx(q, hctx, i) blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); + if (next_set) { + next = blk_rq_timeout(round_jiffies_up(next)); + mod_timer(&q->timeout, next); + } else { + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_tag_idle(hctx); + } } /* @@ -1439,6 +1454,56 @@ static void blk_mq_map_swqueue(struct request_queue *q) } } +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + bool shared; + int i; + + if (set->tag_list.next == set->tag_list.prev) + shared = false; + else + shared = true; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_freeze_queue(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } + blk_mq_unfreeze_queue(q); + } +} + +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + blk_mq_freeze_queue(q); + + mutex_lock(&set->tag_list_lock); + list_del_init(&q->tag_set_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); + + blk_mq_unfreeze_queue(q); +} + +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + q->tag_set = set; + + mutex_lock(&set->tag_list_lock); + list_add_tail(&q->tag_set_list, &set->tag_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); +} + struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx **hctxs; @@ -1464,6 +1529,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) goto err_hctxs; + atomic_set(&hctxs[i]->nr_active, 0); hctxs[i]->numa_node = NUMA_NO_NODE; hctxs[i]->queue_num = i; } @@ -1516,6 +1582,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) list_add_tail(&q->all_q_node, &all_q_list); mutex_unlock(&all_q_mutex); + blk_mq_add_queue_tag_set(set, q); + return q; err_flush_rq: @@ -1543,6 +1611,8 @@ void blk_mq_free_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + blk_mq_del_queue_tag_set(q); + queue_for_each_hw_ctx(q, hctx, i) { kfree(hctx->ctx_map); kfree(hctx->ctxs); @@ -1635,6 +1705,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_unwind; } + mutex_init(&set->tag_list_lock); + INIT_LIST_HEAD(&set->tag_list); + return 0; out_unwind: diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 448745683d28..43e8b515806f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -166,6 +166,17 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); +unsigned long blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + /** * blk_add_timer - Start timeout timer for a single request * @req: request that is about to start running. @@ -200,7 +211,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = round_jiffies_up(req->deadline); + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk.h b/block/blk.h index 79be2cbce7fd..95cab70000e3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,6 +9,9 @@ /* Number of requests a "batching" process may submit */ #define BLK_BATCH_REQ 32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; @@ -37,6 +40,7 @@ bool __blk_end_bidi_request(struct request *rq, int error, void blk_rq_timed_out_timer(unsigned long data); void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set); +unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f83d15f6e1c1..379f88d5c44d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -48,6 +48,8 @@ struct blk_mq_hw_ctx { unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ + atomic_t nr_active; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; }; @@ -64,6 +66,9 @@ struct blk_mq_tag_set { void *driver_data; struct blk_mq_tags **tags; + + struct mutex tag_list_lock; + struct list_head tag_list; }; typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); @@ -126,8 +131,10 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_SHOULD_SORT = 1 << 1, + BLK_MQ_F_TAG_SHARED = 1 << 2, BLK_MQ_S_STOPPED = 0, + BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_MAX_DEPTH = 2048, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index aa0eaa2d0bd8..d8e4cea23a25 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -190,6 +190,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_END, /* last of chain of requests */ __REQ_HASHED, /* on IO scheduler merge hash */ + __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ }; @@ -243,5 +244,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_END (1ULL << __REQ_END) #define REQ_HASHED (1ULL << __REQ_HASHED) +#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 94b27210641b..6bc011a09e82 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -481,6 +481,9 @@ struct request_queue { wait_queue_head_t mq_freeze_wq; struct percpu_counter mq_usage_counter; struct list_head all_q_node; + + struct blk_mq_tag_set *tag_set; + struct list_head tag_set_list; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ From f9c78b2be2cac2a7a397d489275e7d9f9ae785f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 May 2014 08:16:41 -0600 Subject: [PATCH 35/62] block: move bio.c and bio-integrity.c from fs/ to block/ They really belong in block/, especially now since it's not in drivers/block/ anymore. Additionally, the get_maintainer script gets it wrong when in fs/. Suggested-by: Christoph Hellwig Acked-by: Al Viro Signed-off-by: Jens Axboe --- block/Makefile | 3 ++- {fs => block}/bio-integrity.c | 0 {fs => block}/bio.c | 0 fs/Makefile | 3 +-- 4 files changed, 3 insertions(+), 3 deletions(-) rename {fs => block}/bio-integrity.c (100%) rename {fs => block}/bio.c (100%) diff --git a/block/Makefile b/block/Makefile index 20645e88fb57..506a0c570be2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,7 +2,7 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ @@ -20,3 +20,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o diff --git a/fs/bio-integrity.c b/block/bio-integrity.c similarity index 100% rename from fs/bio-integrity.c rename to block/bio-integrity.c diff --git a/fs/bio.c b/block/bio.c similarity index 100% rename from fs/bio.c rename to block/bio.c diff --git a/fs/Makefile b/fs/Makefile index f9cb9876e466..1ed9eab5e0a9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o ioprio.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o From 2667bcbbd5ed71f29b78ba69f059dbc450e07faf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 May 2014 11:02:18 -0600 Subject: [PATCH 36/62] block: move ioprio.c from fs/ to block/ Like commit f9c78b2b, move this block related file outside of fs/ and into the core block directory, block/. Signed-off-by: Jens Axboe --- block/Makefile | 3 ++- {fs => block}/ioprio.c | 0 fs/Makefile | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) rename {fs => block}/ioprio.c (100%) diff --git a/block/Makefile b/block/Makefile index 506a0c570be2..b4c4d3b99a2d 100644 --- a/block/Makefile +++ b/block/Makefile @@ -7,7 +7,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + partitions/ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o diff --git a/fs/ioprio.c b/block/ioprio.c similarity index 100% rename from fs/ioprio.c rename to block/ioprio.c diff --git a/fs/Makefile b/fs/Makefile index 1ed9eab5e0a9..4030cbfbc9af 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif From e93ecf602beb8439f0bdcc1fa2cbc1f31fdfb8e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 May 2014 09:17:48 -0600 Subject: [PATCH 37/62] blk-mq: move the cache friendly bitmap type of out blk-mq-tag We will use it for the pending list in blk-mq core as well. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 10 +++++----- block/blk-mq-tag.h | 9 +++------ block/blk-mq.h | 9 +++++++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8d526a3e02f6..03ce6a11ba79 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -21,7 +21,7 @@ static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) int i; for (i = 0; i < bt->map_nr; i++) { - struct blk_mq_bitmap *bm = &bt->map[i]; + struct blk_align_bitmap *bm = &bt->map[i]; int ret; ret = find_first_zero_bit(&bm->word, bm->depth); @@ -40,7 +40,7 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) return bt_has_free_tags(&tags->bitmap_tags); } -static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; @@ -283,7 +283,7 @@ static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, int i; for (i = 0; i < bt->map_nr; i++) { - struct blk_mq_bitmap *bm = &bt->map[i]; + struct blk_align_bitmap *bm = &bt->map[i]; int bit = 0; do { @@ -323,7 +323,7 @@ static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) unsigned int i, used; for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_mq_bitmap *bm = &bt->map[i]; + struct blk_align_bitmap *bm = &bt->map[i]; used += bitmap_weight(&bm->word, bm->depth); } @@ -361,7 +361,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, } nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap), + bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), GFP_KERNEL, node); if (!bt->map) return -ENOMEM; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 7aa9f0665489..9014269f3910 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H +#include "blk-mq.h" + enum { BT_WAIT_QUEUES = 8, BT_WAIT_BATCH = 8, @@ -14,18 +16,13 @@ struct bt_wait_state { #define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) #define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) -struct blk_mq_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - struct blk_mq_bitmap_tags { unsigned int depth; unsigned int wake_cnt; unsigned int bits_per_word; unsigned int map_nr; - struct blk_mq_bitmap *map; + struct blk_align_bitmap *map; unsigned int wake_index; struct bt_wait_state *bs; diff --git a/block/blk-mq.h b/block/blk-mq.h index 97cfab9c092f..5e5a378962b7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -52,4 +52,13 @@ void blk_mq_disable_hotplug(void); extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. + */ +struct blk_align_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; + #endif From 1429d7c9467e1e3de0b0ff91d7e4d67c1a92f8a3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 May 2014 09:23:55 -0600 Subject: [PATCH 38/62] blk-mq: switch ctx pending map to the sparser blk_align_bitmap Each hardware queue has a bitmap of software queues with pending requests. When new IO is queued on a software queue, the bit is set, and when IO is pruned on a hardware queue run, the bit is cleared. This causes a lot of traffic. Switch this from the regular BITS_PER_LONG bitmap to a sparser layout, similarly to what was done for blk-mq tagging. 20% performance increase was observed for single threaded IO, and about 15% performanc increase on multiple threads driving the same device. Signed-off-by: Jens Axboe --- block/blk-mq.c | 119 +++++++++++++++++++++++++++++++---------- include/linux/blk-mq.h | 10 +++- 2 files changed, 99 insertions(+), 30 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 526feee31bff..e862c4408427 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -56,21 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { unsigned int i; - for (i = 0; i < hctx->nr_ctx_map; i++) - if (hctx->ctx_map[i]) + for (i = 0; i < hctx->ctx_map.map_size; i++) + if (hctx->ctx_map.map[i].word) return true; return false; } +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx) \ + ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!test_bit(ctx->index_hw, hctx->ctx_map)) - set_bit(ctx->index_hw, hctx->ctx_map); + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) + set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); +} + +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, @@ -614,6 +633,40 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +{ + struct blk_mq_ctx *ctx; + int i; + + for (i = 0; i < hctx->ctx_map.map_size; i++) { + struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; + unsigned int off, bit; + + if (!bm->word) + continue; + + bit = 0; + off = i * hctx->ctx_map.bits_per_word; + do { + bit = find_next_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + ctx = hctx->ctxs[bit + off]; + clear_bit(bit, &bm->word); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, list); + spin_unlock(&ctx->lock); + + bit++; + } while (1); + } +} + /* * Run this hardware queue, pulling any software queues mapped to it in. * Note that this function currently has various problems around ordering @@ -623,10 +676,9 @@ static bool blk_mq_attempt_merge(struct request_queue *q, static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct blk_mq_ctx *ctx; struct request *rq; LIST_HEAD(rq_list); - int bit, queued; + int queued; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); @@ -638,14 +690,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * Touch any software queue that has pending entries. */ - for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { - clear_bit(bit, hctx->ctx_map); - ctx = hctx->ctxs[bit]; - - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, &rq_list); - spin_unlock(&ctx->lock); - } + flush_busy_ctxs(hctx, &rq_list); /* * If we have previous entries on our dispatch list, grab them @@ -658,14 +703,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) spin_unlock(&hctx->lock); } - /* - * Delete and return all entries from our dispatch list - */ - queued = 0; - /* * Now process all the entries, sending them to the driver. */ + queued = 0; while (!list_empty(&rq_list)) { int ret; @@ -1158,7 +1199,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_list)) { list_splice_init(&ctx->rq_list, &tmp); - clear_bit(ctx->index_hw, hctx->ctx_map); + blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); @@ -1298,6 +1339,34 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return NULL; } +static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) +{ + kfree(bitmap->map); +} + +static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) +{ + unsigned int bpw = 8, total, num_maps, i; + + bitmap->bits_per_word = bpw; + + num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; + bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bitmap->map) + return -ENOMEM; + + bitmap->map_size = num_maps; + + total = nr_cpu_ids; + for (i = 0; i < num_maps; i++) { + bitmap->map[i].depth = min(total, bitmap->bits_per_word); + total -= bitmap->map[i].depth; + } + + return 0; +} + static int blk_mq_init_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set) { @@ -1308,7 +1377,6 @@ static int blk_mq_init_hw_queues(struct request_queue *q, * Initialize hardware queues */ queue_for_each_hw_ctx(q, hctx, i) { - unsigned int num_maps; int node; node = hctx->numa_node; @@ -1339,13 +1407,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, if (!hctx->ctxs) break; - num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; - hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), - GFP_KERNEL, node); - if (!hctx->ctx_map) + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) break; - hctx->nr_ctx_map = num_maps; hctx->nr_ctx = 0; if (set->ops->init_hctx && @@ -1368,7 +1432,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); kfree(hctx->ctxs); - kfree(hctx->ctx_map); + blk_mq_free_bitmap(&hctx->ctx_map); } return 1; @@ -1542,7 +1606,6 @@ void blk_mq_free_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - kfree(hctx->ctx_map); kfree(hctx->ctxs); blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); if (q->mq_ops->exit_hctx) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f83d15f6e1c1..952e558ee598 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -11,6 +11,12 @@ struct blk_mq_cpu_notifier { void (*notify)(void *data, unsigned long action, unsigned int cpu); }; +struct blk_mq_ctxmap { + unsigned int map_size; + unsigned int bits_per_word; + struct blk_align_bitmap *map; +}; + struct blk_mq_hw_ctx { struct { spinlock_t lock; @@ -31,8 +37,8 @@ struct blk_mq_hw_ctx { void *driver_data; - unsigned int nr_ctx_map; - unsigned long *ctx_map; + struct blk_mq_ctxmap ctx_map; + unsigned int nr_ctx; struct blk_mq_ctx **ctxs; From 719c555f4424b194905aa3512a754c0444f27ce8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 May 2014 20:01:52 -0600 Subject: [PATCH 39/62] block: move mm/bounce.c to block/ Continue moving some of the block files that are scattered around. bounce.c contains only code for bouncing the contents of a bio. It's block proper code, not mm code. Suggested-by: Ming Lei Signed-off-by: Jens Axboe --- block/Makefile | 1 + {mm => block}/bounce.c | 0 mm/Makefile | 1 - 3 files changed, 1 insertion(+), 1 deletion(-) rename {mm => block}/bounce.c (100%) diff --git a/block/Makefile b/block/Makefile index b4c4d3b99a2d..a2ce6ac935ec 100644 --- a/block/Makefile +++ b/block/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ partitions/ +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/mm/bounce.c b/block/bounce.c similarity index 100% rename from mm/bounce.c rename to block/bounce.c diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..0173940407f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,7 +30,6 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o From 64b14519e5913e8d4de9f2e5d9ef59abba3ed83d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 May 2014 08:17:35 -0600 Subject: [PATCH 40/62] htmldocs: fix bio.c location Commit f9c78b2be2ca moved bio.c from fs/ to block/, but didn't update the docbook location. Fix that up. Signed-off-by: Jens Axboe --- Documentation/DocBook/filesystems.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl index 4f676838da06..bcdfdb9a9277 100644 --- a/Documentation/DocBook/filesystems.tmpl +++ b/Documentation/DocBook/filesystems.tmpl @@ -62,7 +62,7 @@ !Efs/mpage.c !Efs/namei.c !Efs/buffer.c -!Efs/bio.c +!Eblock/bio.c !Efs/seq_file.c !Efs/filesystems.c !Efs/fs-writeback.c From e3a2b3f931f59d5284abd13faf8bded726884ffd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 May 2014 11:49:02 -0600 Subject: [PATCH 41/62] blk-mq: allow changing of queue depth through sysfs For request_fn based devices, the block layer exports a 'nr_requests' file through sysfs to allow adjusting of queue depth on the fly. Currently this returns -EINVAL for blk-mq, since it's not wired up. Wire this up for blk-mq, so that it now also always dynamic adjustments of the allowed queue depth for any given block device managed by blk-mq. Signed-off-by: Jens Axboe --- block/blk-core.c | 41 ++++++++++++++++++++++ block/blk-mq-tag.c | 80 ++++++++++++++++++++++++++++++------------ block/blk-mq-tag.h | 1 + block/blk-mq.c | 22 ++++++++++++ block/blk-mq.h | 1 + block/blk-sysfs.c | 43 +++++------------------ block/blk.h | 2 ++ include/linux/blk-mq.h | 2 +- 8 files changed, 133 insertions(+), 59 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a6bd3e702201..fe81e19099a1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -848,6 +848,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) __freed_request(rl, sync ^ 1); } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_ASYNC); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; +} + /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e6b3fbae9862..f6dea968b710 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -57,23 +57,13 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) } /* - * If a previously busy queue goes inactive, potential waiters could now - * be allowed to queue. Wake them up and check. + * Wakeup all potentially sleeping on normal (non-reserved) tags */ -void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) { - struct blk_mq_tags *tags = hctx->tags; struct blk_mq_bitmap_tags *bt; int i, wake_index; - if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return; - - atomic_dec(&tags->active_queues); - - /* - * Will only throttle depth on non-reserved tags - */ bt = &tags->bitmap_tags; wake_index = bt->wake_index; for (i = 0; i < BT_WAIT_QUEUES; i++) { @@ -86,6 +76,22 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) } } +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. + */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + blk_mq_tag_wakeup_all(tags); +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. @@ -408,6 +414,28 @@ static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) return bt->depth - used; } +static void bt_update_count(struct blk_mq_bitmap_tags *bt, + unsigned int depth) +{ + unsigned int tags_per_word = 1U << bt->bits_per_word; + unsigned int map_depth = depth; + + if (depth) { + int i; + + for (i = 0; i < bt->map_nr; i++) { + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= bt->map[i].depth; + } + } + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / 4) + bt->wake_cnt = max(1U, depth / 4); + + bt->depth = depth; +} + static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, int node, bool reserved) { @@ -420,7 +448,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, * condition. */ if (depth) { - unsigned int nr, i, map_depth, tags_per_word; + unsigned int nr, tags_per_word; tags_per_word = (1 << bt->bits_per_word); @@ -444,11 +472,6 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, return -ENOMEM; bt->map_nr = nr; - map_depth = depth; - for (i = 0; i < nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= tags_per_word; - } } bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); @@ -460,11 +483,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, for (i = 0; i < BT_WAIT_QUEUES; i++) init_waitqueue_head(&bt->bs[i].wait); - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / 4) - bt->wake_cnt = max(1U, depth / 4); - - bt->depth = depth; + bt_update_count(bt, depth); return 0; } @@ -525,6 +544,21 @@ void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) *tag = prandom_u32() % depth; } +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) +{ + tdepth -= tags->nr_reserved_tags; + if (tdepth > tags->nr_tags) + return -EINVAL; + + /* + * Don't need (or can't) update reserved tags here, they remain + * static and should never need resizing. + */ + bt_update_count(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags); + return 0; +} + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index e144f68ec45f..e7ff5ceeeb97 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -55,6 +55,7 @@ extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index 0fbef7e9bef1..7b71ab1b1536 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1789,6 +1789,28 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_free_tag_set); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i, ret; + + if (!set || nr > set->queue_depth) + return -EINVAL; + + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_tag_update_depth(hctx->tags, nr); + if (ret) + break; + } + + if (!ret) + q->nr_requests = nr; + + return ret; +} + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index 5e5a378962b7..7db4fe4bd002 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -32,6 +32,7 @@ void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, struct request *orig_rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); /* * CPU hotplug helpers diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f876dae4..4d6811ac13fd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl; unsigned long nr; - int ret; + int ret, err; - if (!q->request_fn) + if (!q->request_fn && !q->mq_ops) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); + if (q->request_fn) + err = blk_update_nr_requests(q, nr); + else + err = blk_mq_update_nr_requests(q, nr); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; + if (err) + return err; - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } - - spin_unlock_irq(q->queue_lock); return ret; } diff --git a/block/blk.h b/block/blk.h index 95cab70000e3..45385e9abf6f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -188,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +extern int blk_update_nr_requests(struct request_queue *, unsigned int); + /* * Contribute to IO statistics IFF: * diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a06ca7b5ea05..f45424453338 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -63,7 +63,7 @@ struct blk_mq_hw_ctx { struct blk_mq_tag_set { struct blk_mq_ops *ops; unsigned int nr_hw_queues; - unsigned int queue_depth; + unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; unsigned int cmd_size; /* per-request extra data */ int numa_node; From eba7176826ddab1d04c51bb2d5f2bbf22865444c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 May 2014 15:17:27 -0600 Subject: [PATCH 42/62] blk-mq: initialize q->nr_requests after calling blk_queue_make_request() blk_queue_make_requests() overwrites our set value for q->nr_requests, turning it into the default of 128. Set this appropriately after initializing queue values in blk_queue_make_request(). Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7b71ab1b1536..fec8fcc4f8a4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1623,6 +1623,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (set->timeout) blk_queue_rq_timeout(q, set->timeout); + /* + * Do this after blk_queue_make_request() overrides it... + */ + q->nr_requests = set->queue_depth; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); From da41a589f52464e24ddefe76814ee35bfb07950c Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 20 May 2014 16:46:26 -0500 Subject: [PATCH 43/62] blk-mq: Micro-optimize blk_queue_nomerges() check In blk_mq_make_request(), do the blk_queue_nomerges() check outside the call to blk_attempt_plug_merge() to eliminate function call overhead when nomerges=2 (disabled) Signed-off-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++---- block/blk-mq.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index fe81e19099a1..5b6f768a7c01 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1471,6 +1471,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count) @@ -1480,9 +1482,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool ret = false; struct list_head *plug_list; - if (blk_queue_nomerges(q)) - goto out; - plug = current->plug; if (!plug) goto out; @@ -1561,7 +1560,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (blk_attempt_plug_merge(q, bio, &request_count)) + if (!blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index fec8fcc4f8a4..ef7ed5e95d6d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1087,7 +1087,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) + if (use_plug && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; if (blk_mq_queue_enter(q)) { From e814e71ba4a6e1d7509b0f4b1928365ea650cace Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 May 2014 13:59:08 -0600 Subject: [PATCH 44/62] blk-mq: allow the hctx cpu hotplug notifier to return errors Prepare this for the next patch which adds more smarts in the plugging logic, so that we can save some memory. Signed-off-by: Jens Axboe --- block/blk-mq-cpu.c | 12 ++++++++---- block/blk-mq.c | 9 +++++---- block/blk-mq.h | 2 +- include/linux/blk-mq.h | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index 136ef8643bba..d2c253f71b86 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -18,14 +18,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, { unsigned int cpu = (unsigned long) hcpu; struct blk_mq_cpu_notifier *notify; + int ret = NOTIFY_OK; raw_spin_lock(&blk_mq_cpu_notify_lock); - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) - notify->notify(notify->data, action, cpu); + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { + ret = notify->notify(notify->data, action, cpu); + if (ret != NOTIFY_OK) + break; + } raw_spin_unlock(&blk_mq_cpu_notify_lock); - return NOTIFY_OK; + return ret; } void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) @@ -45,7 +49,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) } void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data) { notifier->notify = fn; diff --git a/block/blk-mq.c b/block/blk-mq.c index ef7ed5e95d6d..5a3683fc5bdb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1196,8 +1196,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, } EXPORT_SYMBOL(blk_mq_free_single_hw_queue); -static void blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static int blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; @@ -1205,7 +1205,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, LIST_HEAD(tmp); if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return; + return NOTIFY_OK; /* * Move ctx entries to new CPU, if this one is going away. @@ -1220,7 +1220,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return; + return NOTIFY_OK; ctx = blk_mq_get_ctx(q); spin_lock(&ctx->lock); @@ -1240,6 +1240,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, blk_mq_run_hw_queue(hctx, true); blk_mq_put_ctx(ctx); + return NOTIFY_OK; } static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, diff --git a/block/blk-mq.h b/block/blk-mq.h index 7db4fe4bd002..491dbd4e93f5 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -39,7 +39,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); */ struct blk_mq_cpu_notifier; void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data); void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f45424453338..4d2800567aad 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,7 +8,7 @@ struct blk_mq_tags; struct blk_mq_cpu_notifier { struct list_head list; void *data; - void (*notify)(void *data, unsigned long action, unsigned int cpu); + int (*notify)(void *data, unsigned long action, unsigned int cpu); }; struct blk_mq_ctxmap { From 484b4061e6683e0e6a09c7455f80781128dc8a6b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 May 2014 14:01:15 -0600 Subject: [PATCH 45/62] blk-mq: save memory by freeing requests on unused hardware queues Depending on the topology of the machine and the number of queues exposed by a device, we can end up in a situation where some of the hardware queues are unused (as in, they don't map to any software queues). For this case, free up the memory used by the request map, as we will not use it. This can be a substantial amount of memory, depending on the number of queues vs CPUs and the queue depth of the device. Signed-off-by: Jens Axboe --- block/blk-mq.c | 157 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 52 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5a3683fc5bdb..103aa1dbc000 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -597,8 +597,16 @@ static void blk_mq_rq_timer(unsigned long data) unsigned long next = 0; int i, next_set = 0; - queue_for_each_hw_ctx(q, hctx, i) + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!hctx->nr_ctx || !hctx->tags) + continue; + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + } if (next_set) { next = blk_rq_timeout(round_jiffies_up(next)); @@ -1196,53 +1204,6 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, } EXPORT_SYMBOL(blk_mq_free_single_hw_queue); -static int blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) -{ - struct blk_mq_hw_ctx *hctx = data; - struct request_queue *q = hctx->queue; - struct blk_mq_ctx *ctx; - LIST_HEAD(tmp); - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; - - /* - * Move ctx entries to new CPU, if this one is going away. - */ - ctx = __blk_mq_get_ctx(q, cpu); - - spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); - blk_mq_hctx_clear_pending(hctx, ctx); - } - spin_unlock(&ctx->lock); - - if (list_empty(&tmp)) - return NOTIFY_OK; - - ctx = blk_mq_get_ctx(q); - spin_lock(&ctx->lock); - - while (!list_empty(&tmp)) { - struct request *rq; - - rq = list_first_entry(&tmp, struct request, queuelist); - rq->mq_ctx = ctx; - list_move_tail(&rq->queuelist, &ctx->rq_list); - } - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_hctx_mark_pending(hctx, ctx); - - spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, true); - blk_mq_put_ctx(ctx); - return NOTIFY_OK; -} - static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -1384,6 +1345,77 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) return 0; } +static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + LIST_HEAD(tmp); + + /* + * Move ctx entries to new CPU, if this one is going away. + */ + ctx = __blk_mq_get_ctx(q, cpu); + + spin_lock(&ctx->lock); + if (!list_empty(&ctx->rq_list)) { + list_splice_init(&ctx->rq_list, &tmp); + blk_mq_hctx_clear_pending(hctx, ctx); + } + spin_unlock(&ctx->lock); + + if (list_empty(&tmp)) + return NOTIFY_OK; + + ctx = blk_mq_get_ctx(q); + spin_lock(&ctx->lock); + + while (!list_empty(&tmp)) { + struct request *rq; + + rq = list_first_entry(&tmp, struct request, queuelist); + rq->mq_ctx = ctx; + list_move_tail(&rq->queuelist, &ctx->rq_list); + } + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_hctx_mark_pending(hctx, ctx); + + spin_unlock(&ctx->lock); + + blk_mq_run_hw_queue(hctx, true); + blk_mq_put_ctx(ctx); + return NOTIFY_OK; +} + +static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (set->tags[hctx->queue_num]) + return NOTIFY_OK; + + set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); + if (!set->tags[hctx->queue_num]) + return NOTIFY_STOP; + + hctx->tags = set->tags[hctx->queue_num]; + return NOTIFY_OK; +} + +static int blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) +{ + struct blk_mq_hw_ctx *hctx = data; + + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + return blk_mq_hctx_cpu_offline(hctx, cpu); + else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + return blk_mq_hctx_cpu_online(hctx, cpu); + + return NOTIFY_OK; +} + static int blk_mq_init_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set) { @@ -1513,6 +1545,24 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are mapped to this hardware queue, + * disable it and free the request entries + */ + if (!hctx->nr_ctx) { + struct blk_mq_tag_set *set = q->tag_set; + + if (set->tags[i]) { + blk_mq_free_rq_map(set, set->tags[i], i); + set->tags[i] = NULL; + hctx->tags = NULL; + } + continue; + } + + /* + * Initialize batch roundrobin counts + */ hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } @@ -1645,14 +1695,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (blk_mq_init_hw_queues(q, set)) goto err_flush_rq; - blk_mq_map_swqueue(q); - mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); mutex_unlock(&all_q_mutex); blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q); + return q; err_flush_rq: @@ -1790,8 +1840,11 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_rq_map(set, set->tags[i], i); + for (i = 0; i < set->nr_hw_queues; i++) { + if (set->tags[i]) + blk_mq_free_rq_map(set, set->tags[i], i); + } + kfree(set->tags); } EXPORT_SYMBOL(blk_mq_free_tag_set); From 07068d5b8ed8fa6759b2826ba9197e49b69a1fc3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 May 2014 10:40:51 -0600 Subject: [PATCH 46/62] blk-mq: split make request handler for multi and single queue We want slightly different behavior from them: - On single queue devices, we currently use the per-process plug for deferred IO and for merging. - On multi queue devices, we don't use the per-process plug, but we want to go straight to hardware for SYNC IO. Split blk_mq_make_request() into a blk_sq_make_request() for single queue devices, and retain blk_mq_make_request() for multi queue devices. Then we don't need multiple checks for q->nr_hw_queues in the request mapping. Signed-off-by: Jens Axboe --- block/blk-mq.c | 209 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 158 insertions(+), 51 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 103aa1dbc000..54e78863c083 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1072,43 +1072,57 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + return false; + } else { + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + blk_mq_bio_to_request(rq, bio); + goto insert_rq; + } + + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); + return true; + } +} + +struct blk_map_ctx { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; +}; + +static struct request *blk_mq_map_request(struct request_queue *q, + struct bio *bio, + struct blk_map_ctx *data) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; - const int is_sync = rw_is_sync(bio->bi_rw); - const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - int rw = bio_data_dir(bio); struct request *rq; - unsigned int use_plug, request_count = 0; + int rw = bio_data_dir(bio); - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); - - blk_queue_bounce(q, &bio); - - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + if (unlikely(blk_mq_queue_enter(q))) { bio_endio(bio, -EIO); - return; - } - - if (use_plug && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) - return; - - if (blk_mq_queue_enter(q)) { - bio_endio(bio, -EIO); - return; + return NULL; } ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (is_sync) + if (rw_is_sync(bio->bi_rw)) rw |= REQ_SYNC; + trace_block_getrq(q, bio, rw); rq = __blk_mq_alloc_request(hctx, ctx, GFP_ATOMIC, false); if (likely(rq)) @@ -1123,6 +1137,109 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } hctx->queued++; + data->hctx = hctx; + data->ctx = ctx; + return rq; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. + */ +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + struct blk_map_ctx data; + struct request *rq; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } + + if (is_sync) { + int ret; + + blk_mq_bio_to_request(rq, bio); + blk_mq_start_request(rq, true); + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(data.hctx, rq); + if (ret == BLK_MQ_RQ_QUEUE_OK) + goto done; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_io(rq, rq->errors); + goto done; + } + } + } + + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } +done: + blk_mq_put_ctx(data.ctx); +} + +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + unsigned int use_plug, request_count = 0; + struct blk_map_ctx data; + struct request *rq; + + /* + * If we have multiple hardware queues, just go directly to + * one of those for sync IO. + */ + use_plug = !is_flush_fua && !is_sync; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + + if (use_plug && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; + + rq = blk_mq_map_request(q, bio, &data); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1147,37 +1264,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(data.ctx); return; } } - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { - blk_mq_bio_to_request(rq, bio); - spin_lock(&ctx->lock); -insert_rq: - __blk_mq_insert_request(hctx, rq, false); - spin_unlock(&ctx->lock); - } else { - spin_lock(&ctx->lock); - if (!blk_mq_attempt_merge(q, ctx, bio)) { - blk_mq_bio_to_request(rq, bio); - goto insert_rq; - } - - spin_unlock(&ctx->lock); - __blk_mq_free_request(hctx, ctx, rq); + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } - - /* - * For a SYNC request, send it to the hardware immediately. For an - * ASYNC request, just ensure that we run it later on. The latter - * allows for merging opportunities and more efficient dispatching. - */ -run_queue: - blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(data.ctx); } /* @@ -1670,7 +1773,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) q->sg_reserved_size = INT_MAX; - blk_queue_make_request(q, blk_mq_make_request); + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); if (set->timeout) blk_queue_rq_timeout(q, set->timeout); From edf866b3805c5651bf7d035b72dc0190cb6ff4a7 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Fri, 23 May 2014 13:30:16 -0600 Subject: [PATCH 47/62] blk-mq: export blk_mq_tag_busy_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export the blk-mq in-flight tag iterator for driver consumption. This is particularly useful in exception paths or SRSI where in-flight IOs need to be cancelled and/or reissued. The NVMe driver conversion will use this. Signed-off-by: Sam Bradshaw Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 1 + block/blk-mq-tag.h | 1 - include/linux/blk-mq.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f6dea968b710..05e2baf4fa0d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -400,6 +400,7 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, fn(data, tag_map); kfree(tag_map); } +EXPORT_SYMBOL(blk_mq_tag_busy_iter); static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) { diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index e7ff5ceeeb97..2e5e6872d089 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -51,7 +51,6 @@ extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved); extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); -extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4d2800567aad..f76bb18350af 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -181,6 +181,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); /* * Driver command data is immediately after the request. So subtract request From c22d9d8a60646a1521ae12a2176f58da19afa186 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 May 2014 14:14:57 -0600 Subject: [PATCH 48/62] blk-mq: allow setting of per-request timeouts Currently blk-mq uses the queue timeout for all requests. But for some commands, drivers may want to set a specific timeout for special requests. Allow this to be passed in through request->timeout, and use it if set. Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 54e78863c083..9bc075335d06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -454,9 +454,13 @@ static void blk_mq_start_request(struct request *rq, bool last) /* * Just mark start time and set the started bit. Due to memory * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. + * REQ_ATOMIC_STARTED is seen. Use the default queue timeout, + * unless one has been set in the request. */ - rq->deadline = jiffies + q->rq_timeout; + if (!rq->timeout) + rq->deadline = jiffies + q->rq_timeout; + else + rq->deadline = jiffies + rq->timeout; /* * Mark us as started and clear complete. Complete might have been From 19c5d84f14d2b01fe4c24e1444142f69e6dc08f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 May 2014 11:45:02 +0200 Subject: [PATCH 49/62] blk-mq: idle all hardware contexts before freeing a queue Without this we can leak the active_queues reference if a command is freed while it is considered active. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9bc075335d06..62082c5daae7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1844,6 +1844,7 @@ void blk_mq_free_queue(struct request_queue *q) blk_mq_del_queue_tag_set(q); queue_for_each_hw_ctx(q, hctx, i) { + blk_mq_tag_idle(hctx); kfree(hctx->ctxs); blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); if (q->mq_ops->exit_hctx) From 35086784caec571be185f643eb1b045a275d60b3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 26 May 2014 22:19:14 +0200 Subject: [PATCH 50/62] block/blk-lib.c: make __blkdev_issue_zeroout static __blkdev_issue_zeroout is only used in blk-lib.c Cc: Jens Axboe Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Jens Axboe --- block/blk-lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 97a733cf3d5f..8411be3c19d3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same); * Generate and issue number of bios with zerofiled pages. */ -int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; From 1f9f07e917f43af420f8cb3afc2b3fa703cea6e9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 May 2014 08:34:45 -0600 Subject: [PATCH 51/62] blk-mq: fix leak of hctx->ctx_map hctx->ctx_map should have been freed inside blk_mq_free_queue(). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 62082c5daae7..dad22a9abe49 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1846,6 +1846,7 @@ void blk_mq_free_queue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { blk_mq_tag_idle(hctx); kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); if (q->mq_ops->exit_hctx) q->mq_ops->exit_hctx(hctx, i); From 624dbe47541643b72868a59b2c0059bb53dc923f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 May 2014 23:35:13 +0800 Subject: [PATCH 52/62] blk-mq: avoid code duplication blk_mq_exit_hw_queues() and blk_mq_free_hw_queues() are introduced to avoid code duplication. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 61 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index dad22a9abe49..07851753a049 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1523,11 +1523,43 @@ static int blk_mq_hctx_notify(void *data, unsigned long action, return NOTIFY_OK; } +static void blk_mq_exit_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set, int nr_queue) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (i == nr_queue) + break; + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, i); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); + } + +} + +static void blk_mq_free_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) { + free_cpumask_var(hctx->cpumask); + set->ops->free_hctx(hctx, i); + } +} + static int blk_mq_init_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx *hctx; - unsigned int i, j; + unsigned int i; /* * Initialize hardware queues @@ -1579,17 +1611,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, /* * Init failed */ - queue_for_each_hw_ctx(q, hctx, j) { - if (i == j) - break; - - if (set->ops->exit_hctx) - set->ops->exit_hctx(hctx, j); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - kfree(hctx->ctxs); - blk_mq_free_bitmap(&hctx->ctx_map); - } + blk_mq_exit_hw_queues(q, set, i); return 1; } @@ -1838,21 +1860,12 @@ EXPORT_SYMBOL(blk_mq_init_queue); void blk_mq_free_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - int i; + struct blk_mq_tag_set *set = q->tag_set; blk_mq_del_queue_tag_set(q); - queue_for_each_hw_ctx(q, hctx, i) { - blk_mq_tag_idle(hctx); - kfree(hctx->ctxs); - blk_mq_free_bitmap(&hctx->ctx_map); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (q->mq_ops->exit_hctx) - q->mq_ops->exit_hctx(hctx, i); - free_cpumask_var(hctx->cpumask); - q->mq_ops->free_hctx(hctx, i); - } + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + blk_mq_free_hw_queues(q, set); free_percpu(q->queue_ctx); kfree(q->queue_hw_ctx); From 3d2936f457a847d9d88a9cc127e0eb7a0ebba0ff Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 May 2014 23:35:14 +0800 Subject: [PATCH 53/62] block: only allocate/free mq_usage_counter in blk-mq The percpu counter is only used for blk-mq, so move its allocation and free inside blk-mq, and don't allocate it for legacy queue device. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +------ block/blk-mq.c | 5 +++++ block/blk-sysfs.c | 2 -- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5b6f768a7c01..29d5fbafd94a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -576,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - if (percpu_counter_init(&q->mq_usage_counter, 0)) - goto fail_q; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_c; + goto fail_q; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -639,8 +636,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); -fail_c: - percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; diff --git a/block/blk-mq.c b/block/blk-mq.c index 07851753a049..e8b5f74dc1a1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1781,6 +1781,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; + if (percpu_counter_init(&q->mq_usage_counter, 0)) + goto err_map; + q->mq_map = blk_mq_make_queue_map(set); if (!q->mq_map) goto err_map; @@ -1867,6 +1870,8 @@ void blk_mq_free_queue(struct request_queue *q) blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); + percpu_counter_destroy(&q->mq_usage_counter); + free_percpu(q->queue_ctx); kfree(q->queue_hw_ctx); kfree(q->mq_map); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4d6811ac13fd..23321fbab293 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - percpu_counter_destroy(&q->mq_usage_counter); - if (q->mq_ops) blk_mq_free_queue(q); From f14bbe77a96bb979dc539d8308ee18a9363a544f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 May 2014 12:06:53 -0600 Subject: [PATCH 54/62] blk-mq: pass in suggested NUMA node to ->alloc_hctx() Drivers currently have to figure this out on their own, and they are missing information to do it properly. The ones that did attempt to do it, do it wrong. So just pass in the suggested node directly to the alloc function. Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 16 ++++++++++++++++ block/blk-mq.c | 26 +++++++++++++++----------- block/blk-mq.h | 1 + drivers/block/null_blk.c | 35 +++-------------------------------- include/linux/blk-mq.h | 4 ++-- 5 files changed, 37 insertions(+), 45 deletions(-) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 5d0f93cf358c..0daacb927be1 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -96,3 +96,19 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) kfree(map); return NULL; } + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. + */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == mq_map[i]) + return cpu_to_node(i); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq.c b/block/blk-mq.c index e8b5f74dc1a1..30bad930e661 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1297,10 +1297,10 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) EXPORT_SYMBOL(blk_mq_map_queue); struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set, - unsigned int hctx_index) + unsigned int hctx_index, + int node) { - return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, - set->numa_node); + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); } EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); @@ -1752,6 +1752,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) struct blk_mq_hw_ctx **hctxs; struct blk_mq_ctx *ctx; struct request_queue *q; + unsigned int *map; int i; ctx = alloc_percpu(struct blk_mq_ctx); @@ -1764,8 +1765,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!hctxs) goto err_percpu; + map = blk_mq_make_queue_map(set); + if (!map) + goto err_map; + for (i = 0; i < set->nr_hw_queues; i++) { - hctxs[i] = set->ops->alloc_hctx(set, i); + int node = blk_mq_hw_queue_to_node(map, i); + + hctxs[i] = set->ops->alloc_hctx(set, i, node); if (!hctxs[i]) goto err_hctxs; @@ -1773,7 +1780,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) goto err_hctxs; atomic_set(&hctxs[i]->nr_active, 0); - hctxs[i]->numa_node = NUMA_NO_NODE; + hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; } @@ -1784,15 +1791,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (percpu_counter_init(&q->mq_usage_counter, 0)) goto err_map; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) - goto err_map; - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, 30000); q->nr_queues = nr_cpu_ids; q->nr_hw_queues = set->nr_hw_queues; + q->mq_map = map; q->queue_ctx = ctx; q->queue_hw_ctx = hctxs; @@ -1844,16 +1848,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) err_flush_rq: kfree(q->flush_rq); err_hw: - kfree(q->mq_map); -err_map: blk_cleanup_queue(q); err_hctxs: + kfree(map); for (i = 0; i < set->nr_hw_queues; i++) { if (!hctxs[i]) break; free_cpumask_var(hctxs[i]->cpumask); set->ops->free_hctx(hctxs[i], i); } +err_map: kfree(hctxs); err_percpu: free_percpu(ctx); diff --git a/block/blk-mq.h b/block/blk-mq.h index 491dbd4e93f5..ff5e6bf0f691 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -52,6 +52,7 @@ void blk_mq_disable_hotplug(void); */ extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); /* * Basic implementation of sparser bitmap, allowing the user to spread diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8e7e3a0b0d24..4d33c8c25fbf 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -322,39 +322,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) } static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set, - unsigned int hctx_index) + unsigned int hctx_index, + int node) { - int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes); - int tip = (set->nr_hw_queues % nr_online_nodes); - int node = 0, i, n; - - /* - * Split submit queues evenly wrt to the number of nodes. If uneven, - * fill the first buckets with one extra, until the rest is filled with - * no extra. - */ - for (i = 0, n = 1; i < hctx_index; i++, n++) { - if (n % b_size == 0) { - n = 0; - node++; - - tip--; - if (!tip) - b_size = set->nr_hw_queues / nr_online_nodes; - } - } - - /* - * A node might not be online, therefore map the relative node id to the - * real node id. - */ - for_each_online_node(n) { - if (!node) - break; - node--; - } - - return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n); + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); } static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f76bb18350af..afeb93496907 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -80,7 +80,7 @@ struct blk_mq_tag_set { typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *, - unsigned int); + unsigned int, int); typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -165,7 +165,7 @@ struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, g struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int); +struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); void blk_mq_end_io(struct request *rq, int error); From 95f096849932fe5eaa7bfec887530cf556744a76 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 May 2014 17:46:48 -0600 Subject: [PATCH 55/62] blk-mq: allow non-softirq completions Right now we export two ways of completing a request: 1) blk_mq_complete_request(). This uses an IPI (if needed) and completes through q->softirq_done_fn(). It also works with timeouts. 2) blk_mq_end_io(). This completes inline, and ignores any timeout state of the request. Let blk_mq_complete_request() handle non-softirq_done_fn completions as well, by just completing inline. If a driver has enough completion ports to place completions correctly, it need not define a mq_ops->complete() and we can avoid an indirect function call by doing the completion inline. Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++++--- include/linux/blk-mq.h | 4 ++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 30bad930e661..010b878d53b3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -434,10 +434,16 @@ void __blk_mq_complete_request(struct request *rq) **/ void blk_mq_complete_request(struct request *rq) { - if (unlikely(blk_should_fake_timeout(rq->q))) + struct request_queue *q = rq->q; + + if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) - __blk_mq_complete_request(rq); + if (!blk_mark_rq_complete(rq)) { + if (q->softirq_done_fn) + __blk_mq_complete_request(rq); + else + blk_mq_end_io(rq, rq->errors); + } } EXPORT_SYMBOL(blk_mq_complete_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index afeb93496907..1dfeb1529a61 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -173,6 +173,10 @@ void __blk_mq_end_io(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); +/* + * Complete request through potential IPI for right placement. Driver must + * have defined a mq_ops->complete() hook for this. + */ void blk_mq_complete_request(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); From 7738dac4f697ffbd0ed4c4aeb69a714ef9d876da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2014 08:06:34 -0600 Subject: [PATCH 56/62] blk-mq: remove stale comment for blk_mq_complete_request() It works for both IPI and local completions as of commit 95f096849932. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1dfeb1529a61..5b171fbe95c5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -172,11 +172,6 @@ void blk_mq_end_io(struct request *rq, int error); void __blk_mq_end_io(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); - -/* - * Complete request through potential IPI for right placement. Driver must - * have defined a mq_ops->complete() hook for this. - */ void blk_mq_complete_request(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); From 6fca6a611c27f1f0d90fbe1cc3c229dbf8c09e48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 May 2014 08:08:02 -0600 Subject: [PATCH 57/62] blk-mq: add helper to insert requests from irq context Both the cache flush state machine and the SCSI midlayer want to submit requests from irq context, and the current per-request requeue_work unfortunately causes corruption due to sharing with the csd field for flushes. Replace them with a per-request_queue list of requests to be requeued. Based on an earlier test by Ming Lei. Signed-off-by: Christoph Hellwig Reported-by: Ming Lei Tested-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 16 +++-------- block/blk-mq.c | 64 +++++++++++++++++++++++++++++++++++++++++- include/linux/blk-mq.h | 2 ++ include/linux/blkdev.h | 5 +++- 4 files changed, 73 insertions(+), 14 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index ec7a224d6733..ef608b35d9be 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_run(struct work_struct *work) -{ - struct request *rq; - - rq = container_of(work, struct request, requeue_work); - - memset(&rq->csd, 0, sizeof(rq->csd)); - blk_mq_insert_request(rq, false, true, false); -} - static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - INIT_WORK(&rq->requeue_work, mq_flush_run); - kblockd_schedule_work(&rq->requeue_work); + struct request_queue *q = rq->q; + + blk_mq_add_to_requeue_list(rq, add_front); + blk_mq_kick_requeue_list(q); return false; } else { if (add_front) diff --git a/block/blk-mq.c b/block/blk-mq.c index 010b878d53b3..67066ecc79c0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -516,10 +516,68 @@ void blk_mq_requeue_request(struct request *rq) blk_clear_rq_complete(rq); BUG_ON(blk_queued_rq(rq)); - blk_mq_insert_request(rq, true, true, false); + blk_mq_add_to_requeue_list(rq, true); } EXPORT_SYMBOL(blk_mq_requeue_request); +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work); + LIST_HEAD(rq_list); + struct request *rq, *next; + unsigned long flags; + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + continue; + + rq->cmd_flags &= ~REQ_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, false, false, false); + } + + blk_mq_run_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertation from the workqueue. + */ + BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->cmd_flags |= REQ_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { return tags->rqs[tag]; @@ -1812,6 +1870,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) q->sg_reserved_size = INT_MAX; + INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->requeue_list); + spin_lock_init(&q->requeue_lock); + if (q->nr_hw_queues > 1) blk_queue_make_request(q, blk_mq_make_request); else diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b171fbe95c5..b9a74a386dbc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -172,6 +172,8 @@ void blk_mq_end_io(struct request *rq, int error); void __blk_mq_end_io(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); +void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6bc011a09e82..913f1c2d3be0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -99,7 +99,6 @@ struct request { struct list_head queuelist; union { struct call_single_data csd; - struct work_struct requeue_work; unsigned long fifo_time; }; @@ -463,6 +462,10 @@ struct request_queue { struct request *flush_rq; spinlock_t mq_flush_lock; + struct list_head requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; + struct mutex sysfs_lock; int bypass_depth; From 4ce01dd1a07d9cf3eaf44fbf4ea9a61b11badccc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 May 2014 20:59:46 +0200 Subject: [PATCH 58/62] blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request Instead of having two almost identical copies of the same code just let the callers pass in the reserved flag directly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 20 +++----------------- include/linux/blk-mq.h | 4 ++-- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 29d5fbafd94a..d87be5b4e554 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1173,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask); + return blk_mq_alloc_request(q, rw, gfp_mask, false); else return blk_old_get_request(q, rw, gfp_mask); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 67066ecc79c0..63d581d72a70 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -294,35 +294,21 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, return rq; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, + bool reserved) { struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); + rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); if (rq) blk_mq_put_ctx(rq->mq_ctx); return rq; } EXPORT_SYMBOL(blk_mq_alloc_request); -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, - gfp_t gfp) -{ - struct request *rq; - - if (blk_mq_queue_enter(q)) - return NULL; - - rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); - return rq; -} -EXPORT_SYMBOL(blk_mq_alloc_reserved_request); - static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b9a74a386dbc..2bd82f399128 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -160,8 +160,8 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + gfp_t gfp, bool reserved); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); From 5dee857720db15e2c8ef0c03f7eeac00c4c63cb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 May 2014 20:59:47 +0200 Subject: [PATCH 59/62] blk-mq: initialize request in __blk_mq_alloc_request Both callers if __blk_mq_alloc_request want to initialize the request, so lift it into the common path. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 62 ++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 63d581d72a70..04ef7ecb3c7f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -92,30 +92,6 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - gfp_t gfp, bool reserved) -{ - struct request *rq; - unsigned int tag; - - tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); - if (tag != BLK_MQ_TAG_FAIL) { - rq = hctx->tags->rqs[tag]; - - rq->cmd_flags = 0; - if (blk_mq_tag_busy(hctx)) { - rq->cmd_flags = REQ_MQ_INFLIGHT; - atomic_inc(&hctx->nr_active); - } - - rq->tag = tag; - return rq; - } - - return NULL; -} - static int blk_mq_queue_enter(struct request_queue *q) { int ret; @@ -263,6 +239,32 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, ctx->rq_dispatched[rw_is_sync(rw_flags)]++; } +static struct request * +__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved) +{ + struct request *rq; + unsigned int tag; + + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); + if (tag != BLK_MQ_TAG_FAIL) { + rq = hctx->tags->rqs[tag]; + + rq->cmd_flags = 0; + if (blk_mq_tag_busy(hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); + } + + rq->tag = tag; + blk_mq_rq_ctx_init(q, ctx, rq, rw); + return rq; + } + + return NULL; +} + + static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, int rw, gfp_t gfp, bool reserved) @@ -273,12 +275,10 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, ctx, gfp & ~__GFP_WAIT, + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, reserved); - if (rq) { - blk_mq_rq_ctx_init(q, ctx, rq, rw); + if (rq) break; - } if (gfp & __GFP_WAIT) { __blk_mq_run_hw_queue(hctx); @@ -1178,10 +1178,8 @@ static struct request *blk_mq_map_request(struct request_queue *q, rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, ctx, GFP_ATOMIC, false); - if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); - else { + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false); + if (unlikely(!rq)) { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, From a3bd77567cae6af700dcd245148befc73fc89a50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 May 2014 20:59:48 +0200 Subject: [PATCH 60/62] blk-mq: remove blk_mq_wait_for_tags The current logic for blocking tag allocation is rather confusing, as we first allocated and then free again a tag in blk_mq_wait_for_tags, just to attempt a non-blocking allocation and then repeat if someone else managed to grab the tag before us. Instead change blk_mq_alloc_request_pinned to simply do a blocking tag allocation itself and use the request we get back from it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 8 -------- block/blk-mq-tag.h | 1 - block/blk-mq.c | 13 ++++++------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 05e2baf4fa0d..0d0640d38a06 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -7,14 +7,6 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved) -{ - int tag, zero = 0; - - tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved); - blk_mq_put_tag(hctx, tag, &zero); -} - static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) { int i; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 2e5e6872d089..c959de58d2a5 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -49,7 +49,6 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved); extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); diff --git a/block/blk-mq.c b/block/blk-mq.c index 04ef7ecb3c7f..3224888d329a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -264,31 +264,30 @@ __blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, return NULL; } - static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, int rw, gfp_t gfp, bool reserved) { + bool gfp_mask = gfp & ~__GFP_WAIT; struct request *rq; do { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp_mask, reserved); if (rq) break; - if (gfp & __GFP_WAIT) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - } else { + if (!(gfp & __GFP_WAIT)) { blk_mq_put_ctx(ctx); break; } - blk_mq_wait_for_tags(hctx, reserved); + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + gfp_mask = gfp; } while (1); return rq; From 793597a6a95675f4f85671cf747c1d92e7dbc295 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 May 2014 20:59:49 +0200 Subject: [PATCH 61/62] blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request We already do a non-blocking allocation in blk_mq_map_request, no need to repeat it. Just call __blk_mq_alloc_request to wait directly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3224888d329a..43f0c8ffa92a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1179,12 +1179,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, trace_block_getrq(q, bio, rw); rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false); if (unlikely(!rq)) { + __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); - ctx = rq->mq_ctx; + + ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, + __GFP_WAIT|GFP_ATOMIC, false); } hctx->queued++; From d852564f8c88b0604490234fdeeb6fb47e4bcc7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 May 2014 20:59:50 +0200 Subject: [PATCH 62/62] blk-mq: remove blk_mq_alloc_request_pinned We now only have one caller left and can open code it there in a cleaner way. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 48 ++++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 43f0c8ffa92a..ae14749b530c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -264,46 +264,30 @@ __blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, return NULL; } -static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, - int rw, gfp_t gfp, - bool reserved) -{ - bool gfp_mask = gfp & ~__GFP_WAIT; - struct request *rq; - - do { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - - rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp_mask, - reserved); - if (rq) - break; - - if (!(gfp & __GFP_WAIT)) { - blk_mq_put_ctx(ctx); - break; - } - - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - gfp_mask = gfp; - } while (1); - - return rq; -} - struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved) { + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, + reserved); + if (!rq && (gfp & __GFP_WAIT)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved); + } + blk_mq_put_ctx(ctx); return rq; } EXPORT_SYMBOL(blk_mq_alloc_request);