diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl index 4f676838da06..bcdfdb9a9277 100644 --- a/Documentation/DocBook/filesystems.tmpl +++ b/Documentation/DocBook/filesystems.tmpl @@ -62,7 +62,7 @@ !Efs/mpage.c !Efs/namei.c !Efs/buffer.c -!Efs/bio.c +!Eblock/bio.c !Efs/seq_file.c !Efs/filesystems.c !Efs/fs-writeback.c diff --git a/block/Makefile b/block/Makefile index 20645e88fb57..a2ce6ac935ec 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,13 +2,15 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + partitions/ +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o diff --git a/fs/bio-integrity.c b/block/bio-integrity.c similarity index 99% rename from fs/bio-integrity.c rename to block/bio-integrity.c index 1c2ce0c87711..9e241063a616 100644 --- a/fs/bio-integrity.c +++ b/block/bio-integrity.c @@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) if (!bs->bio_integrity_pool) return -1; - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + bs->bvec_integrity_pool = biovec_create_pool(pool_size); if (!bs->bvec_integrity_pool) { mempool_destroy(bs->bio_integrity_pool); return -1; diff --git a/fs/bio.c b/block/bio.c similarity index 99% rename from fs/bio.c rename to block/bio.c index 6f0362b77806..96d28eee8a1e 100644 --- a/fs/bio.c +++ b/block/bio.c @@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error) /** * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have @@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, bio->bi_private = bmd; } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { if (iov_count > UIO_MAXIOV) @@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (offset) nr_pages++; - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); + bmd = bio_alloc_map_data(iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); @@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. 
*/ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; @@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - bs->bvec_pool = biovec_create_pool(bs, pool_size); + bs->bvec_pool = biovec_create_pool(pool_size); if (!bs->bvec_pool) goto bad; diff --git a/block/blk-core.c b/block/blk-core.c index c4269701cb4f..d87be5b4e554 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -576,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - if (percpu_counter_init(&q->mq_usage_counter, 0)) - goto fail_q; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_c; + goto fail_q; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -639,8 +636,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); -fail_c: - percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -848,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) __freed_request(rl, sync ^ 1); } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_ASYNC); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; +} + /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. 
@@ -1137,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask); + return blk_mq_alloc_request(q, rw, gfp_mask, false); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1233,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1427,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count) @@ -1436,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool ret = false; struct list_head *plug_list; - if (blk_queue_nomerges(q)) - goto out; - plug = current->plug; if (!plug) goto out; @@ -1517,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (blk_attempt_plug_merge(q, bio, &request_count)) + if (!blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); diff --git a/block/blk-flush.c b/block/blk-flush.c index ec7a224d6733..ef608b35d9be 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_run(struct work_struct *work) -{ - struct request *rq; - - rq = container_of(work, struct request, requeue_work); - - memset(&rq->csd, 0, sizeof(rq->csd)); - blk_mq_insert_request(rq, false, true, false); -} - static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - INIT_WORK(&rq->requeue_work, mq_flush_run); - kblockd_schedule_work(&rq->requeue_work); + struct request_queue *q = rq->q; + + blk_mq_add_to_requeue_list(rq, add_front); + blk_mq_kick_requeue_list(q); return false; } else { if (add_front) diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index c11d24e379e2..d828b44a404b 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete); * iopoll handler will not be invoked again before blk_iopoll_sched_prep() * is called. **/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) +void blk_iopoll_complete(struct blk_iopoll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iopoll); + __blk_iopoll_complete(iop); local_irq_restore(flags); } EXPORT_SYMBOL(blk_iopoll_complete); diff --git a/block/blk-lib.c b/block/blk-lib.c index 97a733cf3d5f..8411be3c19d3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same); * Generate and issue number of bios with zerofiled pages. 
*/ -int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index 136ef8643bba..d2c253f71b86 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -18,14 +18,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, { unsigned int cpu = (unsigned long) hcpu; struct blk_mq_cpu_notifier *notify; + int ret = NOTIFY_OK; raw_spin_lock(&blk_mq_cpu_notify_lock); - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) - notify->notify(notify->data, action, cpu); + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { + ret = notify->notify(notify->data, action, cpu); + if (ret != NOTIFY_OK) + break; + } raw_spin_unlock(&blk_mq_cpu_notify_lock); - return NOTIFY_OK; + return ret; } void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) @@ -45,7 +49,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) } void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data) { notifier->notify = fn; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 5d0f93cf358c..0daacb927be1 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -96,3 +96,19 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) kfree(map); return NULL; } + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. + */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == mq_map[i]) + return cpu_to_node(i); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 9176a6984857..99a60a829e69 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -203,47 +203,16 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, return ret; } -static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t len) -{ - struct blk_mq_ctx *ctx; - unsigned long ret; - unsigned int i; - - if (kstrtoul(page, 10, &ret)) { - pr_err("blk-mq-sysfs: invalid input '%s'\n", page); - return -EINVAL; - } - - spin_lock(&hctx->lock); - if (ret) - hctx->flags |= BLK_MQ_F_SHOULD_IPI; - else - hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; - spin_unlock(&hctx->lock); - - hctx_for_each_ctx(hctx, ctx, i) - ctx->ipi_redirect = !!ret; - - return len; -} - static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return blk_mq_tag_sysfs_show(hctx->tags, page); } +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); +} + static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { unsigned int i, first = 1; @@ -303,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_dispatched_show, }; +static struct 
blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { + .attr = {.name = "active", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_active_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { - .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, - .show = blk_mq_hw_sysfs_ipi_show, - .store = blk_mq_hw_sysfs_ipi_store, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { .attr = {.name = "tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_tags_show, @@ -326,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_ipi.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, + &blk_mq_hw_sysfs_active.attr, NULL, }; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 7a799c46c32d..0d0640d38a06 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,64 +1,333 @@ #include #include +#include #include #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) { - int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); - blk_mq_put_tag(tags, tag); + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int ret; + + ret = find_first_zero_bit(&bm->word, bm->depth); + if (ret < bm->depth) + return true; + } + + return false; } bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { - return !tags || - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; + if (!tags) + return true; + + return bt_has_free_tags(&tags->bitmap_tags); } -static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +static inline void bt_index_inc(unsigned int *index) +{ + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); +} + +/* + * If a previously inactive queue goes active, bump the active user count. + */ +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); + + return true; +} + +/* + * Wakeup all potentially sleeping on normal (non-reserved) tags + */ +static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) +{ + struct blk_mq_bitmap_tags *bt; + int i, wake_index; + + bt = &tags->bitmap_tags; + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) + wake_up(&bs->wait); + + bt_index_inc(&wake_index); + } +} + +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. + */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + blk_mq_tag_wakeup_all(tags); +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. 
+ */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; +} + +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) +{ + int tag, org_last_tag, end; + + org_last_tag = last_tag; + end = bm->depth; + do { +restart: + tag = find_next_zero_bit(&bm->word, end, last_tag); + if (unlikely(tag >= end)) { + /* + * We started with an offset, start from 0 to + * exhaust the map. + */ + if (org_last_tag && last_tag) { + end = last_tag; + last_tag = 0; + goto restart; + } + return -1; + } + last_tag = tag + 1; + } while (test_and_set_bit_lock(tag, &bm->word)); + + return tag; +} + +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, + unsigned int *tag_cache) +{ + unsigned int last_tag, org_last_tag; + int index, i, tag; + + if (!hctx_may_queue(hctx, bt)) + return -1; + + last_tag = org_last_tag = *tag_cache; + index = TAG_TO_INDEX(bt, last_tag); + + for (i = 0; i < bt->map_nr; i++) { + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + if (tag != -1) { + tag += (index << bt->bits_per_word); + goto done; + } + + last_tag = 0; + if (++index >= bt->map_nr) + index = 0; + } + + *tag_cache = 0; + return -1; + + /* + * Only update the cache from the allocation path, if we ended + * up using the specific cached tag. 
+ */ +done: + if (tag == org_last_tag) { + last_tag = tag + 1; + if (last_tag >= bt->depth - 1) + last_tag = 0; + + *tag_cache = last_tag; + } + + return tag; +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx) +{ + struct bt_wait_state *bs; + + if (!hctx) + return &bt->bs[0]; + + bs = &bt->bs[hctx->wait_index]; + bt_index_inc(&hctx->wait_index); + return bs; +} + +static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) +{ + struct bt_wait_state *bs; + DEFINE_WAIT(wait); + int tag; + + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + return tag; + + if (!(gfp & __GFP_WAIT)) + return -1; + + bs = bt_wait_ptr(bt, hctx); + do { + bool was_empty; + + was_empty = list_empty(&wait.task_list); + prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + + if (was_empty) + atomic_set(&bs->wait_cnt, bt->wake_cnt); + + io_schedule(); + } while (1); + + finish_wait(&bs->wait, &wait); + return tag; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) { int tag; - tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - return tag + tags->nr_reserved_tags; + tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp); + if (tag >= 0) + return tag + tags->nr_reserved_tags; + + return BLK_MQ_TAG_FAIL; } static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, gfp_t gfp) { - int tag; + int tag, zero = 0; if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? 
- TASK_UNINTERRUPTIBLE : TASK_RUNNING); + tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp); if (tag < 0) return BLK_MQ_TAG_FAIL; + return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, + gfp_t gfp, bool reserved) { if (!reserved) - return __blk_mq_get_tag(tags, gfp); + return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); - return __blk_mq_get_reserved_tag(tags, gfp); + return __blk_mq_get_reserved_tag(hctx->tags, gfp); +} + +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +{ + int i, wake_index; + + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) { + if (wake_index != bt->wake_index) + bt->wake_index = wake_index; + + return bs; + } + + bt_index_inc(&wake_index); + } + + return NULL; +} + +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +{ + const int index = TAG_TO_INDEX(bt, tag); + struct bt_wait_state *bs; + + /* + * The unlock memory barrier need to order access to req in free + * path and clearing tag bit + */ + clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + bs = bt_wake_ptr(bt); + if (bs && atomic_dec_and_test(&bs->wait_cnt)) { + atomic_set(&bs->wait_cnt, bt->wake_cnt); + bt_index_inc(&bt->wake_index); + wake_up(&bs->wait); + } } static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) { BUG_ON(tag >= tags->nr_tags); - percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); + bt_clear_tag(&tags->bitmap_tags, tag); } static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, @@ -66,22 +335,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, { BUG_ON(tag >= tags->nr_reserved_tags); - percpu_ida_free(&tags->reserved_tags, tag); + bt_clear_tag(&tags->breserved_tags, tag); } -void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, + unsigned int *last_tag) { - if (tag >= tags->nr_reserved_tags) - __blk_mq_put_tag(tags, tag); - else + struct blk_mq_tags *tags = hctx->tags; + + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; + + __blk_mq_put_tag(tags, real_tag); + *last_tag = real_tag; + } else __blk_mq_put_reserved_tag(tags, tag); } -static int __blk_mq_tag_iter(unsigned id, void *data) +static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, + unsigned long *free_map, unsigned int off) { - unsigned long *tag_map = data; - __set_bit(id, tag_map); - return 0; + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int bit = 0; + + do { + bit = find_next_zero_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + __set_bit(bit + off, free_map); + bit++; + } while (1); + + off += (1 << bt->bits_per_word); + } } void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, @@ -95,21 +385,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, if (!tag_map) return; - percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); if (tags->nr_reserved_tags) - percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, - tag_map); + bt_for_each_free(&tags->breserved_tags, tag_map, 0); fn(data, tag_map); kfree(tag_map); } +EXPORT_SYMBOL(blk_mq_tag_busy_iter); + +static unsigned int 
bt_unused_tags(struct blk_mq_bitmap_tags *bt) +{ + unsigned int i, used; + + for (i = 0, used = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + used += bitmap_weight(&bm->word, bm->depth); + } + + return bt->depth - used; +} + +static void bt_update_count(struct blk_mq_bitmap_tags *bt, + unsigned int depth) +{ + unsigned int tags_per_word = 1U << bt->bits_per_word; + unsigned int map_depth = depth; + + if (depth) { + int i; + + for (i = 0; i < bt->map_nr; i++) { + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= bt->map[i].depth; + } + } + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / 4) + bt->wake_cnt = max(1U, depth / 4); + + bt->depth = depth; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, + int node, bool reserved) +{ + int i; + + bt->bits_per_word = ilog2(BITS_PER_LONG); + + /* + * Depth can be zero for reserved tags, that's not a failure + * condition. + */ + if (depth) { + unsigned int nr, tags_per_word; + + tags_per_word = (1 << bt->bits_per_word); + + /* + * If the tag space is small, shrink the number of tags + * per word so we spread over a few cachelines, at least. + * If less than 4 tags, just forget about it, it's not + * going to work optimally anyway. + */ + if (depth >= 4) { + while (tags_per_word * 4 > depth) { + bt->bits_per_word--; + tags_per_word = (1 << bt->bits_per_word); + } + } + + nr = ALIGN(depth, tags_per_word) / tags_per_word; + bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bt->map) + return -ENOMEM; + + bt->map_nr = nr; + } + + bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); + if (!bt->bs) { + kfree(bt->map); + return -ENOMEM; + } + + for (i = 0; i < BT_WAIT_QUEUES; i++) + init_waitqueue_head(&bt->bs[i].wait); + + bt_update_count(bt, depth); + return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ + kfree(bt->map); + kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) + goto enomem; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) + goto enomem; + + return tags; +enomem: + bt_free(&tags->bitmap_tags); + kfree(tags); + return NULL; +} struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node) { - unsigned int nr_tags, nr_cache; struct blk_mq_tags *tags; - int ret; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); @@ -120,73 +517,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - nr_tags = total_tags - reserved_tags; - nr_cache = nr_tags / num_possible_cpus(); - - if (nr_cache < BLK_MQ_TAG_CACHE_MIN) - nr_cache = BLK_MQ_TAG_CACHE_MIN; - else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) - nr_cache = BLK_MQ_TAG_CACHE_MAX; - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - tags->nr_max_cache = nr_cache; - tags->nr_batch_move = max(1u, nr_cache / 2); - ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - - tags->nr_reserved_tags, - tags->nr_max_cache, - tags->nr_batch_move); - if (ret) - goto err_free_tags; - - if (reserved_tags) { - /* - * With max_cahe and batch set to 1, the allocator fallbacks to - * no cached. It's fine reserved tags allocation is slow. 
- */ - ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, - 1, 1); - if (ret) - goto err_reserved_tags; - } - - return tags; - -err_reserved_tags: - percpu_ida_destroy(&tags->free_tags); -err_free_tags: - kfree(tags); - return NULL; + return blk_mq_init_bitmap_tags(tags, node); } void blk_mq_free_tags(struct blk_mq_tags *tags) { - percpu_ida_destroy(&tags->free_tags); - percpu_ida_destroy(&tags->reserved_tags); + bt_free(&tags->bitmap_tags); + bt_free(&tags->breserved_tags); kfree(tags); } +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + *tag = prandom_u32() % depth; +} + +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) +{ + tdepth -= tags->nr_reserved_tags; + if (tdepth > tags->nr_tags) + return -EINVAL; + + /* + * Don't need (or can't) update reserved tags here, they remain + * static and should never need resizing. + */ + bt_update_count(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags); + return 0; +} + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - unsigned int cpu; + unsigned int free, res; if (!tags) return 0; - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," - " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->nr_batch_move, tags->nr_max_cache); + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " + "bits_per_word=%u\n", + tags->nr_tags, tags->nr_reserved_tags, + tags->bitmap_tags.bits_per_word); - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), - percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + free = bt_unused_tags(&tags->bitmap_tags); + res = bt_unused_tags(&tags->breserved_tags); - for_each_possible_cpu(cpu) { - page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, - percpu_ida_free_tags(&tags->free_tags, cpu)); - } + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); return page - orig_page; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index b602e3fa66ea..c959de58d2a5 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -1,7 +1,32 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -#include +#include "blk-mq.h" + +enum { + BT_WAIT_QUEUES = 8, + BT_WAIT_BATCH = 8, +}; + +struct bt_wait_state { + atomic_t wait_cnt; + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) + +struct blk_mq_bitmap_tags { + unsigned int depth; + unsigned int wake_cnt; + unsigned int bits_per_word; + + unsigned int map_nr; + struct blk_align_bitmap *map; + + unsigned int wake_index; + struct bt_wait_state *bs; +}; /* * Tag address space map. 
@@ -9,11 +34,11 @@ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; - unsigned int nr_batch_move; - unsigned int nr_max_cache; - struct percpu_ida free_tags; - struct percpu_ida reserved_tags; + atomic_t active_queues; + + struct blk_mq_bitmap_tags bitmap_tags; + struct blk_mq_bitmap_tags breserved_tags; struct request **rqs; struct list_head page_list; @@ -23,12 +48,12 @@ struct blk_mq_tags { extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); -extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); enum { BLK_MQ_TAG_CACHE_MIN = 1, @@ -41,4 +66,23 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index ee225cc312b8..ae14749b530c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -56,39 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { unsigned int i; - for (i = 0; i < hctx->nr_ctx_map; i++) - if (hctx->ctx_map[i]) + for (i = 0; i < hctx->ctx_map.map_size; i++) + if (hctx->ctx_map.map[i].word) return true; return false; } +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx) \ + ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!test_bit(ctx->index_hw, hctx->ctx_map)) - set_bit(ctx->index_hw, hctx->ctx_map); + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) + set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) { - struct request *rq; - unsigned int tag; + struct blk_align_bitmap *bm = get_bm(hctx, ctx); - tag = blk_mq_get_tag(hctx->tags, gfp, reserved); - if (tag != BLK_MQ_TAG_FAIL) { - rq = hctx->tags->rqs[tag]; - blk_rq_init(hctx->queue, rq); - rq->tag = tag; - - return rq; - } - - return NULL; + 
clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } static int blk_mq_queue_enter(struct request_queue *q) @@ -187,70 +188,109 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; + INIT_LIST_HEAD(&rq->queuelist); + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = q; rq->mq_ctx = ctx; - rq->cmd_flags = rw_flags; + rq->cmd_flags |= rw_flags; + rq->cmd_type = 0; + /* do not touch atomic flags, it needs atomic ops against the timer */ + rq->cpu = -1; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = NULL; + rq->biotail = NULL; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv))); + rq->rq_disk = NULL; + rq->part = NULL; rq->start_time = jiffies; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + rq->ioprio = 0; + rq->special = NULL; + /* tag was already set */ + rq->errors = 0; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; + rq->cmd_len = BLK_MAX_CDB; + + rq->extra_len = 0; + rq->sense_len = 0; + rq->resid_len = 0; + rq->sense = NULL; + + rq->deadline = 0; + INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + rq->retries = 0; + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->next_rq = NULL; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; } -static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, - int rw, gfp_t gfp, - bool reserved) +static struct request * +__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved) { struct request *rq; + unsigned int tag; - do { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); + if (tag != BLK_MQ_TAG_FAIL) { + rq = hctx->tags->rqs[tag]; - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); - if (rq) { - blk_mq_rq_ctx_init(q, ctx, rq, rw); - break; + rq->cmd_flags = 0; + if (blk_mq_tag_busy(hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); } - if (gfp & __GFP_WAIT) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - } else { - blk_mq_put_ctx(ctx); - break; - } + rq->tag = tag; + blk_mq_rq_ctx_init(q, ctx, rq, rw); + return rq; + } - blk_mq_wait_for_tags(hctx->tags); - } while (1); - - return rq; + return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, + bool reserved) { + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, + reserved); + if (!rq && (gfp & __GFP_WAIT)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved); + } + blk_mq_put_ctx(ctx); return rq; } - -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, - gfp_t gfp) -{ - struct request *rq; - - if 
(blk_mq_queue_enter(q)) - return NULL; - - rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); - return rq; -} -EXPORT_SYMBOL(blk_mq_alloc_reserved_request); +EXPORT_SYMBOL(blk_mq_alloc_request); static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) @@ -258,7 +298,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - blk_mq_put_tag(hctx->tags, tag); + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); + + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + blk_mq_put_tag(hctx, tag, &ctx->last_tag); blk_mq_queue_exit(q); } @@ -326,15 +370,19 @@ static void __blk_mq_complete_request_remote(void *data) void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + bool shared = false; int cpu; - if (!ctx->ipi_redirect) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); return; } cpu = get_cpu(); - if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + shared = cpus_share_cache(cpu, ctx->cpu); + + if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; @@ -355,10 +403,16 @@ void __blk_mq_complete_request(struct request *rq) **/ void blk_mq_complete_request(struct request *rq) { - if (unlikely(blk_should_fake_timeout(rq->q))) + struct request_queue *q = rq->q; + + if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) - __blk_mq_complete_request(rq); + if (!blk_mark_rq_complete(rq)) { + if (q->softirq_done_fn) + __blk_mq_complete_request(rq); + else + blk_mq_end_io(rq, rq->errors); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -375,10 +429,22 @@ static void blk_mq_start_request(struct request *rq, bool last) /* * Just mark start time and set the started bit. Due to memory * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. + * REQ_ATOMIC_STARTED is seen. Use the default queue timeout, + * unless one has been set in the request. + */ + if (!rq->timeout) + rq->deadline = jiffies + q->rq_timeout; + else + rq->deadline = jiffies + rq->timeout; + + /* + * Mark us as started and clear complete. Complete might have been + * set if requeue raced with timeout, which then marked it as + * complete. So be sure to clear complete again when we start + * the request, otherwise we'll ignore the completion event. 
*/ - rq->deadline = jiffies + q->rq_timeout; set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -415,18 +481,72 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq) { - struct request_queue *q = rq->q; - __blk_mq_requeue_request(rq); blk_clear_rq_complete(rq); - trace_block_rq_requeue(q, rq); - BUG_ON(blk_queued_rq(rq)); - blk_mq_insert_request(rq, true, true, false); + blk_mq_add_to_requeue_list(rq, true); } EXPORT_SYMBOL(blk_mq_requeue_request); +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work); + LIST_HEAD(rq_list); + struct request *rq, *next; + unsigned long flags; + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + continue; + + rq->cmd_flags &= ~REQ_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, false, false, false); + } + + blk_mq_run_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertation from the workqueue. + */ + BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->cmd_flags |= REQ_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { return tags->rqs[tag]; @@ -485,6 +605,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); } +static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +{ + struct request_queue *q = rq->q; + + /* + * We know that complete is set at this point. If STARTED isn't set + * anymore, then the request isn't active and the "timeout" should + * just be ignored. This can happen due to the bitflag ordering. + * Timeout first checks if STARTED is set, and if it is, assumes + * the request is active. But if we race with completion, then + * we both flags will get cleared. So check here again, and ignore + * a timeout event with a request that isn't active. 
+ */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return BLK_EH_NOT_HANDLED; + + if (!q->mq_ops->timeout) + return BLK_EH_RESET_TIMER; + + return q->mq_ops->timeout(rq); +} + static void blk_mq_rq_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -492,11 +634,24 @@ static void blk_mq_rq_timer(unsigned long data) unsigned long next = 0; int i, next_set = 0; - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!hctx->nr_ctx || !hctx->tags) + continue; - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + } + + if (next_set) { + next = blk_rq_timeout(round_jiffies_up(next)); + mod_timer(&q->timeout, next); + } else { + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_tag_idle(hctx); + } } /* @@ -538,9 +693,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } -void blk_mq_add_timer(struct request *rq) +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - __blk_add_timer(rq, NULL); + struct blk_mq_ctx *ctx; + int i; + + for (i = 0; i < hctx->ctx_map.map_size; i++) { + struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; + unsigned int off, bit; + + if (!bm->word) + continue; + + bit = 0; + off = i * hctx->ctx_map.bits_per_word; + do { + bit = find_next_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + ctx = hctx->ctxs[bit + off]; + clear_bit(bit, &bm->word); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, list); + spin_unlock(&ctx->lock); + + bit++; + } while (1); + } } /* @@ -552,10 +736,9 @@ void blk_mq_add_timer(struct request *rq) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct blk_mq_ctx *ctx; struct request *rq; LIST_HEAD(rq_list); - int bit, queued; + int queued; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); @@ -567,15 +750,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * Touch any software queue that has pending entries. */ - for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { - clear_bit(bit, hctx->ctx_map); - ctx = hctx->ctxs[bit]; - BUG_ON(bit != ctx->index_hw); - - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, &rq_list); - spin_unlock(&ctx->lock); - } + flush_busy_ctxs(hctx, &rq_list); /* * If we have previous entries on our dispatch list, grab them @@ -588,14 +763,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) spin_unlock(&hctx->lock); } - /* - * Delete and return all entries from our dispatch list - */ - queued = 0; - /* * Now process all the entries, sending them to the driver. 
*/ + queued = 0; while (!list_empty(&rq_list)) { int ret; @@ -610,11 +781,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) queued++; continue; case BLK_MQ_RQ_QUEUE_BUSY: - /* - * FIXME: we should have a mechanism to stop the queue - * like blk_stop_queue, otherwise we will waste cpu - * time - */ list_add(&rq->queuelist, &rq_list); __blk_mq_requeue_request(rq); break; @@ -646,6 +812,30 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = hctx->next_cpu; + + if (--hctx->next_cpu_batch <= 0) { + int next_cpu; + + next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(hctx->cpumask); + + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + + return cpu; +} + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) @@ -658,13 +848,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) else { unsigned int cpu; - /* - * It'd be great if the workqueue API had a way to pass - * in a mask and had some smarts for more clever placement - * than the first CPU. Or we could round-robin here. For now, - * just queue on the first CPU. - */ - cpu = cpumask_first(hctx->cpumask); + cpu = blk_mq_hctx_next_cpu(hctx); kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); } } @@ -771,13 +955,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) else { unsigned int cpu; - /* - * It'd be great if the workqueue API had a way to pass - * in a mask and had some smarts for more clever placement - * than the first CPU. Or we could round-robin here. For now, - * just queue on the first CPU. - */ - cpu = cpumask_first(hctx->cpumask); + cpu = blk_mq_hctx_next_cpu(hctx); kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); } } @@ -794,12 +972,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); + blk_mq_hctx_mark_pending(hctx, ctx); /* * We do this early, to ensure we are on the right CPU. 
*/ - blk_mq_add_timer(rq); + blk_add_timer(rq); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -930,21 +1109,87 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + return false; + } else { + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + blk_mq_bio_to_request(rq, bio); + goto insert_rq; + } + + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); + return true; + } +} + +struct blk_map_ctx { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; +}; + +static struct request *blk_mq_map_request(struct request_queue *q, + struct bio *bio, + struct blk_map_ctx *data) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; + struct request *rq; + int rw = bio_data_dir(bio); + + if (unlikely(blk_mq_queue_enter(q))) { + bio_endio(bio, -EIO); + return NULL; + } + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + if (rw_is_sync(bio->bi_rw)) + rw |= REQ_SYNC; + + trace_block_getrq(q, bio, rw); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false); + if (unlikely(!rq)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + trace_block_sleeprq(q, bio, rw); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, + __GFP_WAIT|GFP_ATOMIC, false); + } + + hctx->queued++; + data->hctx = hctx; + data->ctx = ctx; + return rq; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. + */ +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - int rw = bio_data_dir(bio); + struct blk_map_ctx data; struct request *rq; - unsigned int use_plug, request_count = 0; - - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); blk_queue_bounce(q, &bio); @@ -953,33 +1198,85 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) return; - if (blk_mq_queue_enter(q)) { + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } + + if (is_sync) { + int ret; + + blk_mq_bio_to_request(rq, bio); + blk_mq_start_request(rq, true); + + /* + * For OK queue, we are done. For error, kill it. 
Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(data.hctx, rq); + if (ret == BLK_MQ_RQ_QUEUE_OK) + goto done; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_io(rq, rq->errors); + goto done; + } + } + } + + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } +done: + blk_mq_put_ctx(data.ctx); +} + +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + unsigned int use_plug, request_count = 0; + struct blk_map_ctx data; + struct request *rq; + + /* + * If we have multiple hardware queues, just go directly to + * one of those for sync IO. + */ + use_plug = !is_flush_fua && !is_sync; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_endio(bio, -EIO); return; } - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + if (use_plug && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; - if (is_sync) - rw |= REQ_SYNC; - trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); - if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); - else { - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); - ctx = rq->mq_ctx; - hctx = q->mq_ops->map_queue(q, ctx->cpu); - } - - hctx->queued++; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1004,31 +1301,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(data.ctx); return; } } - spin_lock(&ctx->lock); - - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - blk_mq_attempt_merge(q, ctx, bio)) - __blk_mq_free_request(hctx, ctx, rq); - else { - blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq, false); + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } - spin_unlock(&ctx->lock); - - /* - * For a SYNC request, send it to the hardware immediately. For an - * ASYNC request, just ensure that we run it later on. The latter - * allows for merging opportunities and more efficient dispatching. 
- */ -run_queue: - blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(data.ctx); } /* @@ -1041,10 +1330,10 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) EXPORT_SYMBOL(blk_mq_map_queue); struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set, - unsigned int hctx_index) + unsigned int hctx_index, + int node) { - return kmalloc_node(sizeof(struct blk_mq_hw_ctx), - GFP_KERNEL | __GFP_ZERO, set->numa_node); + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); } EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); @@ -1055,52 +1344,6 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, } EXPORT_SYMBOL(blk_mq_free_single_hw_queue); -static void blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) -{ - struct blk_mq_hw_ctx *hctx = data; - struct request_queue *q = hctx->queue; - struct blk_mq_ctx *ctx; - LIST_HEAD(tmp); - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return; - - /* - * Move ctx entries to new CPU, if this one is going away. - */ - ctx = __blk_mq_get_ctx(q, cpu); - - spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); - clear_bit(ctx->index_hw, hctx->ctx_map); - } - spin_unlock(&ctx->lock); - - if (list_empty(&tmp)) - return; - - ctx = blk_mq_get_ctx(q); - spin_lock(&ctx->lock); - - while (!list_empty(&tmp)) { - struct request *rq; - - rq = list_first_entry(&tmp, struct request, queuelist); - rq->mq_ctx = ctx; - list_move_tail(&rq->queuelist, &ctx->rq_list); - } - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_hctx_mark_pending(hctx, ctx); - - spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, true); - blk_mq_put_ctx(ctx); -} - static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -1130,12 +1373,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, static size_t order_to_size(unsigned int order) { - size_t ret = PAGE_SIZE; - - while (order--) - ret *= 2; - - return ret; + return (size_t)PAGE_SIZE << order; } static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, @@ -1219,17 +1457,147 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return NULL; } +static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) +{ + kfree(bitmap->map); +} + +static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) +{ + unsigned int bpw = 8, total, num_maps, i; + + bitmap->bits_per_word = bpw; + + num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; + bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bitmap->map) + return -ENOMEM; + + bitmap->map_size = num_maps; + + total = nr_cpu_ids; + for (i = 0; i < num_maps; i++) { + bitmap->map[i].depth = min(total, bitmap->bits_per_word); + total -= bitmap->map[i].depth; + } + + return 0; +} + +static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + LIST_HEAD(tmp); + + /* + * Move ctx entries to new CPU, if this one is going away. 
+ */ + ctx = __blk_mq_get_ctx(q, cpu); + + spin_lock(&ctx->lock); + if (!list_empty(&ctx->rq_list)) { + list_splice_init(&ctx->rq_list, &tmp); + blk_mq_hctx_clear_pending(hctx, ctx); + } + spin_unlock(&ctx->lock); + + if (list_empty(&tmp)) + return NOTIFY_OK; + + ctx = blk_mq_get_ctx(q); + spin_lock(&ctx->lock); + + while (!list_empty(&tmp)) { + struct request *rq; + + rq = list_first_entry(&tmp, struct request, queuelist); + rq->mq_ctx = ctx; + list_move_tail(&rq->queuelist, &ctx->rq_list); + } + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_hctx_mark_pending(hctx, ctx); + + spin_unlock(&ctx->lock); + + blk_mq_run_hw_queue(hctx, true); + blk_mq_put_ctx(ctx); + return NOTIFY_OK; +} + +static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (set->tags[hctx->queue_num]) + return NOTIFY_OK; + + set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); + if (!set->tags[hctx->queue_num]) + return NOTIFY_STOP; + + hctx->tags = set->tags[hctx->queue_num]; + return NOTIFY_OK; +} + +static int blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) +{ + struct blk_mq_hw_ctx *hctx = data; + + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + return blk_mq_hctx_cpu_offline(hctx, cpu); + else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + return blk_mq_hctx_cpu_online(hctx, cpu); + + return NOTIFY_OK; +} + +static void blk_mq_exit_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set, int nr_queue) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (i == nr_queue) + break; + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, i); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); + } + +} + +static void blk_mq_free_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) { + free_cpumask_var(hctx->cpumask); + set->ops->free_hctx(hctx, i); + } +} + static int blk_mq_init_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx *hctx; - unsigned int i, j; + unsigned int i; /* * Initialize hardware queues */ queue_for_each_hw_ctx(q, hctx, i) { - unsigned int num_maps; int node; node = hctx->numa_node; @@ -1260,13 +1628,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, if (!hctx->ctxs) break; - num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; - hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), - GFP_KERNEL, node); - if (!hctx->ctx_map) + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) break; - hctx->nr_ctx_map = num_maps; hctx->nr_ctx = 0; if (set->ops->init_hctx && @@ -1280,16 +1644,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, /* * Init failed */ - queue_for_each_hw_ctx(q, hctx, j) { - if (i == j) - break; - - if (set->ops->exit_hctx) - set->ops->exit_hctx(hctx, j); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - kfree(hctx->ctxs); - } + blk_mq_exit_hw_queues(q, set, i); return 1; } @@ -1350,6 +1705,79 @@ static void blk_mq_map_swqueue(struct request_queue *q) ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } + + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries + */ + if (!hctx->nr_ctx) { + struct blk_mq_tag_set *set =
q->tag_set; + + if (set->tags[i]) { + blk_mq_free_rq_map(set, set->tags[i], i); + set->tags[i] = NULL; + hctx->tags = NULL; + } + continue; + } + + /* + * Initialize batch roundrobin counts + */ + hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + bool shared; + int i; + + if (set->tag_list.next == set->tag_list.prev) + shared = false; + else + shared = true; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_freeze_queue(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } + blk_mq_unfreeze_queue(q); + } +} + +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + blk_mq_freeze_queue(q); + + mutex_lock(&set->tag_list_lock); + list_del_init(&q->tag_set_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); + + blk_mq_unfreeze_queue(q); +} + +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + q->tag_set = set; + + mutex_lock(&set->tag_list_lock); + list_add_tail(&q->tag_set_list, &set->tag_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) @@ -1357,6 +1785,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) struct blk_mq_hw_ctx **hctxs; struct blk_mq_ctx *ctx; struct request_queue *q; + unsigned int *map; int i; ctx = alloc_percpu(struct blk_mq_ctx); @@ -1369,15 +1798,22 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!hctxs) goto err_percpu; + map = blk_mq_make_queue_map(set); + if (!map) + goto err_map; + for (i = 0; i < set->nr_hw_queues; i++) { - hctxs[i] = set->ops->alloc_hctx(set, i); + int node = blk_mq_hw_queue_to_node(map, i); + + hctxs[i] = set->ops->alloc_hctx(set, i, node); if (!hctxs[i]) goto err_hctxs; if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) goto err_hctxs; - hctxs[i]->numa_node = NUMA_NO_NODE; + atomic_set(&hctxs[i]->nr_active, 0); + hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; } @@ -1385,8 +1821,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) + if (percpu_counter_init(&q->mq_usage_counter, 0)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); @@ -1394,6 +1829,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) q->nr_queues = nr_cpu_ids; q->nr_hw_queues = set->nr_hw_queues; + q->mq_map = map; q->queue_ctx = ctx; q->queue_hw_ctx = hctxs; @@ -1403,11 +1839,24 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) q->sg_reserved_size = INT_MAX; - blk_queue_make_request(q, blk_mq_make_request); - blk_queue_rq_timed_out(q, set->ops->timeout); + INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->requeue_list); + spin_lock_init(&q->requeue_lock); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); if (set->timeout) blk_queue_rq_timeout(q, set->timeout); + /* + * Do this after blk_queue_make_request() overrides it... 
+ */ + q->nr_requests = set->queue_depth; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); @@ -1423,27 +1872,29 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (blk_mq_init_hw_queues(q, set)) goto err_flush_rq; - blk_mq_map_swqueue(q); - mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); mutex_unlock(&all_q_mutex); + blk_mq_add_queue_tag_set(set, q); + + blk_mq_map_swqueue(q); + return q; err_flush_rq: kfree(q->flush_rq); err_hw: - kfree(q->mq_map); -err_map: blk_cleanup_queue(q); err_hctxs: + kfree(map); for (i = 0; i < set->nr_hw_queues; i++) { if (!hctxs[i]) break; free_cpumask_var(hctxs[i]->cpumask); set->ops->free_hctx(hctxs[i], i); } +err_map: kfree(hctxs); err_percpu: free_percpu(ctx); @@ -1453,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue); void blk_mq_free_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - int i; + struct blk_mq_tag_set *set = q->tag_set; - queue_for_each_hw_ctx(q, hctx, i) { - kfree(hctx->ctx_map); - kfree(hctx->ctxs); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (q->mq_ops->exit_hctx) - q->mq_ops->exit_hctx(hctx, i); - free_cpumask_var(hctx->cpumask); - q->mq_ops->free_hctx(hctx, i); - } + blk_mq_del_queue_tag_set(q); + + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + blk_mq_free_hw_queues(q, set); + + percpu_counter_destroy(&q->mq_usage_counter); free_percpu(q->queue_ctx); kfree(q->queue_hw_ctx); @@ -1503,10 +1950,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, struct request_queue *q; /* - * Before new mapping is established, hotadded cpu might already start - * handling requests. This doesn't break anything as we map offline - * CPUs to first hardware queue. We will re-init queue below to get - * optimal settings. + * Before new mappings are established, hotadded cpu might already + * start handling requests. This doesn't break anything as we map + * offline CPUs to first hardware queue. We will re-init the queue + * below to get optimal settings. 
*/ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) @@ -1536,7 +1983,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) return -EINVAL; - set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags), + set->tags = kmalloc_node(set->nr_hw_queues * + sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out; @@ -1547,6 +1995,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_unwind; } + mutex_init(&set->tag_list_lock); + INIT_LIST_HEAD(&set->tag_list); + return 0; out_unwind: @@ -1561,11 +2012,37 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_rq_map(set, set->tags[i], i); + for (i = 0; i < set->nr_hw_queues; i++) { + if (set->tags[i]) + blk_mq_free_rq_map(set, set->tags[i], i); + } + + kfree(set->tags); } EXPORT_SYMBOL(blk_mq_free_tag_set); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i, ret; + + if (!set || nr > set->queue_depth) + return -EINVAL; + + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_tag_update_depth(hctx->tags, nr); + if (ret) + break; + } + + if (!ret) + q->nr_requests = nr; + + return ret; +} + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index 5fa14f19f752..ff5e6bf0f691 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -11,7 +11,8 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int ipi_redirect; + + unsigned int last_tag ____cacheline_aligned_in_smp; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -22,7 +23,7 @@ struct blk_mq_ctx { struct request_queue *queue; struct kobject kobj; -}; +} ____cacheline_aligned_in_smp; void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); @@ -31,13 +32,14 @@ void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, struct request *orig_rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); /* * CPU hotplug helpers */ struct blk_mq_cpu_notifier; void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data); void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); @@ -50,7 +52,15 @@ void blk_mq_disable_hotplug(void); */ extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); -void blk_mq_add_timer(struct request *rq); +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. 
+ */ +struct blk_align_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f876dae4..23321fbab293 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl; unsigned long nr; - int ret; + int ret, err; - if (!q->request_fn) + if (!q->request_fn && !q->mq_ops) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); + if (q->request_fn) + err = blk_update_nr_requests(q, nr); + else + err = blk_mq_update_nr_requests(q, nr); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; + if (err) + return err; - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } - - spin_unlock_irq(q->queue_lock); return ret; } @@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - percpu_counter_destroy(&q->mq_usage_counter); - if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..9353b4683359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) - return 0; + return false; return 1; } @@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (wait) *wait = 0; - return 1; + return true; } /* @@ -1258,7 +1258,7 @@ static void throtl_pending_timer_fn(unsigned long arg) * of throtl_data->service_queue. Those bio's are ready and issued by this * function. 
*/ -void blk_throtl_dispatch_work_fn(struct work_struct *work) +static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a09e8af8186c..43e8b515806f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req) __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - if (q->mq_ops) - blk_mq_add_timer(req); - else - blk_add_timer(req); - + blk_add_timer(req); blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: @@ -170,7 +166,26 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -void __blk_add_timer(struct request *req, struct list_head *timeout_list) +unsigned long blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + */ +void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; @@ -188,15 +203,15 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - if (timeout_list) - list_add_tail(&req->timeout_list, timeout_list); + if (!q->mq_ops) + list_add_tail(&req->timeout_list, &req->q->timeout_list); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = round_jiffies_up(req->deadline); + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { @@ -214,17 +229,3 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) } } - -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. 
- */ -void blk_add_timer(struct request *req) -{ - __blk_add_timer(req, &req->q->timeout_list); -} - diff --git a/block/blk.h b/block/blk.h index 1d880f1f957f..45385e9abf6f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,6 +9,9 @@ /* Number of requests a "batching" process may submit */ #define BLK_BATCH_REQ 32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; @@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error, void blk_rq_timed_out_timer(unsigned long data); void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set); -void __blk_add_timer(struct request *req, struct list_head *timeout_list); +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); bool bio_attempt_front_merge(struct request_queue *q, struct request *req, @@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +extern int blk_update_nr_requests(struct request_queue *, unsigned int); + /* * Contribute to IO statistics IFF: * diff --git a/mm/bounce.c b/block/bounce.c similarity index 100% rename from mm/bounce.c rename to block/bounce.c diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5063a0bd831a..22dffebc7c73 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4460,7 +4460,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) static ssize_t cfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t diff --git a/fs/ioprio.c b/block/ioprio.c similarity index 100% rename from fs/ioprio.c rename to block/ioprio.c diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index ae331ab4a451..ea323e91903b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -178,7 +178,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_reserved_request(dd->queue, 0, __GFP_WAIT); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); return blk_mq_rq_to_pdu(rq); } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index e932398588aa..5a8081114df6 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -322,39 +322,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) } static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set, - unsigned int hctx_index) + unsigned int hctx_index, + int node) { - int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes); - int tip = (set->nr_hw_queues % nr_online_nodes); - int node = 0, i, n; - - /* - * Split submit queues evenly wrt to the number of nodes. If uneven, - * fill the first buckets with one extra, until the rest is filled with - * no extra. - */ - for (i = 0, n = 1; i < hctx_index; i++, n++) { - if (n % b_size == 0) { - n = 0; - node++; - - tip--; - if (!tip) - b_size = set->nr_hw_queues / nr_online_nodes; - } - } - - /* - * A node might not be online, therefore map the relative node id to the - * real node id. 
- */ - for_each_online_node(n) { - if (!node) - break; - node--; - } - - return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n); + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); } static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 1dcf9067cffa..608532d3f8c9 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -743,6 +743,7 @@ static void skd_request_fn(struct request_queue *q) break; } skreq->discard_page = 1; + req->completion_data = page; skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { @@ -855,10 +856,9 @@ static void skd_end_request(struct skd_device *skdev, if ((io_flags & REQ_DISCARD) && (skreq->discard_page == 1)) { - struct bio *bio = req->bio; pr_debug("%s:%s:%d, free the page!", skdev->name, __func__, __LINE__); - __free_page(bio->bi_io_vec->bv_page); + __free_page(req->completion_data); } if (unlikely(error)) { diff --git a/drivers/char/random.c b/drivers/char/random.c index 6b75713d953a..0a19d866a153 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk) add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); } +EXPORT_SYMBOL_GPL(add_disk_randomness); #endif /********************************************************************* diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 06d154d20faa..96af195224f2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -737,6 +737,7 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) goto out; } + rq->completion_data = page; blk_add_request_payload(rq, page, len); ret = scsi_setup_blk_pc_cmnd(sdp, rq); rq->__data_len = nr_bytes; @@ -839,11 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq) { struct scsi_cmnd *SCpnt = rq->special; - if (rq->cmd_flags & REQ_DISCARD) { - struct bio *bio = rq->bio; + if (rq->cmd_flags & REQ_DISCARD) + __free_page(rq->completion_data); - __free_page(bio->bi_io_vec->bv_page); - } if (SCpnt->cmnd != rq->cmd) { mempool_free(SCpnt->cmnd, sd_cdb_pool); SCpnt->cmnd = NULL; diff --git a/fs/Makefile b/fs/Makefile index f9cb9876e466..4030cbfbc9af 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o diff --git a/include/linux/bio.h b/include/linux/bio.h index bba550826921..5a645769f020 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); -extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); +extern mempool_t *biovec_create_pool(int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 
ab469d525894..2bd82f399128 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,7 +8,13 @@ struct blk_mq_tags; struct blk_mq_cpu_notifier { struct list_head list; void *data; - void (*notify)(void *data, unsigned long action, unsigned int cpu); + int (*notify)(void *data, unsigned long action, unsigned int cpu); +}; + +struct blk_mq_ctxmap { + unsigned int map_size; + unsigned int bits_per_word; + struct blk_align_bitmap *map; }; struct blk_mq_hw_ctx { @@ -21,6 +27,8 @@ struct blk_mq_hw_ctx { struct delayed_work run_work; struct delayed_work delay_work; cpumask_var_t cpumask; + int next_cpu; + int next_cpu_batch; unsigned long flags; /* BLK_MQ_F_* flags */ @@ -29,10 +37,12 @@ struct blk_mq_hw_ctx { void *driver_data; + struct blk_mq_ctxmap ctx_map; + unsigned int nr_ctx; struct blk_mq_ctx **ctxs; - unsigned int nr_ctx_map; - unsigned long *ctx_map; + + unsigned int wait_index; struct blk_mq_tags *tags; @@ -44,6 +54,8 @@ struct blk_mq_hw_ctx { unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ + atomic_t nr_active; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; }; @@ -51,7 +63,7 @@ struct blk_mq_hw_ctx { struct blk_mq_tag_set { struct blk_mq_ops *ops; unsigned int nr_hw_queues; - unsigned int queue_depth; + unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; unsigned int cmd_size; /* per-request extra data */ int numa_node; @@ -60,12 +72,15 @@ struct blk_mq_tag_set { void *driver_data; struct blk_mq_tags **tags; + + struct mutex tag_list_lock; + struct list_head tag_list; }; typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *, - unsigned int); + unsigned int, int); typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -122,11 +137,14 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_SHOULD_SORT = 1 << 1, - BLK_MQ_F_SHOULD_IPI = 1 << 2, + BLK_MQ_F_TAG_SHARED = 1 << 2, BLK_MQ_S_STOPPED = 0, + BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_MAX_DEPTH = 2048, + + BLK_MQ_CPU_WORK_BATCH = 8, }; struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); @@ -142,19 +160,20 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + gfp_t gfp, bool reserved); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int); +struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); void blk_mq_end_io(struct request *rq, int error); void __blk_mq_end_io(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); - +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); +void 
blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); @@ -163,6 +182,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); /* * Driver command data is immediately after the request. So subtract request diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index aa0eaa2d0bd8..d8e4cea23a25 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -190,6 +190,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_END, /* last of chain of requests */ __REQ_HASHED, /* on IO scheduler merge hash */ + __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ }; @@ -243,5 +244,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_END (1ULL << __REQ_END) #define REQ_HASHED (1ULL << __REQ_HASHED) +#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 74ee55fefcf0..e90e1692e052 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -90,15 +90,15 @@ enum rq_cmd_type_bits { #define BLK_MAX_CDB 16 /* - * try to put the fields that are referenced together in the same cacheline. - * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() - * as well! + * Try to put the fields that are referenced together in the same cacheline. + * + * If you modify this structure, make sure to update blk_rq_init() and + * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct list_head queuelist; union { struct call_single_data csd; - struct work_struct requeue_work; unsigned long fifo_time; }; @@ -462,6 +462,10 @@ struct request_queue { struct request *flush_rq; spinlock_t mq_flush_lock; + struct list_head requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; + struct mutex sysfs_lock; int bypass_depth; @@ -480,6 +484,9 @@ struct request_queue { wait_queue_head_t mq_freeze_wq; struct percpu_counter mq_usage_counter; struct list_head all_q_node; + + struct blk_mq_tag_set *tag_set; + struct list_head tag_set_list; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..0173940407f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,7 +30,6 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o
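
For illustration only, not part of the patch: a minimal userspace C sketch of the ctx_map sizing math that blk_mq_alloc_bitmap() introduces above. The values nr_ctx = 11 and bpw = 8 are example stand-ins, struct align_bitmap is a local substitute for struct blk_align_bitmap without the ____cacheline_aligned_in_smp annotation, and calloc() stands in for kzalloc_node().

/* Userspace sketch (not kernel code) of the sparse ctx_map sizing logic. */
#include <stdio.h>
#include <stdlib.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

struct align_bitmap {		/* models struct blk_align_bitmap */
	unsigned long word;
	unsigned long depth;
};

int main(void)
{
	unsigned int nr_ctx = 11;	/* stand-in for nr_cpu_ids */
	unsigned int bpw = 8;		/* bits carried per (cacheline-sized) word */
	unsigned int num_maps = ALIGN(nr_ctx, bpw) / bpw;
	struct align_bitmap *map = calloc(num_maps, sizeof(*map));
	unsigned int total = nr_ctx, i;

	if (!map)
		return 1;

	/* Each word holds at most bpw bits; the last word takes the remainder. */
	for (i = 0; i < num_maps; i++) {
		map[i].depth = total < bpw ? total : bpw;
		total -= map[i].depth;
		printf("word %u: depth %lu\n", i, map[i].depth);
	}
	free(map);
	return 0;
}

With 11 contexts this prints depths of 8 and 3 across two words, which is the same distribution the patch builds before marking per-ctx pending bits.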
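Also for illustration only: a small userspace sketch of the clamping that blk_rq_timeout() adds, so the queue timeout timer is never armed more than BLK_MAX_TIMEOUT ahead even when an individual request has a much longer timeout. HZ = 250 is an arbitrary stand-in, round_jiffies_up() is omitted, and time_after() is re-implemented locally; only the clamp itself mirrors the patch.

/* Userspace sketch (not kernel code) of the blk_rq_timeout() clamp. */
#include <stdio.h>

#define HZ			250UL		/* stand-in; real kernels vary */
#define BLK_MAX_TIMEOUT		(5 * HZ)

/* Wrap-safe "a is after b" comparison, like the kernel's time_after(). */
static int time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

static unsigned long clamp_timeout(unsigned long jiffies_now,
				   unsigned long deadline)
{
	unsigned long maxt = jiffies_now + BLK_MAX_TIMEOUT;

	return time_after(deadline, maxt) ? maxt : deadline;
}

int main(void)
{
	unsigned long now = 1000;

	/* A 60s deadline is clamped to now + BLK_MAX_TIMEOUT (2250 here)... */
	printf("%lu\n", clamp_timeout(now, now + 60 * HZ));
	/* ...while a 2s deadline is left alone (1500 here). */
	printf("%lu\n", clamp_timeout(now, now + 2 * HZ));
	return 0;
}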