5778322e67
Notifier callbacks for the CPU_ONLINE action can run on a CPU other than the one that was just onlined. So it is possible for a process running on the just-onlined CPU to insert a request and run the hw queue before the new mapping is established, which is done by blk_mq_queue_reinit_notify(). This can cause a problem when the CPU has just been onlined for the first time since the request queue was initialized. At this time ctx->index_hw for the CPU, which is the index in hctx->ctxs[] for this ctx, is still zero before blk_mq_queue_reinit_notify() is called by the notifier callbacks for the CPU_ONLINE action. For example, there is a single hw queue (hctx) and two CPU queues (ctx0 for CPU0, and ctx1 for CPU1). Now CPU1 is just onlined and a request is inserted into ctx1->rq_list, and bit0 is set in the pending bitmap as ctx1->index_hw is still zero. Then, while running the hw queue, flush_busy_ctxs() finds bit0 set in the pending bitmap and tries to retrieve requests from hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is ignored. Fix it by ensuring that the new mapping is established before the onlined CPU starts running. Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Reviewed-by: Ming Lei <tom.leiming@gmail.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Ming Lei <tom.leiming@gmail.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@fb.com>
120 lines
2.5 KiB
C
120 lines
2.5 KiB
C
/*
 * CPU <-> hardware queue mapping helpers
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
|
|
#include <linux/kernel.h>
|
|
#include <linux/threads.h>
|
|
#include <linux/module.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/blk-mq.h>
|
|
#include "blk.h"
|
|
#include "blk-mq.h"
|
|
|
|
/*
 * Map entry number @cpu (out of @nr_cpus entries) onto one of @nr_queues
 * queues by scaling it proportionally: cpu * nr_queues / nr_cpus.
 *
 * The computation is carried out in unsigned arithmetic with truncating
 * division. @nr_cpus must be non-zero.
 */
static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
			      const int cpu)
{
	return cpu * nr_queues / nr_cpus;
}
|
|
|
|
static int get_first_sibling(unsigned int cpu)
|
|
{
|
|
unsigned int ret;
|
|
|
|
ret = cpumask_first(topology_sibling_cpumask(cpu));
|
|
if (ret < nr_cpu_ids)
|
|
return ret;
|
|
|
|
return cpu;
|
|
}
|
|
|
|
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
|
|
const struct cpumask *online_mask)
|
|
{
|
|
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
|
|
cpumask_var_t cpus;
|
|
|
|
if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
|
|
return 1;
|
|
|
|
cpumask_clear(cpus);
|
|
nr_cpus = nr_uniq_cpus = 0;
|
|
for_each_cpu(i, online_mask) {
|
|
nr_cpus++;
|
|
first_sibling = get_first_sibling(i);
|
|
if (!cpumask_test_cpu(first_sibling, cpus))
|
|
nr_uniq_cpus++;
|
|
cpumask_set_cpu(i, cpus);
|
|
}
|
|
|
|
queue = 0;
|
|
for_each_possible_cpu(i) {
|
|
if (!cpumask_test_cpu(i, online_mask)) {
|
|
map[i] = 0;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* Easy case - we have equal or more hardware queues. Or
|
|
* there are no thread siblings to take into account. Do
|
|
* 1:1 if enough, or sequential mapping if less.
|
|
*/
|
|
if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
|
|
map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
|
|
queue++;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* Less then nr_cpus queues, and we have some number of
|
|
* threads per cores. Map sibling threads to the same
|
|
* queue.
|
|
*/
|
|
first_sibling = get_first_sibling(i);
|
|
if (first_sibling == i) {
|
|
map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
|
|
queue);
|
|
queue++;
|
|
} else
|
|
map[i] = map[first_sibling];
|
|
}
|
|
|
|
free_cpumask_var(cpus);
|
|
return 0;
|
|
}
|
|
|
|
unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
|
|
{
|
|
unsigned int *map;
|
|
|
|
/* If cpus are offline, map them to first hctx */
|
|
map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
|
|
set->numa_node);
|
|
if (!map)
|
|
return NULL;
|
|
|
|
if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
|
|
return map;
|
|
|
|
kfree(map);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* We have no quick way of doing reverse lookups. This is only used at
|
|
* queue init time, so runtime isn't important.
|
|
*/
|
|
int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
|
|
{
|
|
int i;
|
|
|
|
for_each_possible_cpu(i) {
|
|
if (index == mq_map[i])
|
|
return cpu_to_node(i);
|
|
}
|
|
|
|
return NUMA_NO_NODE;
|
|
}
|