From 1e3335de05da3dfbe48b8caa03db1834a2133256 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Feb 2007 19:59:49 +0100 Subject: [PATCH 01/18] cfq-iosched: improve preemption for cooperating tasks When testing the syslet async io approach, I discovered that CFQ sometimes didn't perform as well as expected. cfq_should_preempt() needs to better check for cooperating tasks, so fix that by allowing preemption of an equal priority queue if the recently queued request is as good a candidate for IO as the one we are currently waiting for. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f92ba2a869b4..bfb396774cbb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -867,15 +867,11 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd) static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_remove_request(rq); cfqq->on_dispatch[rq_is_sync(rq)]++; elv_dispatch_sort(q, rq); - - rq = list_entry(q->queue_head.prev, struct request, queuelist); - cfqd->last_sector = rq->sector + rq->nr_sectors; } /* @@ -1585,6 +1581,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, struct request *rq) { struct cfq_queue *cfqq = cfqd->active_queue; + sector_t dist; if (cfq_class_idle(new_cfqq)) return 0; @@ -1594,14 +1591,14 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return 1; - if (!cfq_cfqq_wait_request(new_cfqq)) - return 0; + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return 1; + /* * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. 
@@ -1609,6 +1606,21 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_meta(rq) && !cfqq->meta_pending) return 1; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) + return 0; + + /* + * if this request is as-good as one we would expect from the + * current cfqq, let it preempt + */ + if (rq->sector > cfqd->last_sector) + dist = rq->sector - cfqd->last_sector; + else + dist = cfqd->last_sector - rq->sector; + + if (dist <= cfqd->active_cic->seek_mean) + return 1; + return 0; } @@ -1719,6 +1731,8 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) cfqq->on_dispatch[sync]--; cfqq->service_last = now; + cfqd->last_sector = rq->hard_sector + rq->hard_nr_sectors; + if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; From 6d048f5310aa2dda2b5acd947eab3598c25e269f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Apr 2007 12:44:27 +0200 Subject: [PATCH 02/18] cfq-iosched: development update - Implement logic for detecting cooperating processes, so we choose the best available queue whenever possible. - Improve residual slice time accounting. - Remove dead code: we no longer see async requests coming in on sync queues. That part was removed a long time ago. That means that we can also remove the difference between cfq_cfqq_sync() and cfq_cfqq_class_sync(), they are now identical. And we can kill the on_dispatch array, just make it a counter. - Allow a process to go into the current list, if it hasn't been serviced in this scheduler tick yet. Possible future improvements include caching the cfqq lookup in cfq_close_cooperator(), so we don't have to look it up twice. cfq_get_best_queue() should just use that last decision instead of doing it again.
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 383 ++++++++++++++++++++++++++++++-------------- 1 file changed, 262 insertions(+), 121 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index bfb396774cbb..28236f2cd908 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -56,13 +56,7 @@ static struct completion *ioc_gone; #define ASYNC (0) #define SYNC (1) -#define cfq_cfqq_dispatched(cfqq) \ - ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC]) - -#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) - -#define cfq_cfqq_sync(cfqq) \ - (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) +#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) #define sample_valid(samples) ((samples) > 80) @@ -79,6 +73,7 @@ struct cfq_data { struct list_head busy_rr; struct list_head cur_rr; struct list_head idle_rr; + unsigned long cur_rr_tick; unsigned int busy_queues; /* @@ -98,11 +93,12 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_context *active_cic; int cur_prio, cur_end_prio; + unsigned long prio_time; unsigned int dispatch_slice; struct timer_list idle_class_timer; - sector_t last_sector; + sector_t last_position; unsigned long last_end_request; /* @@ -117,6 +113,9 @@ struct cfq_data { unsigned int cfq_slice_idle; struct list_head cic_list; + + sector_t new_seek_mean; + u64 new_seek_total; }; /* @@ -133,6 +132,8 @@ struct cfq_queue { unsigned int key; /* member of the rr/busy/cur/idle cfqd list */ struct list_head cfq_list; + /* in what tick we were last serviced */ + unsigned long rr_tick; /* sorted list of pending requests */ struct rb_root sort_list; /* if fifo isn't expired, next request to serve */ @@ -148,10 +149,11 @@ struct cfq_queue { unsigned long slice_end; unsigned long service_last; + unsigned long slice_start; long slice_resid; - /* number of requests that are on the dispatch list */ - int on_dispatch[2]; + /* number of requests that are on the dispatch list or inside driver */ + int 
dispatched; /* io prio of this group */ unsigned short ioprio, org_ioprio; @@ -159,6 +161,8 @@ struct cfq_queue { /* various state flags, see below */ unsigned int flags; + + sector_t last_request_pos; }; enum cfqq_state_flags { @@ -259,6 +263,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) * easily introduce oscillations. */ cfqq->slice_resid = 0; + + cfqq->slice_start = jiffies; } /* @@ -307,7 +313,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) s1 = rq1->sector; s2 = rq2->sector; - last = cfqd->last_sector; + last = cfqd->last_position; /* * by definition, 1KiB is 2 sectors @@ -398,39 +404,42 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, return cfq_choose_req(cfqd, next, prev); } -static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) +/* + * This function finds out where to insert a BE queue in the service hierarchy + */ +static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, + int preempted) { - struct cfq_data *cfqd = cfqq->cfqd; struct list_head *list, *n; struct cfq_queue *__cfqq; + int add_tail = 0; /* - * Resorting requires the cfqq to be on the RR list already. + * if cfqq has requests in flight, don't allow it to be + * found in cfq_set_active_queue before it has finished them. 
+ * this is done to increase fairness between a process that + * has lots of io pending vs one that only generates one + * sporadically or synchronously */ - if (!cfq_cfqq_on_rr(cfqq)) - return; - - list_del(&cfqq->cfq_list); - - if (cfq_class_rt(cfqq)) + if (cfqq->dispatched) + list = &cfqd->busy_rr; + else if (cfqq->ioprio == (cfqd->cur_prio + 1) && + cfq_cfqq_sync(cfqq) && + (time_before(cfqd->prio_time, cfqq->service_last) || + cfq_cfqq_queue_new(cfqq) || preempted)) { list = &cfqd->cur_rr; - else if (cfq_class_idle(cfqq)) - list = &cfqd->idle_rr; - else { - /* - * if cfqq has requests in flight, don't allow it to be - * found in cfq_set_active_queue before it has finished them. - * this is done to increase fairness between a process that - * has lots of io pending vs one that only generates one - * sporadically or synchronously - */ - if (cfq_cfqq_dispatched(cfqq)) - list = &cfqd->busy_rr; - else - list = &cfqd->rr_list[cfqq->ioprio]; - } + add_tail = 1; + } else + list = &cfqd->rr_list[cfqq->ioprio]; - if (preempted || cfq_cfqq_queue_new(cfqq)) { + if (!cfq_cfqq_sync(cfqq) || add_tail) { + /* + * async queue always goes to the end. this wont be overly + * unfair to writes, as the sort of the sync queue wont be + * allowed to pass the async queue again. + */ + list_add_tail(&cfqq->cfq_list, list); + } else if (preempted || cfq_cfqq_queue_new(cfqq)) { /* * If this queue was preempted or is new (never been serviced), * let it be added first for fairness but beind other new @@ -444,14 +453,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) n = n->next; } - list_add_tail(&cfqq->cfq_list, n); - } else if (!cfq_cfqq_class_sync(cfqq)) { - /* - * async queue always goes to the end. this wont be overly - * unfair to writes, as the sort of the sync queue wont be - * allowed to pass the async queue again. 
- */ - list_add_tail(&cfqq->cfq_list, list); + list_add(&cfqq->cfq_list, n); } else { /* * sort by last service, but don't cross a new or async @@ -461,17 +463,54 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) */ n = list; while ((n = n->prev) != list) { - struct cfq_queue *__cfqq = list_entry_cfqq(n); + struct cfq_queue *__c = list_entry_cfqq(n); - if (!cfq_cfqq_class_sync(cfqq) || !__cfqq->service_last) + if (!cfq_cfqq_sync(__c) || !__c->service_last) break; - if (time_before(__cfqq->service_last, cfqq->service_last)) + if (time_before(__c->service_last, cfqq->service_last)) break; } list_add(&cfqq->cfq_list, n); } } +static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) +{ + struct cfq_data *cfqd = cfqq->cfqd; + struct list_head *n; + + /* + * Resorting requires the cfqq to be on the RR list already. + */ + if (!cfq_cfqq_on_rr(cfqq)) + return; + + list_del(&cfqq->cfq_list); + + if (cfq_class_rt(cfqq)) { + /* + * At to the front of the current list, but behind other + * RT queues. 
+ */ + n = &cfqd->cur_rr; + while (n->next != &cfqd->cur_rr) + if (!cfq_class_rt(cfqq)) + break; + + list_add(&cfqq->cfq_list, n); + } else if (cfq_class_idle(cfqq)) { + /* + * IDLE goes to the tail of the idle list + */ + list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr); + } else { + /* + * So we get here, ergo the queue is a regular best-effort queue + */ + cfq_resort_be_queue(cfqd, cfqq, preempted); + } +} + /* * add to busy list of queues for service, trying to be fair in ordering * the pending list according to last request service @@ -579,6 +618,8 @@ static void cfq_activate_request(request_queue_t *q, struct request *rq) */ if (!cfqd->hw_tag && cfqd->rq_in_driver > 4) cfqd->hw_tag = 1; + + cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; } static void cfq_deactivate_request(request_queue_t *q, struct request *rq) @@ -684,6 +725,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_clear_cfqq_must_alloc_slice(cfqq); cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); + cfqq->rr_tick = cfqd->cur_rr_tick; } cfqd->active_queue = cfqq; @@ -786,10 +828,46 @@ static int cfq_get_next_prio_level(struct cfq_data *cfqd) cfqd->cur_end_prio = 0; } + cfqd->cur_rr_tick++; + cfqd->prio_time = jiffies; return prio; } -static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) +static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, + struct request *rq) +{ + if (rq->sector >= cfqd->last_position) + return rq->sector - cfqd->last_position; + else + return cfqd->last_position - rq->sector; +} + +static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd) +{ + struct cfq_queue *cfqq = NULL, *__cfqq; + sector_t best = -1, dist; + + list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) { + if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq)) + continue; + + dist = cfq_dist_from_last(cfqd, __cfqq->next_rq); + if (dist < best) { + best = dist; + cfqq = __cfqq; + } + } + + /* + * Only async queue(s) available, grab 
first entry + */ + if (!cfqq) + cfqq = list_entry_cfqq(cfqd->cur_rr.next); + + return cfqq; +} + +static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq = NULL; @@ -799,7 +877,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) * empty, get next prio level and grab first entry then if any * are spliced */ - cfqq = list_entry_cfqq(cfqd->cur_rr.next); + cfqq = cfq_get_best_queue(cfqd); } else if (!list_empty(&cfqd->busy_rr)) { /* * If no new queues are available, check if the busy list has @@ -820,49 +898,128 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) mod_timer(&cfqd->idle_class_timer, end); } + return cfqq; +} + +static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) +{ + struct cfq_queue *cfqq; + + do { + long prio; + + cfqq = cfq_get_next_queue(cfqd); + if (!cfqq) + break; + + prio = cfq_prio_to_slice(cfqd, cfqq); + if (cfqq->slice_resid > -prio) + break; + + cfqq->slice_resid += prio; + list_del_init(&cfqq->cfq_list); + list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]); + cfqq = NULL; + } while (1); + __cfq_set_active_queue(cfqd, cfqq); return cfqq; } -#define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024)) +static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) +{ + struct cfq_io_context *cic = cfqd->active_cic; -static int cfq_arm_slice_timer(struct cfq_data *cfqd) + if (!sample_valid(cic->seek_samples)) + return 0; + + return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; +} + +static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd, + struct cfq_queue *cur_cfqq, + struct list_head *list) +{ + struct cfq_queue *cfqq; + + list_for_each_entry(cfqq, list, cfq_list) { + if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq)) + continue; + + BUG_ON(!cfqq->next_rq); + + if (cfq_rq_close(cfqd, cfqq->next_rq)) + return cfqq; + } + + return NULL; +} + +static int cfq_close_cooperator(struct cfq_data *cfqd, + struct cfq_queue 
*cur_cfqq) +{ + struct cfq_queue *cfqq; + + if (!cfqd->busy_queues) + return 0; + + /* + * check cur_rr and same-prio rr_list for candidates + */ + cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr); + if (cfqq) + return 1; + + cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]); + if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick)) + cfqq = NULL; + + return cfqq != NULL; +} + +#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) + +static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; unsigned long sl; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); + WARN_ON(cfq_cfqq_slice_new(cfqq)); /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle) - return 0; - if (!cfq_cfqq_idle_window(cfqq)) - return 0; + if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) + return; + /* * task has exited, don't wait */ cic = cfqd->active_cic; if (!cic || !cic->ioc->task) - return 0; + return; + + /* + * See if this prio level has a good candidate + */ + if (cfq_close_cooperator(cfqd, cfqq)) + return; cfq_mark_cfqq_must_dispatch(cfqq); cfq_mark_cfqq_wait_request(cfqq); - sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); - /* * we don't want to idle for seeks, but we do want to allow * fair distribution of slice time for a process doing back-to-back * seeks. 
so allow a little bit of time for him to submit a new rq */ + sl = cfqd->cfq_slice_idle; if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) sl = min(sl, msecs_to_jiffies(2)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - return 1; } static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) @@ -870,7 +1027,7 @@ static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_remove_request(rq); - cfqq->on_dispatch[rq_is_sync(rq)]++; + cfqq->dispatched++; elv_dispatch_sort(q, rq); } @@ -891,13 +1048,13 @@ static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (list_empty(&cfqq->fifo)) return NULL; - fifo = cfq_cfqq_class_sync(cfqq); + fifo = cfq_cfqq_sync(cfqq); rq = rq_entry_fifo(cfqq->fifo.next); - if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) - return rq; + if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) + return NULL; - return NULL; + return rq; } static inline int @@ -922,23 +1079,26 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto new_queue; /* - * slice has expired + * The active queue has run out of time, expire it and select new. */ - if (!cfq_cfqq_must_dispatch(cfqq) && cfq_slice_used(cfqq)) + if (cfq_slice_used(cfqq)) goto expire; /* - * if queue has requests, dispatch one. if not, check if - * enough slice is left to wait for one + * The active queue has requests and isn't expired, allow it to + * dispatch. */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto keep_queue; - else if (cfq_cfqq_slice_new(cfqq) || cfq_cfqq_dispatched(cfqq)) { + + /* + * No requests pending. If the active queue still has requests in + * flight or is idling for a new request, allow either of these + * conditions to happen (or time out) before selecting a new queue. 
+ */ + if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) { cfqq = NULL; goto keep_queue; - } else if (cfq_cfqq_class_sync(cfqq)) { - if (cfq_arm_slice_timer(cfqd)) - return NULL; } expire: @@ -1039,7 +1199,7 @@ static int cfq_dispatch_requests(request_queue_t *q, int force) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq, *prev_cfqq; + struct cfq_queue *cfqq; int dispatched; if (!cfqd->busy_queues) @@ -1049,23 +1209,19 @@ cfq_dispatch_requests(request_queue_t *q, int force) return cfq_forced_dispatch(cfqd); dispatched = 0; - prev_cfqq = NULL; while ((cfqq = cfq_select_queue(cfqd)) != NULL) { int max_dispatch; if (cfqd->busy_queues > 1) { - /* - * Don't repeat dispatch from the previous queue. - */ - if (prev_cfqq == cfqq) - break; - /* * So we have dispatched before in this round, if the * next queue has idling enabled (must be sync), don't - * allow it service until the previous have continued. + * allow it service until the previous have completed. 
*/ - if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq)) + if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) && + dispatched) + break; + if (cfqq->dispatched >= cfqd->cfq_quantum) break; } @@ -1078,7 +1234,6 @@ cfq_dispatch_requests(request_queue_t *q, int force) max_dispatch = 1; dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); - prev_cfqq = cfqq; } return dispatched; @@ -1520,7 +1675,8 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) } static void -cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq) +cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, + struct request *rq) { sector_t sdist; u64 total; @@ -1530,6 +1686,11 @@ cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq) else sdist = cic->last_request_pos - rq->sector; + if (!cic->seek_samples) { + cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; + cfqd->new_seek_mean = cfqd->new_seek_total / 256; + } + /* * Don't allow the seek distance to get too large from the * odd fragment, pagein, etc @@ -1580,13 +1741,16 @@ static int cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, struct request *rq) { - struct cfq_queue *cfqq = cfqd->active_queue; - sector_t dist; + struct cfq_queue *cfqq; - if (cfq_class_idle(new_cfqq)) + cfqq = cfqd->active_queue; + if (!cfqq) return 0; - if (!cfqq) + if (cfq_slice_used(cfqq)) + return 1; + + if (cfq_class_idle(new_cfqq)) return 0; if (cfq_class_idle(cfqq)) @@ -1613,12 +1777,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (rq->sector > cfqd->last_sector) - dist = rq->sector - cfqd->last_sector; - else - dist = cfqd->last_sector - rq->sector; - - if (dist <= cfqd->active_cic->seek_mean) + if (cfq_rq_close(cfqd, rq)) return 1; return 0; @@ -1656,28 +1815,12 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct 
cfq_queue *cfqq, if (rq_is_meta(rq)) cfqq->meta_pending++; - /* - * we never wait for an async request and we don't allow preemption - * of an async request. so just return early - */ - if (!rq_is_sync(rq)) { - /* - * sync process issued an async request, if it's waiting - * then expire it and kick rq handling. - */ - if (cic == cfqd->active_cic && - del_timer(&cfqd->idle_slice_timer)) { - cfq_slice_expired(cfqd, 0, 0); - blk_start_queueing(cfqd->queue); - } - return; - } - cfq_update_io_thinktime(cfqd, cic); - cfq_update_io_seektime(cic, rq); + cfq_update_io_seektime(cfqd, cic, rq); cfq_update_idle_window(cfqd, cfqq, cic); cic->last_request_pos = rq->sector + rq->nr_sectors; + cfqq->last_request_pos = cic->last_request_pos; if (cfqq == cfqd->active_queue) { /* @@ -1726,13 +1869,11 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) now = jiffies; WARN_ON(!cfqd->rq_in_driver); - WARN_ON(!cfqq->on_dispatch[sync]); + WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; - cfqq->on_dispatch[sync]--; + cfqq->dispatched--; cfqq->service_last = now; - cfqd->last_sector = rq->hard_sector + rq->hard_nr_sectors; - if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; @@ -1752,11 +1893,12 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) } if (cfq_slice_used(cfqq)) cfq_slice_expired(cfqd, 0, 1); - else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) { - if (!cfq_arm_slice_timer(cfqd)) - cfq_schedule_dispatch(cfqd); - } + else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_arm_slice_timer(cfqd); } + + if (!cfqd->rq_in_driver) + cfq_schedule_dispatch(cfqd); } /* @@ -2101,7 +2243,6 @@ static int __init cfq_slab_setup(void) /* * sysfs parts below --> */ - static ssize_t cfq_var_show(unsigned int var, char *page) { From 1afba0451c83cbff622a08f2d86fbb2e680dfd5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Apr 2007 12:47:55 +0200 Subject: [PATCH 03/18] cfq-iosched: minor updates - Move the queue_new flag clear to when 
the queue is selected - Only select the non-first queue in cfq_get_best_queue(), if there's a substantial difference between the best and first. - Get rid of ->busy_rr - Only select a close cooperator, if the current queue is known to take a while to "think". Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 81 ++++++++++----------------------------------- 1 file changed, 18 insertions(+), 63 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 28236f2cd908..9d6f04103f01 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -70,7 +70,6 @@ struct cfq_data { * rr list of queues with requests and the count of them */ struct list_head rr_list[CFQ_PRIO_LISTS]; - struct list_head busy_rr; struct list_head cur_rr; struct list_head idle_rr; unsigned long cur_rr_tick; @@ -410,59 +409,18 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, int preempted) { - struct list_head *list, *n; - struct cfq_queue *__cfqq; - int add_tail = 0; + if (!cfq_cfqq_sync(cfqq)) + list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]); + else { + struct list_head *n = &cfqd->rr_list[cfqq->ioprio]; - /* - * if cfqq has requests in flight, don't allow it to be - * found in cfq_set_active_queue before it has finished them. - * this is done to increase fairness between a process that - * has lots of io pending vs one that only generates one - * sporadically or synchronously - */ - if (cfqq->dispatched) - list = &cfqd->busy_rr; - else if (cfqq->ioprio == (cfqd->cur_prio + 1) && - cfq_cfqq_sync(cfqq) && - (time_before(cfqd->prio_time, cfqq->service_last) || - cfq_cfqq_queue_new(cfqq) || preempted)) { - list = &cfqd->cur_rr; - add_tail = 1; - } else - list = &cfqd->rr_list[cfqq->ioprio]; - - if (!cfq_cfqq_sync(cfqq) || add_tail) { - /* - * async queue always goes to the end. 
this wont be overly - * unfair to writes, as the sort of the sync queue wont be - * allowed to pass the async queue again. - */ - list_add_tail(&cfqq->cfq_list, list); - } else if (preempted || cfq_cfqq_queue_new(cfqq)) { - /* - * If this queue was preempted or is new (never been serviced), - * let it be added first for fairness but beind other new - * queues. - */ - n = list; - while (n->next != list) { - __cfqq = list_entry_cfqq(n->next); - if (!cfq_cfqq_queue_new(__cfqq)) - break; - - n = n->next; - } - list_add(&cfqq->cfq_list, n); - } else { /* * sort by last service, but don't cross a new or async - * queue. we don't cross a new queue because it hasn't been - * service before, and we don't cross an async queue because - * it gets added to the end on expire. + * queue. we don't cross a new queue because it hasn't + * been service before, and we don't cross an async + * queue because it gets added to the end on expire. */ - n = list; - while ((n = n->prev) != list) { + while ((n = n->prev) != &cfqd->rr_list[cfqq->ioprio]) { struct cfq_queue *__c = list_entry_cfqq(n); if (!cfq_cfqq_sync(__c) || !__c->service_last) @@ -725,6 +683,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_clear_cfqq_must_alloc_slice(cfqq); cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); + cfq_clear_cfqq_queue_new(cfqq); cfqq->rr_tick = cfqd->cur_rr_tick; } @@ -743,7 +702,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_must_dispatch(cfqq); cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_queue_new(cfqq); /* * store what was left of this slice, if the queue idled out @@ -845,13 +803,15 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq = NULL, *__cfqq; - sector_t best = -1, dist; + sector_t best = -1, first = -1, dist; list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) { if (!__cfqq->next_rq || 
!cfq_cfqq_sync(__cfqq)) continue; dist = cfq_dist_from_last(cfqd, __cfqq->next_rq); + if (first == -1) + first = dist; if (dist < best) { best = dist; cfqq = __cfqq; @@ -859,9 +819,11 @@ static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd) } /* - * Only async queue(s) available, grab first entry + * Only async queue(s) available, grab first entry. Do the same + * if the difference between the first and best isn't more than + * twice, to obey fairness. */ - if (!cfqq) + if (!cfqq || (best && first != best && ((first / best) < 4))) cfqq = list_entry_cfqq(cfqd->cur_rr.next); return cfqq; @@ -878,12 +840,6 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) * are spliced */ cfqq = cfq_get_best_queue(cfqd); - } else if (!list_empty(&cfqd->busy_rr)) { - /* - * If no new queues are available, check if the busy list has - * some before falling back to idle io. - */ - cfqq = list_entry_cfqq(cfqd->busy_rr.next); } else if (!list_empty(&cfqd->idle_rr)) { /* * if we have idle queues and no rt or be queues had pending @@ -1004,7 +960,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * See if this prio level has a good candidate */ - if (cfq_close_cooperator(cfqd, cfqq)) + if (cfq_close_cooperator(cfqd, cfqq) && + (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2)) return; cfq_mark_cfqq_must_dispatch(cfqq); @@ -1184,7 +1141,6 @@ cfq_forced_dispatch(struct cfq_data *cfqd) for (i = 0; i < CFQ_PRIO_LISTS; i++) dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]); - dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr); dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr); @@ -2174,7 +2130,6 @@ static void *cfq_init_queue(request_queue_t *q) for (i = 0; i < CFQ_PRIO_LISTS; i++) INIT_LIST_HEAD(&cfqd->rr_list[i]); - INIT_LIST_HEAD(&cfqd->busy_rr); INIT_LIST_HEAD(&cfqd->cur_rr); INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->cic_list); From 
d9e7620e60bc6648c3dcabbc8d1a320b69c846f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Apr 2007 14:27:50 +0200 Subject: [PATCH 04/18] cfq-iosched: rework the whole round-robin list concept Drawing on some inspiration from the CFS CPU scheduler design, overhaul the pending cfq_queue concept list management. Currently CFQ uses a doubly linked list per priority level for sorting and service uses. Kill those lists and maintain an rbtree of cfq_queue's, sorted by when to service them. This unfortunately means that the ionice levels aren't as strong anymore, will work on improving those later. We only scale the slice time now, not the number of times we service. This means that latency is better (for all priority levels), but that the distinction between the highest and lower levels aren't as big. The diffstat speaks for itself. cfq-iosched.c | 363 +++++++++++++++++--------------------------------- 1 file changed, 125 insertions(+), 238 deletions(-) Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 361 +++++++++++++++----------------------------- 1 file changed, 123 insertions(+), 238 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9d6f04103f01..4838c2b16f2c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -26,7 +26,16 @@ static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +/* + * grace period before allowing idle class to get disk access + */ #define CFQ_IDLE_GRACE (HZ / 10) + +/* + * below this threshold, we consider thinktime immediate + */ +#define CFQ_MIN_TT (2) + #define CFQ_SLICE_SCALE (5) #define CFQ_KEY_ASYNC (0) @@ -69,10 +78,9 @@ struct cfq_data { /* * rr list of queues with requests and the count of them */ - struct list_head rr_list[CFQ_PRIO_LISTS]; + struct rb_root service_tree; struct list_head cur_rr; struct list_head idle_rr; - unsigned long cur_rr_tick; unsigned int busy_queues; /* @@ -91,8 +99,6 @@ struct cfq_data { struct cfq_queue 
*active_queue; struct cfq_io_context *active_cic; - int cur_prio, cur_end_prio; - unsigned long prio_time; unsigned int dispatch_slice; struct timer_list idle_class_timer; @@ -131,8 +137,10 @@ struct cfq_queue { unsigned int key; /* member of the rr/busy/cur/idle cfqd list */ struct list_head cfq_list; - /* in what tick we were last serviced */ - unsigned long rr_tick; + /* service_tree member */ + struct rb_node rb_node; + /* service_tree key */ + unsigned long rb_key; /* sorted list of pending requests */ struct rb_root sort_list; /* if fifo isn't expired, next request to serve */ @@ -147,8 +155,6 @@ struct cfq_queue { struct list_head fifo; unsigned long slice_end; - unsigned long service_last; - unsigned long slice_start; long slice_resid; /* number of requests that are on the dispatch list or inside driver */ @@ -240,30 +246,26 @@ static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync) * if a queue is marked sync and has sync io queued. A sync queue with async * io only, should not get full sync slice length. */ +static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync, + unsigned short prio) +{ + const int base_slice = cfqd->cfq_slice[sync]; + + WARN_ON(prio >= IOPRIO_BE_NR); + + return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); +} + static inline int cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; - - WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - - return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); + return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; - cfqq->slice_end += cfqq->slice_resid; - - /* - * Don't carry over residual for more than one slice, we only want - * to slightly correct the fairness. 
Carrying over forever would - * easily introduce oscillations. - */ - cfqq->slice_resid = 0; - - cfqq->slice_start = jiffies; } /* @@ -403,33 +405,50 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, return cfq_choose_req(cfqd, next, prev); } -/* - * This function finds out where to insert a BE queue in the service hierarchy - */ -static void cfq_resort_be_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, - int preempted) +static unsigned long cfq_slice_offset(struct cfq_data *cfqd, + struct cfq_queue *cfqq) { - if (!cfq_cfqq_sync(cfqq)) - list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]); - else { - struct list_head *n = &cfqd->rr_list[cfqq->ioprio]; + /* + * just an approximation, should be ok. + */ + return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0)); +} +static void cfq_service_tree_add(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + struct rb_node **p = &cfqd->service_tree.rb_node; + struct rb_node *parent = NULL; + struct cfq_queue *__cfqq; + unsigned long rb_key; + + rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; + rb_key += cfqq->slice_resid; + cfqq->slice_resid = 0; + + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { /* - * sort by last service, but don't cross a new or async - * queue. we don't cross a new queue because it hasn't - * been service before, and we don't cross an async - * queue because it gets added to the end on expire. 
+ * same position, nothing more to do */ - while ((n = n->prev) != &cfqd->rr_list[cfqq->ioprio]) { - struct cfq_queue *__c = list_entry_cfqq(n); + if (rb_key == cfqq->rb_key) + return; - if (!cfq_cfqq_sync(__c) || !__c->service_last) - break; - if (time_before(__c->service_last, cfqq->service_last)) - break; - } - list_add(&cfqq->cfq_list, n); + rb_erase(&cfqq->rb_node, &cfqd->service_tree); } + + while (*p) { + parent = *p; + __cfqq = rb_entry(parent, struct cfq_queue, rb_node); + + if (rb_key < __cfqq->rb_key) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + cfqq->rb_key = rb_key; + rb_link_node(&cfqq->rb_node, parent, p); + rb_insert_color(&cfqq->rb_node, &cfqd->service_tree); } static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) @@ -443,7 +462,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) if (!cfq_cfqq_on_rr(cfqq)) return; - list_del(&cfqq->cfq_list); + list_del_init(&cfqq->cfq_list); if (cfq_class_rt(cfqq)) { /* @@ -465,7 +484,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) /* * So we get here, ergo the queue is a regular best-effort queue */ - cfq_resort_be_queue(cfqd, cfqq, preempted); + cfq_service_tree_add(cfqd, cfqq); } } @@ -490,6 +509,11 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_clear_cfqq_on_rr(cfqq); list_del_init(&cfqq->cfq_list); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + rb_erase(&cfqq->rb_node, &cfqd->service_tree); + RB_CLEAR_NODE(&cfqq->rb_node); + } + BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -684,7 +708,6 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); cfq_clear_cfqq_queue_new(cfqq); - cfqq->rr_tick = cfqd->cur_rr_tick; } cfqd->active_queue = cfqq; @@ -732,114 +755,19 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted, __cfq_slice_expired(cfqd, cfqq, preempted, timed_out); } -/* - * 0 - * 0,1 - * 0,1,2 - * 
0,1,2,3 - * 0,1,2,3,4 - * 0,1,2,3,4,5 - * 0,1,2,3,4,5,6 - * 0,1,2,3,4,5,6,7 - */ -static int cfq_get_next_prio_level(struct cfq_data *cfqd) -{ - int prio, wrap; - - prio = -1; - wrap = 0; - do { - int p; - - for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { - if (!list_empty(&cfqd->rr_list[p])) { - prio = p; - break; - } - } - - if (prio != -1) - break; - cfqd->cur_prio = 0; - if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { - cfqd->cur_end_prio = 0; - if (wrap) - break; - wrap = 1; - } - } while (1); - - if (unlikely(prio == -1)) - return -1; - - BUG_ON(prio >= CFQ_PRIO_LISTS); - - list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); - - cfqd->cur_prio = prio + 1; - if (cfqd->cur_prio > cfqd->cur_end_prio) { - cfqd->cur_end_prio = cfqd->cur_prio; - cfqd->cur_prio = 0; - } - if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { - cfqd->cur_prio = 0; - cfqd->cur_end_prio = 0; - } - - cfqd->cur_rr_tick++; - cfqd->prio_time = jiffies; - return prio; -} - -static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, - struct request *rq) -{ - if (rq->sector >= cfqd->last_position) - return rq->sector - cfqd->last_position; - else - return cfqd->last_position - rq->sector; -} - -static struct cfq_queue *cfq_get_best_queue(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq = NULL, *__cfqq; - sector_t best = -1, first = -1, dist; - - list_for_each_entry(__cfqq, &cfqd->cur_rr, cfq_list) { - if (!__cfqq->next_rq || !cfq_cfqq_sync(__cfqq)) - continue; - - dist = cfq_dist_from_last(cfqd, __cfqq->next_rq); - if (first == -1) - first = dist; - if (dist < best) { - best = dist; - cfqq = __cfqq; - } - } - - /* - * Only async queue(s) available, grab first entry. Do the same - * if the difference between the first and best isn't more than - * twice, to obey fairness. 
- */ - if (!cfqq || (best && first != best && ((first / best) < 4))) - cfqq = list_entry_cfqq(cfqd->cur_rr.next); - - return cfqq; -} - static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq = NULL; - if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) { + if (!list_empty(&cfqd->cur_rr)) { /* - * if current list is non-empty, grab first entry. if it is - * empty, get next prio level and grab first entry then if any - * are spliced + * if current list is non-empty, grab first entry. */ - cfqq = cfq_get_best_queue(cfqd); + cfqq = list_entry_cfqq(cfqd->cur_rr.next); + } else if (!RB_EMPTY_ROOT(&cfqd->service_tree)) { + struct rb_node *n = rb_first(&cfqd->service_tree); + + cfqq = rb_entry(n, struct cfq_queue, rb_node); } else if (!list_empty(&cfqd->idle_rr)) { /* * if we have idle queues and no rt or be queues had pending @@ -861,27 +789,20 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq; - do { - long prio; - - cfqq = cfq_get_next_queue(cfqd); - if (!cfqq) - break; - - prio = cfq_prio_to_slice(cfqd, cfqq); - if (cfqq->slice_resid > -prio) - break; - - cfqq->slice_resid += prio; - list_del_init(&cfqq->cfq_list); - list_add_tail(&cfqq->cfq_list, &cfqd->rr_list[cfqq->ioprio]); - cfqq = NULL; - } while (1); - + cfqq = cfq_get_next_queue(cfqd); __cfq_set_active_queue(cfqd, cfqq); return cfqq; } +static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, + struct request *rq) +{ + if (rq->sector >= cfqd->last_position) + return rq->sector - cfqd->last_position; + else + return cfqd->last_position - rq->sector; +} + static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) { struct cfq_io_context *cic = cfqd->active_cic; @@ -892,45 +813,15 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; } -static struct cfq_queue *__cfq_close_cooperator(struct cfq_data *cfqd, - 
struct cfq_queue *cur_cfqq, - struct list_head *list) +static int cfq_close_cooperator(struct cfq_data *cfq_data, + struct cfq_queue *cfqq) { - struct cfq_queue *cfqq; - - list_for_each_entry(cfqq, list, cfq_list) { - if (cfqq == cur_cfqq || !cfq_cfqq_sync(cfqq)) - continue; - - BUG_ON(!cfqq->next_rq); - - if (cfq_rq_close(cfqd, cfqq->next_rq)) - return cfqq; - } - - return NULL; -} - -static int cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq) -{ - struct cfq_queue *cfqq; - - if (!cfqd->busy_queues) - return 0; - /* - * check cur_rr and same-prio rr_list for candidates + * We should notice if some of the queues are cooperating, eg + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. */ - cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->cur_rr); - if (cfqq) - return 1; - - cfqq = __cfq_close_cooperator(cfqd, cur_cfqq, &cfqd->rr_list[cur_cfqq->ioprio]); - if (cfqq && (cfqq->rr_tick == cfqd->cur_rr_tick)) - cfqq = NULL; - - return cfqq != NULL; + return 0; } #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) @@ -974,7 +865,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ sl = cfqd->cfq_slice_idle; if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) - sl = min(sl, msecs_to_jiffies(2)); + sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); } @@ -1115,31 +1006,41 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, return dispatched; } -static int -cfq_forced_dispatch_cfqqs(struct list_head *list) +static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) +{ + int dispatched = 0; + + while (cfqq->next_rq) { + cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&cfqq->fifo)); + return dispatched; +} + +static int cfq_forced_dispatch_cfqqs(struct list_head *list) { struct cfq_queue *cfqq, *next; int dispatched; dispatched = 0; - 
list_for_each_entry_safe(cfqq, next, list, cfq_list) { - while (cfqq->next_rq) { - cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); - dispatched++; - } - BUG_ON(!list_empty(&cfqq->fifo)); - } + list_for_each_entry_safe(cfqq, next, list, cfq_list) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); return dispatched; } -static int -cfq_forced_dispatch(struct cfq_data *cfqd) +static int cfq_forced_dispatch(struct cfq_data *cfqd) { - int i, dispatched = 0; + int dispatched = 0; + struct rb_node *n; - for (i = 0; i < CFQ_PRIO_LISTS; i++) - dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]); + while ((n = rb_first(&cfqd->service_tree)) != NULL) { + struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node); + + dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr); @@ -1151,8 +1052,7 @@ cfq_forced_dispatch(struct cfq_data *cfqd) return dispatched; } -static int -cfq_dispatch_requests(request_queue_t *q, int force) +static int cfq_dispatch_requests(request_queue_t *q, int force) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; @@ -1222,7 +1122,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) /* * it's on the empty list and still hashed */ - list_del(&cfqq->cfq_list); hlist_del(&cfqq->cfq_hash); kmem_cache_free(cfq_pool, cfqq); } @@ -1391,8 +1290,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) */ cfqq->org_ioprio = cfqq->ioprio; cfqq->org_ioprio_class = cfqq->ioprio_class; - - cfq_resort_rr_list(cfqq, 0); cfq_clear_cfqq_prio_changed(cfqq); } @@ -1478,6 +1375,7 @@ cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); + RB_CLEAR_NODE(&cfqq->rb_node); INIT_LIST_HEAD(&cfqq->fifo); cfqq->key = key; @@ -1752,7 +1650,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) * so we know that 
it will be selected next. */ BUG_ON(!cfq_cfqq_on_rr(cfqq)); - list_move(&cfqq->cfq_list, &cfqd->cur_rr); + list_del_init(&cfqq->cfq_list); + list_add(&cfqq->cfq_list, &cfqd->cur_rr); cfqq->slice_end = 0; cfq_mark_cfqq_slice_new(cfqq); @@ -1828,13 +1727,10 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; - cfqq->service_last = now; if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; - cfq_resort_rr_list(cfqq, 0); - if (sync) RQ_CIC(rq)->last_end_request = now; @@ -1863,9 +1759,6 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) */ static void cfq_prio_boost(struct cfq_queue *cfqq) { - const int ioprio_class = cfqq->ioprio_class; - const int ioprio = cfqq->ioprio; - if (has_fs_excl()) { /* * boost idle prio on transactions that would lock out other @@ -1884,12 +1777,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) if (cfqq->ioprio != cfqq->org_ioprio) cfqq->ioprio = cfqq->org_ioprio; } - - /* - * refile between round-robin lists if we moved the priority class - */ - if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio)) - cfq_resort_rr_list(cfqq, 0); } static inline int __cfq_may_queue(struct cfq_queue *cfqq) @@ -2127,9 +2014,7 @@ static void *cfq_init_queue(request_queue_t *q) memset(cfqd, 0, sizeof(*cfqd)); - for (i = 0; i < CFQ_PRIO_LISTS; i++) - INIT_LIST_HEAD(&cfqd->rr_list[i]); - + cfqd->service_tree = RB_ROOT; INIT_LIST_HEAD(&cfqd->cur_rr); INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->cic_list); From cc09e2990fdd96d25fdbb9db6bc9b4c82d9e4a3c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Apr 2007 12:53:50 +0200 Subject: [PATCH 05/18] [PATCH] cfq-iosched: speed up rbtree handling For cases where the rbtree is mainly used for sorting and min retrieval, a nice speedup of the rbtree code is to maintain a cache of the leftmost node in the tree. Also spotted in the CFS CPU scheduler code. 
Improved by Alan D. Brunelle by updating the leftmost hint in cfq_rb_first() if it isn't set, instead of only updating it on insert. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 62 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4838c2b16f2c..55c476baa692 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -69,6 +69,18 @@ static struct completion *ioc_gone; #define sample_valid(samples) ((samples) > 80) +/* + * Most of our rbtree usage is for sorting with min extraction, so + * if we cache the leftmost node we don't have to walk down the tree + * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should + * move this into the elevator for the rq sorting as well. + */ +struct cfq_rb_root { + struct rb_root rb; + struct rb_node *left; +}; +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } + /* * Per block device queue structure */ @@ -78,7 +90,7 @@ struct cfq_data { /* * rr list of queues with requests and the count of them */ - struct rb_root service_tree; + struct cfq_rb_root service_tree; struct list_head cur_rr; struct list_head idle_rr; unsigned int busy_queues; @@ -378,6 +390,23 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) } } +static struct rb_node *cfq_rb_first(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + return root->left; +} + +static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + + rb_erase(n, &root->rb); + RB_CLEAR_NODE(n); +} + /* * would be nice to take fifo expire time into account as well */ @@ -417,10 +446,10 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct rb_node **p = &cfqd->service_tree.rb_node; + struct rb_node **p = &cfqd->service_tree.rb.rb_node; 
struct rb_node *parent = NULL; - struct cfq_queue *__cfqq; unsigned long rb_key; + int left = 1; rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; rb_key += cfqq->slice_resid; @@ -433,22 +462,29 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, if (rb_key == cfqq->rb_key) return; - rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); } while (*p) { + struct cfq_queue *__cfqq; + parent = *p; __cfqq = rb_entry(parent, struct cfq_queue, rb_node); if (rb_key < __cfqq->rb_key) p = &(*p)->rb_left; - else + else { p = &(*p)->rb_right; + left = 0; + } } + if (left) + cfqd->service_tree.left = &cfqq->rb_node; + cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree); + rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); } static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) @@ -509,10 +545,8 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_clear_cfqq_on_rr(cfqq); list_del_init(&cfqq->cfq_list); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - rb_erase(&cfqq->rb_node, &cfqd->service_tree); - RB_CLEAR_NODE(&cfqq->rb_node); - } + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -764,8 +798,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) * if current list is non-empty, grab first entry. 
*/ cfqq = list_entry_cfqq(cfqd->cur_rr.next); - } else if (!RB_EMPTY_ROOT(&cfqd->service_tree)) { - struct rb_node *n = rb_first(&cfqd->service_tree); + } else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) { + struct rb_node *n = cfq_rb_first(&cfqd->service_tree); cfqq = rb_entry(n, struct cfq_queue, rb_node); } else if (!list_empty(&cfqd->idle_rr)) { @@ -1036,7 +1070,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) int dispatched = 0; struct rb_node *n; - while ((n = rb_first(&cfqd->service_tree)) != NULL) { + while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) { struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node); dispatched += __cfq_forced_dispatch_cfqq(cfqq); @@ -2014,7 +2048,7 @@ static void *cfq_init_queue(request_queue_t *q) memset(cfqd, 0, sizeof(*cfqd)); - cfqd->service_tree = RB_ROOT; + cfqd->service_tree = CFQ_RB_ROOT; INIT_LIST_HEAD(&cfqd->cur_rr); INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->cic_list); From 0c534e0a463e2eeafc97ba25ab23c14f3cdf2bdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Apr 2007 20:01:57 +0200 Subject: [PATCH 06/18] cfq-iosched: sort RT queues into the rbtree Currently CFQ does a linked insert into the current list for RT queues. We can just factor the class into the rb insertion, and then we don't have to treat RT queues in a special way. It's faster, too. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 55c476baa692..81c057eadfcc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -471,7 +471,16 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, parent = *p; __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - if (rb_key < __cfqq->rb_key) + /* + * sort RT queues first, we always want to give + * preference to them. after that, sort on the next + * service time. 
+ */ + if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) + p = &(*p)->rb_left; + else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) + p = &(*p)->rb_right; + else if (rb_key < __cfqq->rb_key) p = &(*p)->rb_left; else { p = &(*p)->rb_right; @@ -490,7 +499,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { struct cfq_data *cfqd = cfqq->cfqd; - struct list_head *n; /* * Resorting requires the cfqq to be on the RR list already. @@ -500,25 +508,14 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) list_del_init(&cfqq->cfq_list); - if (cfq_class_rt(cfqq)) { - /* - * At to the front of the current list, but behind other - * RT queues. - */ - n = &cfqd->cur_rr; - while (n->next != &cfqd->cur_rr) - if (!cfq_class_rt(cfqq)) - break; - - list_add(&cfqq->cfq_list, n); - } else if (cfq_class_idle(cfqq)) { + if (cfq_class_idle(cfqq)) { /* * IDLE goes to the tail of the idle list */ list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr); } else { /* - * So we get here, ergo the queue is a regular best-effort queue + * RT and BE queues, sort into the rbtree */ cfq_service_tree_add(cfqd, cfqq); } From 67060e37994444ee9c0bd2413c8baa6cc58e7adb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Apr 2007 20:13:32 +0200 Subject: [PATCH 07/18] cfq-iosched: sort IDLE queues into the rbtree Same treatment as the RT conversion, just put the sorted idle branch at the end of the tree. 
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 67 +++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 81c057eadfcc..6a6a5f7930d8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -92,7 +92,6 @@ struct cfq_data { */ struct cfq_rb_root service_tree; struct list_head cur_rr; - struct list_head idle_rr; unsigned int busy_queues; /* @@ -467,25 +466,33 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, while (*p) { struct cfq_queue *__cfqq; + struct rb_node **n; parent = *p; __cfqq = rb_entry(parent, struct cfq_queue, rb_node); /* * sort RT queues first, we always want to give - * preference to them. after that, sort on the next - * service time. + * preference to them. IDLE queues goes to the back. + * after that, sort on the next service time. */ if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - p = &(*p)->rb_left; + n = &(*p)->rb_left; else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - p = &(*p)->rb_right; + n = &(*p)->rb_right; + else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) + n = &(*p)->rb_left; + else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) + n = &(*p)->rb_right; else if (rb_key < __cfqq->rb_key) - p = &(*p)->rb_left; - else { - p = &(*p)->rb_right; + n = &(*p)->rb_left; + else + n = &(*p)->rb_right; + + if (n == &(*p)->rb_right) left = 0; - } + + p = n; } if (left) @@ -506,19 +513,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) if (!cfq_cfqq_on_rr(cfqq)) return; - list_del_init(&cfqq->cfq_list); - - if (cfq_class_idle(cfqq)) { - /* - * IDLE goes to the tail of the idle list - */ - list_add_tail(&cfqq->cfq_list, &cfqd->idle_rr); - } else { - /* - * RT and BE queues, sort into the rbtree - */ - cfq_service_tree_add(cfqd, cfqq); - } + cfq_service_tree_add(cfqd, cfqq); } /* @@ -797,20 +792,22 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) cfqq = 
list_entry_cfqq(cfqd->cur_rr.next); } else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) { struct rb_node *n = cfq_rb_first(&cfqd->service_tree); + unsigned long end; cfqq = rb_entry(n, struct cfq_queue, rb_node); - } else if (!list_empty(&cfqd->idle_rr)) { - /* - * if we have idle queues and no rt or be queues had pending - * requests, either allow immediate service if the grace period - * has passed or arm the idle grace timer - */ - unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; - - if (time_after_eq(jiffies, end)) - cfqq = list_entry_cfqq(cfqd->idle_rr.next); - else - mod_timer(&cfqd->idle_class_timer, end); + if (cfq_class_idle(cfqq)) { + /* + * if we have idle queues and no rt or be queues had + * pending requests, either allow immediate service if + * the grace period has passed or arm the idle grace + * timer + */ + end = cfqd->last_end_request + CFQ_IDLE_GRACE; + if (time_before(jiffies, end)) { + mod_timer(&cfqd->idle_class_timer, end); + cfqq = NULL; + } + } } return cfqq; @@ -1074,7 +1071,6 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) } dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); - dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr); cfq_slice_expired(cfqd, 0, 0); @@ -2047,7 +2043,6 @@ static void *cfq_init_queue(request_queue_t *q) cfqd->service_tree = CFQ_RB_ROOT; INIT_LIST_HEAD(&cfqd->cur_rr); - INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->cic_list); cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node); From 498d3aa2b4f791059acd8c942ee8fa15c2ce36c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Apr 2007 12:54:48 +0200 Subject: [PATCH 08/18] [PATCH] cfq-iosched: style cleanups and comments Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 66 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6a6a5f7930d8..29284fa06e6b 100644 --- 
a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -222,7 +222,7 @@ CFQ_CFQQ_FNS(slice_new); static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_insert(request_queue_t *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); +static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t); /* * scheduler run of queue, if there are requests pending and no one in the @@ -389,6 +389,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) } } +/* + * The below is leftmost cache rbtree addon + */ static struct rb_node *cfq_rb_first(struct cfq_rb_root *root) { if (!root->left) @@ -442,13 +445,18 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0)); } +/* + * The cfqd->service_tree holds all pending cfq_queue's that have + * requests waiting to be processed. It is sorted in the order that + * we will service the queues. + */ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) { struct rb_node **p = &cfqd->service_tree.rb.rb_node; struct rb_node *parent = NULL; unsigned long rb_key; - int left = 1; + int left; rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; rb_key += cfqq->slice_resid; @@ -464,6 +472,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); } + left = 1; while (*p) { struct cfq_queue *__cfqq; struct rb_node **n; @@ -503,17 +512,16 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); } +/* + * Update cfqq's position in the service tree. + */ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { - struct cfq_data *cfqd = cfqq->cfqd; - /* * Resorting requires the cfqq to be on the RR list already. 
*/ - if (!cfq_cfqq_on_rr(cfqq)) - return; - - cfq_service_tree_add(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq)) + cfq_service_tree_add(cfqq->cfqd, cfqq); } /* @@ -530,6 +538,10 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_resort_rr_list(cfqq, 0); } +/* + * Called when the cfqq no longer has requests pending, remove it from + * the service tree. + */ static inline void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { @@ -654,8 +666,7 @@ static void cfq_remove_request(struct request *rq) } } -static int -cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) +static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; struct request *__rq; @@ -781,6 +792,10 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted, __cfq_slice_expired(cfqd, cfqq, preempted, timed_out); } +/* + * Get next queue for service. Unless we have a queue preemption, + * we'll simply select the first cfqq in the service tree. + */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq = NULL; @@ -792,10 +807,11 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) cfqq = list_entry_cfqq(cfqd->cur_rr.next); } else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) { struct rb_node *n = cfq_rb_first(&cfqd->service_tree); - unsigned long end; cfqq = rb_entry(n, struct cfq_queue, rb_node); if (cfq_class_idle(cfqq)) { + unsigned long end; + /* * if we have idle queues and no rt or be queues had * pending requests, either allow immediate service if @@ -813,6 +829,9 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) return cfqq; } +/* + * Get and set a new active queue for service. 
+ */ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) { struct cfq_queue *cfqq; @@ -898,6 +917,9 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) mod_timer(&cfqd->idle_slice_timer, jiffies + sl); } +/* + * Move request from internal lists to the request queue dispatch list. + */ static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); @@ -944,7 +966,8 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) } /* - * get next queue for service + * Select a queue for service. If we have a current active queue, + * check whether to continue servicing it, or retrieve and set a new one. */ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) { @@ -985,6 +1008,10 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) return cfqq; } +/* + * Dispatch some requests from cfqq, moving them to the request queue + * dispatch list. + */ static int __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, int max_dispatch) @@ -1059,6 +1086,10 @@ static int cfq_forced_dispatch_cfqqs(struct list_head *list) return dispatched; } +/* + * Drain our current requests. Used for barriers and when switching + * io schedulers on-the-fly. + */ static int cfq_forced_dispatch(struct cfq_data *cfqd) { int dispatched = 0; @@ -1224,10 +1255,6 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, } } - -/* - * Called with interrupts disabled - */ static void cfq_exit_single_io_context(struct cfq_io_context *cic) { struct cfq_data *cfqd = cic->key; @@ -1241,6 +1268,10 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic) } } +/* + * The process that ioc belongs to has exited, we need to clean up + * and put the internal structures we have that belongs to that process. 
+ */ static void cfq_exit_io_context(struct io_context *ioc) { struct cfq_io_context *__cic; @@ -1427,6 +1458,9 @@ cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, return cfqq; } +/* + * We drop cfq io contexts lazily, so we may find a dead one. + */ static void cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) { From 67e6b49e39e9b9bf5ce1351ef21dad391856183f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Apr 2007 14:18:00 +0200 Subject: [PATCH 09/18] cfq-iosched: slice offset should take ioprio into account Use the max_slice-cur_slice as the multiplier for the insertion offset. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 29284fa06e6b..4a0397022f5b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -442,7 +442,8 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, /* * just an approximation, should be ok. */ - return ((cfqd->busy_queues - 1) * cfq_prio_slice(cfqd, 1, 0)); + return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } /* From edd75ffd92a5b7f6244431e8ff6c32b846f9ba86 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Apr 2007 12:03:34 +0200 Subject: [PATCH 10/18] cfq-iosched: get rid of ->cur_rr and ->cfq_list It's only used for preemption now that the IDLE and RT queues also use the rbtree. If we pass an 'add_front' variable to cfq_service_tree_add(), we can set ->rb_key to 0 to force insertion at the front of the tree.
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 87 +++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 55 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4a0397022f5b..a8437042e28a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -45,9 +45,6 @@ static int cfq_slice_idle = HZ / 125; */ #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) - -#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) #define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) #define RQ_CFQQ(rq) ((rq)->elevator_private2) @@ -91,7 +88,6 @@ struct cfq_data { * rr list of queues with requests and the count of them */ struct cfq_rb_root service_tree; - struct list_head cur_rr; unsigned int busy_queues; /* @@ -146,8 +142,6 @@ struct cfq_queue { struct hlist_node cfq_hash; /* hash key */ unsigned int key; - /* member of the rr/busy/cur/idle cfqd list */ - struct list_head cfq_list; /* service_tree member */ struct rb_node rb_node; /* service_tree key */ @@ -452,16 +446,19 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, * we will service the queues. */ static void cfq_service_tree_add(struct cfq_data *cfqd, - struct cfq_queue *cfqq) + struct cfq_queue *cfqq, int add_front) { struct rb_node **p = &cfqd->service_tree.rb.rb_node; struct rb_node *parent = NULL; unsigned long rb_key; int left; - rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; - rb_key += cfqq->slice_resid; - cfqq->slice_resid = 0; + if (!add_front) { + rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; + rb_key += cfqq->slice_resid; + cfqq->slice_resid = 0; + } else + rb_key = 0; if (!RB_EMPTY_NODE(&cfqq->rb_node)) { /* @@ -516,13 +513,13 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, /* * Update cfqq's position in the service tree. 
*/ -static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) +static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) { /* * Resorting requires the cfqq to be on the RR list already. */ if (cfq_cfqq_on_rr(cfqq)) - cfq_service_tree_add(cfqq->cfqd, cfqq); + cfq_service_tree_add(cfqd, cfqq, 0); } /* @@ -536,7 +533,7 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - cfq_resort_rr_list(cfqq, 0); + cfq_resort_rr_list(cfqd, cfqq); } /* @@ -548,7 +545,6 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - list_del_init(&cfqq->cfq_list); if (!RB_EMPTY_NODE(&cfqq->rb_node)) cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); @@ -771,7 +767,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (timed_out && !cfq_cfqq_slice_new(cfqq)) cfqq->slice_resid = cfqq->slice_end - jiffies; - cfq_resort_rr_list(cfqq, preempted); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; @@ -799,31 +795,28 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted, */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct rb_node *n; + + if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + return NULL; + + n = cfq_rb_first(&cfqd->service_tree); + cfqq = rb_entry(n, struct cfq_queue, rb_node); + + if (cfq_class_idle(cfqq)) { + unsigned long end; - if (!list_empty(&cfqd->cur_rr)) { /* - * if current list is non-empty, grab first entry. 
+ * if we have idle queues and no rt or be queues had + * pending requests, either allow immediate service if + * the grace period has passed or arm the idle grace + * timer */ - cfqq = list_entry_cfqq(cfqd->cur_rr.next); - } else if (!RB_EMPTY_ROOT(&cfqd->service_tree.rb)) { - struct rb_node *n = cfq_rb_first(&cfqd->service_tree); - - cfqq = rb_entry(n, struct cfq_queue, rb_node); - if (cfq_class_idle(cfqq)) { - unsigned long end; - - /* - * if we have idle queues and no rt or be queues had - * pending requests, either allow immediate service if - * the grace period has passed or arm the idle grace - * timer - */ - end = cfqd->last_end_request + CFQ_IDLE_GRACE; - if (time_before(jiffies, end)) { - mod_timer(&cfqd->idle_class_timer, end); - cfqq = NULL; - } + end = cfqd->last_end_request + CFQ_IDLE_GRACE; + if (time_before(jiffies, end)) { + mod_timer(&cfqd->idle_class_timer, end); + cfqq = NULL; } } @@ -1075,18 +1068,6 @@ static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) return dispatched; } -static int cfq_forced_dispatch_cfqqs(struct list_head *list) -{ - struct cfq_queue *cfqq, *next; - int dispatched; - - dispatched = 0; - list_for_each_entry_safe(cfqq, next, list, cfq_list) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - - return dispatched; -} - /* * Drain our current requests. Used for barriers and when switching * io schedulers on-the-fly. 
@@ -1102,8 +1083,6 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) dispatched += __cfq_forced_dispatch_cfqq(cfqq); } - dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); - cfq_slice_expired(cfqd, 0, 0); BUG_ON(cfqd->busy_queues); @@ -1433,7 +1412,6 @@ cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, memset(cfqq, 0, sizeof(*cfqq)); INIT_HLIST_NODE(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); RB_CLEAR_NODE(&cfqq->rb_node); INIT_LIST_HEAD(&cfqq->fifo); @@ -1712,8 +1690,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) * so we know that it will be selected next. */ BUG_ON(!cfq_cfqq_on_rr(cfqq)); - list_del_init(&cfqq->cfq_list); - list_add(&cfqq->cfq_list, &cfqd->cur_rr); + + cfq_service_tree_add(cfqd, cfqq, 1); cfqq->slice_end = 0; cfq_mark_cfqq_slice_new(cfqq); @@ -2077,7 +2055,6 @@ static void *cfq_init_queue(request_queue_t *q) memset(cfqd, 0, sizeof(*cfqd)); cfqd->service_tree = CFQ_RB_ROOT; - INIT_LIST_HEAD(&cfqd->cur_rr); INIT_LIST_HEAD(&cfqd->cic_list); cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node); From 6084cdda0ea4561feb68e00a8c50068bba98006d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Apr 2007 08:25:00 +0200 Subject: [PATCH 11/18] cfq-iosched: don't pass unused preemption variable around We don't use it anymore in the slice expiry handling. 
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a8437042e28a..f089eeecdf32 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -752,7 +752,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ static void __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - int preempted, int timed_out) + int timed_out) { if (cfq_cfqq_wait_request(cfqq)) del_timer(&cfqd->idle_slice_timer); @@ -761,8 +761,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); /* - * store what was left of this slice, if the queue idled out - * or was preempted + * store what was left of this slice, if the queue idled/timed out */ if (timed_out && !cfq_cfqq_slice_new(cfqq)) cfqq->slice_resid = cfqq->slice_end - jiffies; @@ -780,13 +779,12 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->dispatch_slice = 0; } -static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted, - int timed_out) +static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) { struct cfq_queue *cfqq = cfqd->active_queue; if (cfqq) - __cfq_slice_expired(cfqd, cfqq, preempted, timed_out); + __cfq_slice_expired(cfqd, cfqq, timed_out); } /* @@ -995,7 +993,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) } expire: - cfq_slice_expired(cfqd, 0, 0); + cfq_slice_expired(cfqd, 0); new_queue: cfqq = cfq_set_active_queue(cfqd); keep_queue: @@ -1049,7 +1047,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || cfq_class_idle(cfqq))) { cfqq->slice_end = jiffies + 1; - cfq_slice_expired(cfqd, 0, 0); + cfq_slice_expired(cfqd, 0); } return dispatched; @@ -1083,7 +1081,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) dispatched += 
__cfq_forced_dispatch_cfqq(cfqq); } - cfq_slice_expired(cfqd, 0, 0); + cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); @@ -1153,7 +1151,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0, 0); + __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } @@ -1210,7 +1208,7 @@ static void cfq_free_io_context(struct io_context *ioc) static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0, 0); + __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } @@ -1683,7 +1681,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfq_slice_expired(cfqd, 1, 1); + cfq_slice_expired(cfqd, 1); /* * Put the new queue at the front of the of the current list, @@ -1784,7 +1782,7 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) cfq_clear_cfqq_slice_new(cfqq); } if (cfq_slice_used(cfqq)) - cfq_slice_expired(cfqd, 0, 1); + cfq_slice_expired(cfqd, 1); else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_arm_slice_timer(cfqd); } @@ -1979,7 +1977,7 @@ static void cfq_idle_slice_timer(unsigned long data) } } expire: - cfq_slice_expired(cfqd, 0, timed_out); + cfq_slice_expired(cfqd, timed_out); out_kick: cfq_schedule_dispatch(cfqd); out_cont: @@ -2025,7 +2023,7 @@ static void cfq_exit_queue(elevator_t *e) spin_lock_irq(q->queue_lock); if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0); + __cfq_slice_expired(cfqd, cfqd->active_queue, 0); while (!list_empty(&cfqd->cic_list)) { struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, From 20e493a8d03b3b2f51b619a453f7bbbebedd6bda Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Apr 2007 08:26:36 +0200 Subject: [PATCH 12/18] cfq-iosched: get rid 
of ->dispatch_slice We can track it fairly accurately locally, let the slice handling take care of the rest. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f089eeecdf32..839086dcb0af 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -106,7 +106,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_context *active_cic; - unsigned int dispatch_slice; struct timer_list idle_class_timer; @@ -775,8 +774,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; } - - cfqd->dispatch_slice = 0; } static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) @@ -1026,7 +1023,6 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, */ cfq_dispatch_insert(cfqd->queue, rq); - cfqd->dispatch_slice++; dispatched++; if (!cfqd->active_cic) { @@ -1044,7 +1040,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, * queue always expire after 1 dispatch round. */ if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && - cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || + dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) || cfq_class_idle(cfqq))) { cfqq->slice_end = jiffies + 1; cfq_slice_expired(cfqd, 0); From 1be92f2fc7b563db3a8909d2d1c6a6520aeca323 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Apr 2007 14:32:26 +0200 Subject: [PATCH 13/18] cfq-iosched: never allow an async queue idling We don't enable it by default, don't let it get enabled during runtime. 
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 839086dcb0af..df82755ac40b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1603,7 +1603,12 @@ static void cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic) { - int enable_idle = cfq_cfqq_idle_window(cfqq); + int enable_idle; + + if (!cfq_cfqq_sync(cfqq)) + return; + + enable_idle = cfq_cfqq_idle_window(cfqq); if (!cic->ioc->task || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) From 3ed9a2965c47636bc0ebafab31a39f1c105492ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Apr 2007 08:33:33 +0200 Subject: [PATCH 14/18] cfq-iosched: improve sync vs async workloads Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index df82755ac40b..a8237be97a28 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -96,6 +96,7 @@ struct cfq_data { struct hlist_head *cfq_hash; int rq_in_driver; + int sync_flight; int hw_tag; /* @@ -911,11 +912,15 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_remove_request(rq); cfqq->dispatched++; elv_dispatch_sort(q, rq); + + if (cfq_cfqq_sync(cfqq)) + cfqd->sync_flight++; } /* @@ -1100,27 +1105,24 @@ static int cfq_dispatch_requests(request_queue_t *q, int force) while ((cfqq = cfq_select_queue(cfqd)) != NULL) { int max_dispatch; - if (cfqd->busy_queues > 1) { - /* - * So we have dispatched before in this round, if the - * next queue has idling enabled (must be sync), don't - * allow it service until the previous have completed. 
- */ - if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq) && - dispatched) + max_dispatch = cfqd->cfq_quantum; + if (cfq_class_idle(cfqq)) + max_dispatch = 1; + + if (cfqq->dispatched >= max_dispatch) { + if (cfqd->busy_queues > 1) break; - if (cfqq->dispatched >= cfqd->cfq_quantum) + if (cfqq->dispatched >= 4 * max_dispatch) break; } + if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) + break; + cfq_clear_cfqq_must_dispatch(cfqq); cfq_clear_cfqq_wait_request(cfqq); del_timer(&cfqd->idle_slice_timer); - max_dispatch = cfqd->cfq_quantum; - if (cfq_class_idle(cfqq)) - max_dispatch = 1; - dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); } @@ -1767,6 +1769,9 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) cfqd->rq_in_driver--; cfqq->dispatched--; + if (cfq_cfqq_sync(cfqq)) + cfqd->sync_flight--; + if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; From cc19747977824ece6aa1c56a29e974fef5ec2b32 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Apr 2007 20:45:39 +0200 Subject: [PATCH 15/18] cfq-iosched: tighten queue request overlap condition For tagged devices, allow overlap of requests if the idle window isn't enabled on the current active queue. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a8237be97a28..e859b4966e4c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -989,7 +989,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * flight or is idling for a new request, allow either of these * conditions to happen (or time out) before selecting a new queue. 
*/ - if (cfqq->dispatched || timer_pending(&cfqd->idle_slice_timer)) { + if (timer_pending(&cfqd->idle_slice_timer) || + (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { cfqq = NULL; goto keep_queue; } From 91fac317a34859986a2359a5a5c0e37dc17a9c3d Mon Sep 17 00:00:00 2001 From: Vasily Tarasov Date: Wed, 25 Apr 2007 12:29:51 +0200 Subject: [PATCH 16/18] cfq-iosched: get rid of cfqq hash The cfq hash is no longer necessary. We can always get the cfqq from the io context. The cfq_get_io_context_noalloc() function is introduced, because we don't want to allocate a cic on merging and when checking may_queue. Previously a sync queue was identified by its hash key being different from CFQ_KEY_ASYNC. Since the hash is eliminated we need another criterion: a sync flag is added to the queue. In all places where we dig in the rb_tree we're in the current context, so no additional locking is required. Advantages of this patch: no additional memory for the hash, no seeking in the hash, and the code is cleaner. It is now necessary to look up the cic in the per-ioc rbtree, but that is faster: - most processes work with only a few devices - most systems have only a few block devices - it is an rb-tree Signed-off-by: Vasily Tarasov Changes by me: - Merge into CFQ devel branch - Get rid of cfq_get_io_context_noalloc() - Fix various bugs with dereferencing cic->cfqq[] with offset other than 0 or 1. - Fix bug in cfqq setup, is_sync condition was reversed.
- Fix bug where only bio_sync() is used, we need to check for a READ too Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 167 ++++++++++++++++++-------------------------- 1 file changed, 67 insertions(+), 100 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e859b4966e4c..94f53a1f4677 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -38,14 +37,6 @@ static int cfq_slice_idle = HZ / 125; #define CFQ_SLICE_SCALE (5) -#define CFQ_KEY_ASYNC (0) - -/* - * for the hash of cfqq inside the cfqd - */ -#define CFQ_QHASH_SHIFT 6 -#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) - #define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) #define RQ_CFQQ(rq) ((rq)->elevator_private2) @@ -62,8 +53,6 @@ static struct completion *ioc_gone; #define ASYNC (0) #define SYNC (1) -#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) - #define sample_valid(samples) ((samples) > 80) /* @@ -90,11 +79,6 @@ struct cfq_data { struct cfq_rb_root service_tree; unsigned int busy_queues; - /* - * cfqq lookup hash - */ - struct hlist_head *cfq_hash; - int rq_in_driver; int sync_flight; int hw_tag; @@ -138,10 +122,6 @@ struct cfq_queue { atomic_t ref; /* parent cfq_data */ struct cfq_data *cfqd; - /* cfqq lookup hash */ - struct hlist_node cfq_hash; - /* hash key */ - unsigned int key; /* service_tree member */ struct rb_node rb_node; /* service_tree key */ @@ -186,6 +166,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ CFQ_CFQQ_FLAG_queue_new, /* queue never been serviced */ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ + CFQ_CFQQ_FLAG_sync, /* synchronous queue */ }; #define CFQ_CFQQ_FNS(name) \ @@ -212,11 +193,38 @@ CFQ_CFQQ_FNS(idle_window); CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(queue_new); CFQ_CFQQ_FNS(slice_new); +CFQ_CFQQ_FNS(sync); #undef CFQ_CFQQ_FNS -static struct cfq_queue *cfq_find_cfq_hash(struct 
cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_insert(request_queue_t *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *, unsigned int, struct task_struct *, gfp_t); +static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, + struct task_struct *, gfp_t); +static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *, + struct io_context *); + +static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, + int is_sync) +{ + return cic->cfqq[!!is_sync]; +} + +static inline void cic_set_cfqq(struct cfq_io_context *cic, + struct cfq_queue *cfqq, int is_sync) +{ + cic->cfqq[!!is_sync] = cfqq; +} + +/* + * We regard a request as SYNC, if it's either a read or has the SYNC bit + * set (in which case it could also be direct WRITE). + */ +static inline int cfq_bio_sync(struct bio *bio) +{ + if (bio_data_dir(bio) == READ || bio_sync(bio)) + return 1; + + return 0; +} /* * scheduler run of queue, if there are requests pending and no one in the @@ -235,17 +243,6 @@ static int cfq_queue_empty(request_queue_t *q) return !cfqd->busy_queues; } -static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync) -{ - /* - * Use the per-process queue, for read requests and syncronous writes - */ - if (!(rw & REQ_RW) || is_sync) - return task->pid; - - return CFQ_KEY_ASYNC; -} - /* * Scale schedule slice based on io priority. Use the sync time slice only * if a queue is marked sync and has sync io queued. 
A sync queue with async @@ -608,10 +605,14 @@ static struct request * cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) { struct task_struct *tsk = current; - pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio)); + struct cfq_io_context *cic; struct cfq_queue *cfqq; - cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); + cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + if (!cic) + return NULL; + + cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); if (cfqq) { sector_t sector = bio->bi_sector + bio_sectors(bio); @@ -705,23 +706,24 @@ static int cfq_allow_merge(request_queue_t *q, struct request *rq, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; - const int rw = bio_data_dir(bio); + struct cfq_io_context *cic; struct cfq_queue *cfqq; - pid_t key; /* * Disallow merge of a sync bio into an async request. */ - if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq)) + if (cfq_bio_sync(bio) && !rq_is_sync(rq)) return 0; /* * Lookup the cfqq that this bio will be queued with. Allow * merge only if rq is queued there. 
*/ - key = cfq_queue_pid(current, rw, bio_sync(bio)); - cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio); + cic = cfq_cic_rb_lookup(cfqd, current->io_context); + if (!cic) + return 0; + cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); if (cfqq == RQ_CFQQ(rq)) return 1; @@ -1154,37 +1156,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_schedule_dispatch(cfqd); } - /* - * it's on the empty list and still hashed - */ - hlist_del(&cfqq->cfq_hash); kmem_cache_free(cfq_pool, cfqq); } -static struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, - const int hashval) -{ - struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; - struct hlist_node *entry; - struct cfq_queue *__cfqq; - - hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) { - const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio); - - if (__cfqq->key == key && (__p == prio || !prio)) - return __cfqq; - } - - return NULL; -} - -static struct cfq_queue * -cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) -{ - return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); -} - static void cfq_free_io_context(struct io_context *ioc) { struct cfq_io_context *__cic; @@ -1342,7 +1316,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic) cfqq = cic->cfqq[ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task, + new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc->task, GFP_ATOMIC); if (new_cfqq) { cic->cfqq[ASYNC] = new_cfqq; @@ -1374,16 +1348,16 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, +cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk, gfp_t gfp_mask) { - const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; - unsigned short 
ioprio; + struct cfq_io_context *cic; retry: - ioprio = tsk->ioprio; - cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); + cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + /* cic always exists here */ + cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq) { if (new_cfqq) { @@ -1408,20 +1382,20 @@ cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, memset(cfqq, 0, sizeof(*cfqq)); - INIT_HLIST_NODE(&cfqq->cfq_hash); RB_CLEAR_NODE(&cfqq->rb_node); INIT_LIST_HEAD(&cfqq->fifo); - cfqq->key = key; - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); atomic_set(&cfqq->ref, 0); cfqq->cfqd = cfqd; - if (key != CFQ_KEY_ASYNC) + if (is_sync) { cfq_mark_cfqq_idle_window(cfqq); + cfq_mark_cfqq_sync(cfqq); + } cfq_mark_cfqq_prio_changed(cfqq); cfq_mark_cfqq_queue_new(cfqq); + cfq_init_prio_data(cfqq); } @@ -1453,6 +1427,9 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) struct cfq_io_context *cic; void *k, *key = cfqd; + if (unlikely(!ioc)) + return NULL; + restart: n = ioc->cic_root.rb_node; while (n) { @@ -1839,10 +1816,8 @@ static int cfq_may_queue(request_queue_t *q, int rw) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; + struct cfq_io_context *cic; struct cfq_queue *cfqq; - unsigned int key; - - key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC); /* * don't force setup of a queue from here, as a call to may_queue @@ -1850,7 +1825,11 @@ static int cfq_may_queue(request_queue_t *q, int rw) * so just lookup a possibly existing queue, or return 'may queue' * if that fails */ - cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); + cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + if (!cic) + return ELV_MQUEUE_MAY; + + cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); if (cfqq) { cfq_init_prio_data(cfqq); cfq_prio_boost(cfqq); @@ -1894,7 +1873,6 @@ cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) struct cfq_io_context *cic; const int rw = rq_data_dir(rq); const int is_sync = 
rq_is_sync(rq); - pid_t key = cfq_queue_pid(tsk, rw, is_sync); struct cfq_queue *cfqq; unsigned long flags; @@ -1907,14 +1885,15 @@ cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) if (!cic) goto queue_fail; - if (!cic->cfqq[is_sync]) { - cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask); + cfqq = cic_to_cfqq(cic, is_sync); + if (!cfqq) { + cfqq = cfq_get_queue(cfqd, is_sync, tsk, gfp_mask); + if (!cfqq) goto queue_fail; - cic->cfqq[is_sync] = cfqq; - } else - cfqq = cic->cfqq[is_sync]; + cic_set_cfqq(cic, cfqq, is_sync); + } cfqq->allocated[rw]++; cfq_clear_cfqq_must_alloc(cfqq); @@ -2044,14 +2023,12 @@ static void cfq_exit_queue(elevator_t *e) cfq_shutdown_timer_wq(cfqd); - kfree(cfqd->cfq_hash); kfree(cfqd); } static void *cfq_init_queue(request_queue_t *q) { struct cfq_data *cfqd; - int i; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); if (!cfqd) @@ -2062,13 +2039,6 @@ static void *cfq_init_queue(request_queue_t *q) cfqd->service_tree = CFQ_RB_ROOT; INIT_LIST_HEAD(&cfqd->cic_list); - cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node); - if (!cfqd->cfq_hash) - goto out_free; - - for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - cfqd->queue = q; init_timer(&cfqd->idle_slice_timer); @@ -2092,9 +2062,6 @@ static void *cfq_init_queue(request_queue_t *q) cfqd->cfq_slice_idle = cfq_slice_idle; return cfqd; -out_free: - kfree(cfqd); - return NULL; } static void cfq_slab_kill(void) From 4e521c27eee33cebd618c26649e2c93803004647 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Apr 2007 21:17:33 +0200 Subject: [PATCH 17/18] ll_rw_blk: add io_context private pointer To be used by as/cfq as they see fit. 
Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 1 + include/linux/blkdev.h | 1 + 2 files changed, 2 insertions(+) diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 3de06953ac33..123003a90477 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3741,6 +3741,7 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node) ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic_root.rb_node = NULL; + ret->ioc_data = NULL; /* make sure set_task_ioprio() sees the settings above */ smp_wmb(); tsk->io_context = ret; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83dcd8c0e974..a686eabe22d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -116,6 +116,7 @@ struct io_context { struct as_io_context *aic; struct rb_root cic_root; + void *ioc_data; }; void put_io_context(struct io_context *ioc); From 597bc485d6906359ad667fc8ead5e5f0ede03a0a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Apr 2007 21:23:53 +0200 Subject: [PATCH 18/18] cfq-iosched: speedup cic rb lookup We often look up the same queue many times in succession, so cache the last looked-up queue to avoid browsing the rbtree.
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 94f53a1f4677..64df3fa303b0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1165,6 +1165,8 @@ static void cfq_free_io_context(struct io_context *ioc) struct rb_node *n; int freed = 0; + ioc->ioc_data = NULL; + while ((n = rb_first(&ioc->cic_root)) != NULL) { __cic = rb_entry(n, struct cfq_io_context, rb_node); rb_erase(&__cic->rb_node, &ioc->cic_root); @@ -1228,10 +1230,11 @@ static void cfq_exit_io_context(struct io_context *ioc) struct cfq_io_context *__cic; struct rb_node *n; + ioc->ioc_data = NULL; + /* * put the reference this task is holding to the various queues */ - n = rb_first(&ioc->cic_root); while (n != NULL) { __cic = rb_entry(n, struct cfq_io_context, rb_node); @@ -1415,6 +1418,10 @@ static void cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) { WARN_ON(!list_empty(&cic->queue_list)); + + if (ioc->ioc_data == cic) + ioc->ioc_data = NULL; + rb_erase(&cic->rb_node, &ioc->cic_root); kmem_cache_free(cfq_ioc_pool, cic); elv_ioc_count_dec(ioc_count); @@ -1430,6 +1437,13 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) if (unlikely(!ioc)) return NULL; + /* + * we maintain a last-hit cache, to avoid browsing over the tree + */ + cic = ioc->ioc_data; + if (cic && cic->key == cfqd) + return cic; + restart: n = ioc->cic_root.rb_node; while (n) { @@ -1445,8 +1459,10 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc) n = n->rb_left; else if (key > k) n = n->rb_right; - else + else { + ioc->ioc_data = cic; return cic; + } } return NULL;