From b1bf42105aad7c976907665923bc53ce2244e494 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 17:49:29 -0700 Subject: [PATCH 01/13] block/floppy: Convert callback to pass timer_list In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to passing in the timer pointer explicitly. Calculate the drive from the offset of the timer in the timer list. Cc: Jiri Kosina Cc: Jens Axboe Cc: Ming Lei Cc: Al Viro Cc: Geliang Tang Cc: Thomas Gleixner Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index a54183935aa1..eae484acfbbc 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -903,10 +903,14 @@ static void unlock_fdc(void) } /* switches the motor off after a given timeout */ -static void motor_off_callback(unsigned long nr) +static void motor_off_callback(struct timer_list *t) { + unsigned long nr = t - motor_off_timer; unsigned char mask = ~(0x10 << UNIT(nr)); + if (WARN_ON_ONCE(nr >= N_DRIVE)) + return; + set_dor(FDC(nr), mask, 0); } @@ -3047,7 +3051,7 @@ static void raw_cmd_done(int flag) else raw_cmd->flags &= ~FD_RAW_DISK_CHANGE; if (raw_cmd->flags & FD_RAW_NO_MOTOR_AFTER) - motor_off_callback(current_drive); + motor_off_callback(&motor_off_timer[current_drive]); if (raw_cmd->next && (!(raw_cmd->flags & FD_RAW_FAILURE) || @@ -4542,7 +4546,7 @@ static int __init do_floppy_init(void) disks[drive]->fops = &floppy_fops; sprintf(disks[drive]->disk_name, "fd%d", drive); - setup_timer(&motor_off_timer[drive], motor_off_callback, drive); + timer_setup(&motor_off_timer[drive], motor_off_callback, 0); } err = register_blkdev(FLOPPY_MAJOR, "fd"); From cbb9d17875d059aa5665a854fafeff922e7a7938 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 17:48:38 -0700 Subject: [PATCH 02/13] amifloppy: Convert timers to use timer_setup() This converts the amifloppy driver to pass the timer pointer to the callback instead of the drive number (and flags). It eliminates the decusagecounter flag, as it was unused, and drops the ininterrupt flag which appeared to be a needless optimization. The drive can then be calculated from the offset of the timer in the drive timer array. Additionally moves to a static data variable instead of the soon-to-be-gone timer->data field. Cc: Jens Axboe Cc: Krzysztof Halasa Cc: Greg Kroah-Hartman Cc: Thomas Gleixner Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/block/amiflop.c | 57 +++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 4e3fb9f104af..e5aa62fcf5a8 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -146,6 +146,7 @@ static struct amiga_floppy_struct unit[FD_MAX_UNITS]; static struct timer_list flush_track_timer[FD_MAX_UNITS]; static struct timer_list post_write_timer; +static unsigned long post_write_timer_drive; static struct timer_list motor_on_timer; static struct timer_list motor_off_timer[FD_MAX_UNITS]; static int on_attempts; @@ -323,7 +324,7 @@ static void fd_deselect (int drive) } -static void motor_on_callback(unsigned long ignored) +static void motor_on_callback(struct timer_list *unused) { if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) { complete_all(&motor_on_completion); @@ -355,7 +356,7 @@ static int fd_motor_on(int nr) on_attempts = -1; #if 0 printk (KERN_ERR "motor_on failed, turning motor off\n"); - fd_motor_off (nr); + fd_motor_off (motor_off_timer + nr); return 0; #else printk (KERN_WARNING "DSKRDY not set after 1.5 seconds - assuming drive is spinning notwithstanding\n"); @@ -365,20 +366,17 @@ static int fd_motor_on(int nr) return 1; } -static void fd_motor_off(unsigned long drive) +static void fd_motor_off(struct timer_list *timer) { - long calledfromint; -#ifdef MODULE - long decusecount; + unsigned long drive = ((unsigned long)timer - + (unsigned long)&motor_off_timer[0]) / + sizeof(motor_off_timer[0]); - decusecount = drive & 0x40000000; -#endif - calledfromint = drive & 0x80000000; drive&=3; - if (calledfromint && !try_fdc(drive)) { + if (!try_fdc(drive)) { /* We would be blocked in an interrupt, so try again later */ - motor_off_timer[drive].expires = jiffies + 1; - add_timer(motor_off_timer + drive); + timer->expires = jiffies + 1; + add_timer(timer); return; } unit[drive].motor = 0; @@ -392,8 +390,6 @@ static void floppy_off (unsigned int nr) int drive; drive = nr & 3; - /* called this way it is always from interrupt */ - motor_off_timer[drive].data = nr | 0x80000000; mod_timer(motor_off_timer + drive, jiffies + 3*HZ); } @@ -435,7 +431,7 @@ static int fd_calibrate(int drive) break; if (--n == 0) { printk (KERN_ERR "fd%d: calibrate failed, turning motor off\n", drive); - fd_motor_off (drive); + fd_motor_off (motor_off_timer + drive); unit[drive].track = -1; rel_fdc(); return 0; @@ -564,7 +560,7 @@ static irqreturn_t fd_block_done(int irq, void *dummy) if (block_flag == 2) { /* writing */ writepending = 2; post_write_timer.expires = jiffies + 1; /* at least 2 ms */ - post_write_timer.data = selected; + post_write_timer_drive = selected; add_timer(&post_write_timer); } else { /* reading */ @@ -651,6 +647,10 @@ static void post_write (unsigned long drive) rel_fdc(); /* corresponds to get_fdc() in raw_write */ } +static void post_write_callback(struct timer_list *timer) +{ + post_write(post_write_timer_drive); +} /* * The following functions are to convert the block contents into raw data @@ -1244,8 +1244,12 @@ static void dos_write(int disk) /* FIXME: this assumes the drive is still spinning - * which is only true if we complete writing a track within three seconds */ -static void flush_track_callback(unsigned long nr) +static void flush_track_callback(struct timer_list *timer) { + unsigned long nr = ((unsigned long)timer - + (unsigned long)&flush_track_timer[0]) / + sizeof(flush_track_timer[0]); + nr&=3; writefromint = 1; if (!try_fdc(nr)) { @@ -1649,8 +1653,7 @@ static void floppy_release(struct gendisk *disk, fmode_t mode) fd_ref[drive] = 0; } #ifdef MODULE -/* the mod_use counter is handled this way */ - floppy_off (drive | 0x40000000); + floppy_off (drive); #endif mutex_unlock(&amiflop_mutex); } @@ -1791,27 +1794,19 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) floppy_find, NULL, NULL); /* initialize variables */ - init_timer(&motor_on_timer); + timer_setup(&motor_on_timer, motor_on_callback, 0); motor_on_timer.expires = 0; - motor_on_timer.data = 0; - motor_on_timer.function = motor_on_callback; for (i = 0; i < FD_MAX_UNITS; i++) { - init_timer(&motor_off_timer[i]); + timer_setup(&motor_off_timer[i], fd_motor_off, 0); motor_off_timer[i].expires = 0; - motor_off_timer[i].data = i|0x80000000; - motor_off_timer[i].function = fd_motor_off; - init_timer(&flush_track_timer[i]); + timer_setup(&flush_track_timer[i], flush_track_callback, 0); flush_track_timer[i].expires = 0; - flush_track_timer[i].data = i; - flush_track_timer[i].function = flush_track_callback; unit[i].track = -1; } - init_timer(&post_write_timer); + timer_setup(&post_write_timer, post_write_callback, 0); post_write_timer.expires = 0; - post_write_timer.data = 0; - post_write_timer.function = post_write; for (i = 0; i < 128; i++) mfmdecode[i]=255; From 0e0cc9df86bc56e5d55a72e0adf530d6f7fe8628 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 Oct 2017 16:13:54 -0700 Subject: [PATCH 03/13] block/aoe: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Jens Axboe Cc: "Ed L. Cashin" Cc: linux-block@vger.kernel.org Cc: Thomas Gleixner Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 6 +++--- drivers/block/aoe/aoedev.c | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index dc43254e05a4..55ab25f79a08 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -744,7 +744,7 @@ count_targets(struct aoedev *d, int *untainted) } static void -rexmit_timer(ulong vp) +rexmit_timer(struct timer_list *timer) { struct aoedev *d; struct aoetgt *t; @@ -758,7 +758,7 @@ rexmit_timer(ulong vp) int utgts; /* number of aoetgt descriptors (not slots) */ int since; - d = (struct aoedev *) vp; + d = from_timer(d, timer, timer); spin_lock_irqsave(&d->lock, flags); @@ -1429,7 +1429,7 @@ aoecmd_ata_id(struct aoedev *d) d->rttavg = RTTAVG_INIT; d->rttdev = RTTDEV_INIT; - d->timer.function = rexmit_timer; + d->timer.function = (TIMER_FUNC_TYPE)rexmit_timer; skb = skb_clone(skb, GFP_ATOMIC); if (skb) { diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index b28fefb90391..697f735b07a4 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -15,7 +15,6 @@ #include #include "aoe.h" -static void dummy_timer(ulong); static void freetgt(struct aoedev *d, struct aoetgt *t); static void skbpoolfree(struct aoedev *d); @@ -146,11 +145,11 @@ aoedev_put(struct aoedev *d) } static void -dummy_timer(ulong vp) +dummy_timer(struct timer_list *t) { struct aoedev *d; - d = (struct aoedev *)vp; + d = from_timer(d, t, timer); if (d->flags & DEVFL_TKILL) return; d->timer.expires = jiffies + HZ; @@ -466,9 +465,7 @@ aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) INIT_WORK(&d->work, aoecmd_sleepwork); spin_lock_init(&d->lock); skb_queue_head_init(&d->skbpool); - init_timer(&d->timer); - d->timer.data = (ulong) d; - d->timer.function = dummy_timer; + timer_setup(&d->timer, dummy_timer, 0); d->timer.expires = jiffies + HZ; add_timer(&d->timer); d->bufpool = NULL; /* defer to aoeblk_gdalloc */ From b5775a6ba373f1a6e41723ab54c8a4b0fb6f0f00 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 15:32:27 -0700 Subject: [PATCH 04/13] block: swim3: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Ingo Molnar Cc: Arvind Yadav Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 9f931f8f6b4c..e620e423102b 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -239,10 +239,10 @@ static unsigned short write_postamble[] = { static void seek_track(struct floppy_state *fs, int n); static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count); static void act(struct floppy_state *fs); -static void scan_timeout(unsigned long data); -static void seek_timeout(unsigned long data); -static void settle_timeout(unsigned long data); -static void xfer_timeout(unsigned long data); +static void scan_timeout(struct timer_list *t); +static void seek_timeout(struct timer_list *t); +static void settle_timeout(struct timer_list *t); +static void xfer_timeout(struct timer_list *t); static irqreturn_t swim3_interrupt(int irq, void *dev_id); /*static void fd_dma_interrupt(int irq, void *dev_id);*/ static int grab_drive(struct floppy_state *fs, enum swim_state state, @@ -392,13 +392,12 @@ static void do_fd_request(struct request_queue * q) } static void set_timeout(struct floppy_state *fs, int nticks, - void (*proc)(unsigned long)) + void (*proc)(struct timer_list *t)) { if (fs->timeout_pending) del_timer(&fs->timeout); fs->timeout.expires = jiffies + nticks; - fs->timeout.function = proc; - fs->timeout.data = (unsigned long) fs; + fs->timeout.function = (TIMER_FUNC_TYPE)proc; add_timer(&fs->timeout); fs->timeout_pending = 1; } @@ -569,9 +568,9 @@ static void act(struct floppy_state *fs) } } -static void scan_timeout(unsigned long data) +static void scan_timeout(struct timer_list *t) { - struct floppy_state *fs = (struct floppy_state *) data; + struct floppy_state *fs = from_timer(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; unsigned long flags; @@ -594,9 +593,9 @@ static void scan_timeout(unsigned long data) spin_unlock_irqrestore(&swim3_lock, flags); } -static void seek_timeout(unsigned long data) +static void seek_timeout(struct timer_list *t) { - struct floppy_state *fs = (struct floppy_state *) data; + struct floppy_state *fs = from_timer(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; unsigned long flags; @@ -614,9 +613,9 @@ static void seek_timeout(unsigned long data) spin_unlock_irqrestore(&swim3_lock, flags); } -static void settle_timeout(unsigned long data) +static void settle_timeout(struct timer_list *t) { - struct floppy_state *fs = (struct floppy_state *) data; + struct floppy_state *fs = from_timer(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; unsigned long flags; @@ -644,9 +643,9 @@ static void settle_timeout(unsigned long data) spin_unlock_irqrestore(&swim3_lock, flags); } -static void xfer_timeout(unsigned long data) +static void xfer_timeout(struct timer_list *t) { - struct floppy_state *fs = (struct floppy_state *) data; + struct floppy_state *fs = from_timer(fs, t, timeout); struct swim3 __iomem *sw = fs->swim3; struct dbdma_regs __iomem *dr = fs->dma; unsigned long flags; @@ -1182,7 +1181,7 @@ static int swim3_add_device(struct macio_dev *mdev, int index) return -EBUSY; } - init_timer(&fs->timeout); + timer_setup(&fs->timeout, NULL, 0); swim3_info("SWIM3 floppy controller %s\n", mdev->media_bay ? "in media bay" : ""); From 8376d3c1f98988ae7f9e9bc2d1eeeb7d61fd206c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:01:48 -0700 Subject: [PATCH 05/13] md: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Kent Overstreet Cc: Shaohua Li Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: linux-bcache@vger.kernel.org Cc: linux-raid@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Michael Lyle Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/bcache/stats.c | 8 +++----- drivers/md/dm-delay.c | 6 +++--- drivers/md/dm-integrity.c | 6 +++--- drivers/md/dm-raid1.c | 8 +++----- drivers/md/md.c | 9 ++++----- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index d0831d5bcc87..be119326297b 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -147,9 +147,9 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) } } -static void scale_accounting(unsigned long data) +static void scale_accounting(struct timer_list *t) { - struct cache_accounting *acc = (struct cache_accounting *) data; + struct cache_accounting *acc = from_timer(acc, t, timer); #define move_stat(name) do { \ unsigned t = atomic_xchg(&acc->collector.name, 0); \ @@ -234,9 +234,7 @@ void bch_cache_accounting_init(struct cache_accounting *acc, kobject_init(&acc->day.kobj, &bch_stats_ktype); closure_init(&acc->cl, parent); - init_timer(&acc->timer); + timer_setup(&acc->timer, scale_accounting, 0); acc->timer.expires = jiffies + accounting_delay; - acc->timer.data = (unsigned long) acc; - acc->timer.function = scale_accounting; add_timer(&acc->timer); } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2209a9700acd..288386bfbfb5 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -44,9 +44,9 @@ struct dm_delay_info { static DEFINE_MUTEX(delayed_bios_lock); -static void handle_delayed_timer(unsigned long data) +static void handle_delayed_timer(struct timer_list *t) { - struct delay_c *dc = (struct delay_c *)data; + struct delay_c *dc = from_timer(dc, t, delay_timer); queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } @@ -195,7 +195,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_queue; } - setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); + timer_setup(&dc->delay_timer, handle_delayed_timer, 0); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); INIT_LIST_HEAD(&dc->delayed_bios); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 61180783ef42..05c7bfd0c9d9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1094,9 +1094,9 @@ static void sleep_on_endio_wait(struct dm_integrity_c *ic) __remove_wait_queue(&ic->endio_wait, &wait); } -static void autocommit_fn(unsigned long data) +static void autocommit_fn(struct timer_list *t) { - struct dm_integrity_c *ic = (struct dm_integrity_c *)data; + struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer); if (likely(!dm_integrity_failed(ic))) queue_work(ic->commit_wq, &ic->commit_work); @@ -2942,7 +2942,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; - setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic); + timer_setup(&ic->autocommit_timer, autocommit_fn, 0); ic->io = dm_io_client_create(); if (IS_ERR(ic->io)) { diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index c0b82136b2d1..580c49cc8079 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -94,9 +94,9 @@ static void wakeup_mirrord(void *context) queue_work(ms->kmirrord_wq, &ms->kmirrord_work); } -static void delayed_wake_fn(unsigned long data) +static void delayed_wake_fn(struct timer_list *t) { - struct mirror_set *ms = (struct mirror_set *) data; + struct mirror_set *ms = from_timer(ms, t, timer); clear_bit(0, &ms->timer_pending); wakeup_mirrord(ms); @@ -108,8 +108,6 @@ static void delayed_wake(struct mirror_set *ms) return; ms->timer.expires = jiffies + HZ / 5; - ms->timer.data = (unsigned long) ms; - ms->timer.function = delayed_wake_fn; add_timer(&ms->timer); } @@ -1133,7 +1131,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_free_context; } INIT_WORK(&ms->kmirrord_work, do_mirror); - init_timer(&ms->timer); + timer_setup(&ms->timer, delayed_wake_fn, 0); ms->timer_pending = 0; INIT_WORK(&ms->trigger_event, trigger_event); diff --git a/drivers/md/md.c b/drivers/md/md.c index 09c3af3dcdca..0f2d79d16949 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -541,7 +541,7 @@ static void mddev_put(struct mddev *mddev) bioset_free(sync_bs); } -static void md_safemode_timeout(unsigned long data); +static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { @@ -550,8 +550,7 @@ void mddev_init(struct mddev *mddev) mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); - setup_timer(&mddev->safemode_timer, md_safemode_timeout, - (unsigned long) mddev); + timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); @@ -5404,9 +5403,9 @@ static int add_named_array(const char *val, struct kernel_param *kp) return -EINVAL; } -static void md_safemode_timeout(unsigned long data) +static void md_safemode_timeout(struct timer_list *t) { - struct mddev *mddev = (struct mddev *) data; + struct mddev *mddev = from_timer(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) From 4fc930896db910c39999ed0e298798b57362772a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 1 Nov 2017 15:31:49 -0700 Subject: [PATCH 06/13] ide: Make ide_cdrom_prep_fs() initialize the sense buffer pointer The changes introduced through commit 82ed4db499b8 assume that the sense buffer pointer in struct scsi_request is initialized for all requests - passthrough and filesystem requests. Hence make sure that that pointer is initialized for filesystem requests. Remove the memset() call that clears .cmd because the scsi_req_init() call in ide_initialize_rq() already initializes the .cmd. Fixes: commit 82ed4db499b8 ("block: split scsi_request out of struct request") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hongxu Jia Signed-off-by: Jens Axboe --- drivers/ide/ide-cd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 6ff0be8cbdc9..7c3ed7c9af77 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1333,8 +1333,7 @@ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); struct scsi_request *req = scsi_req(rq); - scsi_req_init(req); - memset(req->cmd, 0, BLK_MAX_CDB); + q->initialize_rq_fn(rq); if (rq_data_dir(rq) == READ) req->cmd[0] = GPCMD_READ_10; From 68017e5d87a2477d40476f1a0a06f202ee79316b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 13 Nov 2017 07:34:07 +0100 Subject: [PATCH 07/13] doc, block, bfq: update max IOPS sustainable with BFQ We have investigated more deeply the performance of BFQ, in terms of number of IOPS that can be processed by the CPU when BFQ is used as I/O scheduler. In more detail, using the script [1], we have measured the number of IOPS reached on top of a null block device configured with zero latency, as a function of the workload (sequential read, sequential write, random read, random write) and of the system (we considered desktops, laptops and embedded systems). Basing on the resulting figures, with this commit we update the current, conservative IOPS range reported in BFQ documentation. In particular, the documentation now reports, for each of three different systems, the lowest number of IOPS obtained for that system with the above test (namely, the value obtained with the workload leading to the lowest IOPS). [1] https://github.com/Algodev-github/IOSpeed Reviewed-by: Lee Tibbert Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 3d6951d63489..7a9361508157 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -20,12 +20,17 @@ for that device, by setting low_latency to 0. See Section 3 for details on how to configure BFQ for the desired tradeoff between latency and throughput, or on how to maximize throughput. -On average CPUs, the current version of BFQ can handle devices -performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a -reference, 30-50 KIOPS correspond to very high bandwidths with -sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on -multi-queue devices too. +BFQ has a non-null overhead, which limits the maximum IOPS that the +CPU can process for a device scheduled with BFQ. To give an idea of +the limits on slow or average CPUs, here are BFQ limits for three +different CPUs, on, respectively, an average laptop, an old desktop, +and a cheap embedded system, in case full hierarchical support is +enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set): +- Intel i7-4850HQ: 250 KIOPS +- AMD A8-3850: 170 KIOPS +- ARM CortexTM-A53 Octa-core: 45 KIOPS + +BFQ works for multi-queue devices too. The table of contents follow. Impatients can just jump to Section 3. From 614822f81f606e0064acdae11d9ec1efd3db4190 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Mon, 13 Nov 2017 07:34:08 +0100 Subject: [PATCH 08/13] block, bfq: add missing invocations of bfqg_stats_update_io_add/remove bfqg_stats_update_io_add and bfqg_stats_update_io_remove are to be invoked, respectively, when an I/O request enters and when an I/O request exits the scheduler. Unfortunately, bfq does not fully comply with this scheme, because it does not invoke these functions for requests that are inserted into or extracted from its priority dispatch list. This commit fixes this mistake. Tested-by: Lee Tibbert Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 889a8549d97f..91703eba63f0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1359,7 +1359,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); /* * bfqq deserves to be weight-raised if: @@ -1633,7 +1632,6 @@ static void bfq_remove_request(struct request_queue *q, if (rq->cmd_flags & REQ_META) bfqq->meta_pending--; - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) @@ -1746,6 +1744,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(q, next); + bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); spin_unlock_irq(&bfqq->bfqd->lock); end: @@ -3700,6 +3699,9 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock_irq(&bfqd->lock); rq = __bfq_dispatch_request(hctx); + if (rq && RQ_BFQQ(rq)) + bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), + rq->cmd_flags); spin_unlock_irq(&bfqd->lock); return rq; @@ -4224,6 +4226,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4243,6 +4246,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { __bfq_insert_request(bfqd, rq); + /* + * Update bfqq, because, if a queue merge has occurred + * in __bfq_insert_request, then rq has been + * redirected into a new queue. + */ + bfqq = RQ_BFQQ(rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4251,6 +4260,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } + if (bfqq) + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); + spin_unlock_irq(&bfqd->lock); } @@ -4428,8 +4440,11 @@ static void bfq_finish_request(struct request *rq) * lock is held. */ - if (!RB_EMPTY_NODE(&rq->rb_node)) + if (!RB_EMPTY_NODE(&rq->rb_node)) { bfq_remove_request(rq->q, rq); + bfqg_stats_update_io_remove(bfqq_group(bfqq), + rq->cmd_flags); + } bfq_put_rq_priv_body(bfqq); } From 24bfd19bb7890255693ee5cb6dc100d8d215d00b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 13 Nov 2017 07:34:09 +0100 Subject: [PATCH 09/13] block, bfq: update blkio stats outside the scheduler lock bfq invokes various blkg_*stats_* functions to update the statistics contained in the special files blkio.bfq.* in the blkio controller groups, i.e., the I/O accounting related to the proportional-share policy provided by bfq. The execution of these functions takes a considerable percentage, about 40%, of the total per-request execution time of bfq (i.e., of the sum of the execution time of all the bfq functions that have to be executed to process an I/O request from its creation to its destruction). This reduces the request-processing rate sustainable by bfq noticeably, even on a multicore CPU. In fact, the bfq functions that invoke blkg_*stats_* functions cannot be executed in parallel with the rest of the code of bfq, because both are executed under the same same per-device scheduler lock. To reduce this slowdown, this commit moves, wherever possible, the invocation of these functions (more precisely, of the bfq functions that invoke blkg_*stats_* functions) outside the critical sections protected by the scheduler lock. With this change, and with all blkio.bfq.* statistics enabled, the throughput grows, e.g., from 250 to 310 KIOPS (+25%) on an Intel i7-4850HQ, in case of 8 threads doing random I/O in parallel on null_blk, with the latter configured with 0 latency. We obtained the same or higher throughput boosts, up to +30%, with other processors (some figures are reported in the documentation). For our tests, we used the script [1], with which our results can be easily reproduced. NOTE. This commit still protects the invocation of blkg_*stats_* functions with the request_queue lock, because the group these functions are invoked on may otherwise disappear before or while these functions are executed. Fortunately, tests without even this lock show, by difference, that the serialization caused by this lock has a little impact (at most ~5% of throughput reduction). [1] https://github.com/Algodev-github/IOSpeed Tested-by: Lee Tibbert Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 6 +- block/bfq-iosched.c | 110 +++++++++++++++++++++++++--- block/bfq-wf2q.c | 1 - 3 files changed, 102 insertions(+), 15 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 7a9361508157..7fad6c061470 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -26,9 +26,9 @@ the limits on slow or average CPUs, here are BFQ limits for three different CPUs, on, respectively, an average laptop, an old desktop, and a cheap embedded system, in case full hierarchical support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set): -- Intel i7-4850HQ: 250 KIOPS -- AMD A8-3850: 170 KIOPS -- ARM CortexTM-A53 Octa-core: 45 KIOPS +- Intel i7-4850HQ: 310 KIOPS +- AMD A8-3850: 200 KIOPS +- ARM CortexTM-A53 Octa-core: 56 KIOPS BFQ works for multi-queue devices too. diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 91703eba63f0..69e05f861daf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2228,7 +2228,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8; @@ -3469,7 +3468,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); } goto keep_queue; } @@ -3695,15 +3693,67 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_queue *in_serv_queue, *bfqq; + bool waiting_rq, idle_timer_disabled; +#endif spin_lock_irq(&bfqd->lock); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + rq = __bfq_dispatch_request(hctx); - if (rq && RQ_BFQQ(rq)) - bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), - rq->cmd_flags); + + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + +#else + rq = __bfq_dispatch_request(hctx); +#endif spin_unlock_irq(&bfqd->lock); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqq = rq ? RQ_BFQQ(rq) : NULL; + if (!idle_timer_disabled && !bfqq) + return rq; + + /* + * rq and bfqq are guaranteed to exist until this function + * ends, for the following reasons. First, rq can be + * dispatched to the device, and then can be completed and + * freed, only after this function ends. Second, rq cannot be + * merged (and thus freed because of a merge) any longer, + * because it has already started. Thus rq cannot be freed + * before this function ends, and, since rq has a reference to + * bfqq, the same guarantee holds for bfqq too. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(hctx->queue->queue_lock); + if (idle_timer_disabled) + /* + * Since the idle timer has been disabled, + * in_serv_queue contained some request when + * __bfq_dispatch_request was invoked above, which + * implies that rq was picked exactly from + * in_serv_queue. Thus in_serv_queue == bfqq, and is + * therefore guaranteed to exist because of the above + * arguments. + */ + bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); + if (bfqq) { + struct bfq_group *bfqg = bfqq_group(bfqq); + + bfqg_stats_update_avg_queue_size(bfqg); + bfqg_stats_set_start_empty_time(bfqg); + bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); + } + spin_unlock_irq(hctx->queue->queue_lock); +#endif + return rq; } @@ -4161,7 +4211,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); /* * The queue is not empty, because a new request just @@ -4176,10 +4225,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } -static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) +/* returns true if it causes the idle timer to be disabled */ +static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + bool waiting, idle_timer_disabled = false; if (new_bfqq) { if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) @@ -4213,12 +4264,16 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfqq = new_bfqq; } + waiting = bfqq && bfq_bfqq_wait_request(bfqq); bfq_add_request(rq); + idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); + + return idle_timer_disabled; } static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -4226,7 +4281,11 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; +#ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_queue *bfqq = RQ_BFQQ(rq); + bool idle_timer_disabled = false; + unsigned int cmd_flags; +#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4245,13 +4304,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { - __bfq_insert_request(bfqd, rq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred * in __bfq_insert_request, then rq has been * redirected into a new queue. */ bfqq = RQ_BFQQ(rq); +#else + __bfq_insert_request(bfqd, rq); +#endif if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4260,10 +4323,35 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } - if (bfqq) - bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); - +#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* + * Cache cmd_flags before releasing scheduler lock, because rq + * may disappear afterwards (for example, because of a request + * merge). + */ + cmd_flags = rq->cmd_flags; +#endif spin_unlock_irq(&bfqd->lock); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!bfqq) + return; + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instruction. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(q->queue_lock); +#endif } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 414ba686a847..e495d3f9b4b0 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -843,7 +843,6 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); } From a33801e8b4735b8d473f963e5854172f9cde3e8b Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Mon, 13 Nov 2017 07:34:10 +0100 Subject: [PATCH 10/13] block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP BFQ currently creates, and updates, its own instance of the whole set of blkio statistics that cfq creates. Yet, from the comments of Tejun Heo in [1], it turned out that most of these statistics are meant/useful only for debugging. This commit makes BFQ create the latter, debugging statistics only if the option CONFIG_DEBUG_BLK_CGROUP is set. By doing so, this commit also enables BFQ to enjoy a high perfomance boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then BFQ has to update far fewer statistics, and, in particular, not the heaviest to update. To give an idea of the benefits, if CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and with 8 threads doing random I/O in parallel on null_blk (configured with 0 latency), the throughput of BFQ grows from 310 to 400 KIOPS (+30%). We have measured similar or even much higher boosts with other CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have been obtained and can be reproduced very easily with the script in [1]. [1] https://www.spinics.net/lists/linux-block/msg18943.html Suggested-by: Tejun Heo Suggested-by: Ulf Hansson Tested-by: Lee Tibbert Tested-by: Oleksandr Natalenko Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 38 +++++-- block/bfq-cgroup.c | 148 ++++++++++++++++------------ block/bfq-iosched.c | 14 +-- block/bfq-iosched.h | 4 +- 4 files changed, 125 insertions(+), 79 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 7fad6c061470..8d8d8f06cab2 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -20,12 +20,22 @@ for that device, by setting low_latency to 0. See Section 3 for details on how to configure BFQ for the desired tradeoff between latency and throughput, or on how to maximize throughput. -BFQ has a non-null overhead, which limits the maximum IOPS that the -CPU can process for a device scheduled with BFQ. To give an idea of -the limits on slow or average CPUs, here are BFQ limits for three -different CPUs, on, respectively, an average laptop, an old desktop, -and a cheap embedded system, in case full hierarchical support is -enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set): +BFQ has a non-null overhead, which limits the maximum IOPS that a CPU +can process for a device scheduled with BFQ. To give an idea of the +limits on slow or average CPUs, here are, first, the limits of BFQ for +three different CPUs, on, respectively, an average laptop, an old +desktop, and a cheap embedded system, in case full hierarchical +support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but +CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): +- Intel i7-4850HQ: 400 KIOPS +- AMD A8-3850: 250 KIOPS +- ARM CortexTM-A53 Octa-core: 80 KIOPS + +If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical +support is enabled), then the sustainable throughput with BFQ +decreases, because all blkio.bfq* statistics are created and updated +(Section 4-2). For BFQ, this leads to the following maximum +sustainable throughputs, on the same systems as above: - Intel i7-4850HQ: 310 KIOPS - AMD A8-3850: 200 KIOPS - ARM CortexTM-A53 Octa-core: 56 KIOPS @@ -505,6 +515,22 @@ BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group parameter to set the weight of a group with BFQ is blkio.bfq.weight or io.bfq.weight. +As for cgroups-v1 (blkio controller), the exact set of stat files +created, and kept up-to-date by bfq, depends on whether +CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all +the stat files documented in +Documentation/cgroup-v1/blkio-controller.txt. If, instead, +CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files +blkio.bfq.io_service_bytes +blkio.bfq.io_service_bytes_recursive +blkio.bfq.io_serviced +blkio.bfq.io_serviced_recursive + +The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum +throughput sustainable with bfq, because updating the blkio.bfq.* +stats is rather costly, especially for some of the stats enabled by +CONFIG_DEBUG_BLK_CGROUP. + Parameters to set ----------------- diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index ceefb9a706d6..da1525ec4c87 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -24,7 +24,7 @@ #include "bfq-iosched.h" -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* bfqg stats flags */ enum bfqg_stats_flags { @@ -152,6 +152,57 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) bfqg_stats_update_group_wait_time(stats); } +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, -1); +} + +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.merged, op, 1); +} + +void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, + uint64_t io_start_time, unsigned int op) +{ + struct bfqg_stats *stats = &bfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, op, + io_start_time - start_time); +} + +#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op) { } +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, + uint64_t io_start_time, unsigned int op) { } +void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* * blk-cgroup policy-related handlers * The following functions help in converting between blk-cgroup @@ -229,42 +280,10 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) blkg_put(bfqg_to_blkg(bfqg)); } -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} - -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, -1); -} - -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.merged, op, 1); -} - -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op) -{ - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, - now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); -} - /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { +#ifdef CONFIG_DEBUG_BLK_CGROUP /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); @@ -276,6 +295,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) blkg_stat_reset(&stats->group_wait_time); blkg_stat_reset(&stats->idle_time); blkg_stat_reset(&stats->empty_time); +#endif } /* @to += @from */ @@ -284,6 +304,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) if (!to || !from) return; +#ifdef CONFIG_DEBUG_BLK_CGROUP /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); @@ -296,6 +317,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); blkg_stat_add_aux(&to->idle_time, &from->idle_time); blkg_stat_add_aux(&to->empty_time, &from->empty_time); +#endif } /* @@ -342,6 +364,7 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { +#ifdef CONFIG_DEBUG_BLK_CGROUP blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -353,10 +376,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) blkg_stat_exit(&stats->group_wait_time); blkg_stat_exit(&stats->idle_time); blkg_stat_exit(&stats->empty_time); +#endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { +#ifdef CONFIG_DEBUG_BLK_CGROUP if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || @@ -371,6 +396,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfqg_stats_exit(stats); return -ENOMEM; } +#endif return 0; } @@ -887,6 +913,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } +#ifdef CONFIG_DEBUG_BLK_CGROUP static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, @@ -991,6 +1018,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) 0, false); return 0; } +#endif /* CONFIG_DEBUG_BLK_CGROUP */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { @@ -1028,15 +1056,6 @@ struct cftype bfq_blkcg_legacy_files[] = { }, /* statistics, covers only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = "bfq.sectors", - .seq_show = bfqg_print_stat_sectors, - }, { .name = "bfq.io_service_bytes", .private = (unsigned long)&blkcg_policy_bfq, @@ -1047,6 +1066,16 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios, }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .seq_show = bfqg_print_stat_sectors, + }, { .name = "bfq.io_service_time", .private = offsetof(struct bfq_group, stats.service_time), @@ -1067,17 +1096,9 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ /* the same statictics which cover the bfqg and its descendants */ - { - .name = "bfq.time_recursive", - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = "bfq.sectors_recursive", - .seq_show = bfqg_print_stat_sectors_recursive, - }, { .name = "bfq.io_service_bytes_recursive", .private = (unsigned long)&blkcg_policy_bfq, @@ -1088,6 +1109,16 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios_recursive, }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .seq_show = bfqg_print_stat_sectors_recursive, + }, { .name = "bfq.io_service_time_recursive", .private = offsetof(struct bfq_group, stats.service_time), @@ -1132,6 +1163,7 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; @@ -1147,18 +1179,6 @@ struct cftype bfq_blkg_files[] = { #else /* CONFIG_BFQ_GROUP_IOSCHED */ -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) { } -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op) { } -void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } - void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 69e05f861daf..bcb6d21baf12 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3693,14 +3693,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *in_serv_queue, *bfqq; bool waiting_rq, idle_timer_disabled; #endif spin_lock_irq(&bfqd->lock); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) in_serv_queue = bfqd->in_service_queue; waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); @@ -3714,7 +3714,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) #endif spin_unlock_irq(&bfqd->lock); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) return rq; @@ -4281,7 +4281,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *bfqq = RQ_BFQQ(rq); bool idle_timer_disabled = false; unsigned int cmd_flags; @@ -4304,7 +4304,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4323,7 +4323,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request @@ -4333,7 +4333,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, #endif spin_unlock_irq(&bfqd->lock); -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) if (!bfqq) return; /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ac0809c72c98..91c4390903a1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -689,7 +689,7 @@ enum bfqq_expiration { }; struct bfqg_stats { -#ifdef CONFIG_BFQ_GROUP_IOSCHED +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -717,7 +717,7 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; -#endif /* CONFIG_BFQ_GROUP_IOSCHED */ +#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ }; #ifdef CONFIG_BFQ_GROUP_IOSCHED From 34d9715ac1edd50285168dd8d80c972739a4f6a4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Nov 2017 08:08:44 +0800 Subject: [PATCH 11/13] block: wake up all tasks blocked in get_request() Once blk_set_queue_dying() is done in blk_cleanup_queue(), we call blk_freeze_queue() and wait for q->q_usage_counter becoming zero. But if there are tasks blocked in get_request(), q->q_usage_counter can never become zero. So we have to wake up all these tasks in blk_set_queue_dying() first. Fixes: 3ef28e83ab157997 ("block: generic request_queue reference counting") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 7c54c195e79e..1038706edd87 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -637,8 +637,8 @@ void blk_set_queue_dying(struct request_queue *q) spin_lock_irq(q->queue_lock); blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { - wake_up(&rl->wait[BLK_RW_SYNC]); - wake_up(&rl->wait[BLK_RW_ASYNC]); + wake_up_all(&rl->wait[BLK_RW_SYNC]); + wake_up_all(&rl->wait[BLK_RW_ASYNC]); } } spin_unlock_irq(q->queue_lock); From cce75291ffd82eb5bac56b051f46f89e8c0d2918 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 15 Nov 2017 17:00:21 -0800 Subject: [PATCH 12/13] nvmet_fc: fix better length checking Reorganize nvmet_fc_handle_fcp_rqst() so that the nvmet req.transfer_len field is set after the call nvmet_req_init(). An update to nvmet now has nvmet_req_init() clearing the field, thus the fc transport was losing the value. Reviewed-by: Christoph Hellwig Signed-off-by: James Smart Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 739b8feadc7d..664d3013f68f 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2144,6 +2144,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod) { struct nvme_fc_cmd_iu *cmdiu = &fod->cmdiubuf; + u32 xfrlen = be32_to_cpu(cmdiu->data_len); int ret; /* @@ -2157,7 +2158,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; - fod->req.transfer_len = be32_to_cpu(cmdiu->data_len); if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { fod->io_dir = NVMET_FCP_WRITE; if (!nvme_is_write(&cmdiu->sqe)) @@ -2168,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, goto transport_error; } else { fod->io_dir = NVMET_FCP_NODATA; - if (fod->req.transfer_len) + if (xfrlen) goto transport_error; } @@ -2192,6 +2192,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, return; } + fod->req.transfer_len = xfrlen; + /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); From 62530ed8b1d07a45dec94d46e521c0c6c2d476e6 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Thu, 16 Nov 2017 23:47:25 -0800 Subject: [PATCH 13/13] bio: ensure __bio_clone_fast copies bi_partno A new field was introduced in 74d46992e0d9, bi_partno, instead of using bdev->bd_contains and encoding the partition information in the bi_bdev field. __bio_clone_fast was changed to copy the disk information, but not the partition information. At minimum, this regressed bcache and caused data corruption. Signed-off-by: Michael Lyle Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Reported-by: Pavel Goran Reported-by: Campbell Steven Reviewed-by: Coly Li Reviewed-by: Ming Lei Cc: # 4.14 Signed-off-by: Jens Axboe --- block/bio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bio.c b/block/bio.c index b94a802f8ba3..459cc857f3d9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -597,6 +597,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) * so we don't set nor calculate new physical/hw segment counts here */ bio->bi_disk = bio_src->bi_disk; + bio->bi_partno = bio_src->bi_partno; bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint;