From 7b21e34fd1c272e3a8c3846168f2f6287a4cd72b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:42 +1030 Subject: [PATCH 01/24] virtio: harsher barriers for rpmsg. We were cheating with our barriers; using the smp ones rather than the real device ones. That was fine, until rpmsg came along, which is used to talk to a real device (a non-SMP CPU). Unfortunately, just putting back the real barriers (reverting d57ed95d) causes a performance regression on virtio-pci. In particular, Amos reports netbench's TCP_RR over virtio_net CPU utilization increased up to 35% while throughput went down by up to 14%. By comparison, this branch is in the noise. Reference: https://lkml.org/lkml/2011/12/11/22 Signed-off-by: Rusty Russell --- drivers/lguest/lguest_device.c | 8 +++++--- drivers/s390/kvm/kvm_virtio.c | 2 +- drivers/virtio/virtio_mmio.c | 4 ++-- drivers/virtio/virtio_pci.c | 4 ++-- drivers/virtio/virtio_ring.c | 34 +++++++++++++++++++++------------- include/linux/virtio_ring.h | 1 + tools/virtio/linux/virtio.h | 1 + tools/virtio/virtio_test.c | 3 ++- 8 files changed, 35 insertions(+), 22 deletions(-) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 595d73197016..6a1d6447b864 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -292,10 +292,12 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* * OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. + * and we've got a pointer to its pages. Note that we set weak_barriers + * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu + * barriers. 
*/ - vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, - vdev, lvq->pages, lg_notify, callback, name); + vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, + true, lvq->pages, lg_notify, callback, name); if (!vq) { err = -ENOMEM; goto unmap; diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index 8af868bab20b..7bc1955337ea 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -198,7 +198,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, goto out; vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN, - vdev, (void *) config->address, + vdev, true, (void *) config->address, kvm_notify, callback, name); if (!vq) { err = -ENOMEM; diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 0269717436af..01d6dc250d5c 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -310,8 +310,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); /* Create the vring */ - vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, - vdev, info->queue, vm_notify, callback, name); + vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, + true, info->queue, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index baabb7937ec2..688b42d28dad 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -414,8 +414,8 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); /* create the vring */ - vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, - vdev, info->queue, vp_notify, callback, name); + vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, vdev, + true, info->queue, vp_notify, callback, name); if (!vq) { err = -ENOMEM; goto out_activate_queue; diff 
--git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c7a2c208f6ea..50da92046092 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -28,17 +28,20 @@ #ifdef CONFIG_SMP /* Where possible, use SMP barriers which are more lightweight than mandatory * barriers, because mandatory barriers control MMIO effects on accesses - * through relaxed memory I/O windows (which virtio does not use). */ -#define virtio_mb() smp_mb() -#define virtio_rmb() smp_rmb() -#define virtio_wmb() smp_wmb() + * through relaxed memory I/O windows (which virtio-pci does not use). */ +#define virtio_mb(vq) \ + do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) +#define virtio_rmb(vq) \ + do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) +#define virtio_wmb(vq) \ + do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) #else /* We must force memory ordering even if guest is UP since host could be * running on another CPU, but SMP barriers are defined to barrier() in that * configuration. So fall back to mandatory barriers instead. */ -#define virtio_mb() mb() -#define virtio_rmb() rmb() -#define virtio_wmb() wmb() +#define virtio_mb(vq) mb() +#define virtio_rmb(vq) rmb() +#define virtio_wmb(vq) wmb() #endif #ifdef DEBUG @@ -77,6 +80,9 @@ struct vring_virtqueue /* Actual memory layout for this queue */ struct vring vring; + /* Can we use weak barriers? */ + bool weak_barriers; + /* Other side has made a mess, don't try any more. */ bool broken; @@ -245,14 +251,14 @@ void virtqueue_kick(struct virtqueue *_vq) START_USE(vq); /* Descriptors and available array need to be set before we expose the * new available array entries. */ - virtio_wmb(); + virtio_wmb(vq); old = vq->vring.avail->idx; new = vq->vring.avail->idx = old + vq->num_added; vq->num_added = 0; /* Need to update avail index before checking if we should notify */ - virtio_mb(); + virtio_mb(vq); if (vq->event ? 
vring_need_event(vring_avail_event(&vq->vring), new, old) : @@ -314,7 +320,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) } /* Only get used array entries after they have been exposed by host. */ - virtio_rmb(); + virtio_rmb(vq); i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; @@ -337,7 +343,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) * the read in the next get_buf call. */ if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(); + virtio_mb(vq); } END_USE(vq); @@ -366,7 +372,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) * entry. Always do both to keep code simple. */ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(); + virtio_mb(vq); if (unlikely(more_used(vq))) { END_USE(vq); return false; @@ -393,7 +399,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) /* TODO: tune this threshold */ bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; vring_used_event(&vq->vring) = vq->last_used_idx + bufs; - virtio_mb(); + virtio_mb(vq); if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { END_USE(vq); return false; @@ -453,6 +459,7 @@ EXPORT_SYMBOL_GPL(vring_interrupt); struct virtqueue *vring_new_virtqueue(unsigned int num, unsigned int vring_align, struct virtio_device *vdev, + bool weak_barriers, void *pages, void (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), @@ -476,6 +483,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->vq.vdev = vdev; vq->vq.name = name; vq->notify = notify; + vq->weak_barriers = weak_barriers; vq->broken = false; vq->last_used_idx = 0; vq->num_added = 0; diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 36be0f6e18a9..e338730c2660 100644 --- a/include/linux/virtio_ring.h +++ 
b/include/linux/virtio_ring.h @@ -168,6 +168,7 @@ struct virtqueue; struct virtqueue *vring_new_virtqueue(unsigned int num, unsigned int vring_align, struct virtio_device *vdev, + bool weak_barriers, void *pages, void (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 669bcdd45805..953db2abf6b9 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -214,6 +214,7 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq); struct virtqueue *vring_new_virtqueue(unsigned int num, unsigned int vring_align, struct virtio_device *vdev, + bool weak_barriers, void *pages, void (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 74d3331bdaf9..0740284396c1 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -92,7 +92,8 @@ static void vq_info_add(struct vdev_info *dev, int num) assert(r >= 0); memset(info->ring, 0, vring_size(num, 4096)); vring_init(&info->vring, num, info->ring, 4096); - info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, info->ring, + info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, + true, info->ring, vq_notify, vq_callback, "test"); assert(info->vq); info->vq->priv = info; From 1e214a5c1a7e901fc8e98ad6ef84f11005f9ee9d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Nov 2011 10:20:04 +0200 Subject: [PATCH 02/24] virtio-balloon: Trivial cleanups Trivial changes to remove forgotten junk, format comments, and correct names. Cc: Rusty Russell Cc: "Michael S. 
Tsirkin" Cc: virtualization@lists.linux-foundation.org Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 94fd738a7741..f64ff185b8b5 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -1,4 +1,5 @@ -/* Virtio balloon implementation, inspired by Dor Loar and Marcelo +/* + * Virtio balloon implementation, inspired by Dor Laor and Marcelo * Tosatti's implementations. * * Copyright 2008 Rusty Russell IBM Corporation @@ -17,7 +18,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -//#define DEBUG + #include #include #include @@ -149,7 +150,6 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num) vb->num_pages--; } - /* * Note that if * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); From 5dfc17628d57f9e62043ed0cba03a6e3eb019a78 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:42 +1030 Subject: [PATCH 03/24] virtio: document functions better. The old documentation is left over from when we used a structure with strategy pointers. And move the documentation to the C file as per kernel practice. Though I disagree... 
Signed-off-by: Rusty Russell Reviewed-by: Christoph Hellwig --- drivers/virtio/virtio_ring.c | 92 +++++++++++++++++++++++++++++++++++- include/linux/virtio.h | 47 ------------------ 2 files changed, 91 insertions(+), 48 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 50da92046092..fe50486341a4 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -166,6 +166,23 @@ static int vring_add_indirect(struct vring_virtqueue *vq, return head; } +/** + * virtqueue_add_buf_gfp - expose buffer to other end + * @vq: the struct virtqueue we're talking about. + * @sg: the description of the buffer(s). + * @out_num: the number of sg readable by other side + * @in_num: the number of sg which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns remaining capacity of queue or a negative error + * (ie. ENOSPC). Note that it only really makes sense to treat all + * positive return values as "available": indirect buffers mean that + * we can put an entire sg[] array inside a single queue entry. + */ int virtqueue_add_buf_gfp(struct virtqueue *_vq, struct scatterlist sg[], unsigned int out, @@ -244,6 +261,16 @@ int virtqueue_add_buf_gfp(struct virtqueue *_vq, } EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp); +/** + * virtqueue_kick - update after add_buf + * @vq: the struct virtqueue + * + * After one or more virtqueue_add_buf_gfp calls, invoke this to kick + * the other side. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). 
+ */ void virtqueue_kick(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -300,6 +327,22 @@ static inline bool more_used(const struct vring_virtqueue *vq) return vq->last_used_idx != vq->vring.used->idx; } +/** + * virtqueue_get_buf - get the next used buffer + * @vq: the struct virtqueue we're talking about. + * @len: the length written into the buffer + * + * If the other side wrote data into the buffer, @len will be set to the + * amount written. This means you don't need to clear the buffer + * beforehand to ensure there's no data leakage in the case of short + * writes. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + * + * Returns NULL if there are no used buffers, or the "data" token + * handed to virtqueue_add_buf_gfp(). + */ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -351,6 +394,15 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) } EXPORT_SYMBOL_GPL(virtqueue_get_buf); +/** + * virtqueue_disable_cb - disable callbacks + * @vq: the struct virtqueue we're talking about. + * + * Note that this is not necessarily synchronous, hence unreliable and only + * useful as an optimization. + * + * Unlike other operations, this need not be serialized. + */ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -359,6 +411,17 @@ void virtqueue_disable_cb(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); +/** + * virtqueue_enable_cb - restart callbacks after disable_cb. + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks; it returns "false" if there are pending + * buffers in the queue, to detect a possible race between the driver + * checking for more work, and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). 
+ */ bool virtqueue_enable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -383,6 +446,19 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_enable_cb); +/** + * virtqueue_enable_cb_delayed - restart callbacks after disable_cb. + * @vq: the struct virtqueue we're talking about. + * + * This re-enables callbacks but hints to the other side to delay + * interrupts until most of the available buffers have been processed; + * it returns "false" if there are many pending buffers in the queue, + * to detect a possible race between the driver checking for more work, + * and enabling callbacks. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -410,6 +486,14 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); +/** + * virtqueue_detach_unused_buf - detach first unused buffer + * @vq: the struct virtqueue we're talking about. + * + * Returns NULL or the "data" token handed to virtqueue_add_buf_gfp(). + * This is not valid on an active queue; it is useful only for device + * shutdown. + */ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -538,7 +622,13 @@ void vring_transport_features(struct virtio_device *vdev) } EXPORT_SYMBOL_GPL(vring_transport_features); -/* return the size of the vring within the virtqueue */ +/** + * virtqueue_get_vring_size - return the size of the virtqueue's vring + * @vq: the struct virtqueue containing the vring of interest. + * + * Returns the size of the vring. This is mainly used for boasting to + * userspace. Unlike other operations, this need not be serialized. 
+ */ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 4c069d8bd740..73ad7243128f 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -25,53 +25,6 @@ struct virtqueue { void *priv; }; -/** - * operations for virtqueue - * virtqueue_add_buf: expose buffer to other end - * vq: the struct virtqueue we're talking about. - * sg: the description of the buffer(s). - * out_num: the number of sg readable by other side - * in_num: the number of sg which are writable (after readable ones) - * data: the token identifying the buffer. - * gfp: how to do memory allocations (if necessary). - * Returns remaining capacity of queue (sg segments) or a negative error. - * virtqueue_kick: update after add_buf - * vq: the struct virtqueue - * After one or more add_buf calls, invoke this to kick the other side. - * virtqueue_get_buf: get the next used buffer - * vq: the struct virtqueue we're talking about. - * len: the length written into the buffer - * Returns NULL or the "data" token handed to add_buf. - * virtqueue_disable_cb: disable callbacks - * vq: the struct virtqueue we're talking about. - * Note that this is not necessarily synchronous, hence unreliable and only - * useful as an optimization. - * virtqueue_enable_cb: restart callbacks after disable_cb. - * vq: the struct virtqueue we're talking about. - * This re-enables callbacks; it returns "false" if there are pending - * buffers in the queue, to detect a possible race between the driver - * checking for more work, and enabling callbacks. - * virtqueue_enable_cb_delayed: restart callbacks after disable_cb. - * vq: the struct virtqueue we're talking about. 
- * This re-enables callbacks but hints to the other side to delay - * interrupts until most of the available buffers have been processed; - * it returns "false" if there are many pending buffers in the queue, - * to detect a possible race between the driver checking for more work, - * and enabling callbacks. - * virtqueue_detach_unused_buf: detach first unused buffer - * vq: the struct virtqueue we're talking about. - * Returns NULL or the "data" token handed to add_buf - * virtqueue_get_vring_size: return the size of the virtqueue's vring - * vq: the struct virtqueue containing the vring of interest. - * Returns the size of the vring. - * - * Locking rules are straightforward: the driver is responsible for - * locking. No two operations may be invoked simultaneously, with the exception - * of virtqueue_disable_cb. - * - * All operations can be called in any context. - */ - int virtqueue_add_buf_gfp(struct virtqueue *vq, struct scatterlist sg[], unsigned int out_num, From f96fde41f7f9af6cf20f6a1919f5d9670f84d574 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:42 +1030 Subject: [PATCH 04/24] virtio: rename virtqueue_add_buf_gfp to virtqueue_add_buf Remove wrapper functions. This makes the allocation type explicit in all callers; I used GFP_KERNEL where it seemed obvious, left it at GFP_ATOMIC otherwise. 
Signed-off-by: Rusty Russell Reviewed-by: Christoph Hellwig --- drivers/block/virtio_blk.c | 2 +- drivers/char/hw_random/virtio-rng.c | 2 +- drivers/char/virtio_console.c | 6 +++--- drivers/net/virtio_net.c | 12 ++++++------ drivers/virtio/virtio_balloon.c | 7 ++++--- drivers/virtio/virtio_ring.c | 22 +++++++++++----------- include/linux/virtio.h | 21 ++++++--------------- net/9p/trans_virtio.c | 6 ++++-- tools/virtio/linux/virtio.h | 21 ++++++--------------- tools/virtio/virtio_test.c | 3 ++- 10 files changed, 44 insertions(+), 58 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 4d0b70adf5f7..a345e40e1bca 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -172,7 +172,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) { + if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) { mempool_free(vbr, vblk->pool); return false; } diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index fd699ccecf5b..723725bbb96b 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size) sg_init_one(&sg, buf, size); /* There should always be room for one buffer. 
*/ - if (virtqueue_add_buf(vq, &sg, 0, 1, buf) < 0) + if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 8e3c46d67cb3..d1ae1492ee78 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -392,7 +392,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf) sg_init_one(sg, buf->buf, buf->size); - ret = virtqueue_add_buf(vq, sg, 0, 1, buf); + ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); virtqueue_kick(vq); return ret; } @@ -457,7 +457,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, vq = portdev->c_ovq; sg_init_one(sg, &cpkt, sizeof(cpkt)); - if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt) >= 0) { + if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) >= 0) { virtqueue_kick(vq); while (!virtqueue_get_buf(vq, &len)) cpu_relax(); @@ -506,7 +506,7 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, reclaim_consumed_buffers(port); sg_init_one(sg, in_buf, in_count); - ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf); + ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf, GFP_ATOMIC); /* Tell Host to go! 
*/ virtqueue_kick(out_vq); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 76fe14efb2b5..6345a52194f9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -370,7 +370,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp) skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len); - err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 2, skb, gfp); + err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 2, skb, gfp); if (err < 0) dev_kfree_skb(skb); @@ -415,8 +415,8 @@ static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp) /* chain first in list head */ first->private = (unsigned long)list; - err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2, - first, gfp); + err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2, + first, gfp); if (err < 0) give_pages(vi, first); @@ -434,7 +434,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp) sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE); - err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 1, page, gfp); + err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 1, page, gfp); if (err < 0) give_pages(vi, page); @@ -609,7 +609,7 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1; return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg, - 0, skb); + 0, skb, GFP_ATOMIC); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -767,7 +767,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, sg_set_buf(&sg[i + 1], sg_virt(s), s->length); sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); - BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi) < 0); + BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); virtqueue_kick(vi->cvq); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index f64ff185b8b5..0a6425aadf95 100644 --- a/drivers/virtio/virtio_balloon.c +++ 
b/drivers/virtio/virtio_balloon.c @@ -88,7 +88,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) init_completion(&vb->acked); /* We should always be able to add one buffer to an empty queue. */ - if (virtqueue_add_buf(vq, &sg, 1, 0, vb) < 0) + if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); @@ -220,7 +220,7 @@ static void stats_handle_request(struct virtio_balloon *vb) vq = vb->stats_vq; sg_init_one(&sg, vb->stats, sizeof(vb->stats)); - if (virtqueue_add_buf(vq, &sg, 1, 0, vb) < 0) + if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); } @@ -313,7 +313,8 @@ static int virtballoon_probe(struct virtio_device *vdev) * use it to signal us later. */ sg_init_one(&sg, vb->stats, sizeof vb->stats); - if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb) < 0) + if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) + < 0) BUG(); virtqueue_kick(vb->stats_vq); } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index fe50486341a4..6ea92a6d1134 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -167,7 +167,7 @@ static int vring_add_indirect(struct vring_virtqueue *vq, } /** - * virtqueue_add_buf_gfp - expose buffer to other end + * virtqueue_add_buf - expose buffer to other end * @vq: the struct virtqueue we're talking about. * @sg: the description of the buffer(s). * @out_num: the number of sg readable by other side @@ -183,12 +183,12 @@ static int vring_add_indirect(struct vring_virtqueue *vq, * positive return values as "available": indirect buffers mean that * we can put an entire sg[] array inside a single queue entry. 
*/ -int virtqueue_add_buf_gfp(struct virtqueue *_vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - void *data, - gfp_t gfp) +int virtqueue_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data, + gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i, avail, uninitialized_var(prev); @@ -259,13 +259,13 @@ int virtqueue_add_buf_gfp(struct virtqueue *_vq, return vq->num_free; } -EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp); +EXPORT_SYMBOL_GPL(virtqueue_add_buf); /** * virtqueue_kick - update after add_buf * @vq: the struct virtqueue * - * After one or more virtqueue_add_buf_gfp calls, invoke this to kick + * After one or more virtqueue_add_buf calls, invoke this to kick * the other side. * * Caller must ensure we don't call this with other virtqueue @@ -341,7 +341,7 @@ static inline bool more_used(const struct vring_virtqueue *vq) * operations at the same time (except where noted). * * Returns NULL if there are no used buffers, or the "data" token - * handed to virtqueue_add_buf_gfp(). + * handed to virtqueue_add_buf(). */ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) { @@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); * virtqueue_detach_unused_buf - detach first unused buffer * @vq: the struct virtqueue we're talking about. * - * Returns NULL or the "data" token handed to virtqueue_add_buf_gfp(). + * Returns NULL or the "data" token handed to virtqueue_add_buf(). * This is not valid on an active queue; it is useful only for device * shutdown. 
*/ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 73ad7243128f..ec1706e7df50 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -25,21 +25,12 @@ struct virtqueue { void *priv; }; -int virtqueue_add_buf_gfp(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, - void *data, - gfp_t gfp); - -static inline int virtqueue_add_buf(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, - void *data) -{ - return virtqueue_add_buf_gfp(vq, sg, out_num, in_num, data, GFP_ATOMIC); -} +int virtqueue_add_buf(struct virtqueue *vq, + struct scatterlist sg[], + unsigned int out_num, + unsigned int in_num, + void *data, + gfp_t gfp); void virtqueue_kick(struct virtqueue *vq); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 330421e54713..3d432068f627 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -272,7 +272,8 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); + err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { chan->ring_bufs_avail = 0; @@ -414,7 +415,8 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in_pages, in_nr_pages, uidata, inlen); - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); + err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { chan->ring_bufs_avail = 0; diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 953db2abf6b9..b4fbc91c41b4 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -186,21 +186,12 @@ struct virtqueue { #endif /* Interfaces exported by virtio_ring. 
*/ -int virtqueue_add_buf_gfp(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, - void *data, - gfp_t gfp); - -static inline int virtqueue_add_buf(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, - void *data) -{ - return virtqueue_add_buf_gfp(vq, sg, out_num, in_num, data, GFP_ATOMIC); -} +int virtqueue_add_buf(struct virtqueue *vq, + struct scatterlist sg[], + unsigned int out_num, + unsigned int in_num, + void *data, + gfp_t gfp); void virtqueue_kick(struct virtqueue *vq); diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 0740284396c1..6bf95f995364 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -161,7 +161,8 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) if (started < bufs) { sg_init_one(&sl, dev->buf, dev->buf_size); r = virtqueue_add_buf(vq->vq, &sl, 1, 0, - dev->buf + started); + dev->buf + started, + GFP_ATOMIC); if (likely(r >= 0)) { ++started; virtqueue_kick(vq->vq); From 41f0377f73039ca6fe97a469d1941a89cd9757f1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:43 +1030 Subject: [PATCH 05/24] virtio: support unlocked queue kick Based on patch by Christoph for virtio_blk speedup: Split virtqueue_kick to be able to do the actual notification outside the lock protecting the virtqueue. This patch was originally done by Stefan Hajnoczi, but I can't find the original one anymore and had to recreate it from memory. Pointers to the original or corrections for the commit message are welcome. Stefan's patch was here: https://github.com/stefanha/linux/commit/a6d06644e3a58e57a774e77d7dc34c4a5a2e7496 http://www.spinics.net/lists/linux-virtualization/msg14616.html Third time's the charm! 
Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 60 ++++++++++++++++++++++++++++-------- include/linux/virtio.h | 4 +++ 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6ea92a6d1134..c56bbe799241 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -262,19 +262,22 @@ int virtqueue_add_buf(struct virtqueue *_vq, EXPORT_SYMBOL_GPL(virtqueue_add_buf); /** - * virtqueue_kick - update after add_buf + * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue * - * After one or more virtqueue_add_buf calls, invoke this to kick - * the other side. + * Instead of virtqueue_kick(), you can do: + * if (virtqueue_kick_prepare(vq)) + * virtqueue_notify(vq); * - * Caller must ensure we don't call this with other virtqueue - * operations at the same time (except where noted). + * This is sometimes useful because the virtqueue_kick_prepare() needs + * to be serialized, but the actual virtqueue_notify() call does not. */ -void virtqueue_kick(struct virtqueue *_vq) +bool virtqueue_kick_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old; + bool needs_kick; + START_USE(vq); /* Descriptors and available array need to be set before we expose the * new available array entries. */ @@ -287,13 +290,46 @@ void virtqueue_kick(struct virtqueue *_vq) /* Need to update avail index before checking if we should notify */ virtio_mb(vq); - if (vq->event ? - vring_need_event(vring_avail_event(&vq->vring), new, old) : - !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY)) - /* Prod other side to tell it about changes. 
*/ - vq->notify(&vq->vq); - + if (vq->event) { + needs_kick = vring_need_event(vring_avail_event(&vq->vring), + new, old); + } else { + needs_kick = !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY); + } END_USE(vq); + return needs_kick; +} +EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); + +/** + * virtqueue_notify - second half of split virtqueue_kick call. + * @vq: the struct virtqueue + * + * This does not need to be serialized. + */ +void virtqueue_notify(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + /* Prod other side to tell it about changes. */ + vq->notify(_vq); +} +EXPORT_SYMBOL_GPL(virtqueue_notify); + +/** + * virtqueue_kick - update after add_buf + * @vq: the struct virtqueue + * + * After one or more virtqueue_add_buf calls, invoke this to kick + * the other side. + * + * Caller must ensure we don't call this with other virtqueue + * operations at the same time (except where noted). + */ +void virtqueue_kick(struct virtqueue *vq) +{ + if (virtqueue_kick_prepare(vq)) + virtqueue_notify(vq); } EXPORT_SYMBOL_GPL(virtqueue_kick); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ec1706e7df50..31fe3a62874b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -34,6 +34,10 @@ int virtqueue_add_buf(struct virtqueue *vq, void virtqueue_kick(struct virtqueue *vq); +bool virtqueue_kick_prepare(struct virtqueue *vq); + +void virtqueue_notify(struct virtqueue *vq); + void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void virtqueue_disable_cb(struct virtqueue *vq); From 3b720b8c865098c49c1570b6b5c7832bcfa6e6c2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:43 +1030 Subject: [PATCH 06/24] virtio: avoid modulus operation. Since we know vq->vring.num is a power of 2, modulus is lazy (it's asserted in vring_new_virtqueue()). 
Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c56bbe799241..99dc9480f3fe 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -250,8 +250,8 @@ int virtqueue_add_buf(struct virtqueue *_vq, vq->data[head] = data; /* Put entry in available array (but don't update avail->idx until they - * do sync). FIXME: avoid modulus here? */ - avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num; + * do sync). */ + avail = ((vq->vring.avail->idx + vq->num_added++) & (vq->vring.num-1)); vq->vring.avail->ring[avail] = head; pr_debug("Added buffer head %i to %p\n", head, vq); @@ -384,6 +384,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) struct vring_virtqueue *vq = to_vvq(_vq); void *ret; unsigned int i; + u16 last_used; START_USE(vq); @@ -401,8 +402,9 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) /* Only get used array entries after they have been exposed by host. */ virtio_rmb(vq); - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; + last_used = (vq->last_used_idx & (vq->vring.num - 1)); + i = vq->vring.used->ring[last_used].id; + *len = vq->vring.used->ring[last_used].len; if (unlikely(i >= vq->vring.num)) { BAD_RING(vq, "id %u out of range\n", i); From ee7cd8981e15bcb365fc762afe3fc47b8242f630 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:43 +1030 Subject: [PATCH 07/24] virtio: expose added descriptors immediately. A virtio driver does virtqueue_add_buf() multiple times before finally calling virtqueue_kick(); previously we only exposed the added buffers in the virtqueue_kick() call. This means we don't need a memory barrier in virtqueue_add_buf(), but it reduces concurrency as the device (i.e.
host) can't see the buffers until the kick. In the unusual (but now possible) case where a driver does add_buf() and get_buf() without doing a kick, we do need to insert one before our counter wraps. Otherwise we could wrap num_added, and later on not realize that we have passed the marker where we should have kicked. Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 99dc9480f3fe..36bb6a613728 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -251,9 +251,20 @@ int virtqueue_add_buf(struct virtqueue *_vq, /* Put entry in available array (but don't update avail->idx until they * do sync). */ - avail = ((vq->vring.avail->idx + vq->num_added++) & (vq->vring.num-1)); + avail = (vq->vring.avail->idx & (vq->vring.num-1)); vq->vring.avail->ring[avail] = head; + /* Descriptors and available array need to be set before we expose the + * new available array entries. */ + virtio_wmb(vq); + vq->vring.avail->idx++; + vq->num_added++; + + /* This is very unlikely, but theoretically possible. Kick + * just in case. */ + if (unlikely(vq->num_added == (1 << 16) - 1)) + virtqueue_kick(_vq); + pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); @@ -283,13 +294,10 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) * new available array entries. 
*/ virtio_wmb(vq); - old = vq->vring.avail->idx; - new = vq->vring.avail->idx = old + vq->num_added; + old = vq->vring.avail->idx - vq->num_added; + new = vq->vring.avail->idx; vq->num_added = 0; - /* Need to update avail index before checking if we should notify */ - virtio_mb(vq); - if (vq->event) { needs_kick = vring_need_event(vring_avail_event(&vq->vring), new, old); From e93300b1afc7cd4fe1e741ceaf06714d060e88b8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 12 Jan 2012 15:44:43 +1030 Subject: [PATCH 08/24] virtio: add debugging if driver doesn't kick. Under the existing #ifdef DEBUG, check that they don't have more than 1/10 of a second between an add_buf() and a virtqueue_notify()/virtqueue_kick_prepare() call. We could get false positives on a really busy system, but good for development. Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 36bb6a613728..79e1b292c030 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -22,6 +22,7 @@ #include #include #include +#include /* virtio guest is communicating with a virtual "device" that actually runs on * a host processor. Memory barriers are used to control SMP effects. */ @@ -108,6 +109,10 @@ struct vring_virtqueue #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; + + /* Figure out if their kicks are too delayed. */ + bool last_add_time_valid; + ktime_t last_add_time; #endif /* Tokens for callbacks. */ @@ -198,6 +203,19 @@ int virtqueue_add_buf(struct virtqueue *_vq, BUG_ON(data == NULL); +#ifdef DEBUG + { + ktime_t now = ktime_get(); + + /* No kick or get, with .1 second between? Warn. 
*/ + if (vq->last_add_time_valid) + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time)) + > 100); + vq->last_add_time = now; + vq->last_add_time_valid = true; + } +#endif + /* If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold */ if (vq->indirect && (out + in) > 1 && vq->num_free) { @@ -298,6 +316,14 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) new = vq->vring.avail->idx; vq->num_added = 0; +#ifdef DEBUG + if (vq->last_add_time_valid) { + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), + vq->last_add_time)) > 100); + } + vq->last_add_time_valid = false; +#endif + if (vq->event) { needs_kick = vring_need_event(vring_avail_event(&vq->vring), new, old); @@ -435,6 +461,10 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) virtio_mb(vq); } +#ifdef DEBUG + vq->last_add_time_valid = false; +#endif + END_USE(vq); return ret; } @@ -620,6 +650,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; + vq->last_add_time_valid = false; #endif vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); From 4678d6f970c2f7c0cbfefc0cc666432d153b321b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 12 Jan 2012 15:44:44 +1030 Subject: [PATCH 09/24] virtio_blk: fix config handler race Fix a theoretical race related to config work handler: a config interrupt might happen after we flush config work but before we reset the device. It will then cause the config work to run during or after reset. Two problems with this: - if this runs after device is gone we will get use after free - access of config while reset is in progress is racy (as layout is changing). As a solution 1. flush after reset when we know there will be no more interrupts 2. add a flag to disable config access before reset Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a345e40e1bca..ba73661fb9f3 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,12 @@ struct virtio_blk /* Process context for config space updates */ struct work_struct config_work; + /* Lock for config space updates */ + struct mutex config_lock; + + /* enable config space updates */ + bool config_enable; + /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; @@ -318,6 +325,10 @@ static void virtblk_config_changed_work(struct work_struct *work) char cap_str_2[10], cap_str_10[10]; u64 capacity, size; + mutex_lock(&vblk->config_lock); + if (!vblk->config_enable) + goto done; + /* Host must always specify the capacity. */ vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), &capacity, sizeof(capacity)); @@ -340,6 +351,8 @@ static void virtblk_config_changed_work(struct work_struct *work) cap_str_10, cap_str_2); set_capacity(vblk->disk, capacity); +done: + mutex_unlock(&vblk->config_lock); } static void virtblk_config_changed(struct virtio_device *vdev) @@ -388,7 +401,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) vblk->vdev = vdev; vblk->sg_elems = sg_elems; sg_init_table(vblk->sg, vblk->sg_elems); + mutex_init(&vblk->config_lock); INIT_WORK(&vblk->config_work, virtblk_config_changed_work); + vblk->config_enable = true; /* We expect one virtqueue, for output. */ vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); @@ -542,7 +557,10 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) struct virtio_blk *vblk = vdev->priv; int index = vblk->index; - flush_work(&vblk->config_work); + /* Prevent config work handler from accessing the device. 
*/ + mutex_lock(&vblk->config_lock); + vblk->config_enable = false; + mutex_unlock(&vblk->config_lock); /* Nothing should be pending. */ BUG_ON(!list_empty(&vblk->reqs)); @@ -550,6 +568,8 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) /* Stop all the virtqueues. */ vdev->config->reset(vdev); + flush_work(&vblk->config_work); + del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); From d077536386595309060dda57e7b7474c501a589b Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:25 +0530 Subject: [PATCH 10/24] virtio: pci: switch to new PM API The older PM API doesn't have a way to get notifications on hibernate events. Switch to the newer one that gives us those notifications. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 688b42d28dad..7f9ac1af7cfd 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -716,19 +716,28 @@ static void __devexit virtio_pci_remove(struct pci_dev *pci_dev) } #ifdef CONFIG_PM -static int virtio_pci_suspend(struct pci_dev *pci_dev, pm_message_t state) +static int virtio_pci_suspend(struct device *dev) { + struct pci_dev *pci_dev = to_pci_dev(dev); + pci_save_state(pci_dev); pci_set_power_state(pci_dev, PCI_D3hot); return 0; } -static int virtio_pci_resume(struct pci_dev *pci_dev) +static int virtio_pci_resume(struct device *dev) { + struct pci_dev *pci_dev = to_pci_dev(dev); + pci_restore_state(pci_dev); pci_set_power_state(pci_dev, PCI_D0); return 0; } + +static const struct dev_pm_ops virtio_pci_pm_ops = { + .suspend = virtio_pci_suspend, + .resume = virtio_pci_resume, +}; #endif static struct pci_driver virtio_pci_driver = { @@ -737,8 +746,7 @@ static struct pci_driver virtio_pci_driver = { .probe = virtio_pci_probe, .remove = __devexit_p(virtio_pci_remove), 
#ifdef CONFIG_PM - .suspend = virtio_pci_suspend, - .resume = virtio_pci_resume, + .driver.pm = &virtio_pci_pm_ops, #endif }; From f0fe6f11503fa9880867554350ac5d3092c47251 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:26 +0530 Subject: [PATCH 11/24] virtio: pci: add PM notification handlers for restore, freeze, thaw, poweroff Handle thaw, restore and freeze notifications from the PM core. Expose these to individual virtio drivers that can quiesce and resume vq operations. For drivers not implementing the thaw() method, use the restore method instead. These functions also save device-specific data so that the device can be put in pre-suspend state after resume, and disable and enable the PCI device in the freeze and resume functions, respectively. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 94 ++++++++++++++++++++++++++++++++++++- include/linux/virtio.h | 5 ++ 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 7f9ac1af7cfd..635e1efb3792 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -55,6 +55,10 @@ struct virtio_pci_device unsigned msix_vectors; /* Vectors allocated, excluding per-vq vectors if any */ unsigned msix_used_vectors; + + /* Status saved during hibernate/restore */ + u8 saved_status; + /* Whether we have vector per vq */ bool per_vq_vectors; }; @@ -734,9 +738,95 @@ static int virtio_pci_resume(struct device *dev) return 0; } +static int virtio_pci_freeze(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_driver *drv; + int ret; + + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + ret = 0; + vp_dev->saved_status = vp_get_status(&vp_dev->vdev); + if (drv && drv->freeze) + ret = drv->freeze(&vp_dev->vdev); + + if (!ret) + pci_disable_device(pci_dev); + return 
ret; +} + +static int restore_common(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + int ret; + + ret = pci_enable_device(pci_dev); + if (ret) + return ret; + pci_set_master(pci_dev); + vp_finalize_features(&vp_dev->vdev); + + return ret; +} + +static int virtio_pci_thaw(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_driver *drv; + int ret; + + ret = restore_common(dev); + if (ret) + return ret; + + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + if (drv && drv->thaw) + ret = drv->thaw(&vp_dev->vdev); + else if (drv && drv->restore) + ret = drv->restore(&vp_dev->vdev); + + /* Finally, tell the device we're all set */ + if (!ret) + vp_set_status(&vp_dev->vdev, vp_dev->saved_status); + + return ret; +} + +static int virtio_pci_restore(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_driver *drv; + int ret; + + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + ret = restore_common(dev); + if (!ret && drv && drv->restore) + ret = drv->restore(&vp_dev->vdev); + + /* Finally, tell the device we're all set */ + if (!ret) + vp_set_status(&vp_dev->vdev, vp_dev->saved_status); + + return ret; +} + static const struct dev_pm_ops virtio_pci_pm_ops = { - .suspend = virtio_pci_suspend, - .resume = virtio_pci_resume, + .suspend = virtio_pci_suspend, + .resume = virtio_pci_resume, + .freeze = virtio_pci_freeze, + .thaw = virtio_pci_thaw, + .restore = virtio_pci_restore, + .poweroff = virtio_pci_suspend, }; #endif diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 31fe3a62874b..d0018d27c281 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -94,6 +94,11 @@ struct virtio_driver { int (*probe)(struct 
virtio_device *dev); void (*remove)(struct virtio_device *dev); void (*config_changed)(struct virtio_device *dev); +#ifdef CONFIG_PM + int (*freeze)(struct virtio_device *dev); + int (*thaw)(struct virtio_device *dev); + int (*restore)(struct virtio_device *dev); +#endif }; int register_virtio_driver(struct virtio_driver *drv); From a0e2dbfc223028b72a1c193f94fcd3f67253ba4a Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:27 +0530 Subject: [PATCH 12/24] virtio: console: Move vq and vq buf removal into separate functions This common code will be shared with the PM freeze function. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 68 ++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index d1ae1492ee78..9681ffd79904 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1271,6 +1271,20 @@ static void remove_port(struct kref *kref) kfree(port); } +static void remove_port_data(struct port *port) +{ + struct port_buffer *buf; + + /* Remove unused data this port might have received. */ + discard_port_data(port); + + reclaim_consumed_buffers(port); + + /* Remove buffers we queued up for the Host to send us data in. */ + while ((buf = virtqueue_detach_unused_buf(port->in_vq))) + free_buf(buf); +} + /* * Port got unplugged. Remove port from portdev's list and drop the * kref reference. If no userspace has this port opened, it will @@ -1278,8 +1292,6 @@ static void remove_port(struct kref *kref) */ static void unplug_port(struct port *port) { - struct port_buffer *buf; - spin_lock_irq(&port->portdev->ports_lock); list_del(&port->list); spin_unlock_irq(&port->portdev->ports_lock); @@ -1300,14 +1312,7 @@ static void unplug_port(struct port *port) hvc_remove(port->cons.hvc); } - /* Remove unused data this port might have received. 
*/ - discard_port_data(port); - - reclaim_consumed_buffers(port); - - /* Remove buffers we queued up for the Host to send us data in. */ - while ((buf = virtqueue_detach_unused_buf(port->in_vq))) - free_buf(buf); + remove_port_data(port); /* * We should just assume the device itself has gone off -- @@ -1659,6 +1664,28 @@ static const struct file_operations portdev_fops = { .owner = THIS_MODULE, }; +static void remove_vqs(struct ports_device *portdev) +{ + portdev->vdev->config->del_vqs(portdev->vdev); + kfree(portdev->in_vqs); + kfree(portdev->out_vqs); +} + +static void remove_controlq_data(struct ports_device *portdev) +{ + struct port_buffer *buf; + unsigned int len; + + if (!use_multiport(portdev)) + return; + + while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) + free_buf(buf); + + while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) + free_buf(buf); +} + /* * Once we're further in boot, we get probed like any other virtio * device. @@ -1764,9 +1791,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev) /* The host might want to notify mgmt sw about device add failure */ __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID, VIRTIO_CONSOLE_DEVICE_READY, 0); - vdev->config->del_vqs(vdev); - kfree(portdev->in_vqs); - kfree(portdev->out_vqs); + remove_vqs(portdev); free_chrdev: unregister_chrdev(portdev->chr_major, "virtio-portsdev"); free: @@ -1804,21 +1829,8 @@ static void virtcons_remove(struct virtio_device *vdev) * have to just stop using the port, as the vqs are going * away. 
*/ - if (use_multiport(portdev)) { - struct port_buffer *buf; - unsigned int len; - - while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) - free_buf(buf); - - while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) - free_buf(buf); - } - - vdev->config->del_vqs(vdev); - kfree(portdev->in_vqs); - kfree(portdev->out_vqs); - + remove_controlq_data(portdev); + remove_vqs(portdev); kfree(portdev); } From 2b8f41d846990c3c1c8addbaed2cf53c3ef91d25 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:28 +0530 Subject: [PATCH 13/24] virtio: console: Add freeze and restore handlers to support S4 Remove all vqs and associated buffers in the freeze callback which prepares us to go into hibernation state. On restore, re-create all the vqs and populate the input vqs with buffers to get to the pre-hibernate state. Note: Any outstanding unconsumed buffers are discarded; which means there's a possibility of data loss in case the host or the guest didn't consume any data already present in the vqs. This can be addressed in a later patch series, perhaps in virtio common code. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 58 +++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 9681ffd79904..614b84d38d7a 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1844,6 +1844,60 @@ static unsigned int features[] = { VIRTIO_CONSOLE_F_MULTIPORT, }; +#ifdef CONFIG_PM +static int virtcons_freeze(struct virtio_device *vdev) +{ + struct ports_device *portdev; + struct port *port; + + portdev = vdev->priv; + + vdev->config->reset(vdev); + + cancel_work_sync(&portdev->control_work); + remove_controlq_data(portdev); + + list_for_each_entry(port, &portdev->ports, list) { + /* + * We'll ask the host later if the new invocation has + * the port opened or closed. 
+ */ + port->host_connected = false; + remove_port_data(port); + } + remove_vqs(portdev); + + return 0; +} + +static int virtcons_restore(struct virtio_device *vdev) +{ + struct ports_device *portdev; + struct port *port; + int ret; + + portdev = vdev->priv; + + ret = init_vqs(portdev); + if (ret) + return ret; + + if (use_multiport(portdev)) + fill_queue(portdev->c_ivq, &portdev->cvq_lock); + + list_for_each_entry(port, &portdev->ports, list) { + port->in_vq = portdev->in_vqs[port->id]; + port->out_vq = portdev->out_vqs[port->id]; + + fill_queue(port->in_vq, &port->inbuf_lock); + + /* Get port open/close status on the host */ + send_control_msg(port, VIRTIO_CONSOLE_PORT_READY, 1); + } + return 0; +} +#endif + static struct virtio_driver virtio_console = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), @@ -1853,6 +1907,10 @@ static struct virtio_driver virtio_console = { .probe = virtcons_probe, .remove = virtcons_remove, .config_changed = config_intr, +#ifdef CONFIG_PM + .freeze = virtcons_freeze, + .restore = virtcons_restore, +#endif }; static int __init init(void) From c743d09dbd018dd6345519fb3264b16bdaa41746 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 6 Jan 2012 16:19:08 +0530 Subject: [PATCH 14/24] virtio: console: Disable callbacks for virtqueues at start of S4 freeze To ensure we don't receive any more interrupts from the host after we enter the freeze function, disable all vq interrupts. There wasn't any problem seen due to this in tests, but applying this patch makes the freeze case more robust. 
Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 614b84d38d7a..b58b56187065 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1854,10 +1854,18 @@ static int virtcons_freeze(struct virtio_device *vdev) vdev->config->reset(vdev); + virtqueue_disable_cb(portdev->c_ivq); cancel_work_sync(&portdev->control_work); + /* + * Once more: if control_work_handler() was running, it would + * enable the cb as the last step. + */ + virtqueue_disable_cb(portdev->c_ivq); remove_controlq_data(portdev); list_for_each_entry(port, &portdev->ports, list) { + virtqueue_disable_cb(port->in_vq); + virtqueue_disable_cb(port->out_vq); /* * We'll ask the host later if the new invocation has * the port opened or closed. From 6abd6e5a4404056e28be04958a57d0286883161a Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:29 +0530 Subject: [PATCH 15/24] virtio: blk: Move vq initialization to separate function The probe and PM restore functions will share this code. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index ba73661fb9f3..e8af52355bf5 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -362,6 +362,18 @@ static void virtblk_config_changed(struct virtio_device *vdev) queue_work(virtblk_wq, &vblk->config_work); } +static int init_vq(struct virtio_blk *vblk) +{ + int err = 0; + + /* We expect one virtqueue, for output. 
*/ + vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests"); + if (IS_ERR(vblk->vq)) + err = PTR_ERR(vblk->vq); + + return err; +} + static int __devinit virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -405,12 +417,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) INIT_WORK(&vblk->config_work, virtblk_config_changed_work); vblk->config_enable = true; - /* We expect one virtqueue, for output. */ - vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); - if (IS_ERR(vblk->vq)) { - err = PTR_ERR(vblk->vq); + err = init_vq(vblk); + if (err) goto out_free_vblk; - } vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); if (!vblk->pool) { From f8fb5bc23a50a5398aa31a4e8c6dbbef53d2dec6 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:30 +0530 Subject: [PATCH 16/24] virtio: blk: Add freeze, restore handlers to support S4 Delete the vq and flush any pending requests from the block queue on the freeze callback to prepare for hibernation. Re-create the vq in the restore callback to resume normal function. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e8af52355bf5..ffd5ca919295 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -588,6 +588,46 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) ida_simple_remove(&vd_index_ida, index); } +#ifdef CONFIG_PM +static int virtblk_freeze(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + + /* Ensure we don't receive any more interrupts */ + vdev->config->reset(vdev); + + /* Prevent config work handler from accessing the device. 
*/ + mutex_lock(&vblk->config_lock); + vblk->config_enable = false; + mutex_unlock(&vblk->config_lock); + + flush_work(&vblk->config_work); + + spin_lock_irq(vblk->disk->queue->queue_lock); + blk_stop_queue(vblk->disk->queue); + spin_unlock_irq(vblk->disk->queue->queue_lock); + blk_sync_queue(vblk->disk->queue); + + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtblk_restore(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + int ret; + + vblk->config_enable = true; + ret = init_vq(vdev->priv); + if (!ret) { + spin_lock_irq(vblk->disk->queue->queue_lock); + blk_start_queue(vblk->disk->queue); + spin_unlock_irq(vblk->disk->queue->queue_lock); + } + return ret; +} +#endif + static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, { 0 }, @@ -613,6 +653,10 @@ static struct virtio_driver __refdata virtio_blk = { .probe = virtblk_probe, .remove = __devexit_p(virtblk_remove), .config_changed = virtblk_config_changed, +#ifdef CONFIG_PM + .freeze = virtblk_freeze, + .restore = virtblk_restore, +#endif }; static int __init init(void) From 3f9c10b0d478a3b7c3dde555edae79fecef496e5 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:31 +0530 Subject: [PATCH 17/24] virtio: net: Move vq initialization into separate function The probe and PM restore functions will share this code. 
Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 47 ++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6345a52194f9..70a9c4b1252d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -985,15 +985,38 @@ static void virtnet_config_changed(struct virtio_device *vdev) virtnet_update_status(vi); } +static int init_vqs(struct virtnet_info *vi) +{ + struct virtqueue *vqs[3]; + vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL}; + const char *names[] = { "input", "output", "control" }; + int nvqs, err; + + /* We expect two virtqueues, receive then send, + * and optionally control. */ + nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2; + + err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names); + if (err) + return err; + + vi->rvq = vqs[0]; + vi->svq = vqs[1]; + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { + vi->cvq = vqs[2]; + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) + vi->dev->features |= NETIF_F_HW_VLAN_FILTER; + } + return 0; +} + static int virtnet_probe(struct virtio_device *vdev) { int err; struct net_device *dev; struct virtnet_info *vi; - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL}; - const char *names[] = { "input", "output", "control" }; - int nvqs; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev(sizeof(struct virtnet_info)); @@ -1065,24 +1088,10 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; - /* We expect two virtqueues, receive then send, - * and optionally control. */ - nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 
3 : 2; - - err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names); + err = init_vqs(vi); if (err) goto free_stats; - vi->rvq = vqs[0]; - vi->svq = vqs[1]; - - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) { - vi->cvq = vqs[2]; - - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) - dev->features |= NETIF_F_HW_VLAN_FILTER; - } - err = register_netdev(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); From 04486ed019d249249c00546704af12498a432c15 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:32 +0530 Subject: [PATCH 18/24] virtio: net: Move vq and vq buf removal into separate function The remove and PM freeze functions will share this code. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 70a9c4b1252d..59109c1b6b87 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1153,22 +1153,26 @@ static void free_unused_bufs(struct virtnet_info *vi) BUG_ON(vi->num != 0); } -static void __devexit virtnet_remove(struct virtio_device *vdev) +static void remove_vq_common(struct virtnet_info *vi) { - struct virtnet_info *vi = vdev->priv; - - /* Stop all the virtqueues. */ - vdev->config->reset(vdev); - - unregister_netdev(vi->dev); + vi->vdev->config->reset(vi->vdev); /* Free unused buffers in both send and recv, if any. 
*/ free_unused_bufs(vi); - vdev->config->del_vqs(vi->vdev); + vi->vdev->config->del_vqs(vi->vdev); while (vi->pages) __free_pages(get_a_page(vi, GFP_KERNEL), 0); +} + +static void __devexit virtnet_remove(struct virtio_device *vdev) +{ + struct virtnet_info *vi = vdev->priv; + + unregister_netdev(vi->dev); + + remove_vq_common(vi); free_percpu(vi->stats); free_netdev(vi->dev); From 0741bcb5584f9e2390ae6261573c4de8314999f2 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:33 +0530 Subject: [PATCH 19/24] virtio: net: Add freeze, restore handlers to support S4 Remove all the vqs, disable napi and detach from the netdev on hibernation. Re-create vqs after restoring from a hibernated image, re-enable napi and re-attach the netdev. This keeps networking working across hibernation. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 59109c1b6b87..4880aa8b4c28 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1178,6 +1178,48 @@ static void __devexit virtnet_remove(struct virtio_device *vdev) free_netdev(vi->dev); } +#ifdef CONFIG_PM +static int virtnet_freeze(struct virtio_device *vdev) +{ + struct virtnet_info *vi = vdev->priv; + + virtqueue_disable_cb(vi->rvq); + virtqueue_disable_cb(vi->svq); + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) + virtqueue_disable_cb(vi->cvq); + + netif_device_detach(vi->dev); + cancel_delayed_work_sync(&vi->refill); + + if (netif_running(vi->dev)) + napi_disable(&vi->napi); + + remove_vq_common(vi); + + return 0; +} + +static int virtnet_restore(struct virtio_device *vdev) +{ + struct virtnet_info *vi = vdev->priv; + int err; + + err = init_vqs(vi); + if (err) + return err; + + if (netif_running(vi->dev)) + virtnet_napi_enable(vi); + + netif_device_attach(vi->dev); + + if (!try_fill_recv(vi, GFP_KERNEL)) + 
queue_delayed_work(system_nrt_wq, &vi->refill, 0); + + return 0; +} +#endif + static struct virtio_device_id id_table[] = { { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, { 0 }, @@ -1202,6 +1244,10 @@ static struct virtio_driver virtio_net_driver = { .probe = virtnet_probe, .remove = __devexit_p(virtnet_remove), .config_changed = virtnet_config_changed, +#ifdef CONFIG_PM + .freeze = virtnet_freeze, + .restore = virtnet_restore, +#endif }; static int __init init(void) From be91c33dd15eff6b0dffc60cee4c8042e75493d2 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:34 +0530 Subject: [PATCH 20/24] virtio: balloon: Move vq initialization into separate function The probe and PM restore functions will share this code. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 48 ++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 0a6425aadf95..3368cb6ef193 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -275,32 +275,21 @@ static int balloon(void *_vballoon) return 0; } -static int virtballoon_probe(struct virtio_device *vdev) +static int init_vqs(struct virtio_balloon *vb) { - struct virtio_balloon *vb; struct virtqueue *vqs[3]; vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; const char *names[] = { "inflate", "deflate", "stats" }; int err, nvqs; - vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); - if (!vb) { - err = -ENOMEM; - goto out; - } - - INIT_LIST_HEAD(&vb->pages); - vb->num_pages = 0; - init_waitqueue_head(&vb->config_change); - vb->vdev = vdev; - vb->need_stats_update = 0; - - /* We expect two virtqueues: inflate and deflate, - * and optionally stat. */ + /* + * We expect two virtqueues: inflate and deflate, and + * optionally stat. + */ nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 
3 : 2; - err = vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names); + err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names); if (err) - goto out_free_vb; + return err; vb->inflate_vq = vqs[0]; vb->deflate_vq = vqs[1]; @@ -318,6 +307,29 @@ static int virtballoon_probe(struct virtio_device *vdev) BUG(); virtqueue_kick(vb->stats_vq); } + return 0; +} + +static int virtballoon_probe(struct virtio_device *vdev) +{ + struct virtio_balloon *vb; + int err; + + vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); + if (!vb) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&vb->pages); + vb->num_pages = 0; + init_waitqueue_head(&vb->config_change); + vb->vdev = vdev; + vb->need_stats_update = 0; + + err = init_vqs(vb); + if (err) + goto out_free_vb; vb->thread = kthread_run(balloon, vb, "vballoon"); if (IS_ERR(vb->thread)) { From e562966dbaf49e7804097cd991e5d3a8934fc148 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 22 Dec 2011 16:58:35 +0530 Subject: [PATCH 21/24] virtio: balloon: Add freeze, restore handlers to support S4 Handling balloon hibernate / restore is tricky. If the balloon was inflated before going into the hibernation state, upon resume, the host will not have any memory of that. Any pages that were passed on to the host earlier would most likely be invalid, and the host will have to re-balloon to the previous value to get in the pre-hibernate state. So the only sane thing for the guest to do here is to discard all the pages that were put in the balloon. When to discard the pages is the next question. One solution is to deflate the balloon just before writing the image to the disk (in the freeze() PM callback). However, asking for pages from the host just to discard them immediately after seems wasteful of resources. Hence, it makes sense to do this by just fudging our counters soon after wakeup. This means we don't deflate the balloon before sleep, and also don't put unnecessary pressure on the host. 
This also helps in the thaw case: if the freeze fails for whatever reason, the balloon should continue to remain in the inflated state. This was tested by issuing 'swapoff -a' and trying to go into the S4 state. That fails, and the balloon stays inflated, as expected. Both the host and the guest are happy. Finally, in the restore() callback, we empty the list of pages that were previously given off to the host, add the appropriate number of pages to the totalram_pages counter, reset the num_pages counter to 0, and all is fine. As a last step, delete the vqs on the freeze callback to prepare for hibernation, and re-create them in the restore and thaw callbacks to resume normal operation. The kthread doesn't race with any operations here, since it's frozen before the freeze() call and is thawed after the thaw() and restore() callbacks, so we're safe with that. Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 3368cb6ef193..95aeedf198f8 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -364,6 +364,48 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev) kfree(vb); } +#ifdef CONFIG_PM +static int virtballoon_freeze(struct virtio_device *vdev) +{ + /* + * The kthread is already frozen by the PM core before this + * function is called. 
+ */ + + /* Ensure we don't get any more requests from the host */ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtballoon_thaw(struct virtio_device *vdev) +{ + return init_vqs(vdev->priv); +} + +static int virtballoon_restore(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + struct page *page, *page2; + + /* We're starting from a clean slate */ + vb->num_pages = 0; + + /* + * If a request wasn't complete at the time of freezing, this + * could have been set. + */ + vb->need_stats_update = 0; + + /* We don't have these pages in the balloon anymore! */ + list_for_each_entry_safe(page, page2, &vb->pages, lru) { + list_del(&page->lru); + totalram_pages++; + } + return init_vqs(vdev->priv); +} +#endif + static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, @@ -378,6 +420,11 @@ static struct virtio_driver virtio_balloon_driver = { .probe = virtballoon_probe, .remove = __devexit_p(virtballoon_remove), .config_changed = virtballoon_changed, +#ifdef CONFIG_PM + .freeze = virtballoon_freeze, + .restore = virtballoon_restore, + .thaw = virtballoon_thaw, +#endif }; static int __init init(void) From 39082f7e5912cdc70f9ab0767e7342711f34b9f8 Mon Sep 17 00:00:00 2001 From: Jacek Galowicz Date: Thu, 12 Jan 2012 15:44:47 +1030 Subject: [PATCH 22/24] lguest: switch segment-voodoo-numbers to readable symbols When studying lguest's x86 segment descriptor code, it is no longer necessary to have the Intel x86 architecture manual open on the page with the segment descriptor illustration to understand the crazy numbers assigned to both descriptor structure halves a/b. Now the struct desc_struct's fields, as suggested by Glauber de Oliveira Costa in 2008, are used. 
Signed-off-by: Jacek Galowicz Signed-off-by: Rusty Russell --- drivers/lguest/segments.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index ede46581351a..c4fb424dfddb 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -81,8 +81,8 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) * sometimes careless and leaves this as 0, even though it's * running at privilege level 1. If so, we fix it here. */ - if ((cpu->arch.gdt[i].b & 0x00006000) == 0) - cpu->arch.gdt[i].b |= (GUEST_PL << 13); + if (cpu->arch.gdt[i].dpl == 0) + cpu->arch.gdt[i].dpl |= GUEST_PL; /* * Each descriptor has an "accessed" bit. If we don't set it @@ -90,7 +90,7 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) * that entry into a segment register. But the GDT isn't * writable by the Guest, so bad things can happen. */ - cpu->arch.gdt[i].b |= 0x00000100; + cpu->arch.gdt[i].type |= 0x1; } } @@ -114,13 +114,19 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) /* * The TSS segment refers to the TSS entry for this particular CPU. - * Forgive the magic flags: the 0x8900 means the entry is Present, it's - * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. 
*/ - gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); - gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) - | ((tss >> 16) & 0x000000FF); + gdt[GDT_ENTRY_TSS].a = 0; + gdt[GDT_ENTRY_TSS].b = 0; + + gdt[GDT_ENTRY_TSS].limit0 = 0x67; + gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF; + gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF; + gdt[GDT_ENTRY_TSS].base2 = tss >> 24; + gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */ + gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */ + gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */ + gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */ + } /* @@ -135,8 +141,8 @@ void setup_guest_gdt(struct lg_cpu *cpu) */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); + cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL; + cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL; } /*H:650 From 07fe9977b6234ede1bd29e10e0323e478860c871 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 12 Jan 2012 15:44:47 +1030 Subject: [PATCH 23/24] lguest: move the lguest tool to the tools directory This is a better location instead of having it in Documentation. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Rusty Russell (fixed compile) --- drivers/lguest/Makefile | 2 +- {Documentation/virtual => tools}/lguest/.gitignore | 0 {Documentation/virtual => tools}/lguest/Makefile | 0 {Documentation/virtual => tools}/lguest/extract | 0 {Documentation/virtual => tools}/lguest/lguest.c | 2 +- {Documentation/virtual => tools}/lguest/lguest.txt | 0 6 files changed, 2 insertions(+), 2 deletions(-) rename {Documentation/virtual => tools}/lguest/.gitignore (100%) rename {Documentation/virtual => tools}/lguest/Makefile (100%) rename {Documentation/virtual => tools}/lguest/extract (100%) rename {Documentation/virtual => tools}/lguest/lguest.c (99%) rename {Documentation/virtual => tools}/lguest/lguest.txt (100%) diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index 8ac947c7e7c7..c4197503900e 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -18,7 +18,7 @@ Mastery: PREFIX=M Beer: @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" Preparation Preparation! 
Guest Drivers Launcher Host Switcher Mastery: - @sh ../../Documentation/virtual/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` + @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` Puppy: @clear @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" diff --git a/Documentation/virtual/lguest/.gitignore b/tools/lguest/.gitignore similarity index 100% rename from Documentation/virtual/lguest/.gitignore rename to tools/lguest/.gitignore diff --git a/Documentation/virtual/lguest/Makefile b/tools/lguest/Makefile similarity index 100% rename from Documentation/virtual/lguest/Makefile rename to tools/lguest/Makefile diff --git a/Documentation/virtual/lguest/extract b/tools/lguest/extract similarity index 100% rename from Documentation/virtual/lguest/extract rename to tools/lguest/extract diff --git a/Documentation/virtual/lguest/lguest.c b/tools/lguest/lguest.c similarity index 99% rename from Documentation/virtual/lguest/lguest.c rename to tools/lguest/lguest.c index c095d79cae73..f759f4f097c7 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -49,7 +49,7 @@ #include #include #include -#include "../../../include/linux/lguest_launcher.h" +#include "../../include/linux/lguest_launcher.h" /*L:110 * We can ignore the 43 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. 
diff --git a/Documentation/virtual/lguest/lguest.txt b/tools/lguest/lguest.txt similarity index 100% rename from Documentation/virtual/lguest/lguest.txt rename to tools/lguest/lguest.txt From b6c96c0214138186f495e3ee73737c6fc5e4efa2 Mon Sep 17 00:00:00 2001 From: Stratos Psomadakis Date: Thu, 12 Jan 2012 15:44:47 +1030 Subject: [PATCH 24/24] lguest: Make sure interrupt is allocated ok by lguest_setup_irq Make sure the interrupt is allocated correctly by lguest_setup_irq (check the return value of irq_alloc_desc_at for -ENOMEM) Signed-off-by: Stratos Psomadakis Signed-off-by: Rusty Russell (cleanups and commentary) --- arch/x86/lguest/boot.c | 21 +++++++++++++-------- drivers/lguest/lguest_device.c | 10 +++++++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index cf4603ba866f..642d8805bc1b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void) } /* - * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so - * rather than set them in lguest_init_IRQ we are called here every time an - * lguest device needs an interrupt. - * - * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should - * pass that up! + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. */ -void lguest_setup_irq(unsigned int irq) +int lguest_setup_irq(unsigned int irq) { - irq_alloc_desc_at(irq, 0); + int err; + + /* Returns -ve error or vector number. 
*/ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); + return 0; } /* diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 6a1d6447b864..9e8388efd88e 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -241,7 +241,7 @@ static void lg_notify(struct virtqueue *vq) } /* An extern declaration inside a C file is bad form. Don't do it. */ -extern void lguest_setup_irq(unsigned int irq); +extern int lguest_setup_irq(unsigned int irq); /* * This routine finds the Nth virtqueue described in the configuration of @@ -304,7 +304,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, } /* Make sure the interrupt is allocated. */ - lguest_setup_irq(lvq->config.irq); + err = lguest_setup_irq(lvq->config.irq); + if (err) + goto destroy_vring; /* * Tell the interrupt for this virtqueue to go to the virtio_ring @@ -317,7 +319,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) - goto destroy_vring; + goto free_desc; /* * Last of all we hook up our 'struct lguest_vq_info" to the @@ -326,6 +328,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, vq->priv = lvq; return vq; +free_desc: + irq_free_desc(lvq->config.irq); destroy_vring: vring_del_virtqueue(vq); unmap: