From f68d24082e22ccee3077d11aeb6dc5354f0ca7f1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Sep 2009 22:26:29 -0600 Subject: [PATCH 01/10] virtio_pci: minor MSI-X cleanups 1) Rename vp_request_vectors to vp_request_msix_vectors, and take non-MSI-X case out to caller. 2) Comment weird pci_enable_msix API 3) Rename vp_find_vq to setup_vq. 4) Fix spaces to tabs 5) Make nvectors calc internal to vp_try_to_find_vqs() 6) Rename vector to msix_vector for more clarity. Signed-off-by: Rusty Russell Cc: "Michael S. Tsirkin" --- drivers/virtio/virtio_pci.c | 125 ++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 55 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 248e00ec4dc1..4a1f1ebff7bf 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -84,7 +84,7 @@ struct virtio_pci_vq_info struct list_head node; /* MSI-X vector (or none) */ - unsigned vector; + unsigned msix_vector; }; /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ @@ -280,25 +280,14 @@ static void vp_free_vectors(struct virtio_device *vdev) vp_dev->msix_entries = NULL; } -static int vp_request_vectors(struct virtio_device *vdev, int nvectors, - bool per_vq_vectors) +static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned i, v; int err = -ENOMEM; - if (!nvectors) { - /* Can't allocate MSI-X vectors, use regular interrupt */ - vp_dev->msix_vectors = 0; - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, - IRQF_SHARED, name, vp_dev); - if (err) - return err; - vp_dev->intx_enabled = 1; - return 0; - } - vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); if (!vp_dev->msix_entries) @@ -311,6 +300,7 @@ static int vp_request_vectors(struct virtio_device *vdev, int nvectors, for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; + /* pci_enable_msix returns positive if we can't get this many. */ err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); if (err > 0) err = -ENOSPC; @@ -356,10 +346,22 @@ static int vp_request_vectors(struct virtio_device *vdev, int nvectors, return err; } -static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name, - u16 vector) +static int vp_request_intx(struct virtio_device *vdev) +{ + int err; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, dev_name(&vdev->dev), vp_dev); + if (!err) + vp_dev->intx_enabled = 1; + return err; +} + +static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, + u16 msix_vec) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; @@ -384,7 +386,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; - info->vector = vector; + info->msix_vector = msix_vec; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -408,10 +410,10 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; - if (vector != VIRTIO_MSI_NO_VECTOR) { - iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); - vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); - if (vector == VIRTIO_MSI_NO_VECTOR) { + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto out_assign; } @@ -472,7 +474,8 @@ static void vp_del_vqs(struct virtio_device *vdev) list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vq->priv; if (vp_dev->per_vq_vectors) - free_irq(vp_dev->msix_entries[info->vector].vector, vq); + free_irq(vp_dev->msix_entries[info->msix_vector].vector, + vq); vp_del_vq(vq); } vp_dev->per_vq_vectors = false; @@ -484,38 +487,58 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char *names[], - int nvectors, + bool use_msix, bool per_vq_vectors) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - u16 vector; - int i, err, allocated_vectors; + u16 msix_vec; + int i, err, nvectors, allocated_vectors; - err = vp_request_vectors(vdev, nvectors, per_vq_vectors); - if (err) - goto error_request; + if (!use_msix) { + /* Old style: one normal interrupt for change and all vqs. */ + err = vp_request_intx(vdev); + if (err) + goto error_request; + } else { + if (per_vq_vectors) { + /* Best option: one for change interrupt, one per vq. */ + nvectors = 1; + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++nvectors; + } else { + /* Second best: one for change, shared for all vqs. */ + nvectors = 2; + } + + err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors); + if (err) + goto error_request; + } vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { if (!callbacks[i] || !vp_dev->msix_enabled) - vector = VIRTIO_MSI_NO_VECTOR; + msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vp_dev->per_vq_vectors) - vector = allocated_vectors++; + msix_vec = allocated_vectors++; else - vector = VP_MSIX_VQ_VECTOR; - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector); + msix_vec = VP_MSIX_VQ_VECTOR; + vqs[i] = setup_vq(vdev, i, callbacks[i], names[i], msix_vec); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } /* allocate per-vq irq if available and necessary */ - if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) { - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, - "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); - err = request_irq(vp_dev->msix_entries[vector].vector, - vring_interrupt, 0, - vp_dev->msix_names[vector], vqs[i]); + if (vp_dev->per_vq_vectors) { + snprintf(vp_dev->msix_names[msix_vec], + sizeof *vp_dev->msix_names, + "%s-%s", + dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(msix_vec, vring_interrupt, 0, + vp_dev->msix_names[msix_vec], + vqs[i]); if (err) { vp_del_vq(vqs[i]); goto error_find; @@ -537,28 +560,20 @@ static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, vq_callback_t *callbacks[], const char *names[]) { - int vectors = 0; - int i, uninitialized_var(err); + int err; - /* How many vectors would we like? */ - for (i = 0; i < nvqs; ++i) - if (callbacks[i]) - ++vectors; - - /* We want at most one vector per queue and one for config changes. */ - err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, - vectors + 1, true); + /* Try MSI-X with one vector per queue. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true); if (!err) return 0; - /* Fallback to separate vectors for config and a shared for queues. */ + /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, - 2, false); + true, false); if (!err) return 0; /* Finally fall back to regular interrupts. */ - err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, - 0, false); - return err; + return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + false, false); } static struct virtio_config_ops virtio_pci_config_ops = { From 3c1b27d5043086a485f8526353ae9fe37bfa1065 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Sep 2009 22:26:31 -0600 Subject: [PATCH 02/10] virtio: make add_buf return capacity remaining This API change means that virtio_net can tell how much capacity remains for buffers. It's necessarily fuzzy, since VIRTIO_RING_F_INDIRECT_DESC means we can fit any number of descriptors in one, *if* we can kmalloc. Signed-off-by: Rusty Russell Cc: Dinesh Subhraveti --- drivers/block/virtio_blk.c | 2 +- drivers/char/hw_random/virtio-rng.c | 2 +- drivers/char/virtio_console.c | 4 ++-- drivers/net/virtio_net.c | 14 +++++++------- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_ring.c | 6 +++++- include/linux/virtio.h | 2 +- net/9p/trans_virtio.c | 2 +- 8 files changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index aa1a3d5a3e2b..d739ee4201f9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -139,7 +139,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) { + if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) { mempool_free(vbr, vblk->pool); return false; } diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 32216b623248..b6c24dcc987d 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -51,7 +51,7 @@ static void register_buffer(void) sg_init_one(&sg, random_data+data_left, RANDOM_DATA_SIZE-data_left); /* There should always be room for one buffer. */ - if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) != 0) + if (vq->vq_ops->add_buf(vq, &sg, 0, 1, random_data) < 0) BUG(); vq->vq_ops->kick(vq); } diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index c74dacfa6795..a035ae39a359 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -65,7 +65,7 @@ static int put_chars(u32 vtermno, const char *buf, int count) /* add_buf wants a token to identify this buffer: we hand it any * non-NULL pointer, since there's only ever one buffer. */ - if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { + if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) >= 0) { /* Tell Host to go! */ out_vq->vq_ops->kick(out_vq); /* Chill out until it's done with the buffer. */ @@ -85,7 +85,7 @@ static void add_inbuf(void) sg_init_one(sg, inbuf, PAGE_SIZE); /* We should always be able to add one buffer to an empty queue. */ - if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0) + if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) < 0) BUG(); in_vq->vq_ops->kick(in_vq); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 32266fb89c20..fbf04a553f5a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -320,7 +320,7 @@ static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp) skb_queue_head(&vi->recv, skb); err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb); - if (err) { + if (err < 0) { skb_unlink(skb, &vi->recv); trim_pages(vi, skb); kfree_skb(skb); @@ -373,7 +373,7 @@ static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp) skb_queue_head(&vi->recv, skb); err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb); - if (err) { + if (err < 0) { skb_unlink(skb, &vi->recv); kfree_skb(skb); break; @@ -527,7 +527,7 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); - if (!err && !vi->free_in_tasklet) + if (err >= 0 && !vi->free_in_tasklet) mod_timer(&vi->xmit_free_timer, jiffies + (HZ/10)); return err; @@ -538,7 +538,7 @@ static void xmit_tasklet(unsigned long data) struct virtnet_info *vi = (void *)data; netif_tx_lock_bh(vi->dev); - if (vi->last_xmit_skb && xmit_skb(vi, vi->last_xmit_skb) == 0) { + if (vi->last_xmit_skb && xmit_skb(vi, vi->last_xmit_skb) >= 0) { vi->svq->vq_ops->kick(vi->svq); vi->last_xmit_skb = NULL; } @@ -557,7 +557,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* If we has a buffer left over from last time, send it now. */ if (unlikely(vi->last_xmit_skb) && - xmit_skb(vi, vi->last_xmit_skb) != 0) + xmit_skb(vi, vi->last_xmit_skb) < 0) goto stop_queue; vi->last_xmit_skb = NULL; @@ -565,7 +565,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* Put new one in send queue and do transmit */ if (likely(skb)) { __skb_queue_head(&vi->send, skb); - if (xmit_skb(vi, skb) != 0) { + if (xmit_skb(vi, skb) < 0) { vi->last_xmit_skb = skb; skb = NULL; goto stop_queue; @@ -668,7 +668,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, sg_set_buf(&sg[i + 1], sg_virt(s), s->length); sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); - BUG_ON(vi->cvq->vq_ops->add_buf(vi->cvq, sg, out, in, vi)); + BUG_ON(vi->cvq->vq_ops->add_buf(vi->cvq, sg, out, in, vi) < 0); vi->cvq->vq_ops->kick(vi->cvq); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 26b278264796..39789232646d 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -84,7 +84,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) init_completion(&vb->acked); /* We should always be able to add one buffer to an empty queue. */ - if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) != 0) + if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) < 0) BUG(); vq->vq_ops->kick(vq); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index a882f2606515..f53600580726 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -208,7 +208,11 @@ static int vring_add_buf(struct virtqueue *_vq, pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); - return 0; + + /* If we're indirect, we can fit many (assuming not OOM). */ + if (vq->indirect) + return vq->num_free ? vq->vring.num : 0; + return vq->num_free; } static void vring_kick(struct virtqueue *_vq) diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 4fca4f5440ba..057a2e010758 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -34,7 +34,7 @@ struct virtqueue { * out_num: the number of sg readable by other side * in_num: the number of sg which are writable (after readable ones) * data: the token identifying the buffer. - * Returns 0 or an error. + * Returns remaining capacity of queue (sg segments) or a negative error. * @kick: update after add_buf * vq: the struct virtqueue * After one or more add_buf calls, invoke this to kick the other side. diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 9bf0b737aa51..53c139d31a21 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -200,7 +200,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) req->status = REQ_STATUS_SENT; - if (chan->vq->vq_ops->add_buf(chan->vq, chan->sg, out, in, req->tc)) { + if (chan->vq->vq_ops->add_buf(chan->vq, chan->sg, out, in, req->tc) < 0) { P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio rpc add_buf returned failure"); return -EIO; From 3a20210dc26bbfff3bbb48bb22d2846240b71d8f Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Fri, 31 Jul 2009 12:09:05 +0900 Subject: [PATCH 03/10] virtio: get rid of redundant VIRTIO_ID_9P definition VIRTIO_ID_9P is already defined in include/linux/virtio_9p.h so use that definition instead. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Rusty Russell Cc: Eric Van Hensbergen --- net/9p/trans_virtio.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 53c139d31a21..ea1e3daabefe 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -334,8 +334,6 @@ static void p9_virtio_remove(struct virtio_device *vdev) } } -#define VIRTIO_ID_9P 9 - static struct virtio_device_id id_table[] = { { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID }, { 0 }, From 3ca4f5ca73057a617f9444a91022d7127041970a Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Fri, 31 Jul 2009 15:25:56 +0900 Subject: [PATCH 04/10] virtio: add virtio IDs file Virtio IDs are spread all over the tree which makes assigning new IDs bothersome. Putting them together should make the process less error-prone. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 1 + drivers/block/virtio_blk.c | 1 + drivers/char/hw_random/virtio-rng.c | 1 + drivers/char/virtio_console.c | 1 + drivers/net/virtio_net.c | 1 + drivers/virtio/virtio_balloon.c | 1 + include/linux/virtio_9p.h | 2 -- include/linux/virtio_balloon.h | 3 --- include/linux/virtio_blk.h | 3 --- include/linux/virtio_console.h | 3 --- include/linux/virtio_ids.h | 17 +++++++++++++++++ include/linux/virtio_net.h | 3 --- include/linux/virtio_rng.h | 3 --- net/9p/trans_virtio.c | 1 + 14 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 include/linux/virtio_ids.h diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 950cde6d6e58..84bbb190bf7b 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -42,6 +42,7 @@ #include #include "linux/lguest_launcher.h" #include "linux/virtio_config.h" +#include #include "linux/virtio_net.h" #include "linux/virtio_blk.h" #include "linux/virtio_console.h" diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index d739ee4201f9..73de7532fcf5 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index b6c24dcc987d..962968f05b94 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* The host will fill any buffer we give it with sweet, sweet randomness. We diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index a035ae39a359..0d328b59568d 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "hvc_console.h" diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fbf04a553f5a..5c498d2b043f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 39789232646d..200c22f55130 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -19,6 +19,7 @@ */ //#define DEBUG #include +#include #include #include #include diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h index b3c4a60ceeb3..ea7226a45acb 100644 --- a/include/linux/virtio_9p.h +++ b/include/linux/virtio_9p.h @@ -4,8 +4,6 @@ * compatible drivers/servers. */ #include -/* The ID for virtio console */ -#define VIRTIO_ID_9P 9 /* Maximum number of virtio channels per partition (1 for now) */ #define MAX_9P_CHAN 1 diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h index 8726ff77763e..09d730085060 100644 --- a/include/linux/virtio_balloon.h +++ b/include/linux/virtio_balloon.h @@ -4,9 +4,6 @@ * compatible drivers/servers. */ #include -/* The ID for virtio_balloon */ -#define VIRTIO_ID_BALLOON 5 - /* The feature bitmap for virtio balloon */ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 8dab9f2b8832..25fbabfa90d4 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -5,9 +5,6 @@ #include #include -/* The ID for virtio_block */ -#define VIRTIO_ID_BLOCK 2 - /* Feature bits */ #define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ #define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */ diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h index dc161115ae35..b5f519806014 100644 --- a/include/linux/virtio_console.h +++ b/include/linux/virtio_console.h @@ -5,9 +5,6 @@ /* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so * anyone can use the definitions to implement compatible drivers/servers. */ -/* The ID for virtio console */ -#define VIRTIO_ID_CONSOLE 3 - /* Feature bits */ #define VIRTIO_CONSOLE_F_SIZE 0 /* Does host provide console size? */ diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h new file mode 100644 index 000000000000..06660c0a78d7 --- /dev/null +++ b/include/linux/virtio_ids.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_VIRTIO_IDS_H +#define _LINUX_VIRTIO_IDS_H +/* + * Virtio IDs + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + */ + +#define VIRTIO_ID_NET 1 /* virtio net */ +#define VIRTIO_ID_BLOCK 2 /* virtio block */ +#define VIRTIO_ID_CONSOLE 3 /* virtio console */ +#define VIRTIO_ID_RNG 4 /* virtio ring */ +#define VIRTIO_ID_BALLOON 5 /* virtio balloon */ +#define VIRTIO_ID_9P 9 /* 9p virtio console */ + +#endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index d8dd539c9f48..1f41734bbb77 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -6,9 +6,6 @@ #include #include -/* The ID for virtio_net */ -#define VIRTIO_ID_NET 1 - /* The feature bitmap for virtio net */ #define VIRTIO_NET_F_CSUM 0 /* Host handles pkts w/ partial csum */ #define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */ diff --git a/include/linux/virtio_rng.h b/include/linux/virtio_rng.h index 1a85dab8a940..48121c3c434b 100644 --- a/include/linux/virtio_rng.h +++ b/include/linux/virtio_rng.h @@ -4,7 +4,4 @@ * compatible drivers/servers. */ #include -/* The ID for virtio_rng */ -#define VIRTIO_ID_RNG 4 - #endif /* _LINUX_VIRTIO_RNG_H */ diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index ea1e3daabefe..b2e07f0dd298 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #define VIRTQUEUE_NUM 128 From f1b0ef062602713c2c7cfa12362d5d90ed01c5f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Sep 2009 19:57:42 +0200 Subject: [PATCH 05/10] virtio_blk: add support for cache flush Recent qemu has added a VIRTIO_BLK_F_FLUSH flag to advertise that the virtual disk has a volatile write cache that needs to be flushed. In case we see this feature implement tell the Linux block layer about the fact and use the new VIRTIO_BLK_T_FLUSH to flush the cache when required. This allows for an correct and simple implementation of write barriers. Signed-off-by: Christoph Hellwig Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 30 +++++++++++++++++++++++++----- include/linux/virtio_blk.h | 15 +++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 73de7532fcf5..3d5fe97569c7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -92,15 +92,26 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, return false; vbr->req = req; - if (blk_fs_request(vbr->req)) { + switch (req->cmd_type) { + case REQ_TYPE_FS: vbr->out_hdr.type = 0; vbr->out_hdr.sector = blk_rq_pos(vbr->req); vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); - } else if (blk_pc_request(vbr->req)) { + break; + case REQ_TYPE_BLOCK_PC: vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); - } else { + break; + case REQ_TYPE_LINUX_BLOCK: + if (req->cmd[0] == REQ_LB_OP_FLUSH) { + vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + break; + } + /*FALLTHRU*/ + default: /* We don't put anything else in the queue. */ BUG(); } @@ -200,6 +211,12 @@ static int virtblk_identify(struct gendisk *disk, void *argp) return err; } +static void virtblk_prepare_flush(struct request_queue *q, struct request *req) +{ + req->cmd_type = REQ_TYPE_LINUX_BLOCK; + req->cmd[0] = REQ_LB_OP_FLUSH; +} + static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long data) { @@ -338,7 +355,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) index++; /* If barriers are supported, tell block layer that queue is ordered */ - if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) + if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) + blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_DRAIN_FLUSH, + virtblk_prepare_flush); + else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); /* If disk is read-only in the host, the guest should obey */ @@ -425,7 +445,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, - VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY + VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY, VIRTIO_BLK_F_FLUSH }; /* diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 25fbabfa90d4..15cb666581d7 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -14,6 +14,7 @@ #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ #define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ #define VIRTIO_BLK_F_IDENTIFY 8 /* ATA IDENTIFY supported */ +#define VIRTIO_BLK_F_FLUSH 9 /* Cache flush command support */ #define VIRTIO_BLK_ID_BYTES (sizeof(__u16[256])) /* IDENTIFY DATA */ @@ -35,6 +36,17 @@ struct virtio_blk_config { __u8 identify[VIRTIO_BLK_ID_BYTES]; } __attribute__((packed)); +/* + * Command types + * + * Usage is a bit tricky as some bits are used as flags and some are not. + * + * Rules: + * VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or + * VIRTIO_BLK_T_BARRIER. VIRTIO_BLK_T_FLUSH is a command of its own + * and may not be combined with any of the other flags. + */ + /* These two define direction. */ #define VIRTIO_BLK_T_IN 0 #define VIRTIO_BLK_T_OUT 1 @@ -42,6 +54,9 @@ struct virtio_blk_config { /* This bit says it's a scsi command, not an actual read or write. */ #define VIRTIO_BLK_T_SCSI_CMD 2 +/* Cache flush command */ +#define VIRTIO_BLK_T_FLUSH 4 + /* Barrier before this op. */ #define VIRTIO_BLK_T_BARRIER 0x80000000 From cdae0ad5e8ea630017d4cad3923f763c4e6c3127 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Sep 2009 22:26:42 -0600 Subject: [PATCH 06/10] lguest: move panic notifier registration to its expected place. We used to defer it, so lockdep was happy. We now init lockdep early anyway, so just do it after that. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4cb7d5d18b8e..7e59dc1d3fc2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1135,11 +1135,6 @@ static struct notifier_block paniced = { /* Setting up memory is fairly easy. */ static __init char *lguest_memory_setup(void) { - /* We do this here and not earlier because lockcheck used to barf if we - * did it before start_kernel(). I think we fixed that, so it'd be - * nice to move it back to lguest_init. Patch welcome... */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* *The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. @@ -1364,10 +1359,13 @@ __init void lguest_init(void) /* * If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. + * atomic_notifier_chain_register, then paravirt_disable_iospace. */ lockdep_init(); + /* Hook in our special panic hypercall code. */ + atomic_notifier_chain_register(&panic_notifier_list, &paniced); + /* * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. From 4c1ea3dd718a1d93a726cb3e66665ac4170dcccd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Sep 2009 22:26:45 -0600 Subject: [PATCH 07/10] lguest: use set_pte/set_pmd uniformly for real page table entries If we're building a pte, we can use simple assigment; only use set_pte etc. when we're actually going to use that destination as a PTE. I don't know that we'll ever run under Xen, but it's neater. And use set_pte/set_pmd rather than assuming native_ versions, even though that's probably true for most people. (Includes compile fix by Kamalesh Babulal ) Signed-off-by: Rusty Russell Cc: Matias Zabaljauregui Cc: Kamalesh Babulal --- drivers/lguest/page_tables.c | 39 ++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a8d0aee3bc0e..232fba6a4882 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -380,7 +380,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * And we copy the flags to the shadow PMD entry. The page * number in the shadow PMD is the page we just allocated. */ - native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } /* @@ -447,7 +447,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); + set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); /* * Finally, we write the Guest PTE entry back: we've set the @@ -528,7 +528,7 @@ static void release_pmd(pmd_t *spmd) /* Now we can free the page of PTEs */ free_page((long)ptepage); /* And zero out the PMD entry so we never release it twice. */ - native_set_pmd(spmd, __pmd(0)); + set_pmd(spmd, __pmd(0)); } } @@ -833,15 +833,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); - native_set_pte(spte, - gpte_to_spte(cpu, gpte, + set_pte(spte, + gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); } else { /* * Otherwise kill it and we can demand_page() * it in later. */ - native_set_pte(spte, __pte(0)); + set_pte(spte, __pte(0)); } #ifdef CONFIG_X86_PAE } @@ -983,16 +983,15 @@ static unsigned long setup_pagetables(struct lguest *lg, */ for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { - /* FIXME: native_set_pmd is overkill here. */ - native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE, + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) return -EFAULT; } /* One PGD entry, pointing to that PMD page. */ - set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT); /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; @@ -1141,15 +1140,13 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); pte_t regs_pte; - unsigned long pfn; #ifdef CONFIG_X86_PAE pmd_t switcher_pmd; pmd_t *pmd_table; - /* FIXME: native_set_pmd is overkill here. */ - native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> - PAGE_SHIFT, PAGE_KERNEL_EXEC)); + switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, + PAGE_KERNEL_EXEC); /* Figure out where the pmd page is, by reading the PGD, and converting * it to a virtual address. */ @@ -1157,7 +1154,7 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); /* Now write it into the shadow page table. */ - native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); + set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; @@ -1179,10 +1176,8 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * page is already mapped there, we don't have to copy them out * again. */ - pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); - native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], - regs_pte); + regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); } /*:*/ @@ -1209,7 +1204,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - native_set_pte(&pte[i], mk_pte(switcher_page[i], + set_pte(&pte[i], mk_pte(switcher_page[i], __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } @@ -1217,14 +1212,14 @@ static __init void populate_switcher_pte_page(unsigned int cpu, i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), + set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); /* * The second page contains the "struct lguest_ro_state", and is * read-only. */ - native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), + set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } From fb100d78c04ff6053047625d0368d0d4b1d9912a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Sep 2009 22:26:46 -0600 Subject: [PATCH 08/10] lguest: use PGDIR_SHIFT for PAE code to allow different PAGE_OFFSET We still assume the Guest and Host have the same PAGE_OFFSET settings, but now we don't assume 0xC0000000. Signed-off-by: Rusty Russell Cc: Matias Zabaljauregui --- drivers/lguest/page_tables.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 232fba6a4882..bf37a31c0e05 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -996,11 +996,9 @@ static unsigned long setup_pagetables(struct lguest *lg, if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; /* - * And the third PGD entry (ie. addresses 3G-4G). - * - * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + * And the other PGD entry to make the linear mapping at PAGE_OFFSET */ - if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) + if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd))) return -EFAULT; #else /* From 6c189d8312246af776c2587c233d6afcf3714438 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 5 Aug 2009 17:42:37 +0800 Subject: [PATCH 09/10] lguest: cleanup for map_switcher() We can use alloc_page() instead of get_zeroed_page() and virt_to_page() Signed-off-by: Xiao Guangrong Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 1e2cb846b3c9..8744d24ac6e6 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -67,12 +67,11 @@ static __init int map_switcher(void) * so we make sure they're zeroed. */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - unsigned long addr = get_zeroed_page(GFP_KERNEL); - if (!addr) { + switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!switcher_page[i]) { err = -ENOMEM; goto free_some_pages; } - switcher_page[i] = virt_to_page(addr); } /* From ca60a42c9be41c07ebcc2ec8c43dd1be53f147bf Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 23 Sep 2009 22:26:47 -0600 Subject: [PATCH 10/10] lguest: don't force VIRTIO_F_NOTIFY_ON_EMPTY VIRTIO_F_NOTIFY_ON_EMPTY indicates to the Guest that we will hit them with an interrupt every time the xmit queue is emptied. Because it results in lots of tx interrupts, modern Guests probably don't want it, so let's only force it when they accept the option. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 84bbb190bf7b..ba9373f82ab5 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -134,6 +134,9 @@ struct device { /* Is it operational */ bool running; + /* Does Guest want an intrrupt on empty? */ + bool irq_on_empty; + /* Device-specific data. */ void *priv; }; @@ -624,10 +627,13 @@ static void trigger_irq(struct virtqueue *vq) return; vq->pending_used = 0; - /* If they don't want an interrupt, don't send one, unless empty. */ - if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && lg_last_avail(vq) != vq->vring.avail->idx) - return; + /* If they don't want an interrupt, don't send one... */ + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { + /* ... unless they've asked us to force one on empty. */ + if (!vq->dev->irq_on_empty + || lg_last_avail(vq) != vq->vring.avail->idx) + return; + } /* Send the Guest an interrupt tell them we used something up. */ if (write(lguest_fd, buf, sizeof(buf)) != 0) @@ -1043,6 +1049,15 @@ static void create_thread(struct virtqueue *vq) close(vq->eventfd); } +static bool accepted_feature(struct device *dev, unsigned int bit) +{ + const u8 *features = get_feature_bits(dev) + dev->feature_len; + + if (dev->feature_len < bit / CHAR_BIT) + return false; + return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); +} + static void start_device(struct device *dev) { unsigned int i; @@ -1056,6 +1071,8 @@ static void start_device(struct device *dev) verbose(" %02x", get_feature_bits(dev) [dev->feature_len+i]); + dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); + for (vq = dev->vq; vq; vq = vq->next) { if (vq->service) create_thread(vq);