From 11d217430fe296db9aa7df57836c99d8b1327f57 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 25 May 2021 12:47:29 -0500 Subject: [PATCH 01/16] vhost: remove work arg from vhost_work_flush vhost_work_flush doesn't do anything with the work arg. This patch drops it and then renames vhost_work_flush to vhost_work_dev_flush to reflect that the function flushes all the works in the dev and not just a specific queue or work item. Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210525174733.6212-2-michael.christie@oracle.com Reviewed-by: Stefano Garzarella Signed-off-by: Michael S. Tsirkin --- drivers/vhost/scsi.c | 4 ++-- drivers/vhost/vhost.c | 8 ++++---- drivers/vhost/vhost.h | 2 +- drivers/vhost/vsock.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 4ce9f00ae10e84..4b70519dcae752 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1470,8 +1470,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) /* Flush both the vhost poll and vhost work */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) vhost_scsi_flush_vq(vs, i); - vhost_work_flush(&vs->dev, &vs->vs_completion_work); - vhost_work_flush(&vs->dev, &vs->vs_event_work); + vhost_work_dev_flush(&vs->dev); + vhost_work_dev_flush(&vs->dev); /* Wait for all reqs issued before the flush to be finished */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 601e8b35ea218a..e8f711b737a515 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,7 +231,7 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; @@ -243,13 +243,13 @@ void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) 
wait_for_completion(&flush.wait_event); } } -EXPORT_SYMBOL_GPL(vhost_work_flush); +EXPORT_SYMBOL_GPL(vhost_work_dev_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ void vhost_poll_flush(struct vhost_poll *poll) { - vhost_work_flush(poll->dev, &poll->work); + vhost_work_dev_flush(poll->dev); } EXPORT_SYMBOL_GPL(vhost_poll_flush); @@ -541,7 +541,7 @@ static int vhost_attach_cgroups(struct vhost_dev *dev) attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); vhost_work_queue(dev, &attach.work); - vhost_work_flush(dev, &attach.work); + vhost_work_dev_flush(dev); return attach.ret; } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 8396e54ce1ce5e..8be6b7b6864f88 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -46,7 +46,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work); +void vhost_work_dev_flush(struct vhost_dev *dev); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); struct vhost_log { diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 9885cab70ea59b..823dc5fe56e0ee 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -689,7 +689,7 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock) for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) if (vsock->vqs[i].handle_kick) vhost_poll_flush(&vsock->vqs[i].poll); - vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); + vhost_work_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) From 316959bcae71b551c3a4a2483679d60597f1b9ed Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 May 2022 13:08:43 -0500 Subject: [PATCH 02/16] vhost: get rid of 
vhost_poll_flush() wrapper vhost_poll_flush() is a simple wrapper around vhost_work_dev_flush(). It gives the wrong impression that we are doing some work over vhost_poll, while in fact it flushes vhost_poll->dev. It only complicates understanding of the code and leads to mistakes like flushing the same vhost_dev several times in a row. Just remove vhost_poll_flush() and call vhost_work_dev_flush() directly. Signed-off-by: Andrey Ryabinin [merge vhost_poll_flush removal from Stefano Garzarella] Signed-off-by: Mike Christie Reviewed-by: Chaitanya Kulkarni Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-2-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 4 ++-- drivers/vhost/test.c | 2 +- drivers/vhost/vhost.c | 12 ++---------- drivers/vhost/vhost.h | 1 - drivers/vhost/vsock.c | 2 +- 5 files changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 6f8542535afeea..e582e56f83ac7d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1408,8 +1408,8 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, static void vhost_net_flush_vq(struct vhost_net *n, int index) { - vhost_poll_flush(n->poll + index); - vhost_poll_flush(&n->vqs[index].vq.poll); + vhost_work_dev_flush(n->poll[index].dev); + vhost_work_dev_flush(n->vqs[index].vq.poll.dev); } static void vhost_net_flush(struct vhost_net *n) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index a09dedc79f6820..1a8ab1d8cb1cf3 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -146,7 +146,7 @@ static void vhost_test_stop(struct vhost_test *n, void **privatep) static void vhost_test_flush_vq(struct vhost_test *n, int index) { - vhost_poll_flush(&n->vqs[index].poll); + vhost_work_dev_flush(n->vqs[index].poll.dev); } static void vhost_test_flush(struct vhost_test *n) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 
e8f711b737a515..6e80572049e836 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -245,14 +245,6 @@ void vhost_work_dev_flush(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_work_dev_flush); -/* Flush any work that has been scheduled. When calling this, don't hold any - * locks that are also used by the callback. */ -void vhost_poll_flush(struct vhost_poll *poll) -{ - vhost_work_dev_flush(poll->dev); -} -EXPORT_SYMBOL_GPL(vhost_poll_flush); - void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { if (!dev->worker) @@ -666,7 +658,7 @@ void vhost_dev_stop(struct vhost_dev *dev) for (i = 0; i < dev->nvqs; ++i) { if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { vhost_poll_stop(&dev->vqs[i]->poll); - vhost_poll_flush(&dev->vqs[i]->poll); + vhost_work_dev_flush(dev->vqs[i]->poll.dev); } } } @@ -1720,7 +1712,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) - vhost_poll_flush(&vq->poll); + vhost_work_dev_flush(vq->poll.dev); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 8be6b7b6864f88..99332dca9edc79 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -44,7 +44,6 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev); int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); -void vhost_poll_flush(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); void vhost_work_dev_flush(struct vhost_dev *dev); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 823dc5fe56e0ee..1b6ca5f8c0b8da 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -688,7 +688,7 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock) for (i = 0; i < 
ARRAY_SIZE(vsock->vqs); i++) if (vsock->vqs[i].handle_kick) - vhost_poll_flush(&vsock->vqs[i].poll); + vhost_work_dev_flush(vsock->vqs[i].poll.dev); vhost_work_dev_flush(&vsock->dev); } From 455b1719a97603276e6c1f760ec7f14087e3fe6d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 May 2022 13:08:45 -0500 Subject: [PATCH 03/16] vhost_net: get rid of vhost_net_flush_vq() and extra flush calls vhost_net_flush_vq() calls vhost_work_dev_flush() twice passing vhost_dev pointer obtained via 'n->poll[index].dev' and 'n->vqs[index].vq.poll.dev'. This is actually the same pointer, initialized in vhost_net_open()/vhost_dev_init()/vhost_poll_init() Remove vhost_net_flush_vq() and call vhost_work_dev_flush() directly. Do the flushes only once instead of several flush calls in a row which seems rather useless. Signed-off-by: Andrey Ryabinin [drop vhost_dev forward declaration in vhost.h] Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-4-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/net.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e582e56f83ac7d..040ac1b236dab4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1406,16 +1406,9 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } -static void vhost_net_flush_vq(struct vhost_net *n, int index) -{ - vhost_work_dev_flush(n->poll[index].dev); - vhost_work_dev_flush(n->vqs[index].vq.poll.dev); -} - static void vhost_net_flush(struct vhost_net *n) { - vhost_net_flush_vq(n, VHOST_NET_VQ_TX); - vhost_net_flush_vq(n, VHOST_NET_VQ_RX); + vhost_work_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; @@ -1605,7 +1598,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } if (oldsock) { - vhost_net_flush_vq(n, index); + vhost_work_dev_flush(&n->dev); sockfd_put(oldsock); } From 98837d1a950e885b7d3d34a4f56da5529d08fc9c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 17 May 2022 13:08:44 -0500 Subject: [PATCH 04/16] vhost: flush dev once during vhost_dev_stop When vhost_work_dev_flush returns all work queued at that time will have completed. There is then no need to flush after every vhost_poll_stop call, and we can move the flush call to after the loop that stops the pollers. Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-3-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vhost.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 6e80572049e836..abf6cf27db14d7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -656,11 +656,11 @@ void vhost_dev_stop(struct vhost_dev *dev) int i; for (i = 0; i < dev->nvqs; ++i) { - if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { + if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) vhost_poll_stop(&dev->vqs[i]->poll); - vhost_work_dev_flush(dev->vqs[i]->poll.dev); - } } + + vhost_work_dev_flush(dev); } EXPORT_SYMBOL_GPL(vhost_dev_stop); From 82d314629f43caa102e033a19bf7ead50b7d87c7 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 May 2022 13:08:47 -0500 Subject: [PATCH 05/16] vhost_vsock: simplify vhost_vsock_flush() vhost_vsock_flush() calls vhost_work_dev_flush(vsock->vqs[i].poll.dev) before vhost_work_dev_flush(&vsock->dev). This seems pointless as vsock->vqs[i].poll.dev is the same as &vsock->dev and several flushes in a row doesn't do anything useful, one is just enough. Signed-off-by: Andrey Ryabinin Reviewed-by: Stefano Garzarella Signed-off-by: Mike Christie Acked-by: Jason Wang Message-Id: <20220517180850.198915-6-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vsock.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 1b6ca5f8c0b8da..1505cdc1a3f7ca 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -684,11 +684,6 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) static void vhost_vsock_flush(struct vhost_vsock *vsock) { - int i; - - for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) - if (vsock->vqs[i].handle_kick) - vhost_work_dev_flush(vsock->vqs[i].poll.dev); vhost_work_dev_flush(&vsock->dev); } From e6135a74f0e7341757c7d388a0c8d69c092b1951 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 17 May 2022 13:08:50 -0500 Subject: [PATCH 06/16] vhost: rename vhost_work_dev_flush This patch renames vhost_work_dev_flush to just vhost_dev_flush to reflect that it flushes everything on the device and that drivers don't know/care that polls are based on vhost_works. Drivers just flush the entire device and polls, and works for vhost-scsi management TMFs and IO net virtqueues, etc all are flushed. Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-9-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Leonid Komarianskyi --- drivers/vhost/net.c | 4 ++-- drivers/vhost/scsi.c | 4 ++-- drivers/vhost/test.c | 2 +- drivers/vhost/vhost.c | 10 +++++----- drivers/vhost/vhost.h | 2 +- drivers/vhost/vsock.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 040ac1b236dab4..923d27d4c6942b 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1408,7 +1408,7 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, static void vhost_net_flush(struct vhost_net *n) { - vhost_work_dev_flush(&n->dev); + vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; @@ -1598,7 +1598,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } if (oldsock) { - vhost_work_dev_flush(&n->dev); + vhost_dev_flush(&n->dev); sockfd_put(oldsock); } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 4b70519dcae752..fe7570c1b6e9b2 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1470,8 +1470,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) /* Flush both the vhost poll and vhost work */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) vhost_scsi_flush_vq(vs, i); - vhost_work_dev_flush(&vs->dev); - vhost_work_dev_flush(&vs->dev); + vhost_dev_flush(&vs->dev); + vhost_dev_flush(&vs->dev); /* Wait for all reqs issued before the flush to be finished */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 1a8ab1d8cb1cf3..39e71f431d8810 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -146,7 +146,7 @@ static void vhost_test_stop(struct vhost_test *n, void **privatep) static void vhost_test_flush_vq(struct vhost_test *n, int index) { - vhost_work_dev_flush(n->vqs[index].poll.dev); + vhost_dev_flush(n->vqs[index].poll.dev); } static void vhost_test_flush(struct vhost_test *n) diff --git 
a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index abf6cf27db14d7..afc696d9554f80 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,7 +231,7 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); -void vhost_work_dev_flush(struct vhost_dev *dev) +void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; @@ -243,7 +243,7 @@ void vhost_work_dev_flush(struct vhost_dev *dev) wait_for_completion(&flush.wait_event); } } -EXPORT_SYMBOL_GPL(vhost_work_dev_flush); +EXPORT_SYMBOL_GPL(vhost_dev_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { @@ -533,7 +533,7 @@ static int vhost_attach_cgroups(struct vhost_dev *dev) attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); vhost_work_queue(dev, &attach.work); - vhost_work_dev_flush(dev); + vhost_dev_flush(dev); return attach.ret; } @@ -660,7 +660,7 @@ void vhost_dev_stop(struct vhost_dev *dev) vhost_poll_stop(&dev->vqs[i]->poll); } - vhost_work_dev_flush(dev); + vhost_dev_flush(dev); } EXPORT_SYMBOL_GPL(vhost_dev_stop); @@ -1712,7 +1712,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) - vhost_work_dev_flush(vq->poll.dev); + vhost_dev_flush(vq->poll.dev); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 99332dca9edc79..7bb1cbd545933d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -45,7 +45,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); -void vhost_work_dev_flush(struct vhost_dev *dev); +void vhost_dev_flush(struct vhost_dev *dev); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); struct vhost_log { diff 
--git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 1505cdc1a3f7ca..8c2569743f6c9a 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -684,7 +684,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) static void vhost_vsock_flush(struct vhost_vsock *vsock) { - vhost_work_dev_flush(&vsock->dev); + vhost_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) From 0ed463080a1b6c9610f38908e82c9f731e06061d Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Tue, 3 Sep 2024 15:38:44 +0300 Subject: [PATCH 07/16] drivers/vhost: vhost-blk accelerator for virtio-blk guests Although QEMU virtio is quite fast, there is still some room for improvements. Disk latency can be reduced if we handle virtio-blk requests in the host kernel instead of passing them to QEMU. The patch adds a vhost-blk kernel module to do so. Some test setups: fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128 QEMU drive options: cache=none filesystem: xfs SSD: | randread, IOPS | randwrite, IOPS | Host | 95.8k | 85.3k | QEMU virtio | 57.5k | 79.4k | QEMU vhost-blk | 95.6k | 84.3k | RAMDISK (vq == vcpu = numjobs): | randread, IOPS | randwrite, IOPS | virtio, 1vcpu | 133k | 133k | virtio, 2vcpu | 305k | 306k | virtio, 4vcpu | 310k | 298k | vhost-blk, 1vcpu | 110k | 113k | vhost-blk, 2vcpu | 247k | 252k | vhost-blk, 4vcpu | 558k | 556k | v2: - removed unused VHOST_BLK_VQ - reworked bio handling a bit: now add all pages from a single iov into bio until it is full instead of allocating one bio per page - changed sector incrementation calculation - check move_iovec() in vhost_blk_req_handle() - remove snprintf check and better check ret from copy_to_iter for VIRTIO_BLK_ID_BYTES requests - discard vq request if vhost_blk_req_handle() returned negative code - forbid to change nonzero backend in vhost_blk_set_backend(). First of all, QEMU sets backend only once. 
Also if we want to change the backend when we are already running requests, we need to be much more careful in vhost_blk_handle_guest_kick() as it is not taking any references. If userspace wants to change the backend that badly, it can always reset the device. - removed EXPERIMENTAL from Kconfig Signed-off-by: Andrey Zhadchenko --- drivers/vhost/Kconfig | 12 + drivers/vhost/Makefile | 3 + drivers/vhost/blk.c | 820 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 5 + 4 files changed, 840 insertions(+) create mode 100644 drivers/vhost/blk.c diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 5602241ac9737f..7c79e25b45f210 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -95,4 +95,16 @@ config VHOST_CROSS_ENDIAN_LEGACY If unsure, say "N". +config VHOST_BLK + tristate "Host kernel accelerator for virtio-blk" + depends on BLOCK && EVENTFD + select VHOST + default n + help + This kernel module can be loaded in host kernel to accelerate + guest vm with virtio-blk driver. + + To compile this driver as a module, choose M here: the module will + be called vhost_blk. + endif diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 52c1a8e37f19bf..d08c8dde89aa82 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -18,5 +18,8 @@ obj-$(CONFIG_VHOST) += vhost.o obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o vhost_iotlb-y := iotlb.o +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o +vhost_blk-y := blk.o + obj-$(CONFIG_VHOST_XEN) += vhost_xen.o vhost_xen-y := xen.o diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c new file mode 100644 index 00000000000000..e0c23c7cb5c2df --- /dev/null +++ b/drivers/vhost/blk.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2011 Taobao, Inc. + * Author: Liu Yuan + * + * Copyright (C) 2012 Red Hat, Inc. + * Author: Asias He + * + * Copyright (c) 2022 Virtuozzo International GmbH. + * Author: Andrey Zhadchenko + * + * virtio-blk host kernel accelerator. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +enum { + VHOST_BLK_FEATURES = VHOST_FEATURES | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | + (1ULL << VIRTIO_BLK_F_MQ) | + (1ULL << VIRTIO_BLK_F_FLUSH), +}; + +/* + * Max number of bytes transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others. + */ +#define VHOST_DEV_WEIGHT 0x80000 + +/* + * Max number of packets transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others with + * pkts. + */ +#define VHOST_DEV_PKT_WEIGHT 256 + +#define VHOST_BLK_VQ_MAX 16 + +#define VHOST_MAX_METADATA_IOV 1 + +#define VHOST_BLK_SECTOR_BITS 9 +#define VHOST_BLK_SECTOR_SIZE (1 << VHOST_BLK_SECTOR_BITS) +#define VHOST_BLK_SECTOR_MASK (VHOST_BLK_SECTOR_SIZE - 1) + +struct req_page_list { + struct page **pages; + int pages_nr; +}; + +#define NR_INLINE 16 + +struct vhost_blk_req { + struct req_page_list inline_pl[NR_INLINE]; + struct page *inline_page[NR_INLINE]; + struct bio *inline_bio[NR_INLINE]; + struct req_page_list *pl; + int during_flush; + bool use_inline; + + struct llist_node llnode; + + struct vhost_blk *blk; + + struct iovec *iov; + int iov_nr; + + struct bio **bio; + atomic_t bio_nr; + + struct iovec status[VHOST_MAX_METADATA_IOV]; + + sector_t sector; + int bi_opf; + u16 head; + long len; + int bio_err; + + struct vhost_blk_vq *blk_vq; +}; + +struct vhost_blk_vq { + struct vhost_virtqueue vq; + struct vhost_blk_req *req; + struct iovec iov[UIO_MAXIOV]; + struct llist_head llhead; + struct vhost_work work; +}; + +struct vhost_blk { + wait_queue_head_t flush_wait; + struct vhost_blk_vq vqs[VHOST_BLK_VQ_MAX]; + atomic_t req_inflight[2]; + spinlock_t flush_lock; + struct vhost_dev dev; + int during_flush; + struct file *backend; + int index; +}; + +static int gen; + +static int move_iovec(struct iovec *from, struct iovec *to, + 
size_t len, int iov_count_from, int iov_count_to) +{ + int moved_seg = 0, spent_seg = 0; + size_t size; + + while (len && spent_seg < iov_count_from && moved_seg < iov_count_to) { + if (from->iov_len == 0) { + ++from; + ++spent_seg; + continue; + } + size = min(from->iov_len, len); + to->iov_base = from->iov_base; + to->iov_len = size; + from->iov_len -= size; + from->iov_base += size; + len -= size; + ++from; + ++to; + ++moved_seg; + ++spent_seg; + } + + return len ? -1 : moved_seg; +} + +static inline int iov_num_pages(struct iovec *iov) +{ + return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT; +} + +static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status) +{ + struct iov_iter iter; + int ret; + + iov_iter_init(&iter, WRITE, req->status, ARRAY_SIZE(req->status), sizeof(status)); + ret = copy_to_iter(&status, sizeof(status), &iter); + if (ret != sizeof(status)) { + vq_err(&req->blk_vq->vq, "Failed to write status\n"); + return -EFAULT; + } + + return 0; +} + +static void vhost_blk_req_done(struct bio *bio) +{ + struct vhost_blk_req *req = bio->bi_private; + struct vhost_blk *blk = req->blk; + + req->bio_err = blk_status_to_errno(bio->bi_status); + + if (atomic_dec_and_test(&req->bio_nr)) { + llist_add(&req->llnode, &req->blk_vq->llhead); + vhost_work_queue(&blk->dev, &req->blk_vq->work); + } + + bio_put(bio); +} + +static void vhost_blk_req_umap(struct vhost_blk_req *req) +{ + struct req_page_list *pl; + int i, j; + + if (req->pl) { + for (i = 0; i < req->iov_nr; i++) { + pl = &req->pl[i]; + + for (j = 0; j < pl->pages_nr; j++) { + if (!req->bi_opf) + set_page_dirty_lock(pl->pages[j]); + put_page(pl->pages[j]); + } + } + } + + if (!req->use_inline) + kfree(req->pl); +} + +static int vhost_blk_bio_make_simple(struct vhost_blk_req *req, + struct block_device *bdev) +{ + struct bio *bio; + + req->use_inline = true; + req->pl = NULL; + req->bio = req->inline_bio; + + bio = 
bio_alloc(req->bi_opf, GFP_KERNEL); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_sector = req->sector; + bio->bi_private = req; + bio->bi_end_io = vhost_blk_req_done; + req->bio[0] = bio; + + atomic_set(&req->bio_nr, 1); + + return 0; +} + +static struct page **vhost_blk_prepare_req(struct vhost_blk_req *req, + int total_pages, int iov_nr) +{ + int pl_len, page_len, bio_len; + void *buf; + + req->use_inline = false; + pl_len = iov_nr * sizeof(req->pl[0]); + page_len = total_pages * sizeof(struct page *); + bio_len = total_pages * sizeof(struct bio *); + + buf = kmalloc(pl_len + page_len + bio_len, GFP_KERNEL); + if (!buf) + return NULL; + + req->pl = buf; + req->bio = buf + pl_len + page_len; + + return buf + pl_len; +} + +static int vhost_blk_bio_make(struct vhost_blk_req *req, + struct block_device *bdev) +{ + int pages_nr_total, i, j, ret; + struct iovec *iov = req->iov; + int iov_nr = req->iov_nr; + struct page **pages, *page; + struct bio *bio = NULL; + int bio_nr = 0; + + if (unlikely(req->bi_opf == REQ_OP_FLUSH)) + return vhost_blk_bio_make_simple(req, bdev); + + pages_nr_total = 0; + for (i = 0; i < iov_nr; i++) + pages_nr_total += iov_num_pages(&iov[i]); + + if (pages_nr_total > NR_INLINE) { + pages = vhost_blk_prepare_req(req, pages_nr_total, iov_nr); + if (!pages) + return -ENOMEM; + } else { + req->use_inline = true; + req->pl = req->inline_pl; + pages = req->inline_page; + req->bio = req->inline_bio; + } + + req->iov_nr = 0; + for (i = 0; i < iov_nr; i++) { + int pages_nr = iov_num_pages(&iov[i]); + unsigned long iov_base, iov_len; + struct req_page_list *pl; + + iov_base = (unsigned long)iov[i].iov_base; + iov_len = (unsigned long)iov[i].iov_len; + + ret = get_user_pages_fast(iov_base, pages_nr, + !req->bi_opf, pages); + if (ret != pages_nr) + goto fail; + + req->iov_nr++; + pl = &req->pl[i]; + pl->pages_nr = pages_nr; + pl->pages = pages; + + for (j = 0; j < pages_nr; j++) { + unsigned int off, len, pos; + + page = pages[j]; + off = iov_base & 
~PAGE_MASK; + len = PAGE_SIZE - off; + if (len > iov_len) + len = iov_len; + + while (!bio || !bio_add_page(bio, page, len, off)) { + bio = bio_alloc(req->bi_opf, GFP_KERNEL); + if (!bio) + goto fail; + bio->bi_iter.bi_sector = req->sector; + bio->bi_private = req; + bio->bi_end_io = vhost_blk_req_done; + req->bio[bio_nr++] = bio; + } + + iov_base += len; + iov_len -= len; + + pos = (iov_base & VHOST_BLK_SECTOR_MASK) + iov_len; + req->sector += pos >> VHOST_BLK_SECTOR_BITS; + } + + pages += pages_nr; + } + atomic_set(&req->bio_nr, bio_nr); + return 0; + +fail: + for (i = 0; i < bio_nr; i++) + bio_put(req->bio[i]); + vhost_blk_req_umap(req); + return -ENOMEM; +} + +static inline void vhost_blk_bio_send(struct vhost_blk_req *req) +{ + struct blk_plug plug; + int i, bio_nr; + + bio_nr = atomic_read(&req->bio_nr); + blk_start_plug(&plug); + for (i = 0; i < bio_nr; i++) + submit_bio(req->bio[i]); + + blk_finish_plug(&plug); +} + +static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file) +{ + + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = I_BDEV(inode); + int ret; + + ret = vhost_blk_bio_make(req, bdev); + if (ret < 0) + return ret; + + vhost_blk_bio_send(req); + + spin_lock(&req->blk->flush_lock); + req->during_flush = req->blk->during_flush; + atomic_inc(&req->blk->req_inflight[req->during_flush]); + spin_unlock(&req->blk->flush_lock); + + return ret; +} + +static int vhost_blk_req_handle(struct vhost_virtqueue *vq, + struct virtio_blk_outhdr *hdr, + u16 head, u16 total_iov_nr, + struct file *file) +{ + struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev); + struct vhost_blk_vq *blk_vq = container_of(vq, struct vhost_blk_vq, vq); + unsigned char id[VIRTIO_BLK_ID_BYTES]; + struct vhost_blk_req *req; + struct iov_iter iter; + int ret, len; + u8 status; + + req = &blk_vq->req[head]; + req->blk_vq = blk_vq; + req->head = head; + req->blk = blk; + req->sector = hdr->sector; + req->iov = blk_vq->iov; + + 
req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status); + req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr, + ARRAY_SIZE(blk_vq->iov)); + + ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr, + ARRAY_SIZE(req->status)); + if (ret < 0 || req->iov_nr < 0) + return -EINVAL; + + switch (hdr->type) { + case VIRTIO_BLK_T_OUT: + req->bi_opf = REQ_OP_WRITE; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_IN: + req->bi_opf = REQ_OP_READ; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_FLUSH: + req->bi_opf = REQ_OP_FLUSH; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_GET_ID: + len = snprintf(id, VIRTIO_BLK_ID_BYTES, "vhost-blk%d", blk->index); + iov_iter_init(&iter, WRITE, req->iov, req->iov_nr, req->len); + ret = copy_to_iter(id, len, &iter); + status = ret != len ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + ret = vhost_blk_set_status(req, status); + if (ret) + break; + vhost_add_used_and_signal(&blk->dev, vq, head, len); + break; + default: + vq_err(vq, "Unsupported request type %d\n", hdr->type); + status = VIRTIO_BLK_S_UNSUPP; + ret = vhost_blk_set_status(req, status); + if (ret) + break; + vhost_add_used_and_signal(&blk->dev, vq, head, 0); + } + + return ret; +} + +static void vhost_blk_handle_guest_kick(struct vhost_work *work) +{ + struct virtio_blk_outhdr hdr; + struct vhost_blk_vq *blk_vq; + struct vhost_virtqueue *vq; + struct iovec hdr_iovec[VHOST_MAX_METADATA_IOV]; + struct vhost_blk *blk; + struct iov_iter iter; + int in, out, ret; + struct file *f; + u16 head; + + vq = container_of(work, struct vhost_virtqueue, poll.work); + blk = container_of(vq->dev, struct vhost_blk, dev); + blk_vq = container_of(vq, struct vhost_blk_vq, vq); + + f = vhost_vq_get_backend(vq); + if (!f) + return; + + vhost_disable_notify(&blk->dev, vq); + for (;;) { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (unlikely(head < 0)) + break; 
+ + if (unlikely(head == vq->num)) { + if (unlikely(vhost_enable_notify(&blk->dev, vq))) { + vhost_disable_notify(&blk->dev, vq); + continue; + } + break; + } + + ret = move_iovec(vq->iov, hdr_iovec, sizeof(hdr), in + out, ARRAY_SIZE(hdr_iovec)); + if (ret < 0) { + vq_err(vq, "virtio_blk_hdr is too split!"); + vhost_discard_vq_desc(vq, 1); + break; + } + + iov_iter_init(&iter, READ, hdr_iovec, ARRAY_SIZE(hdr_iovec), sizeof(hdr)); + ret = copy_from_iter(&hdr, sizeof(hdr), &iter); + if (ret != sizeof(hdr)) { + vq_err(vq, "Failed to get block header: read %d bytes instead of %ld!\n", + ret, sizeof(hdr)); + vhost_discard_vq_desc(vq, 1); + break; + } + + if (vhost_blk_req_handle(vq, &hdr, head, out + in, f) < 0) { + vhost_discard_vq_desc(vq, 1); + break; + } + + if (!llist_empty(&blk_vq->llhead)) { + vhost_poll_queue(&vq->poll); + break; + } + } +} + +static void vhost_blk_handle_host_kick(struct vhost_work *work) +{ + struct vhost_blk_vq *blk_vq; + struct vhost_virtqueue *vq; + struct vhost_blk_req *req; + struct llist_node *llnode; + struct vhost_blk *blk = NULL; + bool added, zero; + u8 status; + int ret; + + blk_vq = container_of(work, struct vhost_blk_vq, work); + vq = &blk_vq->vq; + llnode = llist_del_all(&blk_vq->llhead); + added = false; + while (llnode) { + req = llist_entry(llnode, struct vhost_blk_req, llnode); + llnode = llist_next(llnode); + + if (!blk) + blk = req->blk; + + vhost_blk_req_umap(req); + + status = req->bio_err == 0 ? 
VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + ret = vhost_blk_set_status(req, status); + if (unlikely(ret)) + continue; + + vhost_add_used(vq, req->head, req->len); + added = true; + + spin_lock(&req->blk->flush_lock); + zero = atomic_dec_and_test( + &req->blk->req_inflight[req->during_flush]); + if (zero && !req->during_flush) + wake_up(&blk->flush_wait); + spin_unlock(&req->blk->flush_lock); + + } + + if (likely(added)) + vhost_signal(&blk->dev, vq); +} + +static void vhost_blk_flush(struct vhost_blk *blk) +{ + spin_lock(&blk->flush_lock); + blk->during_flush = 1; + spin_unlock(&blk->flush_lock); + + vhost_dev_flush(&blk->dev); + /* + * Wait until requests fired before the flush to be finished + * req_inflight[0] is used to track the requests fired before the flush + * req_inflight[1] is used to track the requests fired during the flush + */ + wait_event(blk->flush_wait, !atomic_read(&blk->req_inflight[0])); + + spin_lock(&blk->flush_lock); + blk->during_flush = 0; + spin_unlock(&blk->flush_lock); +} + +static inline void vhost_blk_drop_backends(struct vhost_blk *blk) +{ + struct vhost_virtqueue *vq; + int i; + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + vq = &blk->vqs[i].vq; + + mutex_lock(&vq->mutex); + vhost_vq_set_backend(vq, NULL); + mutex_unlock(&vq->mutex); + } +} + +static int vhost_blk_open(struct inode *inode, struct file *file) +{ + struct vhost_blk *blk; + struct vhost_virtqueue **vqs; + int ret = 0, i = 0; + + blk = kvzalloc(sizeof(*blk), GFP_KERNEL); + if (!blk) { + ret = -ENOMEM; + goto out; + } + + vqs = kcalloc(VHOST_BLK_VQ_MAX, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + ret = -ENOMEM; + goto out_blk; + } + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + blk->vqs[i].vq.handle_kick = vhost_blk_handle_guest_kick; + vqs[i] = &blk->vqs[i].vq; + } + + blk->index = gen++; + + atomic_set(&blk->req_inflight[0], 0); + atomic_set(&blk->req_inflight[1], 0); + blk->during_flush = 0; + spin_lock_init(&blk->flush_lock); + init_waitqueue_head(&blk->flush_wait); + + 
vhost_dev_init(&blk->dev, vqs, VHOST_BLK_VQ_MAX, UIO_MAXIOV, + VHOST_DEV_WEIGHT, VHOST_DEV_PKT_WEIGHT, true, NULL); + file->private_data = blk; + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) + vhost_work_init(&blk->vqs[i].work, vhost_blk_handle_host_kick); + + return ret; +out_blk: + kvfree(blk); +out: + return ret; +} + +static int vhost_blk_release(struct inode *inode, struct file *f) +{ + struct vhost_blk *blk = f->private_data; + int i; + + vhost_blk_drop_backends(blk); + vhost_blk_flush(blk); + vhost_dev_stop(&blk->dev); + if (blk->backend) + fput(blk->backend); + vhost_dev_cleanup(&blk->dev); + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) + kvfree(blk->vqs[i].req); + kfree(blk->dev.vqs); + kvfree(blk); + + return 0; +} + +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + mutex_lock(&blk->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&blk->dev)) { + mutex_unlock(&blk->dev.mutex); + return -EFAULT; + } + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + vq = &blk->vqs[i].vq; + mutex_lock(&vq->mutex); + vq->acked_features = features & (VHOST_BLK_FEATURES); + mutex_unlock(&vq->mutex); + } + + vhost_blk_flush(blk); + mutex_unlock(&blk->dev.mutex); + + return 0; +} + +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd) +{ + struct vhost_virtqueue *vq; + struct file *file; + struct inode *inode; + int ret, i; + + mutex_lock(&blk->dev.mutex); + ret = vhost_dev_check_owner(&blk->dev); + if (ret) + goto out_dev; + + if (blk->backend) { + ret = -EBUSY; + goto out_dev; + } + + file = fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto out_dev; + } + + inode = file->f_mapping->host; + if (!S_ISBLK(inode->i_mode)) { + ret = -EFAULT; + goto out_file; + } + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + vq = &blk->vqs[i].vq; + if (!vhost_vq_access_ok(vq)) { + ret = -EFAULT; + goto out_drop; + } + + mutex_lock(&vq->mutex); + vhost_vq_set_backend(vq, file); + ret = 
vhost_vq_init_access(vq); + mutex_unlock(&vq->mutex); + } + + blk->backend = file; + + mutex_unlock(&blk->dev.mutex); + return 0; + +out_drop: + vhost_blk_drop_backends(blk); +out_file: + fput(file); +out_dev: + mutex_unlock(&blk->dev.mutex); + return ret; +} + +static long vhost_blk_reset_owner(struct vhost_blk *blk) +{ + struct vhost_iotlb *umem; + int err, i; + + mutex_lock(&blk->dev.mutex); + err = vhost_dev_check_owner(&blk->dev); + if (err) + goto done; + umem = vhost_dev_reset_owner_prepare(); + if (!umem) { + err = -ENOMEM; + goto done; + } + vhost_blk_drop_backends(blk); + if (blk->backend) { + fput(blk->backend); + blk->backend = NULL; + } + vhost_blk_flush(blk); + vhost_dev_stop(&blk->dev); + vhost_dev_reset_owner(&blk->dev, umem); + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + kvfree(blk->vqs[i].req); + blk->vqs[i].req = NULL; + } + +done: + mutex_unlock(&blk->dev.mutex); + return err; +} + +static int vhost_blk_setup(struct vhost_blk *blk, void __user *argp) +{ + struct vhost_vring_state s; + + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + + if (blk->vqs[s.index].req) + return 0; + + blk->vqs[s.index].req = kvmalloc(sizeof(struct vhost_blk_req) * s.num, GFP_KERNEL); + if (!blk->vqs[s.index].req) + return -ENOMEM; + + return 0; +} + +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_blk *blk = f->private_data; + void __user *argp = (void __user *)arg; + struct vhost_vring_file backend; + u64 __user *featurep = argp; + u64 features; + int ret; + + switch (ioctl) { + case VHOST_BLK_SET_BACKEND: + if (copy_from_user(&backend, argp, sizeof(backend))) + return -EFAULT; + return vhost_blk_set_backend(blk, backend.fd); + case VHOST_GET_FEATURES: + features = VHOST_BLK_FEATURES; + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + if (features & 
~VHOST_BLK_FEATURES) + return -EOPNOTSUPP; + return vhost_blk_set_features(blk, features); + case VHOST_RESET_OWNER: + return vhost_blk_reset_owner(blk); + default: + mutex_lock(&blk->dev.mutex); + ret = vhost_dev_ioctl(&blk->dev, ioctl, argp); + if (ret == -ENOIOCTLCMD) + ret = vhost_vring_ioctl(&blk->dev, ioctl, argp); + if (!ret && ioctl == VHOST_SET_VRING_NUM) + ret = vhost_blk_setup(blk, argp); + vhost_blk_flush(blk); + mutex_unlock(&blk->dev.mutex); + return ret; + } +} + +static const struct file_operations vhost_blk_fops = { + .owner = THIS_MODULE, + .open = vhost_blk_open, + .release = vhost_blk_release, + .llseek = noop_llseek, + .unlocked_ioctl = vhost_blk_ioctl, +}; + +static struct miscdevice vhost_blk_misc = { + MISC_DYNAMIC_MINOR, + "vhost-blk", + &vhost_blk_fops, +}; +module_misc_device(vhost_blk_misc); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Andrey Zhadchenko"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk"); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index c998860d7bbc43..13caf114bcdea5 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -150,4 +150,9 @@ /* Get the valid iova range */ #define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ struct vhost_vdpa_iova_range) + +/* VHOST_BLK specific defines */ +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0xFF, \ + struct vhost_vring_file) + #endif From 1963c3d8818cd64c023c9195ade3b5d1e8e14fa7 Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Wed, 4 Sep 2024 17:56:50 +0300 Subject: [PATCH 08/16] drivers/vhost: use array to store workers We want to support several vhost workers. The first step is to rework vhost to use array of workers rather than single pointer. Update creation and cleanup routines. 
Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 75 +++++++++++++++++++++++++++++++------------ drivers/vhost/vhost.h | 10 +++++- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index afc696d9554f80..e19603f2f37e5e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,11 +231,24 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); +static void vhost_work_queue_at_worker(struct vhost_worker *w, + struct vhost_work *work) +{ + if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { + /* We can only add the work to the list after we're + * sure it was not in the list. + * test_and_set_bit() implies a memory barrier. + */ + llist_add(&work->node, &w->work_list); + wake_up_process(w->worker); + } +} + void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; - if (dev->worker) { + if (dev->workers[0].worker) { init_completion(&flush.wait_event); vhost_work_init(&flush.work, vhost_flush_work); @@ -247,17 +260,12 @@ EXPORT_SYMBOL_GPL(vhost_dev_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { - if (!dev->worker) + struct vhost_worker *w = &dev->workers[0]; + + if (!w->worker) return; - if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { - /* We can only add the work to the list after we're - * sure it was not in the list. - * test_and_set_bit() implies a memory barrier. 
- */ - llist_add(&work->node, &dev->work_list); - wake_up_process(dev->worker); - } + vhost_work_queue_at_worker(w, work); } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -333,9 +341,29 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } +static void vhost_worker_reset(struct vhost_worker *w) +{ + init_llist_head(&w->work_list); + w->worker = NULL; +} + +void vhost_cleanup_workers(struct vhost_dev *dev) +{ + int i; + + for (i = 0; i < dev->nworkers; ++i) { + WARN_ON(!llist_empty(&dev->workers[i].work_list)); + kthread_stop(dev->workers[i].worker); + vhost_worker_reset(&dev->workers[i]); + } + + dev->nworkers = 0; +} + static int vhost_worker(void *data) { - struct vhost_dev *dev = data; + struct vhost_worker *w = data; + struct vhost_dev *dev = w->dev; struct vhost_work *work, *work_next; struct llist_node *node; @@ -350,7 +378,7 @@ static int vhost_worker(void *data) break; } - node = llist_del_all(&dev->work_list); + node = llist_del_all(&w->work_list); if (!node) schedule(); @@ -473,7 +501,6 @@ void vhost_dev_init(struct vhost_dev *dev, dev->umem = NULL; dev->iotlb = NULL; dev->mm = NULL; - dev->worker = NULL; dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; @@ -485,6 +512,11 @@ void vhost_dev_init(struct vhost_dev *dev, INIT_LIST_HEAD(&dev->pending_list); spin_lock_init(&dev->iotlb_lock); + dev->nworkers = 0; + for (i = 0; i < VHOST_MAX_WORKERS; ++i) { + dev->workers[i].dev = dev; + vhost_worker_reset(&dev->workers[i]); + } for (i = 0; i < dev->nvqs; ++i) { vq = dev->vqs[i]; @@ -597,7 +629,8 @@ long vhost_dev_set_owner(struct vhost_dev *dev) goto err_worker; } - dev->worker = worker; + dev->workers[0].worker = worker; + dev->nworkers = 1; wake_up_process(worker); /* avoid contributing to loadavg */ err = vhost_attach_cgroups(dev); @@ -611,9 +644,10 @@ long vhost_dev_set_owner(struct vhost_dev *dev) return 0; err_cgroup: - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; + 
dev->nworkers = 0; + if (dev->workers[0].worker) { + kthread_stop(dev->workers[0].worker); + dev->workers[0].worker = NULL; } err_worker: vhost_detach_mm(dev); @@ -699,6 +733,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) vhost_xen_unmap_desc_all(dev->vqs[i]); #endif } + vhost_dev_free_iovecs(dev); if (dev->log_ctx) eventfd_ctx_put(dev->log_ctx); @@ -710,10 +745,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev) dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); - WARN_ON(!llist_empty(&dev->work_list)); - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; + if (dev->use_worker) { + vhost_cleanup_workers(dev); dev->kcov_handle = 0; } vhost_detach_mm(dev); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 7bb1cbd545933d..3e5135cfa71e17 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -25,6 +25,13 @@ struct vhost_work { unsigned long flags; }; +#define VHOST_MAX_WORKERS 4 +struct vhost_worker { + struct task_struct *worker; + struct llist_head work_list; + struct vhost_dev *dev; +}; + /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. 
*/ struct vhost_poll { @@ -158,7 +165,8 @@ struct vhost_dev { int nvqs; struct eventfd_ctx *log_ctx; struct llist_head work_list; - struct task_struct *worker; + struct vhost_worker workers[VHOST_MAX_WORKERS]; + int nworkers; struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; From d2eaad2f0ae5c1528b60bdfe0caf03d39ab07ce3 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:32 +0300 Subject: [PATCH 09/16] drivers/vhost: adjust vhost to flush all workers Make vhost_dev_flush support several workers and flush them simultaneously Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e19603f2f37e5e..95265e123df5c7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -246,15 +246,19 @@ static void vhost_work_queue_at_worker(struct vhost_worker *w, void vhost_dev_flush(struct vhost_dev *dev) { - struct vhost_flush_struct flush; + struct vhost_flush_struct flush[VHOST_MAX_WORKERS]; + int i, nworkers; - if (dev->workers[0].worker) { - init_completion(&flush.wait_event); - vhost_work_init(&flush.work, vhost_flush_work); + nworkers = READ_ONCE(dev->nworkers); - vhost_work_queue(dev, &flush.work); - wait_for_completion(&flush.wait_event); + for (i = 0; i < nworkers; i++) { + init_completion(&flush[i].wait_event); + vhost_work_init(&flush[i].work, vhost_flush_work); + vhost_work_queue_at_worker(&dev->workers[i], &flush[i].work); } + + for (i = 0; i < nworkers; i++) + wait_for_completion(&flush[i].wait_event); } EXPORT_SYMBOL_GPL(vhost_dev_flush); From b75716158153e1daccf827826ab331099daa0e64 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:33 +0300 Subject: [PATCH 10/16] drivers/vhost: rework cgroups attachment to be worker aware Rework vhost_attach_cgroups to manipulate specified worker. 
Implement vhost_worker_flush as we need to flush specific worker. Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 95265e123df5c7..0151287015be3f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -262,6 +262,16 @@ void vhost_dev_flush(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_flush); +static void vhost_worker_flush(struct vhost_worker *w) +{ + struct vhost_flush_struct flush; + + init_completion(&flush.wait_event); + vhost_work_init(&flush.work, vhost_flush_work); + vhost_work_queue_at_worker(w, &flush.work); + wait_for_completion(&flush.wait_event); +} + void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { struct vhost_worker *w = &dev->workers[0]; @@ -562,14 +572,14 @@ static void vhost_attach_cgroups_work(struct vhost_work *work) s->ret = cgroup_attach_task_all(s->owner, current); } -static int vhost_attach_cgroups(struct vhost_dev *dev) +static int vhost_worker_attach_cgroups(struct vhost_worker *w) { struct vhost_attach_cgroups_struct attach; attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_dev_flush(dev); + vhost_work_queue_at_worker(w, &attach.work); + vhost_worker_flush(w); return attach.ret; } @@ -637,7 +647,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) dev->nworkers = 1; wake_up_process(worker); /* avoid contributing to loadavg */ - err = vhost_attach_cgroups(dev); + err = vhost_worker_attach_cgroups(&dev->workers[0]); if (err) goto err_cgroup; } From 3b10e1e054ca220882189def3058134810e5c77f Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:34 +0300 Subject: [PATCH 11/16] drivers/vhost: rework worker creation Add function to create a vhost worker and add it into the device. 
Rework vhost_dev_set_owner Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 64 +++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0151287015be3f..cd7b1e0a7044b8 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -620,53 +620,65 @@ static void vhost_detach_mm(struct vhost_dev *dev) dev->mm = NULL; } +static int vhost_add_worker(struct vhost_dev *dev) +{ + struct vhost_worker *w = &dev->workers[dev->nworkers]; + struct task_struct *worker; + int err; + + if (dev->nworkers == VHOST_MAX_WORKERS) + return -E2BIG; + + worker = kthread_create(vhost_worker, w, + "vhost-%d-%d", current->pid, dev->nworkers); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + w->worker = worker; + wake_up_process(worker); /* avoid contributing to loadavg */ + + err = vhost_worker_attach_cgroups(w); + if (err) + goto cleanup; + + dev->nworkers++; + return 0; + +cleanup: + kthread_stop(worker); + w->worker = NULL; + + return err; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { - struct task_struct *worker; int err; /* Is there an owner already? 
*/ - if (vhost_dev_has_owner(dev)) { - err = -EBUSY; - goto err_mm; - } + if (vhost_dev_has_owner(dev)) + return -EBUSY; vhost_attach_mm(dev); dev->kcov_handle = kcov_common_handle(); if (dev->use_worker) { - worker = kthread_create(vhost_worker, dev, - "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; - } - - dev->workers[0].worker = worker; - dev->nworkers = 1; - wake_up_process(worker); /* avoid contributing to loadavg */ - - err = vhost_worker_attach_cgroups(&dev->workers[0]); + err = vhost_add_worker(dev); if (err) - goto err_cgroup; + goto err_mm; } err = vhost_dev_alloc_iovecs(dev); if (err) - goto err_cgroup; + goto err_worker; return 0; -err_cgroup: - dev->nworkers = 0; - if (dev->workers[0].worker) { - kthread_stop(dev->workers[0].worker); - dev->workers[0].worker = NULL; - } err_worker: + vhost_cleanup_workers(dev); +err_mm: vhost_detach_mm(dev); dev->kcov_handle = 0; -err_mm: return err; } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); From e1b61966f669412cc49970955ef75b737ac0525d Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:35 +0300 Subject: [PATCH 12/16] drivers/vhost: add ioctl to increase the number of workers Finally add ioctl to allow userspace to create additional workers For now only allow to increase the number of workers Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 32 +++++++++++++++++++++++++++++++- include/uapi/linux/vhost.h | 9 +++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cd7b1e0a7044b8..0b9444f561ba24 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -651,6 +651,25 @@ static int vhost_add_worker(struct vhost_dev *dev) return err; } +static int vhost_set_workers(struct vhost_dev *dev, int n) +{ + int i, ret; + + if (n > dev->nvqs) + n = dev->nvqs; + + if (n > VHOST_MAX_WORKERS) + n = VHOST_MAX_WORKERS; + + for (i = 0; i < n - dev->nworkers ; i++) { + ret = 
vhost_add_worker(dev); + if (ret) + break; + } + + return ret; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -1809,7 +1828,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) struct eventfd_ctx *ctx; u64 p; long r; - int i, fd; + int i, fd, n; /* If you are not the owner, you can become one */ if (ioctl == VHOST_SET_OWNER) { @@ -1866,6 +1885,17 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) if (ctx) eventfd_ctx_put(ctx); break; + case VHOST_SET_NWORKERS: + r = get_user(n, (int __user *)argp); + if (r < 0) + break; + if (n < d->nworkers) { + r = -EINVAL; + break; + } + + r = vhost_set_workers(d, n); + break; default: r = -ENOIOCTLCMD; break; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 13caf114bcdea5..d6d87f6315f60a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -71,6 +71,15 @@ #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Set number of vhost workers + * Currently the number of vhost workers can only be increased. + * All workers are freed upon reset. + * If the value is too big it is silently truncated to the maximum number of + * supported vhost workers + * Even if an error is returned it is possible that some workers were created + */ +#define VHOST_SET_NWORKERS _IOW(VHOST_VIRTIO, 0x1F, int) + /* The following ioctls use eventfd file descriptors to signal and poll * for events. */ From 3fd4c2ab791fae94a3325d6fdbbfb1b9647c90cf Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Thu, 5 Sep 2024 13:09:11 +0300 Subject: [PATCH 13/16] drivers/vhost: assign workers to virtqueues Add worker pointer to every virtqueue.
Add routine to assign workers to virtqueues and call it after any worker creation Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 14 ++++++++++++++ drivers/vhost/vhost.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0b9444f561ba24..0603ce710d1944 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -353,6 +353,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->iotlb = NULL; vhost_vring_call_reset(&vq->call_ctx); __vhost_vq_meta_reset(vq); + vq->worker = NULL; } static void vhost_worker_reset(struct vhost_worker *w) @@ -670,6 +671,17 @@ static int vhost_set_workers(struct vhost_dev *dev, int n) return ret; } +static void vhost_assign_workers(struct vhost_dev *dev) +{ + int i, j = 0; + + for (i = 0; i < dev->nvqs; i++) { + dev->vqs[i]->worker = &dev->workers[j]; + if (++j == dev->nworkers) + j = 0; + } +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -692,6 +704,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) if (err) goto err_worker; + vhost_assign_workers(dev); return 0; err_worker: vhost_cleanup_workers(dev); @@ -1895,6 +1908,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) } r = vhost_set_workers(d, n); + vhost_assign_workers(d); break; default: r = -ENOIOCTLCMD; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 3e5135cfa71e17..325d9faaa6d29b 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -138,6 +138,7 @@ struct vhost_virtqueue { bool user_be; #endif u32 busyloop_timeout; + struct vhost_worker *worker; #ifdef CONFIG_VHOST_XEN /* From 6a5b722f6aa099def13221de8c8e72e0d20334ff Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Thu, 5 Sep 2024 13:12:06 +0300 Subject: [PATCH 14/16] drivers/vhost: add API to queue work at virtqueue's worker Add routines to queue works on virtqueue assigned workers Signed-off-by: Andrey Zhadchenko ---
drivers/vhost/vhost.c | 22 ++++++++++++++++++++++ drivers/vhost/vhost.h | 5 +++++ 2 files changed, 27 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0603ce710d1944..2d3b2cbcbf1616 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -272,6 +272,17 @@ static void vhost_worker_flush(struct vhost_worker *w) wait_for_completion(&flush.wait_event); } +void vhost_work_flush_vq(struct vhost_virtqueue *vq) +{ + struct vhost_worker *w = READ_ONCE(vq->worker); + + if (!w) + return; + + vhost_worker_flush(w); +} +EXPORT_SYMBOL_GPL(vhost_work_flush_vq); + void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { struct vhost_worker *w = &dev->workers[0]; @@ -283,6 +294,17 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } EXPORT_SYMBOL_GPL(vhost_work_queue); +void vhost_work_vqueue(struct vhost_virtqueue *vq, struct vhost_work *work) +{ + struct vhost_worker *w = READ_ONCE(vq->worker); + + if (!w) + return; + + vhost_work_queue_at_worker(w, work); +} +EXPORT_SYMBOL_GPL(vhost_work_vqueue); + /* A lockless hint for busy polling code to exit the loop */ bool vhost_has_work(struct vhost_dev *dev) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 325d9faaa6d29b..35397c945c711f 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -150,6 +150,11 @@ struct vhost_virtqueue { #endif }; +/* Queue the work on virtqueue assigned worker */ +void vhost_work_vqueue(struct vhost_virtqueue *vq, struct vhost_work *work); +/* Flush virtqueue assigned worker */ +void vhost_work_flush_vq(struct vhost_virtqueue *vq); + struct vhost_msg_node { union { struct vhost_msg msg; From b4f7f8355f2cffdc8aa7316514c77d14915abd9f Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Thu, 5 Sep 2024 18:19:13 +0300 Subject: [PATCH 15/16] drivers/vhost: allow polls to be bound to workers via vqs Allow vhost polls to be associated with vqs so we can queue them on assigned workers. 
If polls are not associated with specific vqs queue them on the first virtqueue. Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 24 ++++++++++++++++-------- drivers/vhost/vhost.h | 4 +++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 2d3b2cbcbf1616..f3941ee74affbb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -170,7 +170,7 @@ static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, if (!(key_to_poll(key) & poll->mask)) return 0; - if (!poll->dev->use_worker) + if (!poll->vq->dev->use_worker) work->fn(work); else vhost_poll_queue(poll); @@ -185,19 +185,27 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) } EXPORT_SYMBOL_GPL(vhost_work_init); -/* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev) +{ + vhost_poll_init_vq(poll, fn, mask, dev->vqs[0]); +} +EXPORT_SYMBOL_GPL(vhost_poll_init); + + +/* Init poll structure */ +void vhost_poll_init_vq(struct vhost_poll *poll, vhost_work_fn_t fn, + __poll_t mask, struct vhost_virtqueue *vq) { init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; - poll->dev = dev; + poll->vq = vq; poll->wqh = NULL; vhost_work_init(&poll->work, fn); } -EXPORT_SYMBOL_GPL(vhost_poll_init); +EXPORT_SYMBOL_GPL(vhost_poll_init_vq); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. 
*/ @@ -314,7 +322,7 @@ EXPORT_SYMBOL_GPL(vhost_has_work); void vhost_poll_queue(struct vhost_poll *poll) { - vhost_work_queue(poll->dev, &poll->work); + vhost_work_vqueue(poll->vq, &poll->work); } EXPORT_SYMBOL_GPL(vhost_poll_queue); @@ -567,8 +575,8 @@ void vhost_dev_init(struct vhost_dev *dev, #endif vhost_vq_reset(dev, vq); if (vq->handle_kick) - vhost_poll_init(&vq->poll, vq->handle_kick, - EPOLLIN, dev); + vhost_poll_init_vq(&vq->poll, vq->handle_kick, + EPOLLIN, vq); } } EXPORT_SYMBOL_GPL(vhost_dev_init); @@ -1825,7 +1833,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) - vhost_dev_flush(vq->poll.dev); + vhost_dev_flush(d); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 35397c945c711f..4ee85a9ce4732c 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -40,7 +40,7 @@ struct vhost_poll { wait_queue_entry_t wait; struct vhost_work work; __poll_t mask; - struct vhost_dev *dev; + struct vhost_virtqueue *vq; }; void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); @@ -49,6 +49,8 @@ bool vhost_has_work(struct vhost_dev *dev); void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev); +void vhost_poll_init_vq(struct vhost_poll *poll, vhost_work_fn_t fn, + __poll_t mask, struct vhost_virtqueue *vq); int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); From b72aa3c8fb05555908744943f101f3ee5fabbf3a Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:39 +0300 Subject: [PATCH 16/16] drivers/vhost: queue vhost_blk works at vq workers Update vhost_blk to queue works on virtqueue workers. Together with previous changes this allows us to split virtio blk requests across several threads. 
| randread, IOPS | randwrite, IOPS | 8vcpu, 1 kernel worker | 576k | 575k | 8vcpu, 2 kernel workers | 803k | 779k | Signed-off-by: Andrey Zhadchenko --- drivers/vhost/blk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c index e0c23c7cb5c2df..dfb6579a441229 100644 --- a/drivers/vhost/blk.c +++ b/drivers/vhost/blk.c @@ -161,13 +161,12 @@ static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status) static void vhost_blk_req_done(struct bio *bio) { struct vhost_blk_req *req = bio->bi_private; - struct vhost_blk *blk = req->blk; req->bio_err = blk_status_to_errno(bio->bi_status); if (atomic_dec_and_test(&req->bio_nr)) { llist_add(&req->llnode, &req->blk_vq->llhead); - vhost_work_queue(&blk->dev, &req->blk_vq->work); + vhost_work_vqueue(&req->blk_vq->vq, &req->blk_vq->work); } bio_put(bio);