From 11d217430fe296db9aa7df57836c99d8b1327f57 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 25 May 2021 12:47:29 -0500 Subject: [PATCH 01/16] vhost: remove work arg from vhost_work_flush vhost_work_flush doesn't do anything with the work arg. This patch drops it and then renames vhost_work_flush to vhost_work_dev_flush to reflect that the function flushes all the works in the dev and not just a specific queue or work item. Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210525174733.6212-2-michael.christie@oracle.com Reviewed-by: Stefano Garzarella Signed-off-by: Michael S. Tsirkin --- drivers/vhost/scsi.c | 4 ++-- drivers/vhost/vhost.c | 8 ++++---- drivers/vhost/vhost.h | 2 +- drivers/vhost/vsock.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 4ce9f00ae10e84..4b70519dcae752 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1470,8 +1470,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) /* Flush both the vhost poll and vhost work */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) vhost_scsi_flush_vq(vs, i); - vhost_work_flush(&vs->dev, &vs->vs_completion_work); - vhost_work_flush(&vs->dev, &vs->vs_event_work); + vhost_work_dev_flush(&vs->dev); + vhost_work_dev_flush(&vs->dev); /* Wait for all reqs issued before the flush to be finished */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 601e8b35ea218a..e8f711b737a515 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,7 +231,7 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; @@ -243,13 +243,13 @@ void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) 
wait_for_completion(&flush.wait_event); } } -EXPORT_SYMBOL_GPL(vhost_work_flush); +EXPORT_SYMBOL_GPL(vhost_work_dev_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ void vhost_poll_flush(struct vhost_poll *poll) { - vhost_work_flush(poll->dev, &poll->work); + vhost_work_dev_flush(poll->dev); } EXPORT_SYMBOL_GPL(vhost_poll_flush); @@ -541,7 +541,7 @@ static int vhost_attach_cgroups(struct vhost_dev *dev) attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); vhost_work_queue(dev, &attach.work); - vhost_work_flush(dev, &attach.work); + vhost_work_dev_flush(dev); return attach.ret; } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 8396e54ce1ce5e..8be6b7b6864f88 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -46,7 +46,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); -void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work); +void vhost_work_dev_flush(struct vhost_dev *dev); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); struct vhost_log { diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 9885cab70ea59b..823dc5fe56e0ee 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -689,7 +689,7 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock) for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) if (vsock->vqs[i].handle_kick) vhost_poll_flush(&vsock->vqs[i].poll); - vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); + vhost_work_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) From 316959bcae71b551c3a4a2483679d60597f1b9ed Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 May 2022 13:08:43 -0500 Subject: [PATCH 02/16] vhost: get rid of 
vhost_poll_flush() wrapper vhost_poll_flush() is a simple wrapper around vhost_work_dev_flush(). It gives the wrong impression that we are doing some work over vhost_poll, while in fact it flushes vhost_poll->dev. It only complicates understanding of the code and leads to mistakes like flushing the same vhost_dev several times in a row. Just remove vhost_poll_flush() and call vhost_work_dev_flush() directly. Signed-off-by: Andrey Ryabinin [merge vhost_poll_flush removal from Stefano Garzarella] Signed-off-by: Mike Christie Reviewed-by: Chaitanya Kulkarni Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-2-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 4 ++-- drivers/vhost/test.c | 2 +- drivers/vhost/vhost.c | 12 ++---------- drivers/vhost/vhost.h | 1 - drivers/vhost/vsock.c | 2 +- 5 files changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 6f8542535afeea..e582e56f83ac7d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1408,8 +1408,8 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, static void vhost_net_flush_vq(struct vhost_net *n, int index) { - vhost_poll_flush(n->poll + index); - vhost_poll_flush(&n->vqs[index].vq.poll); + vhost_work_dev_flush(n->poll[index].dev); + vhost_work_dev_flush(n->vqs[index].vq.poll.dev); } static void vhost_net_flush(struct vhost_net *n) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index a09dedc79f6820..1a8ab1d8cb1cf3 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -146,7 +146,7 @@ static void vhost_test_stop(struct vhost_test *n, void **privatep) static void vhost_test_flush_vq(struct vhost_test *n, int index) { - vhost_poll_flush(&n->vqs[index].poll); + vhost_work_dev_flush(n->vqs[index].poll.dev); } static void vhost_test_flush(struct vhost_test *n) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 
e8f711b737a515..6e80572049e836 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -245,14 +245,6 @@ void vhost_work_dev_flush(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_work_dev_flush); -/* Flush any work that has been scheduled. When calling this, don't hold any - * locks that are also used by the callback. */ -void vhost_poll_flush(struct vhost_poll *poll) -{ - vhost_work_dev_flush(poll->dev); -} -EXPORT_SYMBOL_GPL(vhost_poll_flush); - void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { if (!dev->worker) @@ -666,7 +658,7 @@ void vhost_dev_stop(struct vhost_dev *dev) for (i = 0; i < dev->nvqs; ++i) { if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { vhost_poll_stop(&dev->vqs[i]->poll); - vhost_poll_flush(&dev->vqs[i]->poll); + vhost_work_dev_flush(dev->vqs[i]->poll.dev); } } } @@ -1720,7 +1712,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) - vhost_poll_flush(&vq->poll); + vhost_work_dev_flush(vq->poll.dev); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 8be6b7b6864f88..99332dca9edc79 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -44,7 +44,6 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev); int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); -void vhost_poll_flush(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); void vhost_work_dev_flush(struct vhost_dev *dev); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 823dc5fe56e0ee..1b6ca5f8c0b8da 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -688,7 +688,7 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock) for (i = 0; i < 
ARRAY_SIZE(vsock->vqs); i++) if (vsock->vqs[i].handle_kick) - vhost_poll_flush(&vsock->vqs[i].poll); + vhost_work_dev_flush(vsock->vqs[i].poll.dev); vhost_work_dev_flush(&vsock->dev); } From 455b1719a97603276e6c1f760ec7f14087e3fe6d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 May 2022 13:08:45 -0500 Subject: [PATCH 03/16] vhost_net: get rid of vhost_net_flush_vq() and extra flush calls vhost_net_flush_vq() calls vhost_work_dev_flush() twice passing vhost_dev pointer obtained via 'n->poll[index].dev' and 'n->vqs[index].vq.poll.dev'. This is actually the same pointer, initialized in vhost_net_open()/vhost_dev_init()/vhost_poll_init() Remove vhost_net_flush_vq() and call vhost_work_dev_flush() directly. Do the flushes only once instead of several flush calls in a row which seems rather useless. Signed-off-by: Andrey Ryabinin [drop vhost_dev forward declaration in vhost.h] Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-4-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/net.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e582e56f83ac7d..040ac1b236dab4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1406,16 +1406,9 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } -static void vhost_net_flush_vq(struct vhost_net *n, int index) -{ - vhost_work_dev_flush(n->poll[index].dev); - vhost_work_dev_flush(n->vqs[index].vq.poll.dev); -} - static void vhost_net_flush(struct vhost_net *n) { - vhost_net_flush_vq(n, VHOST_NET_VQ_TX); - vhost_net_flush_vq(n, VHOST_NET_VQ_RX); + vhost_work_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; @@ -1605,7 +1598,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } if (oldsock) { - vhost_net_flush_vq(n, index); + vhost_work_dev_flush(&n->dev); sockfd_put(oldsock); } From 98837d1a950e885b7d3d34a4f56da5529d08fc9c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 17 May 2022 13:08:44 -0500 Subject: [PATCH 04/16] vhost: flush dev once during vhost_dev_stop When vhost_work_dev_flush returns all work queued at that time will have completed. There is then no need to flush after every vhost_poll_stop call, and we can move the flush call to after the loop that stops the pollers. Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-3-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vhost.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 6e80572049e836..abf6cf27db14d7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -656,11 +656,11 @@ void vhost_dev_stop(struct vhost_dev *dev) int i; for (i = 0; i < dev->nvqs; ++i) { - if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { + if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) vhost_poll_stop(&dev->vqs[i]->poll); - vhost_work_dev_flush(dev->vqs[i]->poll.dev); - } } + + vhost_work_dev_flush(dev); } EXPORT_SYMBOL_GPL(vhost_dev_stop); From 82d314629f43caa102e033a19bf7ead50b7d87c7 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 May 2022 13:08:47 -0500 Subject: [PATCH 05/16] vhost_vsock: simplify vhost_vsock_flush() vhost_vsock_flush() calls vhost_work_dev_flush(vsock->vqs[i].poll.dev) before vhost_work_dev_flush(&vsock->dev). This seems pointless as vsock->vqs[i].poll.dev is the same as &vsock->dev and several flushes in a row doesn't do anything useful, one is just enough. Signed-off-by: Andrey Ryabinin Reviewed-by: Stefano Garzarella Signed-off-by: Mike Christie Acked-by: Jason Wang Message-Id: <20220517180850.198915-6-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vsock.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 1b6ca5f8c0b8da..1505cdc1a3f7ca 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -684,11 +684,6 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) static void vhost_vsock_flush(struct vhost_vsock *vsock) { - int i; - - for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) - if (vsock->vqs[i].handle_kick) - vhost_work_dev_flush(vsock->vqs[i].poll.dev); vhost_work_dev_flush(&vsock->dev); } From e6135a74f0e7341757c7d388a0c8d69c092b1951 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 17 May 2022 13:08:50 -0500 Subject: [PATCH 06/16] vhost: rename vhost_work_dev_flush This patch renames vhost_work_dev_flush to just vhost_dev_flush to reflect that it flushes everything on the device and that drivers don't know/care that polls are based on vhost_works. Drivers just flush the entire device and polls, and works for vhost-scsi management TMFs and IO net virtqueues, etc all are flushed. Signed-off-by: Mike Christie Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Message-Id: <20220517180850.198915-9-michael.christie@oracle.com> Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Leonid Komarianskyi --- drivers/vhost/net.c | 4 ++-- drivers/vhost/scsi.c | 4 ++-- drivers/vhost/test.c | 2 +- drivers/vhost/vhost.c | 10 +++++----- drivers/vhost/vhost.h | 2 +- drivers/vhost/vsock.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 040ac1b236dab4..923d27d4c6942b 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1408,7 +1408,7 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, static void vhost_net_flush(struct vhost_net *n) { - vhost_work_dev_flush(&n->dev); + vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; @@ -1598,7 +1598,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } if (oldsock) { - vhost_work_dev_flush(&n->dev); + vhost_dev_flush(&n->dev); sockfd_put(oldsock); } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 4b70519dcae752..fe7570c1b6e9b2 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1470,8 +1470,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) /* Flush both the vhost poll and vhost work */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) vhost_scsi_flush_vq(vs, i); - vhost_work_dev_flush(&vs->dev); - vhost_work_dev_flush(&vs->dev); + vhost_dev_flush(&vs->dev); + vhost_dev_flush(&vs->dev); /* Wait for all reqs issued before the flush to be finished */ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 1a8ab1d8cb1cf3..39e71f431d8810 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -146,7 +146,7 @@ static void vhost_test_stop(struct vhost_test *n, void **privatep) static void vhost_test_flush_vq(struct vhost_test *n, int index) { - vhost_work_dev_flush(n->vqs[index].poll.dev); + vhost_dev_flush(n->vqs[index].poll.dev); } static void vhost_test_flush(struct vhost_test *n) diff --git 
a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index abf6cf27db14d7..afc696d9554f80 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,7 +231,7 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); -void vhost_work_dev_flush(struct vhost_dev *dev) +void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; @@ -243,7 +243,7 @@ void vhost_work_dev_flush(struct vhost_dev *dev) wait_for_completion(&flush.wait_event); } } -EXPORT_SYMBOL_GPL(vhost_work_dev_flush); +EXPORT_SYMBOL_GPL(vhost_dev_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { @@ -533,7 +533,7 @@ static int vhost_attach_cgroups(struct vhost_dev *dev) attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); vhost_work_queue(dev, &attach.work); - vhost_work_dev_flush(dev); + vhost_dev_flush(dev); return attach.ret; } @@ -660,7 +660,7 @@ void vhost_dev_stop(struct vhost_dev *dev) vhost_poll_stop(&dev->vqs[i]->poll); } - vhost_work_dev_flush(dev); + vhost_dev_flush(dev); } EXPORT_SYMBOL_GPL(vhost_dev_stop); @@ -1712,7 +1712,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) - vhost_work_dev_flush(vq->poll.dev); + vhost_dev_flush(vq->poll.dev); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 99332dca9edc79..7bb1cbd545933d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -45,7 +45,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); -void vhost_work_dev_flush(struct vhost_dev *dev); +void vhost_dev_flush(struct vhost_dev *dev); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); struct vhost_log { diff 
--git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 1505cdc1a3f7ca..8c2569743f6c9a 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -684,7 +684,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) static void vhost_vsock_flush(struct vhost_vsock *vsock) { - vhost_work_dev_flush(&vsock->dev); + vhost_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) From 0ed463080a1b6c9610f38908e82c9f731e06061d Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Tue, 3 Sep 2024 15:38:44 +0300 Subject: [PATCH 07/16] drivers/vhost: vhost-blk accelerator for virtio-blk guests Although QEMU virtio is quite fast, there is still some room for improvements. Disk latency can be reduced if we handle virtio-blk requests in the host kernel instead of passing them to QEMU. The patch adds a vhost-blk kernel module to do so. Some test setups: fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128 QEMU drive options: cache=none filesystem: xfs SSD: | randread, IOPS | randwrite, IOPS | Host | 95.8k | 85.3k | QEMU virtio | 57.5k | 79.4k | QEMU vhost-blk | 95.6k | 84.3k | RAMDISK (vq == vcpu = numjobs): | randread, IOPS | randwrite, IOPS | virtio, 1vcpu | 133k | 133k | virtio, 2vcpu | 305k | 306k | virtio, 4vcpu | 310k | 298k | vhost-blk, 1vcpu | 110k | 113k | vhost-blk, 2vcpu | 247k | 252k | vhost-blk, 4vcpu | 558k | 556k | v2: - removed unused VHOST_BLK_VQ - reworked bio handling a bit: now add all pages from a single iov into bio until it is full instead of allocating one bio per page - changed sector incrementation calculation - check move_iovec() in vhost_blk_req_handle() - remove snprintf check and better check ret from copy_to_iter for VIRTIO_BLK_ID_BYTES requests - discard vq request if vhost_blk_req_handle() returned negative code - forbid to change nonzero backend in vhost_blk_set_backend(). First of all, QEMU sets backend only once. 
Also if we want to change the backend when we are already running requests, we need to be much more careful in vhost_blk_handle_guest_kick() as it is not taking any references. If userspace wants to change the backend that badly, it can always reset the device. - removed EXPERIMENTAL from Kconfig Signed-off-by: Andrey Zhadchenko --- drivers/vhost/Kconfig | 12 + drivers/vhost/Makefile | 3 + drivers/vhost/blk.c | 820 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 5 + 4 files changed, 840 insertions(+) create mode 100644 drivers/vhost/blk.c diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 5602241ac9737f..7c79e25b45f210 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -95,4 +95,16 @@ config VHOST_CROSS_ENDIAN_LEGACY If unsure, say "N". +config VHOST_BLK + tristate "Host kernel accelerator for virtio-blk" + depends on BLOCK && EVENTFD + select VHOST + default n + help + This kernel module can be loaded in host kernel to accelerate + guest vm with virtio-blk driver. + + To compile this driver as a module, choose M here: the module will + be called vhost_blk. + endif diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 52c1a8e37f19bf..d08c8dde89aa82 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -18,5 +18,8 @@ obj-$(CONFIG_VHOST) += vhost.o obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o vhost_iotlb-y := iotlb.o +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o +vhost_blk-y := blk.o + obj-$(CONFIG_VHOST_XEN) += vhost_xen.o vhost_xen-y := xen.o diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c new file mode 100644 index 00000000000000..e0c23c7cb5c2df --- /dev/null +++ b/drivers/vhost/blk.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2011 Taobao, Inc. + * Author: Liu Yuan + * + * Copyright (C) 2012 Red Hat, Inc. + * Author: Asias He + * + * Copyright (c) 2022 Virtuozzo International GmbH. + * Author: Andrey Zhadchenko + * + * virtio-blk host kernel accelerator. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +enum { + VHOST_BLK_FEATURES = VHOST_FEATURES | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | + (1ULL << VIRTIO_BLK_F_MQ) | + (1ULL << VIRTIO_BLK_F_FLUSH), +}; + +/* + * Max number of bytes transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others. + */ +#define VHOST_DEV_WEIGHT 0x80000 + +/* + * Max number of packets transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others with + * pkts. + */ +#define VHOST_DEV_PKT_WEIGHT 256 + +#define VHOST_BLK_VQ_MAX 16 + +#define VHOST_MAX_METADATA_IOV 1 + +#define VHOST_BLK_SECTOR_BITS 9 +#define VHOST_BLK_SECTOR_SIZE (1 << VHOST_BLK_SECTOR_BITS) +#define VHOST_BLK_SECTOR_MASK (VHOST_BLK_SECTOR_SIZE - 1) + +struct req_page_list { + struct page **pages; + int pages_nr; +}; + +#define NR_INLINE 16 + +struct vhost_blk_req { + struct req_page_list inline_pl[NR_INLINE]; + struct page *inline_page[NR_INLINE]; + struct bio *inline_bio[NR_INLINE]; + struct req_page_list *pl; + int during_flush; + bool use_inline; + + struct llist_node llnode; + + struct vhost_blk *blk; + + struct iovec *iov; + int iov_nr; + + struct bio **bio; + atomic_t bio_nr; + + struct iovec status[VHOST_MAX_METADATA_IOV]; + + sector_t sector; + int bi_opf; + u16 head; + long len; + int bio_err; + + struct vhost_blk_vq *blk_vq; +}; + +struct vhost_blk_vq { + struct vhost_virtqueue vq; + struct vhost_blk_req *req; + struct iovec iov[UIO_MAXIOV]; + struct llist_head llhead; + struct vhost_work work; +}; + +struct vhost_blk { + wait_queue_head_t flush_wait; + struct vhost_blk_vq vqs[VHOST_BLK_VQ_MAX]; + atomic_t req_inflight[2]; + spinlock_t flush_lock; + struct vhost_dev dev; + int during_flush; + struct file *backend; + int index; +}; + +static int gen; + +static int move_iovec(struct iovec *from, struct iovec *to, + 
size_t len, int iov_count_from, int iov_count_to) +{ + int moved_seg = 0, spent_seg = 0; + size_t size; + + while (len && spent_seg < iov_count_from && moved_seg < iov_count_to) { + if (from->iov_len == 0) { + ++from; + ++spent_seg; + continue; + } + size = min(from->iov_len, len); + to->iov_base = from->iov_base; + to->iov_len = size; + from->iov_len -= size; + from->iov_base += size; + len -= size; + ++from; + ++to; + ++moved_seg; + ++spent_seg; + } + + return len ? -1 : moved_seg; +} + +static inline int iov_num_pages(struct iovec *iov) +{ + return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT; +} + +static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status) +{ + struct iov_iter iter; + int ret; + + iov_iter_init(&iter, WRITE, req->status, ARRAY_SIZE(req->status), sizeof(status)); + ret = copy_to_iter(&status, sizeof(status), &iter); + if (ret != sizeof(status)) { + vq_err(&req->blk_vq->vq, "Failed to write status\n"); + return -EFAULT; + } + + return 0; +} + +static void vhost_blk_req_done(struct bio *bio) +{ + struct vhost_blk_req *req = bio->bi_private; + struct vhost_blk *blk = req->blk; + + req->bio_err = blk_status_to_errno(bio->bi_status); + + if (atomic_dec_and_test(&req->bio_nr)) { + llist_add(&req->llnode, &req->blk_vq->llhead); + vhost_work_queue(&blk->dev, &req->blk_vq->work); + } + + bio_put(bio); +} + +static void vhost_blk_req_umap(struct vhost_blk_req *req) +{ + struct req_page_list *pl; + int i, j; + + if (req->pl) { + for (i = 0; i < req->iov_nr; i++) { + pl = &req->pl[i]; + + for (j = 0; j < pl->pages_nr; j++) { + if (!req->bi_opf) + set_page_dirty_lock(pl->pages[j]); + put_page(pl->pages[j]); + } + } + } + + if (!req->use_inline) + kfree(req->pl); +} + +static int vhost_blk_bio_make_simple(struct vhost_blk_req *req, + struct block_device *bdev) +{ + struct bio *bio; + + req->use_inline = true; + req->pl = NULL; + req->bio = req->inline_bio; + + bio = 
bio_alloc(req->bi_opf, GFP_KERNEL); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_sector = req->sector; + bio->bi_private = req; + bio->bi_end_io = vhost_blk_req_done; + req->bio[0] = bio; + + atomic_set(&req->bio_nr, 1); + + return 0; +} + +static struct page **vhost_blk_prepare_req(struct vhost_blk_req *req, + int total_pages, int iov_nr) +{ + int pl_len, page_len, bio_len; + void *buf; + + req->use_inline = false; + pl_len = iov_nr * sizeof(req->pl[0]); + page_len = total_pages * sizeof(struct page *); + bio_len = total_pages * sizeof(struct bio *); + + buf = kmalloc(pl_len + page_len + bio_len, GFP_KERNEL); + if (!buf) + return NULL; + + req->pl = buf; + req->bio = buf + pl_len + page_len; + + return buf + pl_len; +} + +static int vhost_blk_bio_make(struct vhost_blk_req *req, + struct block_device *bdev) +{ + int pages_nr_total, i, j, ret; + struct iovec *iov = req->iov; + int iov_nr = req->iov_nr; + struct page **pages, *page; + struct bio *bio = NULL; + int bio_nr = 0; + + if (unlikely(req->bi_opf == REQ_OP_FLUSH)) + return vhost_blk_bio_make_simple(req, bdev); + + pages_nr_total = 0; + for (i = 0; i < iov_nr; i++) + pages_nr_total += iov_num_pages(&iov[i]); + + if (pages_nr_total > NR_INLINE) { + pages = vhost_blk_prepare_req(req, pages_nr_total, iov_nr); + if (!pages) + return -ENOMEM; + } else { + req->use_inline = true; + req->pl = req->inline_pl; + pages = req->inline_page; + req->bio = req->inline_bio; + } + + req->iov_nr = 0; + for (i = 0; i < iov_nr; i++) { + int pages_nr = iov_num_pages(&iov[i]); + unsigned long iov_base, iov_len; + struct req_page_list *pl; + + iov_base = (unsigned long)iov[i].iov_base; + iov_len = (unsigned long)iov[i].iov_len; + + ret = get_user_pages_fast(iov_base, pages_nr, + !req->bi_opf, pages); + if (ret != pages_nr) + goto fail; + + req->iov_nr++; + pl = &req->pl[i]; + pl->pages_nr = pages_nr; + pl->pages = pages; + + for (j = 0; j < pages_nr; j++) { + unsigned int off, len, pos; + + page = pages[j]; + off = iov_base & 
~PAGE_MASK; + len = PAGE_SIZE - off; + if (len > iov_len) + len = iov_len; + + while (!bio || !bio_add_page(bio, page, len, off)) { + bio = bio_alloc(req->bi_opf, GFP_KERNEL); + if (!bio) + goto fail; + bio->bi_iter.bi_sector = req->sector; + bio->bi_private = req; + bio->bi_end_io = vhost_blk_req_done; + req->bio[bio_nr++] = bio; + } + + iov_base += len; + iov_len -= len; + + pos = (iov_base & VHOST_BLK_SECTOR_MASK) + iov_len; + req->sector += pos >> VHOST_BLK_SECTOR_BITS; + } + + pages += pages_nr; + } + atomic_set(&req->bio_nr, bio_nr); + return 0; + +fail: + for (i = 0; i < bio_nr; i++) + bio_put(req->bio[i]); + vhost_blk_req_umap(req); + return -ENOMEM; +} + +static inline void vhost_blk_bio_send(struct vhost_blk_req *req) +{ + struct blk_plug plug; + int i, bio_nr; + + bio_nr = atomic_read(&req->bio_nr); + blk_start_plug(&plug); + for (i = 0; i < bio_nr; i++) + submit_bio(req->bio[i]); + + blk_finish_plug(&plug); +} + +static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file) +{ + + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = I_BDEV(inode); + int ret; + + ret = vhost_blk_bio_make(req, bdev); + if (ret < 0) + return ret; + + vhost_blk_bio_send(req); + + spin_lock(&req->blk->flush_lock); + req->during_flush = req->blk->during_flush; + atomic_inc(&req->blk->req_inflight[req->during_flush]); + spin_unlock(&req->blk->flush_lock); + + return ret; +} + +static int vhost_blk_req_handle(struct vhost_virtqueue *vq, + struct virtio_blk_outhdr *hdr, + u16 head, u16 total_iov_nr, + struct file *file) +{ + struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev); + struct vhost_blk_vq *blk_vq = container_of(vq, struct vhost_blk_vq, vq); + unsigned char id[VIRTIO_BLK_ID_BYTES]; + struct vhost_blk_req *req; + struct iov_iter iter; + int ret, len; + u8 status; + + req = &blk_vq->req[head]; + req->blk_vq = blk_vq; + req->head = head; + req->blk = blk; + req->sector = hdr->sector; + req->iov = blk_vq->iov; + + 
req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status); + req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr, + ARRAY_SIZE(blk_vq->iov)); + + ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr, + ARRAY_SIZE(req->status)); + if (ret < 0 || req->iov_nr < 0) + return -EINVAL; + + switch (hdr->type) { + case VIRTIO_BLK_T_OUT: + req->bi_opf = REQ_OP_WRITE; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_IN: + req->bi_opf = REQ_OP_READ; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_FLUSH: + req->bi_opf = REQ_OP_FLUSH; + ret = vhost_blk_req_submit(req, file); + break; + case VIRTIO_BLK_T_GET_ID: + len = snprintf(id, VIRTIO_BLK_ID_BYTES, "vhost-blk%d", blk->index); + iov_iter_init(&iter, WRITE, req->iov, req->iov_nr, req->len); + ret = copy_to_iter(id, len, &iter); + status = ret != len ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + ret = vhost_blk_set_status(req, status); + if (ret) + break; + vhost_add_used_and_signal(&blk->dev, vq, head, len); + break; + default: + vq_err(vq, "Unsupported request type %d\n", hdr->type); + status = VIRTIO_BLK_S_UNSUPP; + ret = vhost_blk_set_status(req, status); + if (ret) + break; + vhost_add_used_and_signal(&blk->dev, vq, head, 0); + } + + return ret; +} + +static void vhost_blk_handle_guest_kick(struct vhost_work *work) +{ + struct virtio_blk_outhdr hdr; + struct vhost_blk_vq *blk_vq; + struct vhost_virtqueue *vq; + struct iovec hdr_iovec[VHOST_MAX_METADATA_IOV]; + struct vhost_blk *blk; + struct iov_iter iter; + int in, out, ret; + struct file *f; + u16 head; + + vq = container_of(work, struct vhost_virtqueue, poll.work); + blk = container_of(vq->dev, struct vhost_blk, dev); + blk_vq = container_of(vq, struct vhost_blk_vq, vq); + + f = vhost_vq_get_backend(vq); + if (!f) + return; + + vhost_disable_notify(&blk->dev, vq); + for (;;) { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (unlikely(head < 0)) + break; 
+ + if (unlikely(head == vq->num)) { + if (unlikely(vhost_enable_notify(&blk->dev, vq))) { + vhost_disable_notify(&blk->dev, vq); + continue; + } + break; + } + + ret = move_iovec(vq->iov, hdr_iovec, sizeof(hdr), in + out, ARRAY_SIZE(hdr_iovec)); + if (ret < 0) { + vq_err(vq, "virtio_blk_hdr is too split!"); + vhost_discard_vq_desc(vq, 1); + break; + } + + iov_iter_init(&iter, READ, hdr_iovec, ARRAY_SIZE(hdr_iovec), sizeof(hdr)); + ret = copy_from_iter(&hdr, sizeof(hdr), &iter); + if (ret != sizeof(hdr)) { + vq_err(vq, "Failed to get block header: read %d bytes instead of %ld!\n", + ret, sizeof(hdr)); + vhost_discard_vq_desc(vq, 1); + break; + } + + if (vhost_blk_req_handle(vq, &hdr, head, out + in, f) < 0) { + vhost_discard_vq_desc(vq, 1); + break; + } + + if (!llist_empty(&blk_vq->llhead)) { + vhost_poll_queue(&vq->poll); + break; + } + } +} + +static void vhost_blk_handle_host_kick(struct vhost_work *work) +{ + struct vhost_blk_vq *blk_vq; + struct vhost_virtqueue *vq; + struct vhost_blk_req *req; + struct llist_node *llnode; + struct vhost_blk *blk = NULL; + bool added, zero; + u8 status; + int ret; + + blk_vq = container_of(work, struct vhost_blk_vq, work); + vq = &blk_vq->vq; + llnode = llist_del_all(&blk_vq->llhead); + added = false; + while (llnode) { + req = llist_entry(llnode, struct vhost_blk_req, llnode); + llnode = llist_next(llnode); + + if (!blk) + blk = req->blk; + + vhost_blk_req_umap(req); + + status = req->bio_err == 0 ? 
VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + ret = vhost_blk_set_status(req, status); + if (unlikely(ret)) + continue; + + vhost_add_used(vq, req->head, req->len); + added = true; + + spin_lock(&req->blk->flush_lock); + zero = atomic_dec_and_test( + &req->blk->req_inflight[req->during_flush]); + if (zero && !req->during_flush) + wake_up(&blk->flush_wait); + spin_unlock(&req->blk->flush_lock); + + } + + if (likely(added)) + vhost_signal(&blk->dev, vq); +} + +static void vhost_blk_flush(struct vhost_blk *blk) +{ + spin_lock(&blk->flush_lock); + blk->during_flush = 1; + spin_unlock(&blk->flush_lock); + + vhost_dev_flush(&blk->dev); + /* + * Wait until requests fired before the flush to be finished + * req_inflight[0] is used to track the requests fired before the flush + * req_inflight[1] is used to track the requests fired during the flush + */ + wait_event(blk->flush_wait, !atomic_read(&blk->req_inflight[0])); + + spin_lock(&blk->flush_lock); + blk->during_flush = 0; + spin_unlock(&blk->flush_lock); +} + +static inline void vhost_blk_drop_backends(struct vhost_blk *blk) +{ + struct vhost_virtqueue *vq; + int i; + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + vq = &blk->vqs[i].vq; + + mutex_lock(&vq->mutex); + vhost_vq_set_backend(vq, NULL); + mutex_unlock(&vq->mutex); + } +} + +static int vhost_blk_open(struct inode *inode, struct file *file) +{ + struct vhost_blk *blk; + struct vhost_virtqueue **vqs; + int ret = 0, i = 0; + + blk = kvzalloc(sizeof(*blk), GFP_KERNEL); + if (!blk) { + ret = -ENOMEM; + goto out; + } + + vqs = kcalloc(VHOST_BLK_VQ_MAX, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + ret = -ENOMEM; + goto out_blk; + } + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + blk->vqs[i].vq.handle_kick = vhost_blk_handle_guest_kick; + vqs[i] = &blk->vqs[i].vq; + } + + blk->index = gen++; + + atomic_set(&blk->req_inflight[0], 0); + atomic_set(&blk->req_inflight[1], 0); + blk->during_flush = 0; + spin_lock_init(&blk->flush_lock); + init_waitqueue_head(&blk->flush_wait); + + 
vhost_dev_init(&blk->dev, vqs, VHOST_BLK_VQ_MAX, UIO_MAXIOV, + VHOST_DEV_WEIGHT, VHOST_DEV_PKT_WEIGHT, true, NULL); + file->private_data = blk; + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) + vhost_work_init(&blk->vqs[i].work, vhost_blk_handle_host_kick); + + return ret; +out_blk: + kvfree(blk); +out: + return ret; +} + +static int vhost_blk_release(struct inode *inode, struct file *f) +{ + struct vhost_blk *blk = f->private_data; + int i; + + vhost_blk_drop_backends(blk); + vhost_blk_flush(blk); + vhost_dev_stop(&blk->dev); + if (blk->backend) + fput(blk->backend); + vhost_dev_cleanup(&blk->dev); + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) + kvfree(blk->vqs[i].req); + kfree(blk->dev.vqs); + kvfree(blk); + + return 0; +} + +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + mutex_lock(&blk->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&blk->dev)) { + mutex_unlock(&blk->dev.mutex); + return -EFAULT; + } + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + vq = &blk->vqs[i].vq; + mutex_lock(&vq->mutex); + vq->acked_features = features & (VHOST_BLK_FEATURES); + mutex_unlock(&vq->mutex); + } + + vhost_blk_flush(blk); + mutex_unlock(&blk->dev.mutex); + + return 0; +} + +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd) +{ + struct vhost_virtqueue *vq; + struct file *file; + struct inode *inode; + int ret, i; + + mutex_lock(&blk->dev.mutex); + ret = vhost_dev_check_owner(&blk->dev); + if (ret) + goto out_dev; + + if (blk->backend) { + ret = -EBUSY; + goto out_dev; + } + + file = fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto out_dev; + } + + inode = file->f_mapping->host; + if (!S_ISBLK(inode->i_mode)) { + ret = -EFAULT; + goto out_file; + } + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + vq = &blk->vqs[i].vq; + if (!vhost_vq_access_ok(vq)) { + ret = -EFAULT; + goto out_drop; + } + + mutex_lock(&vq->mutex); + vhost_vq_set_backend(vq, file); + ret = 
vhost_vq_init_access(vq); + mutex_unlock(&vq->mutex); + } + + blk->backend = file; + + mutex_unlock(&blk->dev.mutex); + return 0; + +out_drop: + vhost_blk_drop_backends(blk); +out_file: + fput(file); +out_dev: + mutex_unlock(&blk->dev.mutex); + return ret; +} + +static long vhost_blk_reset_owner(struct vhost_blk *blk) +{ + struct vhost_iotlb *umem; + int err, i; + + mutex_lock(&blk->dev.mutex); + err = vhost_dev_check_owner(&blk->dev); + if (err) + goto done; + umem = vhost_dev_reset_owner_prepare(); + if (!umem) { + err = -ENOMEM; + goto done; + } + vhost_blk_drop_backends(blk); + if (blk->backend) { + fput(blk->backend); + blk->backend = NULL; + } + vhost_blk_flush(blk); + vhost_dev_stop(&blk->dev); + vhost_dev_reset_owner(&blk->dev, umem); + + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) { + kvfree(blk->vqs[i].req); + blk->vqs[i].req = NULL; + } + +done: + mutex_unlock(&blk->dev.mutex); + return err; +} + +static int vhost_blk_setup(struct vhost_blk *blk, void __user *argp) +{ + struct vhost_vring_state s; + + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + + if (blk->vqs[s.index].req) + return 0; + + blk->vqs[s.index].req = kvmalloc(sizeof(struct vhost_blk_req) * s.num, GFP_KERNEL); + if (!blk->vqs[s.index].req) + return -ENOMEM; + + return 0; +} + +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_blk *blk = f->private_data; + void __user *argp = (void __user *)arg; + struct vhost_vring_file backend; + u64 __user *featurep = argp; + u64 features; + int ret; + + switch (ioctl) { + case VHOST_BLK_SET_BACKEND: + if (copy_from_user(&backend, argp, sizeof(backend))) + return -EFAULT; + return vhost_blk_set_backend(blk, backend.fd); + case VHOST_GET_FEATURES: + features = VHOST_BLK_FEATURES; + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + if (features & 
~VHOST_BLK_FEATURES) + return -EOPNOTSUPP; + return vhost_blk_set_features(blk, features); + case VHOST_RESET_OWNER: + return vhost_blk_reset_owner(blk); + default: + mutex_lock(&blk->dev.mutex); + ret = vhost_dev_ioctl(&blk->dev, ioctl, argp); + if (ret == -ENOIOCTLCMD) + ret = vhost_vring_ioctl(&blk->dev, ioctl, argp); + if (!ret && ioctl == VHOST_SET_VRING_NUM) + ret = vhost_blk_setup(blk, argp); + vhost_blk_flush(blk); + mutex_unlock(&blk->dev.mutex); + return ret; + } +} + +static const struct file_operations vhost_blk_fops = { + .owner = THIS_MODULE, + .open = vhost_blk_open, + .release = vhost_blk_release, + .llseek = noop_llseek, + .unlocked_ioctl = vhost_blk_ioctl, +}; + +static struct miscdevice vhost_blk_misc = { + MISC_DYNAMIC_MINOR, + "vhost-blk", + &vhost_blk_fops, +}; +module_misc_device(vhost_blk_misc); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Andrey Zhadchenko"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk"); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index c998860d7bbc43..13caf114bcdea5 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -150,4 +150,9 @@ /* Get the valid iova range */ #define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ struct vhost_vdpa_iova_range) + +/* VHOST_BLK specific defines */ +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0xFF, \ + struct vhost_vring_file) + #endif From 1963c3d8818cd64c023c9195ade3b5d1e8e14fa7 Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Wed, 4 Sep 2024 17:56:50 +0300 Subject: [PATCH 08/16] drivers/vhost: use array to store workers We want to support several vhost workers. The first step is to rework vhost to use array of workers rather than single pointer. Update creation and cleanup routines. 
Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 75 +++++++++++++++++++++++++++++++------------ drivers/vhost/vhost.h | 10 +++++- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index afc696d9554f80..e19603f2f37e5e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -231,11 +231,24 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); +static void vhost_work_queue_at_worker(struct vhost_worker *w, + struct vhost_work *work) +{ + if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { + /* We can only add the work to the list after we're + * sure it was not in the list. + * test_and_set_bit() implies a memory barrier. + */ + llist_add(&work->node, &w->work_list); + wake_up_process(w->worker); + } +} + void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_flush_struct flush; - if (dev->worker) { + if (dev->workers[0].worker) { init_completion(&flush.wait_event); vhost_work_init(&flush.work, vhost_flush_work); @@ -247,17 +260,12 @@ EXPORT_SYMBOL_GPL(vhost_dev_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { - if (!dev->worker) + struct vhost_worker *w = &dev->workers[0]; + + if (!w->worker) return; - if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { - /* We can only add the work to the list after we're - * sure it was not in the list. - * test_and_set_bit() implies a memory barrier. 
- */ - llist_add(&work->node, &dev->work_list); - wake_up_process(dev->worker); - } + vhost_work_queue_at_worker(w, work); } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -333,9 +341,29 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } +static void vhost_worker_reset(struct vhost_worker *w) +{ + init_llist_head(&w->work_list); + w->worker = NULL; +} + +void vhost_cleanup_workers(struct vhost_dev *dev) +{ + int i; + + for (i = 0; i < dev->nworkers; ++i) { + WARN_ON(!llist_empty(&dev->workers[i].work_list)); + kthread_stop(dev->workers[i].worker); + vhost_worker_reset(&dev->workers[i]); + } + + dev->nworkers = 0; +} + static int vhost_worker(void *data) { - struct vhost_dev *dev = data; + struct vhost_worker *w = data; + struct vhost_dev *dev = w->dev; struct vhost_work *work, *work_next; struct llist_node *node; @@ -350,7 +378,7 @@ static int vhost_worker(void *data) break; } - node = llist_del_all(&dev->work_list); + node = llist_del_all(&w->work_list); if (!node) schedule(); @@ -473,7 +501,6 @@ void vhost_dev_init(struct vhost_dev *dev, dev->umem = NULL; dev->iotlb = NULL; dev->mm = NULL; - dev->worker = NULL; dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; @@ -485,6 +512,11 @@ void vhost_dev_init(struct vhost_dev *dev, INIT_LIST_HEAD(&dev->pending_list); spin_lock_init(&dev->iotlb_lock); + dev->nworkers = 0; + for (i = 0; i < VHOST_MAX_WORKERS; ++i) { + dev->workers[i].dev = dev; + vhost_worker_reset(&dev->workers[i]); + } for (i = 0; i < dev->nvqs; ++i) { vq = dev->vqs[i]; @@ -597,7 +629,8 @@ long vhost_dev_set_owner(struct vhost_dev *dev) goto err_worker; } - dev->worker = worker; + dev->workers[0].worker = worker; + dev->nworkers = 1; wake_up_process(worker); /* avoid contributing to loadavg */ err = vhost_attach_cgroups(dev); @@ -611,9 +644,10 @@ long vhost_dev_set_owner(struct vhost_dev *dev) return 0; err_cgroup: - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; + 
dev->nworkers = 0; + if (dev->workers[0].worker) { + kthread_stop(dev->workers[0].worker); + dev->workers[0].worker = NULL; } err_worker: vhost_detach_mm(dev); @@ -699,6 +733,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) vhost_xen_unmap_desc_all(dev->vqs[i]); #endif } + vhost_dev_free_iovecs(dev); if (dev->log_ctx) eventfd_ctx_put(dev->log_ctx); @@ -710,10 +745,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev) dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); - WARN_ON(!llist_empty(&dev->work_list)); - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; + if (dev->use_worker) { + vhost_cleanup_workers(dev); dev->kcov_handle = 0; } vhost_detach_mm(dev); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 7bb1cbd545933d..3e5135cfa71e17 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -25,6 +25,13 @@ struct vhost_work { unsigned long flags; }; +#define VHOST_MAX_WORKERS 4 +struct vhost_worker { + struct task_struct *worker; + struct llist_head work_list; + struct vhost_dev *dev; +}; + /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. 
*/ struct vhost_poll { @@ -158,7 +165,8 @@ struct vhost_dev { int nvqs; struct eventfd_ctx *log_ctx; struct llist_head work_list; - struct task_struct *worker; + struct vhost_worker workers[VHOST_MAX_WORKERS]; + int nworkers; struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; From d2eaad2f0ae5c1528b60bdfe0caf03d39ab07ce3 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:32 +0300 Subject: [PATCH 09/16] drivers/vhost: adjust vhost to flush all workers Make vhost_dev_flush support several workers and flush them simultaneously Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e19603f2f37e5e..95265e123df5c7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -246,15 +246,19 @@ static void vhost_work_queue_at_worker(struct vhost_worker *w, void vhost_dev_flush(struct vhost_dev *dev) { - struct vhost_flush_struct flush; + struct vhost_flush_struct flush[VHOST_MAX_WORKERS]; + int i, nworkers; - if (dev->workers[0].worker) { - init_completion(&flush.wait_event); - vhost_work_init(&flush.work, vhost_flush_work); + nworkers = READ_ONCE(dev->nworkers); - vhost_work_queue(dev, &flush.work); - wait_for_completion(&flush.wait_event); + for (i = 0; i < nworkers; i++) { + init_completion(&flush[i].wait_event); + vhost_work_init(&flush[i].work, vhost_flush_work); + vhost_work_queue_at_worker(&dev->workers[i], &flush[i].work); } + + for (i = 0; i < nworkers; i++) + wait_for_completion(&flush[i].wait_event); } EXPORT_SYMBOL_GPL(vhost_dev_flush); From b75716158153e1daccf827826ab331099daa0e64 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:33 +0300 Subject: [PATCH 10/16] drivers/vhost: rework cgroups attachment to be worker aware Rework vhost_attach_cgroups to manipulate specified worker. 
Implement vhost_worker_flush as we need to flush specific worker. Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 95265e123df5c7..0151287015be3f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -262,6 +262,16 @@ void vhost_dev_flush(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_flush); +static void vhost_worker_flush(struct vhost_worker *w) +{ + struct vhost_flush_struct flush; + + init_completion(&flush.wait_event); + vhost_work_init(&flush.work, vhost_flush_work); + vhost_work_queue_at_worker(w, &flush.work); + wait_for_completion(&flush.wait_event); +} + void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { struct vhost_worker *w = &dev->workers[0]; @@ -562,14 +572,14 @@ static void vhost_attach_cgroups_work(struct vhost_work *work) s->ret = cgroup_attach_task_all(s->owner, current); } -static int vhost_attach_cgroups(struct vhost_dev *dev) +static int vhost_worker_attach_cgroups(struct vhost_worker *w) { struct vhost_attach_cgroups_struct attach; attach.owner = current; vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_dev_flush(dev); + vhost_work_queue_at_worker(w, &attach.work); + vhost_worker_flush(w); return attach.ret; } @@ -637,7 +647,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) dev->nworkers = 1; wake_up_process(worker); /* avoid contributing to loadavg */ - err = vhost_attach_cgroups(dev); + err = vhost_worker_attach_cgroups(&dev->workers[0]); if (err) goto err_cgroup; } From 3b10e1e054ca220882189def3058134810e5c77f Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:34 +0300 Subject: [PATCH 11/16] drivers/vhost: rework worker creation Add function to create a vhost worker and add it into the device. 
Rework vhost_dev_set_owner Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 64 +++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0151287015be3f..cd7b1e0a7044b8 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -620,53 +620,65 @@ static void vhost_detach_mm(struct vhost_dev *dev) dev->mm = NULL; } +static int vhost_add_worker(struct vhost_dev *dev) +{ + struct vhost_worker *w = &dev->workers[dev->nworkers]; + struct task_struct *worker; + int err; + + if (dev->nworkers == VHOST_MAX_WORKERS) + return -E2BIG; + + worker = kthread_create(vhost_worker, w, + "vhost-%d-%d", current->pid, dev->nworkers); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + w->worker = worker; + wake_up_process(worker); /* avoid contributing to loadavg */ + + err = vhost_worker_attach_cgroups(w); + if (err) + goto cleanup; + + dev->nworkers++; + return 0; + +cleanup: + kthread_stop(worker); + w->worker = NULL; + + return err; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { - struct task_struct *worker; int err; /* Is there an owner already? 
*/ - if (vhost_dev_has_owner(dev)) { - err = -EBUSY; - goto err_mm; - } + if (vhost_dev_has_owner(dev)) + return -EBUSY; vhost_attach_mm(dev); dev->kcov_handle = kcov_common_handle(); if (dev->use_worker) { - worker = kthread_create(vhost_worker, dev, - "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; - } - - dev->workers[0].worker = worker; - dev->nworkers = 1; - wake_up_process(worker); /* avoid contributing to loadavg */ - - err = vhost_worker_attach_cgroups(&dev->workers[0]); + err = vhost_add_worker(dev); if (err) - goto err_cgroup; + goto err_mm; } err = vhost_dev_alloc_iovecs(dev); if (err) - goto err_cgroup; + goto err_worker; return 0; -err_cgroup: - dev->nworkers = 0; - if (dev->workers[0].worker) { - kthread_stop(dev->workers[0].worker); - dev->workers[0].worker = NULL; - } err_worker: + vhost_cleanup_workers(dev); +err_mm: vhost_detach_mm(dev); dev->kcov_handle = 0; -err_mm: return err; } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); From e1b61966f669412cc49970955ef75b737ac0525d Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:35 +0300 Subject: [PATCH 12/16] drivers/vhost: add ioctl to increase the number of workers Finally add ioctl to allow userspace to create additional workers For now only allow to increase the number of workers Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 32 +++++++++++++++++++++++++++++++- include/uapi/linux/vhost.h | 9 +++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cd7b1e0a7044b8..0b9444f561ba24 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -651,6 +651,25 @@ static int vhost_add_worker(struct vhost_dev *dev) return err; } +static int vhost_set_workers(struct vhost_dev *dev, int n) +{ + int i, ret; + + if (n > dev->nvqs) + n = dev->nvqs; + + if (n > VHOST_MAX_WORKERS) + n = VHOST_MAX_WORKERS; + + for (i = 0; i < n - dev->nworkers ; i++) { + ret = 
vhost_add_worker(dev); + if (ret) + break; + } + + return ret; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -1809,7 +1828,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) struct eventfd_ctx *ctx; u64 p; long r; - int i, fd; + int i, fd, n; /* If you are not the owner, you can become one */ if (ioctl == VHOST_SET_OWNER) { @@ -1866,6 +1885,17 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) if (ctx) eventfd_ctx_put(ctx); break; + case VHOST_SET_NWORKERS: + r = get_user(n, (int __user *)argp); + if (r < 0) + break; + if (n < d->nworkers) { + r = -EINVAL; + break; + } + + r = vhost_set_workers(d, n); + break; default: r = -ENOIOCTLCMD; break; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 13caf114bcdea5..d6d87f6315f60a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -71,6 +71,15 @@ #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Set number of vhost workers + * Currently the number of vhost workers can only be increased. + * All workers are freed upon reset. + * If the value is too big it is silently truncated to the maximum number of + * supported vhost workers + * Even if an error is returned it is possible that some workers were created + */ +#define VHOST_SET_NWORKERS _IOW(VHOST_VIRTIO, 0x1F, int) + /* The following ioctls use eventfd file descriptors to signal and poll * for events. */ From 3fd4c2ab791fae94a3325d6fdbbfb1b9647c90cf Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Thu, 5 Sep 2024 13:09:11 +0300 Subject: [PATCH 13/16] drivers/vhost: assign workers to virtqueues Add worker pointer to every virtqueue.
Add routine to assign workers to virtqueues and call it after any worker creation Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 14 ++++++++++++++ drivers/vhost/vhost.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0b9444f561ba24..0603ce710d1944 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -353,6 +353,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->iotlb = NULL; vhost_vring_call_reset(&vq->call_ctx); __vhost_vq_meta_reset(vq); + vq->worker = NULL; } static void vhost_worker_reset(struct vhost_worker *w) @@ -670,6 +671,17 @@ static int vhost_set_workers(struct vhost_dev *dev, int n) return ret; } +static void vhost_assign_workers(struct vhost_dev *dev) +{ + int i, j = 0; + + for (i = 0; i < dev->nvqs; i++) { + dev->vqs[i]->worker = &dev->workers[j]; + if (++j == dev->nworkers) + j = 0; + } +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -692,6 +704,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) if (err) goto err_worker; + vhost_assign_workers(dev); return 0; err_worker: vhost_cleanup_workers(dev); @@ -1895,6 +1908,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) } r = vhost_set_workers(d, n); + vhost_assign_workers(d); break; default: r = -ENOIOCTLCMD; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 3e5135cfa71e17..325d9faaa6d29b 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -138,6 +138,7 @@ struct vhost_virtqueue { bool user_be; #endif u32 busyloop_timeout; + struct vhost_worker *worker; #ifdef CONFIG_VHOST_XEN /* From 6a5b722f6aa099def13221de8c8e72e0d20334ff Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Thu, 5 Sep 2024 13:12:06 +0300 Subject: [PATCH 14/16] drivers/vhost: add API to queue work at virtqueue's worker Add routines to queue works on virtqueue assigned workers Signed-off-by: Andrey Zhadchenko ---
drivers/vhost/vhost.c | 22 ++++++++++++++++++++++ drivers/vhost/vhost.h | 5 +++++ 2 files changed, 27 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 0603ce710d1944..2d3b2cbcbf1616 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -272,6 +272,17 @@ static void vhost_worker_flush(struct vhost_worker *w) wait_for_completion(&flush.wait_event); } +void vhost_work_flush_vq(struct vhost_virtqueue *vq) +{ + struct vhost_worker *w = READ_ONCE(vq->worker); + + if (!w) + return; + + vhost_worker_flush(w); +} +EXPORT_SYMBOL_GPL(vhost_work_flush_vq); + void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { struct vhost_worker *w = &dev->workers[0]; @@ -283,6 +294,17 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } EXPORT_SYMBOL_GPL(vhost_work_queue); +void vhost_work_vqueue(struct vhost_virtqueue *vq, struct vhost_work *work) +{ + struct vhost_worker *w = READ_ONCE(vq->worker); + + if (!w) + return; + + vhost_work_queue_at_worker(w, work); +} +EXPORT_SYMBOL_GPL(vhost_work_vqueue); + /* A lockless hint for busy polling code to exit the loop */ bool vhost_has_work(struct vhost_dev *dev) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 325d9faaa6d29b..35397c945c711f 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -150,6 +150,11 @@ struct vhost_virtqueue { #endif }; +/* Queue the work on virtqueue assigned worker */ +void vhost_work_vqueue(struct vhost_virtqueue *vq, struct vhost_work *work); +/* Flush virtqueue assigned worker */ +void vhost_work_flush_vq(struct vhost_virtqueue *vq); + struct vhost_msg_node { union { struct vhost_msg msg; From b4f7f8355f2cffdc8aa7316514c77d14915abd9f Mon Sep 17 00:00:00 2001 From: Leonid Komarianskyi Date: Thu, 5 Sep 2024 18:19:13 +0300 Subject: [PATCH 15/16] drivers/vhost: allow polls to be bound to workers via vqs Allow vhost polls to be associated with vqs so we can queue them on assigned workers. 
If polls are not associated with specific vqs queue them on the first virtqueue. Signed-off-by: Andrey Zhadchenko --- drivers/vhost/vhost.c | 24 ++++++++++++++++-------- drivers/vhost/vhost.h | 4 +++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 2d3b2cbcbf1616..f3941ee74affbb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -170,7 +170,7 @@ static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, if (!(key_to_poll(key) & poll->mask)) return 0; - if (!poll->dev->use_worker) + if (!poll->vq->dev->use_worker) work->fn(work); else vhost_poll_queue(poll); @@ -185,19 +185,27 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) } EXPORT_SYMBOL_GPL(vhost_work_init); -/* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev) +{ + vhost_poll_init_vq(poll, fn, mask, dev->vqs[0]); +} +EXPORT_SYMBOL_GPL(vhost_poll_init); + + +/* Init poll structure */ +void vhost_poll_init_vq(struct vhost_poll *poll, vhost_work_fn_t fn, + __poll_t mask, struct vhost_virtqueue *vq) { init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; - poll->dev = dev; + poll->vq = vq; poll->wqh = NULL; vhost_work_init(&poll->work, fn); } -EXPORT_SYMBOL_GPL(vhost_poll_init); +EXPORT_SYMBOL_GPL(vhost_poll_init_vq); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. 
*/ @@ -314,7 +322,7 @@ EXPORT_SYMBOL_GPL(vhost_has_work); void vhost_poll_queue(struct vhost_poll *poll) { - vhost_work_queue(poll->dev, &poll->work); + vhost_work_vqueue(poll->vq, &poll->work); } EXPORT_SYMBOL_GPL(vhost_poll_queue); @@ -567,8 +575,8 @@ void vhost_dev_init(struct vhost_dev *dev, #endif vhost_vq_reset(dev, vq); if (vq->handle_kick) - vhost_poll_init(&vq->poll, vq->handle_kick, - EPOLLIN, dev); + vhost_poll_init_vq(&vq->poll, vq->handle_kick, + EPOLLIN, vq); } } EXPORT_SYMBOL_GPL(vhost_dev_init); @@ -1825,7 +1833,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg mutex_unlock(&vq->mutex); if (pollstop && vq->handle_kick) - vhost_dev_flush(vq->poll.dev); + vhost_dev_flush(d); return r; } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 35397c945c711f..4ee85a9ce4732c 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -40,7 +40,7 @@ struct vhost_poll { wait_queue_entry_t wait; struct vhost_work work; __poll_t mask; - struct vhost_dev *dev; + struct vhost_virtqueue *vq; }; void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); @@ -49,6 +49,8 @@ bool vhost_has_work(struct vhost_dev *dev); void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, __poll_t mask, struct vhost_dev *dev); +void vhost_poll_init_vq(struct vhost_poll *poll, vhost_work_fn_t fn, + __poll_t mask, struct vhost_virtqueue *vq); int vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_queue(struct vhost_poll *poll); From b72aa3c8fb05555908744943f101f3ee5fabbf3a Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 13 Oct 2022 18:18:39 +0300 Subject: [PATCH 16/16] drivers/vhost: queue vhost_blk works at vq workers Update vhost_blk to queue works on virtqueue workers. Together with previous changes this allows us to split virtio blk requests across several threads. 
| randread, IOPS | randwrite, IOPS | 8vcpu, 1 kernel worker | 576k | 575k | 8vcpu, 2 kernel workers | 803k | 779k | Signed-off-by: Andrey Zhadchenko --- drivers/vhost/blk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c index e0c23c7cb5c2df..dfb6579a441229 100644 --- a/drivers/vhost/blk.c +++ b/drivers/vhost/blk.c @@ -161,13 +161,12 @@ static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status) static void vhost_blk_req_done(struct bio *bio) { struct vhost_blk_req *req = bio->bi_private; - struct vhost_blk *blk = req->blk; req->bio_err = blk_status_to_errno(bio->bi_status); if (atomic_dec_and_test(&req->bio_nr)) { llist_add(&req->llnode, &req->blk_vq->llhead); - vhost_work_queue(&blk->dev, &req->blk_vq->work); + vhost_work_vqueue(&req->blk_vq->vq, &req->blk_vq->work); } bio_put(bio);