[PATCH nbdkit] Experiment with parallel python plugin

Thursday, 6 August 2020

This is a quick hack to experiment with the parallel threading model in
the python plugin.

Changes:

- Use aligned buffers to make it possible to use O_DIRECT. Using
  parallel I/O does not buy us much when using buffered I/O.  pwrite()
  copies data to the page cache, and pread() reads data from the page
  cache.

- Disable extents in the file plugin. This way we can compare it with
  the python file example.

- Implement flush in the file example.

With these changes, I could compare the file plugin with the new python
file example, and it seems that the parallel threading model works
nicely, and we get similar performance for the case of a fully allocated
image.

I created a test image using:

$ virt-builder fedora-32 -o /var/tmp/fedora-32.raw --root-password=password:root

And a fully allocated test image using:

$ fallocate --length 6g /var/tmp/disk.raw
$ dd if=/var/tmp/fedora-32.raw bs=8M of=/var/tmp/disk.raw iflag=direct oflag=direct
conv=fsync,notrunc

$ qemu-img map --output json /var/tmp/disk.raw
[{ "start": 0, "length": 6442450944, "depth": 0,
"zero": false, "data": true, "offset": 0}]

For reference, copying this image with dd using direct I/O:

$ dd if=/var/tmp/disk.raw bs=2M of=/dev/shm/disk.raw iflag=direct conv=fsync
status=progress
6442450944 bytes (6.4 GB, 6.0 GiB) copied, 10.4783 s, 615 MB/s

Copying the same image with qemu-img convert, disabling zero detection,
using different numbers of coroutines:

$ time qemu-img convert -f raw -O raw -T none -S0 -m1 -W /var/tmp/disk.raw
/dev/shm/disk.raw

real	0m11.527s
user	0m0.102s
sys	0m2.330s

$ time qemu-img convert -f raw -O raw -T none -S0 -m2 -W /var/tmp/disk.raw
/dev/shm/disk.raw

real	0m5.971s
user	0m0.080s
sys	0m2.749s

$ time qemu-img convert -f raw -O raw -T none -S0 -m4 -W /var/tmp/disk.raw
/dev/shm/disk.raw

real	0m3.674s
user	0m0.071s
sys	0m3.140s

$ time qemu-img convert -f raw -O raw -T none -S0 -m8 -W /var/tmp/disk.raw
/dev/shm/disk.raw

real	0m3.408s
user	0m0.069s
sys	0m3.813s

$ time qemu-img convert -f raw -O raw -T none -S0 -m16 -W /var/tmp/disk.raw
/dev/shm/disk.raw

real	0m3.305s
user	0m0.054s
sys	0m3.767s

Same with the modified file plugin, using direct I/O and without
extents:

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m12.167s
user	0m5.798s
sys	0m2.477s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m7.981s
user	0m5.204s
sys	0m2.740s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m6.568s
user	0m4.996s
sys	0m3.167s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m6.493s
user	0m4.950s
sys	0m3.492s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m6.138s
user	0m4.621s
sys	0m3.550s

Finally, same with the python file example:

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m12.398s
user	0m6.652s
sys	0m2.484s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m8.169s
user	0m5.418s
sys	0m2.736s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m6.419s
user	0m4.891s
sys	0m3.103s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m6.610s
user	0m5.115s
sys	0m3.377s

$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw

real	0m6.093s
user	0m4.520s
sys	0m3.567s

I think this shows that the parallel threading model works for the python
plugin as well as for the file plugin.
---
 plugins/file/file.c             |  4 ++--
 plugins/python/examples/file.py |  5 ++++-
 server/plugins.c                | 20 ++++++++++++++------
 server/threadlocal.c            |  7 +++++--
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/plugins/file/file.c b/plugins/file/file.c
index dc99f992..27316b9f 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -170,7 +170,7 @@ file_open (int readonly)
     return NULL;
   }
 
-  flags = O_CLOEXEC|O_NOCTTY;
+  flags = O_CLOEXEC|O_NOCTTY|O_DIRECT;
   if (readonly)
     flags |= O_RDONLY;
   else
@@ -551,7 +551,7 @@ file_can_extents (void *handle)
     nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m");
     return 0;
   }
-  return 1;
+  return 0;
 }
 
 static int
diff --git a/plugins/python/examples/file.py b/plugins/python/examples/file.py
index 866b8244..3652eb52 100644
--- a/plugins/python/examples/file.py
+++ b/plugins/python/examples/file.py
@@ -49,7 +49,7 @@ def open(readonly):
         flags = os.O_RDONLY
     else:
         flags = os.O_RDWR
-    fd = os.open(filename, flags)
+    fd = os.open(filename, flags | os.O_DIRECT)
     return { 'fd': fd }
 
 def get_size(h):
@@ -65,3 +65,6 @@ def pwrite(h, buf, offset, flags):
     n = os.pwritev(h['fd'], [buf], offset)
     if n != len(buf):
         raise RuntimeError("short write")
+
+def flush(h, flags):
+    os.fsync(h['fd'])
diff --git a/server/plugins.c b/server/plugins.c
index d4364cd2..ce4700a3 100644
--- a/server/plugins.c
+++ b/server/plugins.c
@@ -631,6 +631,8 @@ plugin_zero (struct backend *b, void *handle,
   bool fast_zero = flags & NBDKIT_FLAG_FAST_ZERO;
   bool emulate = false;
   bool need_flush = false;
+  void *zero_buffer = NULL;
+  int buffer_size = MIN (MAX_REQUEST_SIZE, count);
 
   if (fua && backend_can_fua (b) != NBDKIT_FUA_NATIVE) {
     flags &= ~NBDKIT_FLAG_FUA;
@@ -669,19 +671,25 @@ plugin_zero (struct backend *b, void *handle,
   threadlocal_set_error (0);
   *err = 0;
 
+  *err = posix_memalign(&zero_buffer, 4096, buffer_size);
+  if (*err != 0) {
+      r = -1;
+      goto done;
+  }
+
+  memset(zero_buffer, 0, buffer_size);
+
   while (count) {
-    /* Always contains zeroes, but we can't use const or else gcc 9
-     * will use .rodata instead of .bss and inflate the binary size.
-     */
-    static /* const */ char buf[MAX_REQUEST_SIZE];
-    uint32_t limit = MIN (count, sizeof buf);
+    uint32_t limit = MIN (count, buffer_size);
 
-    r = plugin_pwrite (b, handle, buf, limit, offset, flags, err);
+    r = plugin_pwrite (b, handle, zero_buffer, limit, offset, flags, err);
     if (r == -1)
       break;
     count -= limit;
   }
 
+  free(zero_buffer);
+
  done:
   if (r != -1 && need_flush)
     r = plugin_flush (b, handle, 0, err);
diff --git a/server/threadlocal.c b/server/threadlocal.c
index 90230028..04c82842 100644
--- a/server/threadlocal.c
+++ b/server/threadlocal.c
@@ -195,13 +195,16 @@ threadlocal_buffer (size_t size)
 
   if (threadlocal->buffer_size < size) {
     void *ptr;
+    int err;
 
-    ptr = realloc (threadlocal->buffer, size);
-    if (ptr == NULL) {
+    err = posix_memalign (&ptr, 4096, size);
+    if (err != 0) {
       nbdkit_error ("threadlocal_buffer: realloc: %m");
       return NULL;
     }
+
     memset (ptr, 0, size);
+    free(threadlocal->buffer);
     threadlocal->buffer = ptr;
     threadlocal->buffer_size = size;
   }
-- 
2.25.4


    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009