I went ahead and implemented 2 new hooks on the existing nbdkit_plugin struct for async_pread and async_pwrite to get some testable numbers.

The test is set up with an all in memory block device (data being copied to and from a std vector as simulated reading and writing).
Every read and write op has a 64ms wait before completing (simulated device latency).
These tests are using nbd-client local to the nbd-server with a unix domain socket for communication
The device has an ext4 filesystem on it mkfs.ext4 -b 4096 /dev/nbd0

Baseline test with unmodified nbdkit

dd if=/dev/zero of=./mnt/zeros bs=128k count=80 conv=fsync
80+0 records in
80+0 records out
10485760 bytes (10 MB, 10 MiB) copied, 5.40797 s, 1.9 MB/s

echo 1 | sudo tee /proc/sys/vm/drop_caches

time cat mnt/zeros > /dev/null
real    0m5.386s
user    0m0.004s
sys    0m0.000s

Read and Write performance are both around 2MB/s which is exactly a single 128k read or write every 64ms.

With nbdkit modified to use async_pread and async_pwrite and a buffer pool of 64 buffers (each 128k for a total of 8MB buffer memory)

dd if=/dev/zero of=./mnt/zeros bs=128k count=8000 conv=fsync
8000+0 records in
8000+0 records out
1048576000 bytes (1.0 GB, 1000 MiB) copied, 8.7736 s, 120 MB/s

echo 1 | sudo tee /proc/sys/vm/drop_caches

time cat mnt/zeros > /dev/null
real    0m8.153s
user    0m0.000s
sys    0m0.320s

Read and Write performance are now 120MB/s (about 64x faster) because we can process 64 ops in parallel. Our throughput scaled nearly linearly without needing 64 threads in the nbdkit. Total memory for the buffers is 8MB.

With async_pread/async_pwrite and a buffer pool of 1024 buffers (128MB buffer memory) and 2 io service threads (3 threads total, 1 thread in nbdkit pulling requests off the socket)

dd if=/dev/zero of=./mnt/zeros bs=128k count=80000 conv=fsync
80000+0 records in
80000+0 records out
10485760000 bytes (10 GB, 9.8 GiB) copied, 5.86029 s, 1.8 GB/s

echo 1 | sudo tee /proc/sys/vm/drop_caches

time cat mnt/zeros > /dev/null
real    0m12.545s
user    0m0.012s
sys    0m2.444s

Read performance was capped at around 825 MiB/s for 1 file sequential read but when performing 2 files in parallel the read throughput was 1.6GB/s and for 4 files in parallel 1.9GB/s.

Below is the patch for changes made to nbdkit to support this.

diff --git a/include/nbdkit-plugin.h b/include/nbdkit-plugin.h
index 95cba8d..2e88cad 100644
--- a/include/nbdkit-plugin.h
+++ b/include/nbdkit-plugin.h
@@ -38,15 +38,13 @@
 
 #include <stdarg.h>
 #include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include <stdbool.h>
 
 #define NBDKIT_THREAD_MODEL_SERIALIZE_CONNECTIONS     0
 #define NBDKIT_THREAD_MODEL_SERIALIZE_ALL_REQUESTS    1
 #define NBDKIT_THREAD_MODEL_SERIALIZE_REQUESTS        2
 #define NBDKIT_THREAD_MODEL_PARALLEL                  3
+#define NBDKIT_THREAD_MODEL_ASYNC                     4
 
 #define NBDKIT_API_VERSION                            1
 
@@ -94,28 +92,35 @@ struct nbdkit_plugin {
 
   int errno_is_preserved;
 
+  int (*async_pread) (void *conn, uint64_t reqid, bool flush, void *handle, void *buf, uint32_t count, uint64_t offset);
+  int (*async_pwrite) (void *conn, uint64_t reqid, bool flush, void *handle, const void *buf, uint32_t count, uint64_t offset);
   /* int (*set_exportname) (void *handle, const char *exportname); */
 };
 
-extern void nbdkit_set_error (int err);
-extern void nbdkit_error (const char *msg, ...)
-  __attribute__((format (printf, 1, 2)));
-extern void nbdkit_verror (const char *msg, va_list args);
-extern void nbdkit_debug (const char *msg, ...)
-  __attribute__((format (printf, 1, 2)));
-extern void nbdkit_vdebug (const char *msg, va_list args);
-
-extern char *nbdkit_absolute_path (const char *path);
-extern int64_t nbdkit_parse_size (const char *str);
-
 #ifdef __cplusplus
-#define NBDKIT_CXX_LANG_C extern "C"
+#define NBDKIT_CXX_LANG_C "C"
 #else
 #define NBDKIT_CXX_LANG_C /* nothing */
 #endif
 
+extern NBDKIT_CXX_LANG_C void nbdkit_set_error (int err);
+extern NBDKIT_CXX_LANG_C void nbdkit_error (const char *msg, ...)
+  __attribute__((format (printf, 1, 2)));
+extern NBDKIT_CXX_LANG_C void nbdkit_verror (const char *msg, va_list args);
+extern NBDKIT_CXX_LANG_C void nbdkit_debug (const char *msg, ...)
+  __attribute__((format (printf, 1, 2)));
+extern NBDKIT_CXX_LANG_C void nbdkit_vdebug (const char *msg, va_list args);
+
+extern NBDKIT_CXX_LANG_C char *nbdkit_absolute_path (const char *path);
+extern NBDKIT_CXX_LANG_C int64_t nbdkit_parse_size (const char *str);
+
+extern NBDKIT_CXX_LANG_C int nbdkit_async_reply (void *conn, uint64_t reqid);
+extern NBDKIT_CXX_LANG_C int nbdkit_async_reply_read (void *conn, uint64_t reqid, uint32_t count, void *buf);
+extern NBDKIT_CXX_LANG_C int nbdkit_async_reply_error (void *conn, uint64_t reqid);
+
+
 #define NBDKIT_REGISTER_PLUGIN(plugin)                                  \
-  NBDKIT_CXX_LANG_C                                                     \
+  extern NBDKIT_CXX_LANG_C                                              \
   struct nbdkit_plugin *                                                \
   plugin_init (void)                                                    \
   {                                                                     \
@@ -125,8 +130,4 @@ extern int64_t nbdkit_parse_size (const char *str);
     return &(plugin);                                                   \
   }
 
-#ifdef __cplusplus
-}
-#endif
-
 #endif /* NBDKIT_PLUGIN_H */
diff --git a/src/connections.c b/src/connections.c
index a0d689a..e03a0f6 100644
--- a/src/connections.c
+++ b/src/connections.c
@@ -62,7 +62,8 @@
 static struct connection *new_connection (int sockin, int sockout);
 static void free_connection (struct connection *conn);
 static int negotiate_handshake (struct connection *conn);
-static int recv_request_send_reply (struct connection *conn);
+static int recv_request (struct connection *conn);
+static int send_reply (struct connection *conn, uint64_t handle, uint32_t count, void *buf, uint32_t error);
 
 static int
 _handle_single_connection (int sockin, int sockout)
@@ -86,7 +87,7 @@ _handle_single_connection (int sockin, int sockout)
    * a thread pool.
    */
   while (!quit) {
-    r = recv_request_send_reply (conn);
+    r = recv_request (conn);
     if (r == -1)
       goto err;
     if (r == 0)
@@ -127,6 +128,7 @@ new_connection (int sockin, int sockout)
   conn->sockin = sockin;
   conn->sockout = sockout;
   pthread_mutex_init (&conn->request_lock, NULL);
+  pthread_mutex_init (&conn->reply_lock, NULL);
 
   return conn;
 }
@@ -143,6 +145,7 @@ free_connection (struct connection *conn)
     close (conn->sockout);
 
   pthread_mutex_destroy (&conn->request_lock);
+  pthread_mutex_destroy (&conn->reply_lock);
 
   if (conn->handle)
     plugin_close (conn);
@@ -626,13 +629,13 @@ get_error (struct connection *conn)
  * On read/write errors, sets *error appropriately and returns 0.
  */
 static int
-_handle_request (struct connection *conn,
+_handle_request (struct connection *conn, uint64_t handle,
                  uint32_t cmd, uint32_t flags, uint64_t offset, uint32_t count,
-                 void *buf,
-                 uint32_t *error)
+                 void *buf)
 {
-  bool flush_after_command;
   int r;
+  uint32_t error = 0;
+  bool flush_after_command;
 
   /* Flush after command performed? */
   flush_after_command = (flags & NBD_CMD_FLAG_FUA) != 0;
@@ -645,42 +648,54 @@ _handle_request (struct connection *conn,
 
   switch (cmd) {
   case NBD_CMD_READ:
-    r = plugin_pread (conn, buf, count, offset);
+    if (plugin_can_async_read (conn)) {
+      r = plugin_async_pread (conn, handle, flush_after_command, buf, count, offset);
+      if (r == 0)
+        return 0; // plugin now has responsibility of sending response
+    }
+    else
+      r = plugin_pread (conn, buf, count, offset);
     if (r == -1) {
-      *error = get_error (conn);
-      return 0;
+      error = get_error (conn);
+      return send_reply (conn, handle, 0, NULL, error);
     }
     break;
 
   case NBD_CMD_WRITE:
-    r = plugin_pwrite (conn, buf, count, offset);
+    if (plugin_can_async_write (conn)) {
+      r = plugin_async_pwrite (conn, handle, flush_after_command, buf, count, offset);
+      if (r == 0)
+        return 0; // plugin now has responsibility of sending response
+    }
+    else
+      r = plugin_pwrite (conn, buf, count, offset);
     if (r == -1) {
-      *error = get_error (conn);
-      return 0;
+      error = get_error (conn);
+      return send_reply (conn, handle, 0, NULL, error);
     }
     break;
 
   case NBD_CMD_FLUSH:
     r = plugin_flush (conn);
     if (r == -1) {
-      *error = get_error (conn);
-      return 0;
+      error = get_error (conn);
+      return send_reply (conn, handle, 0, NULL, error);
     }
     break;
 
   case NBD_CMD_TRIM:
     r = plugin_trim (conn, count, offset);
     if (r == -1) {
-      *error = get_error (conn);
-      return 0;
+      error = get_error (conn);
+      return send_reply (conn, handle, 0, NULL, error);
     }
     break;
 
   case NBD_CMD_WRITE_ZEROES:
     r = plugin_zero (conn, count, offset, !(flags & NBD_CMD_FLAG_NO_HOLE));
     if (r == -1) {
-      *error = get_error (conn);
-      return 0;
+      error = get_error (conn);
+      return send_reply (conn, handle, 0, NULL, error);
     }
     break;
 
@@ -691,24 +706,28 @@ _handle_request (struct connection *conn,
   if (flush_after_command) {
     r = plugin_flush (conn);
     if (r == -1) {
-      *error = get_error (conn);
-      return 0;
+      error = get_error (conn);
+      return send_reply (conn, handle, 0, NULL, error);
     }
   }
 
-  return 0;
+  if (cmd == NBD_CMD_READ)
+    r = send_reply (conn, handle, count, buf, error);
+  else
+   r = send_reply (conn, handle, 0, NULL, error);
+
+  return r;
 }
 
 static int
-handle_request (struct connection *conn,
+handle_request (struct connection *conn, uint64_t handle,
                 uint32_t cmd, uint32_t flags, uint64_t offset, uint32_t count,
-                void *buf,
-                uint32_t *error)
+                void *buf)
 {
   int r;
 
   plugin_lock_request (conn);
-  r = _handle_request (conn, cmd, flags, offset, count, buf, error);
+  r = _handle_request (conn, handle, cmd, flags, offset, count, buf);
   plugin_unlock_request (conn);
 
   return r;
@@ -763,11 +782,10 @@ nbd_errno (int error)
 }
 
 static int
-recv_request_send_reply (struct connection *conn)
+recv_request (struct connection *conn)
 {
   int r;
   struct request request;
-  struct reply reply;
   uint32_t magic, cmd, flags, count, error = 0;
   uint64_t offset;
   CLEANUP_FREE char *buf = NULL;
@@ -808,7 +826,10 @@ recv_request_send_reply (struct connection *conn)
   if (r == 0) {                 /* request not valid */
     if (cmd == NBD_CMD_WRITE)
       skip_over_write_buffer (conn->sockin, count);
-    goto send_reply;
+    r = send_reply (conn, request.handle, 0, NULL, error);
+    if (r == -1)
+      return -1;
+    return 1;
   }
 
   /* Allocate the data buffer used for either read or write requests. */
@@ -819,7 +840,9 @@ recv_request_send_reply (struct connection *conn)
       error = ENOMEM;
       if (cmd == NBD_CMD_WRITE)
         skip_over_write_buffer (conn->sockin, count);
-      goto send_reply;
+      r = send_reply (conn, request.handle, 0, NULL, error);
+      if (r == -1)
+        return -1;
     }
   }
 
@@ -837,14 +860,20 @@ recv_request_send_reply (struct connection *conn)
   }
 
   /* Perform the request.  Only this part happens inside the request lock. */
-  r = handle_request (conn, cmd, flags, offset, count, buf, &error);
+  r = handle_request (conn, request.handle, cmd, flags, offset, count, buf);
   if (r == -1)
     return -1;
 
-  /* Send the reply packet. */
- send_reply:
+  return 1;                     /* command processed ok */
+}
+
+static int
+_send_reply (struct connection *conn, uint64_t handle, uint32_t count, void *buf, uint32_t error)
+{
+  int r;
+  struct reply reply;
   reply.magic = htobe32 (NBD_REPLY_MAGIC);
-  reply.handle = request.handle;
+  reply.handle = handle;
   reply.error = htobe32 (nbd_errno (error));
 
   if (error != 0) {
@@ -862,8 +891,7 @@ recv_request_send_reply (struct connection *conn)
     return -1;
   }
 
-  /* Send the read data buffer. */
-  if (cmd == NBD_CMD_READ) {
+  if (error == 0 && buf != NULL) { /* Send the read data buffer. */
     r = xwrite (conn->sockout, buf, count);
     if (r == -1) {
       nbdkit_error ("write data: %m");
@@ -871,5 +899,37 @@ recv_request_send_reply (struct connection *conn)
     }
   }
 
-  return 1;                     /* command processed ok */
+  return 0;
+}
+
+static int
+send_reply (struct connection *conn, uint64_t handle, uint32_t count, void *buf, uint32_t error)
+{
+  int r;
+
+  plugin_lock_reply (conn);
+  r = _send_reply (conn, handle, count, buf, error);
+  plugin_unlock_reply (conn);
+
+  return r;
+}
+
+int
+nbdkit_async_reply (void *conn, uint64_t reqid)
+{
+  return send_reply (conn, reqid, 0, NULL, 0);
+}
+
+int
+nbdkit_async_reply_read (void *conn, uint64_t reqid, uint32_t count, void *buf)
+{
+
+  return send_reply (conn, reqid, count, buf, 0);
+}
+
+int
+nbdkit_async_reply_error (void *conn, uint64_t reqid)
+{
+  uint32_t error = get_error (conn);
+  return send_reply (conn, reqid, 0, NULL, error);
 }
diff --git a/src/internal.h b/src/internal.h
index e73edf1..93d32e9 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -114,6 +114,7 @@ extern void cleanup_free (void *ptr);
 struct connection {
   int sockin, sockout;
   pthread_mutex_t request_lock;
+  pthread_mutex_t reply_lock;
   void *handle;
   uint64_t exportsize;
   int readonly;
@@ -140,6 +141,8 @@ extern void plugin_lock_connection (void);
 extern void plugin_unlock_connection (void);
 extern void plugin_lock_request (struct connection *conn);
 extern void plugin_unlock_request (struct connection *conn);
+extern void plugin_lock_reply (struct connection *conn);
+extern void plugin_unlock_reply (struct connection *conn);
 extern int plugin_errno_is_preserved (void);
 extern int plugin_open (struct connection *conn, int readonly);
 extern void plugin_close (struct connection *conn);
@@ -148,8 +151,12 @@ extern int plugin_can_write (struct connection *conn);
 extern int plugin_can_flush (struct connection *conn);
 extern int plugin_is_rotational (struct connection *conn);
 extern int plugin_can_trim (struct connection *conn);
+extern int plugin_can_async_read (struct connection *conn);
 extern int plugin_pread (struct connection *conn, void *buf, uint32_t count, uint64_t offset);
+extern int plugin_async_pread (struct connection *conn, uint64_t handle, bool flush, void *buf, uint32_t count, uint64_t offset);
+extern int plugin_can_async_write (struct connection *conn);
 extern int plugin_pwrite (struct connection *conn, void *buf, uint32_t count, uint64_t offset);
+extern int plugin_async_pwrite (struct connection *conn, uint64_t handle, bool flush, void *buf, uint32_t count, uint64_t offset);
 extern int plugin_flush (struct connection *conn);
 extern int plugin_trim (struct connection *conn, uint32_t count, uint64_t offset);
 extern int plugin_zero (struct connection *conn, uint32_t count, uint64_t offset, int may_trim);
diff --git a/src/plugins.c b/src/plugins.c
index eeed8a9..9da30db 100644
--- a/src/plugins.c
+++ b/src/plugins.c
@@ -121,8 +121,9 @@ plugin_register (const char *_filename,
              program_name, filename);
     exit (EXIT_FAILURE);
   }
-  if (plugin.pread == NULL) {
-    fprintf (stderr, "%s: %s: plugin must have a .pread callback\n",
+  if (plugin.pread == NULL &&
+     (plugin._thread_model != NBDKIT_THREAD_MODEL_ASYNC || plugin.async_pread == NULL)) {
+    fprintf (stderr, "%s: %s: plugin must have either a .pread or .async_pread callback\n",
              program_name, filename);
     exit (EXIT_FAILURE);
   }
@@ -231,6 +232,9 @@ plugin_dump_fields (void)
   case NBDKIT_THREAD_MODEL_PARALLEL:
     printf ("parallel");
     break;
+  case NBDKIT_THREAD_MODEL_ASYNC:
+    printf ("async");
+    break;
   default:
     printf ("%d # unknown thread model!", plugin._thread_model);
     break;
@@ -258,6 +262,8 @@ plugin_dump_fields (void)
   HAS (flush);
   HAS (trim);
   HAS (zero);
+  HAS (async_pread);
+  HAS (async_pwrite);
 #undef HAS
 }
 
@@ -350,6 +356,28 @@ plugin_unlock_request (struct connection *conn)
   }
 }
 
+void
+plugin_lock_reply (struct connection *conn)
+{
+  assert (dl);
+
+  if (plugin._thread_model >= NBDKIT_THREAD_MODEL_PARALLEL) {
+    debug ("acquire per-connection reply lock");
+    pthread_mutex_lock (&conn->reply_lock);
+  }
+}
+
+void
+plugin_unlock_reply (struct connection *conn)
+{
+  assert (dl);
+
+  if (plugin._thread_model >= NBDKIT_THREAD_MODEL_PARALLEL) {
+    debug ("release per-connection reply lock");
+    pthread_mutex_unlock (&conn->reply_lock);
+  }
+}
+
 int
 plugin_errno_is_preserved (void)
 {
@@ -414,7 +442,8 @@ plugin_can_write (struct connection *conn)
   if (plugin.can_write)
     return plugin.can_write (conn->handle);
   else
-    return plugin.pwrite != NULL;
+    return plugin.pwrite != NULL ||
+           (plugin._thread_model == NBDKIT_THREAD_MODEL_ASYNC && plugin.async_pwrite != NULL);
 }
 
 int
@@ -460,6 +489,16 @@ plugin_can_trim (struct connection *conn)
 }
 
 int
+plugin_can_async_read (struct connection *conn)
+{
+  assert (dl);
+  assert (conn->handle);
+
+  return ((plugin._thread_model == NBDKIT_THREAD_MODEL_ASYNC) &&
+          (plugin.async_pread != NULL));
+}
+
+int
 plugin_pread (struct connection *conn,
               void *buf, uint32_t count, uint64_t offset)
 {
@@ -473,6 +512,27 @@ plugin_pread (struct connection *conn,
 }
 
 int
+plugin_async_pread (struct connection *conn, uint64_t handle, bool flush,
+                    void *buf, uint32_t count, uint64_t offset)
+{
+  assert (dl);
+  assert (conn->handle);
+
+  debug ("async_pread count=%" PRIu32 " offset=%" PRIu64, count, offset);
+
+  return plugin.async_pread (conn, handle, flush, conn->handle, buf, count, offset);
+}
+
+int
+plugin_can_async_write (struct connection *conn)
+{
+  assert (dl);
+  assert (conn->handle);
+
+  return (plugin._thread_model == NBDKIT_THREAD_MODEL_ASYNC) && (plugin.async_pwrite != NULL);
+}
+
+int
 plugin_pwrite (struct connection *conn,
                void *buf, uint32_t count, uint64_t offset)
 {
@@ -490,23 +550,44 @@ plugin_pwrite (struct connection *conn,
 }
 
 int
+plugin_async_pwrite (struct connection *conn, uint64_t handle, bool flush,
+                     void *buf, uint32_t count, uint64_t offset)
+{
+  assert (dl);
+  assert (conn->handle);
+
+  debug ("async_pwrite count=%" PRIu32 " offset=%" PRIu64, count, offset);
+
+  if (plugin.async_pwrite != NULL)
+    return plugin.async_pwrite (conn, handle, flush, conn->handle, buf, count, offset);
+  else {
+    errno = EROFS;
+    return -1;
+  }
+}
+
+int
 plugin_flush (struct connection *conn)
 {
+  int r;
   assert (dl);
   assert (conn->handle);
 
   debug ("flush");
 
   if (plugin.flush != NULL)
-    return plugin.flush (conn->handle);
+    r = plugin.flush (conn->handle);
   else {
     errno = EINVAL;
-    return -1;
+    r = -1;
   }
+
+  return r;
 }
 
 int
-plugin_trim (struct connection *conn, uint32_t count, uint64_t offset)
+plugin_trim (struct connection *conn,
+             uint32_t count, uint64_t offset)
 {
   assert (dl);
   assert (conn->handle);