We already added a .can_fua callback internally; it is now time to
expose it to the filters, and to update the particular filters that
can perform more efficient FUA when subdividing requests.
Note that this is a change to the filter API; but given that we've
already bumped the API in a previous patch, we can group all of the
API changes within the same release without bumping the #define
value in this patch.
For the cow filter, emulate FUA requests locally (arguably, as long as
we are serializing all requests and the overlay file is temporary,
waiting for data to hit the disk is pointless and we could treat FUA
as a no-op, but for now we keep the fdatasync() call that matches what
we would have had with a client calling flush instead of using FUA).
For the cache and blocksize filters, optimize whether the FUA flag is
passed on to the next layer according to what style of FUA the next
layer implements. However, in the emulate case, it was simpler to
just always do a final flush, compared to figuring out how to send
the FUA flag on just the final sub-request.
Signed-off-by: Eric Blake <eblake(a)redhat.com>
---
docs/nbdkit-filter.pod | 46 +++++++++++++++++++++++++++++++++++++------
include/nbdkit-common.h | 4 ++++
include/nbdkit-filter.h | 3 +++
src/internal.h | 4 ----
src/filters.c | 16 +++++++++++++--
filters/blocksize/blocksize.c | 32 ++++++++++++++++++++++++------
filters/cache/cache.c | 25 ++++++++++++++++++++++-
filters/cow/cow.c | 21 ++++++++++++++++++++
filters/log/log.c | 6 +++---
9 files changed, 135 insertions(+), 22 deletions(-)
diff --git a/docs/nbdkit-filter.pod b/docs/nbdkit-filter.pod
index 6766216..d53daf4 100644
--- a/docs/nbdkit-filter.pod
+++ b/docs/nbdkit-filter.pod
@@ -379,6 +379,39 @@ error message and return C<-1>. Like the other initial
queries
documented above, caching the return value of this function is
allowed.
+=head2 C<.can_fua>
+
+ int (*can_fua) (struct nbdkit_next_ops *next_ops, void *nxdata,
+ void *handle);
+
+This controls how the Forced Unit Access flag will be handled; it is
+only relevant for connections that are not read-only. For now, this
+function has no counterpart in plugins, although it will be added.
+Unlike other feature functions with just two success states, this one
+returns three success values: C<NBDKIT_FUA_NONE> to avoid advertising
+FUA support to the client, C<NBDKIT_FUA_EMULATE> if FUA is emulated by
+nbdkit calling flush after a write transaction, and
+C<NBDKIT_FUA_NATIVE> if FUA semantics are handled by the plugin
+without the overhead of an extra flush from nbdkit.
+
+The difference between the two non-zero values may affect choices made
+in the filter: when splitting a write request that requested FUA from
+the client, native support should pass the FUA flag on to each
+sub-request; while if it is known that FUA is emulated by a flush, it
+is more efficient to only flush once after all sub-requests have
+completed (often by passing C<NBDKIT_FLAG_FUA> on to only the final
+sub-request, or by dropping the flag and ending with a direct call to
+C<next_ops-E<gt>flush>).
+
+If this callback is omitted, the default returned for the plugin layer
+is C<NBDKIT_FUA_EMULATE> if C<can_flush> returned true, otherwise it
+is C<NBDKIT_FUA_NONE>.
+
+If there is an error, the callback should call C<nbdkit_error> with an
+error message and return C<-1>. Like the other initial queries
+documented above, caching the return value of this function is
+allowed.
+
=head2 C<.pread>
int (*pread) (struct nbdkit_next_ops *next_ops, void *nxdata,
@@ -412,9 +445,9 @@ turn, the filter should not call C<next_ops-E<gt>pwrite>
if
C<next_ops-E<gt>can_write> did not return true.
The parameter C<flags> may include C<NBDKIT_FLAG_FUA> on input based
-on the result of C<.can_flush>. In turn, the filter should only pass
+on the result of C<.can_fua>. In turn, the filter should only pass
C<NBDKIT_FLAG_FUA> on to C<next_ops-E<gt>pwrite> if
-C<next_ops-E<gt>can_flush> returned true.
+C<next_ops-E<gt>can_fua> returned a positive value.
If there is an error (including a short write which couldn't be
recovered from), C<.pwrite> should call C<nbdkit_error> with an error
@@ -455,9 +488,9 @@ turn, the filter should not call C<next_ops-E<gt>trim> if
C<next_ops-E<gt>can_trim> did not return true.
The parameter C<flags> may include C<NBDKIT_FLAG_FUA> on input based
-on the result of C<.can_flush>. In turn, the filter should only pass
+on the result of C<.can_fua>. In turn, the filter should only pass
C<NBDKIT_FLAG_FUA> on to C<next_ops-E<gt>trim> if
-C<next_ops-E<gt>can_flush> returned true.
+C<next_ops-E<gt>can_fua> returned a positive value.
If there is an error, C<.trim> should call C<nbdkit_error> with an
error message B<and> return -1 with C<err> set to the positive errno
@@ -478,9 +511,10 @@ C<next_ops-E<gt>can_write> did not return true.
On input, the parameter C<flags> may include C<NBDKIT_FLAG_MAY_TRIM>
unconditionally, and C<NBDKIT_FLAG_FUA> based on the result of
-C<.can_flush>. In turn, the filter may pass C<NBDKIT_FLAG_MAY_TRIM>
+C<.can_fua>. In turn, the filter may pass C<NBDKIT_FLAG_MAY_TRIM>
unconditionally, but should only pass C<NBDKIT_FLAG_FUA> on to
-C<next_ops-E<gt>zero> if C<next_ops-E<gt>can_flush> returned
true.
+C<next_ops-E<gt>zero> if C<next_ops-E<gt>can_fua> returned a
positive
+value.
Note that unlike the plugin C<.zero> which is permitted to fail with
C<EOPNOTSUPP> to force a fallback to C<.pwrite>, the function
diff --git a/include/nbdkit-common.h b/include/nbdkit-common.h
index f32bf76..851b890 100644
--- a/include/nbdkit-common.h
+++ b/include/nbdkit-common.h
@@ -53,6 +53,10 @@ extern "C" {
#define NBDKIT_FLAG_MAY_TRIM (1<<0) /* Maps to !NBD_CMD_FLAG_NO_HOLE */
#define NBDKIT_FLAG_FUA (1<<1) /* Maps to NBD_CMD_FLAG_FUA */
+#define NBDKIT_FUA_NONE 0
+#define NBDKIT_FUA_EMULATE 1
+#define NBDKIT_FUA_NATIVE 2
+
extern void nbdkit_error (const char *msg, ...)
__attribute__((__format__ (__printf__, 1, 2)));
extern void nbdkit_verror (const char *msg, va_list args);
diff --git a/include/nbdkit-filter.h b/include/nbdkit-filter.h
index 533713b..931d923 100644
--- a/include/nbdkit-filter.h
+++ b/include/nbdkit-filter.h
@@ -58,6 +58,7 @@ struct nbdkit_next_ops {
int (*is_rotational) (void *nxdata);
int (*can_trim) (void *nxdata);
int (*can_zero) (void *nxdata);
+ int (*can_fua) (void *nxdata);
int (*pread) (void *nxdata, void *buf, uint32_t count, uint64_t offset,
uint32_t flags, int *err);
@@ -118,6 +119,8 @@ struct nbdkit_filter {
void *handle);
int (*can_zero) (struct nbdkit_next_ops *next_ops, void *nxdata,
void *handle);
+ int (*can_fua) (struct nbdkit_next_ops *next_ops, void *nxdata,
+ void *handle);
int (*pread) (struct nbdkit_next_ops *next_ops, void *nxdata,
void *handle, void *buf, uint32_t count, uint64_t offset,
diff --git a/src/internal.h b/src/internal.h
index b49f071..d873cd5 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -98,10 +98,6 @@
(type *) ((char *) __mptr - offsetof(type, member)); \
})
-#define NBDKIT_FUA_NONE 0
-#define NBDKIT_FUA_EMULATE 1
-#define NBDKIT_FUA_NATIVE 2
-
/* main.c */
extern const char *exportname;
extern const char *ipaddr;
diff --git a/src/filters.c b/src/filters.c
index 491c676..3d2c07e 100644
--- a/src/filters.c
+++ b/src/filters.c
@@ -294,6 +294,13 @@ next_can_zero (void *nxdata)
return b_conn->b->can_zero (b_conn->b, b_conn->conn);
}
+static int
+next_can_fua (void *nxdata)
+{
+ struct b_conn *b_conn = nxdata;
+ return b_conn->b->can_fua (b_conn->b, b_conn->conn);
+}
+
static int
next_pread (void *nxdata, void *buf, uint32_t count, uint64_t offset,
uint32_t flags, int *err)
@@ -342,6 +349,7 @@ static struct nbdkit_next_ops next_ops = {
.is_rotational = next_is_rotational,
.can_trim = next_can_trim,
.can_zero = next_can_zero,
+ .can_fua = next_can_fua,
.pread = next_pread,
.pwrite = next_pwrite,
.flush = next_flush,
@@ -473,11 +481,15 @@ static int
filter_can_fua (struct backend *b, struct connection *conn)
{
struct backend_filter *f = container_of (b, struct backend_filter, backend);
+ void *handle = connection_get_handle (conn, f->backend.i);
+ struct b_conn nxdata = { .b = f->backend.next, .conn = conn };
debug ("can_fua");
- /* TODO expose this to plugins and filters */
- return f->backend.next->can_fua (f->backend.next, conn);
+ if (f->filter.can_fua)
+ return f->filter.can_fua (&next_ops, &nxdata, handle);
+ else
+ return f->backend.next->can_fua (f->backend.next, conn);
}
static int
diff --git a/filters/blocksize/blocksize.c b/filters/blocksize/blocksize.c
index 8834457..88240cd 100644
--- a/filters/blocksize/blocksize.c
+++ b/filters/blocksize/blocksize.c
@@ -37,6 +37,7 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
+#include <stdbool.h>
#include <inttypes.h>
#include <limits.h>
#include <errno.h>
@@ -141,6 +142,7 @@ blocksize_prepare (struct nbdkit_next_ops *next_ops, void *nxdata,
nbdkit_error ("disk is too small for minblock size %u", minblock);
return -1;
}
+ /* TODO: cache per-connection FUA mode? */
return 0;
}
@@ -206,9 +208,13 @@ blocksize_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
const char *buf = b;
uint32_t keep;
uint32_t drop;
+ bool need_flush = false;
- /* FIXME: Smarter handling of FUA - pass it through if the next layer
- * can handle it natively, but just once at end if next layer emulates. */
+ if ((flags & NBDKIT_FLAG_FUA) &&
+ next_ops->can_fua (nxdata) == NBDKIT_FUA_EMULATE) {
+ flags &= ~NBDKIT_FLAG_FUA;
+ need_flush = true;
+ }
/* Unaligned head */
if (offs & (minblock - 1)) {
@@ -247,6 +253,8 @@ blocksize_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
count -= keep;
}
+ if (need_flush)
+ return next_ops->flush (nxdata, 0, err);
return 0;
}
@@ -256,9 +264,13 @@ blocksize_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
int *err)
{
uint32_t keep;
+ bool need_flush = false;
- /* FIXME: Smarter handling of FUA - pass it through if the next layer
- * can handle it natively, but just once at end if next layer emulates. */
+ if ((flags & NBDKIT_FLAG_FUA) &&
+ next_ops->can_fua (nxdata) == NBDKIT_FUA_EMULATE) {
+ flags &= ~NBDKIT_FLAG_FUA;
+ need_flush = true;
+ }
/* Unaligned head */
if (offs & (minblock - 1)) {
@@ -280,6 +292,8 @@ blocksize_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
count -= keep;
}
+ if (need_flush)
+ return next_ops->flush (nxdata, 0, err);
return 0;
}
@@ -290,9 +304,13 @@ blocksize_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
{
uint32_t keep;
uint32_t drop;
+ bool need_flush = false;
- /* FIXME: Smarter handling of FUA - pass it through if the next layer
- * can handle it natively, but just once at end if next layer emulates. */
+ if ((flags & NBDKIT_FLAG_FUA) &&
+ next_ops->can_fua (nxdata) == NBDKIT_FUA_EMULATE) {
+ flags &= ~NBDKIT_FLAG_FUA;
+ need_flush = true;
+ }
/* Unaligned head */
if (offs & (minblock - 1)) {
@@ -329,6 +347,8 @@ blocksize_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
count -= keep;
}
+ if (need_flush)
+ return next_ops->flush (nxdata, 0, err);
return 0;
}
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index c8dd791..76a0438 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -90,6 +90,10 @@ static enum cache_mode {
CACHE_MODE_UNSAFE,
} cache_mode = CACHE_MODE_WRITEBACK;
+static int
+cache_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle,
+ uint32_t flags, int *err);
+
static void
cache_load (void)
{
@@ -228,7 +232,10 @@ cache_prepare (struct nbdkit_next_ops *next_ops, void *nxdata,
int64_t r;
r = cache_get_size (next_ops, nxdata, handle);
- return r >= 0 ? 0 : -1;
+ if (r < 0)
+ return -1;
+ /* TODO: cache per-connection FUA mode? */
+ return 0;
}
/* Return true if the block is allocated. Consults the bitmap. */
@@ -392,6 +399,7 @@ cache_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
uint32_t flags, int *err)
{
uint8_t *block;
+ bool need_flush = false;
block = malloc (BLKSIZE);
if (block == NULL) {
@@ -400,6 +408,11 @@ cache_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
return -1;
}
+ if ((flags & NBDKIT_FLAG_FUA) &&
+ next_ops->can_fua (nxdata) == NBDKIT_FUA_EMULATE) {
+ flags &= ~NBDKIT_FLAG_FUA;
+ need_flush = true;
+ }
while (count > 0) {
uint64_t blknum, blkoffs, n;
@@ -426,6 +439,8 @@ cache_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
}
free (block);
+ if (need_flush)
+ return cache_flush (next_ops, nxdata, handle, 0, err);
return 0;
}
@@ -436,6 +451,7 @@ cache_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
int *err)
{
uint8_t *block;
+ bool need_flush = false;
block = malloc (BLKSIZE);
if (block == NULL) {
@@ -445,6 +461,11 @@ cache_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
}
flags &= ~NBDKIT_FLAG_MAY_TRIM; /* See BLKSIZE comment above. */
+ if ((flags & NBDKIT_FLAG_FUA) &&
+ next_ops->can_fua (nxdata) == NBDKIT_FUA_EMULATE) {
+ flags &= ~NBDKIT_FLAG_FUA;
+ need_flush = true;
+ }
while (count > 0) {
uint64_t blknum, blkoffs, n;
@@ -469,6 +490,8 @@ cache_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
}
free (block);
+ if (need_flush)
+ return cache_flush (next_ops, nxdata, handle, 0, err);
return 0;
}
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index cbf4ed4..2f86f6c 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -61,6 +61,12 @@
* this we limit the thread model to SERIALIZE_ALL_REQUESTS so that
* there cannot be concurrent pwrite requests. We could relax this
* restriction with a bit of work.
+ *
+ * We allow the client to request FUA, and emulate it with a flush
+ * (arguably, since the write overlay is temporary, and since we
+ * serialize all requests, we could ignore FUA altogether, but
+ * flushing will be more correct if we improve the thread model to be
+ * more parallel).
*/
#include <config.h>
@@ -235,6 +241,12 @@ cow_can_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void
*handle)
return 1;
}
+static int
+cow_can_fua (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
+{
+ return NBDKIT_FUA_EMULATE;
+}
+
/* Return true if the block is allocated. Consults the bitmap. */
static bool
blk_is_allocated (uint64_t blknum)
@@ -309,6 +321,10 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
return 0;
}
+static int
+cow_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle,
+ uint32_t flags, int *err);
+
/* Read data. */
static int
cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
@@ -390,6 +406,8 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
}
free (block);
+ if (flags & NBDKIT_FLAG_FUA)
+ return cow_flush (next_ops, nxdata, handle, 0, err);
return 0;
}
@@ -435,6 +453,8 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
}
free (block);
+ if (flags & NBDKIT_FLAG_FUA)
+ return cow_flush (next_ops, nxdata, handle, 0, err);
return 0;
}
@@ -466,6 +486,7 @@ static struct nbdkit_filter filter = {
.can_write = cow_can_write,
.can_flush = cow_can_flush,
.can_trim = cow_can_trim,
+ .can_fua = cow_can_fua,
.pread = cow_pread,
.pwrite = cow_pwrite,
.zero = cow_zero,
diff --git a/filters/log/log.c b/filters/log/log.c
index 2dd61c0..2079084 100644
--- a/filters/log/log.c
+++ b/filters/log/log.c
@@ -236,13 +236,13 @@ log_prepare (struct nbdkit_next_ops *next_ops, void *nxdata, void
*handle)
int r = next_ops->is_rotational (nxdata);
int t = next_ops->can_trim (nxdata);
int z = next_ops->can_zero (nxdata);
+ int F = next_ops->can_fua (nxdata);
- if (size < 0 || w < 0 || f < 0 || r < 0 || t < 0 || z < 0)
+ if (size < 0 || w < 0 || f < 0 || r < 0 || t < 0 || z < 0 || F <
0)
return -1;
- /* TODO expose can_fua to filters */
output (h, "Connect", 0, "size=0x%" PRIx64 " write=%d flush=%d
"
- "rotational=%d trim=%d zero=%d" /* fua=? */, size, w, f, r, t, z);
+ "rotational=%d trim=%d zero=%d fua=%d", size, w, f, r, t, z, F);
return 0;
}
--
2.14.3