Our default cow caching behavior is that if we have not yet
overwritten a portion of the image, we should pass the cache request
on to the underlying plugin (to make the upcoming reads from the
plugin possibly faster). But if we HAVE copied something locally, we
can use posix_fadvise (if available) to tell the kernel that we have
an upcoming reuse of that area of our local disk, and don't need to
bother the plugin. This is the default because it keeps the COW image
as thin as possible.
However, another sane behavior is comparable to the 'cache' filter's
'cache-on-read' parameter: a user may want to force portions of the
COW overlay to become populated, but with something more efficient in
network traffic than NBD_CMD_READ followed by NBD_CMD_WRITE of
unchanged data. Hence, this patch also adds a 'cow-on-cache'
parameter to opt in to the second behavior.
Signed-off-by: Eric Blake <eblake(a)redhat.com>
---
filters/cow/nbdkit-cow-filter.pod | 15 ++++--
filters/cow/blk.h | 16 +++++-
filters/cow/blk.c | 41 +++++++++++++-
filters/cow/cow.c | 90 +++++++++++++++++++++++++++++++
4 files changed, 157 insertions(+), 5 deletions(-)
diff --git a/filters/cow/nbdkit-cow-filter.pod b/filters/cow/nbdkit-cow-filter.pod
index 448f48c..ae8c5e1 100644
--- a/filters/cow/nbdkit-cow-filter.pod
+++ b/filters/cow/nbdkit-cow-filter.pod
@@ -57,9 +57,18 @@ serve the same data to each client.
=head1 PARAMETERS
-There are no parameters specific to nbdkit-cow-filter. Any parameters
-are passed through to and processed by the underlying plugin in the
-normal way.
+=over 4
+
+=item B<cow-on-cache=true>
+
+Treat a client cache request as a shortcut for copying unmodified data
+from the plugin to the overlay, rather than the default of passing
+cache requests on to the plugin. This parameter defaults to false
+(which leaves the overlay as small as possible), but setting it can be
+useful for converting cache commands into a form of copy-on-read
+behavior, in addition to the filter's normal copy-on-write semantics.
+
+=back
=head1 EXAMPLES
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index 429bb53..1c1d922 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -1,5 +1,5 @@
/* nbdkit
- * Copyright (C) 2018 Red Hat Inc.
+ * Copyright (C) 2018-2019 Red Hat Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
@@ -59,6 +59,20 @@ extern int blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, int *err)
__attribute__((__nonnull__ (1, 4, 5)));
+/* Cache mode for blocks not already in overlay: selects what blk_cache
+ * does with such a block.
+ */
+enum cache_mode {
+ BLK_CACHE_IGNORE, /* Do nothing; the plugin is not contacted */
+ BLK_CACHE_PASSTHROUGH, /* Make cache request to plugin */
+ BLK_CACHE_READ, /* Make read request to plugin and discard the data */
+ BLK_CACHE_COW, /* Make read request to plugin, and write to overlay */
+};
+
+/* Cache a single block from the plugin.
+ *
+ * blknum is the block number, block is a scratch buffer of one block
+ * used when the data has to be read from the plugin, and mode selects
+ * what happens to a block that is not yet in the overlay.
+ *
+ * Returns 0 on success or -1 on failure.  Failures of the plugin
+ * request or of the overlay write store an errno value in *err.
+ */
+extern int blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
+                      uint64_t blknum, uint8_t *block, enum cache_mode mode,
+                      int *err)
+  __attribute__((__nonnull__ (1, 4, 6)));
+
/* Write a single block. */
extern int blk_write (uint64_t blknum, const uint8_t *block, int *err)
__attribute__((__nonnull__ (2, 3)));
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 9c99aee..be43f2f 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -1,5 +1,5 @@
/* nbdkit
- * Copyright (C) 2018 Red Hat Inc.
+ * Copyright (C) 2018-2019 Red Hat Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
@@ -190,6 +190,45 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
}
}
+/* Cache a single block.
+ *
+ * If the block is already allocated in the overlay, the plugin is not
+ * contacted; where posix_fadvise is available the kernel is told that
+ * the block's range of the overlay file will be needed soon.
+ *
+ * Otherwise the action depends on mode:
+ *   BLK_CACHE_IGNORE       - nothing is done
+ *   BLK_CACHE_PASSTHROUGH  - the cache request is forwarded to the plugin
+ *   BLK_CACHE_READ         - the block is read from the plugin into the
+ *                            scratch buffer and not stored
+ *   BLK_CACHE_COW          - the block is read from the plugin, written
+ *                            to the overlay and marked allocated
+ *
+ * block must point to a buffer of at least BLKSIZE bytes.
+ *
+ * Returns 0 on success, or -1 on failure with an errno value in *err.
+ */
+int
+blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
+           uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
+{
+  off_t offset = blknum * BLKSIZE;
+  bool allocated = blk_is_allocated (blknum);
+
+  nbdkit_debug ("cow: blk_cache block %" PRIu64
+                " (offset %" PRIu64 ") is %s",
+                blknum, (uint64_t) offset,
+                !allocated ? "a hole" : "allocated");
+
+  if (allocated) {
+#if HAVE_POSIX_FADVISE
+    int r = posix_fadvise (fd, offset, BLKSIZE, POSIX_FADV_WILLNEED);
+    if (r) {
+      /* posix_fadvise returns the error number instead of setting
+       * errno.  Hand it to the caller through *err, and also put it
+       * in errno so that %m prints the right message.
+       */
+      *err = errno = r;
+      nbdkit_error ("posix_fadvise: %m");
+      return -1;
+    }
+#endif
+    return 0;
+  }
+
+  /* The block is a hole in the overlay. */
+  if (mode == BLK_CACHE_IGNORE)
+    return 0;
+  if (mode == BLK_CACHE_PASSTHROUGH)
+    return next_ops->cache (nxdata, BLKSIZE, offset, 0, err);
+
+  /* BLK_CACHE_READ and BLK_CACHE_COW both start by reading the block
+   * from the plugin.
+   */
+  if (next_ops->pread (nxdata, block, BLKSIZE, offset, 0, err) == -1)
+    return -1;
+
+  if (mode == BLK_CACHE_COW) {
+    if (pwrite (fd, block, BLKSIZE, offset) == -1) {
+      *err = errno;
+      nbdkit_error ("pwrite: %m");
+      return -1;
+    }
+    /* Only mark the block once the data is safely in the overlay. */
+    blk_set_allocated (blknum);
+  }
+  return 0;
+}
+
int
blk_write (uint64_t blknum, const uint8_t *block, int *err)
{
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index aa1348b..006007e 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -58,6 +58,8 @@
*/
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+/* Set by the cow-on-cache parameter.  When true, cache requests copy
+ * data into the overlay instead of using the plugin's cache support.
+ */
+static bool cow_on_cache;
+
static void
cow_load (void)
{
@@ -71,6 +73,24 @@ cow_unload (void)
blk_free ();
}
+/* Handle the filter's own cow-on-cache parameter; every other key is
+ * handed to the next layer unchanged.
+ *
+ * Returns 0 when cow-on-cache was parsed and stored, -1 when its value
+ * is not a valid boolean, otherwise whatever the next layer returns.
+ */
+static int
+cow_config (nbdkit_next_config *next, void *nxdata,
+            const char *key, const char *value)
+{
+  int enabled;
+
+  /* Not ours: let the rest of the chain deal with it. */
+  if (strcmp (key, "cow-on-cache") != 0)
+    return next (nxdata, key, value);
+
+  enabled = nbdkit_parse_bool (value);
+  if (enabled == -1)
+    return -1;
+
+  cow_on_cache = enabled;
+  return 0;
+}
+
static void *
cow_open (nbdkit_next_open *next, void *nxdata, int readonly)
{
@@ -152,6 +172,12 @@ cow_can_fua (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
return NBDKIT_FUA_EMULATE;
}
+/* Report cache support.  The filter registers its own .cache callback
+ * (cow_cache), so it advertises native caching whatever the plugin
+ * supports.  The value must come from the NBDKIT_CACHE_* family; the
+ * NBDKIT_FUA_* constants belong to .can_fua.
+ */
+static int
+cow_can_cache (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
+{
+  return NBDKIT_CACHE_NATIVE;
+}
+
static int cow_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle,
uint32_t flags, int *err);
/* Read data. */
@@ -391,6 +417,67 @@ cow_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle,
return r;
}
+/* Cache data.
+ *
+ * The mode for blocks that are not yet in the overlay is derived from
+ * the plugin's can_cache answer (none -> ignore, emulate -> read,
+ * native -> pass through), unless cow_on_cache is set, in which case
+ * such blocks are copied into the overlay.
+ *
+ * The request is rounded out to whole blocks and each block is handed
+ * to blk_cache in turn.  The lock is taken and dropped once per block.
+ *
+ * Returns 0 on success, or -1 on failure with an errno value in *err.
+ */
+static int
+cow_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
+           void *handle, uint32_t count, uint64_t offset,
+           uint32_t flags, int *err)
+{
+  CLEANUP_FREE uint8_t *block = NULL;
+  enum cache_mode mode; /* XXX Cache this per connection? */
+  uint64_t blknum;
+  uint64_t todo; /* 64 bits because rounding out could exceed 32 bits */
+  int plugin_cache;
+
+  plugin_cache = next_ops->can_cache (nxdata);
+  if (plugin_cache == NBDKIT_CACHE_NONE)
+    mode = BLK_CACHE_IGNORE;
+  else if (plugin_cache == NBDKIT_CACHE_EMULATE)
+    mode = BLK_CACHE_READ;
+  else if (plugin_cache == NBDKIT_CACHE_NATIVE)
+    mode = BLK_CACHE_PASSTHROUGH;
+  else {
+    *err = EINVAL;
+    return -1;
+  }
+
+  /* The user's choice overrides whatever the plugin offers. */
+  if (cow_on_cache)
+    mode = BLK_CACHE_COW;
+
+  assert (!flags);
+
+  /* Scratch buffer for blocks that have to be read from the plugin. */
+  block = malloc (BLKSIZE);
+  if (!block) {
+    *err = errno;
+    nbdkit_error ("malloc: %m");
+    return -1;
+  }
+
+  /* Round the start down and the length up to block boundaries. */
+  blknum = offset / BLKSIZE;
+  todo = (uint64_t) count + offset % BLKSIZE;
+  todo = ROUND_UP (todo, BLKSIZE);
+
+  for (; todo > 0; todo -= BLKSIZE, blknum++) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    if (blk_cache (next_ops, nxdata, blknum, block, mode, err) == -1)
+      return -1;
+  }
+
+  return 0;
+}
+
static struct nbdkit_filter filter = {
.name = "cow",
.longname = "nbdkit copy-on-write (COW) filter",
@@ -398,6 +485,7 @@ static struct nbdkit_filter filter = {
.load = cow_load,
.unload = cow_unload,
.open = cow_open,
+ .config = cow_config,
.prepare = cow_prepare,
.get_size = cow_get_size,
.can_write = cow_can_write,
@@ -405,10 +493,12 @@ static struct nbdkit_filter filter = {
.can_trim = cow_can_trim,
.can_extents = cow_can_extents,
.can_fua = cow_can_fua,
+ .can_cache = cow_can_cache,
.pread = cow_pread,
.pwrite = cow_pwrite,
.zero = cow_zero,
.flush = cow_flush,
+ .cache = cow_cache,
};
NBDKIT_REGISTER_FILTER(filter)
--
2.20.1