By storing an extra bit in the overlay bitmap we can mark when whole
blocks have been trimmed. This allows us to match a feature of qcow2
(zero clusters).
In theory we could punch holes in the overlay to save a bit of disk
space.
---
filters/cow/blk.h | 4 ++
filters/cow/blk.c | 106 +++++++++++++++++++++++++++++++---------------
filters/cow/cow.c | 75 +++++++++++++++++++++++++++++++-
3 files changed, 149 insertions(+), 36 deletions(-)
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index 5eb30794..71eeb7ad 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -80,4 +80,8 @@ extern int blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
extern int blk_write (uint64_t blknum, const uint8_t *block, int *err)
__attribute__((__nonnull__ (2, 3)));
+/* Mark a single block as trimmed; reads of it then return zeroes. */
+extern int blk_trim (uint64_t blknum, int *err)
+ __attribute__((__nonnull__ (2)));
+
#endif /* NBDKIT_BLK_H */
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 4a8adfb9..ae0db1fe 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -43,16 +43,27 @@
* plugin returns the same immutable data for each pread call we make,
* and optimize on this basis.
*
- * A block bitmap is maintained in memory recording if each block in
- * the temporary file is "allocated" (1) or "hole" (0).
+ * A 2-bit per block bitmap is maintained in memory recording if each
+ * block in the temporary file is:
+ *
+ * 00 = not allocated in the overlay (read through to the plugin)
+ * 01 = allocated in the overlay
+ * 10 = <unused>
+ * 11 = trimmed in the overlay
*
* When reading a block we first check the bitmap to see if that file
- * block is allocated or a hole. If allocated, we return it from the
- * temporary file. If a hole, we issue a pread to the underlying
- * plugin.
+ * block is allocated, trimmed or not. If allocated, we return it
+ * from the temporary file. Trimmed returns zeroes. If not allocated
+ * we issue a pread to the underlying plugin.
*
* When writing a block we unconditionally write the data to the
- * temporary file, setting the bit in the bitmap.
+ * temporary file, marking it allocated in the bitmap. (Writing zeroes is
+ * handled the same way.)
+ *
+ * When trimming we set the trimmed flag in the bitmap for whole
+ * blocks, and handle the unaligned portions like writing zeroes
+ * above. We could punch holes in the overlay as an optimization, but
+ * for simplicity we do not do that yet.
*
* Since the overlay is a deleted temporary file, we can ignore FUA
* and flush commands.
@@ -92,9 +103,26 @@
/* The temporary overlay. */
static int fd = -1;
-/* Bitmap. Bit = 1 => allocated, 0 => hole. */
+/* Bitmap. */
static struct bitmap bm;
+enum bm_entry {
+ BLOCK_NOT_ALLOCATED = 0, /* 00: read through to the plugin */
+ BLOCK_ALLOCATED = 1, /* 01: data is in the overlay */
+ BLOCK_TRIMMED = 3, /* 11: trimmed in the overlay (10 is unused) */
+};
+
+/* Human-readable name of a bitmap state, for use in debug messages. */
+static const char *
+state_to_string (enum bm_entry state)
+{
+  if (state == BLOCK_NOT_ALLOCATED) return "not allocated";
+  if (state == BLOCK_ALLOCATED) return "allocated";
+  if (state == BLOCK_TRIMMED) return "trimmed";
+  /* 2 (binary 10) is unused, so any other value is a bug. */
+  abort ();
+}
+
int
blk_init (void)
{
@@ -102,7 +130,7 @@ blk_init (void)
size_t len;
char *template;
- bitmap_init (&bm, BLKSIZE, 1 /* bits per block */);
+ bitmap_init (&bm, BLKSIZE, 2 /* bits per block */);
tmpdir = getenv ("TMPDIR");
if (!tmpdir)
@@ -163,28 +191,16 @@ blk_set_size (uint64_t new_size)
return 0;
}
-/* Return true if the block is allocated. Consults the bitmap. */
-static bool
-blk_is_allocated (uint64_t blknum)
-{
- return bitmap_get_blk (&bm, blknum, false);
-}
-
-/* Mark a block as allocated. */
-static void
-blk_set_allocated (uint64_t blknum)
-{
- bitmap_set_blk (&bm, blknum, true);
-}
-
/* This is a bit of a hack since usually this information is hidden in
* the blk module. However it is needed when calculating extents.
*/
void
blk_status (uint64_t blknum, bool *present, bool *trimmed)
{
- *present = blk_is_allocated (blknum);
- *trimmed = false;
+ enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+
+ *present = state != BLOCK_NOT_ALLOCATED; /* allocated or trimmed */
+ *trimmed = state == BLOCK_TRIMMED; /* blk_read returns zeroes for these */
}
/* These are the block operations. They always read or write a single
@@ -195,15 +211,14 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, int *err)
{
off_t offset = blknum * BLKSIZE;
- bool allocated = blk_is_allocated (blknum);
+ enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64
") is %s",
- blknum, (uint64_t) offset,
- !allocated ? "a hole" : "allocated");
+ blknum, (uint64_t) offset, state_to_string (state));
- if (!allocated) /* Read underlying plugin. */
+ if (state == BLOCK_NOT_ALLOCATED) /* Read underlying plugin. */
return next_ops->pread (nxdata, block, BLKSIZE, offset, 0, err);
- else { /* Read overlay. */
+ else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
if (pread (fd, block, BLKSIZE, offset) == -1) {
*err = errno;
nbdkit_error ("pread: %m");
@@ -211,6 +226,10 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
}
return 0;
}
+ else /* state == BLOCK_TRIMMED */ {
+ memset (block, 0, BLKSIZE);
+ return 0;
+ }
}
int
@@ -218,13 +237,12 @@ blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
{
off_t offset = blknum * BLKSIZE;
- bool allocated = blk_is_allocated (blknum);
+ enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
nbdkit_debug ("cow: blk_cache block %" PRIu64 " (offset %" PRIu64
") is %s",
- blknum, (uint64_t) offset,
- !allocated ? "a hole" : "allocated");
+ blknum, (uint64_t) offset, state_to_string (state));
- if (allocated) {
+ if (state == BLOCK_ALLOCATED) {
#if HAVE_POSIX_FADVISE
int r = posix_fadvise (fd, offset, BLKSIZE, POSIX_FADV_WILLNEED);
if (r) {
@@ -235,6 +253,8 @@ blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
#endif
return 0;
}
+ if (state == BLOCK_TRIMMED)
+ return 0;
if (mode == BLK_CACHE_IGNORE)
return 0;
if (mode == BLK_CACHE_PASSTHROUGH)
@@ -247,7 +267,7 @@ blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
nbdkit_error ("pwrite: %m");
return -1;
}
- blk_set_allocated (blknum);
+ bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
}
return 0;
}
@@ -265,7 +285,23 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
nbdkit_error ("pwrite: %m");
return -1;
}
- blk_set_allocated (blknum);
+ bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
return 0;
}
+
+int
+blk_trim (uint64_t blknum, int *err)
+{
+  off_t offset = blknum * BLKSIZE;
+
+  nbdkit_debug ("cow: blk_trim block %" PRIu64 " (offset %" PRIu64 ")",
+                blknum, (uint64_t) offset);
+
+  /* Only the bitmap changes; blk_read returns zeroes for trimmed blocks.
+   * XXX As an optimization we could punch a hole in the overlay here,
+   * but BLKSIZE is unrelated to the overlay filesystem block size.
+   */
+  bitmap_set_blk (&bm, blknum, BLOCK_TRIMMED);
+  return 0;
+}
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index d12565e6..f3f44757 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -150,7 +150,7 @@ cow_can_write (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
static int
cow_can_trim (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
{
- return 0;
+ return 1; /* Trim requests are served by cow_trim. */
}
static int
@@ -428,6 +428,78 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
return 0;
}
+/* Trim data.  Whole blocks are marked trimmed; partial blocks are zeroed. */
+static int
+cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
+          void *handle, uint32_t count, uint64_t offset, uint32_t flags,
+          int *err)
+{
+  CLEANUP_FREE uint8_t *block = NULL;
+  uint64_t blknum, blkoffs;
+  int r;
+
+  if (!IS_ALIGNED (count | offset, BLKSIZE)) {
+    block = malloc (BLKSIZE);
+    if (block == NULL) {
+      *err = errno;
+      nbdkit_error ("malloc: %m");
+      return -1;
+    }
+  }
+
+  blknum = offset / BLKSIZE;  /* block number */
+  blkoffs = offset % BLKSIZE; /* offset within the block */
+
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (BLKSIZE - blkoffs, count);
+
+    /* Do a read-modify-write operation on the current block.
+     * Hold the lock over the whole operation.
+     */
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r != -1) {
+      memset (&block[blkoffs], 0, n);
+      r = blk_write (blknum, block, err);
+    }
+    if (r == -1)
+      return -1;
+
+    count -= n;
+    offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
+  while (count >= BLKSIZE) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_trim (blknum, err);
+    if (r == -1)
+      return -1;
+
+    count -= BLKSIZE;
+    offset += BLKSIZE;
+    blknum++;
+  }
+
+  /* Unaligned tail: the request covers the first count bytes of the block. */
+  if (count) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r != -1) {
+      memset (block, 0, count);
+      r = blk_write (blknum, block, err);
+    }
+    if (r == -1)
+      return -1;
+  }
+
+  /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
+
+  return 0;
+}
+
static int
cow_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle,
uint32_t flags, int *err)
@@ -612,6 +684,7 @@ static struct nbdkit_filter filter = {
.pread = cow_pread,
.pwrite = cow_pwrite,
.zero = cow_zero,
+ .trim = cow_trim,
.flush = cow_flush,
.cache = cow_cache,
.extents = cow_extents,
--
2.29.0.rc2