You're not supposed to read or write NBD servers at a granularity
smaller than the advertised minimum block size. nbdcopy has ignored
this requirement, which is usually fine because the NBD servers we care
about support 512-byte sector granularity and never advertise sizes or
extents at sub-sector granularity (even if ignoring the block size
hints is a bit suboptimal in a few cases).

However there is one new case where we do care: when writing to a
compressed qcow2 file, qemu advertises a minimum and preferred block
size of 64K, and it really means it. You cannot write blocks smaller
than this because of the way qcow2 compression is implemented.

This commit attempts to do the least work possible to fix this. We
modify the NBD operations for writing and zeroing. We query the NBD
server to find its preferred block size[*]. In each write and zero
operation we identify the fragments that do not cover a whole preferred
block. These partial-block fragments are saved in a separate list.
When a complete preferred block has been assembled, the whole block is
retired to the server (synchronously, as this is the slow path).

[*] Preferred block size is always >= minimum block size, and if we're
going to this effort it's better to use the preferred block size.
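
(Illustration only, not part of the patch: this is roughly how the
block size constraints can be queried with libnbd. The patch below
queries only the preferred size, falling back to 4096 bytes when the
server does not advertise one.)

  int64_t minimum   = nbd_get_block_size (nbd, LIBNBD_SIZE_MINIMUM);
  int64_t preferred = nbd_get_block_size (nbd, LIBNBD_SIZE_PREFERRED);
  /* Each call returns -1 on error, or 0 if the server did not
   * advertise that constraint.
   */
  if (preferred == 0)
    preferred = 4096;
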
This code relies heavily on the assumption that nbdcopy only
writes/zeroes each byte of the output once.
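
Concretely, each write or zero is split at preferred-block boundaries
into an unaligned head, an aligned body and an unaligned tail. Only
the aligned body is sent straight to the server; the head and tail
fragments are accumulated in the partial-block list. A rough sketch of
the write path (write_split is a hypothetical name and error handling
is omitted; partial_write, MIN and ROUND_DOWN are the helpers used by
the patch):

  static void
  write_split (struct rw_nbd *rwn, const char *data,
               uint64_t len, uint64_t offset)
  {
    uint64_t blkoffs = offset % rwn->preferred, n;

    /* Unaligned head: accumulate in the partial-block list. */
    if (blkoffs) {
      n = MIN (rwn->preferred - blkoffs, len);
      partial_write (rwn, data, n, offset);
      data += n; len -= n; offset += n;
    }

    /* Aligned body: write it straight to the server. */
    n = ROUND_DOWN (len, rwn->preferred);
    if (n) {
      nbd_pwrite (rwn->handles.ptr[0], data, n, offset, 0);
      data += n; len -= n; offset += n;
    }

    /* Unaligned tail: also accumulated in the partial-block list. */
    if (len)
      partial_write (rwn, data, len, offset);
  }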

Note this doesn't attempt to fix block size handling when reading from
NBD servers, nor to limit the maximum block size when reading or
writing.

This is a partial fix for https://bugzilla.redhat.com/2047660.
Further changes will be required in virt-v2v.

Link: https://lists.gnu.org/archive/html/qemu-block/2022-01/threads.html#00729
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2047660
---
copy/nbd-ops.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 352 insertions(+), 19 deletions(-)
diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c
index 1ffe6a213f..7b87cdc45a 100644
--- a/copy/nbd-ops.c
+++ b/copy/nbd-ops.c
@@ -21,17 +21,32 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
+#include <inttypes.h>
#include <assert.h>
+#include <pthread.h>
#include <libnbd.h>
#include "nbdcopy.h"
#include "const-string-vector.h"
+#include "isaligned.h"
+#include "minmax.h"
+#include "rounding.h"
#include "vector.h"
static struct rw_ops nbd_ops;
+struct partial_block {
+ uint64_t offset; /* Offset (multiple of preferred block size). */
+ size_t seen; /* How many bytes seen so far. */
+ bool written; /* True if we saw at least one write. */
+ bool zeroed; /* True if we saw at least one zero. */
+ char *data; /* The incomplete partial block. */
+};
+
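+/* Growable array of partial blocks. DEFINE_VECTOR_TYPE (see vector.h)
+ * generates the partial_blocks_search, _insert, _append, _remove and
+ * _reset helpers used below.
+ */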
+DEFINE_VECTOR_TYPE (partial_blocks, struct partial_block)
+
DEFINE_VECTOR_TYPE (handles, struct nbd_handle *)
struct rw_nbd {
@@ -48,6 +63,34 @@ struct rw_nbd {
handles handles; /* One handle per connection. */
bool can_zero; /* Cached nbd_can_zero. */
+
+ /* Preferred block size (only used when writing/zeroing). Attempts
+ * to write/zero smaller than this size will use the partial blocks
+ * list below.
+ */
+ int64_t preferred;
+
+ /* Partial blocks (only used when writing/zeroing).
+ *
+ * This is a list of blocks of the preferred block size where we
+ * have not yet seen the entire block. We accumulate fragments
+ * until we have a whole preferred block that we can retire in a
+ * single operation (except in the case of the very last block in
+ * the file).
+ *
+ * The list is kept sorted by offset, and is assumed to be always
+ * small since we mostly copy sequentially and so partial blocks can
+ * be quickly filled and retired. You have to acquire the lock
+ * since all threads may manipulate this list. Partial blocks are
+ * handled on the slow path, and retired through handles[0].
+ *
+ * Each entry on the list represents a whole block of the preferred
+ * size. Each entry stores some metadata allowing us to quickly see
+ * how much of the whole block we have accumulated so far, and what
+ * mix of write or zero fragments were seen, and the partial data.
+ */
+ pthread_mutex_t partial_blocks_lock;
+ partial_blocks partial_blocks;
};
static void
@@ -113,11 +156,20 @@ open_one_nbd_handle (struct rw_nbd *rwn)
*/
if (rwn->handles.len == 0) {
rwn->can_zero = nbd_can_zero (nbd) > 0;
+
rwn->rw.size = nbd_get_size (nbd);
if (rwn->rw.size == -1) {
fprintf (stderr, "%s: %s: %s\n", prog, rwn->rw.name, nbd_get_error
());
exit (EXIT_FAILURE);
}
+
+ rwn->preferred = nbd_get_block_size (nbd, LIBNBD_SIZE_PREFERRED);
+ if (rwn->preferred == -1) {
+ fprintf (stderr, "%s: %s: %s\n", prog, rwn->rw.name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ if (rwn->preferred == 0)
+ rwn->preferred = 4096;
}
if (handles_append (&rwn->handles, nbd) == -1) {
@@ -137,6 +189,7 @@ nbd_rw_create_uri (const char *name, const char *uri, direction d)
rwn->create_t = CREATE_URI;
rwn->uri = uri;
rwn->d = d;
+ pthread_mutex_init (&rwn->partial_blocks_lock, NULL);
open_one_nbd_handle (rwn);
@@ -172,12 +225,16 @@ error:
exit (EXIT_FAILURE);
}
+static void retire_last_partial_block (struct rw_nbd *rwn);
+
static void
nbd_ops_close (struct rw *rw)
{
struct rw_nbd *rwn = (struct rw_nbd *) rw;
size_t i;
+ retire_last_partial_block (rwn);
+
for (i = 0; i < rwn->handles.len; ++i) {
if (nbd_shutdown (rwn->handles.ptr[i], 0) == -1) {
fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
@@ -188,6 +245,8 @@ nbd_ops_close (struct rw *rw)
handles_reset (&rwn->handles);
const_string_vector_reset (&rwn->argv);
+ partial_blocks_reset (&rwn->partial_blocks);
+ pthread_mutex_destroy (&rwn->partial_blocks_lock);
free (rw);
}
@@ -197,6 +256,8 @@ nbd_ops_flush (struct rw *rw)
struct rw_nbd *rwn = (struct rw_nbd *) rw;
size_t i;
+ retire_last_partial_block (rwn);
+
for (i = 0; i < rwn->handles.len; ++i) {
if (nbd_flush (rwn->handles.ptr[i], 0) == -1) {
fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
@@ -250,6 +311,173 @@ nbd_ops_start_multi_conn (struct rw *rw)
assert (rwn->handles.len == connections);
}
+static void retire_block (struct rw_nbd *rwn, size_t i);
+
+/* Comparison function used when searching through partial blocks list. */
+static int
+compare_offsets (const void *offsetp, const struct partial_block *entry)
+{
+ const uint64_t offset = *(uint64_t *)offsetp;
+
+ if (offset < entry->offset) return -1;
+ if (offset > entry->offset) return 1;
+ return 0;
+}
+
+/* Add to a partial block, retire if complete. */
+static void
+partial_add (struct rw_nbd *rwn, const void *data,
+ uint64_t len, uint64_t offset)
+{
+ uint64_t aligned_offset = ROUND_DOWN (offset, rwn->preferred);
+ struct partial_block *block;
+ size_t i;
+
+ pthread_mutex_lock (&rwn->partial_blocks_lock);
+
+ assert (len < rwn->preferred);
+
+ /* Find an existing partial block, if there is one. */
+ block = partial_blocks_search (&rwn->partial_blocks, &aligned_offset,
+ compare_offsets);
+ /* ... or allocate a new partial block and insert it. */
+ if (!block) {
+ struct partial_block t = { .offset = aligned_offset };
+ t.data = calloc (rwn->preferred, 1);
+ if (t.data == NULL) { perror ("malloc"); exit (EXIT_FAILURE); }
+
+ for (i = 0; i < rwn->partial_blocks.len; ++i) {
+ if (t.offset < rwn->partial_blocks.ptr[i].offset) {
+ if (partial_blocks_insert (&rwn->partial_blocks, t, i) == -1) {
+ perror ("malloc");
+ exit (EXIT_FAILURE);
+ }
+ block = &rwn->partial_blocks.ptr[i];
+ break;
+ }
+ }
+
+ if (!block) {
+ /* Insert it at the end. */
+ if (partial_blocks_append (&rwn->partial_blocks, t) == -1) {
+ perror ("malloc");
+ exit (EXIT_FAILURE);
+ }
+ block = &rwn->partial_blocks.ptr[rwn->partial_blocks.len-1];
+ }
+ }
+
+ assert (block->offset == aligned_offset);
+
+ if (data == NULL) {
+ /* If we're zeroing, it's easy. */
+ block->zeroed |= true;
+ /* block->data starts as zero, and we only write to each byte
+ * once, so we don't need a memset here.
+ */
+ }
+ else {
+ /* Writing is a bit harder, copy the data to the partial block. */
+ block->written |= true;
+ memcpy (&block->data[offset - aligned_offset], data, len);
+ }
+
+ /* We've seen more data. */
+ block->seen += len;
+ assert (block->seen <= rwn->preferred);
+
+ /* If we've now seen the whole block, retire it. */
+ if (block->seen == rwn->preferred) {
+ retire_block (rwn, block - rwn->partial_blocks.ptr);
+ block = NULL;
+ }
+
+ pthread_mutex_unlock (&rwn->partial_blocks_lock);
+}
+
+static void
+partial_write (struct rw_nbd *rwn, const void *data,
+ uint64_t len, uint64_t offset)
+{
+ partial_add (rwn, data, len, offset);
+}
+
+static void
+partial_zero (struct rw_nbd *rwn, uint64_t len, uint64_t offset)
+{
+ partial_add (rwn, NULL, len, offset);
+}
+
+/* Retire a previously partial (now full) block.
+ *
+ * Must be called with the lock held.
+ */
+static void
+retire_block (struct rw_nbd *rwn, size_t i)
+{
+ struct partial_block *block = &rwn->partial_blocks.ptr[i];
+ int r;
+
+ /* If the block was only zeroed, it must only contain zero bytes, so
+ * try zeroing.
+ *
+ * Note use block->seen here so we can handle the (really) partial
+ * data at the end of the file.
+ */
+ if (!block->written && block->zeroed && rwn->can_zero)
+ r = nbd_zero (rwn->handles.ptr[0], block->seen, block->offset, 0);
+ else
+ r = nbd_pwrite (rwn->handles.ptr[0],
+ block->data, block->seen, block->offset, 0);
+ if (r == -1) {
+ fprintf (stderr, "%s: %s\n", rwn->rw.name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+
+ /* Free the partial block and remove it from the list. */
+ free (block->data);
+ partial_blocks_remove (&rwn->partial_blocks, i);
+}
+
+/* Retire last remaining partial block.
+ *
+ * Only the last block should be partial, otherwise it's an error.
+ * This is because nbdcopy should have called us exactly once for
+ * every byte of data in the file, so every earlier block should have
+ * been retired as we went along.
+ */
+static void
+retire_last_partial_block (struct rw_nbd *rwn)
+{
+ size_t i;
+
+ pthread_mutex_lock (&rwn->partial_blocks_lock);
+
+ if (rwn->partial_blocks.len == 0)
+ goto out;
+
+ if (rwn->partial_blocks.len >= 2) {
+ fprintf (stderr, "%s: INTERNAL ERROR: "
+ "too many partial_blocks after copying:\n", prog);
+ for (i = 0; i < rwn->partial_blocks.len; ++i) {
+ fprintf (stderr, "[%zu].{ offset = %" PRIu64 ", seen = %zu, [%c%c]
}\n",
+ i,
+ rwn->partial_blocks.ptr[i].offset,
+ rwn->partial_blocks.ptr[i].seen,
+ rwn->partial_blocks.ptr[i].written ? 'W' : '-',
+ rwn->partial_blocks.ptr[i].zeroed ? 'Z' : '-');
+ }
+ abort ();
+ }
+
+ assert (rwn->partial_blocks.len == 1);
+ retire_block (rwn, 0);
+ assert (rwn->partial_blocks.len == 0);
+
+ out:
+ pthread_mutex_unlock (&rwn->partial_blocks_lock);
+}
+
static size_t
nbd_ops_synch_read (struct rw *rw,
void *data, size_t len, uint64_t offset)
@@ -274,11 +502,34 @@ nbd_ops_synch_write (struct rw *rw,
const void *data, size_t len, uint64_t offset)
{
struct rw_nbd *rwn = (struct rw_nbd *) rw;
+ uint64_t blkoffs, n;
- if (nbd_pwrite (rwn->handles.ptr[0], data, len, offset, 0) == -1) {
- fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
- exit (EXIT_FAILURE);
+ blkoffs = offset % rwn->preferred;
+
+ /* Unaligned head. */
+ if (blkoffs) {
+ n = MIN (rwn->preferred - blkoffs, len);
+ partial_write (rwn, data, n, offset);
+ data += n;
+ len -= n;
+ offset += n;
+ }
+
+ /* Aligned body. */
+ n = ROUND_DOWN (len, rwn->preferred);
+ if (n) {
+ if (nbd_pwrite (rwn->handles.ptr[0], data, n, offset, 0) == -1) {
+ fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ data += n;
+ len -= n;
+ offset += n;
}
+
+ /* Unaligned tail. */
+ if (len)
+ partial_write (rwn, data, len, offset);
}
static bool
@@ -286,15 +537,37 @@ nbd_ops_synch_zero (struct rw *rw, uint64_t offset, uint64_t count,
bool allocate)
{
struct rw_nbd *rwn = (struct rw_nbd *) rw;
+ uint64_t blkoffs, n;
if (!rwn->can_zero)
return false;
- if (nbd_zero (rwn->handles.ptr[0], count, offset,
- allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
- fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
- exit (EXIT_FAILURE);
+ blkoffs = offset % rwn->preferred;
+
+ /* Unaligned head. */
+ if (blkoffs) {
+ n = MIN (rwn->preferred - blkoffs, count);
+ partial_zero (rwn, n, offset);
+ count -= n;
+ offset += n;
}
+
+ /* Aligned body. */
+ n = ROUND_DOWN (count, rwn->preferred);
+ if (n) {
+ if (nbd_zero (rwn->handles.ptr[0], n, offset,
+ allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
+ fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ count -= n;
+ offset += n;
+ }
+
+ /* Unaligned tail. */
+ if (count)
+ partial_zero (rwn, count, offset);
+
return true;
}
@@ -320,13 +593,43 @@ nbd_ops_asynch_write (struct rw *rw,
nbd_completion_callback cb)
{
struct rw_nbd *rwn = (struct rw_nbd *) rw;
+ void *data = slice_ptr (command->slice);
+ uint64_t len = command->slice.len;
+ uint64_t offset = command->offset;
+ uint64_t blkoffs, n;
- if (nbd_aio_pwrite (rwn->handles.ptr[command->worker->index],
- slice_ptr (command->slice),
- command->slice.len, command->offset,
- cb, 0) == -1) {
- fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
- exit (EXIT_FAILURE);
+ blkoffs = offset % rwn->preferred;
+
+ /* Unaligned head. */
+ if (blkoffs) {
+ n = MIN (rwn->preferred - blkoffs, len);
+ partial_write (rwn, data, n, offset);
+ data += n;
+ len -= n;
+ offset += n;
+ }
+
+ /* Aligned body. */
+ n = ROUND_DOWN (len, rwn->preferred);
+ if (n) {
+ if (nbd_aio_pwrite (rwn->handles.ptr[command->worker->index],
+ data, n, offset, cb, 0) == -1) {
+ fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ data += n;
+ len -= n;
+ offset += n;
+ }
+
+ /* Unaligned tail. */
+ if (len)
+ partial_write (rwn, data, len, offset);
+
+ /* If we don't run the asynch command above, we must retire it ourselves. */
+ if (!n) {
+ int dummy = 0;
+ cb.callback (cb.user_data, &dummy);
}
}
@@ -335,6 +638,9 @@ nbd_ops_asynch_zero (struct rw *rw, struct command *command,
nbd_completion_callback cb, bool allocate)
{
struct rw_nbd *rwn = (struct rw_nbd *) rw;
+ uint64_t count = command->slice.len;
+ uint64_t offset = command->offset;
+ uint64_t blkoffs, n;
/* If the destination is already zero, do nothing. However we must
* call the callback so the command is freed.
@@ -348,14 +654,41 @@ nbd_ops_asynch_zero (struct rw *rw, struct command *command,
if (!rwn->can_zero)
return false;
- assert (command->slice.len <= UINT32_MAX);
+ assert (count <= UINT32_MAX);
- if (nbd_aio_zero (rwn->handles.ptr[command->worker->index],
- command->slice.len, command->offset,
- cb, allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
- fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
- exit (EXIT_FAILURE);
+ blkoffs = offset % rwn->preferred;
+
+ /* Unaligned head. */
+ if (blkoffs) {
+ n = MIN (rwn->preferred - blkoffs, count);
+ partial_zero (rwn, n, offset);
+ count -= n;
+ offset += n;
+ }
+
+ /* Aligned body. */
+ n = ROUND_DOWN (count, rwn->preferred);
+ if (n) {
+ if (nbd_aio_zero (rwn->handles.ptr[command->worker->index],
+ n, offset, cb,
+ allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
+ fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ count -= n;
+ offset += n;
}
+
+ /* Unaligned tail. */
+ if (count)
+ partial_zero (rwn, count, offset);
+
+ /* If we don't run the asynch command above, we must retire it ourselves. */
+ if (!n) {
+ int dummy = 0;
+ cb.callback (cb.user_data, &dummy);
+ }
+
return true;
}
--
2.37.0.rc2