On Fri, Jan 28, 2022 at 10:37 PM Richard W.M. Jones <rjones(a)redhat.com> wrote:
For destinations, especially NBD, which have a preferred block size,
adjust our write strategy so we always write whole blocks / zeroes of
the preferred size.
---
 copy/file-ops.c             |  8 ++++
 copy/main.c                 | 82 ++++++++++++++++++++++++++++++++++++-
 copy/multi-thread-copying.c |  2 +
 copy/nbd-ops.c              | 21 ++++++++++
 copy/nbdcopy.h              |  5 +++
 copy/null-ops.c             |  7 ++++
 copy/pipe-ops.c             |  7 ++++
 copy/synch-copying.c        |  2 +
 8 files changed, 132 insertions(+), 2 deletions(-)
diff --git a/copy/file-ops.c b/copy/file-ops.c
index 847043417..3daf55484 100644
--- a/copy/file-ops.c
+++ b/copy/file-ops.c
@@ -369,6 +369,13 @@ file_is_read_only (struct rw *rw)
return false;
}
+static uint64_t
+file_get_preferred_block_size (struct rw *rw)
+{
+ /* XXX Could return the filesystem block size here. */
+ return 4096;
+}
+
static bool
file_can_extents (struct rw *rw)
{
@@ -726,6 +733,7 @@ static struct rw_ops file_ops = {
.ops_name = "file_ops",
.close = file_close,
.is_read_only = file_is_read_only,
+ .get_preferred_block_size = file_get_preferred_block_size,
.can_extents = file_can_extents,
.can_multi_conn = file_can_multi_conn,
.start_multi_conn = file_start_multi_conn,
diff --git a/copy/main.c b/copy/main.c
index 67788b5d9..82537bafc 100644
--- a/copy/main.c
+++ b/copy/main.c
@@ -38,12 +38,15 @@
#include <libnbd.h>
+#include "isaligned.h"
#include "ispowerof2.h"
#include "human-size.h"
+#include "rounding.h"
#include "version.h"
#include "nbdcopy.h"
bool allocated; /* --allocated flag */
+uint64_t blocksize; /* destination block size */
unsigned connections = 4; /* --connections */
bool destination_is_zero; /* --destination-is-zero flag */
bool extents = true; /* ! --no-extents flag */
@@ -367,6 +370,29 @@ main (int argc, char *argv[])
if (threads < connections)
connections = threads;
+ /* Check the destination preferred block size. */
+ blocksize = dst->ops->get_preferred_block_size (dst);
+ if (!is_power_of_2 (blocksize)) {
+ fprintf (stderr, "%s: error: "
+ "destination block size is not a power of 2: %" PRIu64
"\n",
+ prog, blocksize);
+ exit (EXIT_FAILURE);
+ }
+ if (request_size < blocksize) {
+ fprintf (stderr, "%s: error: "
+ "request size (%u) < destination block size (%" PRIu64
"), "
+ "use the --request-size option to pick a larger request size"
+ "\n",
+ prog, request_size, blocksize);
+ exit (EXIT_FAILURE);
+ }
+ if (blocksize > THREAD_WORK_SIZE) {
+ fprintf (stderr, "%s: error: "
+ "destination block size is too large for nbdcopy: %" PRIu64
"\n",
+ prog, blocksize);
+ exit (EXIT_FAILURE);
+ }
+
/* Truncate the destination to the same size as the source. Only
* has an effect on regular files.
*/
@@ -392,9 +418,10 @@ main (int argc, char *argv[])
print_rw (src, "nbdcopy: src", stderr);
print_rw (dst, "nbdcopy: dst", stderr);
fprintf (stderr, "nbdcopy: connections=%u requests=%u threads=%u "
- "synchronous=%s\n",
+ "synchronous=%s blocksize=%" PRIu64 "\n",
connections, max_requests, threads,
- synchronous ? "true" : "false");
+ synchronous ? "true" : "false",
+ blocksize);
}
/* If multi-conn is enabled on either side, then at this point we
@@ -537,6 +564,57 @@ default_get_extents (struct rw *rw, uintptr_t index,
}
}
+/* Reading extents from the source drives writes and zeroes to the
+ * destination. We need to adjust the extents list so that any
+ * extents smaller than the destination block size are eliminated.
+ */
+void
+adjust_extents_for_blocksize (extent_list *exts)
+{
+ size_t i, j;
+ uint64_t end;
+
+ if (blocksize <= 1)
+ return;
+
+ /* Note we iterate [0..len-2] because the very last extent is
+ * permitted to be shorter than a whole block.
+ */
+ for (i = 0; i < exts->len - 1; ++i) {
+ assert (IS_ALIGNED (exts->ptr[i].offset, blocksize));
+ if (IS_ALIGNED (exts->ptr[i].length, blocksize))
+ continue;
+
+ if (!exts->ptr[i].zero) /* data */ {
+ /* Data extent: round the length up to the next block, delete
+ * any completely overlapped extents, and if still needed move
+ * up the start of the next extent.
+ */
+ exts->ptr[i].length = ROUND_UP (exts->ptr[i].length, blocksize);
+ end = exts->ptr[i].offset + exts->ptr[i].length;
+
+ for (j = i+1; j < exts->len && exts->ptr[j].offset < end; ++j) {
+ if (exts->ptr[j].offset + exts->ptr[j].length <= end)
+ extent_list_remove (exts, j);
+ if (exts->ptr[j].offset < end)
+ exts->ptr[j].offset = end;
+ }
+ }
+ else {
+ /* Zero extent: Round the length down to blocksize, and if now
+ * zero-length, remove it.
+ */
+ exts->ptr[i].length = ROUND_DOWN (exts->ptr[i].length, blocksize);
+ if (i+1 < exts->len)
+ exts->ptr[i+1].offset = exts->ptr[i].offset + exts->ptr[i].length;
+ if (exts->ptr[i].length == 0) {
+ extent_list_remove (exts, i);
+ i--;
+ }
+ }
+ }
+}
I think this will be simpler and more efficient if we add a function
that returns the next extent to handle from the list. Something like:
https://gitlab.com/nbdkit/libnbd/-/blob/master/examples/copy-libev.c#L308
This way we don't need to process the entire list multiple times or
remove extents from the list.
It is also easy to test such a function by feeding it an extent array
and checking that it returns the right extent "stream".
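Untested sketch of what I have in mind, reusing the struct extent
fields and the ROUND_UP/ROUND_DOWN macros from this patch; the cursor
struct and function names are made up:

/* Iterate over the extents list without modifying it, merging or
 * clipping extents on the fly so that everything except the final
 * extent is a multiple of blocksize.  pos must be initialized to the
 * offset of the first extent.
 */
struct extent_cursor {
  extent_list *exts;
  size_t i;                         /* next extent in the list */
  uint64_t pos;                     /* next offset to hand out */
};

static bool
next_extent (struct extent_cursor *c, uint64_t blocksize,
             struct extent *ret)
{
  uint64_t end;
  bool zero;

  for (;;) {
    /* Skip extents wholly covered by a previous rounded-up extent. */
    while (c->i < c->exts->len &&
           c->exts->ptr[c->i].offset + c->exts->ptr[c->i].length <= c->pos)
      c->i++;
    if (c->i >= c->exts->len)
      return false;

    zero = c->exts->ptr[c->i].zero;
    end = c->exts->ptr[c->i].offset + c->exts->ptr[c->i].length;
    if (c->i + 1 < c->exts->len) {  /* final extent may stay short */
      if (zero)
        end = ROUND_DOWN (end, blocksize);
      else
        end = ROUND_UP (end, blocksize);
    }
    if (end > c->pos)
      break;
    c->i++;                         /* extent vanished after rounding */
  }

  ret->offset = c->pos;
  ret->length = end - c->pos;
  ret->zero = zero;
  c->pos = end;
  c->i++;
  return true;
}

The callers would then loop on next_extent () instead of indexing
exts directly.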
+
/* Implementations of get_polling_fd and asynch_notify_* for backends
* which don't support polling.
*/
diff --git a/copy/multi-thread-copying.c b/copy/multi-thread-copying.c
index b17ca598b..b436ba779 100644
--- a/copy/multi-thread-copying.c
+++ b/copy/multi-thread-copying.c
@@ -154,6 +154,8 @@ worker_thread (void *indexp)
else
default_get_extents (src, index, offset, count, &exts);
+ adjust_extents_for_blocksize (&exts);
+
for (i = 0; i < exts.len; ++i) {
struct command *command;
struct buffer *buffer;
diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c
index dfc62f203..1134ddd4e 100644
--- a/copy/nbd-ops.c
+++ b/copy/nbd-ops.c
@@ -195,6 +195,26 @@ nbd_ops_is_read_only (struct rw *rw)
return false;
}
+static uint64_t
+nbd_ops_get_preferred_block_size (struct rw *rw)
+{
+ struct rw_nbd *rwn = (struct rw_nbd *) rw;
+ struct nbd_handle *nbd;
+ int64_t r;
+
+ if (rwn->handles.len == 0)
+ return 4096;
+
+ nbd = rwn->handles.ptr[0];
+ r = nbd_get_block_size (nbd, LIBNBD_SIZE_PREFERRED);
+ if (r == -1) {
+ fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+
+ return r == 0 ? 4096 : (uint64_t) r;
+}
+
static bool
nbd_ops_can_extents (struct rw *rw)
{
@@ -491,6 +511,7 @@ static struct rw_ops nbd_ops = {
.ops_name = "nbd_ops",
.close = nbd_ops_close,
.is_read_only = nbd_ops_is_read_only,
+ .get_preferred_block_size = nbd_ops_get_preferred_block_size,
Why preferred block size and not minimum block size? For example, if we
write 256k when the minimum block size is 64k, wouldn't the qemu block
layer handle the write properly, creating 4 compressed clusters?
When not using a compress filter, qemu-nbd reports a block size of 4k,
and using this value will kill performance.
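If we go with the minimum, the change would be trivial; an untested
sketch mirroring the function above, using the existing
LIBNBD_SIZE_MINIMUM query:

static uint64_t
nbd_ops_get_minimum_block_size (struct rw *rw)
{
  struct rw_nbd *rwn = (struct rw_nbd *) rw;
  int64_t r;

  if (rwn->handles.len == 0)
    return 1;

  r = nbd_get_block_size (rwn->handles.ptr[0], LIBNBD_SIZE_MINIMUM);
  if (r == -1) {
    fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
    exit (EXIT_FAILURE);
  }

  /* Servers which do not advertise block sizes report 0. */
  return r == 0 ? 1 : (uint64_t) r;
}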
.can_extents = nbd_ops_can_extents,
.can_multi_conn = nbd_ops_can_multi_conn,
.start_multi_conn = nbd_ops_start_multi_conn,
diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h
index e7fe1eab2..c6505a1d4 100644
--- a/copy/nbdcopy.h
+++ b/copy/nbdcopy.h
@@ -120,6 +120,9 @@ struct rw_ops {
/* Return true if this is a read-only connection. */
bool (*is_read_only) (struct rw *rw);
+ /* Read the preferred block size. */
+ uint64_t (*get_preferred_block_size) (struct rw *rw);
+
/* For source only, does it support reading extents? */
bool (*can_extents) (struct rw *rw);
@@ -206,12 +209,14 @@ struct rw_ops {
extern void default_get_extents (struct rw *rw, uintptr_t index,
uint64_t offset, uint64_t count,
extent_list *ret);
+extern void adjust_extents_for_blocksize (extent_list *exts);
extern void get_polling_fd_not_supported (struct rw *rw, uintptr_t index,
int *fd_rtn, int *direction_rtn);
extern void asynch_notify_read_write_not_supported (struct rw *rw,
uintptr_t index);
extern bool allocated;
+extern uint64_t blocksize;
extern unsigned connections;
extern bool destination_is_zero;
extern bool extents;
diff --git a/copy/null-ops.c b/copy/null-ops.c
index a38666d62..5e87f3502 100644
--- a/copy/null-ops.c
+++ b/copy/null-ops.c
@@ -66,6 +66,12 @@ null_is_read_only (struct rw *rw)
return false;
}
+static uint64_t
+null_get_preferred_block_size (struct rw *rw)
+{
+ return 1;
+}
+
static bool
null_can_extents (struct rw *rw)
{
@@ -154,6 +160,7 @@ static struct rw_ops null_ops = {
.ops_name = "null_ops",
.close = null_close,
.is_read_only = null_is_read_only,
+ .get_preferred_block_size = null_get_preferred_block_size,
.can_extents = null_can_extents,
.can_multi_conn = null_can_multi_conn,
.start_multi_conn = null_start_multi_conn,
diff --git a/copy/pipe-ops.c b/copy/pipe-ops.c
index f9b8599a3..f2a1f0090 100644
--- a/copy/pipe-ops.c
+++ b/copy/pipe-ops.c
@@ -72,6 +72,12 @@ pipe_is_read_only (struct rw *rw)
return false;
}
+static uint64_t
+pipe_get_preferred_block_size (struct rw *rw)
+{
+ return 1;
+}
+
static bool
pipe_can_extents (struct rw *rw)
{
@@ -165,6 +171,7 @@ static struct rw_ops pipe_ops = {
.close = pipe_close,
.is_read_only = pipe_is_read_only,
+ .get_preferred_block_size = pipe_get_preferred_block_size,
.can_extents = pipe_can_extents,
.can_multi_conn = pipe_can_multi_conn,
.start_multi_conn = pipe_start_multi_conn,
diff --git a/copy/synch-copying.c b/copy/synch-copying.c
index c9899c343..8d6c4d3d0 100644
--- a/copy/synch-copying.c
+++ b/copy/synch-copying.c
@@ -70,6 +70,8 @@ synch_copying (void)
else
default_get_extents (src, 0, offset, count, &exts);
+ adjust_extents_for_blocksize (&exts);
+
for (i = 0; i < exts.len; ++i) {
assert (exts.ptr[i].length <= count);
--
2.32.0