RFC for NBD protocol extension: extended headers
by Eric Blake
In response to this mail, I will be cross-posting a series of patches
to multiple projects as a proof-of-concept implementation and request
for comments on a new NBD protocol extension, called
NBD_OPT_EXTENDED_HEADERS. With this in place, it will be possible for
clients to request 64-bit zero, trim, cache, and block status
operations when supported by the server.
Not yet complete: an implementation of this in nbdkit. I also plan to
tweak libnbd's 'nbdinfo --map' and 'nbdcopy' to take …
[View More]advantage of the
larger block status results. Also, with 64-bit commands, we may want
to also make it easier to let servers advertise an actual maximum size
they are willing to accept for the commands in question (for example,
a server may be happy with a full 64-bit block status, but still want
to limit non-fast zero and cache to a smaller limit to avoid denial of
service).
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
[View Less]
2 years, 3 months
[PATCH] always 'max' for the appliance CPU model on all targes except ppc
by Daniel P. Berrangé
The current logic for selecting CPU model to use with the appliance
selects 'max' for all architectures except for ppc64le and aarch64.
On aarch64, it selects 'host' if KVM is available which is identical
to 'max'. For TCG it selects 'cortex-a57'. The 'max' model is a
superset of 'cortex-a57', so it is reasonable to use 'max' for TCG
too.
On ppc64, it selects no CPU model due to a historical bug where
using 'host' would break ability to fallback to TCG. Unfortunately
we can't use 'max' on PPC …
[View More]as this is actually an old G4 vintage
32-bit model, rather than a synonym for 'host' / all-TCG-features
as on other targets.
We can at least simplify the code to use 'max' in all scenarios
for appliance CPU model, and simply skip a CPU model for PPC.
Signed-off-by: Daniel P. Berrangé <berrange(a)redhat.com>
---
docs/C_SOURCE_FILES | 1 -
lib/Makefile.am | 1 -
lib/appliance-cpu.c | 93 ------------------------------------------
lib/guestfs-internal.h | 3 --
lib/launch-direct.c | 25 +++++-------
lib/launch-libvirt.c | 40 ++++++++----------
po/POTFILES | 1 -
7 files changed, 27 insertions(+), 137 deletions(-)
delete mode 100644 lib/appliance-cpu.c
diff --git a/docs/C_SOURCE_FILES b/docs/C_SOURCE_FILES
index a367c93a0..a26487d99 100644
--- a/docs/C_SOURCE_FILES
+++ b/docs/C_SOURCE_FILES
@@ -283,7 +283,6 @@ lib/actions-6.c
lib/actions-support.c
lib/actions-variants.c
lib/alloc.c
-lib/appliance-cpu.c
lib/appliance-kcmdline.c
lib/appliance-uefi.c
lib/appliance.c
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 212bcb94a..40a4c9ac3 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -71,7 +71,6 @@ libguestfs_la_SOURCES = \
actions-variants.c \
alloc.c \
appliance.c \
- appliance-cpu.c \
appliance-kcmdline.c \
appliance-uefi.c \
available.c \
diff --git a/lib/appliance-cpu.c b/lib/appliance-cpu.c
deleted file mode 100644
index 54ac6e2e3..000000000
--- a/lib/appliance-cpu.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/* libguestfs
- * Copyright (C) 2009-2020 Red Hat Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * The appliance choice of CPU model.
- */
-
-#include <config.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "guestfs.h"
-#include "guestfs-internal.h"
-
-/**
- * Return the right CPU model to use as the qemu C<-cpu> parameter or
- * its equivalent in libvirt. This returns:
- *
- * =over 4
- *
- * =item C<"host">
- *
- * The literal string C<"host"> means use C<-cpu host>.
- *
- * =item C<"max">
- *
- * The literal string C<"max"> means use C<-cpu max> (the best
- * possible). This requires awkward translation for libvirt.
- *
- * =item some string
- *
- * Some string such as C<"cortex-a57"> means use C<-cpu cortex-a57>.
- *
- * =item C<NULL>
- *
- * C<NULL> means no C<-cpu> option at all. Note returning C<NULL>
- * does not indicate an error.
- *
- * =back
- *
- * This is made unnecessarily hard and fragile because of two stupid
- * choices in QEMU:
- *
- * =over 4
- *
- * =item *
- *
- * The default for C<qemu-system-aarch64 -M virt> is to emulate a
- * C<cortex-a15> (WTF?).
- *
- * =item *
- *
- * We don't know for sure if KVM will work, but C<-cpu host> is broken
- * with TCG, so we almost always pass a broken C<-cpu> flag if KVM is
- * semi-broken in any way.
- *
- * =back
- */
-const char *
-guestfs_int_get_cpu_model (int kvm)
-{
-#if defined(__aarch64__)
- /* With -M virt, the default -cpu is cortex-a15. Stupid. */
- if (kvm)
- return "host";
- else
- return "cortex-a57";
-#elif defined(__powerpc64__)
- /* See discussion in https://bugzilla.redhat.com/show_bug.cgi?id=1605071 */
- return NULL;
-#else
- /* On most architectures we can use "max" to get the best possible CPU.
- * For recent qemu this should work even on TCG.
- */
- return "max";
-#endif
-}
diff --git a/lib/guestfs-internal.h b/lib/guestfs-internal.h
index 16755cfb3..33037a26d 100644
--- a/lib/guestfs-internal.h
+++ b/lib/guestfs-internal.h
@@ -718,9 +718,6 @@ extern const char *guestfs_int_drive_protocol_to_string (enum drive_protocol pro
/* appliance.c */
extern int guestfs_int_build_appliance (guestfs_h *g, char **kernel, char **initrd, char **appliance);
-/* appliance-cpu.c */
-const char *guestfs_int_get_cpu_model (int kvm);
-
/* appliance-kcmdline.c */
extern char *guestfs_int_appliance_command_line (guestfs_h *g, const char *appliance, int flags);
#define APPLIANCE_COMMAND_LINE_IS_TCG 1
diff --git a/lib/launch-direct.c b/lib/launch-direct.c
index ff0eaeb62..f41711353 100644
--- a/lib/launch-direct.c
+++ b/lib/launch-direct.c
@@ -354,7 +354,6 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
int force_tcg;
int force_kvm;
const char *accel_val = "kvm:tcg";
- const char *cpu_model;
CLEANUP_FREE char *append = NULL;
CLEANUP_FREE_STRING_LIST char **argv = NULL;
@@ -517,20 +516,18 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
#endif
} end_list ();
- cpu_model = guestfs_int_get_cpu_model (has_kvm && !force_tcg);
- if (cpu_model) {
-#if defined(__x86_64__)
- /* Temporary workaround for RHBZ#2082806 */
- if (STREQ (cpu_model, "max")) {
- start_list ("-cpu") {
- append_list (cpu_model);
- append_list ("la57=off");
- } end_list ();
- }
- else
+ /*
+ * Can't use 'max' on ppc64 since it is an old G4 model
+ * Also can't use 'host' due to TCG fallback issues
+ * https://bugzilla.redhat.com/show_bug.cgi?id=1605071
+ */
+#if !defined(__powerpc64__)
+# if defined(__x86_64__)
+ arg ("-cpu", "max,la57=off");
+# else
+ arg ("-cpu", "max");
+# endif
#endif
- arg ("-cpu", cpu_model);
- }
if (g->smp > 1)
arg_format ("-smp", "%d", g->smp);
diff --git a/lib/launch-libvirt.c b/lib/launch-libvirt.c
index 03d69e027..b97c91566 100644
--- a/lib/launch-libvirt.c
+++ b/lib/launch-libvirt.c
@@ -1161,8 +1161,6 @@ construct_libvirt_xml_cpu (guestfs_h *g,
const struct libvirt_xml_params *params,
xmlTextWriterPtr xo)
{
- const char *cpu_model;
-
start_element ("memory") {
attribute ("unit", "MiB");
string_format ("%d", g->memsize);
@@ -1173,30 +1171,24 @@ construct_libvirt_xml_cpu (guestfs_h *g,
string_format ("%d", g->memsize);
} end_element ();
- cpu_model = guestfs_int_get_cpu_model (params->data->is_kvm);
- if (cpu_model) {
- start_element ("cpu") {
- if (STREQ (cpu_model, "host")) {
- attribute ("mode", "host-passthrough");
- start_element ("model") {
- attribute ("fallback", "allow");
- } end_element ();
- }
- else if (STREQ (cpu_model, "max")) {
- /* https://bugzilla.redhat.com/show_bug.cgi?id=1935572#c11 */
- attribute ("mode", "maximum");
+ /*
+ * Can't use 'max' on ppc64 since it is an old G4 model
+ * Also can't use 'host' due to TCG fallback issues
+ * https://bugzilla.redhat.com/show_bug.cgi?id=1605071
+ */
+#if !defined(__powerpc64__)
+ start_element ("cpu") {
+ /* https://bugzilla.redhat.com/show_bug.cgi?id=1935572#c11 */
+ attribute ("mode", "maximum");
#if defined(__x86_64__)
- /* Temporary workaround for RHBZ#2082806 */
- start_element ("feature") {
- attribute ("policy", "disable");
- attribute ("name", "la57");
- } end_element ();
-#endif
- }
- else
- single_element ("model", cpu_model);
+ /* Temporary workaround for RHBZ#2082806 */
+ start_element ("feature") {
+ attribute ("policy", "disable");
+ attribute ("name", "la57");
} end_element ();
- }
+#endif
+ } end_element ();
+#endif
single_element_format ("vcpu", "%d", g->smp);
diff --git a/po/POTFILES b/po/POTFILES
index 32b975a04..67cdffb29 100644
--- a/po/POTFILES
+++ b/po/POTFILES
@@ -329,7 +329,6 @@ lib/actions-6.c
lib/actions-support.c
lib/actions-variants.c
lib/alloc.c
-lib/appliance-cpu.c
lib/appliance-kcmdline.c
lib/appliance-uefi.c
lib/appliance.c
--
2.36.1
[View Less]
2 years, 5 months
[PATCH v3] spec: Clarify BLOCK_STATUS reply details
by Eric Blake
Our docs were inconsistent on whether a NBD_REPLY_TYPE_BLOCK_STATUS
reply chunk can exceed the client's requested length, and silent on
whether the lengths must be consistent when multiple contexts were
negotiated. Clarify this to match existing practice as implemented in
qemu-nbd. Clean up some nearby grammatical errors while at it.
---
Another round of rewording attempts, based on feedback from Rich on
v2. I went ahead and pushed patch 1 and 2 of the v2 series, as they
were less …
[View More]controversial.
doc/proto.md | 42 ++++++++++++++++++++++++++++--------------
1 file changed, 28 insertions(+), 14 deletions(-)
diff --git a/doc/proto.md b/doc/proto.md
index 8a817d2..bacccfa 100644
--- a/doc/proto.md
+++ b/doc/proto.md
@@ -882,15 +882,25 @@ The procedure works as follows:
server supports.
- During transmission, a client can then indicate interest in metadata
for a given region by way of the `NBD_CMD_BLOCK_STATUS` command,
- where *offset* and *length* indicate the area of interest. The
- server MUST then respond with the requested information, for all
- contexts which were selected during negotiation. For every metadata
- context, the server sends one set of extent chunks, where the sizes
- of the extents MUST be less than or equal to the length as specified
- in the request. Each extent comes with a *flags* field, the
- semantics of which are defined by the metadata context.
-- A server MUST reply to `NBD_CMD_BLOCK_STATUS` with a structured
- reply of type `NBD_REPLY_TYPE_BLOCK_STATUS`.
+ where *offset* and *length* indicate the area of interest. On
+ success, the server MUST respond with one structured reply chunk of
+ type `NBD_REPLY_TYPE_BLOCK_STATUS` per metadata context selected
+ during negotiation, where each reply chunk is a list of one or more
+ consecutive extents for that context. Each extent comes with a
+ *flags* field, the semantics of which are defined by the metadata
+ context.
+
+The client's requested *length* is only a hint to the server, so the
+cumulative extent length contained in a chunk of the server's reply
+may be shorter or longer the original request. When more than one
+metadata context was negotiated, the reply chunks for the different
+contexts of a single block status request need not have the same
+number of extents or cumulative extent length.
+
+In the request, the client may use the `NBD_CMD_FLAG_REQ_ONE` command
+flag to further constrain the server's reply so that each chunk
+contains exactly one extent whose length does not exceed the client's
+original *length*.
A client MUST NOT use `NBD_CMD_BLOCK_STATUS` unless it selected a
nonzero number of metadata contexts during negotiation, and used the
@@ -1778,8 +1788,8 @@ MUST initiate a hard disconnect.
*length* MUST be 4 + (a positive integer multiple of 8). This reply
represents a series of consecutive block descriptors where the sum
of the length fields within the descriptors is subject to further
- constraints documented below. This chunk type MUST appear
- exactly once per metadata ID in a structured reply.
+ constraints documented below. A successful block status request MUST
+ have exactly one status chunk per negotiated metadata context ID.
The payload starts with:
@@ -1801,15 +1811,19 @@ MUST initiate a hard disconnect.
*length* of the final extent MAY result in a sum larger than the
original requested length, if the server has that information anyway
as a side effect of reporting the status of the requested region.
+ When multiple metadata contexts are negotiated, the reply chunks for
+ the different contexts need not have the same number of extents or
+ cumulative extent length.
Even if the client did not use the `NBD_CMD_FLAG_REQ_ONE` flag in
its request, the server MAY return fewer descriptors in the reply
than would be required to fully specify the whole range of requested
information to the client, if looking up the information would be
too resource-intensive for the server, so long as at least one
- extent is returned. Servers should however be aware that most
- clients implementations will then simply ask for the next extent
- instead.
+ extent is returned. Servers should however be aware that most
+ client implementations will likely follow up with a request for
+ extent information at the first offset not covered by a
+ reduced-length reply.
All error chunk types have bit 15 set, and begin with the same
*error*, *message length*, and optional *message* fields as
--
2.35.1
[View Less]
2 years, 6 months
[libnbd PATCH 0/2] Optimize h.pread_structured
by Eric Blake
Add some more safety against a non-compliant server with structured
reads, and kill off another useless copy operation for a noticeable
speedup when using h.pread_structured. However, I'm still trying to
play with code to see if PyObject_GetItem(memoryview,
PySlice_New(start, end, NULL)) can get me the same zero-copy python
object but in a way that can survive as long as the original object
lives, rather than forcing .release() at the end of the callback.
Maybe I'll have a patch 3/2 later …
[View More]today.
Eric Blake (2):
api: Tighter checking of structured read replies
python: Optimize away copy during pread_structured
lib/internal.h | 2 +-
generator/Python.ml | 20 ++++++++++++++------
generator/states-reply-simple.c | 4 ++--
generator/states-reply-structured.c | 6 ++++--
lib/aio.c | 7 +++++--
5 files changed, 26 insertions(+), 13 deletions(-)
--
2.36.1
[View Less]
2 years, 7 months
[nbdkit PATCH v2] blocksize: Avoid losing aligned writes to RMW race
by Eric Blake
[We are still investigating if a CVE needs to be assigned.]
We have a bug where the blocksize filter can lose the effects of an
aligned write that happens in parallel with an overlapping unaligned
write. In general, a client should be extremely cautious about
overlapping writes; it is not NBD's fault which of the two writes
lands on the disk (or even if both writes land partially, at
block-level granularities on which write landed last). However, a
client should still be able to assume that …
[View More]even when two writes
partially overlap, the final state of the disk of the non-overlapping
portion will be that of the one write explicitly touching that
portion, rather than stale data from before either write touched that
block.
Fix the bug by switching the blocksize filter locking from a mutex to
a rwlock. We still need mutual exclusion during all unaligned
operations (whether reading or writing to the plugin), because we
share the same bounce buffer among all such code, which we get via the
mutual exclusion of wrlock. But we now also add in shared exclusion
for aligned write-like operations (pwrite, trim, zero); these can
operate in parallel with one another, but must not be allowed to fall
in the window between when an overlapping unaligned operation has read
from the plugin but not yet written, so that we do not end up
re-writing stale contents that undo the portion of the aligned write
that was outside the unaligned region [it's ironic that we only need
this protection on write I/O, but get it by using the rdlock feature
of the rwlock]. Read-like operations (pread, extents, cache) don't
need to be protected from RMW operations, because it is already the
user's problem if they execute parallel overlapping reads while
modifying the same portion of the image; and they will still end up
seeing either the state before or after the unaligned write,
indistinguishable from if we had added more locking.
Whether this lock favors rdlock (aka aligned pwrite) or wrlock (aka
unaligned access) is implementation-defined by the pthread
implementation; if we need to be more precise, we'll have to introduce
our own rwlock implementation on top of lower-level primitives
(https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock mentions
the use of two mutex to favor readers, or a condvar/mutex to favor
writers). This is also not very fine-grained; it may turn out that we
could get more performance if instead of locking the entire operation
(across all clients, and regardless of whether the offsets overlap),
we instead just lock metadata and then track whether a new operation
overlaps with an existing unaligned operation, or even add in support
for parallel unaligned operations by having more than one bounce
buffer. But if performance truly matters, you're better off fixing
the client to do aligned access in the first place, rather than
needing the blocksize filter.
Add a test that fails without the change to blocksize.c. With some
carefully timed setups (using the delay filter to stall writes
reaching the plugin, and the plugin itself to stall reads coming back
from the plugin, so that we are reasonably sure of the overall
timeline), we can demonstrate the bug present in the unpatched code,
where an aligned write is lost when it lands in the wrong part of an
unaligned RMW cycle; the fixed code instead shows that the overlapping
operations were serialized. We may need to further tweak the test to
be more reliable even under heavy load, but hopefully 2 seconds per
stall (where a successful test requires 18 seconds) is sufficient for
now.
Reported-by: Nikolaus Rath <Nikolaus(a)rath.org>
Fixes: 1aadbf9971 ("blocksize: Allow parallel requests", v1.25.3)
---
In v2:
- Implement the blocksize fix.
- Drop RFC; I think this is ready, other than determining if we want
to tag it with a CVE number.
- Improve the test: print out more details before assertions, to aid
in debugging if it ever dies under CI resources
tests/Makefile.am | 2 +
filters/blocksize/blocksize.c | 26 +++--
tests/test-blocksize-sharding.sh | 168 +++++++++++++++++++++++++++++++
3 files changed, 187 insertions(+), 9 deletions(-)
create mode 100755 tests/test-blocksize-sharding.sh
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b6d30c0a..67e7618b 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1376,11 +1376,13 @@ TESTS += \
test-blocksize.sh \
test-blocksize-extents.sh \
test-blocksize-default.sh \
+ test-blocksize-sharding.sh \
$(NULL)
EXTRA_DIST += \
test-blocksize.sh \
test-blocksize-extents.sh \
test-blocksize-default.sh \
+ test-blocksize-sharding.sh \
$(NULL)
# blocksize-policy filter test.
diff --git a/filters/blocksize/blocksize.c b/filters/blocksize/blocksize.c
index 03da4971..1c81d5e3 100644
--- a/filters/blocksize/blocksize.c
+++ b/filters/blocksize/blocksize.c
@@ -51,10 +51,15 @@
#define BLOCKSIZE_MIN_LIMIT (64U * 1024)
-/* In order to handle parallel requests safely, this lock must be held
- * when using the bounce buffer.
+/* Lock in order to handle overlapping requests safely.
+ *
+ * Grabbed for exclusive access (wrlock) when using the bounce buffer.
+ *
+ * Grabbed for shared access (rdlock) when doing aligned writes.
+ * These can happen in parallel with one another, but must not land in
+ * between the read and write of an unaligned RMW operation.
*/
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
/* A single bounce buffer for alignment purposes, guarded by the lock.
* Size it to the maximum we allow for minblock.
@@ -255,7 +260,7 @@ blocksize_pread (nbdkit_next *next,
/* Unaligned head */
if (offs & (h->minblock - 1)) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&lock);
drop = offs & (h->minblock - 1);
keep = MIN (h->minblock - drop, count);
if (next->pread (next, bounce, h->minblock, offs - drop, flags, err) == -1)
@@ -278,7 +283,7 @@ blocksize_pread (nbdkit_next *next,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&lock);
if (next->pread (next, bounce, h->minblock, offs, flags, err) == -1)
return -1;
memcpy (buf, bounce, count);
@@ -306,7 +311,7 @@ blocksize_pwrite (nbdkit_next *next,
/* Unaligned head */
if (offs & (h->minblock - 1)) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&lock);
drop = offs & (h->minblock - 1);
keep = MIN (h->minblock - drop, count);
if (next->pread (next, bounce, h->minblock, offs - drop, 0, err) == -1)
@@ -321,6 +326,7 @@ blocksize_pwrite (nbdkit_next *next,
/* Aligned body */
while (count >= h->minblock) {
+ ACQUIRE_RDLOCK_FOR_CURRENT_SCOPE (&lock);
keep = MIN (h->maxdata, ROUND_DOWN (count, h->minblock));
if (next->pwrite (next, buf, keep, offs, flags, err) == -1)
return -1;
@@ -331,7 +337,7 @@ blocksize_pwrite (nbdkit_next *next,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&lock);
if (next->pread (next, bounce, h->minblock, offs, 0, err) == -1)
return -1;
memcpy (bounce, buf, count);
@@ -371,6 +377,7 @@ blocksize_trim (nbdkit_next *next,
/* Aligned body */
while (count) {
+ ACQUIRE_RDLOCK_FOR_CURRENT_SCOPE (&lock);
keep = MIN (h->maxlen, count);
if (next->trim (next, keep, offs, flags, err) == -1)
return -1;
@@ -413,7 +420,7 @@ blocksize_zero (nbdkit_next *next,
/* Unaligned head */
if (offs & (h->minblock - 1)) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&lock);
drop = offs & (h->minblock - 1);
keep = MIN (h->minblock - drop, count);
if (next->pread (next, bounce, h->minblock, offs - drop, 0, err) == -1)
@@ -428,6 +435,7 @@ blocksize_zero (nbdkit_next *next,
/* Aligned body */
while (count >= h->minblock) {
+ ACQUIRE_RDLOCK_FOR_CURRENT_SCOPE (&lock);
keep = MIN (h->maxlen, ROUND_DOWN (count, h->minblock));
if (next->zero (next, keep, offs, flags, err) == -1)
return -1;
@@ -437,7 +445,7 @@ blocksize_zero (nbdkit_next *next,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&lock);
if (next->pread (next, bounce, h->minblock, offs, 0, err) == -1)
return -1;
memset (bounce, 0, count);
diff --git a/tests/test-blocksize-sharding.sh b/tests/test-blocksize-sharding.sh
new file mode 100755
index 00000000..177668ed
--- /dev/null
+++ b/tests/test-blocksize-sharding.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+# nbdkit
+# Copyright (C) 2018-2022 Red Hat Inc.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# * Neither the name of Red Hat nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+
+# Demonstrate a fix for a bug where blocksize could lose aligned writes
+# run in parallel with unaligned writes
+
+source ./functions.sh
+set -e
+set -x
+
+requires_plugin eval
+requires_nbdsh_uri
+
+files='blocksize-sharding.img blocksize-sharding.tmp'
+rm -f $files
+cleanup_fn rm -f $files
+
+# Script a server that requires 16-byte aligned requests, and which delays
+# 4s after reads if a witness file exists. Couple it with the delay filter
+# that delays 2s before writes. If an unaligned and aligned write overlap,
+# and can execute in parallel, we would have this timeline:
+#
+# T1 aligned write 1's to 0/16 T2 unaligned write 2's to 4/12
+#t=0 blocksize: next->pwrite(0, 16) blocksize: next->pread(0, 16)
+# delay: wait 2s delay: next->pread(0, 16)
+# ... eval: read 0's, wait 4s
+#t=2 delay: next->pwrite(0, 16) ...
+# eval: write 1's ...
+# return ...
+#t=4 return 0's (now stale)
+# blocksize: next->pwrite(0, 16)
+# delay: wait 2s
+#t=6 delay: next->pwrite(0, 16)
+# eval: write stale RMW buffer
+#
+# leaving us with a sharded 0000222222222222 (T1's write is lost).
+# But as long as the blocksize filter detects the overlap, we should end
+# up with either 1111222222222222 (aligned write completed first), or with
+# 1111111111111111 (unaligned write completed first), either taking 8s,
+# but with no sharding.
+#
+# We also need an nbdsh script that kicks off parallel writes.
+export script='
+import os
+import time
+
+witness = os.getenv("witness")
+
+def touch(path):
+ open(path, "a").close()
+
+# First pass: check that two aligned operations work in parallel
+# Total time should be closer to 2 seconds, rather than 4 if serialized
+print("sanity check")
+ba1 = bytearray(b"1"*16)
+ba2 = bytearray(b"2"*16)
+buf1 = nbd.Buffer.from_bytearray(ba1)
+buf2 = nbd.Buffer.from_bytearray(ba2)
+touch(witness)
+start_t = time.time()
+h.aio_pwrite(buf1, 0)
+h.aio_pwrite(buf2, 0)
+
+while h.aio_in_flight() > 0:
+ h.poll(-1)
+end_t = time.time()
+os.unlink(witness)
+
+out = h.pread(16,0)
+print(out)
+t = end_t - start_t
+print(t)
+assert out in [b"1"*16, b"2"*16]
+assert t >= 2 and t <= 3
+
+# Next pass: try to kick off unaligned first
+print("unaligned first")
+h.zero(16, 0)
+ba3 = bytearray(b"3"*12)
+ba4 = bytearray(b"4"*16)
+buf3 = nbd.Buffer.from_bytearray(ba3)
+buf4 = nbd.Buffer.from_bytearray(ba4)
+touch(witness)
+start_t = time.time()
+h.aio_pwrite(buf3, 4)
+h.aio_pwrite(buf4, 0)
+
+while h.aio_in_flight() > 0:
+ h.poll(-1)
+end_t = time.time()
+os.unlink(witness)
+
+out = h.pread(16,0)
+print(out)
+t = end_t - start_t
+print(t)
+assert out in [b"4"*4 + b"3"*12, b"4"*16]
+assert t >= 8
+
+# Next pass: try to kick off aligned first
+print("aligned first")
+ba5 = bytearray(b"5"*16)
+ba6 = bytearray(b"6"*12)
+buf5 = nbd.Buffer.from_bytearray(ba5)
+buf6 = nbd.Buffer.from_bytearray(ba6)
+h.zero(16, 0)
+touch(witness)
+start_t = time.time()
+h.aio_pwrite(buf5, 0)
+h.aio_pwrite(buf6, 4)
+
+while h.aio_in_flight() > 0:
+ h.poll(-1)
+end_t = time.time()
+os.unlink(witness)
+
+out = h.pread(16,0)
+print(out)
+t = end_t - start_t
+print(t)
+assert out in [b"5"*4 + b"6"*12, b"5"*16]
+assert t >= 8
+'
+
+# Now run everything
+truncate --size=16 blocksize-sharding.img
+export witness="$PWD/blocksize-sharding.tmp"
+nbdkit -U - --filter=blocksize --filter=delay eval delay-write=2 \
+ config='ln -sf "$(realpath "$3")" $tmpdir/$2' \
+ img="$PWD/blocksize-sharding.img" tmp="$PWD/blocksize-sharding.tmp" \
+ get_size='echo 16' block_size='echo 16 64K 1M' \
+ thread_model='echo parallel' \
+ zero='dd if=/dev/zero of=$tmpdir/img skip=$4 count=$3 \
+ iflag=count_bytes,skip_bytes' \
+ pread='
+ dd if=$tmpdir/img skip=$4 count=$3 iflag=count_bytes,skip_bytes
+ if [ -f $tmpdir/tmp ]; then sleep 4; fi ' \
+ pwrite='dd of=$tmpdir/img seek=$4 conv=notrunc oflag=seek_bytes' \
+ --run 'nbdsh -u "$uri" -c "$script"'
--
2.36.1
[View Less]
2 years, 7 months
[nbdkit PATCH] linuxdisk: Reduce size of test
by Eric Blake
Using all of ../plugins as the contents for the linuxdisk gets
progressively bigger over time with incremental builds. Before I
cleaned it up, my plugins/rust/target/debug had accumulated over 6G of
cruft, causing test-linuxdisk-copy-out.sh to fail due to ENOSPC during
mke2fs. Pick a smaller directory tree to export, but still test that
we expose a subdirectory.
Fixes: 5da0f2fc9 ("Add linuxdisk plugin.")
---
plugins/linuxdisk/subdir/.gitignore | 0
tests/test-linuxdisk-copy-out.sh | 18 +…
[View More]++++++++---------
2 files changed, 9 insertions(+), 9 deletions(-)
create mode 100644 plugins/linuxdisk/subdir/.gitignore
diff --git a/plugins/linuxdisk/subdir/.gitignore b/plugins/linuxdisk/subdir/.gitignore
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test-linuxdisk-copy-out.sh b/tests/test-linuxdisk-copy-out.sh
index 3a38fb58..f926a55c 100755
--- a/tests/test-linuxdisk-copy-out.sh
+++ b/tests/test-linuxdisk-copy-out.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
# nbdkit
-# Copyright (C) 2018-2020 Red Hat Inc.
+# Copyright (C) 2018-2022 Red Hat Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -49,25 +49,25 @@ cleanup_fn rm -f $files
nbdkit -f -v -U - \
--filter=partition \
- linuxdisk $srcdir/../plugins partition=1 label=ROOT \
+ linuxdisk $srcdir/../plugins/linuxdisk partition=1 label=ROOT \
--run 'nbdcopy "$uri" linuxdisk-copy-out.img'
# Check the disk content.
guestfish --ro -a linuxdisk-copy-out.img -m /dev/sda <<EOF
# Check some known files and directories exist.
ll /
- ll /linuxdisk
- is-dir /linuxdisk
- is-file /linuxdisk/Makefile.am
+ ll /subdir
+ is-dir /subdir
+ is-file /Makefile.am
# This reads out all the directory entries and all file contents.
tar-out / - | cat >/dev/null
# Download some files and compare to local copies.
- download /linuxdisk/Makefile linuxdisk-copy-out.test1
- download /linuxdisk/Makefile.am linuxdisk-copy-out.test2
- download /linuxdisk/nbdkit-linuxdisk-plugin.pod linuxdisk-copy-out.test3
- download /linuxdisk/filesystem.c linuxdisk-copy-out.test4
+ download /Makefile linuxdisk-copy-out.test1
+ download /Makefile.am linuxdisk-copy-out.test2
+ download /nbdkit-linuxdisk-plugin.pod linuxdisk-copy-out.test3
+ download /filesystem.c linuxdisk-copy-out.test4
EOF
# Compare downloaded files to local versions.
--
2.36.1
[View Less]
2 years, 7 months
[libnbd PATCH] python: Speed up pread
by Eric Blake
Instead of malloc'ing a C buffer, nbd_pread()ing into it, then copying
it into an immutable Python bytes object, we can instead pre-create a
correctly-sized Python bytearray object, then nbd_pread() directly
into that object's underlying bytes.
While the data copying might not be the bottleneck compared to the
networking costs, it does have noticeable results; on my machine:
$ export script='
m=1024*1024
size=h.get_size()
for i in range(size // m):
buf = h.pread(m, m*i)
'
$ time ./run …
[View More]nbdkit -U - pattern 10G --run 'nbdsh -u $uri -c "$script"'
sped up from about 7.8s pre-patch to about 7.1s post-patch,
approximately a 10% speedup.
Note that this slightly changes the python API: h.pread[_structured]
now returns a mutable 'bytearray' object, rather than an immutable
'bytes' object. This makes it possible to modify the just-read string
in place, rather than having to create yet another memory buffer for
any modifications, which offers even more speedups when writing a
read-modify-write paradigm in python. But the change is
backwards-compatible - python already states that a read-write buffer
may be returned instead of readonly for any client that already
operated only on a buffer in a readonly manner.
---
Note that h.pread is more like Python read() semantics in creating a
buffer, while h.aio_pread is more like Python readinto() semantics in
modifying a passed-in buffer. But now that both code paths have a
python object prior to calling into the C API, my next task is to
improve the h.*pread_structured callback function to pass its buffer
as a slice of the Python input buffer, rather than doing yet another
round of pointless memcpy from C into python objects.
generator/Python.ml | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/generator/Python.ml b/generator/Python.ml
index 4ab18f6..1c4446e 100644
--- a/generator/Python.ml
+++ b/generator/Python.ml
@@ -1,6 +1,6 @@
(* hey emacs, this is OCaml code: -*- tuareg -*- *)
(* nbd client library in userspace: Python bindings
- * Copyright (C) 2013-2021 Red Hat Inc.
+ * Copyright (C) 2013-2022 Red Hat Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -293,7 +293,7 @@ let
| BytesIn (n, _) ->
pr " Py_buffer %s = { .obj = NULL };\n" n
| BytesOut (n, count) ->
- pr " char *%s = NULL;\n" n;
+ pr " PyObject *%s = NULL;\n" n;
pr " Py_ssize_t %s;\n" count
| BytesPersistIn (n, _)
| BytesPersistOut (n, _) ->
@@ -432,8 +432,8 @@ let
| Bool _ -> ()
| BytesIn _ -> ()
| BytesOut (n, count) ->
- pr " %s = malloc (%s);\n" n count;
- pr " if (%s == NULL) { PyErr_NoMemory (); goto out; }\n" n
+ pr " %s = PyByteArray_FromStringAndSize (NULL, %s);\n" n count;
+ pr " if (%s == NULL) goto out;\n" n
| BytesPersistIn (n, _) | BytesPersistOut (n, _) ->
pr " %s_buf = nbd_internal_py_get_aio_buffer (%s);\n" n n;
pr " if (!%s_buf) goto out;\n" n;
@@ -479,7 +479,7 @@ let
function
| Bool n -> pr ", %s" n
| BytesIn (n, _) -> pr ", %s.buf, %s.len" n n
- | BytesOut (n, count) -> pr ", %s, %s" n count
+ | BytesOut (n, count) -> pr ", PyByteArray_AS_STRING (%s), %s" n count
| BytesPersistIn (n, _)
| BytesPersistOut (n, _) -> pr ", %s_buf->data, %s_buf->len" n n
| Closure { cbname } -> pr ", %s" cbname
@@ -524,8 +524,9 @@ let
let use_ret = ref true in
List.iter (
function
- | BytesOut (n, count) ->
- pr " py_ret = PyBytes_FromStringAndSize (%s, %s);\n" n count;
+ | BytesOut (n, _) ->
+ pr " py_ret = %s;\n" n;
+ pr " %s = NULL;\n" n;
use_ret := false
| Bool _
| BytesIn _
@@ -572,7 +573,7 @@ let
| BytesIn (n, _) ->
pr " if (%s.obj)\n" n;
pr " PyBuffer_Release (&%s);\n" n
- | BytesOut (n, _) -> pr " free (%s);\n" n
+ | BytesOut (n, _) -> pr " Py_XDECREF (%s);\n" n
| BytesPersistIn _ | BytesPersistOut _ -> ()
| Closure { cbname } ->
pr " free_user_data (%s_user_data);\n" cbname
--
2.36.1
[View Less]
2 years, 7 months