This commit implements the core NBD protocol for "base:allocation"
Block Status: negotiating the "base:allocation" meta context through
NBD_OPT_SET_META_CONTEXT / NBD_OPT_LIST_META_CONTEXT, and answering
NBD_CMD_BLOCK_STATUS requests with structured replies containing
NBD_REPLY_TYPE_BLOCK_STATUS block descriptors.
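
As an illustration (not part of the patch), the descriptors that the
new send_structured_reply_block_status path would emit for a disk
whose first 64K is allocated data and whose next 64K is a hole that
reads as zeroes might look like this, assuming the NBD spec values
NBD_STATE_HOLE = 1 and NBD_STATE_ZERO = 2 (these constants are not
defined by this patch; the plugin extent flags are deliberately chosen
to have the same binary values and are simply masked through):

  /* Hypothetical example only. */
  struct block_descriptor example[2] = {
    { .length = 65536, .status_flags = 0 },     /* allocated data */
    { .length = 65536, .status_flags = 1 | 2 }, /* hole, reads as zeroes */
  };

On the wire these descriptors follow the 32 bit context ID
(base_allocation_id) inside a single NBD_REPLY_TYPE_BLOCK_STATUS
structured reply chunk, with all fields converted to big endian.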
---
server/internal.h | 7 +
server/protocol.h | 17 +-
server/protocol-handshake-newstyle.c | 79 ++++++++-
server/protocol.c | 248 +++++++++++++++++++++++++--
4 files changed, 335 insertions(+), 16 deletions(-)
diff --git a/server/internal.h b/server/internal.h
index 825dd3e..03d6119 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -183,6 +183,7 @@ struct connection {
bool can_multi_conn;
bool using_tls;
bool structured_replies;
+ bool meta_context_base_allocation;
int sockin, sockout;
connection_recv_function recv;
@@ -219,6 +220,12 @@ extern int protocol_handshake_newstyle (struct connection *conn)
extern int protocol_recv_request_send_reply (struct connection *conn)
__attribute__((__nonnull__ (1)));
+/* The context ID of base:allocation. As far as I can tell it doesn't
+ * matter what this is as long as nbdkit always returns the same
+ * number.
+ */
+#define base_allocation_id 1
+
/* crypto.c */
#define root_tls_certificates_dir sysconfdir "/pki/" PACKAGE_NAME
extern void crypto_init (bool tls_set_on_cli);
diff --git a/server/protocol.h b/server/protocol.h
index 4fe3c75..a7de2f0 100644
--- a/server/protocol.h
+++ b/server/protocol.h
@@ -112,6 +112,7 @@ extern const char *name_of_nbd_rep (int);
#define NBD_REP_ACK 1
#define NBD_REP_SERVER 2
#define NBD_REP_INFO 3
+#define NBD_REP_META_CONTEXT 4
#define NBD_REP_ERR_UNSUP 0x80000001
#define NBD_REP_ERR_POLICY 0x80000002
#define NBD_REP_ERR_INVALID 0x80000003
@@ -128,6 +129,18 @@ struct fixed_new_option_reply_info_export {
uint16_t eflags; /* per-export flags */
} __attribute__((packed));
+/* NBD_REP_META_CONTEXT reply (follows fixed_new_option_reply). */
+struct fixed_new_option_reply_meta_context {
+ uint32_t context_id; /* metadata context ID */
+ /* followed by a string */
+} __attribute__((packed));
+
+/* NBD_REPLY_TYPE_BLOCK_STATUS block descriptor. */
+struct block_descriptor {
+ uint32_t length; /* length of block */
+ uint32_t status_flags; /* block type (hole etc) */
+} __attribute__((packed));
+
/* New-style handshake server reply when using NBD_OPT_EXPORT_NAME.
* Modern clients use NBD_OPT_GO instead of this.
*/
@@ -187,7 +200,7 @@ extern const char *name_of_nbd_reply_type (int);
#define NBD_REPLY_TYPE_NONE 0
#define NBD_REPLY_TYPE_OFFSET_DATA 1
#define NBD_REPLY_TYPE_OFFSET_HOLE 2
-#define NBD_REPLY_TYPE_BLOCK_STATUS 3
+#define NBD_REPLY_TYPE_BLOCK_STATUS 5
#define NBD_REPLY_TYPE_ERROR ((1<<15) + 1)
#define NBD_REPLY_TYPE_ERROR_OFFSET ((1<<15) + 2)
@@ -199,10 +212,12 @@ extern const char *name_of_nbd_cmd (int);
#define NBD_CMD_FLUSH 3
#define NBD_CMD_TRIM 4
#define NBD_CMD_WRITE_ZEROES 6
+#define NBD_CMD_BLOCK_STATUS 7
extern const char *name_of_nbd_cmd_flag (int);
#define NBD_CMD_FLAG_FUA (1<<0)
#define NBD_CMD_FLAG_NO_HOLE (1<<1)
+#define NBD_CMD_FLAG_REQ_ONE (1<<3)
/* Error codes (previously errno).
* See http://git.qemu.org/?p=qemu.git;a=commitdiff;h=ca4414804114fd0095b317785b...
diff --git a/server/protocol-handshake-newstyle.c b/server/protocol-handshake-newstyle.c
index db01f7b..5edeaf3 100644
--- a/server/protocol-handshake-newstyle.c
+++ b/server/protocol-handshake-newstyle.c
@@ -133,6 +133,34 @@ send_newstyle_option_reply_info_export (struct connection *conn,
return 0;
}
+static int
+send_newstyle_option_reply_meta_context (struct connection *conn,
+ uint32_t option, uint32_t reply,
+ uint32_t context_id,
+ const char *name)
+{
+ struct fixed_new_option_reply fixed_new_option_reply;
+ struct fixed_new_option_reply_meta_context context;
+ const size_t namelen = strlen (name);
+
+ fixed_new_option_reply.magic = htobe64 (NBD_REP_MAGIC);
+ fixed_new_option_reply.option = htobe32 (option);
+ fixed_new_option_reply.reply = htobe32 (reply);
+ fixed_new_option_reply.replylen = htobe32 (sizeof context + namelen);
+ context.context_id = htobe32 (context_id);
+
+ if (conn->send (conn,
+ &fixed_new_option_reply,
+ sizeof fixed_new_option_reply) == -1 ||
+ conn->send (conn, &context, sizeof context) == -1 ||
+ conn->send (conn, name, namelen) == -1) {
+ nbdkit_error ("write: %m");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Sub-function during negotiate_handshake_newstyle, to uniformly handle
* a client hanging up on a message boundary.
*/
@@ -452,6 +480,8 @@ negotiate_handshake_newstyle_options (struct connection *conn)
case NBD_OPT_LIST_META_CONTEXT:
case NBD_OPT_SET_META_CONTEXT:
{
+ int r;
+ bool can_extents;
uint32_t opt_index;
uint32_t exportnamelen;
uint32_t nr_queries;
@@ -469,6 +499,16 @@ negotiate_handshake_newstyle_options (struct connection *conn)
continue;
}
+ /* Work out if the server supports base:allocation. */
+ r = backend->can_extents (backend, conn);
+ if (r == -1) {
+ if (send_newstyle_option_reply (conn, option, NBD_REP_ERR_INVALID)
+ == -1)
+ return -1;
+ continue;
+ }
+ can_extents = r;
+
/* Minimum length of the option payload is:
* 32 bit export name length followed by empty export name
* + 32 bit number of queries followed by no queries
@@ -503,7 +543,17 @@ negotiate_handshake_newstyle_options (struct connection *conn)
* for SET: nr_queries == 0 means reset all contexts
*/
if (nr_queries == 0) {
- /* Nothing is supported now. */
+ if (option == NBD_OPT_SET_META_CONTEXT)
+ conn->meta_context_base_allocation = false;
+ else /* LIST */ {
+ if (can_extents) {
+ if (send_newstyle_option_reply_meta_context
+ (conn, option, NBD_REP_META_CONTEXT,
+ base_allocation_id, "base:allocation") == -1)
+ return -1;
+ }
+ }
+
if (send_newstyle_option_reply (conn, option, NBD_REP_ACK) == -1)
return -1;
}
@@ -525,7 +575,32 @@ negotiate_handshake_newstyle_options (struct connection *conn)
option == NBD_OPT_LIST_META_CONTEXT ? "query" :
"set",
(int) querylen, &data[opt_index]);
- /* Ignore query - nothing is supported. */
+ /* For LIST, "base:" returns all supported contexts in the
+ * base namespace. We only support "base:allocation".
+ */
+ if (option == NBD_OPT_LIST_META_CONTEXT &&
+ querylen == 5 &&
+ strncmp (&data[opt_index], "base:", 5) == 0) {
+ if (can_extents) {
+ if (send_newstyle_option_reply_meta_context
+ (conn, option, NBD_REP_META_CONTEXT,
+ base_allocation_id, "base:allocation") == -1)
+ return -1;
+ }
+ }
+ /* "base:allocation" requested by name. */
+ else if (querylen == 15 &&
+ strncmp (&data[opt_index], "base:allocation", 15) == 0) {
+ if (can_extents) {
+ if (send_newstyle_option_reply_meta_context
+ (conn, option, NBD_REP_META_CONTEXT,
+ base_allocation_id, "base:allocation") == -1)
+ return -1;
+ if (option == NBD_OPT_SET_META_CONTEXT)
+ conn->meta_context_base_allocation = true;
+ }
+ }
+ /* Every other query must be ignored. */
opt_index += querylen;
nr_queries--;
diff --git a/server/protocol.c b/server/protocol.c
index f117d42..c713e12 100644
--- a/server/protocol.c
+++ b/server/protocol.c
@@ -36,6 +36,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
+#include <stdbool.h>
#include <inttypes.h>
#include <string.h>
#include <unistd.h>
@@ -78,6 +79,7 @@ validate_request (struct connection *conn,
case NBD_CMD_WRITE:
case NBD_CMD_TRIM:
case NBD_CMD_WRITE_ZEROES:
+ case NBD_CMD_BLOCK_STATUS:
if (!valid_range (conn, offset, count)) {
/* XXX Allow writes to extend the disk? */
nbdkit_error ("invalid request: %s: offset and count are out of range: "
@@ -106,7 +108,8 @@ validate_request (struct connection *conn,
}
/* Validate flags */
- if (flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
+ if (flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE |
+ NBD_CMD_FLAG_REQ_ONE)) {
nbdkit_error ("invalid request: unknown flag (0x%x)", flags);
*error = EINVAL;
return false;
@@ -117,6 +120,12 @@ validate_request (struct connection *conn,
*error = EINVAL;
return false;
}
+ if ((flags & NBD_CMD_FLAG_REQ_ONE) &&
+ cmd != NBD_CMD_BLOCK_STATUS) {
+ nbdkit_error ("invalid request: REQ_ONE flag needs BLOCK_STATUS request");
+ *error = EINVAL;
+ return false;
+ }
if (!conn->can_fua && (flags & NBD_CMD_FLAG_FUA)) {
nbdkit_error ("invalid request: FUA flag not supported");
*error = EINVAL;
@@ -157,14 +166,37 @@ validate_request (struct connection *conn,
return false;
}
+ /* Block status allowed? */
+ if (cmd == NBD_CMD_BLOCK_STATUS) {
+ if (!conn->structured_replies) {
+ nbdkit_error ("invalid request: "
+ "%s: structured replies was not negotiated",
+ name_of_nbd_cmd (cmd));
+ *error = EINVAL;
+ return false;
+ }
+ if (!conn->meta_context_base_allocation) {
+ nbdkit_error ("invalid request: "
+ "%s: base:allocation was not negotiated",
+ name_of_nbd_cmd (cmd));
+ *error = EINVAL;
+ return false;
+ }
+ }
+
return true; /* Command validates. */
}
/* This is called with the request lock held to actually execute the
* request (by calling the plugin). Note that the request fields have
* been validated already in 'validate_request' so we don't have to
- * check them again. 'buf' is either the data to be written or the
- * data to be returned, and points to a buffer of size 'count' bytes.
+ * check them again.
+ *
+ * 'buf' is either the data to be written or the data to be returned,
+ * and points to a buffer of size 'count' bytes.
+ *
+ * 'extents_map' is an empty extents map used for block status
+ * requests only.
*
* In all cases, the return value is the system errno value that will
* later be converted to the nbd error to send back to the client (0
@@ -173,7 +205,7 @@ validate_request (struct connection *conn,
static uint32_t
handle_request (struct connection *conn,
uint16_t cmd, uint16_t flags, uint64_t offset, uint32_t count,
- void *buf)
+ void *buf, struct nbdkit_extents_map *extents_map)
{
uint32_t f = 0;
bool fua = conn->can_fua && (flags & NBD_CMD_FLAG_FUA);
@@ -217,6 +249,14 @@ handle_request (struct connection *conn,
return err;
break;
+ case NBD_CMD_BLOCK_STATUS:
+ if (flags & NBD_CMD_FLAG_REQ_ONE)
+ f |= NBDKIT_FLAG_REQ_ONE;
+ if (backend->extents (backend, conn, count, offset, f,
+ extents_map, &err) == -1)
+ return err;
+ break;
+
default:
abort ();
}
@@ -224,6 +264,97 @@ handle_request (struct connection *conn,
return 0;
}
+static int
+count_extents (uint64_t offset, uint64_t length, uint32_t type,
+ void *rv)
+{
+ size_t *rp = rv;
+
+ (*rp)++;
+ return 0;
+}
+
+struct copy_extents_data {
+ size_t i;
+ struct block_descriptor *blocks;
+ size_t nr_blocks;
+};
+
+static int
+copy_extents (uint64_t offset, uint64_t length, uint32_t type,
+ void *dv)
+{
+ struct copy_extents_data *data = dv;
+ uint32_t type_flags;
+
+ assert (data->i < data->nr_blocks);
+
+ /* Because the original request is limited to a 32 bit count, length
+ * can never be > 32 bits in size.
+ */
+ assert (length <= UINT32_MAX);
+
+ /* Convert NBDKIT_EXTENT_* flags to NBD_STATE_* flags. However
+ * since these are deliberately chosen to be the same binary values
+ * we only have to mask here.
+ */
+ type_flags = type & 3;
+
+ data->blocks[data->i].length = length;
+ data->blocks[data->i].status_flags = type_flags;
+
+ data->i++;
+
+ return 0;
+}
+
+/* See note in protocol_recv_request_send_reply below. This returns 0
+ * on success or a positive errno.
+ */
+static int
+block_status_final_map (uint16_t flags,
+ uint32_t count, uint64_t offset,
+ struct nbdkit_extents_map *extents_map,
+ struct block_descriptor **blocks,
+ size_t *nr_blocks)
+{
+ const bool req_one = flags & NBD_CMD_FLAG_REQ_ONE;
+ uint32_t foreach_flags;
+ struct copy_extents_data data;
+
+ foreach_flags = NBDKIT_EXTENTS_FOREACH_FLAG_RANGE;
+ if (req_one)
+ foreach_flags |= NBDKIT_EXTENTS_FOREACH_FLAG_ONE;
+
+ /* Calculate the number of blocks we will be returning. */
+ *nr_blocks = 0;
+ if (nbdkit_extents_foreach (extents_map,
+ count_extents, nr_blocks,
+ foreach_flags,
+ offset, (uint64_t) count) == -1)
+ return errno;
+ assert (!req_one || *nr_blocks == 1);
+
+ /* Allocate the final array. */
+ *blocks = malloc (sizeof (struct block_descriptor) * *nr_blocks);
+ if (*blocks == NULL) {
+ nbdkit_error ("malloc: %m");
+ return errno;
+ }
+
+ /* Copy the extents into the array. */
+ data.i = 0;
+ data.blocks = *blocks;
+ data.nr_blocks = *nr_blocks;
+ if (nbdkit_extents_foreach (extents_map,
+ copy_extents, &data,
+ foreach_flags,
+ offset, (uint64_t) count) == -1)
+ return errno;
+
+ return 0;
+}
+
static int
skip_over_write_buffer (int sock, size_t count)
{
@@ -359,6 +490,60 @@ send_structured_reply_read (struct connection *conn,
return 1; /* command processed ok */
}
+static int
+send_structured_reply_block_status (struct connection *conn,
+ uint64_t handle,
+ uint16_t cmd, uint16_t flags,
+ uint32_t count, uint64_t offset,
+ const struct block_descriptor *blocks,
+ size_t nr_blocks)
+{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
+ struct structured_reply reply;
+ uint32_t context_id;
+ size_t i;
+ int r;
+
+ assert (cmd == NBD_CMD_BLOCK_STATUS);
+
+ reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC);
+ reply.handle = handle;
+ reply.flags = htobe16 (NBD_REPLY_FLAG_DONE);
+ reply.type = htobe16 (NBD_REPLY_TYPE_BLOCK_STATUS);
+ reply.length = htobe32 (sizeof context_id +
+ nr_blocks * sizeof (struct block_descriptor));
+
+ r = conn->send (conn, &reply, sizeof reply);
+ if (r == -1) {
+ nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
+ return connection_set_status (conn, -1);
+ }
+
+ /* Send the base:allocation context ID. */
+ context_id = htobe32 (base_allocation_id);
+ r = conn->send (conn, &context_id, sizeof context_id);
+ if (r == -1) {
+ nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
+ return connection_set_status (conn, -1);
+ }
+
+ /* Send each block descriptor. */
+ for (i = 0; i < nr_blocks; ++i) {
+ struct block_descriptor bd = blocks[i];
+
+ bd.length = htobe32 (bd.length);
+ bd.status_flags = htobe32 (bd.status_flags);
+
+ r = conn->send (conn, &bd, sizeof bd);
+ if (r == -1) {
+ nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
+ return connection_set_status (conn, -1);
+ }
+ }
+
+ return 1; /* command processed ok */
+}
+
static int
send_structured_reply_error (struct connection *conn,
uint64_t handle, uint16_t cmd, uint32_t error)
@@ -402,6 +587,9 @@ protocol_recv_request_send_reply (struct connection *conn)
uint32_t magic, count, error = 0;
uint64_t offset;
CLEANUP_FREE char *buf = NULL;
+ CLEANUP_EXTENTS_FREE struct nbdkit_extents_map *extents_map = NULL;
+ CLEANUP_FREE struct block_descriptor *blocks = NULL;
+ size_t nr_blocks = 0;
/* Read the request packet. */
{
@@ -449,6 +637,7 @@ protocol_recv_request_send_reply (struct connection *conn)
if (cmd == NBD_CMD_READ || cmd == NBD_CMD_WRITE) {
buf = malloc (count);
if (buf == NULL) {
+ out_of_memory:
perror ("malloc");
error = ENOMEM;
if (cmd == NBD_CMD_WRITE &&
@@ -458,6 +647,13 @@ protocol_recv_request_send_reply (struct connection *conn)
}
}
+ /* Allocate the extents map for block status only. */
+ if (cmd == NBD_CMD_BLOCK_STATUS) {
+ extents_map = nbdkit_extents_new ();
+ if (extents_map == NULL)
+ goto out_of_memory;
+ }
+
/* Receive the write data buffer. */
if (cmd == NBD_CMD_WRITE) {
r = conn->recv (conn, buf, count);
@@ -478,11 +674,29 @@ protocol_recv_request_send_reply (struct connection *conn)
}
else {
lock_request (conn);
- error = handle_request (conn, cmd, flags, offset, count, buf);
+ error = handle_request (conn, cmd, flags, offset, count, buf, extents_map);
assert ((int) error >= 0);
unlock_request (conn);
}
+ /* XXX There are complicated requirements for the block status
+ * reply, such as the offset, length and number of extents returned
+ * in the structured reply. To allow a simple implementation for
+ * plugins we don't make the plugins obey these requirements. This
+ * means at some point we need to filter what the plugin gives us to
+ * obey the protocol requirements. There are several places we
+ * could do that. Currently we do it here. Another possibility is
+ * to do it in server/plugins.c.
+ *
+ * Also note this only deals with base:allocation. If in future we
+ * want to describe other block status metadata then this code will
+ * require an overhaul.
+ */
+ if (error == 0 && cmd == NBD_CMD_BLOCK_STATUS) {
+ error = block_status_final_map (flags, count, offset, extents_map,
+ &blocks, &nr_blocks);
+ }
+
/* Send the reply packet. */
send_reply:
if (connection_get_status (conn) < 0)
@@ -498,15 +712,23 @@ protocol_recv_request_send_reply (struct connection *conn)
}
/* Currently we prefer to send simple replies for everything except
- * where we have to (ie. NBD_CMD_READ when structured_replies have
- * been negotiated). However this prevents us from sending
- * human-readable error messages to the client, so we should
- * reconsider this in future.
+ * where we have to (ie. NBD_CMD_READ and NBD_CMD_BLOCK_STATUS when
+ * structured_replies have been negotiated). However this prevents
+ * us from sending human-readable error messages to the client, so
+ * we should reconsider this in future.
*/
- if (conn->structured_replies && cmd == NBD_CMD_READ) {
- if (!error)
- return send_structured_reply_read (conn, request.handle, cmd,
- buf, count, offset);
+ if (conn->structured_replies &&
+ (cmd == NBD_CMD_READ || cmd == NBD_CMD_BLOCK_STATUS)) {
+ if (!error) {
+ if (cmd == NBD_CMD_READ)
+ return send_structured_reply_read (conn, request.handle, cmd,
+ buf, count, offset);
+ else /* NBD_CMD_BLOCK_STATUS */
+ return send_structured_reply_block_status (conn, request.handle,
+ cmd, flags,
+ count, offset,
+ blocks, nr_blocks);
+ }
else
return send_structured_reply_error (conn, request.handle, cmd, error);
}
--
2.20.1