Time to enhance the nbd plugin to request structured replies from the
server. For now, deal only with structured reads. The server can now
return sparse reads, even though we need nbdkit version 3 before we
can in turn return sparse reads back to the client.
In general, we have to assume the server is malicious, and so we must
sanity check that it sends replies we expect. Thus, we have a choice
between either implementing bookkeeping to ensure that the server
sends exactly as many non-overlapping chunks as needed to cover the
entire read request, or else ensuring that even when the server
cheats, we do not leak uninitialized memory back to our client. I
chose for simplicity, with the result that I end up calling memset()
more frequently than necessary for a compliant server, rather than
worrying about bookkeeping to detect a non-compliant server that is
attempting to cause an information leak.
Signed-off-by: Eric Blake <eblake(a)redhat.com>
---
plugins/nbd/nbd.c | 232 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 212 insertions(+), 20 deletions(-)
diff --git a/plugins/nbd/nbd.c b/plugins/nbd/nbd.c
index 313bf59..f729051 100644
--- a/plugins/nbd/nbd.c
+++ b/plugins/nbd/nbd.c
@@ -127,7 +127,9 @@ struct transaction {
int fds[2];
} u;
void *buf;
+ uint64_t offset;
uint32_t count;
+ uint32_t err;
struct transaction *next;
};
@@ -137,6 +139,7 @@ struct handle {
int fd;
int flags;
int64_t size;
+ bool structured;
pthread_t reader;
/* Prevents concurrent threads from interleaving writes to server */
@@ -230,9 +233,10 @@ nbd_mark_dead (struct handle *h)
return -1;
}
-/* Find and remove the transaction corresponding to cookie from the list. */
+/* Find and possibly remove the transaction corresponding to cookie
+ from the list. */
static struct transaction *
-find_trans_by_cookie (struct handle *h, uint64_t cookie)
+find_trans_by_cookie (struct handle *h, uint64_t cookie, bool remove)
{
struct transaction **ptr;
struct transaction *trans;
@@ -244,7 +248,7 @@ find_trans_by_cookie (struct handle *h, uint64_t cookie)
break;
ptr = &trans->next;
}
- if (trans)
+ if (trans && remove)
*ptr = trans->next;
nbd_unlock (h);
return trans;
@@ -303,6 +307,7 @@ nbd_request_full (struct handle *h, uint16_t flags, uint16_t type,
}
trans->buf = rep_buf;
trans->count = rep_buf ? count : 0;
+ trans->offset = offset;
nbd_lock (h);
if (h->dead) {
nbd_unlock (h);
@@ -315,7 +320,7 @@ nbd_request_full (struct handle *h, uint16_t flags, uint16_t type,
nbd_unlock (h);
if (nbd_request_raw (h, flags, type, offset, count, cookie, req_buf) == 0)
return fd;
- trans = find_trans_by_cookie (h, cookie);
+ trans = find_trans_by_cookie (h, cookie, true);
err:
err = errno;
@@ -340,35 +345,196 @@ nbd_request (struct handle *h, uint16_t flags, uint16_t type,
uint64_t offset,
/* Read a reply, and look up the fd corresponding to the transaction.
Return the server's non-negative answer (converted to local errno
- value) on success, or -1 on read failure. */
+ value) on success, or -1 on read failure. If structured replies
+ were negotiated, fd is set to -1 if there are still more replies
+ expected. */
static int
nbd_reply_raw (struct handle *h, int *fd)
{
- struct simple_reply rep;
+ union {
+ struct simple_reply simple;
+ struct structured_reply structured;
+ } rep;
struct transaction *trans;
- void *buf;
+ void *buf = NULL;
uint32_t count;
+ int error = NBD_SUCCESS;
+ bool more = false;
+ uint64_t offset = 0; /* absolute offset of structured read chunk from buf */
+ uint32_t len = 0; /* 0 except for structured reads */
+ bool zero = false; /* Whether to read or memset on structured read */
*fd = -1;
- if (read_full (h->fd, &rep, sizeof rep) < 0)
+ /* magic and handle overlap between simple and structured replies */
+ if (read_full (h->fd, &rep, sizeof rep.simple))
return nbd_mark_dead (h);
- if (be32toh (rep.magic) != NBD_SIMPLE_REPLY_MAGIC)
+ switch (be32toh (rep.simple.magic)) {
+ case NBD_SIMPLE_REPLY_MAGIC:
+ nbdkit_debug ("received simple reply for cookie %#" PRIx64 ", status
%s",
+ rep.simple.handle,
+ name_of_nbd_error(be32toh (rep.simple.error)));
+ error = be32toh (rep.simple.error);
+ break;
+ case NBD_STRUCTURED_REPLY_MAGIC:
+ if (!h->structured) {
+ nbdkit_error ("structured response without negotiation");
+ return nbd_mark_dead (h);
+ }
+ if (read_full (h->fd, sizeof rep.simple + (char *) &rep,
+ sizeof rep - sizeof rep.simple))
+ return nbd_mark_dead (h);
+ rep.structured.flags = be16toh (rep.structured.flags);
+ rep.structured.type = be16toh (rep.structured.type);
+ rep.structured.length = be32toh (rep.structured.length);
+ nbdkit_debug ("received structured reply %s for cookie %#" PRIx64
+ ", payload length %" PRId32,
+ name_of_nbd_reply_type(rep.structured.type),
+ rep.structured.handle, rep.structured.length);
+ if (rep.structured.length > 64 * 1024 * 1024) {
+ nbdkit_error ("structured reply length is suspiciously large: %" PRId32,
+ rep.structured.length);
+ return nbd_mark_dead (h);
+ }
+ if (rep.structured.length) {
+ /* Special case for OFFSET_DATA in order to read tail of chunk
+ directly into final buffer later on */
+ len = (rep.structured.type == NBD_REPLY_TYPE_OFFSET_DATA &&
+ rep.structured.length > sizeof offset) ? sizeof offset :
+ rep.structured.length;
+ buf = malloc (len);
+ if (!buf) {
+ nbdkit_error ("reading structured reply payload: %m");
+ return nbd_mark_dead (h);
+ }
+ if (read_full (h->fd, buf, len)) {
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ len = 0;
+ }
+ more = !(rep.structured.flags & NBD_REPLY_FLAG_DONE);
+ switch (rep.structured.type) {
+ case NBD_REPLY_TYPE_NONE:
+ if (rep.structured.length) {
+ nbdkit_error ("NBD_REPLY_TYPE_NONE with invalid payload");
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ if (more) {
+ nbdkit_error ("NBD_REPLY_TYPE_NONE without done flag");
+ return nbd_mark_dead (h);
+ }
+ break;
+ case NBD_REPLY_TYPE_OFFSET_DATA:
+ if (rep.structured.length <= sizeof offset) {
+ nbdkit_error ("structured reply OFFSET_DATA too small");
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ memcpy (&offset, buf, sizeof offset);
+ offset = be64toh (offset);
+ len = rep.structured.length - sizeof offset;
+ break;
+ case NBD_REPLY_TYPE_OFFSET_HOLE:
+ if (rep.structured.length != sizeof offset + sizeof len) {
+ nbdkit_error ("structured reply OFFSET_HOLE size incorrect");
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ memcpy (&offset, buf, sizeof offset);
+ offset = be64toh (offset);
+ memcpy (&len, buf, sizeof len);
+ len = be32toh (len);
+ if (!len) {
+ nbdkit_error ("structured reply OFFSET_HOLE length incorrect");
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ zero = true;
+ break;
+ default:
+ if (NBD_REPLY_TYPE_IS_ERR (rep.structured.type)) {
+ uint16_t errlen;
+
+ if (rep.structured.length < sizeof error + sizeof errlen) {
+ nbdkit_error ("structured reply error size incorrect");
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ memcpy (&errlen, (char *) buf + sizeof error, sizeof errlen);
+ errlen = be16toh (errlen);
+ if (errlen > rep.structured.length - sizeof error - sizeof errlen) {
+ nbdkit_error ("structured reply error message size incorrect");
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ memcpy (&error, buf, sizeof error);
+ error = be32toh (error);
+ if (errlen)
+ nbdkit_debug ("received structured error %s with message: %.*s",
+ name_of_nbd_error (error), errlen,
+ (char *) buf + sizeof error + sizeof errlen);
+ else
+ nbdkit_debug ("received structured error %s without message",
+ name_of_nbd_error (error));
+ free (buf);
+ }
+ else {
+ nbdkit_error ("received unexpected structured reply %s",
+ name_of_nbd_reply_type(rep.structured.type));
+ free (buf);
+ return nbd_mark_dead (h);
+ }
+ }
+ break;
+ default:
+ nbdkit_error ("received unexpected magic in reply: %#" PRIx32,
+ rep.simple.magic);
return nbd_mark_dead (h);
- nbdkit_debug ("received reply for cookie %#" PRIx64 ", status %s",
- rep.handle, name_of_nbd_error(be32toh (rep.error)));
- trans = find_trans_by_cookie (h, rep.handle);
+ }
+
+ trans = find_trans_by_cookie (h, rep.simple.handle, !more);
if (!trans) {
- nbdkit_error ("reply with unexpected cookie %#" PRIx64, rep.handle);
+ nbdkit_error ("reply with unexpected cookie %#" PRIx64,
rep.simple.handle);
return nbd_mark_dead (h);
}
- *fd = trans->u.fds[1];
+ if (!more)
+ *fd = trans->u.fds[1];
+ else if (error && !trans->err)
+ trans->err = error;
buf = trans->buf;
count = trans->count;
+ if (buf && h->structured &&
+ be32toh (rep.simple.magic) == NBD_SIMPLE_REPLY_MAGIC) {
+ nbdkit_error ("simple read reply when structured was expected");
+ return nbd_mark_dead (h);
+ }
+ if (len) {
+ if (!buf) {
+ nbdkit_error ("structured read response to a non-read command");
+ return nbd_mark_dead (h);
+ }
+ if (offset < trans->offset || offset > INT64_MAX ||
+ offset + len > trans->offset + count) {
+ nbdkit_error ("structured read reply with unexpected offset/length");
+ return nbd_mark_dead (h);
+ }
+ buf = (char *) buf + offset - trans->offset;
+ if (zero) {
+ /* We do not check whether the server mistakenly sends
+ non-compliant overlapping chunks, thus we must re-zero holes
+ rather than relying on our pre-initialization to zero */
+ memset (buf, 0, len);
+ buf = NULL;
+ }
+ else
+ count = len;
+ }
free (trans);
- switch (be32toh (rep.error)) {
+ switch (error) {
case NBD_SUCCESS:
- if (buf && read_full (h->fd, buf, count) < 0)
+ if (buf && read_full (h->fd, buf, count))
return nbd_mark_dead (h);
return 0;
case NBD_EPERM:
@@ -378,8 +544,7 @@ nbd_reply_raw (struct handle *h, int *fd)
case NBD_ENOMEM:
return ENOMEM;
default:
- nbdkit_debug ("unexpected error %d, squashing to EINVAL",
- be32toh (rep.error));
+ nbdkit_debug ("unexpected error %d, squashing to EINVAL", error);
/* fallthrough */
case NBD_EINVAL:
return EINVAL;
@@ -405,7 +570,9 @@ nbd_reader (void *handle)
r = nbd_reply_raw (h, &fd);
if (r >= 0) {
- if (write (fd, &r, sizeof r) != sizeof r) {
+ if (fd < 0)
+ nbdkit_debug ("partial reply handled, waiting for final reply");
+ else if (write (fd, &r, sizeof r) != sizeof r) {
nbdkit_error ("failed to write pipe: %m");
abort ();
}
@@ -527,7 +694,25 @@ nbd_newstyle_haggle (struct handle *h)
char *buffer;
uint16_t info;
- /* TODO: structured reads, block status */
+ nbdkit_debug ("trying NBD_OPT_STRUCTURED_REPLY");
+ opt.version = htobe64 (NEW_VERSION);
+ opt.option = htobe32 (NBD_OPT_STRUCTURED_REPLY);
+ opt.optlen = htobe32 (0);
+ if (write_full (h->fd, &opt, sizeof opt)) {
+ nbdkit_error ("unable to request NBD_OPT_STRUCTURED_REPLY: %m");
+ return -1;
+ }
+ if (nbd_newstyle_recv_option_reply (h, NBD_OPT_STRUCTURED_REPLY, &reply,
+ NULL) < 0)
+ return -1;
+ if (reply.reply == NBD_REP_ACK) {
+ nbdkit_debug ("structured replies enabled");
+ h->structured = true;
+ /* TODO: block status */
+ }
+ else {
+ nbdkit_debug ("structured replies disabled");
+ }
/* Try NBD_OPT_GO */
nbdkit_debug ("trying NBD_OPT_GO");
@@ -828,6 +1013,13 @@ nbd_pread (void *handle, void *buf, uint32_t count, uint64_t
offset,
int c;
assert (!flags);
+ /* We do not check whether structured read chunks cover the entire
+ buffer; as such, to avoid leaking uninit memory back to the
+ client, we must pre-zero the buffer. TODO: should nbdkit
+ guarantee a clean buffer to .pread, perhaps based on a flag set
+ when we register our plugin? */
+ if (h->structured)
+ memset (buf, 0, count);
c = nbd_request_full (h, 0, NBD_CMD_READ, offset, count, NULL, buf);
return c < 0 ? c : nbd_reply (h, c);
}
--
2.20.1