Instead of killing the connection when the server gives us an
unexpected structured or error reply for a cookie we don't recognize,
we can log the issue and otherwise ignore that packet (an unexpected
successful simple reply is harder if we did not negotiate structured
replies; see the lengthy comment in states-reply-simple.c). A
compliant server will never do this to us, but a transmission error
might. Here's a quick patch to nbdkit to demonstrate such a scenario:
| diff --git i/server/protocol.c w/server/protocol.c
| index 2ac77055..55b27e2e 100644
| --- i/server/protocol.c
| +++ w/server/protocol.c
| @@ -742,7 +742,7 @@ protocol_recv_request_send_reply (void)
| (cmd == NBD_CMD_READ || cmd == NBD_CMD_BLOCK_STATUS)) {
| if (!error) {
| if (cmd == NBD_CMD_READ)
| - return send_structured_reply_read (request.handle, cmd,
| + return send_structured_reply_read (0x100^request.handle, cmd,
| buf, count, offset);
| else /* NBD_CMD_BLOCK_STATUS */
| return send_structured_reply_block_status (request.handle,
Note that if a server bit-flips a cookie, our original command waiting
for the right cookie may never get a completion packet from the
server; thus, testing this requires aio commands (unless we someday
add APIs to make it easier for blocking commands that give up when
they hit a timeout). With the following command line and the hack to
nbdkit above:
$ ./run /path/to/nbdkit -U - memory 1M --run 'nbdsh -u "$uri" -c "\
buf=nbd.Buffer(1)
cookie=h.aio_pread(buf, 0)
try:
print(h.poll(1))
except nbd.Error as ex:
print(ex.string)
h.flush()
print(h.get_size())
"'
pre-patch we see that the connection is killed when h.poll()
encounters the garbage reply, before we get to the second command:
nbd_poll: no matching cookie found for server reply, this is probably a bug in the server
nbdkit: memory.1: error: read request: Connection reset by peer
nbdsh: command line script failed: nbd_flush: invalid state: DEAD: the handle must be
connected with the server: Invalid argument
while post-patch the read never retires, but the unexpected cookie is
skipped and we get to the flush:
1
1048576
Watching the LIBNBD_DEBUG=1 stream, we also see:
libnbd: debug: nbdsh: nbd_poll: skipped reply for unexpected cookie 281474976710657, this
is probably a bug in the server
---
generator/states-reply-simple.c | 25 ++++++++++++++++++++++---
generator/states-reply-structured.c | 24 +++++++++++++-----------
generator/states-reply.c | 27 ++++++++++++---------------
3 files changed, 47 insertions(+), 29 deletions(-)
diff --git a/generator/states-reply-simple.c b/generator/states-reply-simple.c
index 2a7b9a9..24f7efa 100644
--- a/generator/states-reply-simple.c
+++ b/generator/states-reply-simple.c
@@ -25,10 +25,29 @@ STATE_MACHINE {
uint64_t cookie;
error = be32toh (h->sbuf.simple_reply.error);
- cookie = be64toh (h->sbuf.simple_reply.handle);
- assert (cmd);
- assert (cmd->cookie == cookie);
+ if (cmd == NULL) {
+ /* Unexpected reply. If error was set or we have structured
+ * replies, we know there should be no payload, so the next byte
+ * on the wire (if any) will be another reply, and we can let
+ * FINISH_COMMAND diagnose/ignore the server bug. If not, we lack
+ * context to know whether the server thinks it was responding to
+ * NBD_CMD_READ, so it is safer to move to DEAD now than to risk
+ * consuming a server's potential data payload as a reply stream
+ * (even though we would be likely to produce a magic number
+ * mismatch on the next pass that would also move us to DEAD).
+ */
+ if (error || h->structured_replies)
+ SET_NEXT_STATE (%^FINISH_COMMAND);
+ else {
+ cookie = be64toh (h->sbuf.simple_reply.handle);
+ SET_NEXT_STATE (%.DEAD);
+ set_error (EPROTO,
+ "no matching cookie %" PRIu64 " found for server reply,
"
+ "this is probably a server bug", cookie);
+ }
+ return 0;
+ }
if (cmd->type == NBD_CMD_READ && h->structured_replies) {
set_error (0, "server sent unexpected simple reply for read");
diff --git a/generator/states-reply-structured.c b/generator/states-reply-structured.c
index 2db2cf6..4fbd62f 100644
--- a/generator/states-reply-structured.c
+++ b/generator/states-reply-structured.c
@@ -69,17 +69,12 @@ STATE_MACHINE {
REPLY.STRUCTURED_REPLY.CHECK:
struct command *cmd = h->reply_cmd;
uint16_t flags, type;
- uint64_t cookie;
uint32_t length;
flags = be16toh (h->sbuf.sr.structured_reply.flags);
type = be16toh (h->sbuf.sr.structured_reply.type);
- cookie = be64toh (h->sbuf.sr.structured_reply.handle);
length = be32toh (h->sbuf.sr.structured_reply.length);
- assert (cmd);
- assert (cmd->cookie == cookie);
-
/* Reject a server that replies with too much information, but don't
* reject a single structured reply to NBD_CMD_READ on the largest
* size we were willing to send. The most likely culprit is a server
@@ -88,12 +83,13 @@ STATE_MACHINE {
* not worth keeping the connection alive.
*/
if (length > MAX_REQUEST_SIZE + sizeof h->sbuf.sr.payload.offset_data) {
- set_error (0, "invalid server reply length");
+ set_error (0, "invalid server reply length %" PRIu32, length);
SET_NEXT_STATE (%.DEAD);
return 0;
}
- if (!h->structured_replies) {
+ /* Skip an unexpected structured reply, including to an unknown cookie. */
+ if (cmd == NULL || !h->structured_replies) {
resync:
h->rbuf = NULL;
h->rlen = length;
@@ -487,7 +483,6 @@ STATE_MACHINE {
uint16_t type;
uint32_t length;
- assert (cmd);
assert (h->rbuf == NULL);
switch (recv_into_rbuf (h)) {
case -1: SET_NEXT_STATE (%.DEAD); return 0;
@@ -496,6 +491,15 @@ STATE_MACHINE {
SET_NEXT_STATE (%.READY);
return 0;
case 0:
+ /* If this reply is to an unknown command, FINISH_COMMAND will
+ * diagnose and ignore the server bug. Otherwise, ensure the
+ * pending command sees a failure of EPROTO if it does not already
+ * have an error.
+ */
+ if (cmd == NULL) {
+ SET_NEXT_STATE (%^FINISH_COMMAND);
+ return 0;
+ }
type = be16toh (h->sbuf.sr.structured_reply.type);
length = be32toh (h->sbuf.sr.structured_reply.length);
debug (h, "unexpected reply type %u or payload length %" PRIu32
@@ -505,10 +509,8 @@ STATE_MACHINE {
if (cmd->error == 0)
cmd->error = EPROTO;
SET_NEXT_STATE (%FINISH);
- return 0;
- default:
- abort ();
}
+ return 0;
REPLY.STRUCTURED_REPLY.FINISH:
uint16_t flags;
diff --git a/generator/states-reply.c b/generator/states-reply.c
index 1a3a20a..0736342 100644
--- a/generator/states-reply.c
+++ b/generator/states-reply.c
@@ -124,7 +124,7 @@ STATE_MACHINE {
}
else {
SET_NEXT_STATE (%.DEAD); /* We've probably lost synchronization. */
- set_error (0, "invalid reply magic");
+ set_error (0, "invalid reply magic 0x%" PRIx32, magic);
return 0;
}
@@ -132,22 +132,13 @@ STATE_MACHINE {
* handle (our cookie) is stored at the same offset.
*/
cookie = be64toh (h->sbuf.simple_reply.handle);
- /* Find the command amongst the commands in flight. */
+ /* Find the command amongst the commands in flight. If the server sends
+ * a reply for an unknown cookie, FINISH will diagnose that later.
+ */
for (cmd = h->cmds_in_flight; cmd != NULL; cmd = cmd->next) {
if (cmd->cookie == cookie)
break;
}
- if (cmd == NULL) {
- /* An unexpected structured reply could be skipped, since it
- * includes a length; similarly an unexpected simple reply can be
- * skipped if we assume it was not a read. However, it's more
- * likely we've lost synchronization with the server.
- */
- SET_NEXT_STATE (%.DEAD);
- set_error (0, "no matching cookie found for server reply, "
- "this is probably a bug in the server");
- return 0;
- }
h->reply_cmd = cmd;
return 0;
@@ -167,10 +158,16 @@ STATE_MACHINE {
if (cmd->cookie == cookie)
break;
}
- assert (cmd != NULL);
assert (h->reply_cmd == cmd);
- h->reply_cmd = NULL;
+ if (cmd == NULL) {
+ debug (h, "skipped reply for unexpected cookie %" PRIu64
+ ", this is probably a bug in the server", cookie);
+ SET_NEXT_STATE (%.READY);
+ return 0;
+ }
+
retire = cmd->type == NBD_CMD_DISC;
+ h->reply_cmd = NULL;
/* Notify the user */
if (CALLBACK_IS_NOT_NULL (cmd->cb.completion)) {
--
2.37.1