We disabled Nagle's algorithm to allow less latency in our responses
reaching the client; but as a side effect, it leads to more network
overhead when we send a reply split across more than one write().
Take advantage of various means for grouping related writes (Linux'
MSG_MORE for sockets, gnutls' corking for TLS) to send a larger
packet, and adjust callers to pass in our internal SEND_MORE flag as a
hint when to use the improvements. I tested with appropriate
combinations from:
$ nbdkit {-p 10810,-U -} \
{--tls=require --tls-verify-peer --tls-psk=./keys.psk,} memory size=64m \
--run './aio-parallel-load{-tls,} {$unixsocket,nbd://localhost:10810}'
with the following IOPS measurements averaged over multiple runs:
pre post gain
unix plain: 802627.8 822944.1 2.53%
unix tls: 121871.6 125538.0 3.01%
tcp plain: 656400.1 685795.0 4.48%
tcp tls: 114552.1 120197.3 4.93%
which looks like an overall improvement, even though it is still close
to the margins for being in the noise.
Signed-off-by: Eric Blake <eblake(a)redhat.com>
---
server/internal.h | 4 ++++
server/connections.c | 10 ++++++++--
server/crypto.c | 7 +++++++
server/protocol-handshake-newstyle.c | 10 +++++-----
server/protocol.c | 16 +++++++++-------
5 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/server/internal.h b/server/internal.h
index 50525f3..2281937 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -139,6 +139,10 @@ extern void change_user (void);
/* connections.c */
struct connection;
+enum {
+ SEND_MORE = 1, /* Hint to use MSG_MORE/corking to group send()s */
+};
+
typedef int (*connection_recv_function) (struct connection *,
void *buf, size_t len)
__attribute__((__nonnull__ (1, 2)));
diff --git a/server/connections.c b/server/connections.c
index 1a749b6..f56590c 100644
--- a/server/connections.c
+++ b/server/connections.c
@@ -330,7 +330,8 @@ free_connection (struct connection *conn)
}
/* Write buffer to conn->sockout with send() and either succeed completely
- * (returns 0) or fail (returns -1). flags is ignored for now.
+ * (returns 0) or fail (returns -1). flags may include SEND_MORE as a hint
+ * that this send will be followed by related data.
*/
static int
raw_send_socket (struct connection *conn, const void *vbuf, size_t len,
@@ -339,9 +340,14 @@ raw_send_socket (struct connection *conn, const void *vbuf, size_t
len,
int sock = conn->sockout;
const char *buf = vbuf;
ssize_t r;
+ int f = 0;
+#ifdef MSG_MORE
+ if (flags & SEND_MORE)
+ f |= MSG_MORE;
+#endif
while (len > 0) {
- r = send (sock, buf, len, 0);
+ r = send (sock, buf, len, f);
if (r == -1) {
if (errno == EINTR || errno == EAGAIN)
continue;
diff --git a/server/crypto.c b/server/crypto.c
index 3f87944..3f3d96b 100644
--- a/server/crypto.c
+++ b/server/crypto.c
@@ -357,6 +357,9 @@ crypto_send (struct connection *conn, const void *vbuf, size_t len,
int flags)
assert (session != NULL);
+ if (flags & SEND_MORE)
+ gnutls_record_cork (session);
+
while (len > 0) {
r = gnutls_record_send (session, buf, len);
if (r < 0) {
@@ -368,6 +371,10 @@ crypto_send (struct connection *conn, const void *vbuf, size_t len,
int flags)
len -= r;
}
+ if (!(flags & SEND_MORE) &&
+ gnutls_record_uncork (session, GNUTLS_RECORD_WAIT) < 0)
+ return -1;
+
return 0;
}
diff --git a/server/protocol-handshake-newstyle.c b/server/protocol-handshake-newstyle.c
index 6993c8e..e0136de 100644
--- a/server/protocol-handshake-newstyle.c
+++ b/server/protocol-handshake-newstyle.c
@@ -94,13 +94,13 @@ send_newstyle_option_reply_exportname (struct connection *conn,
if (conn->send (conn,
&fixed_new_option_reply,
- sizeof fixed_new_option_reply, 0) == -1) {
+ sizeof fixed_new_option_reply, SEND_MORE) == -1) {
nbdkit_error ("write: %s: %m", name_of_nbd_opt (option));
return -1;
}
len = htobe32 (name_len);
- if (conn->send (conn, &len, sizeof len, 0) == -1) {
+ if (conn->send (conn, &len, sizeof len, SEND_MORE) == -1) {
nbdkit_error ("write: %s: %s: %m",
name_of_nbd_opt (option), "sending length");
return -1;
@@ -132,7 +132,7 @@ send_newstyle_option_reply_info_export (struct connection *conn,
if (conn->send (conn,
&fixed_new_option_reply,
- sizeof fixed_new_option_reply, 0) == -1 ||
+ sizeof fixed_new_option_reply, SEND_MORE) == -1 ||
conn->send (conn, &export, sizeof export, 0) == -1) {
nbdkit_error ("write: %s: %m", name_of_nbd_opt (option));
return -1;
@@ -161,8 +161,8 @@ send_newstyle_option_reply_meta_context (struct connection *conn,
if (conn->send (conn,
&fixed_new_option_reply,
- sizeof fixed_new_option_reply, 0) == -1 ||
- conn->send (conn, &context, sizeof context, 0) == -1 ||
+ sizeof fixed_new_option_reply, SEND_MORE) == -1 ||
+ conn->send (conn, &context, sizeof context, SEND_MORE) == -1 ||
conn->send (conn, name, namelen, 0) == -1) {
nbdkit_error ("write: %s: %m", name_of_nbd_opt (option));
return -1;
diff --git a/server/protocol.c b/server/protocol.c
index 0e054ee..5967622 100644
--- a/server/protocol.c
+++ b/server/protocol.c
@@ -393,12 +393,13 @@ send_simple_reply (struct connection *conn,
ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
struct simple_reply reply;
int r;
+ int f = (cmd == NBD_CMD_READ && !error) ? SEND_MORE : 0;
reply.magic = htobe32 (NBD_SIMPLE_REPLY_MAGIC);
reply.handle = handle;
reply.error = htobe32 (nbd_errno (error, false));
- r = conn->send (conn, &reply, sizeof reply, 0);
+ r = conn->send (conn, &reply, sizeof reply, f);
if (r == -1) {
nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
return connection_set_status (conn, -1);
@@ -439,7 +440,7 @@ send_structured_reply_read (struct connection *conn,
reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA);
reply.length = htobe32 (count + sizeof offset_data);
- r = conn->send (conn, &reply, sizeof reply, 0);
+ r = conn->send (conn, &reply, sizeof reply, SEND_MORE);
if (r == -1) {
nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
return connection_set_status (conn, -1);
@@ -447,7 +448,7 @@ send_structured_reply_read (struct connection *conn,
/* Send the offset + read data buffer. */
offset_data.offset = htobe64 (offset);
- r = conn->send (conn, &offset_data, sizeof offset_data, 0);
+ r = conn->send (conn, &offset_data, sizeof offset_data, SEND_MORE);
if (r == -1) {
nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd));
return connection_set_status (conn, -1);
@@ -573,7 +574,7 @@ send_structured_reply_block_status (struct connection *conn,
reply.length = htobe32 (sizeof context_id +
nr_blocks * sizeof (struct block_descriptor));
- r = conn->send (conn, &reply, sizeof reply, 0);
+ r = conn->send (conn, &reply, sizeof reply, SEND_MORE);
if (r == -1) {
nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
return connection_set_status (conn, -1);
@@ -581,7 +582,7 @@ send_structured_reply_block_status (struct connection *conn,
/* Send the base:allocation context ID. */
context_id = htobe32 (base_allocation_id);
- r = conn->send (conn, &context_id, sizeof context_id, 0);
+ r = conn->send (conn, &context_id, sizeof context_id, SEND_MORE);
if (r == -1) {
nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
return connection_set_status (conn, -1);
@@ -589,7 +590,8 @@ send_structured_reply_block_status (struct connection *conn,
/* Send each block descriptor. */
for (i = 0; i < nr_blocks; ++i) {
- r = conn->send (conn, &blocks[i], sizeof blocks[i], 0);
+ r = conn->send (conn, &blocks[i], sizeof blocks[i],
+ i == nr_blocks - 1 ? 0 : SEND_MORE);
if (r == -1) {
nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
return connection_set_status (conn, -1);
@@ -615,7 +617,7 @@ send_structured_reply_error (struct connection *conn,
reply.type = htobe16 (NBD_REPLY_TYPE_ERROR);
reply.length = htobe32 (0 /* no human readable error */ + sizeof error_data);
- r = conn->send (conn, &reply, sizeof reply, 0);
+ r = conn->send (conn, &reply, sizeof reply, SEND_MORE);
if (r == -1) {
nbdkit_error ("write error reply: %m");
return connection_set_status (conn, -1);
--
2.20.1