On 9/26/19 11:40 AM, Richard W.M. Jones wrote:
This adds new APIs for running a local NBD server and connecting to
it
using systemd socket activation (instead of stdin/stdout).
This includes interop tests against nbdkit and qemu-nbd which I
believe are the only NBD servers supporting socket activation. (If we
find others then we can add more interop tests in future.)
The upstream spec for systemd socket activation is here:
http://0pointer.de/blog/projects/socket-activation.html
---
+++ b/generator/states-connect.c
@@ -37,6 +37,9 @@
#include <sys/socket.h>
#include <sys/un.h>
+/* This is baked into the systemd socket activation API. */
+#define FIRST_SOCKET_ACTIVATION_FD 3
+
/* Disable Nagle's algorithm on the socket, but don't fail. */
static void
disable_nagle (int sock)
@@ -292,4 +295,110 @@ disable_nagle (int sock)
SET_NEXT_STATE (%^MAGIC.START);
return 0;
+ CONNECT_SA.START:
+ /* This is in some ways similar to nbd_aio_connect_command above,
+ * but we must pass a listening socket to the child process and
+ * we must set LISTEN_PID and LISTEN_FDS environment variables.
+ */
+ size_t len;
+ int s;
+ struct sockaddr_un addr;
+ pid_t pid;
+ int flags;
+ char listen_pid[16];
+
+ assert (!h->sock);
+ assert (h->argv);
+ assert (h->argv[0]);
+
+ /* Use /tmp instead of TMPDIR because we must ensure the path is
+ * short enough to store in the sockaddr_un. On some platforms this
+ * may cause problems so we may need to revisit it. XXX
+ */
Is the use of socketpair() any better than creating a socket under /tmp?
+ h->sa_tmpdir = strdup ("/tmp/libnbdXXXXXX");
+ h->sa_sockpath = strdup ("/tmp/libnbdXXXXXX/sock");
+ if (h->sa_tmpdir == NULL || h->sa_sockpath == NULL) {
+ SET_NEXT_STATE (%.DEAD);
+ set_error (errno, "strdup");
+ return 0;
+ }
+
+ if (mkdtemp (h->sa_tmpdir) == NULL) {
+ SET_NEXT_STATE (%.DEAD);
+ set_error (errno, "mkdtemp");
+ return 0;
+ }
+ len = strlen (h->sa_tmpdir);
+ memcpy (h->sa_sockpath, h->sa_tmpdir, len);
+
+ s = socket (AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
+ if (s == -1) {
+ SET_NEXT_STATE (%.DEAD);
+ set_error (errno, "socket");
+ return 0;
+ }
If we fail here or later, do/should we try to clean up the
/tmp/libnbdXXX directory created earlier?
/me reads ahead - nbd_close tries to address it
Still, if we fail at this point, h->sa_sockpath is set but not yet
created [1]
+
+ addr.sun_family = AF_UNIX;
+ memcpy (addr.sun_path, h->sa_sockpath, strlen (h->sa_sockpath) + 1);
+ if (bind (s, (struct sockaddr *) &addr, sizeof addr) == -1) {
+ SET_NEXT_STATE (%.DEAD);
+ set_error (errno, "bind: %s", h->sa_sockpath);
+ return 0;
+ }
+
+ if (listen (s, 1) == -1) {
+ SET_NEXT_STATE (%.DEAD);
+ set_error (errno, "listen");
+ return 0;
+ }
+
+ pid = fork ();
+ if (pid == -1) {
+ SET_NEXT_STATE (%.DEAD);
+ set_error (errno, "fork");
+ close (s);
+ return 0;
+ }
+ if (pid == 0) { /* child - run command */
+ if (s != FIRST_SOCKET_ACTIVATION_FD) {
+ dup2 (s, FIRST_SOCKET_ACTIVATION_FD);
No check for errors? But that's probably okay in this instance (we know
that we won't fail with EBADF, for example).
+ close (s);
+ }
+ else {
+ /* We must unset CLOEXEC on the fd. (dup2 above does this
+ * implicitly because CLOEXEC is set on the fd, not on the
+ * socket).
+ */
+ flags = fcntl (s, F_GETFD, 0);
+ if (flags == -1) {
+ perror ("fcntl: F_GETFD");
perror is not async-signal-safe. Calling it in a child of a
potentially-multi-threaded parent is therefore prone to deadlock, if
perror() triggers a request to grab any resource that was left locked by
a different thread holding the lock but stranded by the fork.
+ _exit (EXIT_FAILURE);
+ }
+ if (fcntl (s, F_SETFD, flags & ~FD_CLOEXEC) == -1) {
+ perror ("fcntl: F_SETFD");
+ _exit (EXIT_FAILURE);
+ }
About all we can _safely_ do is let our _exit() status inform the parent
process that something failed, and let the parent process print the
error message. But even passing errno as the _exit() value is not
portable (GNU Hurd errno values are intentionally larger than what fit
in 8-bit returns that the parent would see). It's also possible to set
up a cloexec pipe (in addition to everything else) so that the child can
write() error details into the pipe, but that's a lot of effort compared
to just blindly stating that the child failed but declaring full details
of why as lost.
+ }
+
+ snprintf (listen_pid, sizeof listen_pid, "%ld", (long) getpid ());
+ setenv ("LISTEN_PID", listen_pid, 1);
+ setenv ("LISTEN_FDS", "1", 1);
snprintf() and setenv() are also not async-signal-safe. Which is a
bummer, since you really don't know the child pid until in the child.
You could open-code the translation of a decimal number into a buffer
without snprintf, but you also have to figure out how to safely put it
into the environ seen by the child. How does systemd do it?
We _could_ create a helper executable, where we fork() into the child
process, exec() the helper to overwrite the child pid with something
that is no longer constrained by the limits of async-safety, so that the
helper app can then use snprintf/setenv at will before re-execing again
with the real target executable. The next question is how worried are
we about actually being hit by an non-async-signal-safe hang. What you
have works 99.9% of the time, but if we are trying to be a library that
is usable by any application, we have to decide if that is good enough.
+
+ /* Restore SIGPIPE back to SIG_DFL. */
+ signal (SIGPIPE, SIG_DFL);
+
+ execvp (h->argv[0], h->argv);
+ perror (h->argv[0]);
and another unsafe perror.
+ _exit (EXIT_FAILURE);
Is EXIT_FAILURE the best here, or should we consider exiting with status
126/127 to match what other programs (sh, env, nice, nohup, ...) do when
execvp() fails?
+ }
+
+ /* Parent. */
+ close (s);
+ h->pid = pid;
+
+ h->connaddrlen = sizeof addr;
+ memcpy (&h->connaddr, &addr, h->connaddrlen);
+ SET_NEXT_STATE (%^CONNECT.START);
+ return 0;
+
} /* END STATE MACHINE */
+++ b/interop/socket-activation.c
+#define SIZE 1048576
+
+int
+main (int argc, char *argv[])
+{
+ struct nbd_handle *nbd;
+ char tmpfile[] = "/tmp/nbdXXXXXX";
+ int fd;
+ int64_t actual_size;
+ char buf[512];
+ int r = -1;
+
+ /* Create a large sparse temporary file. */
+ fd = mkstemp (tmpfile);
+ if (fd == -1 ||
+ ftruncate (fd, SIZE) == -1 ||
+ close (fd) == -1) {
+ perror (tmpfile);
+ goto out;
+ }
Is it worth populating anything in the temp file...
+
+ nbd = nbd_create ();
+ if (nbd == NULL) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto out;
+ }
+
+ char *args[] = { SERVER, SERVER_PARAMS, NULL };
+ if (nbd_connect_socket_activation (nbd, args) == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto out;
+ }
+
+ actual_size = nbd_get_size (nbd);
+ if (actual_size == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto out;
+ }
+ if (actual_size != SIZE) {
+ fprintf (stderr, "%s: actual size %" PRIi64 " <> expected size
%d",
+ argv[0], actual_size, SIZE);
+ goto out;
+ }
+
+ if (nbd_pread (nbd, buf, sizeof buf, 0, 0) == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto out;
+ }
...and checking that we read it back out? But even a successful read
without checking contents is pretty good evidence that the socket
activation worked.
+
+ if (nbd_shutdown (nbd, 0) == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto out;
+ }
+
+ nbd_close (nbd);
+
+ r = 0;
+ out:
+ unlink (tmpfile);
+
+ exit (r == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
diff --git a/lib/connect.c b/lib/connect.c
index f98bcdb..c1cbef7 100644
--- a/lib/connect.c
+++ b/lib/handle.c
@@ -129,6 +129,16 @@ nbd_close (struct nbd_handle *h)
free_cmd_list (h->cmds_in_flight);
free_cmd_list (h->cmds_done);
nbd_internal_free_string_list (h->argv);
+ if (h->sa_sockpath) {
+ if (h->pid > 0)
+ kill (h->pid, SIGTERM);
Are we sure that SIGTERM Is always going to be sufficient? Or do we
need a tiered approach where we try SIGTERM, but followup with SIGKILL
if too much time elapses? I guess it is the same situation for
nbd_connect_command, does that mean we should have a common helper
function for doing appropriate cleanup?
Do we document that nbd_close() may block?
+ unlink (h->sa_sockpath);
[1] we can end up calling unlink() on a path that does not exist. But
since we aren't checking error status here, we are effectively ignoring
the ENOENT in that case.
+ free (h->sa_sockpath);
+ }
+ if (h->sa_tmpdir) {
+ rmdir (h->sa_tmpdir);
+ free (h->sa_tmpdir);
+ }
free (h->unixsocket);
free (h->hostname);
free (h->port);
diff --git a/lib/internal.h b/lib/internal.h
index bdb0e83..bf8b9aa 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -188,6 +188,12 @@ struct nbd_handle {
char **argv;
pid_t pid;
+ /* When using systemd socket activation, this directory and socket
+ * must be deleted, and the pid above must be killed.
+ */
+ char *sa_tmpdir;
+ char *sa_sockpath;
+
/* When connecting to Unix domain socket. */
char *unixsocket;
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3226
Virtualization:
qemu.org |
libvirt.org