This fixes the long-standing crash on close when nbdkit exits.
I first tried to convert the threads to use a proper thread pool,
but that is difficult to implement. So instead this makes the minimal
change needed to fix the crash.
There are still two segfaults that happen while running the test
suite. One is deliberately caused (tests/test-captive.sh). The other
appears to be an assertion failure bug in the new finalize code:
Thread 2 (Thread 0x7f12b820fa40 (LWP 231456)):
#0 0x00007f12b8783a1f in __GI___poll (fds=0x55771f18c550, nfds=nfds@entry=2,
timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/poll.c:29
#1 0x00005577156b9646 in poll (__timeout=-1, __nfds=2, __fds=<optimized out>) at
/usr/include/bits/poll2.h:46
#2 check_sockets_and_quit_fd (nr_socks=1, socks=0x55771f18c2e0) at sockets.c:466
#3 accept_incoming_connections (socks=0x55771f18c2e0, nr_socks=1) at sockets.c:494
#4 0x00005577156ac6ac in start_serving () at main.c:914
#5 main (argc=<optimized out>, argv=0x7ffd0bcb15d8) at main.c:685
Thread 1 (Thread 0x7f12b820e700 (LWP 231469)):
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007f12b86b28d9 in __GI_abort () at abort.c:79
#2 0x00007f12b86b27a9 in __assert_fail_base (fmt=0x7f12b881db18 "%s%s%s:%u:
%s%sAssertion `%s' failed.\n%n", assertion=0x5577156bb09d "h->state &
HANDLE_CONNECTED", file=0x5577156bb064 "backend.c", line=240,
function=<optimized out>) at assert.c:92
#3 0x00007f12b86c1a66 in __GI___assert_fail (assertion=assertion@entry=0x5577156bb09d
"h->state & HANDLE_CONNECTED", file=file@entry=0x5577156bb064
"backend.c", line=line@entry=240, function=function@entry=0x5577156bb880
<__PRETTY_FUNCTION__.6528> "backend_finalize") at assert.c:101
#4 0x00005577156ad04e in backend_finalize (b=<optimized out>,
conn=conn@entry=0x55771f1ac710) at backend.c:240
#5 0x00005577156b6ea8 in negotiate_handshake_newstyle_options (conn=<optimized
out>) at protocol-handshake-newstyle.c:484
#6 protocol_handshake_newstyle (conn=0x55771f1ac710) at
protocol-handshake-newstyle.c:762
#7 0x00005577156b5705 in protocol_handshake (conn=conn@entry=0x55771f1ac710) at
protocol-handshake.c:55
#8 0x00005577156af73a in handle_single_connection (sockin=<optimized out>,
sockout=15) at connections.c:167
#9 0x00005577156b8a48 in start_thread (datav=0x55771f18c620) at sockets.c:356
#10 0x00007f12b885f4e2 in start_thread (arg=<optimized out>) at
pthread_create.c:479
#11 0x00007f12b878e643 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Rich.
Show replies by date
This was only used in one place: when using the -s option on the
command line. In this case nbdkit -s would exit with EXIT_FAILURE if
there was some kind of I/O error or early client close. This is not
particularly useful because: (1) It prevents the plugin being unloaded
properly. (2) Client close is a normal shutdown strategy for some
clients, so not really an error. (3) It's undocumented and
inconsistent with other server modes. (4) It's awkward when doing
fuzzing.
Note that connection_get_status (added in commit 0e1f158a1a
"connections: Add thread-safe status indicator") is still required to
coordinate connection shutdown across threads.
---
server/connections.c | 21 +++++----------------
server/internal.h | 2 +-
server/main.c | 3 +--
3 files changed, 7 insertions(+), 19 deletions(-)
diff --git a/server/connections.c b/server/connections.c
index 95d8296..3adba7b 100644
--- a/server/connections.c
+++ b/server/connections.c
@@ -132,15 +132,17 @@ connection_worker (void *data)
return NULL;
}
-static int
-_handle_single_connection (int sockin, int sockout)
+void
+handle_single_connection (int sockin, int sockout)
{
const char *plugin_name;
- int ret = -1, r;
+ int r;
struct connection *conn;
int nworkers = threads ? threads : DEFAULT_PARALLEL_REQUESTS;
pthread_t *workers = NULL;
+ lock_connection ();
+
if (backend->thread_model (backend) < NBDKIT_THREAD_MODEL_PARALLEL ||
nworkers == 1)
nworkers = 0;
@@ -222,22 +224,9 @@ _handle_single_connection (int sockin, int sockout)
if (r == -1)
goto done;
- ret = connection_get_status (conn);
done:
free_connection (conn);
- return ret;
-}
-
-int
-handle_single_connection (int sockin, int sockout)
-{
- int r;
-
- lock_connection ();
- r = _handle_single_connection (sockin, sockout);
unlock_connection ();
-
- return r;
}
static struct connection *
diff --git a/server/internal.h b/server/internal.h
index bb667ce..b28b1c8 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -234,7 +234,7 @@ struct connection {
connection_close_function close;
};
-extern int handle_single_connection (int sockin, int sockout);
+extern void handle_single_connection (int sockin, int sockout);
extern int connection_get_status (struct connection *conn)
__attribute__((__nonnull__ (1)));
extern int connection_set_status (struct connection *conn, int value)
diff --git a/server/main.c b/server/main.c
index 3e2ac2f..dcfdbde 100644
--- a/server/main.c
+++ b/server/main.c
@@ -894,8 +894,7 @@ start_serving (void)
change_user ();
write_pidfile ();
threadlocal_new_server_thread ();
- if (handle_single_connection (0, 1) == -1)
- exit (EXIT_FAILURE);
+ handle_single_connection (0, 1);
return;
}
--
2.23.0
This function was only ever called straight after
accept_incoming_connections, so simply make
accept_incoming_connections free the sockets at quit time.
---
server/internal.h | 2 --
server/main.c | 2 --
server/sockets.c | 16 ++++++----------
3 files changed, 6 insertions(+), 14 deletions(-)
diff --git a/server/internal.h b/server/internal.h
index b28b1c8..4638727 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -474,8 +474,6 @@ extern int *bind_vsock (size_t *)
__attribute__((__nonnull__ (1)));
extern void accept_incoming_connections (int *socks, size_t nr_socks)
__attribute__((__nonnull__ (1)));
-extern void free_listening_sockets (int *socks, size_t nr_socks)
- __attribute__((__nonnull__ (1)));
/* threadlocal.c */
extern void threadlocal_init (void);
diff --git a/server/main.c b/server/main.c
index dcfdbde..8ff8e79 100644
--- a/server/main.c
+++ b/server/main.c
@@ -885,7 +885,6 @@ start_serving (void)
change_user ();
write_pidfile ();
accept_incoming_connections (socks, nr_socks);
- free_listening_sockets (socks, nr_socks); /* also closes them */
return;
}
@@ -913,7 +912,6 @@ start_serving (void)
fork_into_background ();
write_pidfile ();
accept_incoming_connections (socks, nr_socks);
- free_listening_sockets (socks, nr_socks);
}
static void
diff --git a/server/sockets.c b/server/sockets.c
index 119cb99..11bab6c 100644
--- a/server/sockets.c
+++ b/server/sockets.c
@@ -322,16 +322,6 @@ bind_vsock (size_t *nr_socks)
#endif
}
-void
-free_listening_sockets (int *socks, size_t nr_socks)
-{
- size_t i;
-
- for (i = 0; i < nr_socks; ++i)
- close (socks[i]);
- free (socks);
-}
-
struct thread_data {
int sock;
size_t instance_num;
@@ -476,6 +466,12 @@ check_sockets_and_quit_fd (int *socks, size_t nr_socks)
void
accept_incoming_connections (int *socks, size_t nr_socks)
{
+ size_t i;
+
while (!quit)
check_sockets_and_quit_fd (socks, nr_socks);
+
+ for (i = 0; i < nr_socks; ++i)
+ close (socks[i]);
+ free (socks);
}
--
2.23.0
A frequent cause of crashes on shutdown happens when the quit signal
has been received and the main thread and other threads race each
other at shutdown. As the typical stack trace below shows, what
happens is that the main thread unloads the plugins while they are
still in use by one of the connection threads, causing the connection
thread to segfault when it calls into functions that have already been
unloaded with dlclose.
To avoid this, simply count the number of connection threads we
create, and at exit make the main thread wait until this count drops
to 0 before it actually starts unloading anything.
Thread 2 (Thread 0x7f8fb054ea40 (LWP 3105233)):
#0 _asn1_set_down (down=0x5633dd26c010, node=0x5633dd26be70) at parser_aux.h:116
#1 asn1_delete_structure2 (structure=structure@entry=0x7f8fb0dd3fa0
<_gnutls_pkix1_asn>, flags=flags@entry=0) at structure.c:328
#2 0x00007f8fb06e702b in asn1_delete_structure
(structure=structure@entry=0x7f8fb0dd3fa0 <_gnutls_pkix1_asn>) at structure.c:293
#3 0x00007f8fb0c66544 in _gnutls_global_deinit (destructor=1) at global.c:419
#4 0x00007f8fb0e0d13b in _dl_fini () at dl-fini.c:138
#5 0x00007f8fb0a0ae87 in __run_exit_handlers (status=0, listp=0x7f8fb0b8e578
<__exit_funcs>, run_list_atexit=run_list_atexit@entry=true,
run_dtors=run_dtors@entry=true) at exit.c:108
#6 0x00007f8fb0a0b040 in __GI_exit (status=<optimized out>) at exit.c:139
#7 0x00005633d47626ee in main (argc=<optimized out>, argv=0x7ffecd6115d8) at
main.c:705
Thread 1 (Thread 0x7f8fb054d700 (LWP 3105257)):
#0 backend_finalize (b=<optimized out>, conn=conn@entry=0x5633dd287c40) at
backend.c:227
#1 0x00005633d476579f in handle_single_connection (sockin=<optimized out>,
sockout=<optimized out>) at connections.c:222
#2 0x00005633d476e9e9 in start_thread (datav=0x5633dd269620) at sockets.c:351
#3 0x00007f8fb0b9e4e2 in start_thread (arg=<optimized out>) at
pthread_create.c:479
#4 0x00007f8fb0acd643 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
---
server/sockets.c | 35 +++++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
diff --git a/server/sockets.c b/server/sockets.c
index 11bab6c..3351286 100644
--- a/server/sockets.c
+++ b/server/sockets.c
@@ -322,6 +322,17 @@ bind_vsock (size_t *nr_socks)
#endif
}
+/* This counts the number of connection threads running (note: not the
+ * number of worker threads; each connection thread will start many
+ * independent worker threads in the current implementation). The
+ * purpose of this is so we can wait for all the connection threads to
+ * exit before we return from accept_incoming_connections, so that
+ * unload-time actions happen with no connections open.
+ */
+static pthread_mutex_t count_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t count_cond = PTHREAD_COND_INITIALIZER;
+static unsigned count = 0;
+
struct thread_data {
int sock;
size_t instance_num;
@@ -334,6 +345,10 @@ start_thread (void *datav)
debug ("accepted connection");
+ pthread_mutex_lock (&count_mutex);
+ count++;
+ pthread_mutex_unlock (&count_mutex);
+
/* Set thread-local data. */
threadlocal_new_server_thread ();
threadlocal_set_instance_num (data->instance_num);
@@ -341,6 +356,12 @@ start_thread (void *datav)
handle_single_connection (data->sock, data->sock);
free (data);
+
+ pthread_mutex_lock (&count_mutex);
+ count--;
+ pthread_cond_signal (&count_cond);
+ pthread_mutex_unlock (&count_mutex);
+
return NULL;
}
@@ -467,10 +488,24 @@ void
accept_incoming_connections (int *socks, size_t nr_socks)
{
size_t i;
+ int err;
while (!quit)
check_sockets_and_quit_fd (socks, nr_socks);
+ /* Wait for all threads to exit. */
+ pthread_mutex_lock (&count_mutex);
+ for (;;) {
+ if (count == 0)
+ break;
+ err = pthread_cond_wait (&count_cond, &count_mutex);
+ if (err != 0) {
+ errno = err;
+ perror ("pthread_cond_wait");
+ }
+ }
+ pthread_mutex_unlock (&count_mutex);
+
for (i = 0; i < nr_socks; ++i)
close (socks[i]);
free (socks);
--
2.23.0