When a highly multi-threaded program such as nbdcopy encounters an
error, there is a race condition in the library which can cause an
assertion failure and thus a core dump:
(1) An error occurs on one of the threads. nbdcopy calls exit(3).
(2) In lib/errors.c, the destructor calls pthread_key_delete.
(3) Another thread which is still running also encounters an error,
and inside libnbd the library calls set_error().
(4) The call to set_error() calls pthread_getspecific. This is
undefined behavior per POSIX, since the key has already been destroyed
in step (2), but glibc handles it by returning NULL with an EINVAL
error (POSIX recommends, but can't mandate, that course of action for
technical reasons). In any case, the error message is lost, and any
subsequent call to nbd_get_error() will return NULL.
(5) We enter the %DEAD state, where there is an assertion that
nbd_get_error() is not NULL. This assertion is supposed to be for
checking that the code called set_error() before entering the %DEAD
state. Although we did call set_error(), the error message was lost
at step (4) and nbd_get_error() will always return NULL. This
assertion failure causes a crash.
There aren't any good ways to fix this. We have proven that the
assertions are too tight (it IS possible for one thread's first use of
libnbd API, which causes the allocation of a thread-local error
context, to occur after another thread has already triggered the
destructor via exit(3)), so remove those. Meanwhile, since POSIX says
any use of a destroyed key is undefined, add some atomic bookkeeping
such that we track a reference count of how many threads have
allocated an error context and subsequently had the per-thread data
destructor called. Only call pthread_key_delete() if all threads had
the chance to exit; leaking small amounts of memory is preferable to
undefined behavior.
Affected tests that now produce the new leak message:
- copy/copy-nbd-error.sh (known issue - we probably want to improve
nbdcopy to gracefully issue NBD_CMD_DISC on error, rather than
outright calling exit(), to work around the assertion failure in
nbdkit 1.32.5)
- various fuse/* (not sure if we can address that; cleaning up FUSE is
tricky)
This reverts a small part of commit 831cbf7ee14c ("generator: Allow
DEAD state actions to run").
Thanks: Richard W.M. Jones <rjones(a)redhat.com>
Signed-off-by: Eric Blake <eblake(a)redhat.com>
---
Taking Rich's analysis and weakening of the assertions, but tweaking
the errors.c to avoid undefined behavior.
generator/state_machine_generator.ml | 5 +--
generator/states.c | 1 -
lib/errors.c | 50 ++++++++++++++++++----------
3 files changed, 34 insertions(+), 22 deletions(-)
diff --git a/generator/state_machine_generator.ml b/generator/state_machine_generator.ml
index 43c1ce4c..9f94bed1 100644
--- a/generator/state_machine_generator.ml
+++ b/generator/state_machine_generator.ml
@@ -449,10 +449,7 @@ let
pr " abort (); /* Should never happen, but keeps GCC happy. */\n";
pr " }\n";
pr "\n";
- pr " if (r == -1) {\n";
- pr " assert (nbd_get_error () != NULL);\n";
- pr " return -1;\n";
- pr " }\n";
+ pr " if (r == -1) return -1;\n";
pr " } while (!blocked);\n";
pr " return 0;\n";
pr "}\n";
diff --git a/generator/states.c b/generator/states.c
index fa0f8d71..58a896ca 100644
--- a/generator/states.c
+++ b/generator/states.c
@@ -192,7 +192,6 @@ STATE_MACHINE {
DEAD:
/* The caller should have used set_error() before reaching here */
- assert (nbd_get_error ());
abort_option (h);
nbd_internal_abort_commands (h, &h->cmds_to_issue);
nbd_internal_abort_commands (h, &h->cmds_in_flight);
diff --git a/lib/errors.c b/lib/errors.c
index 8b77650e..133c752b 100644
--- a/lib/errors.c
+++ b/lib/errors.c
@@ -34,6 +34,8 @@ struct last_error {
/* Thread-local storage of the last error. */
static pthread_key_t errors_key;
+/* Zero if errors_key is invalid, else 1 + threads using errors_key. */
+static _Atomic int key_use_count;
static void free_errors_key (void *vp) LIBNBD_ATTRIBUTE_NONNULL (1);
@@ -51,6 +53,7 @@ errors_key_create (void)
strerror (err));
abort ();
}
+ key_use_count++;
}
/* Destroy the thread-local key when the library is unloaded. */
@@ -59,16 +62,16 @@ static void errors_key_destroy (void) __attribute__ ((destructor));
static void
errors_key_destroy (void)
{
- struct last_error *last_error = pthread_getspecific (errors_key);
-
/* "No destructor functions shall be invoked by
* pthread_key_delete(). Any destructor function that may have been
* associated with key shall no longer be called upon thread exit."
+ * If any threads were still using the key, the memory they allocated
+ * will be leaked; this is unusual enough to diagnose.
*/
- if (last_error != NULL) {
- free (last_error->error);
- free (last_error);
- }
+ if (key_use_count > 1)
+ fprintf (stderr, "%s: %s: %d threads leaking thread-local data\n",
+ "libnbd", "errors_key_destroy", key_use_count - 1);
+ key_use_count = 0;
pthread_key_delete (errors_key);
}
@@ -80,6 +83,8 @@ free_errors_key (void *vp)
{
struct last_error *last_error = vp;
+ assert (key_use_count > 1);
+ key_use_count--;
free (last_error->error);
free (last_error);
}
@@ -87,16 +92,23 @@ free_errors_key (void *vp)
static struct last_error *
allocate_last_error_on_demand (void)
{
- struct last_error *last_error = pthread_getspecific (errors_key);
+ struct last_error *last_error = NULL;
- if (!last_error) {
- last_error = calloc (1, sizeof *last_error);
- if (last_error) {
- int err = pthread_setspecific (errors_key, last_error);
- if (err != 0) {
- /* This is not supposed to happen (XXX). */
- fprintf (stderr, "%s: %s: %s\n", "libnbd",
"pthread_setspecific",
- strerror (err));
+ if (key_use_count) {
+ last_error = pthread_getspecific (errors_key);
+ if (!last_error) {
+ last_error = calloc (1, sizeof *last_error);
+ if (last_error) {
+ int err = pthread_setspecific (errors_key, last_error);
+ key_use_count++;
+ if (err != 0) {
+ /* This is not supposed to happen (XXX). */
+ fprintf (stderr, "%s: %s: %s\n", "libnbd",
"pthread_setspecific",
+ strerror (err));
+ free (last_error);
+ last_error = NULL;
+ key_use_count--;
+ }
}
}
}
@@ -146,8 +158,10 @@ nbd_internal_get_error_context (void)
const char *
nbd_get_error (void)
{
- struct last_error *last_error = pthread_getspecific (errors_key);
+ struct last_error *last_error = NULL;
+ if (key_use_count)
+ last_error = pthread_getspecific (errors_key);
if (!last_error)
return NULL;
return last_error->error;
@@ -156,8 +170,10 @@ nbd_get_error (void)
int
nbd_get_errno (void)
{
- struct last_error *last_error = pthread_getspecific (errors_key);
+ struct last_error *last_error = NULL;
+ if (key_use_count)
+ last_error = pthread_getspecific (errors_key);
if (!last_error)
return 0;
return last_error->errnum;
--
2.39.2