On 01/25/22 12:26, Richard W.M. Jones wrote:
On platforms with 64K page sizes (RHEL + aarch64 or ppc64le) we have
hit a problem in test-memory-allocator-malloc-mlock.sh which appears
to be caused by:
1. The test requests a 2K mlocked buffer.
2. The allocated buffer straddles a page boundary sometimes.
3. Linux expands the mlock request to two pages, ie. the requested
size becomes 2 * 64K = 128K
4. The standard mlock limit on Linux is 64K so the mlock request is
rejected and the test fails.
Also POSIX requires mlock parameters to be page aligned (although
Linux does not require this).
(1) This is not precise; POSIX says that the implementation *may*
require the *base address* to be page aligned.
This commit attempts to fix both problems by ensuring the mlock
request is always page aligned. On the separate mlock path we use the
usual vector reserve function to do overflow checks and so on, but
then we reallocate the buffer using posix_memalign (or valloc) so it
is page aligned before locking it. This requires a duplicate
allocation and memcpy but this path is not performance critical.
I extended the test mlock size from 2K to 8K to try to provoke the
original bug (if it still happens) more often.
Thanks: Laszlo Ersek
---
configure.ac | 4 +-
common/allocators/malloc.c | 89 ++++++++++++++++++---
tests/test-memory-allocator-malloc-mlock.sh | 16 +++-
3 files changed, 94 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac
index c426aec8..1ab85e3e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -384,7 +384,9 @@ AC_CHECK_FUNCS([\
pipe \
pipe2 \
ppoll \
- posix_fadvise])
+ posix_fadvise \
+ posix_memalign \
+ valloc])
dnl Check for structs and members.
AC_CHECK_MEMBERS([struct dirent.d_type], [], [], [[#include <dirent.h>]])
diff --git a/common/allocators/malloc.c b/common/allocators/malloc.c
index eea44432..986a45c9 100644
--- a/common/allocators/malloc.c
+++ b/common/allocators/malloc.c
@@ -36,6 +36,9 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
@@ -46,6 +49,7 @@
#include <nbdkit-plugin.h>
#include "cleanup.h"
+#include "rounding.h"
#include "vector.h"
#include "allocator.h"
@@ -81,13 +85,46 @@ m_alloc_free (struct allocator *a)
}
}
-/* Extend the underlying bytearray if needed. mlock if requested. */
+/* Extend the underlying bytearray if needed. */
static int
-extend (struct m_alloc *ma, uint64_t new_size)
+extend_without_mlock (struct m_alloc *ma, uint64_t new_size)
{
- ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&ma->lock);
size_t old_size, n;
+ if (ma->ba.cap < new_size) {
+ old_size = ma->ba.cap;
+ n = new_size - ma->ba.cap;
+
+ if (bytearray_reserve (&ma->ba, n) == -1) {
+ nbdkit_error ("realloc: %m");
+ return -1;
+ }
+
+ /* Initialize the newly allocated memory to 0. */
+ memset (ma->ba.ptr + old_size, 0, n);
+ }
+
+ return 0;
+}
+
+#ifdef HAVE_MLOCK
+static int
+extend_with_mlock (struct m_alloc *ma, uint64_t new_size)
+{
+ size_t old_size, n;
+ void *p;
+#ifdef HAVE_POSIX_MEMALIGN
+ int r;
+#endif
+
+ long pagesize = sysconf (_SC_PAGE_SIZE);
+ assert (pagesize > 0);
+
+ /* POSIX requires both base and size of mlock to be page aligned,
+ * even though Linux does not.
+ */
+ new_size = ROUND_UP (new_size, pagesize);
(2) Same comment as (1). So this is likely unnecessary (but not
necessarily wrong either).
+
if (ma->ba.cap < new_size) {
old_size = ma->ba.cap;
n = new_size - ma->ba.cap;
@@ -96,30 +133,62 @@ extend (struct m_alloc *ma, uint64_t new_size)
/* Since the memory might be moved by realloc, we must unlock the
* original array.
*/
- if (ma->use_mlock)
+ if (ma->use_mlock && ma->ba.ptr != NULL)
(3) I don't understand the nullity check on "ma->ba.ptr".
munlock (ma->ba.ptr, ma->ba.cap);
#endif
+ /* Call the normal vector reserve function so that it does the
+ * overflow checks.
+ */
if (bytearray_reserve (&ma->ba, n) == -1) {
nbdkit_error ("realloc: %m");
return -1;
}
+ assert (new_size <= ma->ba.cap);
/* Initialize the newly allocated memory to 0. */
memset (ma->ba.ptr + old_size, 0, n);
-#ifdef HAVE_MLOCK
- if (ma->use_mlock) {
- if (mlock (ma->ba.ptr, ma->ba.cap) == -1) {
- nbdkit_error ("allocator=malloc: mlock: %m");
- return -1;
- }
+ /* But now reallocate it as a page-aligned buffer so we can mlock it. */
+#ifdef HAVE_POSIX_MEMALIGN
+ if ((r = posix_memalign (&p, pagesize, ma->ba.cap)) != 0) {
+ errno = r;
+ nbdkit_error ("posix_memalign: %m");
+ return -1;
}
+#elif HAVE_VALLOC
+ p = valloc (ma->ba.cap);
+ if (p == NULL) {
+ nbdkit_error ("valloc: %m");
+ return -1;
+ }
+#else
+#error "this platform does not have posix_memalign or valloc"
#endif
+
+ memcpy (p, ma->ba.ptr, new_size);
+ if (mlock (ma->ba.ptr, new_size) == -1) {
(4) I don't understand the mlock() call. The memcpy() copies the
extended byte-array's contents from the original location to the aligned
allocation. But then we don't lock "p", we lock the original
byte-array's base address again.
I think this was not found by your testing because the size is rounded
up, and that masks the problem with realloc() inside the generic vector
implementation.
(5) Even if we mlocked "p", I still don't understand how the aligned
allocation would be saved in "ma".
(6) In connection, how does m_alloc_free() deal with the aligned allocation?
+ nbdkit_error ("allocator=malloc: mlock: %m");
+ return -1;
+ }
}
return 0;
}
+#endif /* HAVE_MLOCK */
+
+static int
+extend (struct m_alloc *ma, uint64_t new_size)
+{
+ ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&ma->lock);
+
+#ifdef HAVE_MLOCK
+ if (ma->use_mlock)
+ return extend_with_mlock (ma, new_size);
+#endif
+
+ return extend_without_mlock (ma, new_size);
+}
static int
m_alloc_set_size_hint (struct allocator *a, uint64_t size_hint)
diff --git a/tests/test-memory-allocator-malloc-mlock.sh
b/tests/test-memory-allocator-malloc-mlock.sh
index 4ef92db6..983ef22b 100755
--- a/tests/test-memory-allocator-malloc-mlock.sh
+++ b/tests/test-memory-allocator-malloc-mlock.sh
@@ -46,9 +46,9 @@ if ! nbdkit memory --dump-plugin | grep -sq mlock=yes; then
fi
# ulimit -l is measured in kilobytes and so for this test must be at
-# least 2 (kilobytes) and we actually check it's a bit larger to allow
+# least 8 (kilobytes) and we actually check it's a bit larger to allow
# room for error. On Linux the default is usually 64.
-requires test `ulimit -l` -gt 8
+requires test `ulimit -l` -gt 16
sock=$(mktemp -u /tmp/nbdkit-test-sock.XXXXXX)
files="memory-allocator-malloc-mlock.pid $sock"
@@ -57,17 +57,21 @@ cleanup_fn rm -f $files
# Run nbdkit with memory plugin.
start_nbdkit -P memory-allocator-malloc-mlock.pid -U $sock \
- memory 2048 allocator=malloc,mlock=true
+ memory 8192 allocator=malloc,mlock=true
nbdsh --connect "nbd+unix://?socket=$sock" \
-c '
-# Write some stuff to the beginning, middle and end.
+# Write some stuff.
buf1 = b"1" * 512
h.pwrite(buf1, 0)
buf2 = b"2" * 512
h.pwrite(buf2, 1024)
buf3 = b"3" * 512
h.pwrite(buf3, 1536)
+buf4 = b"4" * 512
+h.pwrite(buf4, 4096)
+buf5 = b"5" * 1024
+h.pwrite(buf5, 8192-1024)
# Read it back.
buf11 = h.pread(len(buf1), 0)
@@ -76,4 +80,8 @@ buf22 = h.pread(len(buf2), 1024)
assert buf2 == buf22
buf33 = h.pread(len(buf3), 1536)
assert buf3 == buf33
+buf44 = h.pread(len(buf4), 4096)
+assert buf4 == buf44
+buf55 = h.pread(len(buf5), 8192-1024)
+assert buf5 == buf55
'
(7) For consistency, we should perhaps replace
8192-1024
with
8192-len(buf5)
Thanks,
Laszlo