Perf analysis of the cow filter showed that a great deal of time was
spent holding the global lock. This change makes the locking more
fine-grained: the global lock is now used only to protect the bitmap
from concurrent access, and nothing else.
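The pattern in blk.c is to hold the lock only around the bitmap access
itself. A minimal sketch, abridged from the diff below (it relies on
blk.c's static state; ACQUIRE_LOCK_FOR_CURRENT_SCOPE is the scoped-lock
macro from cleanup.h, which unlocks when the enclosing scope exits; the
*trimmed assignment is assumed from the existing function body, which
the hunk truncates):

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

  void
  blk_status (uint64_t blknum, bool *present, bool *trimmed)
  {
    /* Unlocked automatically when the scope (here, the function) exits. */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);

    *present = state != BLOCK_NOT_ALLOCATED;
    *trimmed = state == BLOCK_TRIMMED;
  }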
I added a new read-modify-write lock (rmw_lock) to serialize the few
places where we use an RMW cycle to modify an unaligned block; such
unaligned requests should be fairly rare. Previously the global lock
protected these paths as well.
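The RMW sites all follow the same shape. A sketch of the unaligned-head
case of cow_pwrite from the diff below (error propagation kept, pointer
and counter advances omitted):

  if (blkoffs) {  /* Unaligned head: read-modify-write the block. */
    uint64_t n = MIN (BLKSIZE - blkoffs, count);

    /* rmw_lock serializes concurrent unaligned updates of a block;
     * blk_read and blk_write take the bitmap lock internally.
     */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
    r = blk_read (next_ops, nxdata, blknum, block, err);
    if (r != -1) {
      memcpy (&block[blkoffs], buf, n);
      r = blk_write (blknum, block, err);
    }
    if (r == -1)
      return -1;
  }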
A benchmark shows this is a considerable improvement. Running the test
from here:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/be...
Before this change:
  real  3m51.086s
  user  0m0.994s
  sys   0m3.568s
After this change:
  real  2m1.091s
  user  0m1.038s
  sys   0m3.576s
Compared to a similar pipeline using qemu-img convert and qcow2:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/be...
  real  2m22.368s
  user  0m54.415s
  sys   1m27.410s
---
filters/cow/blk.h | 7 -------
filters/cow/blk.c | 27 ++++++++++++++++++++++++++-
filters/cow/cow.c | 41 ++++++++++++++---------------------------
3 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index d96b9924..be3e764a 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -44,13 +44,6 @@ extern int blk_init (void);
/* Close the overlay, free the bitmap. */
extern void blk_free (void);
-/*----------------------------------------------------------------------
- * ** NOTE **
- *
- * An exclusive lock must be held when you call any function below
- * this line.
- */
-
/* Allocate or resize the overlay and bitmap. */
extern int blk_set_size (uint64_t new_size);
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 51ba91c4..cbd40188 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -90,9 +90,12 @@
#include <alloca.h>
#endif
+#include <pthread.h>
+
#include <nbdkit-filter.h>
#include "bitmap.h"
+#include "cleanup.h"
#include "fdatasync.h"
#include "rounding.h"
#include "pread.h"
@@ -104,6 +107,9 @@
/* The temporary overlay. */
static int fd = -1;
+/* This lock protects the bitmap from parallel access. */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
/* Bitmap. */
static struct bitmap bm;
@@ -186,6 +192,8 @@ static uint64_t size = 0;
int
blk_set_size (uint64_t new_size)
{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+
size = new_size;
if (bitmap_resize (&bm, size) == -1)
@@ -205,6 +213,7 @@ blk_set_size (uint64_t new_size)
void
blk_status (uint64_t blknum, bool *present, bool *trimmed)
{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
*present = state != BLOCK_NOT_ALLOCATED;
@@ -219,7 +228,18 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, int *err)
{
off_t offset = blknum * BLKSIZE;
- enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+ enum bm_entry state;
+
+ /* The state might be modified from another thread - for example
+ * another thread might write (BLOCK_NOT_ALLOCATED ->
+ * BLOCK_ALLOCATED) while we are reading from the plugin, returning
+ * the old data. However a read issued after the write returns
+ * should always return the correct data.
+ */
+ {
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+ }
nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64
") is %s",
blknum, (uint64_t) offset, state_to_string (state));
@@ -260,6 +280,8 @@ int
blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
{
+ /* XXX Could make this lock more fine-grained with some thought. */
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
off_t offset = blknum * BLKSIZE;
enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
unsigned n = BLKSIZE, tail = 0;
@@ -322,6 +344,8 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
nbdkit_error ("pwrite: %m");
return -1;
}
+
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
return 0;
@@ -339,6 +363,7 @@ blk_trim (uint64_t blknum, int *err)
* here. However it's not trivial since BLKSIZE is unrelated to the
* overlay filesystem block size.
*/
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
bitmap_set_blk (&bm, blknum, BLOCK_TRIMMED);
return 0;
}
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 93e10f24..54291476 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -45,16 +45,17 @@
#include <nbdkit-filter.h>
#include "cleanup.h"
-
-#include "blk.h"
#include "isaligned.h"
#include "minmax.h"
#include "rounding.h"
-/* In order to handle parallel requests safely, this lock must be held
- * when calling any blk_* functions.
+#include "blk.h"
+
+/* Read-modify-write requests are serialized through this global lock.
+ * This is only used for unaligned requests which should be
+ * infrequent.
*/
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;
bool cow_on_cache;
@@ -117,7 +118,6 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_set_size (size);
if (r == -1)
return -1;
@@ -221,7 +221,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t n = MIN (BLKSIZE - blkoffs, count);
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r == -1)
return -1;
@@ -242,7 +241,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
* smarter here.
*/
while (count >= BLKSIZE) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_read (next_ops, nxdata, blknum, buf, err);
if (r == -1)
return -1;
@@ -256,7 +254,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r == -1)
return -1;
@@ -294,10 +291,10 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t n = MIN (BLKSIZE - blkoffs, count);
/* Do a read-modify-write operation on the current block.
- * Hold the lock over the whole operation.
+ * Hold the rmw_lock over the whole operation.
*/
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memcpy (&block[blkoffs], buf, n);
@@ -314,7 +311,6 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Aligned body */
while (count >= BLKSIZE) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_write (blknum, buf, err);
if (r == -1)
return -1;
@@ -328,7 +324,7 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memcpy (block, buf, count);
@@ -376,9 +372,9 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t n = MIN (BLKSIZE - blkoffs, count);
/* Do a read-modify-write operation on the current block.
- * Hold the lock over the whole operation.
+ * Hold the rmw_lock over the whole operation.
*/
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[blkoffs], 0, n);
@@ -399,7 +395,6 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
/* XXX There is the possibility of optimizing this: since this loop is
* writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
*/
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_write (blknum, block, err);
if (r == -1)
return -1;
@@ -411,7 +406,7 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[count], 0, BLKSIZE - count);
@@ -455,7 +450,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Do a read-modify-write operation on the current block.
* Hold the lock over the whole operation.
*/
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[blkoffs], 0, n);
@@ -471,7 +466,6 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Aligned body */
while (count >= BLKSIZE) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_trim (blknum, err);
if (r == -1)
return -1;
@@ -483,7 +477,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[count], 0, BLKSIZE - count);
@@ -553,7 +547,6 @@ cow_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Aligned body */
while (remaining) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_cache (next_ops, nxdata, blknum, block, mode, err);
if (r == -1)
return -1;
@@ -588,12 +581,6 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
assert (IS_ALIGNED (count, BLKSIZE));
assert (count > 0); /* We must make forward progress. */
- /* We hold the lock for the whole time, even when requesting extents
- * from the plugin, because we want to present an atomic picture of
- * the current state.
- */
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-
while (count > 0) {
bool present, trimmed;
struct nbdkit_extent e;
--
2.29.0.rc2