Perf analysis of the cow filter showed that a great deal of time was
spent holding the global lock. This change makes the locking more
fine-grained: the global lock is now used only to protect the bitmap
from concurrent access, and nothing else.
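The pattern in blk.c is to hold the lock only around the bitmap access
itself. A minimal sketch, abridged from the diff below (it relies on
blk.c's static state; ACQUIRE_LOCK_FOR_CURRENT_SCOPE is the scoped-lock
macro from cleanup.h, which unlocks when the enclosing scope exits; the
*trimmed assignment is assumed from the existing function body, which
the hunk truncates):

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

  void
  blk_status (uint64_t blknum, bool *present, bool *trimmed)
  {
    /* Unlocked automatically when the scope (here, the function) exits. */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);

    *present = state != BLOCK_NOT_ALLOCATED;
    *trimmed = state == BLOCK_TRIMMED;
  }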
I added a new read-modify-write lock (rmw_lock) to serialize the few
places where we use an RMW cycle to modify an unaligned block; such
unaligned requests should be fairly rare. Previously the global lock
protected these paths as well.
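The RMW sites all follow the same shape. A sketch of the unaligned-head
case of cow_pwrite from the diff below (error propagation kept, pointer
and counter advances omitted):

  if (blkoffs) {  /* Unaligned head: read-modify-write the block. */
    uint64_t n = MIN (BLKSIZE - blkoffs, count);

    /* rmw_lock serializes concurrent unaligned updates of a block;
     * blk_read and blk_write take the bitmap lock internally.
     */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
    r = blk_read (next_ops, nxdata, blknum, block, err);
    if (r != -1) {
      memcpy (&block[blkoffs], buf, n);
      r = blk_write (blknum, block, err);
    }
    if (r == -1)
      return -1;
  }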
A benchmark shows this is a considerable improvement. Running the test
from here:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/be...
Before this change:
  real  3m51.086s
  user  0m0.994s
  sys   0m3.568s
After this change:
  real  2m1.091s
  user  0m1.038s
  sys   0m3.576s
Compared to a similar pipeline using qemu-img convert and qcow2:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/be...
  real  2m22.368s
  user  0m54.415s
  sys   1m27.410s
---
filters/cow/blk.h | 7 -------
filters/cow/blk.c | 27 ++++++++++++++++++++++++++-
filters/cow/cow.c | 41 ++++++++++++++---------------------------
3 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index d96b9924..be3e764a 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -44,13 +44,6 @@ extern int blk_init (void);
/* Close the overlay, free the bitmap. */
extern void blk_free (void);
-/*----------------------------------------------------------------------
- * ** NOTE **
- *
- * An exclusive lock must be held when you call any function below
- * this line.
- */
-
/* Allocate or resize the overlay and bitmap. */
extern int blk_set_size (uint64_t new_size);
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 51ba91c4..cbd40188 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -90,9 +90,12 @@
#include <alloca.h>
#endif
+#include <pthread.h>
+
#include <nbdkit-filter.h>
#include "bitmap.h"
+#include "cleanup.h"
#include "fdatasync.h"
#include "rounding.h"
#include "pread.h"
@@ -104,6 +107,9 @@
/* The temporary overlay. */
static int fd = -1;
+/* This lock protects the bitmap from parallel access. */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
/* Bitmap. */
static struct bitmap bm;
@@ -186,6 +192,8 @@ static uint64_t size = 0;
int
blk_set_size (uint64_t new_size)
{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+
size = new_size;
if (bitmap_resize (&bm, size) == -1)
@@ -205,6 +213,7 @@ blk_set_size (uint64_t new_size)
void
blk_status (uint64_t blknum, bool *present, bool *trimmed)
{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
*present = state != BLOCK_NOT_ALLOCATED;
@@ -219,7 +228,18 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, int *err)
{
off_t offset = blknum * BLKSIZE;
- enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+ enum bm_entry state;
+
+ /* The state might be modified from another thread - for example
+ * another thread might write (BLOCK_NOT_ALLOCATED ->
+ * BLOCK_ALLOCATED) while we are reading from the plugin, returning
+ * the old data. However a read issued after the write returns
+ * should always return the correct data.
+ */
+ {
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+ }
nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64
") is %s",
blknum, (uint64_t) offset, state_to_string (state));
@@ -260,6 +280,8 @@ int
blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
{
+ /* XXX Could make this lock more fine-grained with some thought. */
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
off_t offset = blknum * BLKSIZE;
enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
unsigned n = BLKSIZE, tail = 0;
@@ -322,6 +344,8 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
nbdkit_error ("pwrite: %m");
return -1;
}
+
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
return 0;
@@ -339,6 +363,7 @@ blk_trim (uint64_t blknum, int *err)
* here. However it's not trivial since BLKSIZE is unrelated to the
* overlay filesystem block size.
*/
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
bitmap_set_blk (&bm, blknum, BLOCK_TRIMMED);
return 0;
}
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 93e10f24..54291476 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -45,16 +45,17 @@
#include <nbdkit-filter.h>
#include "cleanup.h"
-
-#include "blk.h"
#include "isaligned.h"
#include "minmax.h"
#include "rounding.h"
-/* In order to handle parallel requests safely, this lock must be held
- * when calling any blk_* functions.
+#include "blk.h"
+
+/* Read-modify-write requests are serialized through this global lock.
+ * This is only used for unaligned requests which should be
+ * infrequent.
*/
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;
bool cow_on_cache;
@@ -117,7 +118,6 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_set_size (size);
if (r == -1)
return -1;
@@ -221,7 +221,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t n = MIN (BLKSIZE - blkoffs, count);
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r == -1)
return -1;
@@ -242,7 +241,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
* smarter here.
*/
while (count >= BLKSIZE) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_read (next_ops, nxdata, blknum, buf, err);
if (r == -1)
return -1;
@@ -256,7 +254,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r == -1)
return -1;
@@ -294,10 +291,10 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t n = MIN (BLKSIZE - blkoffs, count);
/* Do a read-modify-write operation on the current block.
- * Hold the lock over the whole operation.
+ * Hold the rmw_lock over the whole operation.
*/
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memcpy (&block[blkoffs], buf, n);
@@ -314,7 +311,6 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Aligned body */
while (count >= BLKSIZE) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_write (blknum, buf, err);
if (r == -1)
return -1;
@@ -328,7 +324,7 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
assert (block);
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memcpy (block, buf, count);
@@ -376,9 +372,9 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t n = MIN (BLKSIZE - blkoffs, count);
/* Do a read-modify-write operation on the current block.
- * Hold the lock over the whole operation.
+ * Hold the rmw_lock over the whole operation.
*/
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[blkoffs], 0, n);
@@ -399,7 +395,6 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
/* XXX There is the possibility of optimizing this: since this loop is
* writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
*/
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_write (blknum, block, err);
if (r == -1)
return -1;
@@ -411,7 +406,7 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[count], 0, BLKSIZE - count);
@@ -455,7 +450,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Do a read-modify-write operation on the current block.
* Hold the lock over the whole operation.
*/
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[blkoffs], 0, n);
@@ -471,7 +466,6 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Aligned body */
while (count >= BLKSIZE) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_trim (blknum, err);
if (r == -1)
return -1;
@@ -483,7 +477,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Unaligned tail */
if (count) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
r = blk_read (next_ops, nxdata, blknum, block, err);
if (r != -1) {
memset (&block[count], 0, BLKSIZE - count);
@@ -553,7 +547,6 @@ cow_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
/* Aligned body */
while (remaining) {
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
r = blk_cache (next_ops, nxdata, blknum, block, mode, err);
if (r == -1)
return -1;
@@ -588,12 +581,6 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
assert (IS_ALIGNED (count, BLKSIZE));
assert (count > 0); /* We must make forward progress. */
- /* We hold the lock for the whole time, even when requesting extents
- * from the plugin, because we want to present an atomic picture of
- * the current state.
- */
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-
while (count > 0) {
bool present, trimmed;
struct nbdkit_extent e;
--
2.29.0.rc2