On 1/1/19 8:33 AM, Richard W.M. Jones wrote:
The original plan was to have a background thread doing the reclaim.
However that cannot work given the design of filters, because a
background thread cannot access the next_ops struct which is only
available during requests.
Therefore we spread the work over the request threads. Each blk_*
function checks whether there is work to do, and if there is will
reclaim up to two blocks from the cache (thus ensuring that we always
make forward progress, since each blk_* function can only add at most
one block to the cache).
---
filters/cache/nbdkit-cache-filter.pod | 71 +++++++++++
filters/cache/cache.h | 13 ++
filters/cache/blk.c | 163 ++++++++++++++++++++++++++
filters/cache/cache.c | 56 +++++++++
filters/cache/lru.c | 7 +-
TODO | 11 +-
tests/Makefile.am | 4 +-
tests/test-cache-max-size.sh | 96 +++++++++++++++
8 files changed, 409 insertions(+), 12 deletions(-)
+=back
+
+The way this works is once the size of the cache exceeds
+S<C<SIZE> ✕ the high threshold>, the filter works to reduce the size
+of the cache until it is less than S<C<SIZE> ✕ the low threshold>.
+Once the size is below the low threshold, no more reclaim work is done
+until the size exceeds the high threshold again.
+
+The default thresholds are high 95% and low 80%. The thresholds are
+expressed as percentages of C<cache-max-size>, and may be greater than
+100.
Do the thresholds support fractional percentages, like 90.5, or must
they be integral? Should we enforce that low threshold < high
threshold? What does it really mean to have a threshold > 100 — that we
can never reach the high threshold?
+/* Do we support reclaiming cache blocks? */
+#ifdef FALLOC_FL_PUNCH_HOLE
+#define HAVE_CACHE_RECLAIM 1
+#else
+#undef HAVE_CACHE_RECLAIM
+#endif
+
Should the man page mention that max cache size can only be enforced
with kernel support? Do we want to go further and probe whether
FALLOC_FL_PUNCH_HOLE works on $TMPDIR? (Even if the kernel supports
punching holes, there are some filesystems that don't).
@@ -74,6 +76,31 @@ enum bm_entry {
BLOCK_DIRTY = 3,
};
+#ifdef HAVE_CACHE_RECLAIM
+/* If we are currently reclaiming blocks from the cache.
+ *
+ * The state machine starts in the NOT_RECLAIMING state. When the
+ * size of the cache exceeds the high threshold, we move to
+ * RECLAIMING_LRU. Once we have exhausted all LRU blocks, we move to
+ * RECLAIMING_ANY (reclaiming any blocks).
+ *
+ * If at any time the size of the cache goes below the low threshold
+ * we move back to the NOT_RECLAIMING state.
+ *
+ * reclaim_blk is the last block that we will looked at.
s/will //
+
+#ifdef HAVE_CACHE_RECLAIM
+
+static void reclaim_one (void);
+static void reclaim_lru (void);
+static void reclaim_any (void);
+static void reclaim_block (void);
+
+/* See if we need to reclaim blocks. */
+static void
+reclaim (void)
+{
+ struct stat statbuf;
+ uint64_t cache_allocated;
+
+ /* If the user didn't set cache-max-size, do nothing. */
+ if (max_size == -1) return;
+
+ /* Check the allocated size of the cache. */
+ if (fstat (fd, &statbuf) == -1) {
+ nbdkit_debug ("cache: fstat: %m");
+ return;
+ }
+ cache_allocated = statbuf.st_blocks * UINT64_C(512);
Is that calculation always going to be accurate, even when more
disks/file systems move towards 4k minimum access?
+
+ if (reclaiming) {
+ /* Keep reclaiming until the cache size drops below the low threshold. */
+ if (cache_allocated < max_size * lo_thresh / 100) {
+ nbdkit_debug ("cache: stop reclaiming");
+ reclaiming = NOT_RECLAIMING;
+ return;
+ }
+ }
+ else {
+ if (cache_allocated < max_size * hi_thresh / 100)
+ return;
+
+ /* Start reclaiming if the cache size goes over the high threshold. */
+ nbdkit_debug ("cache: start reclaiming");
+ reclaiming = RECLAIMING_LRU;
+ }
+
+ /* Reclaim up to 2 cache blocks. */
+ reclaim_one ();
+ reclaim_one ();
+}
+
+/* Reclaim a single cache block. */
+static void
+reclaim_one (void)
+{
+ assert (reclaiming);
+
+ if (reclaiming == RECLAIMING_LRU)
+ reclaim_lru ();
+ else
+ reclaim_any ();
+}
+
+static void
+reclaim_lru (void)
+{
+ uint64_t old_reclaim_blk;
+
+ /* Find the next block in the cache. */
+ reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
+ old_reclaim_blk = reclaim_blk;
+
+ /* Search for an LRU block after this one. */
+ do {
+ if (! lru_has_been_recently_accessed (reclaim_blk)) {
+ reclaim_block ();
+ return;
+ }
+
+ reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
+ if (reclaim_blk == -1) /* wrap around */
+ reclaim_blk = bitmap_next (&bm, 0);
+ } while (reclaim_blk >= 0 && old_reclaim_blk != reclaim_blk);
+
+ if (old_reclaim_blk == reclaim_blk) {
+ /* Run out of LRU blocks, so start reclaiming any block in the cache. */
+ nbdkit_debug ("cache: reclaiming any blocks");
+ reclaiming = RECLAIMING_ANY;
+ reclaim_one ();
+ return;
+ }
+}
+
+static void
+reclaim_any (void)
+{
+ /* Find the next block in the cache. */
+ reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
+ if (reclaim_blk == -1) /* wrap around */
+ reclaim_blk = bitmap_next (&bm, 0);
+
+ reclaim_block ();
+}
+
+static void
+reclaim_block (void)
+{
+ if (reclaim_blk == -1) {
+ nbdkit_debug ("cache: run out of blocks to reclaim!");
+ return;
+ }
+
+ nbdkit_debug ("cache: reclaiming block %" PRIu64, reclaim_blk);
Where does this prioritize clean blocks over dirty blocks? Or is the
.pod change inaccurate given later changes to implementation?
+#ifdef FALLOC_FL_PUNCH_HOLE
+ if (fallocate (fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
+ reclaim_blk * BLKSIZE, BLKSIZE) == -1) {
+ nbdkit_error ("cache: reclaiming cache blocks: "
+ "fallocate: FALLOC_FL_PUNCH_HOLE: %m");
+ return;
+ }
+#else
+#error "no implementation for punching holes"
This #error should be unreachable, given the definition of
HAVE_CACHE_RECLAIM based on FALLOC_FL_PUNCH_HOLE.
+#endif
+
+ bitmap_set_blk (&bm, reclaim_blk, BLOCK_NOT_CACHED);
+}
+
+#else /* !HAVE_CACHE_RECLAIM */
+static void
+reclaim (void)
+{
+ /* nothing */
+}
+#endif /* !HAVE_CACHE_RECLAIM */
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index a14f789..67dde23 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -65,6 +65,8 @@
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
enum cache_mode cache_mode = CACHE_MODE_WRITEBACK;
+int64_t max_size = -1;
+int hi_thresh = 95, lo_thresh = 80;
bool cache_on_read = false;
static int cache_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle,
uint32_t flags, int *err);
@@ -105,6 +107,44 @@ cache_config (nbdkit_next_config *next, void *nxdata,
return -1;
}
}
+#ifdef HAVE_CACHE_RECLAIM
+ else if (strcmp (key, "cache-max-size") == 0) {
+ int64_t r;
+
+ r = nbdkit_parse_size (value);
+ if (r == -1)
+ return -1;
+ /* We set a lower limit for the cache size just to keep out of
+ * trouble.
+ */
+ if (r < 1024*1024) {
+ nbdkit_error ("cache-max-size is too small");
+ return -1;
+ }
+ max_size = r;
+ return 0;
+ }
+ else if (strcmp (key, "cache-high-threshold") == 0) {
+ if (sscanf (value, "%d", &hi_thresh) != 1) {
+ nbdkit_error ("invalid cache-high-threshold parameter: %s", value);
+ return -1;
+ }
+ if (hi_thresh <= 0) {
+ nbdkit_error ("cache-high-threshold must be greater than zero");
+ }
+ return 0;
+ }
+ else if (strcmp (key, "cache-low-threshold") == 0) {
+ if (sscanf (value, "%d", &lo_thresh) != 1) {
+ nbdkit_error ("invalid cache-low-threshold parameter: %s", value);
+ return -1;
+ }
+ if (lo_thresh <= 0) {
+ nbdkit_error ("cache-low-threshold must be greater than zero");
+ }
+ return 0;
+ }
+#endif
There is no #else, so on platforms that can't punch holes, this silently
passes cache-max-size and friends on to the next filter layer, where it
will probably result in some odd error about being unrecognized. Would
it be better to have an explicit #else branch here that specifically
handles (and rejects) these parameters where we can't implement them?
+++ b/tests/test-cache-max-size.sh
+
+# Check that this is a Linux-like system supporting /proc/pid/fd.
+if ! test -d /proc/self/fd; then
+ echo "$0: not a Linux-like system supporting /proc/pid/fd"
Would this read any better if you used /proc/\$pid/fd or /proc/PID/fd in
the error message? Especially since...
+ exit 77
+fi
+
+# Test that qemu-io works
+if ! qemu-io --help >/dev/null; then
+ echo "$0: missing or broken qemu-io"
+ exit 77
+fi
+
+# Need the stat command from coreutils.
+if ! stat --version >/dev/null; then
+ echo "$0: missing or broken stat command"
+ exit 77
+fi
+
+d=cache-max-size.d
+rm -rf $d
+mkdir -p $d
+cleanup_fn rm -rf $d
+
+# Create a cache directory.
+mkdir $d/cache
+TMPDIR=$d/cache
+export TMPDIR
+
+# Create an empty base image.
+truncate -s 1G $d/cache-max-size.img
+
+# Run nbdkit with the caching filter and a low size limit to ensure
+# that the reclaim code is exercised.
+start_nbdkit -P $d/cache-max-size.pid -U $d/cache-max-size.sock \
+ --filter=cache \
+ file $d/cache-max-size.img \
+ cache-max-size=10M cache-on-read=true
+
+# Write > 10M to the plugin.
+qemu-io -f raw "nbd+unix://?socket=$d/cache-max-size.sock" \
+ -c "w -P 0 0 10M" \
+ -c "w -P 0 20M 10M" \
+ -c "r 20M 10M" \
+ -c "r 30M 10M" \
+ -c "w -P 0 40M 10M"
+
+# We can't use ‘du’ on the cache directory because the cache file is
+# deleted by the filter, and so is only accessible via /proc/PID/fd.
...you use PID here to make it obvious it is a placeholder?
+# Get the /proc link to the cache file, and the size of it in bytes.
+fddir="/proc/$( cat $d/cache-max-size.pid )/fd"
+ls -l $fddir
+fd="$fddir/$( ls -l $fddir | grep $TMPDIR | head -1 | awk '{print $9}' )"
+stat -L $fd
+size=$(( $(stat -L -c '%b * %B' $fd) ))
Wow - that's quite a bit of work. But it looks reasonable and sticks to
tools likely to already be present for other tests, and avoids dragging
in lsof or other less-common dependencies.
+
+if [ "$size" -gt $(( 11 * 1024 * 1024 )) ]; then
+ echo "$0: cache size is larger than 10M (actual size: $size bytes)"
+ exit 1
+fi
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3226
Virtualization: qemu.org | libvirt.org