Re: [Libguestfs] [PATCH nbdkit 1/5] Add new public nbdkit_parse_probability function

Tuesday, 16 May 2023

On 5/16/23 14:12, Richard W.M. Jones wrote:
...
 In nbdkit-error-filter we need to parse parameters as probabilities.
 This is useful enough to add to nbdkit, since we will use it in
 another filter in future.
 ---
  docs/nbdkit-plugin.pod                  | 19 +++++++
  plugins/python/nbdkit-python-plugin.pod |  6 ++
  include/nbdkit-common.h                 |  2 +
  server/nbdkit.syms                      |  1 +
  server/public.c                         | 37 +++++++++++++
  server/test-public.c                    | 73 +++++++++++++++++++++++++
  plugins/ocaml/NBDKit.mli                | 11 ++--
  plugins/ocaml/NBDKit.ml                 |  2 +
  plugins/ocaml/bindings.c                | 16 ++++++
  plugins/python/modfunctions.c           | 21 +++++++
  tests/test-python-plugin.py             | 12 ++++
  tests/test_ocaml_plugin.ml              |  1 +
  12 files changed, 196 insertions(+), 5 deletions(-)

 diff --git a/docs/nbdkit-plugin.pod b/docs/nbdkit-plugin.pod
 index 860c5cecb..e8d30a98e 100644
 --- a/docs/nbdkit-plugin.pod
 +++ b/docs/nbdkit-plugin.pod
 @@ -1433,6 +1433,25 @@ C<str> can be a string containing a case-insensitive form of
various
  common toggle values.  The function returns 0 or 1 if the parse was
  successful.  If there was an error, it returns C<-1>.

 +=head2 Parsing probabilities
 +
 +Use the C<nbdkit_parse_probability> utility function to parse
 +probabilities.  Common formats understood include: C<"0.1">,
C<"10%">
 +or C<"1:10">, which all mean a probability of 1 in 10.
 +
 + int nbdkit_parse_probability (const char *what, const char *str,
 +                               double *ret);
 +
 +The C<what> parameter is printed in error messages to provide context.
 +The C<str> parameter is the probability string.
 +
 +On success the function returns C<0> and sets C<*ret>.  B<Note> that
 +the probability returned may be outside the range S<[ 0.0..1.0 ]>, for
 +example if C<str == "200%">.  If you want to clamp the result you must
 +check that yourself.
 +
 +On error, nbdkit_error is called and C<-1> is returned.
 +
  =head2 Reading passwords

  The C<nbdkit_read_password> utility function can be used to read
 diff --git a/plugins/python/nbdkit-python-plugin.pod
b/plugins/python/nbdkit-python-plugin.pod
 index e328cc2e0..0e55dcfcc 100644
 --- a/plugins/python/nbdkit-python-plugin.pod
 +++ b/plugins/python/nbdkit-python-plugin.pod
 @@ -136,6 +136,12 @@ C<import errno>.
  Parse a string (such as "100M") into a size in bytes. Wraps the
  C<nbdkit_parse_size()> C function.

 +=head3 C<nbdkit.parse_probability(what, str)>
 +
 +Parse a string (such as "100%") into a probability, returning a
 +floating point number. Wraps the C<nbdkit_parse_probability()> C
 +function.
 +
  =head3 C<nbdkit.shutdown()>

  Request asynchronous server shutdown.
 diff --git a/include/nbdkit-common.h b/include/nbdkit-common.h
 index e070245b8..b44e77323 100644
 --- a/include/nbdkit-common.h
 +++ b/include/nbdkit-common.h
 @@ -105,6 +105,8 @@ NBDKIT_EXTERN_DECL (void, nbdkit_vdebug,

  NBDKIT_EXTERN_DECL (char *, nbdkit_absolute_path, (const char *path));
  NBDKIT_EXTERN_DECL (int64_t, nbdkit_parse_size, (const char *str));
 +NBDKIT_EXTERN_DECL (int, nbdkit_parse_probability,
 +                    (const char *what, const char *str, double *r));
  NBDKIT_EXTERN_DECL (int, nbdkit_parse_bool, (const char *str));
  NBDKIT_EXTERN_DECL (int, nbdkit_parse_int,
                      (const char *what, const char *str, int *r));
 diff --git a/server/nbdkit.syms b/server/nbdkit.syms
 index 353ea2a98..c6480e43e 100644
 --- a/server/nbdkit.syms
 +++ b/server/nbdkit.syms
 @@ -67,6 +67,7 @@
      nbdkit_parse_int64_t;
      nbdkit_parse_int8_t;
      nbdkit_parse_int;
 +    nbdkit_parse_probability;
      nbdkit_parse_size;
      nbdkit_parse_uint16_t;
      nbdkit_parse_uint32_t;
 diff --git a/server/public.c b/server/public.c
 index 71ea6779d..fe2b48c7c 100644
 --- a/server/public.c
 +++ b/server/public.c
 @@ -421,6 +421,43 @@ nbdkit_parse_size (const char *str)
    return size * scale;
  }

 +NBDKIT_DLL_PUBLIC int
 +nbdkit_parse_probability (const char *what, const char *str,
 +                          double *retp)
 +{
 +  double d, d2;
 +  char c;
 +  int n;
 +
 +  if (sscanf (str, "%lg%[:/]%lg%n", &d, &c, &d2, &n) == 3
&&
 +      strcmp (&str[n], "") == 0) { /* N:M or N/M */
 +    if (d == 0 && d2 == 0)         /* 0/0 is OK */
 +      ;
 +    else if (d2 == 0)              /* N/0 is bad */
 +      goto bad_parse;
 +    else
 +      d /= d2; 
I don't want to be a spoilsport, but this division makes me extra
nervous [1], on top of parsing floating point *at all* [2].

[2] Eric has just posted a 19-part series, v2, for QEMU, for fixing the
numerous issues in QEMU's floating point parser. Personal opinion: QEMU
doesn't even have a *valid reason* for parsing floating point! All QEMU
deals with are *integer quantities* such as disk images consisting of
whole numbers of bytes! But again, that's just a personal opinion.

[1] I maintain that floating point is only a part of the C language so
that C could be made "competitive" with Fortran. Therefore I honestly
believe that *no* floating point should be done in C without fully
engaging in numerical analysis. I'm not up to that -- I've had numerical
analysis in college, and I remember just enough of it to steer the heck
clear of it. In C99, there's a whole chapter dedicated to the floating
point environment (7.6 Floating-point environment <fenv.h>), and there's
also the normative Annex F ("IEC 60559 floating-point arithmetic").

In this particular case, we have a floating point division. The divisor
"d2" can be an extremely small *positive* number (DBL_MIN, or another
value with a very small positive mantissa and  a huge absolute value
negative exponent, like FLT_RADIX raised to DBL_MIN_EXP), and the
dividend "d" may be a huge positive number (DBL_MAX, or another value
with a large positive mantissa and a huge positive exponent like
FLT_RADIX raised to DBL_MAX_EXP).

The quotient d/d2 is then almost certainly not representable in the type
-- the resultant exponent would have to be about (DBL_MAX_EXP -
DBL_MIN_EXP), which in practice is about 2 * DBL_MAX_EXP -- and there's
just not another exponent bit available in the type for that.

And I can't even say whether that would result in a silent NaN, or a
silent positive infinity, or a floating point exception. I don't like
code that I can't reason about (even if it is ultimately my fault, for
not knowing the language well enough).

Relatedly... in patch#5, we have:

+  block_size = next_power_of_2 ((uint64_t) (100. / evil_probability));

where "evil_probability" comes from nbdkit_parse_probability(), and is
checked separately to be in [0, 1] (both boundaries inclusive, per the
code).

So, evil_probability==0 will produce a division by zero here, but even
if we assume that's just a typo and we actually mean to exclude 0, the
user can still provide a value near DBL_MIN on the command line, like
1E-37. 100 is 1E+2, so the quotient is 1E+39. Representing 10^39 in
binary requires ceil(39 * log2(10)) bits -- that's 130 bits; uint64_t is
too small for that. And then the following applies from C99:

6.3.1.4 Real floating and integer

    When a finite value of real floating type is converted to an integer
    type other than _Bool, the fractional part is discarded (i.e., the
    value is truncated toward zero). If the value of the integral part
    cannot be represented by the integer type, the behavior is
    undefined. [Footnote 50]

Footnote 50:

    The remaindering operation performed when a value of integer type is
    converted to unsigned type need not be performed when a value of
    real floating type is converted to unsigned type. Thus, the range of
    portable real floating values is (−1, Utype_MAX+1).

(Note the round parens in the footnote: it means the boundaries are
*excluded*. So, a value like -0.5 will be truncated to zero, and a value
like UINT64_MAX + 0.5 will be truncated to UINT64_MAX, but (-1) itself
and (UINT64_MAX + 1) itself are not permitted already.)

I recommend the following: take two unsigned integers, numerator "a" and
denominator "b", from the command line. We know how to parse those safely.

Maintain those values separately.

In every calculation where we need to multiply x by a/b, calculate
(x*a)/b, and use MUL_OVERFLOW() for the multiplication. Where we need to
divide x by (a/b), use (x*b)/a, again with MUL_OVERFLOW(). (And of
course the divisor must never be 0.)

In case we can determine the possible range for "x" *in advance*, for
example because we know we're going to divide x=100 by (a/b), we can
*also* determine the safe range for "a" and "b" as well -- for x=100,
x*b must fit in a uint64_t, so "b" must not exceed UINT64_MAX/100, and
that can be verified safely after parsing.

Floating point is in fact *supposed* to take away all this messing, and
"compress" the ratio into the mantissa, and set the exponent (the scale)
accordingly. But the simplicity is deceiving, due to cancellation, error
accumulation, etc. (Cue the articles "what every programmer should know
about floating point".)

I don't think I can *safely* use floating point, I can only say when
something looks unsafe. Maintaining two positive integers (numerator and
denominator) is more work, but I can at least *reason* about those.

If Eric is OK with this patch set, I won't try to block it, it just
makes me feel very uncomfortable. It's more than just scanf() lacking
proper internal error handling; the real complications come when we
perform operations on the floating point values.

Laszlo

...
 +  }
 +  else if (sscanf (str, "%lg%n", &d, &n) == 1) {
 +    if (strcmp (&str[n], "%") == 0) /* percentage */
 +      d /= 100.0;
 +    else if (strcmp (&str[n], "") == 0) /* probability */
 +      ;
 +    else
 +      goto bad_parse;
 +  }
 +  else
 +    goto bad_parse;
 +
 +  if (retp)
 +    *retp = d;
 +  return 0;
 +
 + bad_parse:
 +  nbdkit_error ("%s: could not parse '%s'", what, str);
 +  return -1;
 +}
 +
  /* Parse a string as a boolean, or return -1 after reporting the error.
   */
  NBDKIT_DLL_PUBLIC int
 diff --git a/server/test-public.c b/server/test-public.c
 index 676411290..0d84abdd2 100644
 --- a/server/test-public.c
 +++ b/server/test-public.c
 @@ -200,6 +200,78 @@ test_nbdkit_parse_size (void)
    return pass;
  }

 +static bool
 +test_nbdkit_parse_probability (void)
 +{
 +  size_t i;
 +  bool pass = true;
 +  struct pair {
 +    const char *str;
 +    int result;
 +    double expected;
 +  } tests[] = {
 +    /* Bogus strings */
 +    { "", -1 },
 +    { "garbage", -1 },
 +    { "0garbage", -1 },
 +    { "1X", -1 },
 +    { "1%%", -1 },
 +    { "1:", -1 },
 +    { "1:1:1", -1 },
 +    { "1:0", -1 }, /* format is valid but divide by zero is not allowed */
 +    { "1/", -1 },
 +    { "1/2/3", -1 },
 +
 +    /* Numbers. */
 +    { "0", 0, 0 },
 +    { "1", 0, 1 },
 +    { "2", 0, 2 }, /* values outside [0..1] range are allowed */
 +    { "0.1", 0, 0.1 },
 +    { "0.5", 0, 0.5 },
 +    { "0.9", 0, 0.9 },
 +    { "1.0000", 0, 1 },
 +
 +    /* Percentages. */
 +    { "0%", 0, 0 },
 +    { "50%", 0, 0.5 },
 +    { "100%", 0, 1 },
 +    { "90.25%", 0, 0.9025 },
 +
 +    /* N in M */
 +    { "1:1000", 0, 0.001 },
 +    { "1/1000", 0, 0.001 },
 +    { "2:99", 0, 2.0/99 },
 +    { "2/99", 0, 2.0/99 },
 +    { "0:1000000", 0, 0 },
 +  };
 +
 +  for (i = 0; i < ARRAY_SIZE (tests); i++) {
 +    int r;
 +    double d;
 +
 +    error_flagged = false;
 +    r = nbdkit_parse_probability ("test", tests[i].str, &d);
 +    if (r != tests[i].result) {
 +      fprintf (stderr,
 +               "Wrong return value for %s, got %d, expected %d\n",
 +               tests[i].str, r, tests[i].result);
 +      pass = false;
 +    }
 +    if (r == 0 && d != tests[i].expected) {
 +      fprintf (stderr,
 +               "Wrong result for %s, got %g, expected %g\n",
 +               tests[i].str, d, tests[i].expected);
 +      pass = false;
 +    }
 +    if ((r == -1) != error_flagged) {
 +      fprintf (stderr, "Wrong error message handling for %s\n",
tests[i].str);
 +      pass = false;
 +    }
 +  }
 +
 +  return pass;
 +}
 +
  static bool
  test_nbdkit_parse_ints (void)
  {
 @@ -503,6 +575,7 @@ main (int argc, char *argv[])
  {
    bool pass = true;
    pass &= test_nbdkit_parse_size ();
 +  pass &= test_nbdkit_parse_probability ();
    pass &= test_nbdkit_parse_ints ();
    pass &= test_nbdkit_read_password ();
    /* nbdkit_absolute_path and nbdkit_nanosleep not unit-tested here, but
 diff --git a/plugins/ocaml/NBDKit.mli b/plugins/ocaml/NBDKit.mli
 index 81447d07d..bc190f267 100644
 --- a/plugins/ocaml/NBDKit.mli
 +++ b/plugins/ocaml/NBDKit.mli
 @@ -134,9 +134,9 @@ val register_plugin :
      into [EINVAL]. *)
  val set_error : Unix.error -> unit

 -(** Bindings for [nbdkit_parse_size], [nbdkit_parse_bool] and
 -    [nbdkit_read_password].  See nbdkit-plugin(3) for information
 -    about these functions.
 +(** Bindings for [nbdkit_parse_size], [nbdkit_parse_probability],
 +    [nbdkit_parse_bool] and [nbdkit_read_password].  See
 +    nbdkit-plugin(3) for information about these functions.

      On error these functions all raise [Invalid_argument].  The
      actual error is sent to the nbdkit error log and is not
 @@ -145,10 +145,11 @@ val set_error : Unix.error -> unit
  (* Note OCaml has functions already for parsing other integers, so
   * there is no need to bind them here.  We only bind the functions
   * which have special abilities in nbdkit: [parse_size] can parse
 - * human sizes, [parse_bool] parses a range of nbdkit-specific
 - * boolean strings, and [read_password] suppresses echo.
 + * human sizes, [parse_probability] and [parse_bool] parses a range
 + * of nbdkit-specific strings, and [read_password] suppresses echo.
   *)
  val parse_size : string -> int64
 +val parse_probability : string -> string -> float
  val parse_bool : string -> bool
  val read_password : string -> string

 diff --git a/plugins/ocaml/NBDKit.ml b/plugins/ocaml/NBDKit.ml
 index e1cf28c94..2d1696917 100644
 --- a/plugins/ocaml/NBDKit.ml
 +++ b/plugins/ocaml/NBDKit.ml
 @@ -160,6 +160,8 @@ let register_plugin ~name
  (* Bindings to nbdkit server functions. *)
  external set_error : Unix.error -> unit = "ocaml_nbdkit_set_error"
[@@noalloc]
  external parse_size : string -> int64 = "ocaml_nbdkit_parse_size"
 +external parse_probability : string -> string -> float =
 +  "ocaml_nbdkit_parse_probability"
  external parse_bool : string -> bool = "ocaml_nbdkit_parse_bool"
  external read_password : string -> string = "ocaml_nbdkit_read_password"
  external realpath : string -> string = "ocaml_nbdkit_realpath"
 diff --git a/plugins/ocaml/bindings.c b/plugins/ocaml/bindings.c
 index f2c9ca07d..4885feac5 100644
 --- a/plugins/ocaml/bindings.c
 +++ b/plugins/ocaml/bindings.c
 @@ -74,6 +74,22 @@ ocaml_nbdkit_parse_size (value strv)
    CAMLreturn (rv);
  }

 +NBDKIT_DLL_PUBLIC value
 +ocaml_nbdkit_parse_probability (value whatv, value strv)
 +{
 +  CAMLparam2 (whatv, strv);
 +  CAMLlocal1 (dv);
 +  int r;
 +  double d;
 +
 +  r = nbdkit_parse_probability (String_val (whatv), String_val (strv), &d);
 +  if (r == -1)
 +    caml_invalid_argument ("nbdkit_parse_probability");
 +  dv = caml_copy_double (d);
 +
 +  CAMLreturn (dv);
 +}
 +
  NBDKIT_DLL_PUBLIC value
  ocaml_nbdkit_parse_bool (value strv)
  {
 diff --git a/plugins/python/modfunctions.c b/plugins/python/modfunctions.c
 index 479707e78..74bfd6f27 100644
 --- a/plugins/python/modfunctions.c
 +++ b/plugins/python/modfunctions.c
 @@ -122,11 +122,32 @@ parse_size (PyObject *self, PyObject *args)
    return PyLong_FromSize_t ((size_t)size);
  }

 +/* nbdkit.parse_probability */
 +static PyObject *
 +parse_probability (PyObject *self, PyObject *args)
 +{
 +  const char *what, *str;
 +  double d;
 +
 +  if (!PyArg_ParseTuple (args, "ss:parse_probability", &what, &str))
 +    return NULL;
 +
 +  if (nbdkit_parse_probability (what, str, &d) == -1) {
 +    PyErr_SetString (PyExc_ValueError,
 +                     "Unable to parse string as probability");
 +    return NULL;
 +  }
 +
 +  return PyFloat_FromDouble (d);
 +}
 +
  static PyMethodDef NbdkitMethods[] = {
    { "debug", debug, METH_VARARGS,
      "Print a debug message" },
    { "export_name", export_name, METH_NOARGS,
      "Return the optional export name negotiated with the client" },
 +  { "parse_probability", parse_probability, METH_VARARGS,
 +    "Parse probability strings into floating point number" },
    { "parse_size", parse_size, METH_VARARGS,
      "Parse human-readable size strings into bytes" },
    { "set_error", set_error, METH_VARARGS,
 diff --git a/tests/test-python-plugin.py b/tests/test-python-plugin.py
 index c3232a112..7f3a2c2e4 100644
 --- a/tests/test-python-plugin.py
 +++ b/tests/test-python-plugin.py
 @@ -55,8 +55,20 @@ class TestAPI(unittest.TestCase):
          with self.assertRaises(ValueError):
              nbdkit.parse_size('foo')

 +    def test_parse_probability(self):
 +        self.assertEqual(nbdkit.parse_probability('test', '1:10'), 0.1)
 +        self.assertEqual(nbdkit.parse_probability('test', '100%'), 1)
 +        self.assertEqual(nbdkit.parse_probability('test', '0'), 0)
 +
 +        with self.assertRaises(TypeError):
 +            nbdkit.parse_probability('test', 17)
 +
 +        with self.assertRaises(ValueError):
 +            nbdkit.parse_probability('test', 'bar')
 +

  TestAPI().test_parse_size()
 +TestAPI().test_parse_probability()

  def config(k, v):
 diff --git a/tests/test_ocaml_plugin.ml b/tests/test_ocaml_plugin.ml
 index 8132de8f8..bf998d361 100644
 --- a/tests/test_ocaml_plugin.ml
 +++ b/tests/test_ocaml_plugin.ml
 @@ -39,6 +39,7 @@ let sparse = Bytes.make nr_sectors '\000' (* sparseness bitmap
*)
  (* Test parse_* functions. *)
  let () =
    assert (NBDKit.parse_size "1M" = Int64.of_int (1024*1024));
 +  assert (NBDKit.parse_probability "test parse probability" "1:10" =
0.1);
    assert (NBDKit.parse_bool "true" = true);
    assert (NBDKit.parse_bool "0" = false)

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

Re: [Libguestfs] [PATCH nbdkit 1/5] Add new public nbdkit_parse_probability function