Re: [Libguestfs] [PATCH common 4/4] mltools: Allow waiting for killed PIDs

Friday, 15 July 2022

On 07/14/22 14:36, Richard W.M. Jones wrote:
...
 Add a new, optional [?wait] parameter to On_exit.kill, allowing
 programs to wait for a number of seconds for the subprocess to exit.
 ---
  mltools/on_exit.ml  | 30 +++++++++++++++++++++---------
  mltools/on_exit.mli | 14 ++++++++++++--
  2 files changed, 33 insertions(+), 11 deletions(-)

 diff --git a/mltools/on_exit.ml b/mltools/on_exit.ml
 index e8353df..cdaa83c 100644
 --- a/mltools/on_exit.ml
 +++ b/mltools/on_exit.ml
 @@ -24,10 +24,10 @@ open Unix
  open Printf

  type action =
 -  | Unlink of string     (* filename *)
 -  | Rm_rf of string      (* directory *)
 -  | Kill of int * int    (* signal, pid *)
 -  | Fn of (unit -> unit) (* generic function *)
 +  | Unlink of string        (* filename *)
 +  | Rm_rf of string         (* directory *)
 +  | Kill of int * int * int (* wait, signal, pid *)
 +  | Fn of (unit -> unit)    (* generic function *)

  (* List of (priority, action). *)
  let actions = ref []
 @@ -35,18 +35,30 @@ let actions = ref []
  (* Perform a single exit action, printing any exception but
   * otherwise ignoring failures.
   *)
 -let do_action action =
 +let rec do_action action = 
I'd slightly prefer (a) do_kill to be introduced either before
do_action, or (b) for do_kill to be defined inside do_action, to using
"let rec" here. I think I understand what "rec" does to scoping, but
still, we don't have actual recursion here.

(I've noticed this pattern in many places in the v2v projects, and it
always confuses me -- it doesn't bring too much convenience IMO, so I'd
rather restrict it to actual recursion.)

...
    try
      match action with
      | Unlink file -> Unix.unlink file
      | Rm_rf dir ->
         let cmd = sprintf "rm -rf %s" (Filename.quote dir) in
         ignore (Tools_utils.shell_command cmd)
 -    | Kill (signal, pid) ->
 -       kill pid signal
 +    | Kill (wait, signal, pid) ->
 +       do_kill ~wait ~signal ~pid
      | Fn f -> f ()
    with exn -> debug "%s" (Printexc.to_string exn)

 +and do_kill ~wait ~signal ~pid =
 +  kill pid signal;
 +
 +  let rec loop i =
 +    if i > 0 then (
 +      let pid', _ = waitpid [ WNOHANG ] pid in
 +      if pid' = 0 then
 +        loop (i-1) 
Missing: "sleep 1;" before the tail-recursive call, I think.

...
 +    )
 +  in
 +  loop wait
 +
  (* Make sure the actions are performed only once. *)
  let done_actions = ref false

 @@ -106,6 +118,6 @@ let rm_rf ?(prio = 1000) dir =
    register ();
    List.push_front (prio, Rm_rf dir) actions

 -let kill ?(prio = 1000) ?(signal = Sys.sigterm) pid =
 +let kill ?(prio = 1000) ?(wait = 0) ?(signal = Sys.sigterm) pid =
    register ();
 -  List.push_front (prio, Kill (signal, pid)) actions
 +  List.push_front (prio, Kill (wait, signal, pid)) actions
 diff --git a/mltools/on_exit.mli b/mltools/on_exit.mli
 index 910783e..dd35101 100644
 --- a/mltools/on_exit.mli
 +++ b/mltools/on_exit.mli
 @@ -58,12 +58,22 @@ val unlink : ?prio:int -> string -> unit
  val rm_rf : ?prio:int -> string -> unit
  (** Recursively remove a temporary directory on exit (using [rm -rf]). *)

 -val kill : ?prio:int -> ?signal:int -> int -> unit
 +val kill : ?prio:int -> ?wait:int -> ?signal:int -> int -> unit
  (** Kill [PID] on exit.  The signal sent defaults to [Sys.sigterm].

      Use this with care since you can end up unintentionally killing
      another process if [PID] goes away or doesn't exist before the
 -    program exits. *)
 +    program exits.
 +
 +    The optional [?wait] flag attempts to wait for a specified
 +    number of seconds for the subprocess to go away.  For example
 +    using [~wait:5] will wait for up to 5 seconds.   Since this
 +    runs when virt-v2v is exiting, it is best to keep waiting times
 +    as short as possible.  Also there is no way to report errors
 +    in the subprocess.  If reliable cleanup of a subprocess is
 +    required then this is not the correct place to do it.
 +
 +    [?wait] defaults to [0] which means we do not try to wait. *)

  val register : unit -> unit
  (** Force this module to register its at_exit function and signal

(please consider formatting *.mli before *.ml)

I believe I take the opposite position on this; I'd rather wait forever.
No subprocess is expected to hang, and we should leave no subprocess
behind. If a subprocess hangs, the parent process should (apparently)
hang forever too, and let users report bugs.

That would also eliminate the question of *how* to wait for N seconds;
we'd just drop WNOHANG from waitpid.

But this is just my opinion; food for thought. :)

Thanks,
Laszlo

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

Re: [Libguestfs] [PATCH common 4/4] mltools: Allow waiting for killed PIDs