On 07/14/22 14:36, Richard W.M. Jones wrote:
Add a new, optional [?wait] parameter to On_exit.kill, allowing
programs to wait for a number of seconds for the subprocess to exit.
---
mltools/on_exit.ml | 30 +++++++++++++++++++++---------
mltools/on_exit.mli | 14 ++++++++++++--
2 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/mltools/on_exit.ml b/mltools/on_exit.ml
index e8353df..cdaa83c 100644
--- a/mltools/on_exit.ml
+++ b/mltools/on_exit.ml
@@ -24,10 +24,10 @@ open Unix
open Printf
type action =
- | Unlink of string (* filename *)
- | Rm_rf of string (* directory *)
- | Kill of int * int (* signal, pid *)
- | Fn of (unit -> unit) (* generic function *)
+ | Unlink of string (* filename *)
+ | Rm_rf of string (* directory *)
+ | Kill of int * int * int (* wait, signal, pid *)
+ | Fn of (unit -> unit) (* generic function *)
(* List of (priority, action). *)
let actions = ref []
@@ -35,18 +35,30 @@ let actions = ref []
(* Perform a single exit action, printing any exception but
* otherwise ignoring failures.
*)
-let do_action action =
+let rec do_action action =
I'd slightly prefer (a) do_kill to be introduced either before
do_action, or (b) for do_kill to be defined inside do_action, to using
"let rec" here. I think I understand what "rec" does to scoping, but
still, we don't have actual recursion here.
(I've noticed this pattern in many places in the v2v projects, and it
always confuses me -- it doesn't bring too much convenience IMO, so I'd
rather restrict it to actual recursion.)
try
match action with
| Unlink file -> Unix.unlink file
| Rm_rf dir ->
let cmd = sprintf "rm -rf %s" (Filename.quote dir) in
ignore (Tools_utils.shell_command cmd)
- | Kill (signal, pid) ->
- kill pid signal
+ | Kill (wait, signal, pid) ->
+ do_kill ~wait ~signal ~pid
| Fn f -> f ()
with exn -> debug "%s" (Printexc.to_string exn)
+and do_kill ~wait ~signal ~pid =
+ kill pid signal;
+
+ let rec loop i =
+ if i > 0 then (
+ let pid', _ = waitpid [ WNOHANG ] pid in
+ if pid' = 0 then
+ loop (i-1)
Missing: "sleep 1;" before the tail-recursive call, I think.
+ )
+ in
+ loop wait
+
(* Make sure the actions are performed only once. *)
let done_actions = ref false
@@ -106,6 +118,6 @@ let rm_rf ?(prio = 1000) dir =
register ();
List.push_front (prio, Rm_rf dir) actions
-let kill ?(prio = 1000) ?(signal = Sys.sigterm) pid =
+let kill ?(prio = 1000) ?(wait = 0) ?(signal = Sys.sigterm) pid =
register ();
- List.push_front (prio, Kill (signal, pid)) actions
+ List.push_front (prio, Kill (wait, signal, pid)) actions
diff --git a/mltools/on_exit.mli b/mltools/on_exit.mli
index 910783e..dd35101 100644
--- a/mltools/on_exit.mli
+++ b/mltools/on_exit.mli
@@ -58,12 +58,22 @@ val unlink : ?prio:int -> string -> unit
val rm_rf : ?prio:int -> string -> unit
(** Recursively remove a temporary directory on exit (using [rm -rf]). *)
-val kill : ?prio:int -> ?signal:int -> int -> unit
+val kill : ?prio:int -> ?wait:int -> ?signal:int -> int -> unit
(** Kill [PID] on exit. The signal sent defaults to [Sys.sigterm].
Use this with care since you can end up unintentionally killing
another process if [PID] goes away or doesn't exist before the
- program exits. *)
+ program exits.
+
+ The optional [?wait] flag attempts to wait for a specified
+ number of seconds for the subprocess to go away. For example
+ using [~wait:5] will wait for up to 5 seconds. Since this
+ runs when virt-v2v is exiting, it is best to keep waiting times
+ as short as possible. Also there is no way to report errors
+ in the subprocess. If reliable cleanup of a subprocess is
+ required then this is not the correct place to do it.
+
+ [?wait] defaults to [0] which means we do not try to wait. *)
val register : unit -> unit
(** Force this module to register its at_exit function and signal
(please consider formatting *.mli before *.ml)
I believe I take the opposite position on this; I'd rather wait forever.
No subprocess is expected to hang, and we should leave no subprocess
behind. If a subprocess hangs, the parent process should (apparently)
hang forever too, and let users report bugs.
That would also eliminate the question of *how* to wait for N seconds;
we'd just drop WNOHANG from waitpid.
But this is just my opinion; food for thought. :)
Thanks,
Laszlo