On 02/08/22 16:07, Richard W.M. Jones wrote:
If nbdkit instance(s) are still running then they will hold open some
http connections to imageio. In some versions of imageio, starting
finalization in this state causes a 60 second timeout.
See-also:
https://listman.redhat.com/archives/libguestfs/2022-February/msg00111.html
Thanks: Nir Soffer
---
output/output_rhv_upload.ml | 43 +++++++++++++++++++++++++++----------
1 file changed, 32 insertions(+), 11 deletions(-)
diff --git a/output/output_rhv_upload.ml b/output/output_rhv_upload.ml
index 4d8dc1c135..1fefed123c 100644
--- a/output/output_rhv_upload.ml
+++ b/output/output_rhv_upload.ml
@@ -280,12 +280,26 @@ e command line has to match the number of guest disk images (for
this guest: %d)
ignore (Python_script.run_command cancel_script json_params [])
in
- (* Set up an at-exit handler so we delete the orphan disks on failure. *)
+ (* Set up an at-exit handler to perform some cleanups.
+ * - Kill nbdkit PIDs (only before finalization).
+ * - Delete the orphan disks (only on conversion failure).
+ *)
+ let nbdkit_pids = ref [] in
On_exit.f (
fun () ->
+ (* Kill the nbdkit PIDs. *)
+ List.iter (
+ fun pid ->
+ try
+ kill pid Sys.sigterm
+ with exn -> debug "%s" (Printexc.to_string exn)
+ ) !nbdkit_pids;
+ nbdkit_pids := [];
+
(* virt-v2v writes v2vdir/done on success only. *)
- let success = Sys.file_exists (dir // "done") in
- if not success then (
+ let on_failure = not (Sys.file_exists (dir // "done")) in
+
+ if on_failure then (
if disk_uuids <> [] then
cancel !transfer_ids disk_uuids
)
(1) I think this hunk -- replacing "success" with "on_failure" -- does
not buy us much, and in any case, it does not belong in this patch.
@@ -351,11 +365,7 @@ e command line has to match the number of guest
disk images (for this guest: %d)
if is_ovirt_host then
Nbdkit.add_arg cmd "is_ovirt_host" "true";
let _, pid = Nbdkit.run_unix ~socket cmd in
-
- (* --exit-with-parent should ensure nbdkit is cleaned
- * up when we exit, but it's not supported everywhere.
- *)
- On_exit.kill pid;
+ List.push_front pid nbdkit_pids
) (List.combine disks disk_uuids);
(* Stash some data we will need during finalization. *)
@@ -363,7 +373,7 @@ e command line has to match the number of guest disk images (for this
guest: %d)
let t = (disk_sizes : int64 list), disk_uuids, !transfer_ids,
finalize_script, createvm_script, json_params,
rhv_storagedomain_uuid, rhv_cluster_uuid,
- rhv_cluster_cpu_architecture, rhv_cluster_name in
+ rhv_cluster_cpu_architecture, rhv_cluster_name, nbdkit_pids in
t
and rhv_upload_finalize dir source inspect target_meta
@@ -374,7 +384,8 @@ and rhv_upload_finalize dir source inspect target_meta
(disk_sizes, disk_uuids, transfer_ids,
finalize_script, createvm_script, json_params,
rhv_storagedomain_uuid, rhv_cluster_uuid,
- rhv_cluster_cpu_architecture, rhv_cluster_name) =
+ rhv_cluster_cpu_architecture, rhv_cluster_name,
+ nbdkit_pids) =
(* Check the cluster CPU arch matches what we derived about the
* guest during conversion.
*)
@@ -386,6 +397,16 @@ and rhv_upload_finalize dir source inspect target_meta
rhv_cluster_name target_meta.guestcaps.gcaps_arch arch
);
+ (* We must kill all our nbdkit instances before finalizing the
+ * transfer. See:
+ *
https://listman.redhat.com/archives/libguestfs/2022-February/msg00111.html
+ *
+ * We want to fail here if the kill fails because nbdkit
+ * died already, as that would be unexpected.
+ *)
+ List.iter (fun pid -> kill pid Sys.sigterm) !nbdkit_pids;
(2) I think here you could *append* (for addressing the concern you
mention in
<https://listman.redhat.com/archives/libguestfs/2022-February/msg00114.html>):
List.iter (fun pid -> ignore (waitpid [] pid)) !nbdkit_pids;
So that you do not proceed to finalization until the nbdkit processes
actually exit.
(It's better to call waitpid in a separate loop than to add it to the
current nameless function. Using a separate loop, the TERM signals are
sent to the nbdkit processes in quick succession (without blocking on
waitpids), so the nbdkits can all clean up and exit in parallel. Then,
in the second loop, blocking one waitpid with another waitpid is fine,
as we only care for all waitpids completing eventually.)
Apart from these, the patch looks OK to me!
Thanks,
Laszlo
+ nbdkit_pids := []; (* Don't kill them again in the On_exit
handler. *)
+
(* Finalize all the transfers. *)
let json_params =
let ids = List.map (fun id -> JSON.String id) transfer_ids in
@@ -442,7 +463,7 @@ module RHVUpload = struct
type t = int64 list * string list * string list *
Python_script.script * Python_script.script *
JSON.field list * string option * string option *
- string option * string
+ string option * string * int list ref
let to_string options =
"-o rhv-upload" ^