If nbdkit instances are still running, they will hold open HTTP
connections to imageio. In some versions of imageio, starting
finalization in this state causes a 60 second timeout. See-also:
https://listman.redhat.com/archives/libguestfs/2022-February/msg00111.html
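
The shape of the fix, as a rough standalone sketch (this uses plain
Unix/Sys from the OCaml stdlib rather than virt-v2v's On_exit and
Nbdkit helpers, and the nbdkit command line is illustrative only):
keep the nbdkit PIDs in a mutable list, kill them explicitly before
starting finalization, and empty the list so the at-exit cleanup does
not try to kill them a second time.

  (* Hypothetical standalone sketch; not the virt-v2v code itself. *)

  let nbdkit_pids : int list ref = ref []

  (* Start one nbdkit per disk and remember its PID.  The command
   * line here is illustrative only. *)
  let spawn_nbdkit socket =
    let pid =
      Unix.create_process "nbdkit"
        [| "nbdkit"; "--exit-with-parent"; "--unix"; socket; "null" |]
        Unix.stdin Unix.stdout Unix.stderr in
    nbdkit_pids := pid :: !nbdkit_pids

  (* At-exit cleanup: best effort, ignore errors because the
   * processes may already be gone. *)
  let cleanup_nbdkit () =
    List.iter (fun pid -> try Unix.kill pid Sys.sigterm with _ -> ())
      !nbdkit_pids;
    nbdkit_pids := []

  (* Before finalization: a failed kill is unexpected here, so let
   * any exception propagate and fail the conversion. *)
  let kill_nbdkit_before_finalize () =
    List.iter (fun pid -> Unix.kill pid Sys.sigterm) !nbdkit_pids;
    nbdkit_pids := []  (* don't kill them again at exit *)

  let () =
    at_exit cleanup_nbdkit;
    spawn_nbdkit "/tmp/nbdkit.sock";
    (* ... copy disk data over the NBD socket ... *)
    kill_nbdkit_before_finalize ()
    (* ... now start imageio finalization; nbdkit no longer holds
     * any idle HTTP connections open at this point ... *)

In the patch below the PIDs are collected where Nbdkit.run_unix is
called, the at-exit half lives in the existing On_exit.f handler, and
nbdkit_pids is threaded through the stashed tuple so that
rhv_upload_finalize can do the pre-finalization kill.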
Thanks: Nir Soffer
---
output/output_rhv_upload.ml | 43 +++++++++++++++++++++++++++----------
1 file changed, 32 insertions(+), 11 deletions(-)
diff --git a/output/output_rhv_upload.ml b/output/output_rhv_upload.ml
index 4d8dc1c135..1fefed123c 100644
--- a/output/output_rhv_upload.ml
+++ b/output/output_rhv_upload.ml
@@ -280,12 +280,26 @@ e command line has to match the number of guest disk images (for this guest: %d)
ignore (Python_script.run_command cancel_script json_params [])
in
- (* Set up an at-exit handler so we delete the orphan disks on failure. *)
+ (* Set up an at-exit handler to perform some cleanups.
+ * - Kill nbdkit PIDs (only before finalization).
+ * - Delete the orphan disks (only on conversion failure).
+ *)
+ let nbdkit_pids = ref [] in
On_exit.f (
fun () ->
+ (* Kill the nbdkit PIDs. *)
+ List.iter (
+ fun pid ->
+ try
+ kill pid Sys.sigterm
+ with exn -> debug "%s" (Printexc.to_string exn)
+ ) !nbdkit_pids;
+ nbdkit_pids := [];
+
(* virt-v2v writes v2vdir/done on success only. *)
- let success = Sys.file_exists (dir // "done") in
- if not success then (
+ let on_failure = not (Sys.file_exists (dir // "done")) in
+
+ if on_failure then (
if disk_uuids <> [] then
cancel !transfer_ids disk_uuids
)
@@ -351,11 +365,7 @@ e command line has to match the number of guest disk images (for this guest: %d)
if is_ovirt_host then
Nbdkit.add_arg cmd "is_ovirt_host" "true";
let _, pid = Nbdkit.run_unix ~socket cmd in
-
- (* --exit-with-parent should ensure nbdkit is cleaned
- * up when we exit, but it's not supported everywhere.
- *)
- On_exit.kill pid;
+ List.push_front pid nbdkit_pids
) (List.combine disks disk_uuids);
(* Stash some data we will need during finalization. *)
@@ -363,7 +373,7 @@ e command line has to match the number of guest disk images (for this guest: %d)
let t = (disk_sizes : int64 list), disk_uuids, !transfer_ids,
finalize_script, createvm_script, json_params,
rhv_storagedomain_uuid, rhv_cluster_uuid,
- rhv_cluster_cpu_architecture, rhv_cluster_name in
+ rhv_cluster_cpu_architecture, rhv_cluster_name, nbdkit_pids in
t
and rhv_upload_finalize dir source inspect target_meta
@@ -374,7 +384,8 @@ and rhv_upload_finalize dir source inspect target_meta
(disk_sizes, disk_uuids, transfer_ids,
finalize_script, createvm_script, json_params,
rhv_storagedomain_uuid, rhv_cluster_uuid,
- rhv_cluster_cpu_architecture, rhv_cluster_name) =
+ rhv_cluster_cpu_architecture, rhv_cluster_name,
+ nbdkit_pids) =
(* Check the cluster CPU arch matches what we derived about the
* guest during conversion.
*)
@@ -386,6 +397,16 @@ and rhv_upload_finalize dir source inspect target_meta
rhv_cluster_name target_meta.guestcaps.gcaps_arch arch
);
+ (* We must kill all our nbdkit instances before finalizing the
+ * transfer. See:
+ * https://listman.redhat.com/archives/libguestfs/2022-February/msg00111.html
+ *
+ * We want to fail here if the kill fails because nbdkit
+ * died already, as that would be unexpected.
+ *)
+ List.iter (fun pid -> kill pid Sys.sigterm) !nbdkit_pids;
+ nbdkit_pids := []; (* Don't kill them again in the On_exit handler. *)
+
(* Finalize all the transfers. *)
let json_params =
let ids = List.map (fun id -> JSON.String id) transfer_ids in
@@ -442,7 +463,7 @@ module RHVUpload = struct
type t = int64 list * string list * string list *
Python_script.script * Python_script.script *
JSON.field list * string option * string option *
- string option * string
+ string option * string * int list ref
let to_string options =
"-o rhv-upload" ^
--
2.32.0