[Libguestfs] [PATCH v11 03/10] daemon: utils: New functions unix_canonical_path, utf16le_to_utf8 and tests.

Monday, 31 July 2017

These utility functions will be used in the OCaml inspection code.
---
 daemon/daemon_utils_tests.ml |  15 +++++++
 daemon/utils.ml              | 100 +++++++++++++++++++++++++++++++++++++++++++
 daemon/utils.mli             |  12 ++++++
 3 files changed, 127 insertions(+)

diff --git a/daemon/daemon_utils_tests.ml b/daemon/daemon_utils_tests.ml
index 892509d89..b1f02de30 100644
--- a/daemon/daemon_utils_tests.ml
+++ b/daemon/daemon_utils_tests.ml
@@ -46,3 +46,18 @@ let () =
 let () =
   assert (proc_unmangle_path "\\040" = " ");
   assert (proc_unmangle_path "\\040\\040" = "  ")
+
+(* Test unix_canonical_path. *)
+let () =
+  (* Each pair is (input path, expected canonical form). *)
+  let cases = [
+    "/", "/";
+    "/usr", "/usr";
+    "/usr/", "/usr";
+    "/usr/local", "/usr/local";
+    "///", "/";
+    "///usr//local//", "/usr/local";
+    "/usr///", "/usr";
+  ] in
+  List.iter (
+    fun (input, expected) ->
+      assert (unix_canonical_path input = expected)
+  ) cases
+
+(* Test utf16le_to_utf8. *)
+let () =
+  (* "Windows" encoded as UTF-16LE: each ASCII byte is followed by \x00. *)
+  let windows = "\x57\x00\x69\x00\x6e\x00\x64\x00\x6f\x00\x77\x00\x73\x00" in
+  (* The same text followed by U+00AE, which takes two bytes in UTF-8. *)
+  let windows_reg = windows ^ "\xae\x00" in
+  assert (utf16le_to_utf8 windows = "Windows");
+  assert (utf16le_to_utf8 windows_reg = "Windows\xc2\xae")
diff --git a/daemon/utils.ml b/daemon/utils.ml
index b459a2314..b94515b71 100644
--- a/daemon/utils.ml
+++ b/daemon/utils.ml
@@ -240,3 +240,103 @@ let proc_unmangle_path path =
 let is_small_file path =
   is_regular_file path &&
     (stat path).st_size <= 2 * 1048 * 1024
+
+(* Collapse runs of "/" in [path] and drop any trailing "/".  A leading
+ * "/" is kept, so absolute paths stay absolute.  "." and ".." are not
+ * interpreted.
+ *)
+let unix_canonical_path path =
+  let prefix =
+    if String.length path > 0 && path.[0] = '/' then "/" else "" in
+  let components =
+    List.filter (fun elem -> elem <> "") (String.nsplit "/" path) in
+  prefix ^ String.concat "/" components
+
+(* Convert a UTF-16LE encoded string to UTF-8.
+ *
+ * Note that we cannot use iconv here because inside the appliance
+ * all i18n databases are deleted.  For the same reason we cannot
+ * use functions like hivex_value_string, as they also use iconv
+ * internally.
+ *
+ * A single trailing NUL byte on odd-length input, and a trailing
+ * "\000\000" terminator, are dropped before conversion.
+ *
+ * Raises Invalid_argument if the input has odd length and does not
+ * end in a NUL byte, or if it contains an unpaired surrogate.
+ *
+ * https://en.wikipedia.org/wiki/UTF-16
+ * Also inspired by functions in glib's glib/gutf8.c
+ *)
+let utf16le_to_utf8 instr =
+  (* If the length is odd and the last character is ASCII NUL, just
+   * drop that.  (If it's not ASCII NUL, then there's an error)
+   *)
+  let len = String.length instr in
+  let instr =
+    if len mod 2 = 1 then (
+      if instr.[len-1] = '\000' then String.sub instr 0 (len-1)
+      else invalid_arg "input is not a valid UTF16-LE string: length is odd"
+    ) else instr in
+
+  (* The length should now be even.  If the last two bytes are
+   * '\0\0' then assume it's a NUL-terminated string from the
+   * Windows registry and drop both characters.
+   *)
+  let len = String.length instr in
+  let instr =
+    if len >= 2 && instr.[len-2] = '\000' && instr.[len-1] = '\000' then
+      String.sub instr 0 (len-2)
+    else instr in
+
+  (* The input length is only used as an initial size hint; the
+   * buffer grows as needed.
+   *)
+  let outbuf = Buffer.create len in
+
+  (* Encode a wide character as UTF-8 and write to outbuf.
+   * Basically this is g_unichar_to_utf8 implemented in OCaml.
+   *
+   * Values passed in by the loop below never exceed 0x10FFFF, so
+   * only the 1 to 4 byte forms are reached in practice; the longer
+   * forms are kept to mirror the glib function.
+   *)
+  let encode_utf8 c =
+    (* first = marker bits of the lead byte, len = bytes to emit. *)
+    let first, len =
+      if c < 0x80 then
+        (0, 1)
+      else if c < 0x800 then
+        (0xc0, 2)
+      else if c < 0x10000 then
+        (0xe0, 3)
+      else if c < 0x200000 then
+        (0xf0, 4)
+      else if c < 0x4000000 then
+        (0xf8, 5)
+      else
+        (0xfc, 6) in
+    (* Recurse down to the lead byte first so that bytes are appended
+     * to outbuf in most-significant-first order.
+     *)
+    let rec loop i c =
+      if i = 0 then Buffer.add_char outbuf (Char.chr (c lor first))
+      else if i > 0 then (
+        loop (i-1) (c lsr 6);
+        Buffer.add_char outbuf (Char.chr ((c land 0x3f) lor 0x80))
+      )
+    in
+    loop (len-1) c
+  in
+
+  (* Loop over the input UTF16-LE characters. *)
+  let is_high_surrogate c = c >= 0xd800 && c < 0xdc00
+  and is_low_surrogate c = c >= 0xdc00 && c < 0xe000
+  and surrogate_value highc lowc =
+    0x1_0000 + (highc - 0xd800) * 0x400 + lowc - 0xdc00
+  in
+
+  let len = String.length instr in
+  let rec loop i =
+    if i+1 >= len then ()
+    else (
+      (* Little endian: low byte first, high byte second. *)
+      let c = Char.code instr.[i] + (Char.code instr.[i+1] lsl 8) in
+
+      (* wc = decoded code point, skip = input bytes consumed. *)
+      let wc, skip =
+        (* High surrogate - must come first. *)
+        if is_high_surrogate c then (
+          if i+3 >= len then
+            invalid_arg "input is not a valid UTF16-LE string: high surrogate at end of string";
+          let lowc = Char.code instr.[i+2] + (Char.code instr.[i+3] lsl 8) in
+          if not (is_low_surrogate lowc) then
+            invalid_arg "input is not a valid UTF16-LE string: high surrogate not followed by low surrogate";
+          (surrogate_value c lowc, 4)
+        )
+        else if is_low_surrogate c then
+          invalid_arg "input is not a valid UTF16-LE string: unexpected low surrogate"
+        else
+          (c, 2) in
+
+      encode_utf8 wc;
+      loop (i+skip)
+    )
+  in
+  loop 0;
+
+  Buffer.contents outbuf
diff --git a/daemon/utils.mli b/daemon/utils.mli
index 16569f018..b602115ef 100644
--- a/daemon/utils.mli
+++ b/daemon/utils.mli
@@ -85,3 +85,15 @@ val commandr : ?fold_stdout_on_stderr:bool -> string -> string
list -> (int * st
 
 val is_small_file : string -> bool
 (** Return true if the path is a small regular file. *)
+
+val unix_canonical_path : string -> string
+(** Canonicalize a Unix path, so "///usr//local//" -> "/usr/local"
+
+    Repeated and trailing "/" characters are removed and the result
+    is returned as a new string; the argument is not modified. *)
+
+val utf16le_to_utf8 : string -> string
+(** Convert a UTF16-LE string to UTF-8.
+
+    This uses a simple internal implementation since we cannot use
+    iconv inside the daemon.
+
+    A trailing pair of NUL bytes (a NUL terminator) is dropped.
+
+    @raise Invalid_argument on malformed input such as an unpaired
+    surrogate. *)
-- 
2.13.2


    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

[Libguestfs] [PATCH v11 03/10] daemon: utils: New functions unix_canonical_path, utf16le_to_utf8 and tests.