These utility functions will be used in the OCaml inspection code.
---
daemon/daemon_utils_tests.ml | 15 +++++++
daemon/utils.ml | 100 +++++++++++++++++++++++++++++++++++++++++++
daemon/utils.mli | 12 ++++++
3 files changed, 127 insertions(+)
diff --git a/daemon/daemon_utils_tests.ml b/daemon/daemon_utils_tests.ml
index 892509d89..b1f02de30 100644
--- a/daemon/daemon_utils_tests.ml
+++ b/daemon/daemon_utils_tests.ml
@@ -46,3 +46,18 @@ let () =
let () =
assert (proc_unmangle_path "\\040" = " ");
assert (proc_unmangle_path "\\040\\040" = " ")
+
+(* Test unix_canonical_path: repeated and trailing slashes are removed. *)
+let () =
+  List.iter
+    (fun (input, expected) -> assert (unix_canonical_path input = expected))
+    [ "/", "/";
+      "/usr", "/usr"; "/usr/", "/usr";
+      "/usr/local", "/usr/local";
+      "///", "/"; "///usr//local//", "/usr/local";
+      "/usr///", "/usr" ]
+
+(* Test utf16le_to_utf8: ASCII only, then U+00AE which takes two UTF-8 bytes. *)
+let () = List.iter (fun (utf16, utf8) -> assert (utf16le_to_utf8 utf16 = utf8))
+  [ "\x57\x00\x69\x00\x6e\x00\x64\x00\x6f\x00\x77\x00\x73\x00", "Windows";
+    "\x57\x00\x69\x00\x6e\x00\x64\x00\x6f\x00\x77\x00\x73\x00\xae\x00", "Windows\xc2\xae" ]
diff --git a/daemon/utils.ml b/daemon/utils.ml
index ecbf924d5..68bb874d5 100644
--- a/daemon/utils.ml
+++ b/daemon/utils.ml
@@ -242,3 +242,103 @@ let proc_unmangle_path path =
let is_small_file path =
is_regular_file path &&
(stat path).st_size <= 2 * 1048 * 1024
+
+(* Collapse repeated slashes and drop trailing ones, keeping a leading slash. *)
+let unix_canonical_path path =
+  let root = if String.length path > 0 && path.[0] = '/' then "/" else "" in
+  let components = List.filter (fun elem -> elem <> "") (String.nsplit "/" path) in
+  root ^ String.concat "/" components
+
+(* Convert a UTF16-LE string to UTF-8.
+ *
+ * Note that we cannot use iconv here because inside the appliance
+ * all i18n databases are deleted.  For the same reason we cannot
+ * use functions like hivex_value_string, as they also use iconv
+ * internally.
+ *
+ * Raises Invalid_argument on an odd length that is not explained by
+ * one trailing NUL byte, and on unpaired surrogates.
+ *
+ * https://en.wikipedia.org/wiki/UTF-16
+ * Also inspired by functions in glib's glib/gutf8.c
+ *)
+let utf16le_to_utf8 instr =
+  (* If the length is odd and the last character is ASCII NUL, just
+   * drop that.  (If it's not ASCII NUL, then there's an error)
+   *)
+  let len = String.length instr in
+  let instr =
+    if len mod 2 = 1 then (
+      if instr.[len-1] = '\000' then String.sub instr 0 (len-1)
+      else invalid_arg "input is not a valid UTF16-LE string: length is odd"
+    ) else instr in
+
+  (* The length should now be even.  If the last two bytes are
+   * '\0\0' then assume it's a NUL-terminated string from the
+   * Windows registry and drop both characters.
+   *)
+  let len = String.length instr in
+  let instr =
+    if len >= 2 && instr.[len-2] = '\000' && instr.[len-1] = '\000' then
+      String.sub instr 0 (len-2)
+    else instr in
+
+  let outbuf = Buffer.create len in
+
+  (* Encode a wide character as UTF-8 and write to outbuf.
+   * Basically this is g_unichar_to_utf8 implemented in OCaml.
+   * [first] marks the lead byte; [nbytes] is the sequence length.
+   *)
+  let encode_utf8 c =
+    let first, nbytes =
+      if c < 0x80 then (0, 1)
+      else if c < 0x800 then (0xc0, 2)
+      else if c < 0x10000 then (0xe0, 3)
+      else if c < 0x200000 then (0xf0, 4)
+      else if c < 0x4000000 then (0xf8, 5)
+      else (0xfc, 6) in
+    (* Recurse to the lead byte first so bytes come out in order. *)
+    let rec loop i c =
+      if i = 0 then Buffer.add_char outbuf (Char.chr (c lor first))
+      else if i > 0 then (
+        loop (i-1) (c lsr 6);
+        Buffer.add_char outbuf (Char.chr ((c land 0x3f) lor 0x80))
+      )
+    in
+    loop (nbytes-1) c
+  in
+
+  (* Surrogate pairs carry code points above U+FFFF in two units. *)
+  let is_high_surrogate c = c >= 0xd800 && c < 0xdc00
+  and is_low_surrogate c = c >= 0xdc00 && c < 0xe000
+  and surrogate_value highc lowc =
+    0x1_0000 + (highc - 0xd800) * 0x400 + lowc - 0xdc00
+  in
+
+  (* Loop over the input, one little-endian 16 bit unit at a time. *)
+  let len = String.length instr in
+  let rec loop i =
+    if i+1 >= len then ()
+    else (
+      let c = Char.code instr.[i] + (Char.code instr.[i+1] lsl 8) in
+      let wc, skip =
+        (* High surrogate - must come first. *)
+        if is_high_surrogate c then (
+          if i+3 >= len then
+            invalid_arg "input is not a valid UTF16-LE string: high surrogate at end of string";
+          let lowc = Char.code instr.[i+2] + (Char.code instr.[i+3] lsl 8) in
+          if not (is_low_surrogate lowc) then
+            invalid_arg "input is not a valid UTF16-LE string: high surrogate not followed by low surrogate";
+          (surrogate_value c lowc, 4)
+        )
+        else if is_low_surrogate c then
+          invalid_arg "input is not a valid UTF16-LE string: unexpected low surrogate"
+        else (c, 2) in
+
+      encode_utf8 wc;
+      loop (i+skip)
+    )
+  in
+  loop 0;
+
+  Buffer.contents outbuf
diff --git a/daemon/utils.mli b/daemon/utils.mli
index d3c8bdf4d..94a77de01 100644
--- a/daemon/utils.mli
+++ b/daemon/utils.mli
@@ -85,3 +85,15 @@ val commandr : ?flags:command_flag list -> string -> string list -> (int * strin
val is_small_file : string -> bool
(** Return true if the path is a small regular file. *)
+
+val unix_canonical_path : string -> string
+(** Canonicalize a Unix path, so "///usr//local//" -> "/usr/local"
+
+    Repeated and trailing slashes are removed.  A new string is
+    returned; the argument is not modified. *)
+
+val utf16le_to_utf8 : string -> string
+(** Convert a UTF16-LE string to UTF-8.  A trailing NUL terminator is
+    dropped.  Raises [Invalid_argument] on an unpaired surrogate.
+    This uses a simple internal implementation since we cannot use
+    iconv inside the daemon. *)
--
2.13.2