I changed the name of the function to is_valid_xml_string() to avoid
camel-case. I also briefly commented the function itself, since it
does expect its input to be valid UTF-8. Also, the patch was missing
an end-attribute call, which apparently didn't matter in this case
since the element was being ended subsequently anyway -- but its
omission was an oversight, not intentional.
The updated patch to handle illegal XML characters that appear in
registry values is below:
[todd@tm-nc hivex]# cat hivexml.patch
diff -urNp hivex-1.3.3-orig/xml/hivexml.c hivex-1.3.3-new/xml/hivexml.c
--- hivex-1.3.3-orig/xml/hivexml.c 2011-09-22 09:17:09.000000000 -0400
+++ hivex-1.3.3-new/xml/hivexml.c 2012-02-01 15:51:06.481728986 -0500
@@ -33,6 +33,7 @@
#endif
#include <libxml/xmlwriter.h>
+#include <libxml/chvalid.h>
#include "hivex.h"
@@ -208,6 +209,26 @@ filetime_to_8601 (int64_t windows_ticks)
return ret;
}
+/*
+ * Return 1 if the UTF-8 string holds only valid XML characters, else 0.
+ * Assumes valid UTF-8: a negative xmlGetUTF8Char result ends the scan as valid.
+ */
+static int
+is_valid_xml_string(const char *string)
+{
+ int c;
+ int pos = 0;
+ int len = strlen(string);
+ int charlen = len;
+ while ((c = xmlGetUTF8Char(string+pos, &charlen)) >= 0) {
+ if (xmlIsCharQ(c) == 0)
+ return 0;
+ pos += charlen;
+ charlen = len - pos;
+ }
+ return 1;
+}
+
static int
node_start (hive_h *h, void *writer_v, hive_node_h node, const char *name)
{
@@ -265,6 +286,20 @@ end_value (xmlTextWriterPtr writer)
XML_CHECK (xmlTextWriterEndElement, (writer));
}
+static void
+start_string(xmlTextWriterPtr writer, const char *encoding)
+{
+ XML_CHECK (xmlTextWriterStartElement, (writer, BAD_CAST "string"));
+ if (encoding)
+ XML_CHECK (xmlTextWriterWriteAttribute, (writer, BAD_CAST
"encoding", BAD_CAST encoding));
+}
+
+static void
+end_string(xmlTextWriterPtr writer)
+{
+ XML_CHECK (xmlTextWriterEndElement, (writer));
+}
+
static int
value_string (hive_h *h, void *writer_v, hive_node_h node, hive_value_h value,
hive_type t, size_t len, const char *key, const char *str)
@@ -292,9 +327,14 @@ value_string (hive_h *h, void *writer_v,
type = "unknown";
}
- start_value (writer, key, type, NULL);
+ int validXML = is_valid_xml_string(str);
+ start_value (writer, key, type, validXML ? NULL : "base64");
XML_CHECK (xmlTextWriterStartAttribute, (writer, BAD_CAST "value"));
- XML_CHECK (xmlTextWriterWriteString, (writer, BAD_CAST str));
+ if (validXML)
+ XML_CHECK (xmlTextWriterWriteString, (writer, BAD_CAST str));
+ else
+ XML_CHECK (xmlTextWriterWriteBase64, (writer, str, 0, strlen(str)));
+
XML_CHECK (xmlTextWriterEndAttribute, (writer));
end_value (writer);
return 0;
@@ -310,9 +350,15 @@ value_multiple_strings (hive_h *h, void
size_t i;
for (i = 0; argv[i] != NULL; ++i) {
- XML_CHECK (xmlTextWriterStartElement, (writer, BAD_CAST "string"));
- XML_CHECK (xmlTextWriterWriteString, (writer, BAD_CAST argv[i]));
- XML_CHECK (xmlTextWriterEndElement, (writer));
+ int validXML = is_valid_xml_string(argv[i]);
+ start_string(writer, validXML ? NULL : "base64");
+ XML_CHECK (xmlTextWriterStartAttribute, (writer, BAD_CAST "value"));
+ if (validXML)
+ XML_CHECK (xmlTextWriterWriteString, (writer, BAD_CAST argv[i]));
+ else
+ XML_CHECK (xmlTextWriterWriteBase64, (writer, argv[i], 0,
strlen(argv[i])));
+ XML_CHECK (xmlTextWriterEndAttribute, (writer));
+ end_string(writer);
}
end_value (writer);
[todd@tm-nc hivex]#