These work like our String.split and String.nsplit functions.
---
common/mlpcre/PCRE.ml | 33 +++++++++++++++++++++++++++++++++
common/mlpcre/PCRE.mli | 19 +++++++++++++++++++
common/mlpcre/pcre_tests.ml | 29 +++++++++++++++++++++++++++++
3 files changed, 81 insertions(+)
diff --git a/common/mlpcre/PCRE.ml b/common/mlpcre/PCRE.ml
index 753e247e4..b054928f9 100644
--- a/common/mlpcre/PCRE.ml
+++ b/common/mlpcre/PCRE.ml
@@ -52,5 +52,38 @@ let rec replace ?(global = false) patt subst subj =
xs ^ subst ^ zs
)
+(* Split [subj] at the first match of [patt], dropping the matched text;
+ * if [patt] does not match, return [(subj, "")].  [subi 0] gives the
+ * offsets of the matched text "yyyy" found by [matches]:
+ *   "xxxxyyyyzzzzzzzzzzzzz"
+ *        ^   ^
+ *        i1  i2
+ *)
+let rec split patt subj =
+  if not (matches patt subj) then
+    subj, ""
+  else (
+    let i1, i2 = subi 0 in
+    let xs = String.sub subj 0 i1 (* "xxxx", part before the match *) in
+    let zs = String.sub subj i2 (String.length subj - i2) (* after *) in
+    xs, zs
+  )
+
+(* Split [subj] on every match of [patt], returning at most [max] pieces
+ * ([max = 0], the default, means no limit); the last piece is the rest.
+ * @raise Invalid_argument if [max] is negative.
+ *)
+and nsplit ?(max = 0) patt subj =
+  if max < 0 then
+    invalid_arg "PCRE.nsplit: max parameter should not be negative";
+  (* Limit reached, or no match at all: the rest is a single element. *)
+  if max = 1 || not (matches patt subj) then [subj]
+  else (
+    let s1, s2 = split patt subj in
+    (* No progress (empty match at offset 0): stop instead of looping. *)
+    if String.length s2 = String.length subj then [subj]
+    else s1 :: nsplit ~max:(if max = 0 then 0 else max - 1) patt s2
+  )
+
let () =
Callback.register_exception "PCRE.Error" (Error ("", 0))
diff --git a/common/mlpcre/PCRE.mli b/common/mlpcre/PCRE.mli
index fcf6fd25e..eacb6fd90 100644
--- a/common/mlpcre/PCRE.mli
+++ b/common/mlpcre/PCRE.mli
@@ -110,3 +110,22 @@ val replace : ?global:bool -> regexp -> string -> string
-> string
Note that this function does not allow backreferences.
Any captures in [patt] are ignored. *)
+
+val split : regexp -> string -> string * string
+val nsplit : ?max:int -> regexp -> string -> string list
+(** [split patt subj] splits the string at the first occurrence
+    of the regular expression [patt], returning the parts of the
+    string before and after the match (the matching part is not
+    returned).  If the pattern does not match then the whole
+    input is returned in the first string, and the second string
+    is empty.
+
+    [nsplit patt subj] is the same but the string is split on
+    every occurrence of [patt].  If the pattern matches at the
+    beginning or end of the string, an empty string element is
+    returned at the beginning or end of the list.
+
+    [nsplit] has an optional [?max] parameter which limits the
+    length of the returned list; the final element then holds the
+    unsplit remainder.  The default, [0], means no limit.
+    @raise Invalid_argument if [max] is negative. *)
diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml
index 9d42914b9..346019c40 100644
--- a/common/mlpcre/pcre_tests.ml
+++ b/common/mlpcre/pcre_tests.ml
@@ -42,6 +42,20 @@ let replace ?(global = false) patt subst subj =
eprintf " %s\n%!" r;
r
+let split patt subj = (* traced PCRE.split: logs call and result *)
+  eprintf "PCRE.split <patt> %s ->%!" subj;
+  let (before, after) as pair = PCRE.split patt subj in
+  eprintf " (%s, %s)\n%!" before after;
+  pair
+
+let nsplit ?(max = 0) patt subj = (* traced PCRE.nsplit: logs call/result *)
+  let max_label =
+    match max with 0 -> "" | n -> sprintf " ~max:%d" n in
+  eprintf "PCRE.nsplit%s <patt> %s ->%!" max_label subj;
+  let pieces = PCRE.nsplit ~max patt subj in
+  eprintf " [%s]\n%!" (String.concat "; " pieces);
+  pieces
+
let sub i =
eprintf "PCRE.sub %d ->%!" i;
let r = PCRE.sub i in
@@ -60,6 +74,7 @@ let () =
let re1 = compile "(a+)b" in
let re2 = compile "(a+)(b*)" in
let re3 = compile ~caseless:true "[^a-z0-9_]" in
+ let ws = compile "\\s+" in
assert (matches re0 "ccaaabbbb" = true);
assert (sub 0 = "aaab");
@@ -101,6 +116,20 @@ let () =
assert (replace ~global:true re3 "-" "this is
a\xc2\xa3FUNNY.name?"
(* = "this-is-a-FUNNY-name-" if UTF-8 worked *)
= "this-is-a--FUNNY-name-");
+
+ (* This also tests PCRE.split since that is used by nsplit. *)
+ assert (nsplit ~max:1 ws "a b c" = [ "a b c" ]);
+ assert (nsplit ~max:2 ws "a b c" = [ "a"; "b c" ]);
+ assert (nsplit ~max:3 ws "a b c" = [ "a"; "b";
"c" ]);
+ assert (nsplit ~max:10 ws "a b c" = [ "a"; "b";
"c" ]);
+ assert (nsplit ws "the cat sat on \t\t the mat." =
+ [ "the"; "cat"; "sat"; "on";
"the"; "mat." ]);
+ assert (nsplit ~max:5 ws "the cat sat on \t\t the mat." =
+ [ "the"; "cat"; "sat"; "on";
"the mat." ]);
+ assert (nsplit ws " the " = [ ""; "the"; ""
]);
+ assert (nsplit ws "the " = [ "the"; "" ]);
+ assert (nsplit ws " the" = [ ""; "the" ]);
+ assert (nsplit ws " \t the" = [ ""; "the" ]);
with
| Not_found ->
failwith "one of the PCRE.sub functions unexpectedly raised Not_found"
--
2.13.2