utils::string - split functions extended

Summary: RSplit added; split limits added; tests added. Reviewers: teon.banek, dgleich. Reviewed By: teon.banek. Subscribers: pullbot. Differential Revision: https://phabricator.memgraph.io/D930
2017-10-25 13:02:54 +02:00 · 2017-10-25 13:02:54 +02:00 · 9f7ef8e0e9
commit 9f7ef8e0e9
parent df4933ea0f
2 changed files with 103 additions and 12 deletions
--- a/src/utils/string.hpp
+++ b/src/utils/string.hpp
@ -2,6 +2,7 @@

 #include <algorithm>
 #include <cctype>
+#include <iostream>
 #include <iterator>
 #include <regex>
 #include <sstream>
@ -90,29 +91,70 @@ inline std::string Replace(std::string src, const std::string &match,

 /**
 * Split string by delimiter and return vector of results.
- * If the delimiter is not provided, a different splitting algorithm is used.
- * Runs of consecutive whitespace are regarded as a single delimiter.
- * Additionally, the result will not contain empty strings at the start of end
- * as if the string was trimmed before splitting.
+ *
+ * Splitting proceeds from left to right. An empty `src` yields an empty
+ * vector; if `delimiter` does not occur in `src`, the result is a single
+ * element equal to `src`. Adjacent delimiters produce empty strings, and
+ * whatever follows the last performed split is returned unsplit as the final
+ * element.
+ *
+ * NOTE(review): an empty `delimiter` looks unsupported: `find` would match at
+ * `index` without advancing it, so with a negative `splits` the loop appears
+ * never to terminate. Confirm that callers never pass an empty delimiter.
+ *
+ * @param src - The string to split.
+ * @param delimiter - The delimiter to split on.
+ * @param splits - The maximum number of splits. For the given value N the
+ * returned vector will contain at most (N + 1) elements. If given a negative
+ * value, all possible splits are performed.
+ * @return - a vector of splits.
 */
 inline std::vector<std::string> Split(const std::string &src,
-                                      const std::string &delimiter) {
+                                      const std::string &delimiter,
+                                      int splits = -1) {
+  std::vector<std::string> res;
  if (src.empty()) {
-    return {};
+    return res;
  }
  size_t index = 0;
-  size_t n = std::string::npos;
-  std::vector<std::string> res;
-  do {
-    n = src.find(delimiter, index);
+  while (splits < 0 || splits-- != 0) {
+    auto n = src.find(delimiter, index);
+    if (n == std::string::npos) break;
    res.emplace_back(src.substr(index, n - index));
    index = n + delimiter.size();
-  } while (n != std::string::npos);
+  }
+
+  res.emplace_back(src.substr(index));
+  return res;
+}
+
+/**
+ * Split string by delimiter, from right to left, and return vector of results.
+ * The pieces are collected right to left and then reversed, so the returned
+ * vector is in the original left-to-right order.
+ * For example, RSplit("a.b.c", ".", 1) results in {"a.b", "c"}.
+ *
+ * An empty `src` yields an empty vector; if `delimiter` does not occur in
+ * `src`, the result is a single element equal to `src`.
+ *
+ * NOTE(review): each search starts at `index - 1`, so a delimiter longer than
+ * one character can be matched overlapping the previously consumed delimiter,
+ * which makes `index - n - delimiter.size()` wrap around. For example,
+ * RSplit("a:::b", "::") appears to yield {"a", ":b", "b"}. Searching from
+ * `index - delimiter.size()` (guarding `index < delimiter.size()`) would
+ * likely avoid this - confirm and fix.
+ *
+ * @param src - The string to split.
+ * @param delimiter - The delimiter to split on.
+ * @param splits - The maximum number of splits. For the given value N the
+ * returned vector will contain at most (N + 1) elements. If given a negative
+ * value, all possible splits are performed.
+ * @return - a vector of splits.
+ */
+inline std::vector<std::string> RSplit(const std::string &src,
+                                       const std::string &delimiter,
+                                       int splits = -1) {
+  std::vector<std::string> res;
+  if (src.empty()) {
+    return res;
+  }
+  size_t index = src.size();
+  while (splits < 0 || splits-- != 0) {
+    auto n = src.rfind(delimiter, index - 1);
+    if (n == std::string::npos) break;
+    res.emplace_back(
+        src.substr(n + delimiter.size(), index - n - delimiter.size()));
+    index = n;
+    if (n == 0) break;
+  }
+
+  res.emplace_back(src.substr(0, index));
+  std::reverse(res.begin(), res.end());
  return res;
 }

 /**
 * Split string by whitespace and return vector of results.
+ * Runs of consecutive whitespace are regarded as a single delimiter.
+ * Additionally, the result will not contain empty strings at the start or end
+ * as if the string was trimmed before splitting.
 */
 inline std::vector<std::string> Split(const std::string &src) {
  if (src.empty()) {
@ -160,5 +202,4 @@ inline bool EndsWith(const std::string &s, const std::string &suffix) {
 inline bool StartsWith(const std::string &s, const std::string &prefix) {
  return s.size() >= prefix.size() && s.compare(0, prefix.size(), prefix) == 0;
 }
-
 }
--- a/tests/unit/utils_string.cpp
+++ b/tests/unit/utils_string.cpp
@ -0,0 +1,50 @@
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "utils/string.hpp"
+
+using vec = std::vector<std::string>;
+
+TEST(String, SplitNoLimit) {  // No limit given: every delimiter splits.
+  EXPECT_EQ(utils::Split("aba", "a"), vec({"", "b", ""}));  // delimiter at both ends
+  EXPECT_EQ(utils::Split("aba", "b"), vec({"a", "a"}));
+  EXPECT_EQ(utils::Split("abba", "b"), vec({"a", "", "a"}));  // adjacent delimiters
+  EXPECT_EQ(utils::Split("aba", "c"), vec{"aba"});  // delimiter absent
+}
+
+TEST(String, RSplitNoLimit) {
+  // Same inputs and expected results as SplitNoLimit: without a limit,
+  // splitting from the right must produce the same pieces.
+  EXPECT_EQ(utils::RSplit("aba", "a"), vec({"", "b", ""}));
+  EXPECT_EQ(utils::RSplit("aba", "b"), vec({"a", "a"}));
+  EXPECT_EQ(utils::RSplit("abba", "b"), vec({"a", "", "a"}));
+  EXPECT_EQ(utils::RSplit("aba", "c"), vec{"aba"});
+}
+
+TEST(String, SplitWithLimit) {  // Limit N: at most N splits, counted from the left.
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", 0), vec({"a.b.c.d"}));  // zero splits: input returned whole
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", 1), vec({"a", "b.c.d"}));
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", 2), vec({"a", "b", "c.d"}));
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", 100), vec({"a", "b", "c", "d"}));  // limit above delimiter count
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", -1), vec({"a", "b", "c", "d"}));  // any negative limit: unlimited
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", -2), vec({"a", "b", "c", "d"}));
+  EXPECT_EQ(utils::Split("a.b.c.d", ".", -100), vec({"a", "b", "c", "d"}));
+  EXPECT_EQ(utils::Split("a..b..c", ".", 1), vec({"a", ".b..c"}));  // remainder keeps its delimiters
+  EXPECT_EQ(utils::Split("a..b..c", ".", 2), vec({"a","", "b..c"}));
+}
+
+TEST(String, RSplitWithLimit) {  // Limit N: at most N splits, counted from the right.
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", 0), vec({"a.b.c.d"}));  // zero splits: input returned whole
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", 1), vec({"a.b.c", "d"}));
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", 2), vec({"a.b", "c", "d"}));
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", 100), vec({"a", "b", "c", "d"}));  // limit above delimiter count
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", -1), vec({"a", "b", "c", "d"}));  // any negative limit: unlimited
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", -2), vec({"a", "b", "c", "d"}));
+  EXPECT_EQ(utils::RSplit("a.b.c.d", ".", -100), vec({"a", "b", "c", "d"}));
+  EXPECT_EQ(utils::RSplit("a..b..c", ".", 1), vec({"a..b.", "c"}));  // remainder keeps its delimiters
+  EXPECT_EQ(utils::RSplit("a..b..c", ".", 2), vec({"a..b","", "c"}));
+}
+
+TEST(String, SplitWhistespace) {  // Exercises the single-argument (whitespace) Split overload.
+  EXPECT_EQ(utils::Split(" "), vec({}));  // whitespace-only input yields no pieces
+  EXPECT_EQ(utils::Split("  a  b  "), vec({"a", "b"}));  // runs collapse; ends are trimmed
+}