Skiplist::PositionAndCount refactor and test

Summary:
- refactored so `less` is used instead of `greater`
- added a fuzzy unit test

Reviewers: mislav.bradac, buda, teon.banek

Reviewed By: teon.banek

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D519
2017-07-05 12:01:41 +02:00 · 2017-07-05 12:01:41 +02:00 · feb854d0c7
commit feb854d0c7
parent e7111b286a
3 changed files with 122 additions and 40 deletions
--- a/src/data_structures/concurrent/skiplist.hpp
+++ b/src/data_structures/concurrent/skiplist.hpp
@ -530,7 +530,7 @@ class SkipList : private Lockable<lock_t> {
    /**
     * Position and count estimation. Gives estimates
     * on the position of the given item in this skiplist, and
-     * the number of identical items according to 'greater'.
+     * the number of identical items according to 'less'.
     *
     * If `item` is not contained in the skiplist,
     * then the position where it would be inserted is returned
@ -543,9 +543,9 @@ class SkipList : private Lockable<lock_t> {
     * TODO: tune the levels once benchmarks are available.
     *
     * @param item The item for which the position is estimated.
-     * @param greater Comparison function. It must be partially
+     * @param less Comparison function. It must be partially
     *  consistent with natural comparison of Skiplist elements:
-     *  if `greater` indicates that X is greater then
+     *  if `less` indicates that X is less than
     *  Y, then natural comparison must indicate the same. The
     *  reverse does not have to hold.
     * @param position_level_reduction - Defines at which level
@ -553,15 +553,15 @@ class SkipList : private Lockable<lock_t> {
     *  as log2(skiplist->size()) - position_level_reduction.
     * @param count_max_level - Defines the max level at which
     *  item count is estimated.
-     * @tparam TGreater Type of `greater`
+     * @tparam TLess Type of `less`
     * @return A pair of ints where the first element is the estimated
     *  position of item, and the second is the estimated number
-     *  of items that are the same according to `greater`.
+     *  of items that are the same according to `less`.
     */
-    template <typename TItem, typename TGreater = std::greater<T>>
-    auto position_and_count(const TItem &item, TGreater greater = TGreater{},
-                            int position_level_reduction = 10,
-                            int count_max_level = 3) {
+    template <typename TItem, typename TLess = std::less<T>>
+    std::pair<size_t, size_t> position_and_count(
+        const TItem &item, TLess less = TLess{},
+        int position_level_reduction = 10, int count_max_level = 3) {
      // the level at which position will be sought
      int position_level = std::max(
          0, static_cast<int>(std::lround(std::log2(skiplist->size()))) -
@ -576,8 +576,12 @@ class SkipList : private Lockable<lock_t> {
        // used for calculating item position
        int tower_count = 0;

+        // on the current height (i) find the last tower
+        // whose value is less than item, store it in pred
+        // while succ will be either skiplist end or the
+        // first element greater than or equal to item
        succ = pred->forward(i);
-        while (succ && greater(item, succ->value())) {
+        while (succ && less(succ->value(), item)) {
          pred = succ;
          succ = succ->forward(i);
          tower_count++;
@ -585,22 +589,17 @@ class SkipList : private Lockable<lock_t> {

        // in the succs field we'll keep track of successors
        // that are equal to item, or nullptr otherwise
-        succs[i] = (!succ || greater(succ->value(), item)) ? nullptr : succ;
+        succs[i] = (!succ || less(item, succ->value())) ? nullptr : succ;

        position += (1 << i) * tower_count;
      }

-      // if succ is nullptr, we have the last skiplist element
-      if (succ == nullptr) {
-        // pred now contains the first node whose value <= item
-        // check if we found the item exactly (value == item)
-        bool found = pred != skiplist->header && !greater(item, pred->value());
-        return std::make_pair(position, found ? 1 : 0);
-      }
+      // if succ is nullptr, then item is greater than all elements in the list
+      if (succ == nullptr) return std::make_pair(size(), 0);

      // now we need to estimate the count of elements equal to item
      // we'll do that by looking for the first element that is greater
-      // then item, and counting how far we have to look
+      // than item, and counting how far we have to look

      // first find the rightmost (highest) succ that has value == item
      int count_level = 0;
@ -617,7 +616,7 @@ class SkipList : private Lockable<lock_t> {
      int count = 1 << count_level;
      for (; count_level >= 0; count_level--) {
        Node *next = succ->forward(count_level);
-        while (next && !greater(next->value(), item)) {
+        while (next && !less(item, next->value())) {
          succ = next;
          next = next->forward(count_level);
          count += 1 << count_level;
--- a/tests/manual/sl_position_and_count.cpp
+++ b/tests/manual/sl_position_and_count.cpp
@ -39,20 +39,20 @@ std::unique_ptr<SkipList<int>> make_sl(int size) {
 * @param size - size of the skiplist to test with
 * @param iterations - number of iterations of each test.
 * @param granulation - How many sequential ints should be
- *  considered equal in testing by the custom `greater`
+ *  considered equal in testing by the custom `less`
 *  function.
 */
 void test(int size, int iterations = 20, int granulation = 1) {
-  auto greater = [granulation](const int &a, const int &b) {
-    return a / granulation > b / granulation;
+  auto less = [granulation](const int &a, const int &b) {
+    return a / granulation < b / granulation;
  };
  log("\nTesting skiplist size {} with granulation {}", size, granulation);

  // test at 1/4, 1/2 and 3/4 points
-  std::vector<int> positions({size / 4, size / 2, size * 3 / 4});
+  std::vector<int> test_positions({size / 4, size / 2, size * 3 / 4});

-  std::vector<std::vector<int>> less(3);
-  std::vector<std::vector<int>> equal(3);
+  std::vector<std::vector<int>> position(3);
+  std::vector<std::vector<int>> count(3);
  std::vector<std::vector<double>> time(3);
  for (int iteration = 0; iteration < iterations; iteration++) {
    auto sl = make_sl(size);
@ -60,26 +60,26 @@ void test(int size, int iterations = 20, int granulation = 1) {
    for (auto pos : {0, 1, 2}) {
      clock_t start_time = clock();
      auto pos_and_count =
-          sl->access().position_and_count(positions[pos], greater);
+          sl->access().position_and_count(test_positions[pos], less);
      auto t = double(clock() - start_time) / CLOCKS_PER_SEC;

-      less[pos].push_back(pos_and_count.first);
-      equal[pos].push_back(pos_and_count.second);
+      position[pos].push_back(pos_and_count.first);
+      count[pos].push_back(pos_and_count.second);
      time[pos].push_back(t);
    }
  }

  // convert values to errors
-  for (auto pos : {0, 1, 2}) {
-    auto position = positions[pos];
-    log("\tPosition {}", position);
-    for (auto &less_elem : less[pos])
-      less_elem = std::abs(less_elem - position);
-    log("\t\tMean position error: {}", mean(less[pos]));
-    for (auto &equal_elem : equal[pos])
-      equal_elem = std::abs(equal_elem - granulation);
-    log("\t\tMean count error: {}", mean(equal[pos]));
-    log("\t\tMean time (ms): {}", mean(time[pos]) * 1000);
+  for (auto pos_index : {0, 1, 2}) {
+    auto test_position = test_positions[pos_index];
+    log("\tPosition {}", test_position);
+    for (auto &position_elem : position[pos_index])
+      position_elem = std::abs(position_elem - test_position);
+    log("\t\tMean position error: {}", mean(position[pos_index]));
+    for (auto &count_elem : count[pos_index])
+      count_elem = std::abs(count_elem - granulation);
+    log("\t\tMean count error: {}", mean(count[pos_index]));
+    log("\t\tMean time (ms): {}", mean(time[pos_index]) * 1000);
  }
 }

@ -92,7 +92,7 @@ int main(int argc, char *argv[]) {
  if (argc > 2) iterations = (int)std::stoi(argv[2]);

  std::vector<int> granulations;
-  for (int i = 1 ; i < size ; i *= 100) granulations.push_back(i);
+  for (int i = 1; i < size; i *= 100) granulations.push_back(i);
  for (auto granulation : granulations) test(size, iterations, granulation);

  return 0;
--- a/tests/unit/skiplist_position_and_count.cpp
+++ b/tests/unit/skiplist_position_and_count.cpp
@ -0,0 +1,83 @@
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "data_structures/concurrent/skiplist.hpp"
+#include "utils/assert.hpp"
+
+/* The following tests validate the SkipList::position_and_count estimation
+ * functionality. That function has a tunable speed vs. accuracy trade-off.
+ * The tests here cover the absolutely-accurate parameterization, as well as
+ * the default one, which is meant to be the optimal parameterization. As
+ * such, the default-parameterization tests are stochastic and defined to
+ * validate generally acceptable behavior in a vast majority of cases. The
+ * probability of test failure due to stochasticity should be extremely
+ * small, but isn't zero. */
+
+auto SkiplistRange(int count) {  // builds a SkipList<int> holding 0..count-1
+  auto sl = std::make_unique<SkipList<int>>();
+  auto access = sl->access();
+  for (int i = 0; i < count; i++) access.insert(i);
+  return sl;  // handed back as std::unique_ptr<SkipList<int>>
+}
+
+auto Median(std::vector<int> &elements) {  // median of a non-empty vector
+  auto elem_size = elements.size();
+  debug_assert(elem_size > 0, "Provide some elements to get median!");
+  std::sort(elements.begin(), elements.end());  // NOTE: sorts caller's vector
+  if (elem_size % 2)  // odd size: the single middle element
+    return elements[elem_size / 2];
+  else  // even size: integer (truncating) mean of the two middle elements
+    return (elements[elem_size / 2 - 1] + elements[elem_size / 2]) / 2;
+}
+
+auto Less(int granularity) {  // comparator on ints bucketed by granularity
+  return [granularity](const int &a, const int &b) {
+    return a / granularity < b / granularity;  // same quotient => not less
+  };
+}
+
+#define EXPECT_ABS_POS_COUNT(granularity, position, expected_position,         \
+                             expected_count)                                   \
+  {                                                                            \
+    auto sl = SkiplistRange(10000);                                            \
+    auto position_and_count =                                                  \
+        sl->access().position_and_count(position, Less(granularity), 1000, 0); \
+    EXPECT_EQ(position_and_count.first, expected_position);                    \
+    EXPECT_EQ(position_and_count.second, expected_count);                      \
+  }  // exact-match check on a fresh 10000-element list (args 1000, 0 above)
+
+TEST(SkiplistPosAndCount, AbsoluteAccuracy) {
+  EXPECT_ABS_POS_COUNT(1, 42, 42, 1);    // granularity 1: only 42 matches
+  EXPECT_ABS_POS_COUNT(3, 42, 42, 3);    // 42, 43, 44 share quotient 14
+  EXPECT_ABS_POS_COUNT(10, 42, 40, 10);  // 40..49 share quotient 4
+}
+
+#define EXPECT_POS_COUNT(skiplist_size, position, expected_count,            \
+                         position_error_margin, count_error_margin)          \
+  {                                                                          \
+    std::vector<int> pos_errors;                                             \
+    std::vector<int> count_errors;                                           \
+                                                                             \
+    for (int i = 0; i < 30; i++) {                                           \
+      auto sl = SkiplistRange(skiplist_size);                                \
+      auto position_count = sl->access().position_and_count(position);       \
+      pos_errors.push_back(std::abs((long)position_count.first - position)); \
+      count_errors.push_back(                                                \
+          std::abs((long)position_count.second - expected_count));           \
+    }                                                                        \
+    EXPECT_LE(Median(pos_errors), position_error_margin);                    \
+    EXPECT_LE(Median(count_errors), count_error_margin);                     \
+  }  // 30 fresh lists per check; asserts on the median absolute errors
+
+TEST(SkiplistPosAndCount, DefaultSpeedAndAccuracy) {
+  EXPECT_POS_COUNT(5000, 42, 1, 20, 3);  // size, item, count, pos/count margin
+  EXPECT_POS_COUNT(5000, 2500, 1, 100, 3);
+  EXPECT_POS_COUNT(5000, 4500, 1, 200, 3);
+
+  // for an item greater than all list elements the returned
+  // estimations are always absolutely accurate
+  EXPECT_POS_COUNT(5000, 5000, 0, 0, 0);
+}