Add implementation of average number of equals estimate in SkipList (#9)

* Implement average number of equals estimate in SkipList
2021-02-10 14:38:54 +01:00 · 2021-02-10 14:38:54 +01:00 · fae407d3fe
commit fae407d3fe
parent 42c245df8a
4 changed files with 369 additions and 27 deletions
--- a/src/storage/v2/indices.cpp
+++ b/src/storage/v2/indices.cpp
@ -671,30 +671,6 @@ LabelPropertyIndex::Iterable::Iterator LabelPropertyIndex::Iterable::end() {
  return Iterator(this, index_accessor_.end());
 }

-// A helper function for determining the skip list layer used for estimating the
-// number of elements in the label property index. The lower layer we use, the
-// better approximation we get (if we use the lowest layer, we get the exact
-// numbers). However, lower skip list layers contain more elements so we must
-// iterate through more items to get the estimate.
-//
-// Our goal is to achieve balance between execution time and approximation
-// precision. The expected number of elements at the k-th skip list layer is N *
-// (1/2)^(k-1), where N is the skip-list size. We choose to iterate through no
-// more than sqrt(N) items for large N when calculating the estimate, so we need
-// to choose the skip-list layer such that N * (1/2)^(k-1) <= sqrt(N). That is
-// equivalent to k >= 1 + 1/2 * log2(N), so we choose k to be 1 + ceil(log2(N) /
-// 2).
-//
-// For N small enough (arbitrarily chosen to be 500), we will just use the
-// lowest layer to get the exact numbers. Mostly because this makes writing
-// tests easier.
-namespace {
-uint64_t SkipListLayerForEstimation(uint64_t N) {
-  if (N <= 500) return 1;
-  return std::min(1 + (utils::Log2(N) + 1) / 2, utils::kSkipListMaxHeight);
-}
-}  // namespace
-
 int64_t LabelPropertyIndex::ApproximateVertexCount(
    LabelId label, PropertyId property, const PropertyValue &value) const {
  auto it = index_.find({label, property});
@ -702,7 +678,20 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(
            "Index for label {} and property {} doesn't exist", label.AsUint(),
            property.AsUint());
  auto acc = it->second.access();
-  return acc.estimate_count(value, SkipListLayerForEstimation(acc.size()));
+  if (!value.IsNull()) {
+    return acc.estimate_count(
+        value, utils::SkipListLayerForCountEstimation(acc.size()));
+  } else {
+    // The value `Null` won't ever appear in the index because it indicates that
+    // the property shouldn't exist. Instead, this value is used as an indicator
+    // to estimate the average number of equal elements in the list (for any
+    // given value).
+    return acc.estimate_average_number_of_equals(
+        [](const auto &first, const auto &second) {
+          return first.value == second.value;
+        },
+        utils::SkipListLayerForAverageEqualsEstimation(acc.size()));
+  }
 }

 int64_t LabelPropertyIndex::ApproximateVertexCount(
@ -714,8 +703,8 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(
            "Index for label {} and property {} doesn't exist", label.AsUint(),
            property.AsUint());
  auto acc = it->second.access();
-  return acc.estimate_range_count(lower, upper,
-                                  SkipListLayerForEstimation(acc.size()));
+  return acc.estimate_range_count(
+      lower, upper, utils::SkipListLayerForCountEstimation(acc.size()));
 }

 void RemoveObsoleteEntries(Indices *indices,
--- a/src/storage/v2/indices.hpp
+++ b/src/storage/v2/indices.hpp
@ -244,6 +244,10 @@ class LabelPropertyIndex {
    return it->second.size();
  }

+  /// Supplying a specific value into the count estimation function will return
+  /// an estimated count of nodes which have their property's value set to
+  /// `value`. If the `value` specified is `Null`, then an average number of
+  /// equal elements is returned.
  int64_t ApproximateVertexCount(LabelId label, PropertyId property,
                                 const PropertyValue &value) const;

--- a/src/utils/skip_list.hpp
+++ b/src/utils/skip_list.hpp
@ -1,6 +1,7 @@
 #pragma once

 #include <atomic>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <limits>
@ -104,6 +105,38 @@ size_t SkipListNodeSize(const SkipListNode<TObj> &node) {
  return sizeof(node) + node.height * sizeof(std::atomic<SkipListNode<TObj> *>);
 }

+/// Picks the skip list layer on which element-count estimates (e.g. for a
+/// database index) should be computed.
+///
+/// Lower layers give a better approximation (the lowest layer gives exact
+/// numbers), but they contain more elements, so walking them takes longer.
+/// The expected number of elements on the k-th layer is N * (1/2)^(k-1),
+/// where N is the skip list size. To visit no more than sqrt(N) items for
+/// large N we need N * (1/2)^(k-1) <= sqrt(N), which is equivalent to
+/// k >= 1 + log2(N) / 2, so k is chosen to be 1 + ceil(log2(N) / 2). The
+/// result never exceeds `kSkipListMaxHeight`.
+///
+/// Lists with at most 500 elements (an arbitrarily chosen threshold) always
+/// use the lowest layer so that the numbers are exact, mostly because that
+/// makes writing tests easier.
+///
+/// @param N number of elements in the skip list
+/// @return the layer (counted from 1) to use for the estimation
+constexpr uint64_t SkipListLayerForCountEstimation(const uint64_t N) {
+  constexpr uint64_t kExactEstimateThreshold = 500;
+  if (N > kExactEstimateThreshold) {
+    return std::min(1 + (utils::Log2(N) + 1) / 2, utils::kSkipListMaxHeight);
+  }
+  // Small lists are cheap to walk completely, so use the exact bottom layer.
+  return 1;
+}
+
+/// Picks the skip list layer on which the `average_number_of_equals` estimate
+/// should be computed.
+///
+/// The intent is the same as for the layer selection used by the `*count`
+/// estimates, except that slightly higher layers are chosen because the
+/// `average_number_of_equals` estimate has a larger time complexity. Lists
+/// with at most 500 elements use the lowest layer; larger lists use
+/// 2 + (2 * log2(N)) / 3, capped at `kSkipListMaxHeight`.
+///
+/// @param N number of elements in the skip list
+/// @return the layer (counted from 1) to use for the estimation
+constexpr uint64_t SkipListLayerForAverageEqualsEstimation(const uint64_t N) {
+  if (N <= 500) {
+    return 1;
+  } else {
+    return std::min(1 + ((utils::Log2(N) * 2) / 3 + 1),
+                    utils::kSkipListMaxHeight);
+  }
+}
+
 /// The skip list doesn't have built-in reclamation of removed nodes (objects).
 /// This class handles all operations necessary to remove the nodes safely.
 ///
@ -708,6 +741,26 @@ class SkipList final {
                                                      max_layer_for_estimation);
    }

+    /// Estimates the average number of objects in the list that have the same
+    /// value according to the provided equality callable. E.g., if the objects
+    /// are: 1, 2, 2, 3, 3, 3; the average number of equals is 2.
+    ///
+    /// The items in the list must already be ordered by the field that is used
+    /// in the equality callable. The default layer is chosen to optimize
+    /// duration vs. precision. The lower the layer used for estimation the
+    /// higher the duration of the operation. If you set the maximum layer for
+    /// estimation to 1 you will get the exact average (truncated to an
+    /// integer).
+    ///
+    /// @param equal_cmp callable invoked with two neighbouring objects; must
+    ///                  return true when they are considered equal
+    /// @param max_layer_for_estimation layer (counted from 1) from which the
+    ///                                 estimation starts
+    /// @return uint64_t estimated average number of equal items in the list
+    template <typename TCallable>
+    uint64_t estimate_average_number_of_equals(
+        const TCallable &equal_cmp,
+        int max_layer_for_estimation =
+            kSkipListCountEstimateDefaultLayer) const {
+      // The template argument is deduced, so no `template` disambiguator is
+      // used here: the keyword must be followed by a template argument list.
+      return skiplist_->estimate_average_number_of_equals(
+          equal_cmp, max_layer_for_estimation);
+    }
+
    /// Removes the key from the list.
    ///
    /// @return bool indicating whether the removal was successful
@ -795,6 +848,15 @@ class SkipList final {
                                                      max_layer_for_estimation);
    }

+    /// Estimates the average number of objects in the list that have the same
+    /// value according to `equal_cmp`. The items must already be ordered by
+    /// the field used in `equal_cmp`. Using layer 1 yields the exact average
+    /// (truncated to an integer); higher layers trade precision for speed.
+    ///
+    /// @param equal_cmp callable invoked with two neighbouring objects; must
+    ///                  return true when they are considered equal
+    /// @param max_layer_for_estimation layer (counted from 1) from which the
+    ///                                 estimation starts
+    /// @return uint64_t estimated average number of equal items in the list
+    template <typename TCallable>
+    uint64_t estimate_average_number_of_equals(
+        const TCallable &equal_cmp,
+        int max_layer_for_estimation =
+            kSkipListCountEstimateDefaultLayer) const {
+      // The template argument is deduced, so no `template` disambiguator is
+      // used here: the keyword must be followed by a template argument list.
+      return skiplist_->estimate_average_number_of_equals(
+          equal_cmp, max_layer_for_estimation);
+    }
+
    uint64_t size() const { return skiplist_->size(); }

   private:
@ -1109,6 +1171,83 @@ class SkipList final {
    return count;
  }

+  /// Estimates the average length of a run of equal objects in the list.
+  ///
+  /// Sampling starts on the highest non-empty layer at or below
+  /// `max_layer_for_estimation`. From every sampled node the bottom layer is
+  /// walked while `equal_cmp` reports neighbouring objects as equal (or until
+  /// the per-run traversal limit is reached); then the walk climbs back to
+  /// the sampling layer. The result is the number of bottom-layer nodes
+  /// walked divided by the number of sampled runs (integer division).
+  ///
+  /// @param equal_cmp callable invoked as `equal_cmp(previous, current)` on
+  ///                  neighbouring objects; returns true when they are equal
+  /// @param max_layer_for_estimation layer counted from 1; must lie in
+  ///                                 [1, kSkipListMaxHeight]
+  /// @return estimated average number of equal objects, 0 for an empty list
+  template <typename TCallable>
+  uint64_t estimate_average_number_of_equals(
+      const TCallable &equal_cmp, int max_layer_for_estimation) const {
+    MG_ASSERT(max_layer_for_estimation >= 1 &&
+                  max_layer_for_estimation <= kSkipListMaxHeight,
+              "Invalid layer for SkipList count estimation!");
+
+    // We need to traverse some nodes to make the calculation correct, so find
+    // the first layer that has some nodes, starting from the hinted layer.
+    TNode *curr = nullptr;
+    int layer = max_layer_for_estimation;
+    while (curr == nullptr && layer > 0) {
+      layer -= 1;
+      curr = head_->nexts[layer].load(std::memory_order_acquire);
+    }
+    if (curr == nullptr) {
+      // There are no elements in the list.
+      return 0;
+    }
+
+    // Traverse the chain of nodes and count how many of them are unique and how
+    // many have been visited in total. The traversal is initiated from the
+    // determined layer. Then, equality is checked using layer 1 and a return to
+    // the desired layer is performed. The traversal over layers looks as
+    // follows ("+" are counted nodes, "*" are visited nodes):
+    //
+    //  10: *              *--*     *-- ...
+    //   9: |              |  |     |
+    //   8: |           *--*  |     |
+    //   7: |           |     |     |
+    //   6: |           |     |  *--*
+    //   5: |     *--*--*     |  |
+    //   4: |     |           |  |
+    //   3: |     |           |  |
+    //   2: |     |           |  |
+    //   1: +--+--+           +--+
+    uint64_t unique_count = 0;
+    uint64_t nodes_traversed = 0;
+    // Read the size once so that both limits are derived from the same value.
+    uint64_t traversal_limit = size_.load(std::memory_order_acquire);
+    if (layer != 0) {
+      // If the layer isn't 0 we don't want to traverse all of the equal items
+      // because the whole list can be the same item. That is why we limit the
+      // traversal to at most `sqrt(list_size)` items which is a good balance
+      // between general correctness and time complexity.
+      traversal_limit = static_cast<uint64_t>(std::sqrt(traversal_limit));
+    }
+    if (traversal_limit == 0) {
+      // The size counter is read separately from the node pointers, so it may
+      // be observed as 0 even though `curr` is a valid node. With a limit of 0
+      // neither loop below would ever advance `curr` and the traversal would
+      // spin forever, so always allow at least one step per run.
+      traversal_limit = 1;
+    }
+    while (curr != nullptr) {
+      // First, traverse the bottom layer to count the items.
+      ++unique_count;
+      TNode *pred = nullptr;
+      uint64_t current_traversed = 0;
+      while (curr != nullptr && current_traversed < traversal_limit) {
+        if (pred) {
+          const auto &pred_obj = pred->obj;
+          const auto &curr_obj = curr->obj;
+          if (!equal_cmp(pred_obj, curr_obj)) {
+            break;
+          }
+        }
+        pred = curr;
+        curr = pred->nexts[0].load(std::memory_order_acquire);
+        ++current_traversed;
+      }
+      nodes_traversed += current_traversed;
+      // Second, find a node that has the necessary height to return to the
+      // desired layer.
+      while (curr != nullptr && curr->height - 1 < layer) {
+        curr = curr->nexts[curr->height - 1].load(std::memory_order_acquire);
+      }
+    }
+
+    // At least one run was sampled because the list was found to be non-empty.
+    MG_ASSERT(unique_count > 0);
+
+    return nodes_traversed / unique_count;
+  }
+
  bool ok_to_delete(TNode *candidate, int layer_found) {
    // The paper has an incorrect check here. It expects the `layer_found`
    // variable to be 1-indexed, but in fact it is 0-indexed.
--- a/tests/unit/skip_list.cpp
+++ b/tests/unit/skip_list.cpp
@ -3,6 +3,7 @@
 #include <fmt/format.h>
 #include <gtest/gtest.h>

+#include "utils/math.hpp"
 #include "utils/skip_list.hpp"
 #include "utils/timer.hpp"

@ -662,3 +663,212 @@ TEST(SkipList, EstimateRangeCount) {
    ASSERT_EQ(count, kMaxElements * kElementMembers);
  }
 }
+
+/// Prints the list size and the layer that
+/// `utils::SkipListLayerForAverageEqualsEstimation` selects for it, and then,
+/// for every layer from 1 to `utils::kSkipListMaxHeight`, the estimated
+/// average number of equals together with the time the estimation took.
+template <typename TElem, typename TCmp>
+void BenchmarkEstimateAverageNumberOfEquals(utils::SkipList<TElem> *list,
+                                            const TCmp &cmp) {
+  const auto list_size = list->size();
+  std::cout << "List size: " << list_size << std::endl;
+  std::cout << "The index will use layer "
+            << utils::SkipListLayerForAverageEqualsEstimation(list_size)
+            << std::endl;
+
+  auto accessor = list->access();
+  int layer = 1;
+  while (layer <= utils::kSkipListMaxHeight) {
+    utils::Timer stopwatch;
+    auto estimate = accessor.estimate_average_number_of_equals(cmp, layer);
+    auto elapsed = stopwatch.Elapsed().count();
+    std::cout << "Estimate on layer " << layer << " is " << estimate << " in "
+              << elapsed << std::endl;
+    ++layer;
+  }
+}
+
+TEST(SkipList, EstimateAverageNumberOfEquals1) {
+  utils::SkipList<Counter> list;
+
+  // Key `k` is inserted `k` times for every k in 1..kMaxElements, which gives
+  // ~500k elements (an expected maximum height of 19).
+  const int kMaxElements = 1000;
+
+  // Two items belong to the same group when their keys match.
+  const auto same_key = [](const auto &a, const auto &b) {
+    return a.key == b.key;
+  };
+
+  // Fill the list so that equal keys end up next to each other.
+  {
+    auto acc = list.access();
+    for (int64_t key = 1; key <= kMaxElements; ++key) {
+      for (int64_t value = 1; value <= key; ++value) {
+        auto inserted = acc.insert({key, value});
+        ASSERT_NE(inserted.first, acc.end());
+        ASSERT_EQ(inserted.first->key, key);
+        ASSERT_EQ(inserted.first->value, value);
+        ASSERT_TRUE(inserted.second);
+      }
+    }
+  }
+
+  // There are `kMaxElements * (kMaxElements + 1) / 2` members in the list.
+  ASSERT_EQ(list.size(), kMaxElements * (kMaxElements + 1) / 2);
+
+  // Benchmark the estimation function.
+  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);
+
+  // Verify that the estimate on the lowest layer is correct.
+  {
+    auto acc = list.access();
+    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
+    // The equality callable sees `kMaxElements` distinct groups, so the exact
+    // average is the number of elements divided by `kMaxElements`.
+    ASSERT_EQ(list.size(), kMaxElements * (kMaxElements + 1) / 2);
+    ASSERT_EQ(average, (kMaxElements + 1) / 2);
+  }
+}
+
+TEST(SkipList, EstimateAverageNumberOfEquals2) {
+  utils::SkipList<Counter> list;
+
+  // 100k elements will yield an expected maximum height of 17.
+  const int kMaxElements = 100000;
+  const int kElementMembers = 1;
+
+  // Two items belong to the same group when their keys match.
+  const auto same_key = [](const auto &a, const auto &b) {
+    return a.key == b.key;
+  };
+
+  // Insert `kMaxElements` groups, each made of `kElementMembers` items that
+  // share a key.
+  {
+    auto acc = list.access();
+    for (int64_t key = 0; key < kMaxElements; ++key) {
+      for (int64_t member = 0; member < kElementMembers; ++member) {
+        auto inserted = acc.insert({key, member});
+        ASSERT_NE(inserted.first, acc.end());
+        ASSERT_EQ(inserted.first->key, key);
+        ASSERT_EQ(inserted.first->value, member);
+        ASSERT_TRUE(inserted.second);
+      }
+    }
+  }
+
+  // There are `kMaxElements * kElementMembers` members in the list.
+  ASSERT_EQ(list.size(), kMaxElements * kElementMembers);
+
+  // Benchmark the estimation function.
+  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);
+
+  // The lowest layer must give the exact group size.
+  {
+    auto acc = list.access();
+    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
+    ASSERT_EQ(average, kElementMembers);
+  }
+}
+
+TEST(SkipList, EstimateAverageNumberOfEquals3) {
+  utils::SkipList<Counter> list;
+
+  // 100 groups of 1000 items: 100k elements in total, which will yield an
+  // expected maximum height of 17.
+  const int kMaxElements = 100;
+  const int kElementMembers = 1000;
+
+  // Two items belong to the same group when their keys match.
+  const auto same_key = [](const auto &a, const auto &b) {
+    return a.key == b.key;
+  };
+
+  // Insert `kMaxElements` groups, each made of `kElementMembers` items that
+  // share a key.
+  {
+    auto acc = list.access();
+    for (int64_t key = 0; key < kMaxElements; ++key) {
+      for (int64_t member = 0; member < kElementMembers; ++member) {
+        auto inserted = acc.insert({key, member});
+        ASSERT_NE(inserted.first, acc.end());
+        ASSERT_EQ(inserted.first->key, key);
+        ASSERT_EQ(inserted.first->value, member);
+        ASSERT_TRUE(inserted.second);
+      }
+    }
+  }
+
+  // There are `kMaxElements * kElementMembers` members in the list.
+  ASSERT_EQ(list.size(), kMaxElements * kElementMembers);
+
+  // Benchmark the estimation function.
+  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);
+
+  // The lowest layer must give the exact group size.
+  {
+    auto acc = list.access();
+    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
+    ASSERT_EQ(average, kElementMembers);
+  }
+}
+
+TEST(SkipList, EstimateAverageNumberOfEquals4) {
+  utils::SkipList<Counter> list;
+
+  // With 70% of the sets holding 3 items and 30% holding 1 item, the list is
+  // expected to contain ~240k elements (an expected maximum height of 18).
+  const int kMaxElements = 100000;
+
+  // Create a list that has `kMaxElements` sets of 1 or 3 items that have same
+  // keys. The bias is 70% for a set that has 3 items, and 30% for a set that
+  // has 1 item.
+  std::mt19937 gen{std::random_device{}()};
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+  {
+    auto acc = list.access();
+    for (int64_t i = 0; i < kMaxElements; ++i) {
+      // Draw the set size exactly once per key. Drawing inside the loop
+      // condition would re-roll on every iteration and also produce sets of 2
+      // items, which skews the intended 70/30 distribution.
+      const int64_t members = dis(gen) < 0.7 ? 3 : 1;
+      for (int64_t j = 0; j < members; ++j) {
+        auto ret = acc.insert({i, j});
+        ASSERT_NE(ret.first, acc.end());
+        ASSERT_EQ(ret.first->key, i);
+        ASSERT_EQ(ret.first->value, j);
+        ASSERT_TRUE(ret.second);
+      }
+    }
+  }
+
+  // Benchmark the estimation function.
+  BenchmarkEstimateAverageNumberOfEquals(
+      &list, [](const auto &a, const auto &b) { return a.key == b.key; });
+
+  // Verify that the estimate on the lowest layer is correct.
+  {
+    auto acc = list.access();
+    uint64_t count = acc.estimate_average_number_of_equals(
+        [](const auto &a, const auto &b) { return a.key == b.key; }, 1);
+    // Because the test is randomized, the exact estimate on the lowest layer
+    // can't be known. But it definitely must be between 1 and 3 because the
+    // clusters of items are of sizes 1 and 3.
+    ASSERT_GE(count, 1);
+    ASSERT_LE(count, 3);
+  }
+}
+
+TEST(SkipList, EstimateAverageNumberOfEquals5) {
+  utils::SkipList<Counter> list;
+
+  // The list holds 1M elements that all share a single key.
+  const int kMaxElements = 1000000;
+
+  // Two items belong to the same group when their keys match.
+  const auto same_key = [](const auto &a, const auto &b) {
+    return a.key == b.key;
+  };
+
+  // Insert `kMaxElements` items with key 1 and distinct values.
+  {
+    auto acc = list.access();
+    for (int64_t value = 1; value <= kMaxElements; ++value) {
+      auto inserted = acc.insert({1, value});
+      ASSERT_NE(inserted.first, acc.end());
+      ASSERT_EQ(inserted.first->key, 1);
+      ASSERT_EQ(inserted.first->value, value);
+      ASSERT_TRUE(inserted.second);
+    }
+  }
+
+  // There are `kMaxElements` members in the list.
+  ASSERT_EQ(list.size(), kMaxElements);
+
+  // Benchmark the estimation function.
+  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);
+
+  // On the lowest layer the single group must span the whole list.
+  {
+    auto acc = list.access();
+    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
+    ASSERT_EQ(average, kMaxElements);
+  }
+}