Add implementation of average number of equals estimate in SkipList (#9)
* Implement average number of equals estimate in SkipList
This commit is contained in:
parent
42c245df8a
commit
fae407d3fe
@ -671,30 +671,6 @@ LabelPropertyIndex::Iterable::Iterator LabelPropertyIndex::Iterable::end() {
|
||||
return Iterator(this, index_accessor_.end());
|
||||
}
|
||||
|
||||
// A helper function for determining the skip list layer used for estimating the
|
||||
// number of elements in the label property index. The lower layer we use, the
|
||||
// better approximation we get (if we use the lowest layer, we get the exact
|
||||
// numbers). However, lower skip list layers contain more elements so we must
|
||||
// iterate through more items to get the estimate.
|
||||
//
|
||||
// Our goal is to achieve balance between execution time and approximation
|
||||
// precision. The expected number of elements at the k-th skip list layer is N *
|
||||
// (1/2)^(k-1), where N is the skip-list size. We choose to iterate through no
|
||||
// more than sqrt(N) items for large N when calculating the estimate, so we need
|
||||
// to choose the skip-list layer such that N * (1/2)^(k-1) <= sqrt(N). That is
|
||||
// equivalent to k >= 1 + 1/2 * log2(N), so we choose k to be 1 + ceil(log2(N) /
|
||||
// 2).
|
||||
//
|
||||
// For N small enough (arbitrarily chosen to be 500), we will just use the
|
||||
// lowest layer to get the exact numbers. Mostly because this makes writing
|
||||
// tests easier.
|
||||
namespace {
|
||||
uint64_t SkipListLayerForEstimation(uint64_t N) {
|
||||
if (N <= 500) return 1;
|
||||
return std::min(1 + (utils::Log2(N) + 1) / 2, utils::kSkipListMaxHeight);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
int64_t LabelPropertyIndex::ApproximateVertexCount(
|
||||
LabelId label, PropertyId property, const PropertyValue &value) const {
|
||||
auto it = index_.find({label, property});
|
||||
@ -702,7 +678,20 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(
|
||||
"Index for label {} and property {} doesn't exist", label.AsUint(),
|
||||
property.AsUint());
|
||||
auto acc = it->second.access();
|
||||
return acc.estimate_count(value, SkipListLayerForEstimation(acc.size()));
|
||||
if (!value.IsNull()) {
|
||||
return acc.estimate_count(
|
||||
value, utils::SkipListLayerForCountEstimation(acc.size()));
|
||||
} else {
|
||||
// The value `Null` won't ever appear in the index because it indicates that
|
||||
// the property shouldn't exist. Instead, this value is used as an indicator
|
||||
// to estimate the average number of equal elements in the list (for any
|
||||
// given value).
|
||||
return acc.estimate_average_number_of_equals(
|
||||
[](const auto &first, const auto &second) {
|
||||
return first.value == second.value;
|
||||
},
|
||||
utils::SkipListLayerForAverageEqualsEstimation(acc.size()));
|
||||
}
|
||||
}
|
||||
|
||||
int64_t LabelPropertyIndex::ApproximateVertexCount(
|
||||
@ -714,8 +703,8 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(
|
||||
"Index for label {} and property {} doesn't exist", label.AsUint(),
|
||||
property.AsUint());
|
||||
auto acc = it->second.access();
|
||||
return acc.estimate_range_count(lower, upper,
|
||||
SkipListLayerForEstimation(acc.size()));
|
||||
return acc.estimate_range_count(
|
||||
lower, upper, utils::SkipListLayerForCountEstimation(acc.size()));
|
||||
}
|
||||
|
||||
void RemoveObsoleteEntries(Indices *indices,
|
||||
|
@ -244,6 +244,10 @@ class LabelPropertyIndex {
|
||||
return it->second.size();
|
||||
}
|
||||
|
||||
/// Supplying a specific value into the count estimation function will return
|
||||
/// an estimated count of nodes which have their property's value set to
|
||||
/// `value`. If the `value` specified is `Null`, then an average number of
|
||||
/// equal elements is returned.
|
||||
int64_t ApproximateVertexCount(LabelId label, PropertyId property,
|
||||
const PropertyValue &value) const;
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
@ -104,6 +105,38 @@ size_t SkipListNodeSize(const SkipListNode<TObj> &node) {
|
||||
return sizeof(node) + node.height * sizeof(std::atomic<SkipListNode<TObj> *>);
|
||||
}
|
||||
|
||||
/// A helper function for determining the skip list layer used for estimating
|
||||
/// the number of elements in, e.g. a database index. The lower layer we use,
|
||||
/// the better approximation we get (if we use the lowest layer, we get the
|
||||
/// exact numbers). However, lower skip list layers contain more elements so we
|
||||
/// must iterate through more items to get the estimate.
|
||||
///
|
||||
/// Our goal is to achieve balance between execution time and approximation
|
||||
/// precision. The expected number of elements at the k-th skip list layer is N
|
||||
/// * (1/2)^(k-1), where N is the skip-list size. We choose to iterate through
|
||||
/// no more than sqrt(N) items for large N when calculating the estimate, so we
|
||||
/// need to choose the skip-list layer such that N * (1/2)^(k-1) <= sqrt(N).
|
||||
/// That is equivalent to k >= 1 + 1/2 * log2(N), so we choose k to be 1 +
|
||||
/// ceil(log2(N) / 2).
|
||||
///
|
||||
/// For N small enough (arbitrarily chosen to be 500), we will just use the
|
||||
/// lowest layer to get the exact numbers. Mostly because this makes writing
|
||||
/// tests easier.
|
||||
constexpr uint64_t SkipListLayerForCountEstimation(const uint64_t N) {
|
||||
if (N <= 500) return 1;
|
||||
return std::min(1 + (utils::Log2(N) + 1) / 2, utils::kSkipListMaxHeight);
|
||||
}
|
||||
|
||||
/// This function is written with the same intent as the function above except
|
||||
/// that it uses slightly higher layers for estimation because the
|
||||
/// `average_number_of_equals` estimate has a larger time complexity than the
|
||||
/// `*count` estimates.
|
||||
constexpr uint64_t SkipListLayerForAverageEqualsEstimation(const uint64_t N) {
|
||||
if (N <= 500) return 1;
|
||||
return std::min(1 + ((utils::Log2(N) * 2) / 3 + 1),
|
||||
utils::kSkipListMaxHeight);
|
||||
}
|
||||
|
||||
/// The skip list doesn't have built-in reclamation of removed nodes (objects).
|
||||
/// This class handles all operations necessary to remove the nodes safely.
|
||||
///
|
||||
@ -708,6 +741,26 @@ class SkipList final {
|
||||
max_layer_for_estimation);
|
||||
}
|
||||
|
||||
/// Estimates the average number of objects in the list that have the same
|
||||
/// value using the provided equality operator. E.g., if the objects are:
|
||||
/// 1, 2, 2, 3, 3, 3; the average number of equals is 2.
|
||||
///
|
||||
/// The items in the list must already be ordered by the field that is used
|
||||
/// in the equality operator. The default layer is chosen to optimize
|
||||
/// duration vs. precision. The lower the layer used for estimation the
|
||||
/// higher the duration of the count operation. If you set the maximum layer
|
||||
/// for estimation to 1 you will get an exact average number.
|
||||
///
|
||||
/// @return uint64_t estimated average number of equal items in the list
|
||||
template <typename TCallable>
|
||||
uint64_t estimate_average_number_of_equals(
|
||||
const TCallable &equal_cmp,
|
||||
int max_layer_for_estimation =
|
||||
kSkipListCountEstimateDefaultLayer) const {
|
||||
return skiplist_->template estimate_average_number_of_equals(
|
||||
equal_cmp, max_layer_for_estimation);
|
||||
}
|
||||
|
||||
/// Removes the key from the list.
|
||||
///
|
||||
/// @return bool indicating whether the removal was successful
|
||||
@ -795,6 +848,15 @@ class SkipList final {
|
||||
max_layer_for_estimation);
|
||||
}
|
||||
|
||||
template <typename TCallable>
|
||||
uint64_t estimate_average_number_of_equals(
|
||||
const TCallable &equal_cmp,
|
||||
int max_layer_for_estimation =
|
||||
kSkipListCountEstimateDefaultLayer) const {
|
||||
return skiplist_->template estimate_average_number_of_equals(
|
||||
equal_cmp, max_layer_for_estimation);
|
||||
}
|
||||
|
||||
uint64_t size() const { return skiplist_->size(); }
|
||||
|
||||
private:
|
||||
@ -1109,6 +1171,83 @@ class SkipList final {
|
||||
return count;
|
||||
}
|
||||
|
||||
template <typename TCallable>
|
||||
uint64_t estimate_average_number_of_equals(
|
||||
const TCallable &equal_cmp, int max_layer_for_estimation) const {
|
||||
MG_ASSERT(max_layer_for_estimation >= 1 &&
|
||||
max_layer_for_estimation <= kSkipListMaxHeight,
|
||||
"Invalid layer for SkipList count estimation!");
|
||||
|
||||
// We need to traverse some nodes to make the calculation correct, so find
|
||||
// the first layer that has some nodes, starting from the hinted layer.
|
||||
TNode *curr = nullptr;
|
||||
int layer = max_layer_for_estimation;
|
||||
while (curr == nullptr && layer > 0) {
|
||||
layer -= 1;
|
||||
curr = head_->nexts[layer].load(std::memory_order_acquire);
|
||||
}
|
||||
if (curr == nullptr) {
|
||||
// There are no elements in the list.
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Traverse the chain of nodes and count how many of them are unique and how
|
||||
// many have been visited in total. The traversal is initiated from the
|
||||
// determined layer. Then, equality is checked using layer 1 and a return to
|
||||
// the desired layer is performed. The traversal over layers looks as
|
||||
// follows ("+" are counted nodes, "*" are visited nodes):
|
||||
//
|
||||
// 10: * *--* *-- ...
|
||||
// 9: | | | |
|
||||
// 8: | *--* | |
|
||||
// 7: | | | |
|
||||
// 6: | | | *--*
|
||||
// 5: | *--*--* | |
|
||||
// 4: | | | |
|
||||
// 3: | | | |
|
||||
// 2: | | | |
|
||||
// 1: +--+--+ +--+
|
||||
uint64_t unique_count = 0;
|
||||
uint64_t nodes_traversed = 0;
|
||||
uint64_t traversal_limit = size_.load(std::memory_order_acquire);
|
||||
if (layer != 0) {
|
||||
// If the layer isn't 0 we don't want to traverse all of the equal items
|
||||
// because the whole list can be the same item. That is why we limit the
|
||||
// traversal to at most `sqrt(list_size)` items which is a good balance
|
||||
// between general correctness and time complexity.
|
||||
traversal_limit = static_cast<uint64_t>(
|
||||
std::sqrt(size_.load(std::memory_order_acquire)));
|
||||
}
|
||||
while (curr != nullptr) {
|
||||
// First, traverse the bottom layer to count the items.
|
||||
++unique_count;
|
||||
TNode *pred = nullptr;
|
||||
uint64_t current_traversed = 0;
|
||||
while (curr != nullptr && current_traversed < traversal_limit) {
|
||||
if (pred) {
|
||||
const auto &pred_obj = pred->obj;
|
||||
const auto &curr_obj = curr->obj;
|
||||
if (!equal_cmp(pred_obj, curr_obj)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
pred = curr;
|
||||
curr = pred->nexts[0].load(std::memory_order_acquire);
|
||||
++current_traversed;
|
||||
}
|
||||
nodes_traversed += current_traversed;
|
||||
// Second, find a node that has the necessary hight to return to the
|
||||
// desired layer.
|
||||
while (curr != nullptr && curr->height - 1 < layer) {
|
||||
curr = curr->nexts[curr->height - 1].load(std::memory_order_acquire);
|
||||
}
|
||||
}
|
||||
|
||||
MG_ASSERT(unique_count > 0);
|
||||
|
||||
return nodes_traversed / unique_count;
|
||||
}
|
||||
|
||||
bool ok_to_delete(TNode *candidate, int layer_found) {
|
||||
// The paper has an incorrect check here. It expects the `layer_found`
|
||||
// variable to be 1-indexed, but in fact it is 0-indexed.
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <fmt/format.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "utils/math.hpp"
|
||||
#include "utils/skip_list.hpp"
|
||||
#include "utils/timer.hpp"
|
||||
|
||||
@ -662,3 +663,212 @@ TEST(SkipList, EstimateRangeCount) {
|
||||
ASSERT_EQ(count, kMaxElements * kElementMembers);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TElem, typename TCmp>
|
||||
void BenchmarkEstimateAverageNumberOfEquals(utils::SkipList<TElem> *list,
|
||||
const TCmp &cmp) {
|
||||
std::cout << "List size: " << list->size() << std::endl;
|
||||
std::cout << "The index will use layer "
|
||||
<< utils::SkipListLayerForAverageEqualsEstimation(list->size())
|
||||
<< std::endl;
|
||||
auto acc = list->access();
|
||||
for (int layer = 1; layer <= utils::kSkipListMaxHeight; ++layer) {
|
||||
utils::Timer timer;
|
||||
auto estimate = acc.estimate_average_number_of_equals(cmp, layer);
|
||||
auto duration = timer.Elapsed().count();
|
||||
std::cout << "Estimate on layer " << layer << " is " << estimate << " in "
|
||||
<< duration << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SkipList, EstimateAverageNumberOfEquals1) {
  utils::SkipList<Counter> list;

  // Number of distinct keys. Key `k` is inserted `k` times, so the list ends
  // up with ~500k elements, which yields an expected maximum height of 19.
  const int kMaxElements = 1000;

  // Two entries belong to the same cluster when their keys match.
  const auto same_key = [](const auto &a, const auto &b) {
    return a.key == b.key;
  };

  // Create a list that has 1, then 2, then 3, then 4, ..., up to
  // `kMaxElements` same keys next to each other.
  {
    auto acc = list.access();
    for (int64_t key = 1; key <= kMaxElements; ++key) {
      for (int64_t value = 1; value <= key; ++value) {
        auto inserted = acc.insert({key, value});
        ASSERT_NE(inserted.first, acc.end());
        ASSERT_EQ(inserted.first->key, key);
        ASSERT_EQ(inserted.first->value, value);
        ASSERT_TRUE(inserted.second);
      }
    }
  }

  // There are `kMaxElements * (kMaxElements + 1) / 2` members in the list.
  ASSERT_EQ(list.size(), kMaxElements * (kMaxElements + 1) / 2);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    // There are `kMaxElements` unique keys under the equality operator used
    // above, so the expected average is the number of elements divided by
    // `kMaxElements`.
    ASSERT_EQ(list.size(), kMaxElements * (kMaxElements + 1) / 2);
    ASSERT_EQ(average, (kMaxElements + 1) / 2);
  }
}
|
||||
|
||||
TEST(SkipList, EstimateAverageNumberOfEquals2) {
  utils::SkipList<Counter> list;

  // 100k elements will yield an expected maximum height of 17.
  const int kMaxElements = 100000;
  const int kElementMembers = 1;

  // Two entries belong to the same cluster when their keys match.
  const auto same_key = [](const auto &a, const auto &b) {
    return a.key == b.key;
  };

  // Create a list that has `kMaxElements` sets of `kElementMembers` items that
  // have same keys.
  {
    auto acc = list.access();
    for (int64_t key = 0; key < kMaxElements; ++key) {
      for (int64_t value = 0; value < kElementMembers; ++value) {
        auto inserted = acc.insert({key, value});
        ASSERT_NE(inserted.first, acc.end());
        ASSERT_EQ(inserted.first->key, key);
        ASSERT_EQ(inserted.first->value, value);
        ASSERT_TRUE(inserted.second);
      }
    }
  }

  // There are `kMaxElements * kElementMembers` members in the list.
  ASSERT_EQ(list.size(), kMaxElements * kElementMembers);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    ASSERT_EQ(average, kElementMembers);
  }
}
|
||||
|
||||
TEST(SkipList, EstimateAverageNumberOfEquals3) {
  utils::SkipList<Counter> list;

  // 100 keys with 1000 members each give 100k elements, which will yield an
  // expected maximum height of 17.
  const int kMaxElements = 100;
  const int kElementMembers = 1000;

  // Two entries belong to the same cluster when their keys match.
  const auto same_key = [](const auto &a, const auto &b) {
    return a.key == b.key;
  };

  // Create a list that has `kMaxElements` sets of `kElementMembers` items that
  // have same keys.
  {
    auto acc = list.access();
    for (int64_t key = 0; key < kMaxElements; ++key) {
      for (int64_t value = 0; value < kElementMembers; ++value) {
        auto inserted = acc.insert({key, value});
        ASSERT_NE(inserted.first, acc.end());
        ASSERT_EQ(inserted.first->key, key);
        ASSERT_EQ(inserted.first->value, value);
        ASSERT_TRUE(inserted.second);
      }
    }
  }

  // There are `kMaxElements * kElementMembers` members in the list.
  ASSERT_EQ(list.size(), kMaxElements * kElementMembers);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    ASSERT_EQ(average, kElementMembers);
  }
}
|
||||
|
||||
TEST(SkipList, EstimateAverageNumberOfEquals4) {
  utils::SkipList<Counter> list;

  // `kMaxElements` keys with 2.4 members on average give ~240k elements,
  // which will yield an expected maximum height of about 18.
  const int kMaxElements = 100000;

  // Create a list that has `kMaxElements` sets of 1 or 3 items that have same
  // keys. The bias is 70% for a set that has 3 items, and 30% for a set that
  // has 1 item.
  std::mt19937 gen{std::random_device{}()};
  std::uniform_real_distribution<> dis(0.0, 1.0);
  {
    auto acc = list.access();
    for (int64_t i = 0; i < kMaxElements; ++i) {
      // The set size is drawn exactly once per key. Drawing it inside the
      // loop condition would re-roll on every iteration and produce sets of
      // size 2 as well, skewing the intended 70%/30% split.
      const int64_t members = dis(gen) < 0.7 ? 3 : 1;
      for (int64_t j = 0; j < members; ++j) {
        auto ret = acc.insert({i, j});
        ASSERT_NE(ret.first, acc.end());
        ASSERT_EQ(ret.first->key, i);
        ASSERT_EQ(ret.first->value, j);
        ASSERT_TRUE(ret.second);
      }
    }
  }

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(
      &list, [](const auto &a, const auto &b) { return a.key == b.key; });

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t count = acc.estimate_average_number_of_equals(
        [](const auto &a, const auto &b) { return a.key == b.key; }, 1);
    // Because the test is randomized, the exact estimate on the lowest layer
    // can't be known. But it definitely must be between 1 and 3 because the
    // clusters of items are of sizes 1 and 3.
    ASSERT_GE(count, 1);
    ASSERT_LE(count, 3);
  }
}
|
||||
|
||||
TEST(SkipList, EstimateAverageNumberOfEquals5) {
  utils::SkipList<Counter> list;

  // 1M elements, all sharing a single key; a list of this size will yield an
  // expected maximum height of about 20.
  const int kMaxElements = 1000000;

  // Two entries belong to the same cluster when their keys match.
  const auto same_key = [](const auto &a, const auto &b) {
    return a.key == b.key;
  };

  // Create a list that has `kMaxElements` items that have same keys.
  {
    auto acc = list.access();
    for (int64_t value = 1; value <= kMaxElements; ++value) {
      auto inserted = acc.insert({1, value});
      ASSERT_NE(inserted.first, acc.end());
      ASSERT_EQ(inserted.first->key, 1);
      ASSERT_EQ(inserted.first->value, value);
      ASSERT_TRUE(inserted.second);
    }
  }

  // There are `kMaxElements` members in the list.
  ASSERT_EQ(list.size(), kMaxElements);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    ASSERT_EQ(average, kMaxElements);
  }
}
|
||||
|
Loading…
Reference in New Issue
Block a user