Add implementation of average number of equals estimate in SkipList ()

* Implement average number of equals estimate in SkipList
This commit is contained in:
Marko Budiselić 2021-02-10 14:38:54 +01:00 committed by GitHub
parent 42c245df8a
commit fae407d3fe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 369 additions and 27 deletions

View File

@ -671,30 +671,6 @@ LabelPropertyIndex::Iterable::Iterator LabelPropertyIndex::Iterable::end() {
return Iterator(this, index_accessor_.end());
}
// A helper function for determining the skip list layer used for estimating the
// number of elements in the label property index. The lower layer we use, the
// better approximation we get (if we use the lowest layer, we get the exact
// numbers). However, lower skip list layers contain more elements so we must
// iterate through more items to get the estimate.
//
// Our goal is to achieve balance between execution time and approximation
// precision. The expected number of elements at the k-th skip list layer is N *
// (1/2)^(k-1), where N is the skip-list size. We choose to iterate through no
// more than sqrt(N) items for large N when calculating the estimate, so we need
// to choose the skip-list layer such that N * (1/2)^(k-1) <= sqrt(N). That is
// equivalent to k >= 1 + 1/2 * log2(N), so we choose k to be 1 + ceil(log2(N) /
// 2).
//
// For N small enough (arbitrarily chosen to be 500), we will just use the
// lowest layer to get the exact numbers. Mostly because this makes writing
// tests easier.
namespace {
// Picks the skip list layer that the label-property index count estimates are
// computed on for a list holding `N` elements.
uint64_t SkipListLayerForEstimation(uint64_t N) {
  // Lists of at most 500 elements are small enough to be counted on the
  // lowest layer.
  const bool is_small_list = N <= 500;
  if (is_small_list) {
    return 1;
  }
  // Integer form of the `1 + ceil(log2(N) / 2)` rule, capped at the tallest
  // layer a skip list can have.
  const auto estimation_layer = 1 + (utils::Log2(N) + 1) / 2;
  return std::min(estimation_layer, utils::kSkipListMaxHeight);
}
}  // namespace
int64_t LabelPropertyIndex::ApproximateVertexCount(
LabelId label, PropertyId property, const PropertyValue &value) const {
auto it = index_.find({label, property});
@ -702,7 +678,20 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(
"Index for label {} and property {} doesn't exist", label.AsUint(),
property.AsUint());
auto acc = it->second.access();
return acc.estimate_count(value, SkipListLayerForEstimation(acc.size()));
if (!value.IsNull()) {
return acc.estimate_count(
value, utils::SkipListLayerForCountEstimation(acc.size()));
} else {
// The value `Null` won't ever appear in the index because it indicates that
// the property shouldn't exist. Instead, this value is used as an indicator
// to estimate the average number of equal elements in the list (for any
// given value).
return acc.estimate_average_number_of_equals(
[](const auto &first, const auto &second) {
return first.value == second.value;
},
utils::SkipListLayerForAverageEqualsEstimation(acc.size()));
}
}
int64_t LabelPropertyIndex::ApproximateVertexCount(
@ -714,8 +703,8 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(
"Index for label {} and property {} doesn't exist", label.AsUint(),
property.AsUint());
auto acc = it->second.access();
return acc.estimate_range_count(lower, upper,
SkipListLayerForEstimation(acc.size()));
return acc.estimate_range_count(
lower, upper, utils::SkipListLayerForCountEstimation(acc.size()));
}
void RemoveObsoleteEntries(Indices *indices,

View File

@ -244,6 +244,10 @@ class LabelPropertyIndex {
return it->second.size();
}
/// Supplying a specific value into the count estimation function will return
/// an estimated count of nodes which have their property's value set to
/// `value`. If the `value` specified is `Null`, then an average number of
/// equal elements is returned.
int64_t ApproximateVertexCount(LabelId label, PropertyId property,
const PropertyValue &value) const;

View File

@ -1,6 +1,7 @@
#pragma once
#include <atomic>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <limits>
@ -104,6 +105,38 @@ size_t SkipListNodeSize(const SkipListNode<TObj> &node) {
return sizeof(node) + node.height * sizeof(std::atomic<SkipListNode<TObj> *>);
}
/// A helper function for determining the skip list layer used for estimating
/// the number of elements in, e.g. a database index. The lower layer we use,
/// the better approximation we get (if we use the lowest layer, we get the
/// exact numbers). However, lower skip list layers contain more elements so we
/// must iterate through more items to get the estimate.
///
/// Our goal is to achieve balance between execution time and approximation
/// precision. The expected number of elements at the k-th skip list layer is N
/// * (1/2)^(k-1), where N is the skip-list size. We choose to iterate through
/// no more than sqrt(N) items for large N when calculating the estimate, so we
/// need to choose the skip-list layer such that N * (1/2)^(k-1) <= sqrt(N).
/// That is equivalent to k >= 1 + 1/2 * log2(N), so we choose k to be 1 +
/// ceil(log2(N) / 2).
///
/// For N small enough (arbitrarily chosen to be 500), we will just use the
/// lowest layer to get the exact numbers. Mostly because this makes writing
/// tests easier.
constexpr uint64_t SkipListLayerForCountEstimation(const uint64_t N) {
  // Up to 500 elements the lowest layer is used; above that the layer follows
  // the `1 + (Log2(N) + 1) / 2` rule and is capped at the maximum skip list
  // height.
  return N <= 500 ? uint64_t{1}
                  : std::min(1 + (utils::Log2(N) + 1) / 2,
                             utils::kSkipListMaxHeight);
}
/// This function is written with the same intent as the function above except
/// that it uses slightly higher layers for estimation because the
/// `average_number_of_equals` estimate has a larger time complexity than the
/// `*count` estimates.
constexpr uint64_t SkipListLayerForAverageEqualsEstimation(const uint64_t N) {
  if (N > 500) {
    // Two thirds of Log2(N) plus two, never above the maximum skip list
    // height.
    const auto estimation_layer = 1 + ((utils::Log2(N) * 2) / 3 + 1);
    return std::min(estimation_layer, utils::kSkipListMaxHeight);
  }
  // Lists of at most 500 elements are processed on the lowest layer.
  return 1;
}
/// The skip list doesn't have built-in reclamation of removed nodes (objects).
/// This class handles all operations necessary to remove the nodes safely.
///
@ -708,6 +741,26 @@ class SkipList final {
max_layer_for_estimation);
}
/// Estimates the average number of objects in the list that have the same
/// value using the provided equality operator. E.g., if the objects are:
/// 1, 2, 2, 3, 3, 3; the average number of equals is 2.
///
/// The items in the list must already be ordered by the field that is used
/// in the equality operator. The default layer is chosen to optimize
/// duration vs. precision. The lower the layer used for estimation the
/// higher the duration of the count operation. If you set the maximum layer
/// for estimation to 1 you will get an exact average number.
///
/// @param equal_cmp callable taking two list objects and returning whether
///                  they should be treated as equal
/// @param max_layer_for_estimation highest layer the estimation may start
///                                 from; defaults to
///                                 `kSkipListCountEstimateDefaultLayer`
/// @return uint64_t estimated average number of equal items in the list
template <typename TCallable>
uint64_t estimate_average_number_of_equals(
    const TCallable &equal_cmp,
    int max_layer_for_estimation =
        kSkipListCountEstimateDefaultLayer) const {
  // No `template` disambiguator: the template argument is deduced from
  // `equal_cmp`, and a `template` keyword that isn't followed by a template
  // argument list is rejected by newer compilers.
  return skiplist_->estimate_average_number_of_equals(
      equal_cmp, max_layer_for_estimation);
}
/// Removes the key from the list.
///
/// @return bool indicating whether the removal was successful
@ -795,6 +848,15 @@ class SkipList final {
max_layer_for_estimation);
}
/// Forwards to the underlying list's `estimate_average_number_of_equals`.
///
/// @param equal_cmp callable taking two list objects and returning whether
///                  they should be treated as equal
/// @param max_layer_for_estimation highest layer the estimation may start
///                                 from; defaults to
///                                 `kSkipListCountEstimateDefaultLayer`
/// @return uint64_t estimated average number of equal items in the list
template <typename TCallable>
uint64_t estimate_average_number_of_equals(
    const TCallable &equal_cmp,
    int max_layer_for_estimation =
        kSkipListCountEstimateDefaultLayer) const {
  // No `template` disambiguator: the template argument is deduced from
  // `equal_cmp`, and a `template` keyword that isn't followed by a template
  // argument list is rejected by newer compilers.
  return skiplist_->estimate_average_number_of_equals(
      equal_cmp, max_layer_for_estimation);
}
uint64_t size() const { return skiplist_->size(); }
private:
@ -1109,6 +1171,83 @@ class SkipList final {
return count;
}
/// Estimates the average size of a run of equal objects in the list.
///
/// @param equal_cmp callable taking two list objects and returning whether
///                  they should be treated as equal
/// @param max_layer_for_estimation 1-based layer to start the estimation
///                                 from; must be within
///                                 [1, kSkipListMaxHeight]
/// @return number of visited nodes divided by the number of visited runs
///         (integer division), or 0 when no node is reachable from the head
template <typename TCallable>
uint64_t estimate_average_number_of_equals(
    const TCallable &equal_cmp, int max_layer_for_estimation) const {
  MG_ASSERT(max_layer_for_estimation >= 1 &&
                max_layer_for_estimation <= kSkipListMaxHeight,
            "Invalid layer for SkipList count estimation!");

  // We need to traverse some nodes to make the calculation correct, so find
  // the first layer that has some nodes, starting from the hinted layer.
  // After this loop `layer` is the 0-based index of that layer.
  TNode *curr = nullptr;
  int layer = max_layer_for_estimation;
  while (curr == nullptr && layer > 0) {
    layer -= 1;
    curr = head_->nexts[layer].load(std::memory_order_acquire);
  }
  if (curr == nullptr) {
    // There are no elements in the list.
    return 0;
  }

  // Traverse the chain of nodes and count how many of them are unique and how
  // many have been visited in total. The traversal is initiated from the
  // determined layer. Then, equality is checked using layer 1 and a return to
  // the desired layer is performed. The traversal over layers looks as
  // follows ("+" are counted nodes, "*" are visited nodes):
  //
  // 10: *           *--*     *-- ...
  //  9: |           |  |     |
  //  8: |     *--*  |  |     |
  //  7: |     |  |  |  |     |
  //  6: |     |  |  |  |  *--*
  //  5: |  *--*--*  |  |  |
  //  4: |  |        |  |  |
  //  3: |  |        |  |  |
  //  2: |  |        |  |  |
  //  1: +--+--+     +--+--+
  uint64_t unique_count = 0;
  uint64_t nodes_traversed = 0;

  const uint64_t list_size = size_.load(std::memory_order_acquire);
  uint64_t traversal_limit = list_size;
  if (layer != 0) {
    // If the layer isn't 0 we don't want to traverse all of the equal items
    // because the whole list can be the same item. That is why we limit the
    // traversal to at most `sqrt(list_size)` items which is a good balance
    // between general correctness and time complexity.
    traversal_limit =
        static_cast<uint64_t>(std::sqrt(static_cast<double>(list_size)));
  }
  if (traversal_limit == 0) {
    // `size_` is read separately from the node chain, so it may be observed
    // as 0 even though a node was found above (presumably during concurrent
    // modification -- confirm). With a limit of 0 the loops below would never
    // advance `curr`, so always visit at least one node per run.
    traversal_limit = 1;
  }

  while (curr != nullptr) {
    // First, traverse the bottom layer to count the items of the current run.
    ++unique_count;
    TNode *pred = nullptr;
    uint64_t current_traversed = 0;
    while (curr != nullptr && current_traversed < traversal_limit) {
      if (pred) {
        const auto &pred_obj = pred->obj;
        const auto &curr_obj = curr->obj;
        if (!equal_cmp(pred_obj, curr_obj)) {
          // `curr` starts a different run; leave it for the next iteration.
          break;
        }
      }
      pred = curr;
      curr = pred->nexts[0].load(std::memory_order_acquire);
      ++current_traversed;
    }
    nodes_traversed += current_traversed;

    // Second, find a node that has the necessary height to return to the
    // desired layer. Each step follows the topmost link of the current node.
    while (curr != nullptr && curr->height - 1 < layer) {
      curr = curr->nexts[curr->height - 1].load(std::memory_order_acquire);
    }
  }

  MG_ASSERT(unique_count > 0);
  return nodes_traversed / unique_count;
}
bool ok_to_delete(TNode *candidate, int layer_found) {
// The paper has an incorrect check here. It expects the `layer_found`
// variable to be 1-indexed, but in fact it is 0-indexed.

View File

@ -3,6 +3,7 @@
#include <fmt/format.h>
#include <gtest/gtest.h>
#include "utils/math.hpp"
#include "utils/skip_list.hpp"
#include "utils/timer.hpp"
@ -662,3 +663,212 @@ TEST(SkipList, EstimateRangeCount) {
ASSERT_EQ(count, kMaxElements * kElementMembers);
}
}
// Prints the list size, the layer that
// `utils::SkipListLayerForAverageEqualsEstimation` selects for that size and,
// for every layer from 1 to `utils::kSkipListMaxHeight`, the estimate obtained
// on that layer together with the time the estimation took.
template <typename TElem, typename TCmp>
void BenchmarkEstimateAverageNumberOfEquals(utils::SkipList<TElem> *list,
                                            const TCmp &cmp) {
  std::cout << "List size: " << list->size() << std::endl;
  const auto index_layer =
      utils::SkipListLayerForAverageEqualsEstimation(list->size());
  std::cout << "The index will use layer " << index_layer << std::endl;

  auto acc = list->access();
  int layer = 1;
  while (layer <= utils::kSkipListMaxHeight) {
    // The timer only covers the estimation call itself.
    utils::Timer timer;
    auto estimate = acc.estimate_average_number_of_equals(cmp, layer);
    auto elapsed = timer.Elapsed().count();
    std::cout << "Estimate on layer " << layer << " is " << estimate << " in "
              << elapsed << std::endl;
    ++layer;
  }
}
TEST(SkipList, EstimateAverageNumberOfEquals1) {
  // ~500k elements will yield an expected maximum height of 19.
  const int kMaxElements = 1000;
  // Two elements belong to the same group when their keys match.
  const auto same_key = [](const auto &lhs, const auto &rhs) {
    return lhs.key == rhs.key;
  };

  utils::SkipList<Counter> list;

  // Key `k` is inserted `k` times (with values 1..k), for every `k` from 1 up
  // to `kMaxElements`, so equal keys end up next to each other.
  {
    auto acc = list.access();
    for (int64_t key = 1; key <= kMaxElements; ++key) {
      for (int64_t member = 1; member <= key; ++member) {
        auto inserted = acc.insert({key, member});
        ASSERT_NE(inserted.first, acc.end());
        ASSERT_EQ(inserted.first->key, key);
        ASSERT_EQ(inserted.first->value, member);
        ASSERT_TRUE(inserted.second);
      }
    }
  }

  // There are `kMaxElements * (kMaxElements + 1) / 2` members in the list.
  ASSERT_EQ(list.size(), kMaxElements * (kMaxElements + 1) / 2);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    // Under the key-equality operator the data has `kMaxElements` distinct
    // groups, so the expected average is the number of elements divided by
    // `kMaxElements`.
    ASSERT_EQ(list.size(), kMaxElements * (kMaxElements + 1) / 2);
    ASSERT_EQ(average, (kMaxElements + 1) / 2);
  }
}
TEST(SkipList, EstimateAverageNumberOfEquals2) {
  // 100k elements will yield an expected maximum height of 17.
  const int kMaxElements = 100000;
  const int kElementMembers = 1;
  // Two elements belong to the same group when their keys match.
  const auto same_key = [](const auto &lhs, const auto &rhs) {
    return lhs.key == rhs.key;
  };

  utils::SkipList<Counter> list;

  // Insert `kMaxElements` groups, each made of `kElementMembers` items that
  // share a key.
  {
    auto acc = list.access();
    for (int64_t key = 0; key < kMaxElements; ++key) {
      for (int64_t member = 0; member < kElementMembers; ++member) {
        auto inserted = acc.insert({key, member});
        ASSERT_NE(inserted.first, acc.end());
        ASSERT_EQ(inserted.first->key, key);
        ASSERT_EQ(inserted.first->value, member);
        ASSERT_TRUE(inserted.second);
      }
    }
  }

  // There are `kMaxElements * kElementMembers` members in the list.
  ASSERT_EQ(list.size(), kMaxElements * kElementMembers);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // On the lowest layer the estimate must equal the group size.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    ASSERT_EQ(average, kElementMembers);
  }
}
TEST(SkipList, EstimateAverageNumberOfEquals3) {
  // 100k elements will yield an expected maximum height of 17
  const int kMaxElements = 100;
  const int kElementMembers = 1000;
  // Two elements belong to the same group when their keys match.
  const auto same_key = [](const auto &lhs, const auto &rhs) {
    return lhs.key == rhs.key;
  };

  utils::SkipList<Counter> list;

  // Insert `kMaxElements` groups, each made of `kElementMembers` items that
  // share a key.
  {
    auto acc = list.access();
    for (int64_t key = 0; key < kMaxElements; ++key) {
      for (int64_t member = 0; member < kElementMembers; ++member) {
        auto inserted = acc.insert({key, member});
        ASSERT_NE(inserted.first, acc.end());
        ASSERT_EQ(inserted.first->key, key);
        ASSERT_EQ(inserted.first->value, member);
        ASSERT_TRUE(inserted.second);
      }
    }
  }

  // There are `kMaxElements * kElementMembers` members in the list.
  ASSERT_EQ(list.size(), kMaxElements * kElementMembers);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // On the lowest layer the estimate must equal the group size.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    ASSERT_EQ(average, kElementMembers);
  }
}
TEST(SkipList, EstimateAverageNumberOfEquals4) {
  utils::SkipList<Counter> list;

  // ~240k elements (100k sets averaging 2.4 items each) will yield an
  // expected maximum height of 18.
  const int kMaxElements = 100000;

  // Create a list that has `kMaxElements` sets of 1 or 3 items that have same
  // keys. The bias is 70% for a set that has 3 items, and 30% for a set that
  // has 1 item.
  std::mt19937 gen{std::random_device{}()};
  std::uniform_real_distribution<> dis(0.0, 1.0);
  {
    auto acc = list.access();
    for (int64_t i = 0; i < kMaxElements; ++i) {
      // Draw the set size exactly once per key. Sampling inside the loop
      // condition would re-roll on every iteration and also produce sets of
      // size 2, skewing the intended 70/30 split.
      const int64_t set_size = dis(gen) < 0.7 ? 3 : 1;
      for (int64_t j = 0; j < set_size; ++j) {
        auto ret = acc.insert({i, j});
        ASSERT_NE(ret.first, acc.end());
        ASSERT_EQ(ret.first->key, i);
        ASSERT_EQ(ret.first->value, j);
        ASSERT_TRUE(ret.second);
      }
    }
  }

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(
      &list, [](const auto &a, const auto &b) { return a.key == b.key; });

  // Verify that the estimate on the lowest layer is correct.
  {
    auto acc = list.access();
    uint64_t count = acc.estimate_average_number_of_equals(
        [](const auto &a, const auto &b) { return a.key == b.key; }, 1);
    // Because the test is randomized, the exact estimate on the lowest layer
    // can't be known. But it definitely must be between 1 and 3 because the
    // clusters of items are of sizes 1 and 3.
    ASSERT_GE(count, 1);
    ASSERT_LE(count, 3);
  }
}
TEST(SkipList, EstimateAverageNumberOfEquals5) {
  // The list holds `kMaxElements` (one million) elements that all share a
  // single key.
  const int kMaxElements = 1000000;
  // Two elements belong to the same group when their keys match.
  const auto same_key = [](const auto &lhs, const auto &rhs) {
    return lhs.key == rhs.key;
  };

  utils::SkipList<Counter> list;

  // Insert `kMaxElements` items under key 1, with values 1..kMaxElements.
  {
    auto acc = list.access();
    for (int64_t member = 1; member <= kMaxElements; ++member) {
      auto inserted = acc.insert({1, member});
      ASSERT_NE(inserted.first, acc.end());
      ASSERT_EQ(inserted.first->key, 1);
      ASSERT_EQ(inserted.first->value, member);
      ASSERT_TRUE(inserted.second);
    }
  }

  // There are `kMaxElements` members in the list.
  ASSERT_EQ(list.size(), kMaxElements);

  // Benchmark the estimation function.
  BenchmarkEstimateAverageNumberOfEquals(&list, same_key);

  // With a single group, the lowest-layer estimate must be the list size.
  {
    auto acc = list.access();
    uint64_t average = acc.estimate_average_number_of_equals(same_key, 1);
    ASSERT_EQ(average, kMaxElements);
  }
}