GraphDbAccessor - label+property index cardinalities for exact value and value ranges

Summary:
- added functionality to `GraphDbAccessor` for cardinality estimates
- changed all `GraphDbAccessor::Count...` functions to return `int64_t`
- added the needed functionality to `LabelPropertyIndex`
- modified `SkipList::position_and_count` to accept a custom `equals` function. Equality could not be implemented using only the custom `less` because it compares a templated `TItem` with skiplist element type `T`, and is therefore not symmetrical.

Reviewers: teon.banek, buda, mislav.bradac

Reviewed By: teon.banek

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D521
This commit is contained in:
florijan 2017-07-05 16:00:07 +02:00
parent feb854d0c7
commit c22ac38ea2
8 changed files with 255 additions and 56 deletions

View File

@ -548,19 +548,24 @@ class SkipList : private Lockable<lock_t> {
* if `less` indicates that X is less than
* Y, then natural comparison must indicate the same. The
* reverse does not have to hold.
* @param greater Comparsion function, analogue to less.
* @param position_level_reduction - Defines at which level
* item position is estimated. Position level is defined
* as log2(skiplist->size()) - position_level_reduction.
* @param count_max_level - Defines the max level at which
* item count is estimated.
* @tparam TLess Type of `less`
* @tparam TItem - type of item skiplist elements are compared
* to. Does not have to be the same type as skiplist element.
* @tparam TLess - less comparison function type.
* @tparam TEqual - equality comparison function type.
* @return A pair of ints where the first element is the estimated
* position of item, and the second is the estimated number
* of items that are the same according to `less`.
*/
template <typename TItem, typename TLess = std::less<T>>
std::pair<size_t, size_t> position_and_count(
const TItem &item, TLess less = TLess{},
template <typename TItem, typename TLess = std::less<T>,
typename TEqual = std::equal_to<T>>
std::pair<int64_t, int64_t> position_and_count(
const TItem &item, TLess less = TLess(), TEqual equal = TEqual(),
int position_level_reduction = 10, int count_max_level = 3) {
// the level at which position will be sought
int position_level = std::max(
@ -579,9 +584,10 @@ class SkipList : private Lockable<lock_t> {
// on the current height (i) find the last tower
// whose value is lesser than item, store it in pred
// while succ will be either skiplist end or the
// first element greater or equal to item
// first element greater than item
succ = pred->forward(i);
while (succ && less(succ->value(), item)) {
while (succ &&
!(less(item, succ->value()) || equal(item, succ->value()))) {
pred = succ;
succ = succ->forward(i);
tower_count++;

View File

@ -60,15 +60,16 @@ void GraphDbAccessor::update_property_index(
property, record_accessor.vlist_, vertex);
}
size_t GraphDbAccessor::vertices_count() const {
int64_t GraphDbAccessor::vertices_count() const {
return db_.vertices_.access().size();
}
size_t GraphDbAccessor::vertices_count(const GraphDbTypes::Label &label) const {
int64_t GraphDbAccessor::vertices_count(
const GraphDbTypes::Label &label) const {
return db_.labels_index_.Count(label);
}
size_t GraphDbAccessor::vertices_count(
int64_t GraphDbAccessor::vertices_count(
const GraphDbTypes::Label &label,
const GraphDbTypes::Property &property) const {
const LabelPropertyIndex::Key key(label, property);
@ -77,6 +78,51 @@ size_t GraphDbAccessor::vertices_count(
return db_.label_property_index_.Count(key);
}
// Estimated number of vertices under the (label, property) index whose
// property value equals `value`. The index is expected to exist; this is only
// checked through debug_assert.
int64_t GraphDbAccessor::vertices_count(const GraphDbTypes::Label &label,
                                        const GraphDbTypes::Property &property,
                                        const PropertyValue &value) const {
  const LabelPropertyIndex::Key index_key(label, property);
  debug_assert(db_.label_property_index_.IndexExists(index_key),
               "Index doesn't exist.");
  // PositionAndCount yields (position, count); only the count is of interest.
  auto position_and_count =
      db_.label_property_index_.PositionAndCount(index_key, value);
  return position_and_count.second;
}
// Estimated number of vertices under the (label, property) index whose
// property value falls inside the range given by `lower` and `upper`.
//
// At least one bound has to be provided (checked through debug_assert). The
// estimate is assembled from the (position, count) pairs the index reports
// for the bound values and is clamped so that it is never negative.
int64_t GraphDbAccessor::vertices_count(
    const GraphDbTypes::Label &label, const GraphDbTypes::Property &property,
    const std::experimental::optional<utils::Bound<PropertyValue>> lower,
    const std::experimental::optional<utils::Bound<PropertyValue>> upper)
    const {
  const LabelPropertyIndex::Key key(label, property);
  debug_assert(db_.label_property_index_.IndexExists(key),
               "Index doesn't exist.");
  debug_assert(lower || upper, "At least one bound must be provided");

  // NOTE: std::max is instantiated explicitly with int64_t below. A `0l`
  // literal is a `long`, which is not the same type as int64_t on every
  // platform, and mixed argument types make std::max fail to compile.
  if (!upper) {
    // Only a lower bound: everything from the bound's position to the end of
    // the index, minus the entries equal to the bound if it is exclusive.
    auto lower_pac =
        db_.label_property_index_.PositionAndCount(key, lower.value().value());
    const int64_t size = db_.label_property_index_.Count(key);
    const int64_t excluded =
        lower.value().IsInclusive() ? 0 : lower_pac.second;
    return std::max<int64_t>(0, size - lower_pac.first - excluded);
  } else if (!lower) {
    // Only an upper bound: the whole prefix before the bound's position,
    // plus the entries equal to the bound if it is inclusive.
    auto upper_pac =
        db_.label_property_index_.PositionAndCount(key, upper.value().value());
    return upper.value().IsInclusive() ? upper_pac.first + upper_pac.second
                                       : upper_pac.first;
  } else {
    // Both bounds: distance between the two positions, corrected for the
    // entries equal to each bound depending on its inclusiveness.
    auto lower_pac =
        db_.label_property_index_.PositionAndCount(key, lower.value().value());
    auto upper_pac =
        db_.label_property_index_.PositionAndCount(key, upper.value().value());
    int64_t result = upper_pac.first - lower_pac.first;
    if (lower.value().IsExclusive()) result -= lower_pac.second;
    if (upper.value().IsInclusive()) result += upper_pac.second;
    // An empty or inverted range can produce a negative difference.
    return std::max<int64_t>(0, result);
  }
}
bool GraphDbAccessor::remove_vertex(VertexAccessor &vertex_accessor) {
vertex_accessor.SwitchNew();
// it's possible the vertex was removed already in this transaction
@ -133,11 +179,11 @@ void GraphDbAccessor::update_edge_type_index(
this->db_.edge_types_index_.Update(edge_type, edge_accessor.vlist_, edge);
}
size_t GraphDbAccessor::edges_count() const {
int64_t GraphDbAccessor::edges_count() const {
return db_.edges_.access().size();
}
size_t GraphDbAccessor::edges_count(
int64_t GraphDbAccessor::edges_count(
const GraphDbTypes::EdgeType &edge_type) const {
return db_.edge_types_index_.Count(edge_type);
}

View File

@ -5,14 +5,16 @@
#pragma once
#include <experimental/optional>
#include "cppitertools/filter.hpp"
#include "cppitertools/imap.hpp"
#include "graph_db.hpp"
#include "transactions/transaction.hpp"
#include "storage/edge_accessor.hpp"
#include "storage/vertex_accessor.hpp"
#include "transactions/transaction.hpp"
#include "utils/bound.hpp"
/** Thrown when creating an index which already exists. */
class IndexExistsException : public utils::BasicException {
@ -310,13 +312,13 @@ class GraphDbAccessor {
* Return approximate number of all vertices in the database.
* Note that this is always an over-estimate and never an under-estimate.
*/
size_t vertices_count() const;
int64_t vertices_count() const;
/*
* Return approximate number of all edges in the database.
* Note that this is always an over-estimate and never an under-estimate.
*/
size_t edges_count() const;
int64_t edges_count() const;
/**
* Return approximate number of vertices under indexes with the given label.
@ -324,7 +326,7 @@ class GraphDbAccessor {
* @param label - label to check for
* @return number of vertices with the given label
*/
size_t vertices_count(const GraphDbTypes::Label &label) const;
int64_t vertices_count(const GraphDbTypes::Label &label) const;
/**
* Return approximate number of vertices under indexes with the given label
@ -335,8 +337,32 @@ class GraphDbAccessor {
* @return number of vertices with the given label, fails if no such
* label+property index exists.
*/
size_t vertices_count(const GraphDbTypes::Label &label,
const GraphDbTypes::Property &property) const;
int64_t vertices_count(const GraphDbTypes::Label &label,
const GraphDbTypes::Property &property) const;
/**
* Returns approximate number of vertices that have the given label
* and the given value for the given property.
*
* Assumes that an index for that (label, property) exists.
*/
int64_t vertices_count(const GraphDbTypes::Label &label,
const GraphDbTypes::Property &property,
const PropertyValue &value) const;
/**
* Returns approximate number of vertices that have the given label
* and whose property value is in the range defined by upper and lower @c Bound.
* At least one bound must be specified. If lower bound is not specified,
* the whole upper bound prefix is returned.
*
* Assumes that an index for that (label, property) exists.
*/
int64_t vertices_count(
const GraphDbTypes::Label &label, const GraphDbTypes::Property &property,
const std::experimental::optional<utils::Bound<PropertyValue>> lower,
const std::experimental::optional<utils::Bound<PropertyValue>> upper)
const;
/**
* Return approximate number of edges under indexes with the given edge_type.
@ -344,7 +370,7 @@ class GraphDbAccessor {
* @param edge_type - edge_type to check for
* @return number of edges with the given edge_type
*/
size_t edges_count(const GraphDbTypes::EdgeType &edge_type) const;
int64_t edges_count(const GraphDbTypes::EdgeType &edge_type) const;
/**
* Obtains the Label for the label's name.

View File

@ -202,8 +202,8 @@ class LabelPropertyIndex {
IndexEntry, Vertex>(
std::move(access), start_iter,
[value](const IndexEntry &entry) {
return !IndexEntry::Cmp(value, entry.value_) &&
!IndexEntry::Cmp(entry.value_, value);
return !IndexEntry::Less(value, entry.value_) &&
!IndexEntry::Less(entry.value_, value);
},
t,
[key](const IndexEntry &entry, const Vertex *const vertex) {
@ -232,13 +232,40 @@ class LabelPropertyIndex {
* @param key - key to query for.
* @return number of items
*/
size_t Count(const Key &key) {
int64_t Count(const Key &key) {
auto index = GetKeyStorage(key);
permanent_assert(index != nullptr, "Index doesn't exist.");
debug_assert(ready_for_use_.access().contains(key), "Index not yet ready.");
return index->access().size();
}
/**
* Returns the approximate position and count of the given value in the
* index for the given Key.
*
* Both are approximations for several reasons. Initially the position
* and count are obtained from the skipist (the index) and as such are
* not exact for perfromance reasons. At the same time the position
* and count are calculated based on property value comparison: an
* additional error is accumulated because the index could contain
* the same vertex with the same value multiple times,
* as well as the same vertex with different values.
*/
auto PositionAndCount(const Key &key, const PropertyValue &value) {
auto access = GetKeyStorage(key)->access();
return access.position_and_count(
value,
// the 'less' function
[](const PropertyValue &a, const IndexEntry &b) {
return IndexEntry::Less(a, b.value_);
},
// the 'equal_to' function
[](const PropertyValue &a, const IndexEntry &b) {
return !(IndexEntry::Less(a, b.value_) ||
IndexEntry::Less(b.value_, a));
});
}
/**
* @brief - Removes from the index all entries for which records don't contain
* the given label anymore, or the record was deleted before this transaction
@ -249,7 +276,8 @@ class LabelPropertyIndex {
*/
void Refresh(const tx::Snapshot &snapshot, tx::Engine &engine) {
return IndexUtils::Refresh<Key, IndexEntry, Vertex>(
indices_, snapshot, engine, [](const Key &key, const IndexEntry &entry) {
indices_, snapshot, engine,
[](const Key &key, const IndexEntry &entry) {
return LabelPropertyIndex::Exists(key, entry.value_, entry.record_);
});
}
@ -270,8 +298,8 @@ class LabelPropertyIndex {
// Comparision operators - we need them to keep this sorted inside
// skiplist.
bool operator<(const IndexEntry &other) const {
bool this_value_smaller = Cmp(this->value_, other.value_);
if (this_value_smaller || Cmp(other.value_, this->value_))
bool this_value_smaller = Less(this->value_, other.value_);
if (this_value_smaller || Less(other.value_, this->value_))
return this_value_smaller;
if (this->vlist_ != other.vlist_) return this->vlist_ < other.vlist_;
return this->record_ < other.record_;
@ -288,7 +316,7 @@ class LabelPropertyIndex {
* @return true if the first property value is smaller( should be before)
* than the second one
*/
static bool Cmp(const PropertyValue &a, const PropertyValue &b) {
static bool Less(const PropertyValue &a, const PropertyValue &b) {
if (a.type() != b.type() &&
!(IsCastableToDouble(a) && IsCastableToDouble(b)))
return a.type() < b.type();
@ -310,7 +338,7 @@ class LabelPropertyIndex {
auto vb = b.Value<std::vector<PropertyValue>>();
if (va.size() != vb.size()) return va.size() < vb.size();
return lexicographical_compare(va.begin(), va.end(), vb.begin(),
vb.end(), Cmp);
vb.end(), Less);
}
default:
permanent_fail("Unimplemented type operator.");
@ -350,8 +378,8 @@ class LabelPropertyIndex {
*/
bool IsAlreadyChecked(const IndexEntry &previous) const {
return previous.vlist_ == this->vlist_ &&
!Cmp(previous.value_, this->value_) &&
!Cmp(this->value_, previous.value_);
!Less(previous.value_, this->value_) &&
!Less(this->value_, previous.value_);
}
const PropertyValue value_;
@ -406,7 +434,7 @@ class LabelPropertyIndex {
// Property doesn't exists.
if (prop.type() == PropertyValue::Type::Null) return false;
// Property value is the same as expected.
return !IndexEntry::Cmp(prop, value) && !IndexEntry::Cmp(value, prop);
return !IndexEntry::Less(prop, value) && !IndexEntry::Less(value, prop);
}
ConcurrentMap<Key, SkipList<IndexEntry> *> indices_;

View File

@ -26,10 +26,34 @@ class Bound {
const auto &value() const { return value_; }
/** Whether the bound is inclusive or exclusive. */
auto type() const { return type_; }
auto IsInclusive() const { return type_ == BoundType::INCLUSIVE; }
auto IsExclusive() const { return type_ == BoundType::EXCLUSIVE; }
private:
TValue value_;
Type type_;
};
/**
 * Creates an inclusive @c Bound.
 *
 * @param value - Bound value
 * @tparam TValue - value type
 * @return @c Bound constructed from `value` and BoundType::INCLUSIVE
 */
template <typename TValue>
Bound<TValue> MakeBoundInclusive(TValue value) {
  return Bound<TValue>(value, BoundType::INCLUSIVE);
}
/**
 * Creates an exclusive @c Bound.
 *
 * @param value - Bound value
 * @tparam TValue - value type
 * @return @c Bound constructed from `value` and BoundType::EXCLUSIVE
 */
template <typename TValue>
Bound<TValue> MakeBoundExclusive(TValue value) {
  return Bound<TValue>(value, BoundType::EXCLUSIVE);
}
} // namespace utils

View File

@ -38,15 +38,19 @@ std::unique_ptr<SkipList<int>> make_sl(int size) {
*
* @param size - size of the skiplist to test with
* @param iterations - number of iterations of each test.
* @param granulation - How many sequential ints should be
* @param granularity - How many sequential ints should be
* considered equal in testing by the custom `less`
* function.
*/
void test(int size, int iterations = 20, int granulation = 1) {
auto less = [granulation](const int &a, const int &b) {
return a / granulation < b / granulation;
void test(int size, int iterations = 20, int granularity = 1) {
auto less = [granularity](const int &a, const int &b) {
return a / granularity < b / granularity;
};
log("\nTesting skiplist size {} with granulation {}", size, granulation);
auto equal = [granularity](const int &a, const int &b) {
return a / granularity == b / granularity;
};
log("\nTesting skiplist size {} with granularity {}", size, granularity);
// test at 1/4, 1/2 and 3/4 points
std::vector<int> test_positions({size / 4, size / 2, size * 3 / 4});
@ -60,7 +64,7 @@ void test(int size, int iterations = 20, int granulation = 1) {
for (auto pos : {0, 1, 2}) {
clock_t start_time = clock();
auto pos_and_count =
sl->access().position_and_count(test_positions[pos], less);
sl->access().position_and_count(test_positions[pos], less, equal);
auto t = double(clock() - start_time) / CLOCKS_PER_SEC;
position[pos].push_back(pos_and_count.first);
@ -77,7 +81,7 @@ void test(int size, int iterations = 20, int granulation = 1) {
position_elem = std::abs(position_elem - test_position);
log("\t\tMean position error: {}", mean(position[pos_index]));
for (auto &count_elem : count[pos_index])
count_elem = std::abs(count_elem - granulation);
count_elem = std::abs(count_elem - granularity);
log("\t\tMean count error: {}", mean(count[pos_index]));
log("\t\tMean time (ms): {}", mean(time[pos_index]) * 1000);
}
@ -91,9 +95,9 @@ int main(int argc, char *argv[]) {
if (argc > 1) size = (int)std::stoi(argv[1]);
if (argc > 2) iterations = (int)std::stoi(argv[2]);
std::vector<int> granulations;
for (int i = 1; i < size; i *= 100) granulations.push_back(i);
for (auto granulation : granulations) test(size, iterations, granulation);
std::vector<int> granularitys;
for (int i = 1; i < size; i *= 100) granularitys.push_back(i);
for (auto granularity : granularitys) test(size, iterations, granularity);
return 0;
}

View File

@ -1,3 +1,4 @@
#include <experimental/optional>
#include <memory>
#include <gmock/gmock.h>
@ -6,6 +7,7 @@
#include "data_structures/ptr_int.hpp"
#include "database/graph_db_accessor.hpp"
#include "dbms/dbms.hpp"
#include "utils/bound.hpp"
using testing::UnorderedElementsAreArray;
@ -82,6 +84,67 @@ TEST(GraphDbAccessor, VertexByLabelPropertyCount) {
EXPECT_EQ(dba->vertices_count(), 14 + 15 + 16 + 17);
}
// Expects that `x` is within +/- 2 of `center` (both ends inclusive).
// Both arguments are parenthesized so that compound expressions passed by
// callers keep their meaning after expansion. The macro does not end in a
// semicolon; the call site supplies it.
#define EXPECT_WITH_MARGIN(x, center) \
  EXPECT_THAT((x), testing::AllOf(testing::Ge((center) - 2), \
                                  testing::Le((center) + 2)))
// Checks the cardinality estimates of GraphDbAccessor::vertices_count for an
// exact property value and for value ranges on a (label, property) index.
TEST(GraphDbAccessor, VertexByLabelPropertyValueCount) {
Dbms dbms;
auto dba = dbms.active();
auto label = dba->label("label");
auto property = dba->property("property");
dba->BuildIndex(label, property);
// add some vertices that have neither the label nor the property
for (int i = 0; i < 20; i++) dba->insert_vertex();
// add vertices with prop values in [0, 30), ten vertices for each value
for (int i = 0; i < 300; i++) {
auto vertex = dba->insert_vertex();
vertex.add_label(label);
vertex.PropsSet(property, i / 10);
}
// add vertices in the [30, 40) range, 100 vertices for each value
for (int i = 0; i < 1000; i++) {
auto vertex = dba->insert_vertex();
vertex.add_label(label);
vertex.PropsSet(property, 30 + i / 100);
}
// test estimates for exact value count
EXPECT_WITH_MARGIN(dba->vertices_count(label, property, 10), 10);
EXPECT_WITH_MARGIN(dba->vertices_count(label, property, 14), 10);
EXPECT_WITH_MARGIN(dba->vertices_count(label, property, 30), 100);
EXPECT_WITH_MARGIN(dba->vertices_count(label, property, 39), 100);
// 40 is outside of the inserted value range
EXPECT_EQ(dba->vertices_count(label, property, 40), 0);
// helper functions: build optional inclusive/exclusive bounds and query the
// range estimate for them
auto Inclusive = [](int64_t value) {
return std::experimental::make_optional(
utils::MakeBoundInclusive(PropertyValue(value)));
};
auto Exclusive = [](int64_t value) {
return std::experimental::make_optional(
utils::MakeBoundExclusive(PropertyValue(value)));
};
auto Count = [&dba, label, property](auto lower, auto upper) {
return dba->vertices_count(label, property, lower, upper);
};
using std::experimental::nullopt;
// NOTE(review): presumably this only dies in builds where debug_assert is
// active - confirm
EXPECT_DEATH(Count(nullopt, nullopt), "bound must be provided");
// upper bound only: values 0..3 (exclusive) or 0..4 (inclusive), ten each
EXPECT_WITH_MARGIN(Count(nullopt, Exclusive(4)), 40);
EXPECT_WITH_MARGIN(Count(nullopt, Inclusive(4)), 50);
// lower bound only: the rest of the ten-per-value range plus all 1000
// vertices of the [30, 40) range
EXPECT_WITH_MARGIN(Count(Exclusive(13), nullopt), 160 + 1000);
EXPECT_WITH_MARGIN(Count(Inclusive(13), nullopt), 170 + 1000);
// both bounds; empty and inverted ranges are expected to yield zero
EXPECT_WITH_MARGIN(Count(Inclusive(13), Exclusive(14)), 10);
EXPECT_WITH_MARGIN(Count(Exclusive(13), Inclusive(14)), 10);
EXPECT_WITH_MARGIN(Count(Exclusive(13), Exclusive(13)), 0);
EXPECT_WITH_MARGIN(Count(Inclusive(20), Exclusive(13)), 0);
}
#undef EXPECT_WITH_MARGIN
TEST(GraphDbAccessor, EdgeByEdgeTypeCount) {
Dbms dbms;
auto dba = dbms.active();
@ -166,7 +229,8 @@ TEST(GraphDbAccessor, FilterLabelPropertySpecificValue) {
i);
}
// Inserts integers, double, lists, booleans into index and check if they are
// Inserts integers, double, lists, booleans into index and check if they
// are
// sorted as they should be sorted.
TEST(GraphDbAccessor, SortedLabelPropertyEntries) {
Dbms dbms;
@ -212,7 +276,8 @@ TEST(GraphDbAccessor, SortedLabelPropertyEntries) {
expected_property_value[20 + 2 * i + 1] = vertex_accessor.PropsAt(property);
}
// lists of ints - insert in reverse to check for comparision between lists.
// lists of ints - insert in reverse to check for comparision between
// lists.
for (int i = 9; i >= 0; --i) {
auto vertex_accessor = dba2->insert_vertex();
vertex_accessor.add_label(label);
@ -333,9 +398,3 @@ TEST(GraphDbAccessor, VisibilityAfterDeletion) {
EXPECT_EQ(Count(dba->vertices(lab, false)), 3);
EXPECT_EQ(Count(dba->vertices(lab, true)), 3);
}
int main(int argc, char **argv) {
google::InitGoogleLogging(argv[0]);
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

View File

@ -39,14 +39,20 @@ auto Less(int granularity) {
};
}
#define EXPECT_ABS_POS_COUNT(granularity, position, expected_position, \
expected_count) \
{ \
auto sl = SkiplistRange(10000); \
auto position_and_count = \
sl->access().position_and_count(position, Less(granularity), 1000, 0); \
EXPECT_EQ(position_and_count.first, expected_position); \
EXPECT_EQ(position_and_count.second, expected_count); \
// Builds an equality predicate for ints: two values are considered equal
// when integer division by `granularity` gives the same quotient for both.
auto Equal(int granularity) {
  auto same_bucket = [granularity](const int &lhs, const int &rhs) {
    const int lhs_bucket = lhs / granularity;
    const int rhs_bucket = rhs / granularity;
    return lhs_bucket == rhs_bucket;
  };
  return same_bucket;
}
#define EXPECT_ABS_POS_COUNT(granularity, position, expected_position, \
expected_count) \
{ \
auto sl = SkiplistRange(10000); \
auto position_and_count = sl->access().position_and_count( \
position, Less(granularity), Equal(granularity), 1000, 0); \
EXPECT_EQ(position_and_count.first, expected_position); \
EXPECT_EQ(position_and_count.second, expected_count); \
}
TEST(SkiplistPosAndCount, AbsoluteAccuracy) {