#pragma once

#include <algorithm>
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>  // not used below; kept so includers that rely on it still build
#include <utility>
#include <vector>

/*
  Implementation of a generic Bloom Filter.

  Read more about bloom filters here:
    http://en.wikipedia.org/wiki/Bloom_filter
    http://www.jasondavies.com/bloomfilter/
*/

/*
  Type       - the type of data stored
  BucketSize - number of bits in the filter, must be positive

  Every hash function yields a 64-bit hash; the compression function turns
  that hash into a bit position. insert() sets one bit per hash function and
  contains() reports whether all of those bits are set, so it can answer true
  for data that was never inserted (bits shared with other entries) but never
  answers false for data that was inserted.

  Copies are independent: the default compression is a static function, so a
  copied filter holds no pointer back to the object it was copied from.

  There is no internal synchronisation. contains() only reads the filter,
  insert() writes the bitset without locking.
*/
template <class Type, int BucketSize = 8>
class BloomFilter {
  static_assert(BucketSize > 0, "BloomFilter needs at least one bucket");

 private:
  using HashFunction = std::function<uint64_t(const Type&)>;
  using CompresionFunction = std::function<int(uint64_t)>;

  std::bitset<BucketSize> filter_;
  std::vector<HashFunction> hashes_;
  CompresionFunction compression_;

  // Default hash -> bucket mapping. Static on purpose: it needs no object
  // state, and binding `this` into compression_ would leave copies of the
  // filter calling through a pointer to the original object.
  static int default_compression(uint64_t hash) {
    return static_cast<int>(hash % BucketSize);
  }

  // Bit position for `data` under one hash function. The compressed value is
  // reduced modulo BucketSize because std::bitset::operator[] performs no
  // bounds checking; values already inside [0, BucketSize) are unchanged.
  std::size_t bucket_of(const HashFunction& hash, const Type& data) const {
    const auto compressed = static_cast<std::size_t>(compression_(hash(data)));
    return compressed % static_cast<std::size_t>(BucketSize);
  }

 public:
  /*
    funcs       - hash functions applied to every element
    compression - optional mapping from a 64-bit hash to a bucket; when it is
                  empty the hash modulo BucketSize is used

    With an empty `funcs` no bit is ever examined, so contains() returns true
    for everything.
  */
  BloomFilter(std::vector<HashFunction> funcs,
              CompresionFunction compression = {})
      : hashes_(std::move(funcs)) {
    if (compression)
      compression_ = std::move(compression);
    else
      compression_ = &BloomFilter::default_compression;
  }

  // Returns true when every bit selected by the hash functions is set, i.e.
  // `data` may have been inserted; false means it definitely was not.
  bool contains(const Type& data) const {
    return std::all_of(hashes_.begin(), hashes_.end(),
                       [&](const HashFunction& hash) {
                         return filter_[bucket_of(hash, data)];
                       });
  }

  // Sets the bit selected by each hash function for `data`.
  void insert(const Type& data) {
    for (const auto& hash : hashes_) filter_.set(bucket_of(hash, data));
  }
};
b/include/data_structures/concurrent/concurrent_bloom_map.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include "data_structures/concurrent/common.hpp" +#include "data_structures/concurrent/skiplist.hpp" +#include "data_structures/concurrent/concurrent_map.hpp" + + +using std::pair; + /* ConcurrentBloomMap keeps a BloomFilter next to a ConcurrentMap. insert() records the key in the filter and then inserts into the map; contains() returns false without consulting the map when the filter does not contain the key. */+template +class ConcurrentBloomMap { + using item_t = Item; + using list_it = typename SkipList::Iterator; + + private: + ConcurrentMap map_; + BloomFilter filter_; + + public: /* Stores a copy of the given filter. NOTE(review): filter is received by value and then copied once more into filter_. */+ ConcurrentBloomMap(BloomFilter filter) : filter_(filter) {} + /* Adds key to the filter, then inserts (key, data) into the map through an accessor and returns what accessor.insert returns. NOTE(review): the std::move around map_.access() looks redundant if access() returns by value; confirm. */+ std::pair insert(const Key &key, const Value &data) { + filter_.insert(key); + + auto accessor = std::move(map_.access()); + + return accessor.insert(key, data); + } + /* Returns false as soon as the filter reports the key as absent; otherwise returns the answer of the map accessor. */+ bool contains(const Key &key) { + if (!filter_.contains(key)) return false; + + auto accessor = map_.access(); + return accessor.contains(key); + } +}; diff --git a/tests/benchmark/data_structures/bloom/basic_bloom_filter.cpp b/tests/benchmark/data_structures/bloom/basic_bloom_filter.cpp new file mode 100644 index 000000000..36a74506d --- /dev/null +++ b/tests/benchmark/data_structures/bloom/basic_bloom_filter.cpp @@ -0,0 +1,59 @@ +#include +#include + +#include "data_structures/bloom/bloom_filter.hpp" +#include "logging/default.hpp" +#include "logging/streams/stdout.hpp" +#include "utils/command_line/arguments.hpp" +#include "utils/hashing/fnv64.hpp" +#include "utils/random/generator.h" + +#include "benchmark/benchmark_api.h" + +using utils::random::StringGenerator; +using StringHashFunction = std::function; + /* Benchmark body: on every iteration walks the indices below state.range(0); odd indices call contains() and even indices call insert() with elements[index]. state.range(0) is then reported as the complexity N. */+template +static void TestBloom(benchmark::State& state, BloomFilter* +bloom, const std::vector& elements) { + while(state.KeepRunning()) { + for (int start = 0; start < state.range(0); start++) + if (start % 2) bloom->contains(elements[start]); + else bloom->insert(elements[start]); + } + state.SetComplexityN(state.range(0)); +} + /* Generic lambda that forwards its arguments to TestBloom. */+auto BM_Bloom = [](benchmark::State& state, auto* bloom, const auto& elements) { + TestBloom(state, bloom, elements); +}; + /* Placeholder: the body is empty, so argc and argv are ignored. */+void parse_args(int 
argc, char** argv) {} + /* Benchmark driver: sets up logging, generates 1 << 16 elements from a StringGenerator constructed with 4, builds one filter from fnv64 and fnv1a64, and registers a single benchmark over the range 1 to 1 << 16 with multiplier 2. */+int main(int argc, char** argv) { + logging::init_async(); + logging::log->pipe(std::make_unique()); + + parse_args(argc, argv); + + StringGenerator generator(4); + + auto elements = utils::random::generate_vector(generator, 1 << 16); + + StringHashFunction hash1 = fnv64; + StringHashFunction hash2 = fnv1a64; + std::vector funcs = { + hash1, hash2 + }; + + BloomFilter bloom(funcs); + + benchmark::RegisterBenchmark("SimpleBloomFilter Benchmark Test", BM_Bloom, + &bloom, elements) + ->RangeMultiplier(2) + ->Range(1, 1 << 16) + ->Complexity(benchmark::oN); + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/tests/benchmark/data_structures/concurrent/concurrent_bloom_map.cpp b/tests/benchmark/data_structures/concurrent/concurrent_bloom_map.cpp new file mode 100644 index 000000000..f305d8b20 --- /dev/null +++ b/tests/benchmark/data_structures/concurrent/concurrent_bloom_map.cpp @@ -0,0 +1,186 @@ +#include +#include + +#include "data_structures/bloom/bloom_filter.hpp" +#include "data_structures/concurrent/concurrent_bloom_map.hpp" +#include "logging/default.hpp" +#include "logging/streams/stdout.hpp" +#include "utils/command_line/arguments.hpp" +#include "utils/hashing/fnv64.hpp" +#include "utils/random/generator.h" + +#include "benchmark/benchmark_api.h" + +/* + ConcurrentMap Benchmark Test: + - tests time of Insertion, Contain and Delete operations + + - benchmarking time per operation + + - test run ConcurrentMap with the following keys and values: + - + - + - + - +*/ + +using utils::random::NumberGenerator; +using utils::random::PairGenerator; +using utils::random::StringGenerator; +using StringHashFunction = std::function; + +using IntegerGenerator = NumberGenerator, + std::default_random_engine, int>; + +// Global arguments +int MAX_ELEMENTS = 1 << 18, MULTIPLIER = 2; +int THREADS, RANGE_START, RANGE_END, STRING_LENGTH; + +/* + ConcurrentMap Insertion Benchmark Test +*/ +template +static void 
InsertValue(benchmark::State& state, ConcurrentBloomMap* map, + const std::vector>& elements) { + while (state.KeepRunning()) { + for (int start = 0; start < state.range(0); start++) { + map->insert(elements[start].first, elements[start].second); + } + } + state.SetComplexityN(state.range(0)); +} + +/* + ConcurrentMap Contains Benchmark Test + + NOTE(review): elements is declared by value here, while InsertValue takes a const reference, so the vector is copied whenever this function is called. Confirm whether the copy is intended. +*/ +template +static void ContainsValue(benchmark::State& state, ConcurrentBloomMap* map, + const std::vector> elements) { + while (state.KeepRunning()) { + for (int start = 0; start < state.range(0); start++) { + map->contains(elements[start].first); + } + } + state.SetComplexityN(state.range(0)); +} + /* Generic lambda that forwards its arguments to InsertValue. */+auto BM_InsertValue = [](benchmark::State& state, auto* map, auto& elements) { + InsertValue(state, map, elements); +}; + /* Generic lambda that forwards its arguments to ContainsValue. NOTE(review): elements is an auto by-value parameter, unlike the auto& in BM_InsertValue. */+auto BM_ContainsValue = [](benchmark::State& state, auto* map, auto elements) { + ContainsValue(state, map, elements); +}; + +/* + Commandline Argument Parsing + + Arguments: + * Integer Range Minimum + -start number + + * Integer Range Maximum + -end number + + * Number of threads + -threads number + + * Random String length + -string-length number + + The parsed values are stored in the globals RANGE_START, RANGE_END, THREADS and STRING_LENGTH. + + NOTE(review): -threads is capped with std::min at std::thread::hardware_concurrency(); that call may return 0 when the value is not computable, which would make THREADS 0. Confirm. +*/ +void parse_arguments(int argc, char** argv) { + REGISTER_ARGS(argc, argv); + + RANGE_START = GET_ARG("-start", "0").get_int(); + RANGE_END = GET_ARG("-end", "1000000000").get_int(); + + THREADS = std::min(GET_ARG("-threads", "1").get_int(), + (int)std::thread::hardware_concurrency()); + + STRING_LENGTH = + ProgramArguments::instance().get_arg("-string-length", "128").get_int(); +} + /* Benchmark driver: sets up logging, parses the arguments, builds the generators, one filter and two maps, generates MAX_ELEMENTS pairs per map, and registers the insert and contains benchmarks for both maps. */+int main(int argc, char** argv) { + logging::init_async(); + logging::log->pipe(std::make_unique()); + + parse_arguments(argc, argv); + + StringGenerator sg(STRING_LENGTH); + IntegerGenerator ig(RANGE_START, RANGE_END); + + /* + Creates RandomGenerators, ConcurrentMaps and Random Element Vectors for the + following use cases: + + Map elements contain keys and value for: + , + + + + */ + + // random generators for tests + PairGenerator piig(&ig, &ig); + PairGenerator pssg(&sg, 
&sg); + PairGenerator psig(&sg, &ig); + PairGenerator pisg(&ig, &sg); + + StringHashFunction hash1 = fnv64; + StringHashFunction hash2 = fnv1a64; + std::vector funcs = { + hash1, hash2 + }; + /* A single filter object; both maps below are constructed from it. */+ BloomFilter bloom_filter_(funcs); + + // maps used for testing + //ConcurrentBloomMap ii_map; + //ConcurrentBloomMap is_map; + using Filter = BloomFilter; + ConcurrentBloomMap si_map(bloom_filter_); + ConcurrentBloomMap +ss_map(bloom_filter_); + + // random elements for testing + //auto ii_elems = utils::random::generate_vector(piig, MAX_ELEMENTS); + //auto is_elems = utils::random::generate_vector(pisg, MAX_ELEMENTS); + auto si_elems = utils::random::generate_vector(psig, MAX_ELEMENTS); + auto ss_elems = utils::random::generate_vector(pssg, MAX_ELEMENTS); + + /* insertion Tests */ + benchmark::RegisterBenchmark("InsertValue[String, Int]", BM_InsertValue, + &si_map, si_elems) + ->RangeMultiplier(MULTIPLIER) + ->Range(1, MAX_ELEMENTS) + ->Complexity(benchmark::oN) + ->Threads(THREADS); + + benchmark::RegisterBenchmark("InsertValue[String, String]", BM_InsertValue, + &ss_map, ss_elems) + ->RangeMultiplier(MULTIPLIER) + ->Range(1, MAX_ELEMENTS) + ->Complexity(benchmark::oN) + ->Threads(THREADS); + + // Contains Benchmark Tests + benchmark::RegisterBenchmark("ContainsValue[String, Int]", BM_ContainsValue, + &si_map, si_elems) + ->RangeMultiplier(MULTIPLIER) + ->Range(1, MAX_ELEMENTS) + ->Complexity(benchmark::oN) + ->Threads(THREADS); + + benchmark::RegisterBenchmark("ContainsValue[String, String]", + BM_ContainsValue, &ss_map, ss_elems) + ->RangeMultiplier(MULTIPLIER) + ->Range(1, MAX_ELEMENTS) + ->Complexity(benchmark::oN) + ->Threads(THREADS); + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + + return 0; +} diff --git a/tests/unit/basic_bloom_filter.cpp b/tests/unit/basic_bloom_filter.cpp new file mode 100644 index 000000000..ac4df7fc2 --- /dev/null +++ b/tests/unit/basic_bloom_filter.cpp @@ -0,0 +1,45 @@ +#define CATCH_CONFIG_MAIN +#include 
"catch.hpp" + +#include "utils/command_line/arguments.hpp" +#include "utils/hashing/fnv64.hpp" + +#include "data_structures/bloom/bloom_filter.hpp" + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wwritable-strings" + +using StringHashFunction = std::function; + /* Smoke test: prints both hash values for two strings and the result of contains() before and after each insert. NOTE(review): nothing is asserted in the visible code, so a wrong answer cannot fail the test; the lambda c is defined but not used. */+TEST_CASE("BloomFilter Test") { + StringHashFunction hash1 = fnv64; + StringHashFunction hash2 = fnv1a64; + + auto c = [](auto x) -> int { + return x % 4; + } ; + std::vector funcs = { + hash1, hash2 + }; + + BloomFilter bloom(funcs); + + std::string test = "test"; + std::string kifla = "kifla"; + + std::cout << hash1(test) << std::endl; + std::cout << hash2(test) << std::endl; + + std::cout << hash1(kifla) << std::endl; + std::cout << hash2(kifla) << std::endl; + + std::cout << bloom.contains(test) << std::endl; + bloom.insert(test); + std::cout << bloom.contains(test) << std::endl; + + std::cout << bloom.contains(kifla) << std::endl; + bloom.insert(kifla); + std::cout << bloom.contains(kifla) << std::endl; +} + +#pragma clang diagnostic pop