2022-09-07 18:15:32 +03:00

2926 lines
251 KiB

// Copyright 2022 Memgraph Ltd.
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#pragma once
#include <bitset>
#include <initializer_list>
#include <string>
#include <unordered_set>
#include <vector>
namespace parser {
namespace lexer_constants {
namespace trie {
// Trie data structure implemented to be used in StrippedQuery. If you want to
// change it please rerun benchmark/stripped to be sure that performance of
// StrippedQuery is not degraded by the change. Also there are no tests that
// directly test this class, but there are tests that test StrippedQuery.
namespace detail {
/// Identity character mapping; the default `Map` argument of Trie::Match.
inline int Noop(int x) { return x; }
}  // namespace detail

/// Byte-wise trie used for longest-prefix matching of a fixed set of strings.
///
/// Nodes are stored contiguously in a vector and refer to each other by index.
/// Index 0 is the root, and because no edge can ever lead back to the root, the
/// value 0 in a `next` slot doubles as the "no edge" marker.
class Trie {
 public:
  /// Creates a trie that contains no keys.
  Trie() = default;

  /// Creates a trie that contains every string in `l`.
  /// Intentionally not `explicit`, so `const Trie t = {"a", "b"};` works.
  Trie(std::initializer_list<std::string> l) {
    for (const auto &s : l) {
      Insert(s);
    }
  }

  /// Adds `s` to the set of keys. Keys are treated as raw byte sequences, so
  /// multi-byte UTF-8 strings are stored one byte per trie level.
  ///
  /// Inserting the empty string only flags the root node, which Match never
  /// inspects, so an empty key is never reported as a match.
  void Insert(const std::string &s) {
    int node_id = kRootIndex;
    for (const char ch : s) {
      // Index by the unsigned byte value so bytes >= 0x80 do not go negative.
      const auto c = static_cast<unsigned char>(ch);
      int &next_node_id = nodes_[node_id].next[c];
      if (next_node_id == 0) {
        next_node_id = static_cast<int>(nodes_.size());
        // First assign then emplace_back because after emplace_back reference
        // could be invalid.
        node_id = next_node_id;
        nodes_.emplace_back();
      } else {
        node_id = next_node_id;
      }
    }
    nodes_[node_id].finish = true;
  }

  /// Returns the length, in bytes, of the longest key that is a prefix of the
  /// NUL-terminated string `s`, or 0 when no key is a prefix of `s`.
  ///
  /// @tparam Map function applied to every input byte (passed as its unsigned
  ///         value) before the lookup. Its result is used directly as an index
  ///         into a 256-entry table, so it must be in the range [0, 255].
  template <int (*Map)(int c) = detail::Noop>
  int Match(const char *s) const {
    int node_id = kRootIndex;
    int longest_found_len = 0;
    int i = 1;  // Number of bytes consumed once the current byte is accepted.
    for (const char *p = s; *p; ++p, ++i) {
      const auto c = static_cast<unsigned char>(*p);
      node_id = nodes_[node_id].next[Map(c)];
      if (node_id == 0) break;  // No key continues with this byte.
      if (nodes_[node_id].finish) {
        longest_found_len = i;
      }
    }
    return longest_found_len;
  }

 private:
  struct Node {
    // Child node index for every possible byte value; 0 means "no child".
    int next[1 << (sizeof(unsigned char) * 8)] = {};
    // True when the path from the root to this node spells an inserted key.
    bool finish = false;
  };

  static constexpr int kRootIndex = 0;

  // Starts with exactly one node: the root.
  std::vector<Node> nodes_ = std::vector<Node>(1);
};
} // namespace trie
// All word constants should be lowercase in this file.
// Number of bits in each std::bitset<kBitsetSize> table in this file
// (2^16 = 65536).
constexpr int kBitsetSize = 1 << 16;
const trie::Trie kKeywords = {"union",
// Unicode codepoints that are allowed at the start of the unescaped name.
const std::bitset<kBitsetSize> kUnescapedNameAllowedStarts(
// Unicode codepoints that are allowed at the middle of the unescaped name.
const std::bitset<kBitsetSize> kUnescapedNameAllowedParts(
const std::bitset<kBitsetSize> kSpaceParts(
// Trie of special (non-word) tokens. Non-ASCII entries are spelled as explicit
// UTF-8 byte sequences; the trailing comment on each line gives the code point
// that the bytes encode.
const trie::Trie kSpecialTokens = {";",
// Left angle bracket / less-than variants (U+27E8, U+3008, U+FE64, U+FF1C).
// NOTE(review): presumably accepted as alternatives to '<' - confirm against
// the grammar.
"\xE2\x9F\xA8", // u8"\u27e8"
"\xE3\x80\x88", // u8"\u3008"
"\xEF\xB9\xA4", // u8"\ufe64"
"\xEF\xBC\x9C", // u8"\uff1c"
// Right angle bracket / greater-than variants (U+27E9, U+3009, U+FE65,
// U+FF1E).
"\xE2\x9F\xA9", // u8"\u27e9"
"\xE3\x80\x89", // u8"\u3009"
"\xEF\xB9\xA5", // u8"\ufe65"
"\xEF\xBC\x9E", // u8"\uff1e"
// Hyphen, dash and minus variants (soft hyphen U+00AD, U+2010..U+2015,
// minus sign U+2212, small/fullwidth forms U+FE58, U+FE63, U+FF0D).
// NOTE(review): presumably accepted as alternatives to '-' - confirm against
// the grammar.
"\xC2\xAD", // u8"\u00ad"
"\xE2\x80\x90", // u8"\u2010"
"\xE2\x80\x91", // u8"\u2011"
"\xE2\x80\x92", // u8"\u2012"
"\xE2\x80\x93", // u8"\u2013"
"\xE2\x80\x94", // u8"\u2014"
"\xE2\x80\x95", // u8"\u2015"
"\xE2\x88\x92", // u8"\u2212"
"\xEF\xB9\x98", // u8"\ufe58"
"\xEF\xB9\xA3", // u8"\ufe63"
"\xEF\xBC\x8D"}; // u8"\uff0d"
} // namespace lexer_constants
} // namespace parser