Replace map with bitset in stripped query

Summary:
Implement trie and use it in stripper

Make it nicer

Reviewers: buda, florijan

Reviewed By: florijan

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D614
This commit is contained in:
Mislav Bradac 2017-08-02 19:12:47 +02:00
parent 57dea09b5b
commit 6bc9deba5f
4 changed files with 2797 additions and 16523 deletions

View File

@ -76,7 +76,7 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
// Convert tokens to strings, perform lowercasing and filtering, store
// literals and nonaliased named expressions in return.
for (int i = 0; i < static_cast<int>(tokens.size()); ++i) {
const auto &token = tokens[i];
auto &token = tokens[i];
// Position is calculated in query after stripping and whitespace
// normalisation, not before. There will be twice as many tokens before
// this one because space tokens will be inserted between every one we also
@ -87,7 +87,8 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
case Token::UNMATCHED:
debug_assert(false, "Shouldn't happen");
case Token::KEYWORD: {
auto s = utils::ToLowerCase(token.second);
token.second = utils::ToLowerCase(token.second);
const auto &s = token.second;
// We don't strip NULL, since it can appear in special expressions
// like IS NULL and IS NOT NULL, but we strip true and false keywords.
if (s == "true") {
@ -134,7 +135,7 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
// Store nonaliased named expressions in returns in named_exprs_.
auto it = std::find_if(tokens.begin(), tokens.end(),
[](const std::pair<Token, std::string> &a) {
return utils::ToLowerCase(a.second) == "return";
return a.second == "return";
});
// There is no RETURN so there is nothing to do here.
if (it == tokens.end()) return;
@ -146,7 +147,7 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
while (it != tokens.end() && it->first == Token::SPACE) {
++it;
}
if (it != tokens.end() && utils::ToLowerCase(it->second) == "distinct") {
if (it != tokens.end() && it->second == "distinct") {
++it;
}
@ -170,11 +171,10 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
// list literal / function call.
int num_open_braces = 0;
int num_open_parantheses = 0;
for (; jt != tokens.end() &&
(jt->second != "," || num_open_braces || num_open_parantheses) &&
utils::ToLowerCase(jt->second) != "order" &&
utils::ToLowerCase(jt->second) != "skip" &&
utils::ToLowerCase(jt->second) != "limit";
for (;
jt != tokens.end() &&
(jt->second != "," || num_open_braces || num_open_parantheses) &&
jt->second != "order" && jt->second != "skip" && jt->second != "limit";
++jt) {
if (jt->second == "(") {
++num_open_parantheses;
@ -185,7 +185,7 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
} else if (jt->second == "]") {
--num_open_braces;
}
has_as |= utils::ToLowerCase(jt->second) == "as";
has_as |= jt->second == "as";
if (jt->first != Token::SPACE) {
last_non_space = jt;
}
@ -209,7 +209,7 @@ StrippedQuery::StrippedQuery(const std::string &query) : original_(query) {
}
}
std::string StrippedQuery::GetFirstUtf8Symbol(const char *_s) const {
std::string GetFirstUtf8Symbol(const char *_s) {
// According to
// https://stackoverflow.com/questions/16260033/reinterpret-cast-between-char-and-stduint8-t-safe
// this checks if casting from const char * to uint8_t is undefined behaviour.
@ -242,6 +242,39 @@ std::string StrippedQuery::GetFirstUtf8Symbol(const char *_s) const {
throw LexingException("Invalid character");
}
// Decode the first UTF-8 encoded symbol found at `_s`.
//
// Returns a pair {codepoint, length}, where `length` is the number of bytes
// (1 to 4) that the encoded symbol occupies.
//
// Throws LexingException("Invalid character") if:
//   * the lead byte is not a valid UTF-8 lead byte,
//   * any expected continuation byte is not of the form 10xxxxxx,
//   * the sequence is an overlong encoding (a codepoint encoded with more
//     bytes than necessary),
//   * the decoded value is a UTF-16 surrogate (U+D800..U+DFFF) or is greater
//     than U+10FFFF.
//
// Assumes `_s` is NUL-terminated (callers in this file pass pointers into a
// std::string buffer). The NUL terminator is not a valid continuation byte,
// so a truncated sequence is rejected before any byte past the terminator is
// read.
std::pair<int, int> GetFirstUtf8SymbolCodepoint(const char *_s) {
  static_assert(std::is_same<std::uint8_t, unsigned char>::value,
                "This library requires std::uint8_t to be implemented as "
                "unsigned char.");
  const uint8_t *s = reinterpret_cast<const uint8_t *>(_s);

  // Single byte (ASCII) symbol: 0xxxxxxx.
  if ((*s >> 7) == 0x00) return {*s & 0x7f, 1};

  int length = 0;
  int codepoint = 0;
  // Smallest codepoint that legitimately needs `length` bytes; anything
  // smaller is an overlong encoding.
  int min_codepoint = 0;
  if ((*s >> 5) == 0x06) {
    // 110xxxxx 10xxxxxx
    length = 2;
    codepoint = *s & 0x1f;
    min_codepoint = 0x80;
  } else if ((*s >> 4) == 0x0e) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    length = 3;
    codepoint = *s & 0x0f;
    min_codepoint = 0x800;
  } else if ((*s >> 3) == 0x1e) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    length = 4;
    codepoint = *s & 0x07;
    min_codepoint = 0x10000;
  } else {
    throw LexingException("Invalid character");
  }

  // Bytes are validated strictly left to right, so reading stops at the first
  // byte that is not a continuation byte (including the NUL terminator).
  for (int i = 1; i < length; ++i) {
    if ((s[i] >> 6) != 0x02) throw LexingException("Invalid character");
    codepoint = (codepoint << 6) | (s[i] & 0x3f);
  }

  const bool is_overlong = codepoint < min_codepoint;
  const bool is_surrogate = codepoint >= 0xd800 && codepoint <= 0xdfff;
  const bool is_out_of_range = codepoint > 0x10ffff;
  if (is_overlong || is_surrogate || is_out_of_range) {
    throw LexingException("Invalid character");
  }
  return {codepoint, length};
}
// From here until end of file there are functions that calculate matches for
// every possible token. Functions are more or less compatible with Cypher.g4
// grammar. Unfortunately, they contain a lot of special cases and shouldn't
@ -257,30 +290,11 @@ std::string StrippedQuery::GetFirstUtf8Symbol(const char *_s) const {
// , _.'/ `\<-- \<
// `^^^` ^^ ^^
int StrippedQuery::MatchKeyword(int start) const {
int match = 0;
for (const auto &s : kKeywords) {
int len = s.size();
if (len < match) continue;
if (start + len > static_cast<int>(original_.size())) continue;
int i = 0;
while (i < len && s[i] == tolower(original_[start + i])) {
++i;
}
if (i == len) {
match = len;
}
}
return match;
return kKeywords.Match<tolower>(original_.c_str() + start);
}
int StrippedQuery::MatchSpecial(int start) const {
int match = 0;
for (const auto &s : kSpecialTokens) {
if (!original_.compare(start, s.size(), s)) {
match = std::max(match, static_cast<int>(s.size()));
}
}
return match;
return kSpecialTokens.Match(original_.c_str() + start);
}
int StrippedQuery::MatchString(int start) const {
@ -413,13 +427,19 @@ int StrippedQuery::MatchEscapedName(int start) const {
int StrippedQuery::MatchUnescapedName(int start) const {
auto i = start;
auto s = GetFirstUtf8Symbol(original_.data() + i);
if (!kUnescapedNameAllowedStarts.count(s)) return 0;
i += s.size();
auto got = GetFirstUtf8SymbolCodepoint(original_.data() + i);
if (got.first >= lexer_constants::kBitsetSize ||
!kUnescapedNameAllowedStarts[got.first]) {
return 0;
}
i += got.second;
while (i < static_cast<int>(original_.size())) {
s = GetFirstUtf8Symbol(original_.data() + i);
if (!kUnescapedNameAllowedParts.count(s)) break;
i += s.size();
got = GetFirstUtf8SymbolCodepoint(original_.data() + i);
if (got.first >= lexer_constants::kBitsetSize ||
!kUnescapedNameAllowedParts[got.first]) {
break;
}
i += got.second;
}
return i - start;
}
@ -434,9 +454,9 @@ int StrippedQuery::MatchWhitespaceAndComments(int start) const {
int comment_position = -1;
while (i < len) {
if (state == State::OUT) {
auto s = GetFirstUtf8Symbol(original_.data() + i);
if (kSpaceParts.count(s)) {
i += s.size();
auto got = GetFirstUtf8SymbolCodepoint(original_.data() + i);
if (got.first < lexer_constants::kBitsetSize && kSpaceParts[got.first]) {
i += got.second;
} else if (i + 1 < len && original_[i] == '/' &&
original_[i + 1] == '*') {
comment_position = i;

View File

@ -54,8 +54,6 @@ class StrippedQuery {
HashType hash() const { return hash_; }
private:
std::string GetFirstUtf8Symbol(const char *s) const;
// Return len of matched keyword if something is matched, otherwise 0.
int MatchKeyword(int start) const;
int MatchString(int start) const;

File diff suppressed because it is too large Load Diff

View File

@ -19,3 +19,4 @@ MATCH (n:X {foo: 'A'}) SET n = {foo: 'B', baz: 'C'} RETURN n
MATCH (n:X {foo: 'A'}) SET n += {foo: null} RETURN n
MATCH (n) WITH n LIMIT toInteger(ceil(1.7)) RETURN count(*) AS count
MATCH (a:A), (b:B) MERGE (a)-[r:TYPE]->(b) ON CREATE SET r.name = 'Lola' RETURN count(r)
CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, p3: "Here is some text that is not extremely short", p4:"Short text", p5: 234.434, p6: 11.11, p7: false})