From 74d0a426abfcea5cca1a0aadd956dc7ae2c8e30e Mon Sep 17 00:00:00 2001 From: Mislav Bradac <mislav.bradac@memgraph.io> Date: Sat, 9 Sep 2017 14:08:59 +0200 Subject: [PATCH] Use \u for utf16 and \U for utf32 Reviewers: buda Reviewed By: buda Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D773 --- CHANGELOG.md | 1 + docs/user_technical/open-cypher.md | 5 +++ src/query/common.cpp | 39 +++++++++++++------ .../frontend/opencypher/grammar/Cypher.g4 | 2 +- tests/unit/cypher_main_visitor.cpp | 8 +++- 5 files changed, 40 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7636c9b1..e26675074 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ * `collect` aggregation now supports Map collection. * Map indexing supported. * `assert` function added. +* Use \u to specify 4 digit codepoint and \U for 8 digit ### Bug Fixes and Other Changes diff --git a/docs/user_technical/open-cypher.md b/docs/user_technical/open-cypher.md index 859ce511c..90ed7c85e 100644 --- a/docs/user_technical/open-cypher.md +++ b/docs/user_technical/open-cypher.md @@ -539,3 +539,8 @@ For example: The above would find the edge `r` which forms a circular connection on a node. This behaviour is not supported in openCypher reference and the query would fail. + +#### Unicode codepoints in string literals + +Use `\u` followed by 4 hex digits in a string literal for a UTF-16 codepoint +and `\U` with 8 hex digits for a UTF-32 codepoint in memgraph. diff --git a/src/query/common.cpp b/src/query/common.cpp index 5d0115a96..49fabe43b 100644 --- a/src/query/common.cpp +++ b/src/query/common.cpp @@ -21,12 +21,11 @@ int64_t ParseIntegerLiteral(const std::string &s) { } std::string ParseStringLiteral(const std::string &s) { - // This function is declared as lambda since its semantics is highly specific - // for this conxtext and shouldn't be used elsewhere. 
- auto EncodeEscapedUnicodeCodepoint = [](const std::string &s, int &i) { - int j = i + 1; - const int kShortUnicodeLength = 4; + // These functions are declared as lambdas since their semantics are highly + // specific to this context and shouldn't be used elsewhere. + auto EncodeEscapedUnicodeCodepointUtf32 = [](const std::string &s, int &i) { const int kLongUnicodeLength = 8; + int j = i + 1; while (j < static_cast<int>(s.size()) - 1 && j < i + kLongUnicodeLength + 1 && isxdigit(s[j])) { ++j; @@ -36,7 +35,19 @@ std::string ParseStringLiteral(const std::string &s) { i += kLongUnicodeLength; std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; return converter.to_bytes(t); - } else if (j - i >= kShortUnicodeLength + 1) { + } + throw SyntaxException( + "Expected 8 hex digits as unicode codepoint started with \\U. " + "Use \\u for 4 hex digits format."); + }; + auto EncodeEscapedUnicodeCodepointUtf16 = [](const std::string &s, int &i) { + const int kShortUnicodeLength = 4; + int j = i + 1; + while (j < static_cast<int>(s.size()) - 1 && + j < i + kShortUnicodeLength + 1 && isxdigit(s[j])) { + ++j; + } + if (j - i >= kShortUnicodeLength + 1) { char16_t t = stoi(s.substr(i + 1, kShortUnicodeLength), 0, 16); if (t >= 0xD800 && t <= 0xDBFF) { // t is high surrogate pair. Expect one more utf16 codepoint. @@ -72,12 +83,10 @@ std::string ParseStringLiteral(const std::string &s) { converter; return converter.to_bytes(t); } - } else { - // This should never happen, except grammar changes and we don't notice - // change in this production. - debug_assert(false, "can't happen"); - throw std::exception(); } + throw SyntaxException( + "Expected 4 hex digits as unicode codepoint started with \\u. 
" + "Use \\U for 8 hex digits format."); }; std::string unescaped; @@ -117,9 +126,15 @@ std::string ParseStringLiteral(const std::string &s) { unescaped += '\t'; break; case 'U': + try { + unescaped += EncodeEscapedUnicodeCodepointUtf32(s, i); + } catch (const std::range_error &) { + throw SemanticException("Invalid utf codepoint"); + } + break; case 'u': try { - unescaped += EncodeEscapedUnicodeCodepoint(s, i); + unescaped += EncodeEscapedUnicodeCodepointUtf16(s, i); } catch (const std::range_error &) { throw SemanticException("Invalid utf codepoint"); } diff --git a/src/query/frontend/opencypher/grammar/Cypher.g4 b/src/query/frontend/opencypher/grammar/Cypher.g4 index 192ac6c8e..ce1c513c0 100644 --- a/src/query/frontend/opencypher/grammar/Cypher.g4 +++ b/src/query/frontend/opencypher/grammar/Cypher.g4 @@ -250,7 +250,7 @@ StringLiteral : ( '"' ( StringLiteral_0 | EscapedChar )* '"' ) | ( '\'' ( StringLiteral_1 | EscapedChar )* '\'' ) ; -EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit ) ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ; +EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( 'u' ( HexDigit HexDigit HexDigit HexDigit ) ) | ( 'U' ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ; numberLiteral : doubleLiteral | integerLiteral diff --git a/tests/unit/cypher_main_visitor.cpp b/tests/unit/cypher_main_visitor.cpp index f9f8994b1..6ec4dbd60 100644 --- a/tests/unit/cypher_main_visitor.cpp +++ b/tests/unit/cypher_main_visitor.cpp @@ -743,7 +743,7 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedChars) { } TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) { - TypeParam ast_generator("RETURN '\\u221daaa\\U221daaa'"); + TypeParam ast_generator("RETURN 
'\\u221daaa\\u221daaa'"); auto *query = ast_generator.query_; auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]); auto *literal = dynamic_cast<PrimitiveLiteral *>( @@ -753,8 +753,12 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) { EXPECT_EQ(literal->token_position_, 2); } +TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16Error) { + ASSERT_THROW(TypeParam("RETURN '\\U221daaa'"), SyntaxException); +} + TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf32) { - TypeParam ast_generator("RETURN '\\u0001F600aaaa\\U0001F600aaaaaaaa'"); + TypeParam ast_generator("RETURN '\\U0001F600aaaa\\U0001F600aaaaaaaa'"); auto *query = ast_generator.query_; auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]); auto *literal = dynamic_cast<PrimitiveLiteral *>(