Use \u for utf16 and \U for utf32

Reviewers: buda Reviewed By: buda Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D773
2017-09-09 14:08:59 +02:00 · 2017-09-09 14:08:59 +02:00 · 74d0a426ab
commit 74d0a426ab
parent 92b9bbd4bd
5 changed files with 40 additions and 15 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,7 @@
 * `collect` aggregation now supports Map collection.
 * Map indexing supported.
 * `assert` function added.
+* Use `\u` to specify a 4 hex digit codepoint and `\U` for an 8 hex digit codepoint in string literals.

 ### Bug Fixes and Other Changes

--- a/docs/user_technical/open-cypher.md
+++ b/docs/user_technical/open-cypher.md
@ -539,3 +539,8 @@ For example:
 The above would find the edge `r` which forms a circular connection on a node.
 This behaviour is not supported in openCypher reference and the query would
 fail.
+
+#### Unicode codepoints in string literal
+
+In Memgraph string literals, use `\u` followed by 4 hex digits for a UTF-16
+codepoint and `\U` followed by 8 hex digits for a UTF-32 codepoint.
--- a/src/query/common.cpp
+++ b/src/query/common.cpp
@ -21,12 +21,11 @@ int64_t ParseIntegerLiteral(const std::string &s) {
 }

 std::string ParseStringLiteral(const std::string &s) {
-  // This function is declared as lambda since its semantics is highly specific
-  // for this conxtext and shouldn't be used elsewhere.
-  auto EncodeEscapedUnicodeCodepoint = [](const std::string &s, int &i) {
-    int j = i + 1;
-    const int kShortUnicodeLength = 4;
+  // These functions are declared as lambdas since their semantics are highly
+  // specific to this context and they shouldn't be used elsewhere.
+  auto EncodeEscapedUnicodeCodepointUtf32 = [](const std::string &s, int &i) {
    const int kLongUnicodeLength = 8;
+    int j = i + 1;
    while (j < static_cast<int>(s.size()) - 1 &&
           j < i + kLongUnicodeLength + 1 && isxdigit(s[j])) {
      ++j;
@ -36,7 +35,19 @@ std::string ParseStringLiteral(const std::string &s) {
      i += kLongUnicodeLength;
      std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
      return converter.to_bytes(t);
-    } else if (j - i >= kShortUnicodeLength + 1) {
+    }
+    throw SyntaxException(
+        "Expected 8 hex digits as unicode codepoint started with \\U. "
+        "Use \\u for 4 hex digits format.");
+  };
+  auto EncodeEscapedUnicodeCodepointUtf16 = [](const std::string &s, int &i) {
+    const int kShortUnicodeLength = 4;
+    int j = i + 1;
+    while (j < static_cast<int>(s.size()) - 1 &&
+           j < i + kShortUnicodeLength + 1 && isxdigit(s[j])) {
+      ++j;
+    }
+    if (j - i >= kShortUnicodeLength + 1) {
      char16_t t = stoi(s.substr(i + 1, kShortUnicodeLength), 0, 16);
      if (t >= 0xD800 && t <= 0xDBFF) {
        // t is high surrogate pair. Expect one more utf16 codepoint.
@ -72,12 +83,10 @@ std::string ParseStringLiteral(const std::string &s) {
            converter;
        return converter.to_bytes(t);
      }
-    } else {
-      // This should never happen, except grammar changes and we don't notice
-      // change in this production.
-      debug_assert(false, "can't happen");
-      throw std::exception();
    }
+    throw SyntaxException(
+        "Expected 4 hex digits as unicode codepoint started with \\u. "
+        "Use \\U for 8 hex digits format.");
  };

  std::string unescaped;
@ -117,9 +126,15 @@ std::string ParseStringLiteral(const std::string &s) {
          unescaped += '\t';
          break;
        case 'U':
+          try {
+            unescaped += EncodeEscapedUnicodeCodepointUtf32(s, i);
+          } catch (const std::range_error &) {
+            throw SemanticException("Invalid utf codepoint");
+          }
+          break;
        case 'u':
          try {
-            unescaped += EncodeEscapedUnicodeCodepoint(s, i);
+            unescaped += EncodeEscapedUnicodeCodepointUtf16(s, i);
          } catch (const std::range_error &) {
            throw SemanticException("Invalid utf codepoint");
          }
--- a/src/query/frontend/opencypher/grammar/Cypher.g4
+++ b/src/query/frontend/opencypher/grammar/Cypher.g4
@ -250,7 +250,7 @@ StringLiteral : ( '"' ( StringLiteral_0 | EscapedChar )* '"' )
              | ( '\'' ( StringLiteral_1 | EscapedChar )* '\'' )
              ;

-EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit ) ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ;
+EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( 'u' ( HexDigit HexDigit HexDigit HexDigit ) ) | ( 'U' ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ;

 numberLiteral : doubleLiteral
              | integerLiteral
--- a/tests/unit/cypher_main_visitor.cpp
+++ b/tests/unit/cypher_main_visitor.cpp
@ -743,7 +743,7 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedChars) {
 }

 TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) {
-  TypeParam ast_generator("RETURN '\\u221daaa\\U221daaa'");
+  TypeParam ast_generator("RETURN '\\u221daaa\\u221daaa'");
  auto *query = ast_generator.query_;
  auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]);
  auto *literal = dynamic_cast<PrimitiveLiteral *>(
@ -753,8 +753,12 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) {
  EXPECT_EQ(literal->token_position_, 2);
 }

+TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16Error) {
+  ASSERT_THROW(TypeParam("RETURN '\\U221daaa'"), SyntaxException);
+}
+
 TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf32) {
-  TypeParam ast_generator("RETURN '\\u0001F600aaaa\\U0001F600aaaaaaaa'");
+  TypeParam ast_generator("RETURN '\\U0001F600aaaa\\U0001F600aaaaaaaa'");
  auto *query = ast_generator.query_;
  auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]);
  auto *literal = dynamic_cast<PrimitiveLiteral *>(