From 74d0a426abfcea5cca1a0aadd956dc7ae2c8e30e Mon Sep 17 00:00:00 2001
From: Mislav Bradac <mislav.bradac@memgraph.io>
Date: Sat, 9 Sep 2017 14:08:59 +0200
Subject: [PATCH] Use \u for utf16 and \U for utf32

Reviewers: buda

Reviewed By: buda

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D773
---
 CHANGELOG.md                                  |  1 +
 docs/user_technical/open-cypher.md            |  5 +++
 src/query/common.cpp                          | 39 +++++++++++++------
 .../frontend/opencypher/grammar/Cypher.g4     |  2 +-
 tests/unit/cypher_main_visitor.cpp            |  8 +++-
 5 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7636c9b1..e26675074 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@
 * `collect` aggregation now supports Map collection.
 * Map indexing supported.
 * `assert` function added.
+* Use `\u` to specify a 4 hex digit codepoint and `\U` for an 8 hex digit one.
 
 ### Bug Fixes and Other Changes
 
diff --git a/docs/user_technical/open-cypher.md b/docs/user_technical/open-cypher.md
index 859ce511c..90ed7c85e 100644
--- a/docs/user_technical/open-cypher.md
+++ b/docs/user_technical/open-cypher.md
@@ -539,3 +539,8 @@ For example:
 The above would find the edge `r` which forms a circular connection on a node.
 This behaviour is not supported in openCypher reference and the query would
 fail.
+
+#### Unicode codepoints in string literal
+
+Use `\u` followed by 4 hex digits in a string literal for a UTF-16 codepoint and
+`\U` followed by 8 hex digits for a UTF-32 codepoint in Memgraph.
diff --git a/src/query/common.cpp b/src/query/common.cpp
index 5d0115a96..49fabe43b 100644
--- a/src/query/common.cpp
+++ b/src/query/common.cpp
@@ -21,12 +21,11 @@ int64_t ParseIntegerLiteral(const std::string &s) {
 }
 
 std::string ParseStringLiteral(const std::string &s) {
-  // This function is declared as lambda since its semantics is highly specific
-  // for this conxtext and shouldn't be used elsewhere.
-  auto EncodeEscapedUnicodeCodepoint = [](const std::string &s, int &i) {
-    int j = i + 1;
-    const int kShortUnicodeLength = 4;
+  // These functions are declared as lambdas since their semantics are highly
+  // specific to this context and shouldn't be used elsewhere.
+  auto EncodeEscapedUnicodeCodepointUtf32 = [](const std::string &s, int &i) {
     const int kLongUnicodeLength = 8;
+    int j = i + 1;
     while (j < static_cast<int>(s.size()) - 1 &&
            j < i + kLongUnicodeLength + 1 && isxdigit(s[j])) {
       ++j;
@@ -36,7 +35,19 @@ std::string ParseStringLiteral(const std::string &s) {
       i += kLongUnicodeLength;
       std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
       return converter.to_bytes(t);
-    } else if (j - i >= kShortUnicodeLength + 1) {
+    }
+    throw SyntaxException(
+        "Expected 8 hex digits as unicode codepoint started with \\U. "
+        "Use \\u for 4 hex digits format.");
+  };
+  auto EncodeEscapedUnicodeCodepointUtf16 = [](const std::string &s, int &i) {
+    const int kShortUnicodeLength = 4;
+    int j = i + 1;
+    while (j < static_cast<int>(s.size()) - 1 &&
+           j < i + kShortUnicodeLength + 1 && isxdigit(s[j])) {
+      ++j;
+    }
+    if (j - i >= kShortUnicodeLength + 1) {
       char16_t t = stoi(s.substr(i + 1, kShortUnicodeLength), 0, 16);
       if (t >= 0xD800 && t <= 0xDBFF) {
         // t is high surrogate pair. Expect one more utf16 codepoint.
@@ -72,12 +83,10 @@ std::string ParseStringLiteral(const std::string &s) {
             converter;
         return converter.to_bytes(t);
       }
-    } else {
-      // This should never happen, except grammar changes and we don't notice
-      // change in this production.
-      debug_assert(false, "can't happen");
-      throw std::exception();
     }
+    throw SyntaxException(
+        "Expected 4 hex digits as unicode codepoint started with \\u. "
+        "Use \\U for 8 hex digits format.");
   };
 
   std::string unescaped;
@@ -117,9 +126,15 @@ std::string ParseStringLiteral(const std::string &s) {
           unescaped += '\t';
           break;
         case 'U':
+          try {
+            unescaped += EncodeEscapedUnicodeCodepointUtf32(s, i);
+          } catch (const std::range_error &) {
+            throw SemanticException("Invalid utf codepoint");
+          }
+          break;
         case 'u':
           try {
-            unescaped += EncodeEscapedUnicodeCodepoint(s, i);
+            unescaped += EncodeEscapedUnicodeCodepointUtf16(s, i);
           } catch (const std::range_error &) {
             throw SemanticException("Invalid utf codepoint");
           }
diff --git a/src/query/frontend/opencypher/grammar/Cypher.g4 b/src/query/frontend/opencypher/grammar/Cypher.g4
index 192ac6c8e..ce1c513c0 100644
--- a/src/query/frontend/opencypher/grammar/Cypher.g4
+++ b/src/query/frontend/opencypher/grammar/Cypher.g4
@@ -250,7 +250,7 @@ StringLiteral : ( '"' ( StringLiteral_0 | EscapedChar )* '"' )
               | ( '\'' ( StringLiteral_1 | EscapedChar )* '\'' )
               ;
 
-EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit ) ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ;
+EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( 'u' ( HexDigit HexDigit HexDigit HexDigit ) ) | ( 'U' ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ;
 
 numberLiteral : doubleLiteral
               | integerLiteral
diff --git a/tests/unit/cypher_main_visitor.cpp b/tests/unit/cypher_main_visitor.cpp
index f9f8994b1..6ec4dbd60 100644
--- a/tests/unit/cypher_main_visitor.cpp
+++ b/tests/unit/cypher_main_visitor.cpp
@@ -743,7 +743,7 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedChars) {
 }
 
 TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) {
-  TypeParam ast_generator("RETURN '\\u221daaa\\U221daaa'");
+  TypeParam ast_generator("RETURN '\\u221daaa\\u221daaa'");
   auto *query = ast_generator.query_;
   auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]);
   auto *literal = dynamic_cast<PrimitiveLiteral *>(
@@ -753,8 +753,12 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) {
   EXPECT_EQ(literal->token_position_, 2);
 }
 
+TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16Error) {
+  ASSERT_THROW(TypeParam("RETURN '\\U221daaa'"), SyntaxException);
+}
+
 TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf32) {
-  TypeParam ast_generator("RETURN '\\u0001F600aaaa\\U0001F600aaaaaaaa'");
+  TypeParam ast_generator("RETURN '\\U0001F600aaaa\\U0001F600aaaaaaaa'");
   auto *query = ast_generator.query_;
   auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]);
   auto *literal = dynamic_cast<PrimitiveLiteral *>(