Use \u for utf16 and \U for utf32

Reviewers: buda

Reviewed By: buda

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D773
This commit is contained in:
Mislav Bradac 2017-09-09 14:08:59 +02:00
parent 92b9bbd4bd
commit 74d0a426ab
5 changed files with 40 additions and 15 deletions

View File

@ -10,6 +10,7 @@
* `collect` aggregation now supports Map collection.
* Map indexing supported.
* `assert` function added.
* Use `\u` to specify a 4 hex digit codepoint and `\U` for an 8 hex digit codepoint in string literals.
### Bug Fixes and Other Changes

View File

@ -539,3 +539,8 @@ For example:
The above would find the edge `r` which forms a circular connection on a node.
This behaviour is not supported in openCypher reference and the query would
fail.
#### Unicode codepoints in string literal
In Memgraph, use `\u` followed by 4 hex digits in a string literal for a UTF-16
codepoint and `\U` followed by 8 hex digits for a UTF-32 codepoint.

View File

@ -21,12 +21,11 @@ int64_t ParseIntegerLiteral(const std::string &s) {
}
std::string ParseStringLiteral(const std::string &s) {
// This function is declared as lambda since its semantics is highly specific
// for this conxtext and shouldn't be used elsewhere.
auto EncodeEscapedUnicodeCodepoint = [](const std::string &s, int &i) {
int j = i + 1;
const int kShortUnicodeLength = 4;
// These functions are declared as lambdas since their semantics are highly
// specific to this context and they shouldn't be used elsewhere.
auto EncodeEscapedUnicodeCodepointUtf32 = [](const std::string &s, int &i) {
const int kLongUnicodeLength = 8;
int j = i + 1;
while (j < static_cast<int>(s.size()) - 1 &&
j < i + kLongUnicodeLength + 1 && isxdigit(s[j])) {
++j;
@ -36,7 +35,19 @@ std::string ParseStringLiteral(const std::string &s) {
i += kLongUnicodeLength;
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.to_bytes(t);
} else if (j - i >= kShortUnicodeLength + 1) {
}
throw SyntaxException(
"Expected 8 hex digits as unicode codepoint started with \\U. "
"Use \\u for 4 hex digits format.");
};
auto EncodeEscapedUnicodeCodepointUtf16 = [](const std::string &s, int &i) {
const int kShortUnicodeLength = 4;
int j = i + 1;
while (j < static_cast<int>(s.size()) - 1 &&
j < i + kShortUnicodeLength + 1 && isxdigit(s[j])) {
++j;
}
if (j - i >= kShortUnicodeLength + 1) {
char16_t t = stoi(s.substr(i + 1, kShortUnicodeLength), 0, 16);
if (t >= 0xD800 && t <= 0xDBFF) {
// t is high surrogate pair. Expect one more utf16 codepoint.
@ -72,12 +83,10 @@ std::string ParseStringLiteral(const std::string &s) {
converter;
return converter.to_bytes(t);
}
} else {
// This should never happen, except grammar changes and we don't notice
// change in this production.
debug_assert(false, "can't happen");
throw std::exception();
}
throw SyntaxException(
"Expected 4 hex digits as unicode codepoint started with \\u. "
"Use \\U for 8 hex digits format.");
};
std::string unescaped;
@ -117,9 +126,15 @@ std::string ParseStringLiteral(const std::string &s) {
unescaped += '\t';
break;
case 'U':
try {
unescaped += EncodeEscapedUnicodeCodepointUtf32(s, i);
} catch (const std::range_error &) {
throw SemanticException("Invalid utf codepoint");
}
break;
case 'u':
try {
unescaped += EncodeEscapedUnicodeCodepoint(s, i);
unescaped += EncodeEscapedUnicodeCodepointUtf16(s, i);
} catch (const std::range_error &) {
throw SemanticException("Invalid utf codepoint");
}

View File

@ -250,7 +250,7 @@ StringLiteral : ( '"' ( StringLiteral_0 | EscapedChar )* '"' )
| ( '\'' ( StringLiteral_1 | EscapedChar )* '\'' )
;
EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit ) ) | ( ( 'U' | 'u' ) ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ;
EscapedChar : '\\' ( '\\' | '\'' | '"' | ( 'B' | 'b' ) | ( 'F' | 'f' ) | ( 'N' | 'n' ) | ( 'R' | 'r' ) | ( 'T' | 't' ) | ( 'u' ( HexDigit HexDigit HexDigit HexDigit ) ) | ( 'U' ( HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit HexDigit ) ) ) ;
numberLiteral : doubleLiteral
| integerLiteral

View File

@ -743,7 +743,7 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedChars) {
}
TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) {
TypeParam ast_generator("RETURN '\\u221daaa\\U221daaa'");
TypeParam ast_generator("RETURN '\\u221daaa\\u221daaa'");
auto *query = ast_generator.query_;
auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]);
auto *literal = dynamic_cast<PrimitiveLiteral *>(
@ -753,8 +753,12 @@ TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16) {
EXPECT_EQ(literal->token_position_, 2);
}
// `\U` must be followed by 8 hex digits; the literal below supplies only 7
// (`221daaa`), so constructing TypeParam from this query is expected to throw
// SyntaxException.
TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf16Error) {
ASSERT_THROW(TypeParam("RETURN '\\U221daaa'"), SyntaxException);
}
TYPED_TEST(CypherMainVisitorTest, StringLiteralEscapedUtf32) {
TypeParam ast_generator("RETURN '\\u0001F600aaaa\\U0001F600aaaaaaaa'");
TypeParam ast_generator("RETURN '\\U0001F600aaaa\\U0001F600aaaaaaaa'");
auto *query = ast_generator.query_;
auto *return_clause = dynamic_cast<Return *>(query->clauses_[0]);
auto *literal = dynamic_cast<PrimitiveLiteral *>(