support "/" in unquoted strings (and therefore keys)

This commit is contained in:
Havoc Pennington 2011-11-16 11:03:16 -05:00
parent 20b75542e4
commit 3610dc8d76
5 changed files with 92 additions and 37 deletions

View File

@ -27,8 +27,9 @@ Implementation-wise, the format should have these properties:
heuristic. It should be clear what's invalid and invalid files
should generate errors.
- require minimal look-ahead; should be able to tokenize the file
by looking at only the current character and the next
character.
by looking at only the next three characters. (Right now, the
only reason to look at three is to find "//" comments;
otherwise you can parse by looking at two.)
HOCON is significantly harder to specify and to parse than
JSON. Think of it as moving the work from the person maintaining
@ -182,8 +183,10 @@ A sequence of characters outside of a quoted string is a string
value if:
- it does not contain "forbidden characters" '$', '"', '{', '}',
'[', ']', ':', '=', ',', '+', '#', '/', '\' (backslash), or
'[', ']', ':', '=', ',', '+', '#', '\' (backslash), or
whitespace.
- it does not contain the two-character string "//" (which
starts a comment).
- its initial characters do not parse as `true`, `false`, `null`,
or a number.
@ -199,9 +202,9 @@ the unquoted string `bar` but `bar10.0` is the unquoted string
`bar10.0`.
In general, once an unquoted string begins, it continues until a
forbidden character is encountered. Embedded (non-initial)
booleans, nulls, and numbers are not recognized as such, they are
part of the string.
forbidden character or the two-character string "//" is
encountered. Embedded (non-initial) booleans, nulls, and numbers
are not recognized as such; they are part of the string.
An unquoted string may not _begin_ with the digits 0-9 or with a
hyphen (`-`, 0x002D) because those are valid characters to begin a

View File

@ -84,7 +84,7 @@ final class Tokenizer {
final private ConfigOrigin origin;
final private Reader input;
private int oneCharBuffer;
final private LinkedList<Integer> buffer;
private int lineNumber;
final private Queue<Token> tokens;
final private WhitespaceSaver whitespaceSaver;
@ -94,7 +94,7 @@ final class Tokenizer {
this.origin = origin;
this.input = input;
this.allowComments = allowComments;
oneCharBuffer = -1;
this.buffer = new LinkedList<Integer>();
lineNumber = 0;
tokens = new LinkedList<Token>();
tokens.add(Tokens.START);
@ -102,27 +102,29 @@ final class Tokenizer {
}
private int nextChar() {
if (oneCharBuffer >= 0) {
int c = oneCharBuffer;
oneCharBuffer = -1;
return c;
} else {
// this should ONLY be called from nextCharSkippingComments
// or when inside a quoted string, everything else should
// use nextCharSkippingComments().
private int nextCharRaw() {
if (buffer.isEmpty()) {
try {
return input.read();
} catch (IOException e) {
throw new ConfigException.IO(origin, "read error: "
+ e.getMessage(), e);
}
} else {
int c = buffer.pop();
return c;
}
}
private void putBack(int c) {
if (oneCharBuffer >= 0) {
if (buffer.size() > 2) {
throw new ConfigException.BugOrBroken(
"bug: attempt to putBack() twice in a row");
"bug: putBack() three times, undesirable look-ahead");
}
oneCharBuffer = c;
buffer.push(c);
}
static boolean isWhitespace(int c) {
@ -135,29 +137,26 @@ final class Tokenizer {
private int slurpComment() {
for (;;) {
int c = nextChar();
int c = nextCharRaw();
if (c == -1 || c == '\n') {
return c;
}
}
}
// get next char, skipping non-newline whitespace
private int nextCharAfterWhitespace(WhitespaceSaver saver) {
// get next char, skipping comments
private int nextCharSkippingComments() {
for (;;) {
int c = nextChar();
int c = nextCharRaw();
if (c == -1) {
return -1;
} else {
if (isWhitespaceNotNewline(c)) {
saver.add(c);
continue;
} else if (allowComments) {
if (allowComments) {
if (c == '#') {
return slurpComment();
} else if (c == '/') {
int maybeSecondSlash = nextChar();
int maybeSecondSlash = nextCharRaw();
if (maybeSecondSlash == '/') {
return slurpComment();
} else {
@ -174,6 +173,24 @@ final class Tokenizer {
}
}
// Returns the next character, read through nextCharSkippingComments(), that
// isWhitespaceNotNewline() does not accept. Every character skipped on the
// way is handed to saver. End of input (-1) is returned unchanged and is
// never passed to the whitespace check or to saver.
private int nextCharAfterWhitespace(WhitespaceSaver saver) {
    int next = nextCharSkippingComments();
    while (next != -1 && isWhitespaceNotNewline(next)) {
        saver.add(next);
        next = nextCharSkippingComments();
    }
    return next;
}
// Convenience overload: returns (does not throw) the exception produced by
// the two-argument parseError for this message, passing null as the second
// argument. Callers throw the result themselves.
// NOTE(review): the second argument is presumably the underlying cause, so
// null would mean "no cause" -- confirm against the two-argument overload.
private ConfigException parseError(String message) {
return parseError(message, null);
}
@ -208,7 +225,7 @@ final class Tokenizer {
// chars JSON allows to be part of a number
static final String numberChars = "0123456789eE+-.";
// chars that stop an unquoted string
static final String notInUnquotedText = "$\"{}[]:=,\\+#/";
static final String notInUnquotedText = "$\"{}[]:=,\\+#";
// The rules here are intended to maximize convenience while
// avoiding confusion with real valid JSON. Basically anything
@ -217,7 +234,7 @@ final class Tokenizer {
private Token pullUnquotedText() {
ConfigOrigin origin = lineOrigin();
StringBuilder sb = new StringBuilder();
int c = nextChar();
int c = nextCharSkippingComments();
while (true) {
if (c == -1) {
break;
@ -244,7 +261,7 @@ final class Tokenizer {
return Tokens.newBoolean(origin, false);
}
c = nextChar();
c = nextCharSkippingComments();
}
// put back the char that ended the unquoted text
@ -258,12 +275,12 @@ final class Tokenizer {
StringBuilder sb = new StringBuilder();
sb.appendCodePoint(firstChar);
boolean containedDecimalOrE = false;
int c = nextChar();
int c = nextCharSkippingComments();
while (c != -1 && numberChars.indexOf(c) >= 0) {
if (c == '.' || c == 'e' || c == 'E')
containedDecimalOrE = true;
sb.appendCodePoint(c);
c = nextChar();
c = nextCharSkippingComments();
}
// the last character we looked at wasn't part of the number, put it
// back
@ -285,7 +302,7 @@ final class Tokenizer {
}
private void pullEscapeSequence(StringBuilder sb) {
int escaped = nextChar();
int escaped = nextCharRaw();
if (escaped == -1)
throw parseError("End of input but backslash in string had nothing after it");
@ -318,7 +335,7 @@ final class Tokenizer {
// kind of absurdly slow, but screw it for now
char[] a = new char[4];
for (int i = 0; i < 4; ++i) {
int c = nextChar();
int c = nextCharSkippingComments();
if (c == -1)
throw parseError("End of input but expecting 4 hex digits for \\uXXXX escape");
a[i] = (char) c;
@ -346,7 +363,7 @@ final class Tokenizer {
StringBuilder sb = new StringBuilder();
int c = '\0'; // value doesn't get used
do {
c = nextChar();
c = nextCharRaw();
if (c == -1)
throw parseError("End of input but string quote was still open");
@ -364,7 +381,7 @@ final class Tokenizer {
private Token pullSubstitution() {
// the initial '$' has already been consumed
ConfigOrigin origin = lineOrigin();
int c = nextChar();
int c = nextCharSkippingComments();
if (c != '{') {
throw parseError("'$' not followed by {");
}

View File

@ -269,4 +269,11 @@ class ConfParserTest extends TestUtils {
parseObject("""{ "a" : "y" "b" : "z" }""")
}
}
@Test
def keysWithSlash() {
    // '/' may appear in an unquoted key, both in leading position and embedded
    val parsed = parseObject("""/a/b/c=42, x/y/z : 32""")
    val expectedByKey = List("/a/b/c" -> 42, "x/y/z" -> 32)
    for ((key, expected) <- expectedByKey) {
        assertEquals(expected, parsed.getInt(key))
    }
}
}

View File

@ -151,11 +151,12 @@ abstract trait TestUtils {
"""[ { "a" : 2, "b" : ${${a}} } ]""", // nested substitution
"[ = ]", // = is not a valid token in unquoted text
"[ + ]",
"[ / ]",
"[ # ]",
"[ \\ ]",
"[ # comment ]",
"${ #comment }",
"[ // comment ]",
"${ // comment }",
"{ include \"bar\" : 10 }", // include with a value after it
"{ include foo }", // include with unquoted string
"{ include : { \"a\" : 1 } }") // include used as unquoted key
@ -227,6 +228,7 @@ abstract trait TestUtils {
"[ trux ]",
"[ truex ]",
"[ 10x ]", // number token with trailing junk
"[ / ]", // unquoted string "slash"
"{ include \"foo\" }", // valid include
"{ include\n\"foo\" }", // include with just a newline separating from string
"{ include\"foo\" }", // include with no whitespace after it
@ -262,6 +264,7 @@ abstract trait TestUtils {
, 11]""",
"""[ 10 // comment
, 11]""",
"""{ /a/b/c : 10 }""", // key has a slash in it
ParseTest(false, true, "[${ foo.bar}]"), // substitution with leading spaces
ParseTest(false, true, "[${foo.bar }]"), // substitution with trailing spaces
ParseTest(false, true, "[${ \"foo.bar\"}]"), // substitution with leading spaces and quoted

View File

@ -2,11 +2,16 @@ package com.typesafe.config.impl
import org.junit.Assert.assertEquals
import org.junit.Test
import com.typesafe.config.ConfigException
class TokenizerTest extends TestUtils {
// Tokenizes s and asserts that the result is exactly the expected tokens
// wrapped between Tokens.START and Tokens.END.
// FIXME most of this file should be using this method
private def tokenizerTest(expected: List[Token], s: String) {
    val wrapped = Tokens.START :: (expected :+ Tokens.END)
    val actual = tokenizeAsList(s)
    assertEquals(wrapped, actual)
}
@Test
def tokenizeEmptyString() {
assertEquals(List(Tokens.START, Tokens.END),
@ -91,6 +96,14 @@ class TokenizerTest extends TestUtils {
assertEquals(expected, tokenizeAsList("""true foo"""))
}
@Test
def tokenizeUnquotedTextContainingSlash() {
    // each entry pairs the tokens expected between START and END with the
    // input text; a single '/' is unquoted text, while "//" yields no tokens
    val cases: List[(List[Token], String)] = List(
        (List(tokenUnquoted("a/b/c/")), "a/b/c/"),
        (List(tokenUnquoted("/")), "/"),
        (List(tokenUnquoted("/"), tokenUnquoted(" "), tokenUnquoted("/")), "/ /"),
        (Nil, "//"))
    for ((expected, input) <- cases) {
        tokenizerTest(expected, input)
    }
}
@Test
def tokenizeUnquotedTextTrimsSpaces() {
val expected = List(Tokens.START, tokenUnquoted("foo"), Tokens.newLine(0), Tokens.END)
@ -182,4 +195,16 @@ class TokenizerTest extends TestUtils {
}
}
}
@Test
def commentsIgnoredInVariousContext() {
    // each entry pairs the tokens expected between START and END with the
    // input text: comment markers inside quotes stay part of the string,
    // while markers after unquoted text or a number leave only that token
    val cases: List[(List[Token], String)] = List(
        (List(tokenString("//bar")), "\"//bar\""),
        (List(tokenString("#bar")), "\"#bar\""),
        (List(tokenUnquoted("bar")), "bar//comment"),
        (List(tokenUnquoted("bar")), "bar#comment"),
        (List(tokenInt(10)), "10//comment"),
        (List(tokenInt(10)), "10#comment"),
        (List(tokenDouble(3.14)), "3.14//comment"),
        (List(tokenDouble(3.14)), "3.14#comment"))
    for ((expected, input) <- cases) {
        tokenizerTest(expected, input)
    }
}
}