Make unquoted string values really work

Havoc Pennington 2011-11-08 09:17:23 -05:00
parent e3e1d7392d
commit 9b2a96aef4
4 changed files with 132 additions and 61 deletions

SPEC.md

@@ -72,11 +72,11 @@ Different from JSON:
    instead
  - keys with an object as their value may omit `=`, so `foo { }` means
    `foo = { }`
- - keys which contain no whitespace need not be quoted; the string
-   is then used literally with no unescaping
- - if a key is not quoted, the `.` character has a special meaning and
+ - keys may be unquoted strings (see below for detailed definition)
+ - only if a key is unquoted, the `.` character has a special meaning and
    creates a new object. So `foo.bar = 10` means to create an object at key
-   `foo`, then inside that object, create a key `bar` with value `10`
+   `foo`, then inside that object, create a key `bar` with value
+   `10`.
  - quoted keys _should not_ contain the `.` character because it's
    confusing, but it is permitted (to preserve the ability to use any string
    as a key and thus convert an arbitrary map or JavaScript object into HOCON)
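
An illustrative sketch of the path-expansion rule (Scala strings holding example HOCON text; the names are only for illustration):

    // These two documents describe the same structure, because the unquoted
    // key "foo.bar" expands into an object at "foo" containing key "bar":
    val dotted   = """foo.bar = 10"""
    val expanded = """foo { bar = 10 }"""
    // A quoted key is taken literally, giving a single top-level key named
    // "foo.bar" rather than a nested object:
    val quotedKey = """ "foo.bar" = 10 """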
@@ -100,32 +100,46 @@ Different from JSON:
  - FIXME prepend operator?
  - a new type of value exists, substitution, which looks like `${some.path}`
    (details below)
- - to support substitutions, a value may consist of multiple strings which
-   are concatenated into one string. `"foo"${some.path}"bar"`
- - String values may sometimes omit quotes. If a value does not parse as a
-   substitution, quoted string, number, object, array, true, false, or null,
-   then that value will be parsed as a string value, created as
-   follows:
-    - take the string from the `=` to the first newline or comma
-    - remove leading and trailing whitespace (whitespace defined
-      only as ASCII whitespace, as with Java's trim() method)
-    - what remains is treated as a sequence of strings, where
-      each string is either the raw inline UTF-8 data, a quoted
-      string, or a substitution
-    - everything up to a `"` or `$` is a raw unquoted UTF-8 string;
-      no unescaping is performed
-    - at `"` a quoted string is parsed, with the usual escape
-      sequences; after the close `"` parsing the unquoted string
-      continues. The quoted string must be well-formed or it's
-      an error.
-    - at `$` a substitution is parsed. The substitution must be well-formed
-      or it's an error.
-    - to get a literal `"`, `$`, newline or comma, you would have to use
-      a quoted string
-    - after the initial raw string, quoted string, or substitution,
-      parsing another one immediately begins and so on until the
-      end of the value.
-    - the resulting sequence of strings is concatenated
+ - String values may sometimes omit quotes.
+    - Unquoted strings may not contain '$', '"', '{', '}',
+      '[', ']', ':', '=', ',', or '\' (backslash) and may not
+      contain whitespace (including newlines).
+    - Unquoted strings do not support any form of escaping; the
+      characters are all left as-is. If you need to use special
+      characters or escaping, you have to quote the string.
+    - Because of "value concatenation" rules (see below) you can
+      write a sentence with whitespace unquoted, though.
+    - Any unquoted series of characters that parses as a
+      substitution, true, false, null, number, or quoted string
+      will be treated as the type it parses as, rather than as
+      an unquoted string. However, in "value concatenation"
+      the non-string types convert to strings, which means
+      you can have the word "true" in an unquoted sentence.
+    - true, false, null, numbers only parse as such if they
+      immediately follow at least one character that is not
+      allowed in unquoted strings. That is, `truefoo` is
+      the value `true` then the unquoted string `foo`, but
+      `footrue` is the unquoted string `footrue`.
+    - quoted strings and substitutions always parse as such
+      since they begin with a character that can't be in an
+      unquoted string.
+ - Value concatenation: to support substitutions, and unquoted
+   sentences with whitespace, a value may consist of multiple
+   values which are concatenated into one
+   string. `"foo"${some.path}"bar"` or `The quick brown fox`.
+    - let a "simple value" be the set of JSON values excluding
+      objects and arrays, and including unquoted strings and
+      substitutions.
+    - as long as simple values are separated only by non-newline
+      whitespace, the _whitespace between them is preserved_
+      and the values, along with the whitespace, are concatenated
+      into a string.
+    - Whitespace before the first and after the last simple value
+      will be discarded. Only whitespace _between_ simple values
+      is preserved.
+    - concatenation never spans a newline or a non-simple-value
+      token.
+    - the result of the concatenation is a string value.
  - the special key `include` followed directly by a string value (with no
    `=`) means to treat that string value as a filename and merge the
    object defined in that file into the current object, overriding
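
A sketch of what the unquoted-string and value-concatenation rules mean at the token level, written with the TokenizerTest helpers that appear later in this commit (tokenizeAsList, tokenUnquoted, and so on); the expected list below is illustrative:

    // An unquoted sentence tokenizes as words separated by preserved
    // whitespace tokens; a later concatenation step can join them back
    // into the single string "The quick brown fox".
    val sentence = List(Tokens.START,
        tokenUnquoted("The"), tokenUnquoted(" "),
        tokenUnquoted("quick"), tokenUnquoted(" "),
        tokenUnquoted("brown"), tokenUnquoted(" "),
        tokenUnquoted("fox"),
        Tokens.END)
    assertEquals(sentence, tokenizeAsList("The quick brown fox"))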

Tokenizer.java

@@ -25,6 +25,22 @@ final class Tokenizer {
     private int oneCharBuffer;
     private int lineNumber;
     private Queue<Token> tokens;
+    // has to be saved inside value concatenations
+    private StringBuilder whitespace;
+    // may need to value-concat with next value
+    private boolean lastTokenWasSimpleValue;
+
+    TokenIterator(ConfigOrigin origin, Reader input) {
+        this.origin = origin;
+        this.input = input;
+        oneCharBuffer = -1;
+        lineNumber = 0;
+        tokens = new LinkedList<Token>();
+        tokens.add(Tokens.START);
+        whitespace = new StringBuilder();
+        lastTokenWasSimpleValue = false;
+    }

     private int nextChar() {
         if (oneCharBuffer >= 0) {
@@ -49,15 +65,26 @@ final class Tokenizer {
         oneCharBuffer = c;
     }

+    static boolean isWhitespace(int c) {
+        // hoping this optimizes slightly by catching the most common ' '
+        // case up front.
+        return c == ' ' || c == '\n' || Character.isWhitespace(c);
+    }
+
+    static boolean isWhitespaceNotNewline(int c) {
+        return c == ' ' || (c != '\n' && Character.isWhitespace(c));
+    }
+
+    // get next char, skipping non-newline whitespace
     private int nextCharAfterWhitespace() {
         for (;;) {
             int c = nextChar();
             if (c == -1) {
                 return -1;
-            } else if (c == '\n') {
-                return c;
-            } else if (Character.isWhitespace(c)) {
+            } else if (isWhitespaceNotNewline(c)) {
+                if (lastTokenWasSimpleValue)
+                    whitespace.appendCodePoint(c);
                 continue;
             } else {
                 return c;
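
The effect of buffering that whitespace, sketched with the TokenizerTest helpers (expectations are illustrative and assume single spaces): whitespace becomes a token only when it sits between two simple values, and it is discarded when it runs into end-of-input or a newline.

    // the space between the two simple values "foo" and 42 survives as an
    // unquoted-text token...
    assertEquals(List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "),
        tokenLong(42), Tokens.END),
        tokenizeAsList("foo 42"))
    // ...while leading whitespace, and whitespace followed by a newline,
    // is dropped
    assertEquals(List(Tokens.START, tokenUnquoted("foo"), Tokens.newLine(0), Tokens.END),
        tokenizeAsList(" foo \n"))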
@@ -83,7 +110,7 @@ final class Tokenizer {
     // chars JSON allows to be part of a number
     static final String numberChars = "0123456789eE+-.";
     // chars that stop an unquoted string
-    static final String notInUnquotedText = "$\"{}[]:=\n,";
+    static final String notInUnquotedText = "$\"{}[]:=,\\";

     // The rules here are intended to maximize convenience while
     // avoiding confusion with real valid JSON. Basically anything
@@ -98,12 +125,15 @@ final class Tokenizer {
                 break;
             } else if (notInUnquotedText.indexOf(c) >= 0) {
                 break;
+            } else if (isWhitespace(c)) {
+                break;
             } else {
                 sb.append((char) c);
             }

             // we parse true/false/null tokens as such no matter
-            // what is after them.
+            // what is after them, as long as they are at the
+            // start of the unquoted token.
             if (sb.length() == 4) {
                 String s = sb.toString();
                 if (s.equals("true"))
@@ -122,8 +152,7 @@ final class Tokenizer {
         // put back the char that ended the unquoted text
         putBack(c);

-        // chop trailing whitespace; have to quote to have trailing spaces.
-        String s = sb.toString().trim();
+        String s = sb.toString();
         return Tokens.newUnquotedText(origin, s);
     }
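
A sketch of the `truefoo` / `footrue` distinction that SPEC.md describes, again using the TokenizerTest helpers (illustrative expectations):

    // "true" is recognized only at the start of an unquoted token...
    assertEquals(List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END),
        tokenizeAsList("truefoo"))
    // ...so when the keyword is not at the start, the whole run of
    // characters stays one unquoted string
    assertEquals(List(Tokens.START, tokenUnquoted("footrue"), Tokens.END),
        tokenizeAsList("footrue"))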
@@ -233,19 +262,50 @@ final class Tokenizer {
         return Tokens.newString(lineOrigin(), sb.toString());
     }

+    // called if the next token is not a simple value;
+    // discards any whitespace we were saving between
+    // simple values.
+    private void nextIsNotASimpleValue() {
+        lastTokenWasSimpleValue = false;
+        whitespace.setLength(0);
+    }
+
+    // called if the next token IS a simple value,
+    // so creates a whitespace token if the previous
+    // token also was.
+    private void nextIsASimpleValue() {
+        if (lastTokenWasSimpleValue) {
+            // need to save whitespace between the two so
+            // the parser has the option to concatenate it.
+            if (whitespace.length() > 0) {
+                tokens.add(Tokens.newUnquotedText(lineOrigin(),
+                        whitespace.toString()));
+                whitespace.setLength(0); // reset
+            }
+            // lastTokenWasSimpleValue = true still
+        } else {
+            lastTokenWasSimpleValue = true;
+            whitespace.setLength(0);
+        }
+    }
+
     private void queueNextToken() {
         int c = nextCharAfterWhitespace();
         if (c == -1) {
+            nextIsNotASimpleValue();
             tokens.add(Tokens.END);
         } else if (c == '\n') {
             // newline tokens have the just-ended line number
+            nextIsNotASimpleValue();
             tokens.add(Tokens.newLine(lineNumber));
             lineNumber += 1;
         } else {
             Token t = null;
+            boolean tIsSimpleValue = false;
             switch (c) {
             case '"':
                 t = pullQuotedString();
+                tIsSimpleValue = true;
                 break;
             case ':':
                 t = Tokens.COLON;
@@ -270,6 +330,7 @@ final class Tokenizer {
             if (t == null) {
                 if (firstNumberChars.indexOf(c) >= 0) {
                     t = pullNumber(c);
+                    tIsSimpleValue = true;
                 } else if (notInUnquotedText.indexOf(c) >= 0) {
                     throw parseError(String
                             .format("Character '%c' is not the start of any valid token",
@@ -277,25 +338,24 @@ final class Tokenizer {
                 } else {
                     putBack(c);
                     t = pullUnquotedText();
+                    tIsSimpleValue = true;
                 }
             }

             if (t == null)
                 throw new ConfigException.BugOrBroken(
                         "bug: failed to generate next token");

+            if (tIsSimpleValue) {
+                nextIsASimpleValue();
+            } else {
+                nextIsNotASimpleValue();
+            }
+
             tokens.add(t);
         }
     }

-    TokenIterator(ConfigOrigin origin, Reader input) {
-        this.origin = origin;
-        this.input = input;
-        oneCharBuffer = -1;
-        lineNumber = 0;
-        tokens = new LinkedList<Token>();
-        tokens.add(Tokens.START);
-    }
-
     @Override
     public boolean hasNext() {
         return !tokens.isEmpty();
@@ -304,7 +364,7 @@ final class Tokenizer {
     @Override
     public Token next() {
         Token t = tokens.remove();
-        if (t != Tokens.END) {
+        if (tokens.isEmpty() && t != Tokens.END) {
             queueNextToken();
             if (tokens.isEmpty())
                 throw new ConfigException.BugOrBroken(
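
Putting the pieces together (an illustrative sketch with the TokenizerTest helpers; the `{` case of the switch is not shown in this hunk but is assumed to produce Tokens.OPEN_CURLY): queueNextToken() can now enqueue two tokens at once, the saved whitespace plus the value after it, which is why next() only refills the queue when it is empty, and why whitespace next to a non-simple-value token never becomes a token.

    // between two simple values the space survives as a token (compare the
    // "true foo" test below)...
    assertEquals(List(Tokens.START, tokenTrue, tokenUnquoted(" "),
        tokenUnquoted("foo"), Tokens.END),
        tokenizeAsList("true foo"))
    // ...but next to a brace, which is not a simple value, the space is
    // simply skipped
    assertEquals(List(Tokens.START, tokenUnquoted("foo"), Tokens.OPEN_CURLY, Tokens.END),
        tokenizeAsList("foo {"))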

EquivalentsTest.scala

@@ -66,6 +66,6 @@ class EquivalentsTest extends TestUtils {
         // This is a little "checksum" to be sure we really tested what we were expecting.
         // it breaks every time you add a file, so you have to update it.
         assertEquals(1, dirCount)
-        assertEquals(1, fileCount)
+        assertEquals(2, fileCount)
     }
 }

TokenizerTest.scala

@@ -63,25 +63,21 @@ class TokenizerTest extends TestUtils {
     @Test
     def tokenizeAllTypesWithSingleSpaces() {
-        // all token types with no spaces (not sure JSON spec wants this to work,
-        // but spec is unclear to me when spaces are required, and banning them
-        // is actually extra work)
         val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
             Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
-            tokenLong(42), tokenTrue, tokenDouble(3.14),
-            tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
+            tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
+            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
+            Tokens.newLine(0), Tokens.END)
         assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
     }

     @Test
     def tokenizeAllTypesWithMultipleSpaces() {
-        // all token types with no spaces (not sure JSON spec wants this to work,
-        // but spec is unclear to me when spaces are required, and banning them
-        // is actually extra work)
         val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
             Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
-            tokenLong(42), tokenTrue, tokenDouble(3.14),
-            tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
+            tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
+            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
+            Tokens.newLine(0), Tokens.END)
         assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
     }

@@ -111,13 +107,13 @@ class TokenizerTest extends TestUtils {
     @Test
     def tokenizeUnquotedTextContainingSpaceTrue() {
-        val expected = List(Tokens.START, tokenUnquoted("foo true"), Tokens.END)
+        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END)
         assertEquals(expected, tokenizeAsList("""foo true"""))
     }

     @Test
     def tokenizeTrueAndSpaceAndUnquotedText() {
-        val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END)
+        val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END)
         assertEquals(expected, tokenizeAsList("""true foo"""))
     }

@@ -129,7 +125,8 @@ class TokenizerTest extends TestUtils {
     @Test
     def tokenizeUnquotedTextKeepsInternalSpaces() {
-        val expected = List(Tokens.START, tokenUnquoted("foo bar baz"), Tokens.newLine(0), Tokens.END)
+        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"),
+            tokenUnquoted(" "), tokenUnquoted("baz"), Tokens.newLine(0), Tokens.END)
         assertEquals(expected, tokenizeAsList(" foo bar baz \n"))
     }