Make unquoted string values really work

This commit is contained in:
Havoc Pennington 2011-11-08 09:17:23 -05:00
parent e3e1d7392d
commit 9b2a96aef4
4 changed files with 132 additions and 61 deletions

74
SPEC.md
View File

@ -72,11 +72,11 @@ Different from JSON:
instead
- keys with an object as their value may omit `=`, so `foo { }` means
`foo = { }`
- keys which contain no whitespace need not be quoted; the string
is then used literally with no unescaping
- if a key is not quoted, the `.` character has a special meaning and
- keys may be unquoted strings (see below for detailed definition)
- only if a key is unquoted, the `.` character has a special meaning and
creates a new object. So `foo.bar = 10` means to create an object at key
`foo`, then inside that object, create a key `bar` with value `10`
`foo`, then inside that object, create a key `bar` with value
`10`.
- quoted keys _should not_ contain the `.` character because it's
confusing, but it is permitted (to preserve the ability to use any string
as a key and thus convert an arbitrary map or JavaScript object into HOCON)
@ -100,32 +100,46 @@ Different from JSON:
- FIXME prepend operator?
- a new type of value exists, substitution, which looks like `${some.path}`
(details below)
- to support substitutions, a value may consist of multiple strings which
are concatenated into one string. `"foo"${some.path}"bar"`
- String values may sometimes omit quotes. If a value does not parse as a
substitution, quoted string, number, object, array, true, false, or null,
then that value will be parsed as a string value, created as
follows:
- take the string from the `=` to the first newline or comma
- remove leading and trailing whitespace (whitespace defined
only as ASCII whitespace, as with Java's trim() method)
- what remains is treated as a sequence of strings, where
each string is either the raw inline UTF-8 data, a quoted
string, or a substitution
- everything up to a `"` or `$` is a raw unquoted UTF-8 string;
no unescaping is performed
- at `"` a quoted string is parsed, with the usual escape
sequences; after the close `"` parsing the unquoted string
continues. The quoted string must be well-formed or it's
an error.
- at `$` a substitution is parsed. The substitution must be well-formed
or it's an error.
- to get a literal `"`, `$`, newline or comma, you would have to use
a quoted string
- after the initial raw string, quoted string, or substitution,
parsing another one immediately begins and so on until the
end of the value.
- the resulting sequence of strings is concatenated
- String values may sometimes omit quotes.
- Unquoted strings may not contain '$', '"', '{', '}',
'[', ']', ':', '=', ',', or '\' (backslash) and may not
contain whitespace (including newlines).
- Unquoted strings do not support any form of escaping; the
characters are all left as-is. If you need to use special
characters or escaping, you have to quote the string.
- Because of "value concatenation" rules (see below) you can
write a sentence with whitespace unquoted, though.
- Any unquoted series of characters that parses as a
substitution, true, false, null, number, or quoted string
will be treated as the type it parses as, rather than as
an unquoted string. However, in "value concatenation"
the non-string types convert to strings, which means
you can have the word "true" in an unquoted sentence.
- true, false, null, numbers only parse as such when they appear
at the start of an unquoted token — that is, at the start of
the value or immediately after at least one character that is
not allowed in unquoted strings. So `truefoo` is
the value `true` then the unquoted string `foo`, but
`footrue` is the unquoted string `footrue`.
- quoted strings and substitutions always parse as such
since they begin with a character that can't be in an
unquoted string.
- Value concatenation: to support substitutions, and unquoted
sentences with whitespace, a value may consist of multiple
values which are concatenated into one
string. `"foo"${some.path}"bar"` or `The quick brown fox`.
- let a "simple value" be the set of JSON values excluding
objects and arrays, and including unquoted strings and
substitutions.
- as long as simple values are separated only by non-newline
whitespace, the _whitespace between them is preserved_
and the values, along with the whitespace, are concatenated
into a string.
- Whitespace before the first and after the last simple value
will be discarded. Only whitespace _between_ simple values
is preserved.
- concatenation never spans a newline or a non-simple-value
token.
- the result of the concatenation is a string value.
- the special key `include` followed directly by a string value (with no
`=`) means to treat that string value as a filename and merge the
object defined in that file into the current object, overriding

View File

@ -25,6 +25,22 @@ final class Tokenizer {
private int oneCharBuffer;
private int lineNumber;
private Queue<Token> tokens;
// has to be saved inside value concatenations
private StringBuilder whitespace;
// may need to value-concat with next value
private boolean lastTokenWasSimpleValue;
// Builds a token iterator reading from `input`, attributing tokens to
// `origin`. The stream always begins with a START token.
TokenIterator(ConfigOrigin origin, Reader input) {
    this.origin = origin;
    this.input = input;
    this.oneCharBuffer = -1; // no pushed-back char yet
    this.lineNumber = 0;
    this.whitespace = new StringBuilder();
    this.lastTokenWasSimpleValue = false;
    this.tokens = new LinkedList<Token>();
    this.tokens.add(Tokens.START);
}
private int nextChar() {
if (oneCharBuffer >= 0) {
@ -49,15 +65,26 @@ final class Tokenizer {
oneCharBuffer = c;
}
static boolean isWhitespace(int c) {
    // check the two most frequent characters first; fall back to
    // the full Unicode test only for everything else
    if (c == ' ' || c == '\n')
        return true;
    return Character.isWhitespace(c);
}
static boolean isWhitespaceNotNewline(int c) {
    // any whitespace except '\n'; ' ' is special-cased as the
    // common case to avoid the full Unicode check
    return c != '\n' && (c == ' ' || Character.isWhitespace(c));
}
// get next char, skipping non-newline whitespace
private int nextCharAfterWhitespace() {
for (;;) {
int c = nextChar();
if (c == -1) {
return -1;
} else if (c == '\n') {
return c;
} else if (Character.isWhitespace(c)) {
} else if (isWhitespaceNotNewline(c)) {
if (lastTokenWasSimpleValue)
whitespace.appendCodePoint(c);
continue;
} else {
return c;
@ -83,7 +110,7 @@ final class Tokenizer {
// chars JSON allows to be part of a number
static final String numberChars = "0123456789eE+-.";
// chars that stop an unquoted string
static final String notInUnquotedText = "$\"{}[]:=\n,";
static final String notInUnquotedText = "$\"{}[]:=,\\";
// The rules here are intended to maximize convenience while
// avoiding confusion with real valid JSON. Basically anything
@ -98,12 +125,15 @@ final class Tokenizer {
break;
} else if (notInUnquotedText.indexOf(c) >= 0) {
break;
} else if (isWhitespace(c)) {
break;
} else {
sb.append((char) c);
}
// we parse true/false/null tokens as such no matter
// what is after them.
// what is after them, as long as they are at the
// start of the unquoted token.
if (sb.length() == 4) {
String s = sb.toString();
if (s.equals("true"))
@ -122,8 +152,7 @@ final class Tokenizer {
// put back the char that ended the unquoted text
putBack(c);
// chop trailing whitespace; have to quote to have trailing spaces.
String s = sb.toString().trim();
String s = sb.toString();
return Tokens.newUnquotedText(origin, s);
}
@ -233,19 +262,50 @@ final class Tokenizer {
return Tokens.newString(lineOrigin(), sb.toString());
}
// Record that the upcoming token is NOT a simple value: any
// whitespace saved between simple values can no longer take part
// in a value concatenation, so throw it away.
private void nextIsNotASimpleValue() {
    whitespace.setLength(0);
    lastTokenWasSimpleValue = false;
}
// Record that the upcoming token IS a simple value. If the previous
// token was also one, flush the whitespace saved between them as an
// unquoted-text token, giving the parser the option to concatenate.
private void nextIsASimpleValue() {
    if (lastTokenWasSimpleValue && whitespace.length() > 0) {
        tokens.add(Tokens.newUnquotedText(lineOrigin(),
                whitespace.toString()));
    }
    // either way we are now just after a simple value with no
    // whitespace pending
    lastTokenWasSimpleValue = true;
    whitespace.setLength(0);
}
private void queueNextToken() {
int c = nextCharAfterWhitespace();
if (c == -1) {
nextIsNotASimpleValue();
tokens.add(Tokens.END);
} else if (c == '\n') {
// newline tokens have the just-ended line number
nextIsNotASimpleValue();
tokens.add(Tokens.newLine(lineNumber));
lineNumber += 1;
} else {
Token t = null;
boolean tIsSimpleValue = false;
switch (c) {
case '"':
t = pullQuotedString();
tIsSimpleValue = true;
break;
case ':':
t = Tokens.COLON;
@ -270,6 +330,7 @@ final class Tokenizer {
if (t == null) {
if (firstNumberChars.indexOf(c) >= 0) {
t = pullNumber(c);
tIsSimpleValue = true;
} else if (notInUnquotedText.indexOf(c) >= 0) {
throw parseError(String
.format("Character '%c' is not the start of any valid token",
@ -277,25 +338,24 @@ final class Tokenizer {
} else {
putBack(c);
t = pullUnquotedText();
tIsSimpleValue = true;
}
}
if (t == null)
throw new ConfigException.BugOrBroken(
"bug: failed to generate next token");
if (tIsSimpleValue) {
nextIsASimpleValue();
} else {
nextIsNotASimpleValue();
}
tokens.add(t);
}
}
// Builds a token iterator reading from `input`, attributing tokens
// to `origin`; the stream is seeded with the START token.
TokenIterator(ConfigOrigin origin, Reader input) {
    this.origin = origin;
    this.input = input;
    this.oneCharBuffer = -1; // nothing pushed back yet
    this.lineNumber = 0;
    this.tokens = new LinkedList<Token>();
    this.tokens.add(Tokens.START);
}
@Override
public boolean hasNext() {
return !tokens.isEmpty();
@ -304,7 +364,7 @@ final class Tokenizer {
@Override
public Token next() {
Token t = tokens.remove();
if (t != Tokens.END) {
if (tokens.isEmpty() && t != Tokens.END) {
queueNextToken();
if (tokens.isEmpty())
throw new ConfigException.BugOrBroken(

View File

@ -66,6 +66,6 @@ class EquivalentsTest extends TestUtils {
// This is a little "checksum" to be sure we really tested what we were expecting.
// it breaks every time you add a file, so you have to update it.
assertEquals(1, dirCount)
assertEquals(1, fileCount)
assertEquals(2, fileCount)
}
}

View File

@ -63,25 +63,21 @@ class TokenizerTest extends TestUtils {
@Test
def tokenizeAllTypesWithSingleSpaces() {
// all token types with no spaces (not sure JSON spec wants this to work,
// but spec is unclear to me when spaces are required, and banning them
// is actually extra work)
val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
tokenLong(42), tokenTrue, tokenDouble(3.14),
tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
Tokens.newLine(0), Tokens.END)
assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
}
@Test
def tokenizeAllTypesWithMultipleSpaces() {
// all token types with no spaces (not sure JSON spec wants this to work,
// but spec is unclear to me when spaces are required, and banning them
// is actually extra work)
val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
tokenLong(42), tokenTrue, tokenDouble(3.14),
tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
Tokens.newLine(0), Tokens.END)
assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
}
@ -111,13 +107,13 @@ class TokenizerTest extends TestUtils {
@Test
def tokenizeUnquotedTextContainingSpaceTrue() {
val expected = List(Tokens.START, tokenUnquoted("foo true"), Tokens.END)
val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END)
assertEquals(expected, tokenizeAsList("""foo true"""))
}
@Test
def tokenizeTrueAndSpaceAndUnquotedText() {
val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END)
val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END)
assertEquals(expected, tokenizeAsList("""true foo"""))
}
@ -129,7 +125,8 @@ class TokenizerTest extends TestUtils {
@Test
def tokenizeUnquotedTextKeepsInternalSpaces() {
val expected = List(Tokens.START, tokenUnquoted("foo bar baz"), Tokens.newLine(0), Tokens.END)
val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"),
tokenUnquoted(" "), tokenUnquoted("baz"), Tokens.newLine(0), Tokens.END)
assertEquals(expected, tokenizeAsList(" foo bar baz \n"))
}