mirror of
https://github.com/lightbend/config.git
synced 2025-01-15 23:01:05 +08:00
Make unquoted string values really work
This commit is contained in:
parent
e3e1d7392d
commit
9b2a96aef4
74
SPEC.md
74
SPEC.md
@ -72,11 +72,11 @@ Different from JSON:
|
||||
instead
|
||||
- keys with an object as their value may omit `=`, so `foo { }` means
|
||||
`foo = { }`
|
||||
- keys which contain no whitespace need not be quoted; the string
|
||||
is then used literally with no unescaping
|
||||
- if a key is not quoted, the `.` character has a special meaning and
|
||||
- keys may be unquoted strings (see below for detailed definition)
|
||||
- only if a key is unquoted, the `.` character has a special meaning and
|
||||
creates a new object. So `foo.bar = 10` means to create an object at key
|
||||
`foo`, then inside that object, create a key `bar` with value `10`
|
||||
`foo`, then inside that object, create a key `bar` with value
|
||||
`10`.
|
||||
- quoted keys _should not_ contain the `.` character because it's
|
||||
confusing, but it is permitted (to preserve the ability to use any string
|
||||
as a key and thus convert an arbitrary map or JavaScript object into HOCON)
|
||||
@ -100,32 +100,46 @@ Different from JSON:
|
||||
- FIXME prepend operator?
|
||||
- a new type of value exists, substitution, which looks like `${some.path}`
|
||||
(details below)
|
||||
- to support substitutions, a value may consist of multiple strings which
|
||||
are concatenated into one string. `"foo"${some.path}"bar"`
|
||||
- String values may sometimes omit quotes. If a value does not parse as a
|
||||
substitution, quoted string, number, object, array, true, false, or null,
|
||||
then that value will be parsed as a string value, created as
|
||||
follows:
|
||||
- take the string from the `=` to the first newline or comma
|
||||
- remove leading and trailing whitespace (whitespace defined
|
||||
only as ASCII whitespace, as with Java's trim() method)
|
||||
- what remains is treated as a sequence of strings, where
|
||||
each string is either the raw inline UTF-8 data, a quoted
|
||||
string, or a substitution
|
||||
- everything up to a `"` or `$` is a raw unquoted UTF-8 string;
|
||||
no unescaping is performed
|
||||
- at `"` a quoted string is parsed, with the usual escape
|
||||
sequences; after the close `"` parsing the unquoted string
|
||||
continues. The quoted string must be well-formed or it's
|
||||
an error.
|
||||
- at `$` a substitution is parsed. The substitution must be well-formed
|
||||
or it's an error.
|
||||
- to get a literal `"`, `$`, newline or comma, you would have to use
|
||||
a quoted string
|
||||
- after the initial raw string, quoted string, or substitution,
|
||||
parsing another one immediately begins and so on until the
|
||||
end of the value.
|
||||
- the resulting sequence of strings is concatenated
|
||||
- String values may sometimes omit quotes.
|
||||
- Unquoted strings may not contain '$', '"', '{', '}',
|
||||
'[', ']', ':', '=', ',', or '\' (backslash) and may not
|
||||
contain whitespace (including newlines).
|
||||
- Unquoted strings do not support any form of escaping; the
|
||||
characters are all left as-is. If you need to use special
|
||||
characters or escaping, you have to quote the string.
|
||||
- Because of "value concatenation" rules (see below) you can
|
||||
write a sentence with whitespace unquoted, though.
|
||||
- Any unquoted series of characters that parses as a
|
||||
substitution, true, false, null, number, or quoted string
|
||||
will be treated as the type it parses as, rather than as
|
||||
an unquoted string. However, in "value concatenation"
|
||||
the non-string types convert to strings, which means
|
||||
you can have the word "true" in an unquoted sentence.
|
||||
- true, false, null, and numbers only parse as such if they
|
||||
immediately follow at least one character that is not
|
||||
allowed in unquoted strings. That is, `truefoo` is
|
||||
the value `true` then the unquoted string `foo`, but
|
||||
`footrue` is the unquoted string `footrue`.
|
||||
- quoted strings and substitutions always parse as such
|
||||
since they begin with a character that can't be in an
|
||||
unquoted string.
|
||||
- Value concatenation: to support substitutions, and unquoted
|
||||
sentences with whitespace, a value may consist of multiple
|
||||
values which are concatenated into one
|
||||
string, for example `"foo"${some.path}"bar"` or `The quick brown fox`.
|
||||
- let a "simple value" be the set of JSON values excluding
|
||||
objects and arrays, and including unquoted strings and
|
||||
substitutions.
|
||||
- as long as simple values are separated only by non-newline
|
||||
whitespace, the _whitespace between them is preserved_
|
||||
and the values, along with the whitespace, are concatenated
|
||||
into a string.
|
||||
- Whitespace before the first and after the last simple value
|
||||
will be discarded. Only whitespace _between_ simple values
|
||||
is preserved.
|
||||
- concatenation never spans a newline or a non-simple-value
|
||||
token.
|
||||
- the result of the concatenation is a string value.
|
||||
- the special key `include` followed directly by a string value (with no
|
||||
`=`) means to treat that string value as a filename and merge the
|
||||
object defined in that file into the current object, overriding
|
||||
|
@ -25,6 +25,22 @@ final class Tokenizer {
|
||||
private int oneCharBuffer;
|
||||
private int lineNumber;
|
||||
private Queue<Token> tokens;
|
||||
// has to be saved inside value concatenations
|
||||
private StringBuilder whitespace;
|
||||
// may need to value-concat with next value
|
||||
private boolean lastTokenWasSimpleValue;
|
||||
|
||||
// Creates a token iterator over the given input, labelled with the
// given origin. The token queue is seeded with Tokens.START, and the
// value-concatenation state (saved whitespace, "last token was a
// simple value" flag) starts out empty.
TokenIterator(ConfigOrigin origin, Reader input) {
|
||||
this.origin = origin;
|
||||
this.input = input;
|
||||
// negative means no character is buffered; nextChar() only uses the
// buffer when oneCharBuffer >= 0
oneCharBuffer = -1;
|
||||
lineNumber = 0;
|
||||
tokens = new LinkedList<Token>();
|
||||
// START is queued up front, before anything is read from the input
tokens.add(Tokens.START);
|
||||
// collects whitespace seen after a simple value (see
// nextCharAfterWhitespace)
whitespace = new StringBuilder();
|
||||
lastTokenWasSimpleValue = false;
|
||||
}
|
||||
|
||||
|
||||
private int nextChar() {
|
||||
if (oneCharBuffer >= 0) {
|
||||
@ -49,15 +65,26 @@ final class Tokenizer {
|
||||
oneCharBuffer = c;
|
||||
}
|
||||
|
||||
// Returns true if c is whitespace, newline included: a space, '\n',
// or any code point accepted by Character.isWhitespace(int).
static boolean isWhitespace(int c) {
|
||||
// hoping this optimizes slightly by catching the most common ' '
|
||||
// case up front.
|
||||
return c == ' ' || c == '\n' || Character.isWhitespace(c);
|
||||
}
|
||||
|
||||
// Returns true if c is whitespace other than '\n': a space, or any
// code point except '\n' accepted by Character.isWhitespace(int).
static boolean isWhitespaceNotNewline(int c) {
|
||||
// ' ' is tested first; '\n' is excluded explicitly because
// Character.isWhitespace would otherwise accept it
return c == ' ' || (c != '\n' && Character.isWhitespace(c));
|
||||
}
|
||||
|
||||
// get next char, skipping non-newline whitespace
|
||||
private int nextCharAfterWhitespace() {
|
||||
for (;;) {
|
||||
int c = nextChar();
|
||||
|
||||
if (c == -1) {
|
||||
return -1;
|
||||
} else if (c == '\n') {
|
||||
return c;
|
||||
} else if (Character.isWhitespace(c)) {
|
||||
} else if (isWhitespaceNotNewline(c)) {
|
||||
if (lastTokenWasSimpleValue)
|
||||
whitespace.appendCodePoint(c);
|
||||
continue;
|
||||
} else {
|
||||
return c;
|
||||
@ -83,7 +110,7 @@ final class Tokenizer {
|
||||
// chars JSON allows to be part of a number
|
||||
static final String numberChars = "0123456789eE+-.";
|
||||
// chars that stop an unquoted string
|
||||
static final String notInUnquotedText = "$\"{}[]:=\n,";
|
||||
static final String notInUnquotedText = "$\"{}[]:=,\\";
|
||||
|
||||
// The rules here are intended to maximize convenience while
|
||||
// avoiding confusion with real valid JSON. Basically anything
|
||||
@ -98,12 +125,15 @@ final class Tokenizer {
|
||||
break;
|
||||
} else if (notInUnquotedText.indexOf(c) >= 0) {
|
||||
break;
|
||||
} else if (isWhitespace(c)) {
|
||||
break;
|
||||
} else {
|
||||
sb.append((char) c);
|
||||
}
|
||||
|
||||
// we parse true/false/null tokens as such no matter
|
||||
// what is after them.
|
||||
// what is after them, as long as they are at the
|
||||
// start of the unquoted token.
|
||||
if (sb.length() == 4) {
|
||||
String s = sb.toString();
|
||||
if (s.equals("true"))
|
||||
@ -122,8 +152,7 @@ final class Tokenizer {
|
||||
// put back the char that ended the unquoted text
|
||||
putBack(c);
|
||||
|
||||
// chop trailing whitespace; have to quote to have trailing spaces.
|
||||
String s = sb.toString().trim();
|
||||
String s = sb.toString();
|
||||
return Tokens.newUnquotedText(origin, s);
|
||||
}
|
||||
|
||||
@ -233,19 +262,50 @@ final class Tokenizer {
|
||||
return Tokens.newString(lineOrigin(), sb.toString());
|
||||
}
|
||||
|
||||
// Called when the token about to be queued is not a simple value.
|
||||
// Ends any run of adjacent simple values and discards whatever
|
||||
// whitespace was being saved between simple values.
|
||||
private void nextIsNotASimpleValue() {
|
||||
// the token after this one cannot be concatenated with a predecessor
lastTokenWasSimpleValue = false;
|
||||
// drop the saved whitespace without emitting a token for it
whitespace.setLength(0);
|
||||
}
|
||||
|
||||
// Called when the token about to be queued IS a simple value.
|
||||
// If the previous token was also a simple value, any whitespace saved
|
||||
// between the two is queued first, as its own unquoted-text token.
|
||||
private void nextIsASimpleValue() {
|
||||
if (lastTokenWasSimpleValue) {
|
||||
// need to save whitespace between the two so
|
||||
// the parser has the option to concatenate it.
|
||||
if (whitespace.length() > 0) {
|
||||
// the whitespace token is added here, ahead of the simple value the
// caller queues afterwards
tokens.add(Tokens.newUnquotedText(lineOrigin(),
|
||||
whitespace.toString()));
|
||||
whitespace.setLength(0); // reset
|
||||
}
|
||||
// lastTokenWasSimpleValue = true still
|
||||
} else {
|
||||
// first simple value after a non-simple token: start tracking, with
// an empty whitespace buffer
lastTokenWasSimpleValue = true;
|
||||
whitespace.setLength(0);
|
||||
}
|
||||
}
|
||||
|
||||
private void queueNextToken() {
|
||||
int c = nextCharAfterWhitespace();
|
||||
if (c == -1) {
|
||||
nextIsNotASimpleValue();
|
||||
tokens.add(Tokens.END);
|
||||
} else if (c == '\n') {
|
||||
// newline tokens have the just-ended line number
|
||||
nextIsNotASimpleValue();
|
||||
tokens.add(Tokens.newLine(lineNumber));
|
||||
lineNumber += 1;
|
||||
} else {
|
||||
Token t = null;
|
||||
boolean tIsSimpleValue = false;
|
||||
switch (c) {
|
||||
case '"':
|
||||
t = pullQuotedString();
|
||||
tIsSimpleValue = true;
|
||||
break;
|
||||
case ':':
|
||||
t = Tokens.COLON;
|
||||
@ -270,6 +330,7 @@ final class Tokenizer {
|
||||
if (t == null) {
|
||||
if (firstNumberChars.indexOf(c) >= 0) {
|
||||
t = pullNumber(c);
|
||||
tIsSimpleValue = true;
|
||||
} else if (notInUnquotedText.indexOf(c) >= 0) {
|
||||
throw parseError(String
|
||||
.format("Character '%c' is not the start of any valid token",
|
||||
@ -277,25 +338,24 @@ final class Tokenizer {
|
||||
} else {
|
||||
putBack(c);
|
||||
t = pullUnquotedText();
|
||||
tIsSimpleValue = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (t == null)
|
||||
throw new ConfigException.BugOrBroken(
|
||||
"bug: failed to generate next token");
|
||||
|
||||
if (tIsSimpleValue) {
|
||||
nextIsASimpleValue();
|
||||
} else {
|
||||
nextIsNotASimpleValue();
|
||||
}
|
||||
|
||||
tokens.add(t);
|
||||
}
|
||||
}
|
||||
|
||||
TokenIterator(ConfigOrigin origin, Reader input) {
|
||||
this.origin = origin;
|
||||
this.input = input;
|
||||
oneCharBuffer = -1;
|
||||
lineNumber = 0;
|
||||
tokens = new LinkedList<Token>();
|
||||
tokens.add(Tokens.START);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return !tokens.isEmpty();
|
||||
@ -304,7 +364,7 @@ final class Tokenizer {
|
||||
@Override
|
||||
public Token next() {
|
||||
Token t = tokens.remove();
|
||||
if (t != Tokens.END) {
|
||||
if (tokens.isEmpty() && t != Tokens.END) {
|
||||
queueNextToken();
|
||||
if (tokens.isEmpty())
|
||||
throw new ConfigException.BugOrBroken(
|
||||
|
@ -66,6 +66,6 @@ class EquivalentsTest extends TestUtils {
|
||||
// This is a little "checksum" to be sure we really tested what we were expecting.
|
||||
// it breaks every time you add a file, so you have to update it.
|
||||
assertEquals(1, dirCount)
|
||||
assertEquals(1, fileCount)
|
||||
assertEquals(2, fileCount)
|
||||
}
|
||||
}
|
||||
|
@ -63,25 +63,21 @@ class TokenizerTest extends TestUtils {
|
||||
|
||||
@Test
|
||||
def tokenizeAllTypesWithSingleSpaces() {
|
||||
// all token types separated by single spaces (not sure JSON spec wants this to work,
|
||||
// but spec is unclear to me when spaces are required, and banning them
|
||||
// is actually extra work)
|
||||
val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
|
||||
Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
|
||||
tokenLong(42), tokenTrue, tokenDouble(3.14),
|
||||
tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
|
||||
tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
|
||||
tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
|
||||
Tokens.newLine(0), Tokens.END)
|
||||
assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
|
||||
}
|
||||
|
||||
@Test
|
||||
def tokenizeAllTypesWithMultipleSpaces() {
|
||||
// all token types separated by multiple spaces (not sure JSON spec wants this to work,
|
||||
// but spec is unclear to me when spaces are required, and banning them
|
||||
// is actually extra work)
|
||||
val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
|
||||
Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
|
||||
tokenLong(42), tokenTrue, tokenDouble(3.14),
|
||||
tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
|
||||
tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
|
||||
tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
|
||||
Tokens.newLine(0), Tokens.END)
|
||||
assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
|
||||
}
|
||||
|
||||
@ -111,13 +107,13 @@ class TokenizerTest extends TestUtils {
|
||||
|
||||
@Test
|
||||
def tokenizeUnquotedTextContainingSpaceTrue() {
|
||||
val expected = List(Tokens.START, tokenUnquoted("foo true"), Tokens.END)
|
||||
val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END)
|
||||
assertEquals(expected, tokenizeAsList("""foo true"""))
|
||||
}
|
||||
|
||||
@Test
|
||||
def tokenizeTrueAndSpaceAndUnquotedText() {
|
||||
val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END)
|
||||
val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END)
|
||||
assertEquals(expected, tokenizeAsList("""true foo"""))
|
||||
}
|
||||
|
||||
@ -129,7 +125,8 @@ class TokenizerTest extends TestUtils {
|
||||
|
||||
@Test
|
||||
def tokenizeUnquotedTextKeepsInternalSpaces() {
|
||||
val expected = List(Tokens.START, tokenUnquoted("foo bar baz"), Tokens.newLine(0), Tokens.END)
|
||||
val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"),
|
||||
tokenUnquoted(" "), tokenUnquoted("baz"), Tokens.newLine(0), Tokens.END)
|
||||
assertEquals(expected, tokenizeAsList(" foo bar baz \n"))
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user