diff --git a/SPEC.md b/SPEC.md index 19389b63..e1a865b1 100644 --- a/SPEC.md +++ b/SPEC.md @@ -72,11 +72,11 @@ Different from JSON: instead - keys with an object as their value may omit `=`, so `foo { }` means `foo = { }` - - keys which contain no whitespace need not be quoted; the string - is then used literally with no unescaping - - if a key is not quoted, the `.` character has a special meaning and + - keys may be unquoted strings (see below for detailed definition) + - only if a key is unquoted, the `.` character has a special meaning and creates a new object. So `foo.bar = 10` means to create an object at key - `foo`, then inside that object, create a key `bar` with value `10` + `foo`, then inside that object, create a key `bar` with value + `10`. - quoted keys _should not_ contain the `.` character because it's confusing, but it is permitted (to preserve the ability to use any string as a key and thus convert an arbitrary map or JavaScript object into HOCON) @@ -100,32 +100,46 @@ Different from JSON: - FIXME prepend operator? - a new type of value exists, substitution, which looks like `${some.path}` (details below) - - to support substitutions, a value may consist of multiple strings which - are concatenated into one string. `"foo"${some.path}"bar"` - - String values may sometimes omit quotes. 
If a value does not parse as a - substitution, quoted string, number, object, array, true, false, or null, - then that value will be parsed as a string value, created as - follows: - - take the string from the `=` to the first newline or comma - - remove leading and trailing whitespace (whitespace defined - only as ASCII whitespace, as with Java's trim() method) - - what remains is treated as a sequence of strings, where - each string is either the raw inline UTF-8 data, a quoted - string, or a substitution - - everything up to a `"` or `$` is a raw unquoted UTF-8 string; - no unescaping is performed - - at `"` a quoted string is parsed, with the usual escape - sequences; after the close `"` parsing the unquoted string - continues. The quoted string must be well-formed or it's - an error. - - at `$` a substitution is parsed. The substitution must be well-formed - or it's an error. - - to get a literal `"`, `$`, newline or comma, you would have to use - a quoted string - - after the initial raw string, quoted string, or substitution, - parsing another one immediately begins and so on until the - end of the value. - - the resulting sequence of strings is concatenated + - String values may sometimes omit quotes. + - Unquoted strings may not contain '$', '"', '{', '}', + '[', ']', ':', '=', ',', or '\' (backslash) and may not + contain whitespace (including newlines). + - Unquoted strings do not support any form of escaping; the + characters are all left as-is. If you need to use special + characters or escaping, you have to quote the string. + - Because of "value concatenation" rules (see below) you can + write a sentence with whitespace unquoted, though. + - Any unquoted series of characters that parses as a + substitution, true, false, null, number, or quoted string + will be treated as the type it parses as, rather than as + an unquoted string. 
However, in "value concatenation" + the non-string types convert to strings, which means + you can have the word "true" in an unquoted sentence. + - true, false, null, and numbers only parse as such if they + immediately follow at least one character that is not + allowed in unquoted strings. For example, `truefoo` is + the value `true` followed by the unquoted string `foo`, but + `footrue` is the unquoted string `footrue`. + - quoted strings and substitutions always parse as such + since they begin with a character that can't be in an + unquoted string. + - Value concatenation: to support substitutions, and unquoted + sentences with whitespace, a value may consist of multiple + values which are concatenated into one + string, for example `"foo"${some.path}"bar"` or `The quick brown fox`. + - a "simple value" is any JSON value except an + object or array; unquoted strings and substitutions + are also simple values. + - as long as simple values are separated only by non-newline + whitespace, the _whitespace between them is preserved_ + and the values, along with the whitespace, are concatenated + into a string. + - Whitespace before the first and after the last simple value + will be discarded. Only whitespace _between_ simple values + is preserved. + - concatenation never spans a newline or a non-simple-value + token. + - the result of the concatenation is a string value.
- the special key `include` followed directly by a string value (with no `=`) means to treat that string value as a filename and merge the object defined in that file into the current object, overriding diff --git a/src/main/java/com/typesafe/config/impl/Tokenizer.java b/src/main/java/com/typesafe/config/impl/Tokenizer.java index 4a46011e..a4807bca 100644 --- a/src/main/java/com/typesafe/config/impl/Tokenizer.java +++ b/src/main/java/com/typesafe/config/impl/Tokenizer.java @@ -25,6 +25,22 @@ final class Tokenizer { private int oneCharBuffer; private int lineNumber; private Queue<Token> tokens; + // has to be saved inside value concatenations + private StringBuilder whitespace; + // may need to value-concat with next value + private boolean lastTokenWasSimpleValue; + + TokenIterator(ConfigOrigin origin, Reader input) { + this.origin = origin; + this.input = input; + oneCharBuffer = -1; + lineNumber = 0; + tokens = new LinkedList<Token>(); + tokens.add(Tokens.START); + whitespace = new StringBuilder(); + lastTokenWasSimpleValue = false; + } + private int nextChar() { if (oneCharBuffer >= 0) { @@ -49,15 +65,26 @@ final class Tokenizer { oneCharBuffer = c; } + static boolean isWhitespace(int c) { + // hoping this optimizes slightly by catching the most common ' ' + // case up front. 
+ return c == ' ' || c == '\n' || Character.isWhitespace(c); + } + + static boolean isWhitespaceNotNewline(int c) { + return c == ' ' || (c != '\n' && Character.isWhitespace(c)); + } + + // get next char, skipping non-newline whitespace private int nextCharAfterWhitespace() { for (;;) { int c = nextChar(); if (c == -1) { return -1; - } else if (c == '\n') { - return c; - } else if (Character.isWhitespace(c)) { + } else if (isWhitespaceNotNewline(c)) { + if (lastTokenWasSimpleValue) + whitespace.appendCodePoint(c); continue; } else { return c; @@ -83,7 +110,7 @@ final class Tokenizer { // chars JSON allows to be part of a number static final String numberChars = "0123456789eE+-."; // chars that stop an unquoted string - static final String notInUnquotedText = "$\"{}[]:=\n,"; + static final String notInUnquotedText = "$\"{}[]:=,\\"; // The rules here are intended to maximize convenience while // avoiding confusion with real valid JSON. Basically anything @@ -98,12 +125,15 @@ final class Tokenizer { break; } else if (notInUnquotedText.indexOf(c) >= 0) { break; + } else if (isWhitespace(c)) { + break; } else { sb.append((char) c); } // we parse true/false/null tokens as such no matter - // what is after them. + // what is after them, as long as they are at the + // start of the unquoted token. if (sb.length() == 4) { String s = sb.toString(); if (s.equals("true")) @@ -122,8 +152,7 @@ final class Tokenizer { // put back the char that ended the unquoted text putBack(c); - // chop trailing whitespace; have to quote to have trailing spaces. - String s = sb.toString().trim(); + String s = sb.toString(); return Tokens.newUnquotedText(origin, s); } @@ -233,19 +262,50 @@ final class Tokenizer { return Tokens.newString(lineOrigin(), sb.toString()); } + // called if the next token is not a simple value; + // discards any whitespace we were saving between + // simple values. 
+ private void nextIsNotASimpleValue() { + lastTokenWasSimpleValue = false; + whitespace.setLength(0); + } + + // called if the next token IS a simple value, + // so creates a whitespace token if the previous + // token also was. + private void nextIsASimpleValue() { + if (lastTokenWasSimpleValue) { + // need to save whitespace between the two so + // the parser has the option to concatenate it. + if (whitespace.length() > 0) { + tokens.add(Tokens.newUnquotedText(lineOrigin(), + whitespace.toString())); + whitespace.setLength(0); // reset + } + // lastTokenWasSimpleValue = true still + } else { + lastTokenWasSimpleValue = true; + whitespace.setLength(0); + } + } + private void queueNextToken() { int c = nextCharAfterWhitespace(); if (c == -1) { + nextIsNotASimpleValue(); tokens.add(Tokens.END); } else if (c == '\n') { // newline tokens have the just-ended line number + nextIsNotASimpleValue(); tokens.add(Tokens.newLine(lineNumber)); lineNumber += 1; } else { Token t = null; + boolean tIsSimpleValue = false; switch (c) { case '"': t = pullQuotedString(); + tIsSimpleValue = true; break; case ':': t = Tokens.COLON; @@ -270,6 +330,7 @@ final class Tokenizer { if (t == null) { if (firstNumberChars.indexOf(c) >= 0) { t = pullNumber(c); + tIsSimpleValue = true; } else if (notInUnquotedText.indexOf(c) >= 0) { throw parseError(String .format("Character '%c' is not the start of any valid token", @@ -277,25 +338,24 @@ final class Tokenizer { } else { putBack(c); t = pullUnquotedText(); + tIsSimpleValue = true; } } if (t == null) throw new ConfigException.BugOrBroken( "bug: failed to generate next token"); + + if (tIsSimpleValue) { + nextIsASimpleValue(); + } else { + nextIsNotASimpleValue(); + } + tokens.add(t); } } - TokenIterator(ConfigOrigin origin, Reader input) { - this.origin = origin; - this.input = input; - oneCharBuffer = -1; - lineNumber = 0; - tokens = new LinkedList<Token>(); - tokens.add(Tokens.START); - } - @Override public boolean hasNext() { return 
!tokens.isEmpty(); @@ -304,7 +364,7 @@ final class Tokenizer { @Override public Token next() { Token t = tokens.remove(); - if (t != Tokens.END) { + if (tokens.isEmpty() && t != Tokens.END) { queueNextToken(); if (tokens.isEmpty()) throw new ConfigException.BugOrBroken( diff --git a/src/test/scala/com/typesafe/config/impl/EquivalentsTest.scala b/src/test/scala/com/typesafe/config/impl/EquivalentsTest.scala index 63400762..a90324b1 100644 --- a/src/test/scala/com/typesafe/config/impl/EquivalentsTest.scala +++ b/src/test/scala/com/typesafe/config/impl/EquivalentsTest.scala @@ -66,6 +66,6 @@ class EquivalentsTest extends TestUtils { // This is a little "checksum" to be sure we really tested what we were expecting. // it breaks every time you add a file, so you have to update it. assertEquals(1, dirCount) - assertEquals(1, fileCount) + assertEquals(2, fileCount) } } diff --git a/src/test/scala/com/typesafe/config/impl/TokenizerTest.scala b/src/test/scala/com/typesafe/config/impl/TokenizerTest.scala index a32952a1..85ec661f 100644 --- a/src/test/scala/com/typesafe/config/impl/TokenizerTest.scala +++ b/src/test/scala/com/typesafe/config/impl/TokenizerTest.scala @@ -63,25 +63,21 @@ class TokenizerTest extends TestUtils { @Test def tokenizeAllTypesWithSingleSpaces() { - // all token types with no spaces (not sure JSON spec wants this to work, - // but spec is unclear to me when spaces are required, and banning them - // is actually extra work) val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY, Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"), - tokenLong(42), tokenTrue, tokenDouble(3.14), - tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END) + tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "), + tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull, + Tokens.newLine(0), Tokens.END) assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 
3.14 false null """ + "\n ")) } @Test def tokenizeAllTypesWithMultipleSpaces() { - // all token types with no spaces (not sure JSON spec wants this to work, - // but spec is unclear to me when spaces are required, and banning them - // is actually extra work) val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY, Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"), - tokenLong(42), tokenTrue, tokenDouble(3.14), - tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END) + tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "), + tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull, + Tokens.newLine(0), Tokens.END) assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n ")) } @@ -111,13 +107,13 @@ class TokenizerTest extends TestUtils { @Test def tokenizeUnquotedTextContainingSpaceTrue() { - val expected = List(Tokens.START, tokenUnquoted("foo true"), Tokens.END) + val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END) assertEquals(expected, tokenizeAsList("""foo true""")) } @Test def tokenizeTrueAndSpaceAndUnquotedText() { - val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END) + val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END) assertEquals(expected, tokenizeAsList("""true foo""")) } @@ -129,7 +125,8 @@ class TokenizerTest extends TestUtils { @Test def tokenizeUnquotedTextKeepsInternalSpaces() { - val expected = List(Tokens.START, tokenUnquoted("foo bar baz"), Tokens.newLine(0), Tokens.END) + val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"), + tokenUnquoted(" "), tokenUnquoted("baz"), Tokens.newLine(0), Tokens.END) assertEquals(expected, tokenizeAsList(" foo bar baz \n")) }