Make unquoted string values really work

Havoc Pennington 2011-11-08 09:17:23 -05:00
parent e3e1d7392d
commit 9b2a96aef4
4 changed files with 132 additions and 61 deletions

SPEC.md

@@ -72,11 +72,11 @@ Different from JSON:
    instead
  - keys with an object as their value may omit `=`, so `foo { }` means
    `foo = { }`
- - keys which contain no whitespace need not be quoted; the string
-   is then used literally with no unescaping
- - if a key is not quoted, the `.` character has a special meaning and
+ - keys may be unquoted strings (see below for detailed definition)
+ - only if a key is unquoted, the `.` character has a special meaning and
    creates a new object. So `foo.bar = 10` means to create an object at key
-   `foo`, then inside that object, create a key `bar` with value `10`
+   `foo`, then inside that object, create a key `bar` with value
+   `10`.
  - quoted keys _should not_ contain the `.` character because it's
    confusing, but it is permitted (to preserve the ability to use any string
    as a key and thus convert an arbitrary map or JavaScript object into HOCON)
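
An illustrative sketch of the path-expansion rule (Scala strings holding example HOCON text; the names are only for illustration):

    // These two documents describe the same structure, because the unquoted
    // key "foo.bar" expands into an object at "foo" containing key "bar":
    val dotted   = """foo.bar = 10"""
    val expanded = """foo { bar = 10 }"""
    // A quoted key is taken literally, giving a single top-level key named
    // "foo.bar" rather than a nested object:
    val quotedKey = """ "foo.bar" = 10 """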
@@ -100,32 +100,46 @@ Different from JSON:
  - FIXME prepend operator?
  - a new type of value exists, substitution, which looks like `${some.path}`
    (details below)
- - to support substitutions, a value may consist of multiple strings which
-   are concatenated into one string. `"foo"${some.path}"bar"`
- - String values may sometimes omit quotes. If a value does not parse as a
-   substitution, quoted string, number, object, array, true, false, or null,
-   then that value will be parsed as a string value, created as
-   follows:
-    - take the string from the `=` to the first newline or comma
-    - remove leading and trailing whitespace (whitespace defined
-      only as ASCII whitespace, as with Java's trim() method)
-    - what remains is treated as a sequence of strings, where
-      each string is either the raw inline UTF-8 data, a quoted
-      string, or a substitution
-    - everything up to a `"` or `$` is a raw unquoted UTF-8 string;
-      no unescaping is performed
-    - at `"` a quoted string is parsed, with the usual escape
-      sequences; after the close `"` parsing the unquoted string
-      continues. The quoted string must be well-formed or it's
-      an error.
-    - at `$` a substitution is parsed. The substitution must be well-formed
-      or it's an error.
-    - to get a literal `"`, `$`, newline or comma, you would have to use
-      a quoted string
-    - after the initial raw string, quoted string, or substitution,
-      parsing another one immediately begins and so on until the
-      end of the value.
-    - the resulting sequence of strings is concatenated
+ - String values may sometimes omit quotes.
+    - Unquoted strings may not contain '$', '"', '{', '}',
+      '[', ']', ':', '=', ',', or '\' (backslash) and may not
+      contain whitespace (including newlines).
+    - Unquoted strings do not support any form of escaping; the
+      characters are all left as-is. If you need to use special
+      characters or escaping, you have to quote the string.
+    - Because of "value concatenation" rules (see below) you can
+      write a sentence with whitespace unquoted, though.
+    - Any unquoted series of characters that parses as a
+      substitution, true, false, null, number, or quoted string
+      will be treated as the type it parses as, rather than as
+      an unquoted string. However, in "value concatenation"
+      the non-string types convert to strings, which means
+      you can have the word "true" in an unquoted sentence.
+    - true, false, null, numbers only parse as such if they
+      immediately follow at least one character that is not
+      allowed in unquoted strings. That is, `truefoo` is
+      the value `true` then the unquoted string `foo`, but
+      `footrue` is the unquoted string `footrue`.
+    - quoted strings and substitutions always parse as such
+      since they begin with a character that can't be in an
+      unquoted string.
+ - Value concatenation: to support substitutions, and unquoted
+   sentences with whitespace, a value may consist of multiple
+   values which are concatenated into one
+   string. `"foo"${some.path}"bar"` or `The quick brown fox`.
+    - let a "simple value" be the set of JSON values excluding
+      objects and arrays, and including unquoted strings and
+      substitutions.
+    - as long as simple values are separated only by non-newline
+      whitespace, the _whitespace between them is preserved_
+      and the values, along with the whitespace, are concatenated
+      into a string.
+    - Whitespace before the first and after the last simple value
+      will be discarded. Only whitespace _between_ simple values
+      is preserved.
+    - concatenation never spans a newline or a non-simple-value
+      token.
+    - the result of the concatenation is a string value.
  - the special key `include` followed directly by a string value (with no
    `=`) means to treat that string value as a filename and merge the
    object defined in that file into the current object, overriding
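
A sketch of what the unquoted-string and value-concatenation rules mean at the token level, written with the TokenizerTest helpers that appear later in this commit (tokenizeAsList, tokenUnquoted, and so on); the expected list below is illustrative:

    // An unquoted sentence tokenizes as words separated by preserved
    // whitespace tokens; a later concatenation step can join them back
    // into the single string "The quick brown fox".
    val sentence = List(Tokens.START,
        tokenUnquoted("The"), tokenUnquoted(" "),
        tokenUnquoted("quick"), tokenUnquoted(" "),
        tokenUnquoted("brown"), tokenUnquoted(" "),
        tokenUnquoted("fox"),
        Tokens.END)
    assertEquals(sentence, tokenizeAsList("The quick brown fox"))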

Tokenizer.java

@@ -25,6 +25,22 @@ final class Tokenizer {
     private int oneCharBuffer;
     private int lineNumber;
     private Queue<Token> tokens;
+    // has to be saved inside value concatenations
+    private StringBuilder whitespace;
+    // may need to value-concat with next value
+    private boolean lastTokenWasSimpleValue;
+
+    TokenIterator(ConfigOrigin origin, Reader input) {
+        this.origin = origin;
+        this.input = input;
+        oneCharBuffer = -1;
+        lineNumber = 0;
+        tokens = new LinkedList<Token>();
+        tokens.add(Tokens.START);
+        whitespace = new StringBuilder();
+        lastTokenWasSimpleValue = false;
+    }

     private int nextChar() {
         if (oneCharBuffer >= 0) {
@@ -49,15 +65,26 @@ final class Tokenizer {
         oneCharBuffer = c;
     }

+    static boolean isWhitespace(int c) {
+        // hoping this optimizes slightly by catching the most common ' '
+        // case up front.
+        return c == ' ' || c == '\n' || Character.isWhitespace(c);
+    }
+
+    static boolean isWhitespaceNotNewline(int c) {
+        return c == ' ' || (c != '\n' && Character.isWhitespace(c));
+    }
+
+    // get next char, skipping non-newline whitespace
     private int nextCharAfterWhitespace() {
         for (;;) {
             int c = nextChar();
             if (c == -1) {
                 return -1;
-            } else if (c == '\n') {
-                return c;
-            } else if (Character.isWhitespace(c)) {
+            } else if (isWhitespaceNotNewline(c)) {
+                if (lastTokenWasSimpleValue)
+                    whitespace.appendCodePoint(c);
                 continue;
             } else {
                 return c;
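
The effect of buffering that whitespace, sketched with the TokenizerTest helpers (expectations are illustrative and assume single spaces): whitespace becomes a token only when it sits between two simple values, and it is discarded when it runs into end-of-input or a newline.

    // the space between the two simple values "foo" and 42 survives as an
    // unquoted-text token...
    assertEquals(List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "),
        tokenLong(42), Tokens.END),
        tokenizeAsList("foo 42"))
    // ...while leading whitespace, and whitespace followed by a newline,
    // is dropped
    assertEquals(List(Tokens.START, tokenUnquoted("foo"), Tokens.newLine(0), Tokens.END),
        tokenizeAsList(" foo \n"))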
@@ -83,7 +110,7 @@ final class Tokenizer {
     // chars JSON allows to be part of a number
     static final String numberChars = "0123456789eE+-.";
     // chars that stop an unquoted string
-    static final String notInUnquotedText = "$\"{}[]:=\n,";
+    static final String notInUnquotedText = "$\"{}[]:=,\\";

     // The rules here are intended to maximize convenience while
     // avoiding confusion with real valid JSON. Basically anything
@@ -98,12 +125,15 @@ final class Tokenizer {
                 break;
             } else if (notInUnquotedText.indexOf(c) >= 0) {
                 break;
+            } else if (isWhitespace(c)) {
+                break;
             } else {
                 sb.append((char) c);
             }

             // we parse true/false/null tokens as such no matter
-            // what is after them.
+            // what is after them, as long as they are at the
+            // start of the unquoted token.
             if (sb.length() == 4) {
                 String s = sb.toString();
                 if (s.equals("true"))
@@ -122,8 +152,7 @@ final class Tokenizer {
         // put back the char that ended the unquoted text
         putBack(c);

-        // chop trailing whitespace; have to quote to have trailing spaces.
-        String s = sb.toString().trim();
+        String s = sb.toString();
         return Tokens.newUnquotedText(origin, s);
     }
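
A sketch of the `truefoo` / `footrue` distinction that SPEC.md describes, again using the TokenizerTest helpers (illustrative expectations):

    // "true" is recognized only at the start of an unquoted token...
    assertEquals(List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END),
        tokenizeAsList("truefoo"))
    // ...so when the keyword is not at the start, the whole run of
    // characters stays one unquoted string
    assertEquals(List(Tokens.START, tokenUnquoted("footrue"), Tokens.END),
        tokenizeAsList("footrue"))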
@@ -233,19 +262,50 @@ final class Tokenizer {
         return Tokens.newString(lineOrigin(), sb.toString());
     }

+    // called if the next token is not a simple value;
+    // discards any whitespace we were saving between
+    // simple values.
+    private void nextIsNotASimpleValue() {
+        lastTokenWasSimpleValue = false;
+        whitespace.setLength(0);
+    }
+
+    // called if the next token IS a simple value,
+    // so creates a whitespace token if the previous
+    // token also was.
+    private void nextIsASimpleValue() {
+        if (lastTokenWasSimpleValue) {
+            // need to save whitespace between the two so
+            // the parser has the option to concatenate it.
+            if (whitespace.length() > 0) {
+                tokens.add(Tokens.newUnquotedText(lineOrigin(),
+                        whitespace.toString()));
+                whitespace.setLength(0); // reset
+            }
+            // lastTokenWasSimpleValue = true still
+        } else {
+            lastTokenWasSimpleValue = true;
+            whitespace.setLength(0);
+        }
+    }
+
     private void queueNextToken() {
         int c = nextCharAfterWhitespace();
         if (c == -1) {
+            nextIsNotASimpleValue();
             tokens.add(Tokens.END);
         } else if (c == '\n') {
             // newline tokens have the just-ended line number
+            nextIsNotASimpleValue();
             tokens.add(Tokens.newLine(lineNumber));
             lineNumber += 1;
         } else {
             Token t = null;
+            boolean tIsSimpleValue = false;
             switch (c) {
             case '"':
                 t = pullQuotedString();
+                tIsSimpleValue = true;
                 break;
             case ':':
                 t = Tokens.COLON;
@@ -270,6 +330,7 @@ final class Tokenizer {
             if (t == null) {
                 if (firstNumberChars.indexOf(c) >= 0) {
                     t = pullNumber(c);
+                    tIsSimpleValue = true;
                 } else if (notInUnquotedText.indexOf(c) >= 0) {
                     throw parseError(String
                             .format("Character '%c' is not the start of any valid token",
@@ -277,25 +338,24 @@ final class Tokenizer {
                 } else {
                     putBack(c);
                     t = pullUnquotedText();
+                    tIsSimpleValue = true;
                 }
             }

             if (t == null)
                 throw new ConfigException.BugOrBroken(
                         "bug: failed to generate next token");

+            if (tIsSimpleValue) {
+                nextIsASimpleValue();
+            } else {
+                nextIsNotASimpleValue();
+            }
+
             tokens.add(t);
         }
     }

-    TokenIterator(ConfigOrigin origin, Reader input) {
-        this.origin = origin;
-        this.input = input;
-        oneCharBuffer = -1;
-        lineNumber = 0;
-        tokens = new LinkedList<Token>();
-        tokens.add(Tokens.START);
-    }
-
     @Override
     public boolean hasNext() {
         return !tokens.isEmpty();
@@ -304,7 +364,7 @@ final class Tokenizer {
     @Override
     public Token next() {
         Token t = tokens.remove();
-        if (t != Tokens.END) {
+        if (tokens.isEmpty() && t != Tokens.END) {
             queueNextToken();
             if (tokens.isEmpty())
                 throw new ConfigException.BugOrBroken(
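
Putting the pieces together (an illustrative sketch with the TokenizerTest helpers; the `{` case of the switch is not shown in this hunk but is assumed to produce Tokens.OPEN_CURLY): queueNextToken() can now enqueue two tokens at once, the saved whitespace plus the value after it, which is why next() only refills the queue when it is empty, and why whitespace next to a non-simple-value token never becomes a token.

    // between two simple values the space survives as a token (compare the
    // "true foo" test below)...
    assertEquals(List(Tokens.START, tokenTrue, tokenUnquoted(" "),
        tokenUnquoted("foo"), Tokens.END),
        tokenizeAsList("true foo"))
    // ...but next to a brace, which is not a simple value, the space is
    // simply skipped
    assertEquals(List(Tokens.START, tokenUnquoted("foo"), Tokens.OPEN_CURLY, Tokens.END),
        tokenizeAsList("foo {"))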

EquivalentsTest.scala

@@ -66,6 +66,6 @@ class EquivalentsTest extends TestUtils {
         // This is a little "checksum" to be sure we really tested what we were expecting.
         // it breaks every time you add a file, so you have to update it.
         assertEquals(1, dirCount)
-        assertEquals(1, fileCount)
+        assertEquals(2, fileCount)
     }
 }

TokenizerTest.scala

@@ -63,25 +63,21 @@ class TokenizerTest extends TestUtils {
     @Test
     def tokenizeAllTypesWithSingleSpaces() {
-        // all token types with no spaces (not sure JSON spec wants this to work,
-        // but spec is unclear to me when spaces are required, and banning them
-        // is actually extra work)
         val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
             Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
-            tokenLong(42), tokenTrue, tokenDouble(3.14),
-            tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
+            tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
+            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
+            Tokens.newLine(0), Tokens.END)
         assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
     }

     @Test
     def tokenizeAllTypesWithMultipleSpaces() {
-        // all token types with no spaces (not sure JSON spec wants this to work,
-        // but spec is unclear to me when spaces are required, and banning them
-        // is actually extra work)
         val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
             Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
-            tokenLong(42), tokenTrue, tokenDouble(3.14),
-            tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
+            tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
+            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
+            Tokens.newLine(0), Tokens.END)
         assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
     }

@@ -111,13 +107,13 @@ class TokenizerTest extends TestUtils {
     @Test
     def tokenizeUnquotedTextContainingSpaceTrue() {
-        val expected = List(Tokens.START, tokenUnquoted("foo true"), Tokens.END)
+        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END)
         assertEquals(expected, tokenizeAsList("""foo true"""))
     }

     @Test
     def tokenizeTrueAndSpaceAndUnquotedText() {
-        val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END)
+        val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END)
         assertEquals(expected, tokenizeAsList("""true foo"""))
     }

@@ -129,7 +125,8 @@ class TokenizerTest extends TestUtils {
     @Test
     def tokenizeUnquotedTextKeepsInternalSpaces() {
-        val expected = List(Tokens.START, tokenUnquoted("foo bar baz"), Tokens.newLine(0), Tokens.END)
+        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"),
+            tokenUnquoted(" "), tokenUnquoted("baz"), Tokens.newLine(0), Tokens.END)
         assertEquals(expected, tokenizeAsList(" foo bar baz \n"))
     }