Make unquoted string values really work

This commit is contained in:
Havoc Pennington 2011-11-08 09:17:23 -05:00
parent e3e1d7392d
commit 9b2a96aef4
4 changed files with 132 additions and 61 deletions

74
SPEC.md
View File

@ -72,11 +72,11 @@ Different from JSON:
instead
- keys with an object as their value may omit `=`, so `foo { }` means
`foo = { }`
- keys which contain no whitespace need not be quoted; the string
is then used literally with no unescaping
- if a key is not quoted, the `.` character has a special meaning and
- keys may be unquoted strings (see below for detailed definition)
- only if a key is unquoted, the `.` character has a special meaning and
creates a new object. So `foo.bar = 10` means to create an object at key
`foo`, then inside that object, create a key `bar` with value `10`
`foo`, then inside that object, create a key `bar` with value
`10`.
- quoted keys _should not_ contain the `.` character because it's
confusing, but it is permitted (to preserve the ability to use any string
as a key and thus convert an arbitrary map or JavaScript object into HOCON)
@ -100,32 +100,46 @@ Different from JSON:
- FIXME prepend operator?
- a new type of value exists, substitution, which looks like `${some.path}`
(details below)
- to support substitutions, a value may consist of multiple strings which
are concatenated into one string. `"foo"${some.path}"bar"`
- String values may sometimes omit quotes. If a value does not parse as a
substitution, quoted string, number, object, array, true, false, or null,
then that value will be parsed as a string value, created as
follows:
- take the string from the `=` to the first newline or comma
- remove leading and trailing whitespace (whitespace defined
only as ASCII whitespace, as with Java's trim() method)
- what remains is treated as a sequence of strings, where
each string is either the raw inline UTF-8 data, a quoted
string, or a substitution
- everything up to a `"` or `$` is a raw unquoted UTF-8 string;
no unescaping is performed
- at `"` a quoted string is parsed, with the usual escape
sequences; after the close `"` parsing the unquoted string
continues. The quoted string must be well-formed or it's
an error.
- at `$` a substitution is parsed. The substitution must be well-formed
or it's an error.
- to get a literal `"`, `$`, newline or comma, you would have to use
a quoted string
- after the initial raw string, quoted string, or substitution,
parsing another one immediately begins and so on until the
end of the value.
- the resulting sequence of strings is concatenated
- String values may sometimes omit quotes.
- Unquoted strings may not contain '$', '"', '{', '}',
'[', ']', ':', '=', ',', or '\' (backslash) and may not
contain whitespace (including newlines).
- Unquoted strings do not support any form of escaping; the
characters are all left as-is. If you need to use special
characters or escaping, you have to quote the string.
- Because of "value concatenation" rules (see below) you can
write a sentence with whitespace unquoted, though.
- Any unquoted series of characters that parses as a
substitution, true, false, null, number, or quoted string
will be treated as the type it parses as, rather than as
an unquoted string. However, in "value concatenation"
the non-string types convert to strings, which means
you can have the word "true" in an unquoted sentence.
- true, false, null, numbers only parse as such when they appear
at the start of an unquoted token — that is, at the start of
the value or immediately after at least one character that is
not allowed in unquoted strings. So `truefoo` is
the value `true` then the unquoted string `foo`, but
`footrue` is the unquoted string `footrue`.
- quoted strings and substitutions always parse as such
since they begin with a character that can't be in an
unquoted string.
- Value concatenation: to support substitutions, and unquoted
sentences with whitespace, a value may consist of multiple
values which are concatenated into one
string. `"foo"${some.path}"bar"` or `The quick brown fox`.
- let a "simple value" be the set of JSON values excluding
objects and arrays, and including unquoted strings and
substitutions.
- as long as simple values are separated only by non-newline
whitespace, the _whitespace between them is preserved_
and the values, along with the whitespace, are concatenated
into a string.
- Whitespace before the first and after the last simple value
will be discarded. Only whitespace _between_ simple values
is preserved.
- concatenation never spans a newline or a non-simple-value
token.
- the result of the concatenation is a string value.
- the special key `include` followed directly by a string value (with no
`=`) means to treat that string value as a filename and merge the
object defined in that file into the current object, overriding

View File

@ -25,6 +25,22 @@ final class Tokenizer {
private int oneCharBuffer;
private int lineNumber;
private Queue<Token> tokens;
// has to be saved inside value concatenations
private StringBuilder whitespace;
// may need to value-concat with next value
private boolean lastTokenWasSimpleValue;
// Builds a token iterator reading from `input`, attributing tokens to
// `origin`. The stream always begins with a START token.
TokenIterator(ConfigOrigin origin, Reader input) {
    this.origin = origin;
    this.input = input;
    this.oneCharBuffer = -1; // no pushed-back char yet
    this.lineNumber = 0;
    this.whitespace = new StringBuilder();
    this.lastTokenWasSimpleValue = false;
    this.tokens = new LinkedList<Token>();
    this.tokens.add(Tokens.START);
}
private int nextChar() {
if (oneCharBuffer >= 0) {
@ -49,15 +65,26 @@ final class Tokenizer {
oneCharBuffer = c;
}
static boolean isWhitespace(int c) {
    // check the two most frequent characters first; fall back to
    // the full Unicode test only for everything else
    if (c == ' ' || c == '\n')
        return true;
    return Character.isWhitespace(c);
}
static boolean isWhitespaceNotNewline(int c) {
    // any whitespace except '\n'; ' ' is special-cased as the
    // common case to avoid the full Unicode check
    return c != '\n' && (c == ' ' || Character.isWhitespace(c));
}
// get next char, skipping non-newline whitespace
private int nextCharAfterWhitespace() {
for (;;) {
int c = nextChar();
if (c == -1) {
return -1;
} else if (c == '\n') {
return c;
} else if (Character.isWhitespace(c)) {
} else if (isWhitespaceNotNewline(c)) {
if (lastTokenWasSimpleValue)
whitespace.appendCodePoint(c);
continue;
} else {
return c;
@ -83,7 +110,7 @@ final class Tokenizer {
// chars JSON allows to be part of a number
static final String numberChars = "0123456789eE+-.";
// chars that stop an unquoted string
static final String notInUnquotedText = "$\"{}[]:=\n,";
static final String notInUnquotedText = "$\"{}[]:=,\\";
// The rules here are intended to maximize convenience while
// avoiding confusion with real valid JSON. Basically anything
@ -98,12 +125,15 @@ final class Tokenizer {
break;
} else if (notInUnquotedText.indexOf(c) >= 0) {
break;
} else if (isWhitespace(c)) {
break;
} else {
sb.append((char) c);
}
// we parse true/false/null tokens as such no matter
// what is after them.
// what is after them, as long as they are at the
// start of the unquoted token.
if (sb.length() == 4) {
String s = sb.toString();
if (s.equals("true"))
@ -122,8 +152,7 @@ final class Tokenizer {
// put back the char that ended the unquoted text
putBack(c);
// chop trailing whitespace; have to quote to have trailing spaces.
String s = sb.toString().trim();
String s = sb.toString();
return Tokens.newUnquotedText(origin, s);
}
@ -233,19 +262,50 @@ final class Tokenizer {
return Tokens.newString(lineOrigin(), sb.toString());
}
// Record that the upcoming token is NOT a simple value: any
// whitespace saved between simple values can no longer take part
// in a value concatenation, so throw it away.
private void nextIsNotASimpleValue() {
    whitespace.setLength(0);
    lastTokenWasSimpleValue = false;
}
// Record that the upcoming token IS a simple value. If the previous
// token was also one, flush the whitespace saved between them as an
// unquoted-text token, giving the parser the option to concatenate.
private void nextIsASimpleValue() {
    if (lastTokenWasSimpleValue && whitespace.length() > 0) {
        tokens.add(Tokens.newUnquotedText(lineOrigin(),
                whitespace.toString()));
    }
    // either way we are now just after a simple value with no
    // whitespace pending
    lastTokenWasSimpleValue = true;
    whitespace.setLength(0);
}
private void queueNextToken() {
int c = nextCharAfterWhitespace();
if (c == -1) {
nextIsNotASimpleValue();
tokens.add(Tokens.END);
} else if (c == '\n') {
// newline tokens have the just-ended line number
nextIsNotASimpleValue();
tokens.add(Tokens.newLine(lineNumber));
lineNumber += 1;
} else {
Token t = null;
boolean tIsSimpleValue = false;
switch (c) {
case '"':
t = pullQuotedString();
tIsSimpleValue = true;
break;
case ':':
t = Tokens.COLON;
@ -270,6 +330,7 @@ final class Tokenizer {
if (t == null) {
if (firstNumberChars.indexOf(c) >= 0) {
t = pullNumber(c);
tIsSimpleValue = true;
} else if (notInUnquotedText.indexOf(c) >= 0) {
throw parseError(String
.format("Character '%c' is not the start of any valid token",
@ -277,25 +338,24 @@ final class Tokenizer {
} else {
putBack(c);
t = pullUnquotedText();
tIsSimpleValue = true;
}
}
if (t == null)
throw new ConfigException.BugOrBroken(
"bug: failed to generate next token");
if (tIsSimpleValue) {
nextIsASimpleValue();
} else {
nextIsNotASimpleValue();
}
tokens.add(t);
}
}
// Builds a token iterator reading from `input`, attributing tokens
// to `origin`; the stream is seeded with the START token.
TokenIterator(ConfigOrigin origin, Reader input) {
    this.origin = origin;
    this.input = input;
    this.oneCharBuffer = -1; // nothing pushed back yet
    this.lineNumber = 0;
    this.tokens = new LinkedList<Token>();
    this.tokens.add(Tokens.START);
}
@Override
public boolean hasNext() {
return !tokens.isEmpty();
@ -304,7 +364,7 @@ final class Tokenizer {
@Override
public Token next() {
Token t = tokens.remove();
if (t != Tokens.END) {
if (tokens.isEmpty() && t != Tokens.END) {
queueNextToken();
if (tokens.isEmpty())
throw new ConfigException.BugOrBroken(

View File

@ -66,6 +66,6 @@ class EquivalentsTest extends TestUtils {
// This is a little "checksum" to be sure we really tested what we were expecting.
// it breaks every time you add a file, so you have to update it.
assertEquals(1, dirCount)
assertEquals(1, fileCount)
assertEquals(2, fileCount)
}
}

View File

@ -63,25 +63,21 @@ class TokenizerTest extends TestUtils {
@Test
def tokenizeAllTypesWithSingleSpaces() {
// all token types with no spaces (not sure JSON spec wants this to work,
// but spec is unclear to me when spaces are required, and banning them
// is actually extra work)
val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
tokenLong(42), tokenTrue, tokenDouble(3.14),
tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
Tokens.newLine(0), Tokens.END)
assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
}
@Test
def tokenizeAllTypesWithMultipleSpaces() {
// all token types with no spaces (not sure JSON spec wants this to work,
// but spec is unclear to me when spaces are required, and banning them
// is actually extra work)
val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.CLOSE_CURLY,
Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, tokenString("foo"),
tokenLong(42), tokenTrue, tokenDouble(3.14),
tokenFalse, tokenNull, Tokens.newLine(0), Tokens.END)
tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
Tokens.newLine(0), Tokens.END)
assertEquals(expected, tokenizeAsList(""" , : } { ] [ "foo" 42 true 3.14 false null """ + "\n "))
}
@ -111,13 +107,13 @@ class TokenizerTest extends TestUtils {
@Test
def tokenizeUnquotedTextContainingSpaceTrue() {
val expected = List(Tokens.START, tokenUnquoted("foo true"), Tokens.END)
val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END)
assertEquals(expected, tokenizeAsList("""foo true"""))
}
@Test
def tokenizeTrueAndSpaceAndUnquotedText() {
val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END)
val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END)
assertEquals(expected, tokenizeAsList("""true foo"""))
}
@ -129,7 +125,8 @@ class TokenizerTest extends TestUtils {
@Test
def tokenizeUnquotedTextKeepsInternalSpaces() {
val expected = List(Tokens.START, tokenUnquoted("foo bar baz"), Tokens.newLine(0), Tokens.END)
val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"),
tokenUnquoted(" "), tokenUnquoted("baz"), Tokens.newLine(0), Tokens.END)
assertEquals(expected, tokenizeAsList(" foo bar baz \n"))
}