support "/" in unquoted strings (and therefore keys)

2025-02-20 00:00:48 +08:00 · 2011-11-16 11:03:16 -05:00 · 2011-11-16 11:03:16 -05:00 · 3610dc8d76
commit 3610dc8d76
parent 20b75542e4
5 changed files with 92 additions and 37 deletions
--- a/HOCON.md
+++ b/HOCON.md
@ -27,8 +27,9 @@ Implementation-wise, the format should have these properties:
   heuristic. It should be clear what's invalid and invalid files
   should generate errors.
 - require minimal look-ahead; should be able to tokenize the file
-   by looking at only the current character and the next
-   character.
+   by looking at only the next three characters. (Right now, the
+   only reason to look at three is to find "//" comments;
+   otherwise you can parse looking at two.)

 HOCON is significantly harder to specify and to parse than
 JSON. Think of it as moving the work from the person maintaining
@ -182,8 +183,10 @@ A sequence of characters outside of a quoted string is a string
 value if:

 - it does not contain "forbidden characters" '$', '"', '{', '}',
-   '[', ']', ':', '=', ',', '+', '#', '/', '\' (backslash), or
+   '[', ']', ':', '=', ',', '+', '#', '\' (backslash), or
   whitespace.
+ - it does not contain the two-character string "//" (which
+   starts a comment)
 - its initial characters do not parse as `true`, `false`, `null`,
   or a number.

@ -199,9 +202,9 @@ the unquoted string `bar` but `bar10.0` is the unquoted string
 `bar10.0`.

 In general, once an unquoted string begins, it continues until a
-forbidden character is encountered. Embedded (non-initial)
-booleans, nulls, and numbers are not recognized as such, they are
-part of the string.
+forbidden character or the two-character string "//" is
+encountered. Embedded (non-initial) booleans, nulls, and numbers
+are not recognized as such, they are part of the string.

 An unquoted string may not _begin_ with the digits 0-9 or with a
 hyphen (`-`, 0x002D) because those are valid characters to begin a
--- a/src/main/java/com/typesafe/config/impl/Tokenizer.java
+++ b/src/main/java/com/typesafe/config/impl/Tokenizer.java
@ -84,7 +84,7 @@ final class Tokenizer {

        final private ConfigOrigin origin;
        final private Reader input;
-        private int oneCharBuffer;
+        final private LinkedList<Integer> buffer;
        private int lineNumber;
        final private Queue<Token> tokens;
        final private WhitespaceSaver whitespaceSaver;
@ -94,7 +94,7 @@ final class Tokenizer {
            this.origin = origin;
            this.input = input;
            this.allowComments = allowComments;
-            oneCharBuffer = -1;
+            this.buffer = new LinkedList<Integer>();
            lineNumber = 0;
            tokens = new LinkedList<Token>();
            tokens.add(Tokens.START);
@ -102,27 +102,29 @@ final class Tokenizer {
        }


-        private int nextChar() {
-            if (oneCharBuffer >= 0) {
-                int c = oneCharBuffer;
-                oneCharBuffer = -1;
-                return c;
-            } else {
+        // This should ONLY be called from nextCharSkippingComments()
+        // (and its helper slurpComment()) or when inside a quoted
+        // string; everything else should use nextCharSkippingComments().
+        private int nextCharRaw() {
+            if (buffer.isEmpty()) {
                try {
                    return input.read();
                } catch (IOException e) {
                    throw new ConfigException.IO(origin, "read error: "
                            + e.getMessage(), e);
                }
+            } else {
+                int c = buffer.pop();
+                return c;
            }
        }

        private void putBack(int c) {
-            if (oneCharBuffer >= 0) {
+            if (buffer.size() > 2) {
                throw new ConfigException.BugOrBroken(
-                        "bug: attempt to putBack() twice in a row");
+                        "bug: putBack() three times, undesirable look-ahead");
            }
-            oneCharBuffer = c;
+            buffer.push(c);
        }

        static boolean isWhitespace(int c) {
@ -135,29 +137,26 @@ final class Tokenizer {

        private int slurpComment() {
            for (;;) {
-                int c = nextChar();
+                int c = nextCharRaw();
                if (c == -1 || c == '\n') {
                    return c;
                }
            }
        }

-        // get next char, skipping non-newline whitespace
-        private int nextCharAfterWhitespace(WhitespaceSaver saver) {
+        // get next char, skipping comments
+        private int nextCharSkippingComments() {
            for (;;) {
-                int c = nextChar();
+                int c = nextCharRaw();

                if (c == -1) {
                    return -1;
                } else {
-                    if (isWhitespaceNotNewline(c)) {
-                        saver.add(c);
-                        continue;
-                    } else if (allowComments) {
+                    if (allowComments) {
                        if (c == '#') {
                            return slurpComment();
                        } else if (c == '/') {
-                            int maybeSecondSlash = nextChar();
+                            int maybeSecondSlash = nextCharRaw();
                            if (maybeSecondSlash == '/') {
                                return slurpComment();
                            } else {
@ -174,6 +173,24 @@ final class Tokenizer {
            }
        }

+        // get next char, skipping non-newline whitespace
+        private int nextCharAfterWhitespace(WhitespaceSaver saver) {
+            for (;;) {
+                int c = nextCharSkippingComments();
+
+                if (c == -1) {
+                    return -1;
+                } else {
+                    if (isWhitespaceNotNewline(c)) {
+                        saver.add(c);
+                        continue;
+                    } else {
+                        return c;
+                    }
+                }
+            }
+        }
+
        private ConfigException parseError(String message) {
            return parseError(message, null);
        }
@ -208,7 +225,7 @@ final class Tokenizer {
        // chars JSON allows to be part of a number
        static final String numberChars = "0123456789eE+-.";
        // chars that stop an unquoted string
-        static final String notInUnquotedText = "$\"{}[]:=,\\+#/";
+        static final String notInUnquotedText = "$\"{}[]:=,\\+#";

        // The rules here are intended to maximize convenience while
        // avoiding confusion with real valid JSON. Basically anything
@ -217,7 +234,7 @@ final class Tokenizer {
        private Token pullUnquotedText() {
            ConfigOrigin origin = lineOrigin();
            StringBuilder sb = new StringBuilder();
-            int c = nextChar();
+            int c = nextCharSkippingComments();
            while (true) {
                if (c == -1) {
                    break;
@ -244,7 +261,7 @@ final class Tokenizer {
                        return Tokens.newBoolean(origin, false);
                }

-                c = nextChar();
+                c = nextCharSkippingComments();
            }

            // put back the char that ended the unquoted text
@ -258,12 +275,12 @@ final class Tokenizer {
            StringBuilder sb = new StringBuilder();
            sb.appendCodePoint(firstChar);
            boolean containedDecimalOrE = false;
-            int c = nextChar();
+            int c = nextCharSkippingComments();
            while (c != -1 && numberChars.indexOf(c) >= 0) {
                if (c == '.' || c == 'e' || c == 'E')
                    containedDecimalOrE = true;
                sb.appendCodePoint(c);
-                c = nextChar();
+                c = nextCharSkippingComments();
            }
            // the last character we looked at wasn't part of the number, put it
            // back
@ -285,7 +302,7 @@ final class Tokenizer {
        }

        private void pullEscapeSequence(StringBuilder sb) {
-            int escaped = nextChar();
+            int escaped = nextCharRaw();
            if (escaped == -1)
                throw parseError("End of input but backslash in string had nothing after it");

@ -318,7 +335,7 @@ final class Tokenizer {
                // kind of absurdly slow, but screw it for now
                char[] a = new char[4];
                for (int i = 0; i < 4; ++i) {
-                    int c = nextChar();
+                    int c = nextCharSkippingComments();
                    if (c == -1)
                        throw parseError("End of input but expecting 4 hex digits for \\uXXXX escape");
                    a[i] = (char) c;
@ -346,7 +363,7 @@ final class Tokenizer {
            StringBuilder sb = new StringBuilder();
            int c = '\0'; // value doesn't get used
            do {
-                c = nextChar();
+                c = nextCharRaw();
                if (c == -1)
                    throw parseError("End of input but string quote was still open");

@ -364,7 +381,7 @@ final class Tokenizer {
        private Token pullSubstitution() {
            // the initial '$' has already been consumed
            ConfigOrigin origin = lineOrigin();
-            int c = nextChar();
+            int c = nextCharSkippingComments();
            if (c != '{') {
                throw parseError("'$' not followed by {");
            }
--- a/src/test/scala/com/typesafe/config/impl/ConfParserTest.scala
+++ b/src/test/scala/com/typesafe/config/impl/ConfParserTest.scala
@ -269,4 +269,11 @@ class ConfParserTest extends TestUtils {
            parseObject("""{ "a" : "y" "b" : "z" }""")
        }
    }
+
+    @Test
+    def keysWithSlash() {
+        val obj = parseObject("""/a/b/c=42, x/y/z : 32""")
+        assertEquals(42, obj.getInt("/a/b/c"))
+        assertEquals(32, obj.getInt("x/y/z"))
+    }
 }
--- a/src/test/scala/com/typesafe/config/impl/TestUtils.scala
+++ b/src/test/scala/com/typesafe/config/impl/TestUtils.scala
@ -151,11 +151,12 @@ abstract trait TestUtils {
        """[ { "a" : 2, "b" : ${${a}} } ]""", // nested substitution
        "[ = ]", // = is not a valid token in unquoted text
        "[ + ]",
-        "[ / ]",
        "[ # ]",
        "[ \\ ]",
        "[ # comment ]",
        "${ #comment }",
+        "[ // comment ]",
+        "${ // comment }",
        "{ include \"bar\" : 10 }", // include with a value after it
        "{ include foo }", // include with unquoted string
        "{ include : { \"a\" : 1 } }") // include used as unquoted key
@ -227,6 +228,7 @@ abstract trait TestUtils {
        "[ trux ]",
        "[ truex ]",
        "[ 10x ]", // number token with trailing junk
+        "[ / ]", // unquoted string "slash"
        "{ include \"foo\" }", // valid include
        "{ include\n\"foo\" }", // include with just a newline separating from string
        "{ include\"foo\" }", // include with no whitespace after it
@ -262,6 +264,7 @@ abstract trait TestUtils {
 , 11]""",
        """[ 10 // comment
 , 11]""",
+        """{ /a/b/c : 10 }""", // key has a slash in it
        ParseTest(false, true, "[${ foo.bar}]"), // substitution with leading spaces
        ParseTest(false, true, "[${foo.bar }]"), // substitution with trailing spaces
        ParseTest(false, true, "[${ \"foo.bar\"}]"), // substitution with leading spaces and quoted
--- a/src/test/scala/com/typesafe/config/impl/TokenizerTest.scala
+++ b/src/test/scala/com/typesafe/config/impl/TokenizerTest.scala
@ -2,11 +2,16 @@ package com.typesafe.config.impl

 import org.junit.Assert.assertEquals
 import org.junit.Test
-
 import com.typesafe.config.ConfigException

 class TokenizerTest extends TestUtils {

+    // FIXME most of this file should be using this method
+    private def tokenizerTest(expected: List[Token], s: String) {
+        assertEquals(List(Tokens.START) ++ expected ++ List(Tokens.END),
+            tokenizeAsList(s))
+    }
+
    @Test
    def tokenizeEmptyString() {
        assertEquals(List(Tokens.START, Tokens.END),
@ -91,6 +96,14 @@ class TokenizerTest extends TestUtils {
        assertEquals(expected, tokenizeAsList("""true foo"""))
    }

+    @Test
+    def tokenizeUnquotedTextContainingSlash() {
+        tokenizerTest(List(tokenUnquoted("a/b/c/")), "a/b/c/")
+        tokenizerTest(List(tokenUnquoted("/")), "/")
+        tokenizerTest(List(tokenUnquoted("/"), tokenUnquoted(" "), tokenUnquoted("/")), "/ /")
+        tokenizerTest(Nil, "//")
+    }
+
    @Test
    def tokenizeUnquotedTextTrimsSpaces() {
        val expected = List(Tokens.START, tokenUnquoted("foo"), Tokens.newLine(0), Tokens.END)
@ -182,4 +195,16 @@ class TokenizerTest extends TestUtils {
            }
        }
    }
+
+    @Test
+    def commentsIgnoredInVariousContext() {
+        tokenizerTest(List(tokenString("//bar")), "\"//bar\"")
+        tokenizerTest(List(tokenString("#bar")), "\"#bar\"")
+        tokenizerTest(List(tokenUnquoted("bar")), "bar//comment")
+        tokenizerTest(List(tokenUnquoted("bar")), "bar#comment")
+        tokenizerTest(List(tokenInt(10)), "10//comment")
+        tokenizerTest(List(tokenInt(10)), "10#comment")
+        tokenizerTest(List(tokenDouble(3.14)), "3.14//comment")
+        tokenizerTest(List(tokenDouble(3.14)), "3.14#comment")
+    }
 }