Merge pull request #271 from fpringvaldsen/feature/lossless-tokens

Add lossless tokens

Make the token stream lossless: every token now carries its exact source text (tokenText()), whitespace that is not part of a value is emitted as IGNORED_WHITESPACE tokens instead of being discarded, comments remember whether they were opened with "//" or "#", and Tokenizer.render() concatenates token text to reconstruct the original input. The parser skips the new whitespace tokens, so parsing behavior is unchanged.
Havoc Pennington committed 2015-03-04 13:29:20 -05:00
commit f3e11bc583
7 changed files with 307 additions and 128 deletions

Parser.java

@@ -203,7 +203,7 @@ final class Parser {
                 }
                 previous = next;
-                next = tokens.next();
+                next = nextTokenIgnoringWhitespace();
             }
             // put our concluding token in the queue with all the comments
@@ -219,7 +219,7 @@ final class Parser {
         private TokenWithComments popTokenWithoutTrailingComment() {
             if (buffer.isEmpty()) {
-                Token t = tokens.next();
+                Token t = nextTokenIgnoringWhitespace();
                 if (Tokens.isComment(t)) {
                     consolidateCommentBlock(t);
                     return buffer.pop();
@@ -243,7 +243,7 @@ final class Parser {
             if (!attractsTrailingComments(withPrecedingComments.token)) {
                 return withPrecedingComments;
             } else if (buffer.isEmpty()) {
-                Token after = tokens.next();
+                Token after = nextTokenIgnoringWhitespace();
                 if (Tokens.isComment(after)) {
                     return withPrecedingComments.add(after);
                 } else {
@@ -319,6 +319,16 @@ final class Parser {
             return t;
         }

+        // Grabs the next Token off of the TokenIterator, ignoring
+        // IgnoredWhitespace tokens
+        private Token nextTokenIgnoringWhitespace() {
+            Token t;
+            do {
+                t = tokens.next();
+            } while (Tokens.isIgnoredWhitespace(t));
+            return t;
+        }
+
         private AbstractConfigValue addAnyCommentsAfterAnyComma(AbstractConfigValue v) {
             TokenWithComments t = nextToken(); // do NOT skip newlines, we only
                                                // want same-line comments
@@ -1063,6 +1073,11 @@ final class Parser {
         while (expression.hasNext()) {
             Token t = expression.next();

+            // Ignore all IgnoredWhitespace tokens
+            if (Tokens.isIgnoredWhitespace(t))
+                continue;
+
             if (Tokens.isValueWithType(t, ConfigValueType.STRING)) {
                 AbstractConfigValue v = Tokens.getValue(t);
                 // this is a quoted string; so any periods
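The parser side of this change is deliberately small: whitespace now survives tokenizing, so every place the parser used to call tokens.next() directly must skip the new IGNORED_WHITESPACE tokens to keep parsing behavior identical. A minimal self-contained sketch of that skip loop follows; the Kind enum and method names are illustrative stand-ins, not the library's types.

    import java.util.Arrays;
    import java.util.Iterator;

    // Stand-in for the real TokenType; only the distinction matters here.
    enum Kind { VALUE, IGNORED_WHITESPACE, END }

    final class SkipWhitespaceDemo {
        // Same shape as Parser.nextTokenIgnoringWhitespace(): keep pulling
        // tokens until one is not ignorable whitespace.
        static Kind nextIgnoringWhitespace(Iterator<Kind> tokens) {
            Kind t;
            do {
                t = tokens.next();
            } while (t == Kind.IGNORED_WHITESPACE);
            return t;
        }

        public static void main(String[] args) {
            Iterator<Kind> it = Arrays.asList(
                    Kind.IGNORED_WHITESPACE, Kind.VALUE, Kind.END).iterator();
            System.out.println(nextIgnoringWhitespace(it)); // prints VALUE
        }
    }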

Token.java

@@ -10,26 +10,34 @@ class Token {
     final private TokenType tokenType;
     final private String debugString;
     final private ConfigOrigin origin;
+    final private String tokenText;

     Token(TokenType tokenType, ConfigOrigin origin) {
         this(tokenType, origin, null);
     }

-    Token(TokenType tokenType, ConfigOrigin origin, String debugString) {
+    Token(TokenType tokenType, ConfigOrigin origin, String tokenText) {
+        this(tokenType, origin, tokenText, null);
+    }
+
+    Token(TokenType tokenType, ConfigOrigin origin, String tokenText, String debugString) {
         this.tokenType = tokenType;
         this.origin = origin;
         this.debugString = debugString;
+        this.tokenText = tokenText;
     }

     // this is used for singleton tokens like COMMA or OPEN_CURLY
-    static Token newWithoutOrigin(TokenType tokenType, String debugString) {
-        return new Token(tokenType, null, debugString);
+    static Token newWithoutOrigin(TokenType tokenType, String debugString, String tokenText) {
+        return new Token(tokenType, null, tokenText, debugString);
     }

     final TokenType tokenType() {
         return tokenType;
     }

+    public String tokenText() {
+        return tokenText;
+    }
+
     // this is final because we don't always use the origin() accessor,
     // and we don't because it throws if origin is null
     final ConfigOrigin origin() {

TokenType.java

@@ -16,6 +16,7 @@ enum TokenType {
     VALUE,
     NEWLINE,
     UNQUOTED_TEXT,
+    IGNORED_WHITESPACE,
     SUBSTITUTION,
     PROBLEM,
     COMMENT,

Tokenizer.java

@@ -52,6 +52,14 @@ final class Tokenizer {
         return new TokenIterator(origin, input, flavor != ConfigSyntax.JSON);
     }

+    static String render(Iterator<Token> tokens) {
+        StringBuilder renderedText = new StringBuilder();
+        while (tokens.hasNext()) {
+            renderedText.append(tokens.next().tokenText());
+        }
+        return renderedText.toString();
+    }
+
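render() is what makes the tokens lossless in practice: since each token carries its exact source text, concatenating tokenText() over the stream reproduces the input byte for byte. A self-contained sketch of that round trip, assuming only a tokenText() accessor (Tok below is a hypothetical stand-in for the real Token class):

    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;

    // Minimal stand-in for Token: render() only needs tokenText().
    interface Tok { String tokenText(); }

    final class RenderDemo {
        // Same shape as Tokenizer.render(): concatenate original text.
        static String render(Iterator<? extends Tok> tokens) {
            StringBuilder sb = new StringBuilder();
            while (tokens.hasNext())
                sb.append(tokens.next().tokenText());
            return sb.toString();
        }

        public static void main(String[] args) {
            // Tokens for the input "a = 1 // x": the whitespace and the
            // comment each carry their exact source text.
            List<Tok> toks = Arrays.asList(
                    () -> "a", () -> " ", () -> "=", () -> " ", () -> "1",
                    () -> " ", () -> "// x");
            System.out.println(render(toks.iterator()).equals("a = 1 // x")); // true
        }
    }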
    private static class TokenIterator implements Iterator<Token> {

        private static class WhitespaceSaver {

@@ -66,25 +74,23 @@ final class Tokenizer {
            }

            void add(int c) {
-                if (lastTokenWasSimpleValue)
-                    whitespace.appendCodePoint(c);
+                whitespace.appendCodePoint(c);
            }

            Token check(Token t, ConfigOrigin baseOrigin, int lineNumber) {
                if (isSimpleValue(t)) {
                    return nextIsASimpleValue(baseOrigin, lineNumber);
                } else {
-                    nextIsNotASimpleValue();
-                    return null;
+                    return nextIsNotASimpleValue(baseOrigin, lineNumber);
                }
            }

            // called if the next token is not a simple value;
            // discards any whitespace we were saving between
            // simple values.
-            private void nextIsNotASimpleValue() {
+            private Token nextIsNotASimpleValue(ConfigOrigin baseOrigin, int lineNumber) {
                lastTokenWasSimpleValue = false;
-                whitespace.setLength(0);
+                return createWhitespaceTokenFromSaver(baseOrigin, lineNumber);
            }

            // called if the next token IS a simple value,
@@ -92,24 +98,29 @@ final class Tokenizer {
            // token also was.
            private Token nextIsASimpleValue(ConfigOrigin baseOrigin,
                    int lineNumber) {
-                if (lastTokenWasSimpleValue) {
-                    // need to save whitespace between the two so
-                    // the parser has the option to concatenate it.
-                    if (whitespace.length() > 0) {
-                        Token t = Tokens.newUnquotedText(
-                                lineOrigin(baseOrigin, lineNumber),
-                                whitespace.toString());
-                        whitespace.setLength(0); // reset
-                        return t;
-                    } else {
-                        // lastTokenWasSimpleValue = true still
-                        return null;
-                    }
-                } else {
+                Token t = createWhitespaceTokenFromSaver(baseOrigin, lineNumber);
+                if (!lastTokenWasSimpleValue) {
                    lastTokenWasSimpleValue = true;
-                    whitespace.setLength(0);
-                    return null;
                }
+                return t;
+            }
+
+            private Token createWhitespaceTokenFromSaver(ConfigOrigin baseOrigin,
+                    int lineNumber) {
+                if (whitespace.length() > 0) {
+                    Token t;
+                    if (lastTokenWasSimpleValue) {
+                        t = Tokens.newUnquotedText(
+                                lineOrigin(baseOrigin, lineNumber),
+                                whitespace.toString());
+                    } else {
+                        t = Tokens.newIgnoredWhitespace(lineOrigin(baseOrigin, lineNumber),
+                                whitespace.toString());
+                    }
+                    whitespace.setLength(0); // reset
+                    return t;
+                }
+                return null;
            }
        }
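Condensed, the saver's new rule is: buffered whitespace becomes an UnquotedText token only when it sits between two simple values (so the parser can still concatenate foo bar into one value), and an IgnoredWhitespace token everywhere else. A small sketch of that decision table, with hypothetical names:

    // Sketch of WhitespaceSaver's classification; names are illustrative.
    final class SaverDemo {
        enum Emitted { UNQUOTED_TEXT, IGNORED_WHITESPACE, NOTHING }

        static Emitted classify(String ws, boolean prevWasSimpleValue,
                boolean nextIsSimpleValue) {
            if (ws.isEmpty())
                return Emitted.NOTHING; // nothing buffered, emit no token
            // whitespace between two simple values may matter for value
            // concatenation; anywhere else it is purely cosmetic
            return (prevWasSimpleValue && nextIsSimpleValue)
                    ? Emitted.UNQUOTED_TEXT
                    : Emitted.IGNORED_WHITESPACE;
        }

        public static void main(String[] args) {
            System.out.println(classify(" ", true, true));  // UNQUOTED_TEXT
            System.out.println(classify(" ", false, true)); // IGNORED_WHITESPACE
            System.out.println(classify("", true, true));   // NOTHING
        }
    }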
@@ -260,10 +271,12 @@ final class Tokenizer {
        // ONE char has always been consumed, either the # or the first /, but
        // not both slashes
        private Token pullComment(int firstChar) {
+            boolean doubleSlash = false;
            if (firstChar == '/') {
                int discard = nextCharRaw();
                if (discard != '/')
                    throw new ConfigException.BugOrBroken("called pullComment but // not seen");
+                doubleSlash = true;
            }

            StringBuilder sb = new StringBuilder();
@@ -271,7 +284,10 @@ final class Tokenizer {
                int c = nextCharRaw();
                if (c == -1 || c == '\n') {
                    putBack(c);
-                    return Tokens.newComment(lineOrigin, sb.toString());
+                    if (doubleSlash)
+                        return Tokens.newCommentDoubleSlash(lineOrigin, sb.toString());
+                    else
+                        return Tokens.newCommentHash(lineOrigin, sb.toString());
                } else {
                    sb.appendCodePoint(c);
                }
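The only state pullComment() gains is which delimiter opened the comment, since the two flavors must restore different prefixes when their text is reconstructed. A trivial sketch of that distinction (the method name is hypothetical):

    // Sketch: a comment's rendered text restores its exact delimiter.
    final class CommentTextDemo {
        static String tokenTextFor(String body, boolean doubleSlash) {
            return (doubleSlash ? "//" : "#") + body;
        }

        public static void main(String[] args) {
            System.out.println(tokenTextFor(" one", true));  // "// one"
            System.out.println(tokenTextFor(" two", false)); // "# two"
        }
    }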
@@ -367,11 +383,16 @@ final class Tokenizer {
            }
        }

-        private void pullEscapeSequence(StringBuilder sb) throws ProblemException {
+        private void pullEscapeSequence(StringBuilder sb, StringBuilder sbOrig) throws ProblemException {
            int escaped = nextCharRaw();
            if (escaped == -1)
                throw problem("End of input but backslash in string had nothing after it");

+            // This is needed so we return the unescaped escape characters back out when rendering
+            // the token
+            sbOrig.appendCodePoint('\\');
+            sbOrig.appendCodePoint(escaped);
+
            switch (escaped) {
            case '"':
                sb.append('"');
@@ -407,6 +428,7 @@ final class Tokenizer {
                    a[i] = (char) c;
                }
                String digits = new String(a);
+                sbOrig.append(a);
                try {
                    sb.appendCodePoint(Integer.parseInt(digits, 16));
                } catch (NumberFormatException e) {
@@ -424,7 +446,7 @@ final class Tokenizer {
            }
        }

-        private void appendTripleQuotedString(StringBuilder sb) throws ProblemException {
+        private void appendTripleQuotedString(StringBuilder sb, StringBuilder sbOrig) throws ProblemException {
            // we are after the opening triple quote and need to consume the
            // close triple
            int consecutiveQuotes = 0;
@@ -451,26 +473,37 @@ final class Tokenizer {
                }

                sb.appendCodePoint(c);
+                sbOrig.appendCodePoint(c);
            }
        }

        private Token pullQuotedString() throws ProblemException {
            // the open quote has already been consumed
            StringBuilder sb = new StringBuilder();

+            // We need a second string builder to keep track of escape characters.
+            // We want to return them exactly as they appeared in the original text,
+            // which means we will need a new StringBuilder to escape escape characters
+            // so we can also keep the actual value of the string. This is gross.
+            StringBuilder sbOrig = new StringBuilder();
+            sbOrig.appendCodePoint('"');
+
            while (true) {
                int c = nextCharRaw();
                if (c == -1)
                    throw problem("End of input but string quote was still open");

                if (c == '\\') {
-                    pullEscapeSequence(sb);
+                    pullEscapeSequence(sb, sbOrig);
                } else if (c == '"') {
+                    sbOrig.appendCodePoint(c);
                    break;
                } else if (Character.isISOControl(c)) {
                    throw problem(asString(c), "JSON does not allow unescaped " + asString(c)
                            + " in quoted strings, use a backslash escape");
                } else {
                    sb.appendCodePoint(c);
+                    sbOrig.appendCodePoint(c);
                }
            }
@@ -478,13 +511,14 @@ final class Tokenizer {
            if (sb.length() == 0) {
                int third = nextCharRaw();
                if (third == '"') {
-                    appendTripleQuotedString(sb);
+                    sbOrig.appendCodePoint(third);
+                    appendTripleQuotedString(sb, sbOrig);
                } else {
                    putBack(third);
                }
            }
-            return Tokens.newString(lineOrigin, sb.toString());
+            return Tokens.newString(lineOrigin, sb.toString(), sbOrig.toString());
        }

        private Token pullPlusEquals() throws ProblemException {
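The quoted-string change is the double bookkeeping the comment above calls gross: sb accumulates the decoded value while sbOrig accumulates the raw source, quotes and escapes included. A self-contained sketch handling only the \n escape, to show that the two builders diverge exactly at escape sequences:

    // Sketch of the two-builder technique from pullQuotedString(),
    // simplified to recognize just \n inside a double-quoted string.
    final class EscapeDemo {
        public static void main(String[] args) {
            String input = "\"a\\nb\"";  // the six source chars: " a \ n b "
            StringBuilder sb = new StringBuilder();     // decoded value
            StringBuilder sbOrig = new StringBuilder(); // exact source text
            for (int i = 0; i < input.length(); i++) {
                char c = input.charAt(i);
                sbOrig.append(c);               // source keeps everything
                if (c == '"')
                    continue;                   // quotes live only in sbOrig
                if (c == '\\') {
                    char esc = input.charAt(++i);
                    sbOrig.append(esc);         // source keeps the escape
                    sb.append(esc == 'n' ? '\n' : esc); // value decodes it
                } else {
                    sb.append(c);
                }
            }
            System.out.println(sb.toString().equals("a\nb"));    // true
            System.out.println(sbOrig.toString().equals(input)); // true
        }
    }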

Tokens.java

@@ -16,7 +16,11 @@ final class Tokens {
        final private AbstractConfigValue value;

        Value(AbstractConfigValue value) {
-            super(TokenType.VALUE, value.origin());
+            this(value, null);
+        }
+
+        Value(AbstractConfigValue value, String origText) {
+            super(TokenType.VALUE, value.origin(), origText);
            this.value = value;
        }
@@ -72,6 +76,11 @@ final class Tokens {
        public int hashCode() {
            return 41 * (41 + super.hashCode()) + lineNumber();
        }
+
+        @Override
+        public String tokenText() {
+            return "\n";
+        }
    }

    // This is not a Value, because it requires special processing
@@ -107,6 +116,30 @@ final class Tokens {
        public int hashCode() {
            return 41 * (41 + super.hashCode()) + value.hashCode();
        }
+
+        @Override
+        public String tokenText() {
+            return value;
+        }
+    }
+
+    static private class IgnoredWhitespace extends Token {
+        final private String value;
+
+        IgnoredWhitespace(ConfigOrigin origin, String s) {
+            super(TokenType.IGNORED_WHITESPACE, origin);
+            this.value = s;
+        }
+
+        String value() { return value; }
+
+        @Override
+        public String toString() { return "'" + value + "' (WHITESPACE)"; }
+
+        @Override
+        public String tokenText() {
+            return value;
+        }
    }
    static private class Problem extends Token {
@@ -177,7 +210,7 @@ final class Tokens {
        }
    }

-    static private class Comment extends Token {
+    static private abstract class Comment extends Token {
        final private String text;

        Comment(ConfigOrigin origin, String text) {
@@ -185,6 +218,28 @@ final class Tokens {
            this.text = text;
        }

+        final static class DoubleSlashComment extends Comment {
+            DoubleSlashComment(ConfigOrigin origin, String text) {
+                super(origin, text);
+            }
+
+            @Override
+            public String tokenText() {
+                return "//" + super.text;
+            }
+        }
+
+        final static class HashComment extends Comment {
+            HashComment(ConfigOrigin origin, String text) {
+                super(origin, text);
+            }
+
+            @Override
+            public String tokenText() {
+                return "#" + super.text;
+            }
+        }
+
        String text() {
            return text;
        }
@@ -235,6 +290,11 @@ final class Tokens {
            return value;
        }

+        @Override
+        public String tokenText() {
+            return "${" + (this.optional? "?" : "") + Tokenizer.render(this.value.iterator()) + "}";
+        }
+
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
@@ -344,6 +404,10 @@ final class Tokens {
        }
    }

+    static boolean isIgnoredWhitespace(Token token) {
+        return token instanceof IgnoredWhitespace;
+    }
+
    static boolean isSubstitution(Token token) {
        return token instanceof Substitution;
    }
@@ -366,16 +430,16 @@ final class Tokens {
        }
    }

-    final static Token START = Token.newWithoutOrigin(TokenType.START, "start of file");
-    final static Token END = Token.newWithoutOrigin(TokenType.END, "end of file");
-    final static Token COMMA = Token.newWithoutOrigin(TokenType.COMMA, "','");
-    final static Token EQUALS = Token.newWithoutOrigin(TokenType.EQUALS, "'='");
-    final static Token COLON = Token.newWithoutOrigin(TokenType.COLON, "':'");
-    final static Token OPEN_CURLY = Token.newWithoutOrigin(TokenType.OPEN_CURLY, "'{'");
-    final static Token CLOSE_CURLY = Token.newWithoutOrigin(TokenType.CLOSE_CURLY, "'}'");
-    final static Token OPEN_SQUARE = Token.newWithoutOrigin(TokenType.OPEN_SQUARE, "'['");
-    final static Token CLOSE_SQUARE = Token.newWithoutOrigin(TokenType.CLOSE_SQUARE, "']'");
-    final static Token PLUS_EQUALS = Token.newWithoutOrigin(TokenType.PLUS_EQUALS, "'+='");
+    final static Token START = Token.newWithoutOrigin(TokenType.START, "start of file", "");
+    final static Token END = Token.newWithoutOrigin(TokenType.END, "end of file", "");
+    final static Token COMMA = Token.newWithoutOrigin(TokenType.COMMA, "','", ",");
+    final static Token EQUALS = Token.newWithoutOrigin(TokenType.EQUALS, "'='", "=");
+    final static Token COLON = Token.newWithoutOrigin(TokenType.COLON, "':'", ":");
+    final static Token OPEN_CURLY = Token.newWithoutOrigin(TokenType.OPEN_CURLY, "'{'", "{");
+    final static Token CLOSE_CURLY = Token.newWithoutOrigin(TokenType.CLOSE_CURLY, "'}'", "}");
+    final static Token OPEN_SQUARE = Token.newWithoutOrigin(TokenType.OPEN_SQUARE, "'['", "[");
+    final static Token CLOSE_SQUARE = Token.newWithoutOrigin(TokenType.CLOSE_SQUARE, "']'", "]");
+    final static Token PLUS_EQUALS = Token.newWithoutOrigin(TokenType.PLUS_EQUALS, "'+='", "+=");

    static Token newLine(ConfigOrigin origin) {
        return new Line(origin);
@@ -386,14 +450,22 @@ final class Tokens {
        return new Problem(origin, what, message, suggestQuotes, cause);
    }

-    static Token newComment(ConfigOrigin origin, String text) {
-        return new Comment(origin, text);
+    static Token newCommentDoubleSlash(ConfigOrigin origin, String text) {
+        return new Comment.DoubleSlashComment(origin, text);
+    }
+
+    static Token newCommentHash(ConfigOrigin origin, String text) {
+        return new Comment.HashComment(origin, text);
    }

    static Token newUnquotedText(ConfigOrigin origin, String s) {
        return new UnquotedText(origin, s);
    }

+    static Token newIgnoredWhitespace(ConfigOrigin origin, String s) {
+        return new IgnoredWhitespace(origin, s);
+    }
+
    static Token newSubstitution(ConfigOrigin origin, boolean optional, List<Token> expression) {
        return new Substitution(origin, optional, expression);
    }
@@ -401,32 +473,35 @@ final class Tokens {
    static Token newValue(AbstractConfigValue value) {
        return new Value(value);
    }

-    static Token newString(ConfigOrigin origin, String value) {
-        return newValue(new ConfigString.Quoted(origin, value));
+    static Token newValue(AbstractConfigValue value, String origText) {
+        return new Value(value, origText);
    }

-    static Token newInt(ConfigOrigin origin, int value, String originalText) {
-        return newValue(ConfigNumber.newNumber(origin, value,
-                originalText));
+    static Token newString(ConfigOrigin origin, String value, String origText) {
+        return newValue(new ConfigString.Quoted(origin, value), origText);
+    }
+
+    static Token newInt(ConfigOrigin origin, int value, String origText) {
+        return newValue(ConfigNumber.newNumber(origin, value,
+                origText), origText);
    }

-    static Token newDouble(ConfigOrigin origin, double value,
-            String originalText) {
-        return newValue(ConfigNumber.newNumber(origin, value,
-                originalText));
+    static Token newDouble(ConfigOrigin origin, double value,
+            String origText) {
+        return newValue(ConfigNumber.newNumber(origin, value,
+                origText), origText);
    }

-    static Token newLong(ConfigOrigin origin, long value, String originalText) {
-        return newValue(ConfigNumber.newNumber(origin, value,
-                originalText));
+    static Token newLong(ConfigOrigin origin, long value, String origText) {
+        return newValue(ConfigNumber.newNumber(origin, value,
+                origText), origText);
    }

    static Token newNull(ConfigOrigin origin) {
-        return newValue(new ConfigNull(origin));
+        return newValue(new ConfigNull(origin), "null");
    }

    static Token newBoolean(ConfigOrigin origin, boolean value) {
-        return newValue(new ConfigBoolean(origin, value));
+        return newValue(new ConfigBoolean(origin, value), "" + value);
    }
}
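Substitutions rebuild their source text recursively: the inner expression is itself a token list, so tokenText() wraps a Tokenizer.render() of that list in ${...}, inserting ? when the substitution is optional. A sketch of the same composition, with plain strings standing in for the inner tokens:

    import java.util.Arrays;
    import java.util.List;

    // Sketch of Substitution.tokenText(); render() stands in for
    // Tokenizer.render() over the inner expression's tokens.
    final class SubstitutionTextDemo {
        static String render(List<String> innerTokenTexts) {
            StringBuilder sb = new StringBuilder();
            for (String t : innerTokenTexts)
                sb.append(t);
            return sb.toString();
        }

        static String substitutionText(boolean optional, List<String> inner) {
            return "${" + (optional ? "?" : "") + render(inner) + "}";
        }

        public static void main(String[] args) {
            System.out.println(substitutionText(false, Arrays.asList("a.b"))); // ${a.b}
            System.out.println(substitutionText(true, Arrays.asList("x.y")));  // ${?x.y}
        }
    }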

TestUtils.scala

@@ -611,12 +611,14 @@ abstract trait TestUtils {
    def tokenFalse = Tokens.newBoolean(fakeOrigin(), false)
    def tokenNull = Tokens.newNull(fakeOrigin())
    def tokenUnquoted(s: String) = Tokens.newUnquotedText(fakeOrigin(), s)
-    def tokenString(s: String) = Tokens.newString(fakeOrigin(), s)
+    def tokenString(s: String) = Tokens.newString(fakeOrigin(), s, s)
    def tokenDouble(d: Double) = Tokens.newDouble(fakeOrigin(), d, null)
    def tokenInt(i: Int) = Tokens.newInt(fakeOrigin(), i, null)
    def tokenLong(l: Long) = Tokens.newLong(fakeOrigin(), l, null)
    def tokenLine(line: Int) = Tokens.newLine(fakeOrigin.withLineNumber(line))
-    def tokenComment(text: String) = Tokens.newComment(fakeOrigin(), text)
+    def tokenCommentDoubleSlash(text: String) = Tokens.newCommentDoubleSlash(fakeOrigin(), text)
+    def tokenCommentHash(text: String) = Tokens.newCommentHash(fakeOrigin(), text)
+    def tokenWhitespace(text: String) = Tokens.newIgnoredWhitespace(fakeOrigin(), text)

    private def tokenMaybeOptionalSubstitution(optional: Boolean, expression: Token*) = {
        val l = new java.util.ArrayList[Token]
@@ -657,6 +659,10 @@ abstract trait TestUtils {
        tokenize(s).asScala.toList
    }

+    def tokenizeAsString(s: String) = {
+        Tokenizer.render(tokenize(s))
+    }
+
    // this is importantly NOT using Path.newPath, which relies on
    // the parser; in the test suite we are often testing the parser,
    // so we don't want to use the parser to build the expected result.

TokenizerTest.scala

@@ -14,18 +14,21 @@ class TokenizerTest extends TestUtils {
    private def tokenizerTest(expected: List[Token], s: String) {
        assertEquals(List(Tokens.START) ++ expected ++ List(Tokens.END),
            tokenizeAsList(s))
+        assertEquals(s, tokenizeAsString(s))
    }

    @Test
    def tokenizeEmptyString() {
-        assertEquals(List(Tokens.START, Tokens.END),
-            tokenizeAsList(""))
+        val source = ""
+        val expected = List()
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeNewlines() {
-        assertEquals(List(Tokens.START, tokenLine(1), tokenLine(2), Tokens.END),
-            tokenizeAsList("\n\n"))
+        val source = "\n\n"
+        val expected = List(tokenLine(1), tokenLine(2))
+        tokenizerTest(expected, source)
    }

    @Test
@@ -33,75 +36,86 @@ class TokenizerTest extends TestUtils {
        // all token types with no spaces (not sure JSON spec wants this to work,
        // but spec is unclear to me when spaces are required, and banning them
        // is actually extra work).
-        val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.EQUALS, Tokens.CLOSE_CURLY,
+        val source = """,:=}{][+="foo"""" + "\"\"\"bar\"\"\"" + """true3.14false42null${a.b}${?x.y}${"c.d"}""" + "\n"
+        val expected = List(Tokens.COMMA, Tokens.COLON, Tokens.EQUALS, Tokens.CLOSE_CURLY,
            Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, Tokens.PLUS_EQUALS, tokenString("foo"),
            tokenString("bar"), tokenTrue, tokenDouble(3.14), tokenFalse,
            tokenLong(42), tokenNull, tokenSubstitution(tokenUnquoted("a.b")),
            tokenOptionalSubstitution(tokenUnquoted("x.y")),
-            tokenKeySubstitution("c.d"), tokenLine(1), Tokens.END)
-        assertEquals(expected, tokenizeAsList(""",:=}{][+="foo"""" + "\"\"\"bar\"\"\"" + """true3.14false42null${a.b}${?x.y}${"c.d"}""" + "\n"))
+            tokenKeySubstitution("c.d"), tokenLine(1))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeAllTypesWithSingleSpaces() {
-        val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.EQUALS, Tokens.CLOSE_CURLY,
-            Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, Tokens.PLUS_EQUALS, tokenString("foo"),
-            tokenUnquoted(" "), tokenString("bar"), tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
-            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
-            tokenUnquoted(" "), tokenSubstitution(tokenUnquoted("a.b")), tokenUnquoted(" "),
-            tokenOptionalSubstitution(tokenUnquoted("x.y")), tokenUnquoted(" "),
-            tokenKeySubstitution("c.d"),
-            tokenLine(1), Tokens.END)
-        assertEquals(expected, tokenizeAsList(""" , : = } { ] [ += "foo" """ + "\"\"\"bar\"\"\"" + """ 42 true 3.14 false null ${a.b} ${?x.y} ${"c.d"} """ + "\n "))
+        val source = """ , : = } { ] [ += "foo" """ + "\"\"\"bar\"\"\"" + """ 42 true 3.14 false null ${a.b} ${?x.y} ${"c.d"} """ + "\n "
+        val expected = List(tokenWhitespace(" "), Tokens.COMMA, tokenWhitespace(" "), Tokens.COLON, tokenWhitespace(" "),
+            Tokens.EQUALS, tokenWhitespace(" "), Tokens.CLOSE_CURLY, tokenWhitespace(" "), Tokens.OPEN_CURLY, tokenWhitespace(" "),
+            Tokens.CLOSE_SQUARE, tokenWhitespace(" "), Tokens.OPEN_SQUARE, tokenWhitespace(" "), Tokens.PLUS_EQUALS, tokenWhitespace(" "),
+            tokenString("foo"), tokenUnquoted(" "), tokenString("bar"), tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "),
+            tokenTrue, tokenUnquoted(" "), tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
+            tokenUnquoted(" "), tokenSubstitution(tokenUnquoted("a.b")), tokenUnquoted(" "),
+            tokenOptionalSubstitution(tokenUnquoted("x.y")), tokenUnquoted(" "),
+            tokenKeySubstitution("c.d"), tokenWhitespace(" "),
+            tokenLine(1), tokenWhitespace(" "))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeAllTypesWithMultipleSpaces() {
-        val expected = List(Tokens.START, Tokens.COMMA, Tokens.COLON, Tokens.EQUALS, Tokens.CLOSE_CURLY,
-            Tokens.OPEN_CURLY, Tokens.CLOSE_SQUARE, Tokens.OPEN_SQUARE, Tokens.PLUS_EQUALS, tokenString("foo"),
-            tokenUnquoted(" "), tokenString("bar"), tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
-            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
-            tokenUnquoted(" "), tokenSubstitution(tokenUnquoted("a.b")), tokenUnquoted(" "),
-            tokenOptionalSubstitution(tokenUnquoted("x.y")), tokenUnquoted(" "),
-            tokenKeySubstitution("c.d"),
-            tokenLine(1), Tokens.END)
-        assertEquals(expected, tokenizeAsList(""" , : = } { ] [ += "foo" """ + "\"\"\"bar\"\"\"" + """ 42 true 3.14 false null ${a.b} ${?x.y} ${"c.d"} """ + "\n "))
+        val source = """ , : = } { ] [ += "foo" """ + "\"\"\"bar\"\"\"" + """ 42 true 3.14 false null ${a.b} ${?x.y} ${"c.d"} """ + "\n "
+        val expected = List(tokenWhitespace(" "), Tokens.COMMA, tokenWhitespace(" "), Tokens.COLON, tokenWhitespace(" "),
+            Tokens.EQUALS, tokenWhitespace(" "), Tokens.CLOSE_CURLY, tokenWhitespace(" "), Tokens.OPEN_CURLY, tokenWhitespace(" "), Tokens.CLOSE_SQUARE,
+            tokenWhitespace(" "), Tokens.OPEN_SQUARE, tokenWhitespace(" "), Tokens.PLUS_EQUALS, tokenWhitespace(" "), tokenString("foo"),
+            tokenUnquoted(" "), tokenString("bar"), tokenUnquoted(" "), tokenLong(42), tokenUnquoted(" "), tokenTrue, tokenUnquoted(" "),
+            tokenDouble(3.14), tokenUnquoted(" "), tokenFalse, tokenUnquoted(" "), tokenNull,
+            tokenUnquoted(" "), tokenSubstitution(tokenUnquoted("a.b")), tokenUnquoted(" "),
+            tokenOptionalSubstitution(tokenUnquoted("x.y")), tokenUnquoted(" "),
+            tokenKeySubstitution("c.d"), tokenWhitespace(" "),
+            tokenLine(1), tokenWhitespace(" "))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeTrueAndUnquotedText() {
-        val expected = List(Tokens.START, tokenTrue, tokenUnquoted("foo"), Tokens.END)
-        assertEquals(expected, tokenizeAsList("""truefoo"""))
+        val source = """truefoo"""
+        val expected = List(tokenTrue, tokenUnquoted("foo"))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeFalseAndUnquotedText() {
-        val expected = List(Tokens.START, tokenFalse, tokenUnquoted("foo"), Tokens.END)
-        assertEquals(expected, tokenizeAsList("""falsefoo"""))
+        val source = """falsefoo"""
+        val expected = List(tokenFalse, tokenUnquoted("foo"))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeNullAndUnquotedText() {
-        val expected = List(Tokens.START, tokenNull, tokenUnquoted("foo"), Tokens.END)
-        assertEquals(expected, tokenizeAsList("""nullfoo"""))
+        val source = """nullfoo"""
+        val expected = List(tokenNull, tokenUnquoted("foo"))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeUnquotedTextContainingTrue() {
-        val expected = List(Tokens.START, tokenUnquoted("footrue"), Tokens.END)
-        assertEquals(expected, tokenizeAsList("""footrue"""))
+        val source = """footrue"""
+        val expected = List(tokenUnquoted("footrue"))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeUnquotedTextContainingSpaceTrue() {
-        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue, Tokens.END)
-        assertEquals(expected, tokenizeAsList("""foo true"""))
+        val source = """foo true"""
+        val expected = List(tokenUnquoted("foo"), tokenUnquoted(" "), tokenTrue)
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeTrueAndSpaceAndUnquotedText() {
-        val expected = List(Tokens.START, tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"), Tokens.END)
-        assertEquals(expected, tokenizeAsList("""true foo"""))
+        val source = """true foo"""
+        val expected = List(tokenTrue, tokenUnquoted(" "), tokenUnquoted("foo"))
+        tokenizerTest(expected, source)
    }

    @Test
@@ -109,28 +123,33 @@ class TokenizerTest extends TestUtils {
        tokenizerTest(List(tokenUnquoted("a/b/c/")), "a/b/c/")
        tokenizerTest(List(tokenUnquoted("/")), "/")
        tokenizerTest(List(tokenUnquoted("/"), tokenUnquoted(" "), tokenUnquoted("/")), "/ /")
-        tokenizerTest(List(tokenComment("")), "//")
+        tokenizerTest(List(tokenCommentDoubleSlash("")), "//")
    }

    @Test
-    def tokenizeUnquotedTextTrimsSpaces() {
-        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenLine(1), Tokens.END)
-        assertEquals(expected, tokenizeAsList(" foo \n"))
+    def tokenizeUnquotedTextKeepsSpaces() {
+        val source = " foo \n"
+        val expected = List(tokenWhitespace(" "), tokenUnquoted("foo"), tokenWhitespace(" "),
+            tokenLine(1))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeUnquotedTextKeepsInternalSpaces() {
-        val expected = List(Tokens.START, tokenUnquoted("foo"), tokenUnquoted(" "), tokenUnquoted("bar"),
-            tokenUnquoted(" "), tokenUnquoted("baz"), tokenLine(1), Tokens.END)
-        assertEquals(expected, tokenizeAsList(" foo bar baz \n"))
+        val source = " foo bar baz \n"
+        val expected = List(tokenWhitespace(" "), tokenUnquoted("foo"), tokenUnquoted(" "),
+            tokenUnquoted("bar"), tokenUnquoted(" "), tokenUnquoted("baz"), tokenWhitespace(" "),
+            tokenLine(1))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizeMixedUnquotedQuoted() {
-        val expected = List(Tokens.START, tokenUnquoted("foo"),
-            tokenString("bar"), tokenUnquoted("baz"),
-            tokenLine(1), Tokens.END)
-        assertEquals(expected, tokenizeAsList(" foo\"bar\"baz \n"))
+        val source = " foo\"bar\"baz \n"
+        val expected = List(tokenWhitespace(" "), tokenUnquoted("foo"),
+            tokenString("bar"), tokenUnquoted("baz"), tokenWhitespace(" "),
+            tokenLine(1))
+        tokenizerTest(expected, source)
    }

    @Test
@@ -147,13 +166,14 @@ class TokenizerTest extends TestUtils {
        val tests = List[UnescapeTest]((""" "" """, ""),
            (" \"\\u0000\" ", Character.toString(0)), // nul byte
            (""" "\"\\\/\b\f\n\r\t" """, "\"\\/\b\f\n\r\t"),
-            ("\"\\u0046\"", "F"),
-            ("\"\\u0046\\u0046\"", "FF"))
+            (" \"\\u0046\" ", "F"),
+            (" \"\\u0046\\u0046\" ", "FF"))

        for (t <- tests) {
            describeFailure(t.toString) {
-                assertEquals(List(Tokens.START, Tokens.newValue(t.result), Tokens.END),
-                    tokenizeAsList(t.escaped))
+                val expected = List(tokenWhitespace(" "), Tokens.newValue(t.result, t.toString),
+                    tokenWhitespace(" "))
+                tokenizerTest(expected, t.escaped)
            }
        }
    }
@@ -182,32 +202,37 @@ class TokenizerTest extends TestUtils {

    @Test
    def tokenizerEmptyTripleQuoted(): Unit = {
-        assertEquals(List(Tokens.START, tokenString(""), Tokens.END),
-            tokenizeAsList("\"\"\"\"\"\""))
+        val source = "\"\"\"\"\"\""
+        val expected = List(tokenString(""))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizerTrivialTripleQuoted(): Unit = {
-        assertEquals(List(Tokens.START, tokenString("bar"), Tokens.END),
-            tokenizeAsList("\"\"\"bar\"\"\""))
+        val source = "\"\"\"bar\"\"\""
+        val expected = List(tokenString("bar"))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizerNoEscapesInTripleQuoted(): Unit = {
-        assertEquals(List(Tokens.START, tokenString("\\n"), Tokens.END),
-            tokenizeAsList("\"\"\"\\n\"\"\""))
+        val source = "\"\"\"\\n\"\"\""
+        val expected = List(tokenString("\\n"))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizerTrailingQuotesInTripleQuoted(): Unit = {
-        assertEquals(List(Tokens.START, tokenString("\"\"\""), Tokens.END),
-            tokenizeAsList("\"\"\"\"\"\"\"\"\""))
+        val source = "\"\"\"\"\"\"\"\"\""
+        val expected = List(tokenString("\"\"\""))
+        tokenizerTest(expected, source)
    }

    @Test
    def tokenizerNewlineInTripleQuoted(): Unit = {
-        assertEquals(List(Tokens.START, tokenString("foo\nbar"), Tokens.END),
-            tokenizeAsList("\"\"\"foo\nbar\"\"\""))
+        val source = "\"\"\"foo\nbar\"\"\""
+        val expected = List(tokenString("foo\nbar"))
+        tokenizerTest(expected, source)
    }

    @Test
@@ -229,8 +254,8 @@ class TokenizerTest extends TestUtils {
        for (t <- tests) {
            describeFailure(t.toString()) {
-                assertEquals(List(Tokens.START, t.result, Tokens.END),
-                    tokenizeAsList(t.s))
+                val expected = List(t.result)
+                tokenizerTest(expected, t.s)
            }
        }
    }
@@ -239,15 +264,30 @@ class TokenizerTest extends TestUtils {
    def commentsHandledInVariousContexts() {
        tokenizerTest(List(tokenString("//bar")), "\"//bar\"")
        tokenizerTest(List(tokenString("#bar")), "\"#bar\"")
-        tokenizerTest(List(tokenUnquoted("bar"), tokenComment("comment")), "bar//comment")
-        tokenizerTest(List(tokenUnquoted("bar"), tokenComment("comment")), "bar#comment")
-        tokenizerTest(List(tokenInt(10), tokenComment("comment")), "10//comment")
-        tokenizerTest(List(tokenInt(10), tokenComment("comment")), "10#comment")
-        tokenizerTest(List(tokenDouble(3.14), tokenComment("comment")), "3.14//comment")
-        tokenizerTest(List(tokenDouble(3.14), tokenComment("comment")), "3.14#comment")
+        tokenizerTest(List(tokenUnquoted("bar"), tokenCommentDoubleSlash("comment")), "bar//comment")
+        tokenizerTest(List(tokenUnquoted("bar"), tokenCommentHash("comment")), "bar#comment")
+        tokenizerTest(List(tokenInt(10), tokenCommentDoubleSlash("comment")), "10//comment")
+        tokenizerTest(List(tokenInt(10), tokenCommentHash("comment")), "10#comment")
+        tokenizerTest(List(tokenDouble(3.14), tokenCommentDoubleSlash("comment")), "3.14//comment")
+        tokenizerTest(List(tokenDouble(3.14), tokenCommentHash("comment")), "3.14#comment")
        // be sure we keep the newline
-        tokenizerTest(List(tokenInt(10), tokenComment("comment"), tokenLine(1), tokenInt(12)), "10//comment\n12")
-        tokenizerTest(List(tokenInt(10), tokenComment("comment"), tokenLine(1), tokenInt(12)), "10#comment\n12")
+        tokenizerTest(List(tokenInt(10), tokenCommentDoubleSlash("comment"), tokenLine(1), tokenInt(12)), "10//comment\n12")
+        tokenizerTest(List(tokenInt(10), tokenCommentHash("comment"), tokenLine(1), tokenInt(12)), "10#comment\n12")
+        // be sure we handle multi-line comments
+        tokenizerTest(List(tokenCommentDoubleSlash("comment"), tokenLine(1), tokenCommentDoubleSlash("comment2")),
+            "//comment\n//comment2")
+        tokenizerTest(List(tokenCommentHash("comment"), tokenLine(1), tokenCommentHash("comment2")),
+            "#comment\n#comment2")
+        tokenizerTest(List(tokenWhitespace(" "), tokenCommentDoubleSlash("comment\r"),
+            tokenLine(1), tokenWhitespace(" "), tokenCommentDoubleSlash("comment2 "),
+            tokenLine(2), tokenCommentDoubleSlash("comment3 "),
+            tokenLine(3), tokenLine(4), tokenCommentDoubleSlash("comment4")),
+            " //comment\r\n //comment2 \n//comment3 \n\n//comment4")
    }

    @Test