make whitespace handling Unicode-correct.

This is maybe a little above and beyond what will ever matter in practice.
This commit is contained in:
Havoc Pennington 2011-11-12 21:46:14 -05:00
parent b0fdc6456c
commit 7b0e5a1471
5 changed files with 132 additions and 13 deletions

View File

@ -3,6 +3,7 @@ package com.typesafe.config;
import java.util.concurrent.TimeUnit;
import com.typesafe.config.impl.ConfigImpl;
import com.typesafe.config.impl.ConfigUtil;
/**
* This class holds some global static methods for the config package.
@ -68,10 +69,11 @@ public final class Config {
*/
public static long parseDuration(String input,
ConfigOrigin originForException, String pathForException) {
String s = input.trim();
String s = ConfigUtil.unicodeTrim(input);
String originalUnitString = getUnits(s);
String unitString = originalUnitString;
String numberString = s.substring(0, s.length() - unitString.length()).trim();
String numberString = ConfigUtil.unicodeTrim(s.substring(0, s.length()
- unitString.length()));
TimeUnit units = null;
// this would be caught later anyway, but the error message
@ -150,7 +152,7 @@ public final class Config {
*/
public static long parseMemorySize(String input,
ConfigOrigin originForException, String pathForException) {
String s = input.trim();
String s = ConfigUtil.unicodeTrim(input);
String unitStringMaybePlural = getUnits(s);
String unitString;
if (unitStringMaybePlural.endsWith("s"))
@ -159,9 +161,8 @@ public final class Config {
else
unitString = unitStringMaybePlural;
String unitStringLower = unitString.toLowerCase();
String numberString = s.substring(0,
s.length() - unitStringMaybePlural.length())
.trim();
String numberString = ConfigUtil.unicodeTrim(s.substring(0, s.length()
- unitStringMaybePlural.length()));
// this would be caught later anyway, but the error message
// is more helpful if we check it here.

View File

@ -1,7 +1,8 @@
package com.typesafe.config.impl;
final class ConfigUtil {
/** This is public just for the "config" package to use, don't touch it */
final public class ConfigUtil {
static boolean equalsHandlingNull(Object a, Object b) {
if (a == null && b != null)
return false;
@ -50,4 +51,68 @@ final class ConfigUtil {
sb.append('"');
return sb.toString();
}
/**
 * Reports whether a Unicode code point counts as whitespace for config parsing.
 *
 * This is {@link Character#isWhitespace(int)} extended with the three
 * non-breaking spaces (U+00A0, U+2007, U+202F) that the JDK method excludes.
 *
 * @param codepoint the Unicode code point to classify
 * @return true if the code point is treated as whitespace
 */
static boolean isWhitespace(int codepoint) {
    switch (codepoint) {
    // try to hit the most common ASCII ones first, then the nonbreaking
    // spaces that Java brokenly leaves out of isWhitespace.
    case ' ':
    case '\n':
    case '\u00A0':
    case '\u2007':
    case '\u202F':
        return true;
    default:
        return Character.isWhitespace(codepoint);
    }
}

/**
 * Strips leading and trailing whitespace, as defined by
 * {@link #isWhitespace(int)}, from a string. Unlike {@link String#trim()}
 * this also removes non-ASCII whitespace such as non-breaking spaces, and
 * it walks the string by code point rather than by char.
 *
 * This is public just for the "config" package to use, don't touch it!
 *
 * @param s the string to trim; must not be null
 * @return the trimmed string; empty if {@code s} is empty or all whitespace
 * @throws NullPointerException if {@code s} is null
 */
public static String unicodeTrim(String s) {
    // No known whitespace character needs surrogate encoding, so stepping by
    // code point is pedantic; it just keeps the scan correct if one is ever
    // added, and it never splits a surrogate pair.
    final int length = s.length();

    // Both scans are bounded so that an empty or all-whitespace string
    // ends with start == end instead of indexing past the string.
    int start = 0;
    while (start < length) {
        int cp = s.codePointAt(start);
        if (!isWhitespace(cp)) {
            break;
        }
        start += Character.charCount(cp);
    }

    int end = length;
    while (end > start) {
        // codePointBefore only combines a trailing low surrogate with the
        // char before it when that char really is a high surrogate; a lone
        // surrogate is returned (and kept) as-is.
        int cp = Character.codePointBefore(s, end);
        if (!isWhitespace(cp)) {
            break;
        }
        end -= Character.charCount(cp);
    }

    return s.substring(start, end);
}
}

View File

@ -384,7 +384,7 @@ final class Parser {
for (int i = 0; i < s.length(); ++i) {
char c = s.charAt(i);
if (!Character.isWhitespace(c))
if (!ConfigUtil.isWhitespace(c))
return false;
}
return true;
@ -796,7 +796,7 @@ final class Parser {
// do something much faster than the full parser if
// we just have something like "foo" or "foo.bar"
private static Path speculativeFastParsePath(String path) {
String s = path.trim();
String s = ConfigUtil.unicodeTrim(path);
if (hasUnsafeChars(s))
return null;
if (s.startsWith(".") || s.endsWith(".") || s.contains(".."))

View File

@ -125,13 +125,11 @@ final class Tokenizer {
}
/**
 * Reports whether the code point {@code c} counts as whitespace for the tokenizer.
 *
 * NOTE(review): this span comes from a rendered diff whose +/- markers were
 * stripped, so two return statements appear. The first (with the comment above
 * it) looks like the removed line and the second, which delegates to
 * ConfigUtil.isWhitespace, looks like the added one. Confirm against the real file.
 */
static boolean isWhitespace(int c) {
// hoping this optimizes slightly by catching the most common ' '
// case up front.
return c == ' ' || c == '\n' || Character.isWhitespace(c);
return ConfigUtil.isWhitespace(c);
}
/**
 * Reports whether the code point {@code c} is whitespace other than the
 * newline character '\n'.
 *
 * NOTE(review): this span comes from a rendered diff whose +/- markers were
 * stripped, so two return statements appear. The first looks like the removed
 * line and the second, which delegates to ConfigUtil.isWhitespace, looks like
 * the added one. Confirm against the real file.
 */
static boolean isWhitespaceNotNewline(int c) {
return c == ' ' || (c != '\n' && Character.isWhitespace(c));
return c != '\n' && ConfigUtil.isWhitespace(c);
}
private int slurpComment() {

View File

@ -0,0 +1,55 @@
package com.typesafe.config.impl
import org.junit.Assert._
import org.junit._
/**
 * Unit tests for the static helpers in ConfigUtil: Unicode-aware trimming,
 * the library's definition of whitespace, and null-tolerant equality.
 */
class UtilTest extends TestUtils {
    // A string built only from code points that need a surrogate pair, used
    // to check that trimming steps over multi-char code points correctly.
    private lazy val supplementaryChars = {
        val codePoints = Seq(
            0x2070E, 0x20731, 0x20779, 0x20C53, 0x20C78,
            0x20C96, 0x20CCF, 0x20CD5, 0x20D15, 0x20D7C)
        val builder = new java.lang.StringBuilder()
        codePoints.foreach { cp => builder.appendCodePoint(cp) }
        // sanity check: the string is longer in chars than in code points
        assertTrue(builder.length() > codePoints.length)
        builder.toString()
    }

    @Test
    def unicodeTrimSupplementaryChars(): Unit = {
        // inputs with nothing to trim come back unchanged
        assertEquals("", ConfigUtil.unicodeTrim(""))
        assertEquals("a", ConfigUtil.unicodeTrim("a"))
        assertEquals("abc", ConfigUtil.unicodeTrim("abc"))
        assertEquals(supplementaryChars, ConfigUtil.unicodeTrim(supplementaryChars))

        // pad with a mix of ASCII whitespace and non-breaking spaces
        val padded = " \u00A0 \n " + supplementaryChars + " \n \u00A0 "
        val viaJavaTrim = padded.trim()
        val viaUnicodeTrim = ConfigUtil.unicodeTrim(padded)

        // String.trim() leaves the non-breaking spaces behind; unicodeTrim does not
        assertFalse(viaJavaTrim.equals(viaUnicodeTrim))
        assertEquals(supplementaryChars, viaUnicodeTrim)
    }

    @Test
    def definitionOfWhitespace(): Unit = {
        val expectedWhitespace = Seq[Char](
            ' ',
            '\n',
            // these three are nonbreaking spaces
            '\u00A0',
            '\u2007',
            '\u202F',
            // vertical tab, a weird one
            '\u000B',
            // file separator, another weird one
            '\u001C')
        expectedWhitespace.foreach { ch => assertTrue(ConfigUtil.isWhitespace(ch)) }
    }

    @Test
    def equalsThatHandlesNull(): Unit = {
        // both null: equal
        assertTrue(ConfigUtil.equalsHandlingNull(null, null))
        // exactly one side null: not equal, whichever side it is
        assertFalse(ConfigUtil.equalsHandlingNull(new Object(), null))
        assertFalse(ConfigUtil.equalsHandlingNull(null, new Object()))
        // two non-null equal values: equal
        assertTrue(ConfigUtil.equalsHandlingNull("", ""))
    }
}