make whitespace handling Unicode-correct.

This is maybe a little above and beyond what will ever matter in practice.
2025-04-15 05:30:43 +08:00 · 2011-11-12 21:46:14 -05:00 · 2011-11-12 21:46:14 -05:00 · 7b0e5a1471
commit 7b0e5a1471
parent b0fdc6456c
5 changed files with 132 additions and 13 deletions
--- a/src/main/java/com/typesafe/config/Config.java
+++ b/src/main/java/com/typesafe/config/Config.java
@ -3,6 +3,7 @@ package com.typesafe.config;
 import java.util.concurrent.TimeUnit;

 import com.typesafe.config.impl.ConfigImpl;
+import com.typesafe.config.impl.ConfigUtil;

 /**
 * This class holds some global static methods for the config package.
@ -68,10 +69,11 @@ public final class Config {
     */
    public static long parseDuration(String input,
            ConfigOrigin originForException, String pathForException) {
-        String s = input.trim();
+        String s = ConfigUtil.unicodeTrim(input);
        String originalUnitString = getUnits(s);
        String unitString = originalUnitString;
-        String numberString = s.substring(0, s.length() - unitString.length()).trim();
+        String numberString = ConfigUtil.unicodeTrim(s.substring(0, s.length()
+                - unitString.length()));
        TimeUnit units = null;

        // this would be caught later anyway, but the error message
@ -150,7 +152,7 @@ public final class Config {
     */
    public static long parseMemorySize(String input,
            ConfigOrigin originForException, String pathForException) {
-        String s = input.trim();
+        String s = ConfigUtil.unicodeTrim(input);
        String unitStringMaybePlural = getUnits(s);
        String unitString;
        if (unitStringMaybePlural.endsWith("s"))
@ -159,9 +161,8 @@ public final class Config {
        else
            unitString = unitStringMaybePlural;
        String unitStringLower = unitString.toLowerCase();
-        String numberString = s.substring(0,
-                s.length() - unitStringMaybePlural.length())
-                .trim();
+        String numberString = ConfigUtil.unicodeTrim(s.substring(0, s.length()
+                - unitStringMaybePlural.length()));

        // this would be caught later anyway, but the error message
        // is more helpful if we check it here.
--- a/src/main/java/com/typesafe/config/impl/ConfigUtil.java
+++ b/src/main/java/com/typesafe/config/impl/ConfigUtil.java
@ -1,7 +1,8 @@
 package com.typesafe.config.impl;


-final class ConfigUtil {
+/** This is public just for the "config" package to use, don't touch it */
+final public class ConfigUtil {
    static boolean equalsHandlingNull(Object a, Object b) {
        if (a == null && b != null)
            return false;
@ -50,4 +51,68 @@ final class ConfigUtil {
        sb.append('"');
        return sb.toString();
    }
+
+    static boolean isWhitespace(int codepoint) {
+        switch (codepoint) {
+        // try to hit the most common ASCII ones first, then the nonbreaking
+        // spaces that Java brokenly leaves out of isWhitespace.
+        case ' ':
+        case '\n':
+        case '\u00A0':
+        case '\u2007':
+        case '\u202F':
+            return true;
+        default:
+            return Character.isWhitespace(codepoint);
+        }
+    }
+
+    /** This is public just for the "config" package to use, don't touch it! */
+    public static String unicodeTrim(String s) {
+        // this is dumb because it looks like there aren't any whitespace
+        // characters that need surrogate encoding. But, points for
+        // pedantic correctness! It's future-proof or something.
+        // String.trim() actually is broken, since there are plenty of
+        // non-ASCII whitespace characters.
+        final int length = s.length();
+        if (length == 0)
+            return s;
+
+        int start = 0;
+        while (true) {
+            char c = s.charAt(start);
+            if (c == ' ' || c == '\n') {
+                start += 1;
+            } else {
+                int cp = s.codePointAt(start);
+                if (isWhitespace(cp))
+                    start += Character.charCount(cp);
+                else
+                    break;
+            }
+        }
+
+        int end = length;
+        while (true) {
+            char c = s.charAt(end - 1);
+            if (c == ' ' || c == '\n') {
+                --end;
+            } else {
+                int cp;
+                int delta;
+                if (Character.isLowSurrogate(c)) {
+                    cp = s.codePointAt(end - 2);
+                    delta = 2;
+                } else {
+                    cp = s.codePointAt(end - 1);
+                    delta = 1;
+                }
+                if (isWhitespace(cp))
+                    end -= delta;
+                else
+                    break;
+            }
+        }
+        return s.substring(start, end);
+    }
 }
--- a/src/main/java/com/typesafe/config/impl/Parser.java
+++ b/src/main/java/com/typesafe/config/impl/Parser.java
@ -384,7 +384,7 @@ final class Parser {

            for (int i = 0; i < s.length(); ++i) {
                char c = s.charAt(i);
-                if (!Character.isWhitespace(c))
+                if (!ConfigUtil.isWhitespace(c))
                    return false;
            }
            return true;
@ -796,7 +796,7 @@ final class Parser {
    // do something much faster than the full parser if
    // we just have something like "foo" or "foo.bar"
    private static Path speculativeFastParsePath(String path) {
-        String s = path.trim();
+        String s = ConfigUtil.unicodeTrim(path);
        if (hasUnsafeChars(s))
            return null;
        if (s.startsWith(".") || s.endsWith(".") || s.contains(".."))
--- a/src/main/java/com/typesafe/config/impl/Tokenizer.java
+++ b/src/main/java/com/typesafe/config/impl/Tokenizer.java
@ -125,13 +125,11 @@ final class Tokenizer {
        }

        static boolean isWhitespace(int c) {
-            // hoping this optimizes slightly by catching the most common ' '
-            // case up front.
-            return c == ' ' || c == '\n' || Character.isWhitespace(c);
+            return ConfigUtil.isWhitespace(c);
        }

        static boolean isWhitespaceNotNewline(int c) {
-            return c == ' ' || (c != '\n' && Character.isWhitespace(c));
+            return c != '\n' && ConfigUtil.isWhitespace(c);
        }

        private int slurpComment() {
--- a/src/test/scala/com/typesafe/config/impl/UtilTest.scala
+++ b/src/test/scala/com/typesafe/config/impl/UtilTest.scala
@ -0,0 +1,55 @@
+package com.typesafe.config.impl
+
+import org.junit.Assert._
+import org.junit._
+
+class UtilTest extends TestUtils {
+    private lazy val supplementaryChars = {
+        val sb = new java.lang.StringBuilder()
+        val codepoints = Seq(
+            0x2070E, 0x20731, 0x20779, 0x20C53, 0x20C78,
+            0x20C96, 0x20CCF, 0x20CD5, 0x20D15, 0x20D7C)
+        for (c <- codepoints) {
+            sb.appendCodePoint(c)
+        }
+        assertTrue(sb.length() > codepoints.length)
+        sb.toString()
+    }
+
+    @Test
+    def unicodeTrimSupplementaryChars() {
+        assertEquals("", ConfigUtil.unicodeTrim(""))
+        assertEquals("a", ConfigUtil.unicodeTrim("a"))
+        assertEquals("abc", ConfigUtil.unicodeTrim("abc"))
+        assertEquals(supplementaryChars, ConfigUtil.unicodeTrim(supplementaryChars))
+
+        val s = " \u00A0 \n  " + supplementaryChars + "  \n  \u00A0 "
+        val asciiTrimmed = s.trim()
+        val unitrimmed = ConfigUtil.unicodeTrim(s)
+
+        assertFalse(asciiTrimmed.equals(unitrimmed))
+        assertEquals(supplementaryChars, unitrimmed)
+    }
+
+    @Test
+    def definitionOfWhitespace() {
+        assertTrue(ConfigUtil.isWhitespace(' '))
+        assertTrue(ConfigUtil.isWhitespace('\n'))
+        // these three are nonbreaking spaces
+        assertTrue(ConfigUtil.isWhitespace('\u00A0'))
+        assertTrue(ConfigUtil.isWhitespace('\u2007'))
+        assertTrue(ConfigUtil.isWhitespace('\u202F'))
+        // vertical tab, a weird one
+        assertTrue(ConfigUtil.isWhitespace('\u000B'))
+        // file separator, another weird one
+        assertTrue(ConfigUtil.isWhitespace('\u001C'))
+    }
+
+    @Test
+    def equalsThatHandlesNull() {
+        assertTrue(ConfigUtil.equalsHandlingNull(null, null))
+        assertFalse(ConfigUtil.equalsHandlingNull(new Object(), null))
+        assertFalse(ConfigUtil.equalsHandlingNull(null, new Object()))
+        assertTrue(ConfigUtil.equalsHandlingNull("", ""))
+    }
+}