From 5b170ab63aaf4be23d7c6b42d2f844579f82a489 Mon Sep 17 00:00:00 2001 From: Karlatemp Date: Mon, 12 Jul 2021 15:46:03 +0800 Subject: [PATCH] Add `HtmlEntity` for decode html entity --- .../src/commonMain/kotlin/HtmlEntity.kt | 264 ++++++++++++++++++ .../net/mamoe/mirai/utils/HtmlEscapeTest.kt | 25 ++ 2 files changed, 289 insertions(+) create mode 100644 mirai-core-utils/src/commonMain/kotlin/HtmlEntity.kt create mode 100644 mirai-core-utils/src/commonTest/kotlin/net/mamoe/mirai/utils/HtmlEscapeTest.kt diff --git a/mirai-core-utils/src/commonMain/kotlin/HtmlEntity.kt b/mirai-core-utils/src/commonMain/kotlin/HtmlEntity.kt new file mode 100644 index 000000000..514bf3806 --- /dev/null +++ b/mirai-core-utils/src/commonMain/kotlin/HtmlEntity.kt @@ -0,0 +1,264 @@ +/* + * Copyright 2019-2021 Mamoe Technologies and contributors. + * + * 此源代码的使用受 GNU AFFERO GENERAL PUBLIC LICENSE version 3 许可证的约束, 可以在以下链接找到该许可证. + * Use of this source code is governed by the GNU AGPLv3 license that can be found through the following link. + * + * https://github.com/mamoe/mirai/blob/dev/LICENSE + */ + +package net.mamoe.mirai.utils + +@Suppress("RegExpRedundantEscape") +private val STR_TO_CHAR_PATTERN = """\&(\#?[A-Za-z0-9]+?)\;""".toRegex() + +public fun String.decodeHtmlEscape(): String = replace(STR_TO_CHAR_PATTERN) { match -> + STR_TO_CHAR_MAPPINGS[match.value]?.let { return@replace it } + val match1 = match.groups[1]!!.value + if (match1.length > 1 && match1[0] == '#') { + if (match1.length > 2) { + if (match1[1] == 'x') { // hex + match1.substring(2).toIntOrNull(16)?.let { + return@replace it.toChar().toString() + } + } + } + match1.substring(1).toIntOrNull()?.let { + return@replace it.toChar().toString() + } + } + + match.value +} + + +private val STR_TO_CHAR_MAPPINGS: Map by lazy { +// + val result = HashMap(223) + result["&"] = "\u0026" + result["<"] = "\u003c" + result[">"] = "\u003e" + result[" "] = "\u00a0" + result["¡"] = "\u00a1" + result["¢"] = "\u00a2" + result["£"] = "\u00a3" + result["¤"] = "\u00a4" + result["¥"] = "\u00a5" + result["¦"] = "\u00a6" + result["§"] = "\u00a7" + result["¨"] = "\u00a8" + result["©"] = "\u00a9" + result["ª"] = "\u00aa" + result["«"] = "\u00ab" + result["¬"] = "\u00ac" + result["­"] = "\u00ad" + result["®"] = "\u00ae" + result["¯"] = "\u00af" + result["°"] = "\u00b0" + result["±"] = "\u00b1" + result["²"] = "\u00b2" + result["³"] = "\u00b3" + result["´"] = "\u00b4" + result["µ"] = "\u00b5" + result["¶"] = "\u00b6" + result["·"] = "\u00b7" + result["¸"] = "\u00b8" + result["¹"] = "\u00b9" + result["º"] = "\u00ba" + result["»"] = "\u00bb" + result["¼"] = "\u00bc" + result["½"] = "\u00bd" + result["¾"] = "\u00be" + result["¿"] = "\u00bf" + result["À"] = "\u00c0" + result["Á"] = "\u00c1" + result["Â"] = "\u00c2" + result["Ã"] = "\u00c3" + result["Ä"] = "\u00c4" + result["Å"] = "\u00c5" + result["Æ"] = "\u00c6" + result["Ç"] = "\u00c7" + result["È"] = "\u00c8" + result["É"] = "\u00c9" + result["Ê"] = "\u00ca" + result["Ë"] = "\u00cb" + result["Ì"] = "\u00cc" + result["Í"] = "\u00cd" + result["Î"] = "\u00ce" + result["Ï"] = "\u00cf" + result["Ð"] = "\u00d0" + result["Ñ"] = "\u00d1" + result["Ò"] = "\u00d2" + result["Ó"] = "\u00d3" + result["Ô"] = "\u00d4" + result["Õ"] = "\u00d5" + result["Ö"] = "\u00d6" + result["×"] = "\u00d7" + result["Ø"] = "\u00d8" + result["Ù"] = "\u00d9" + result["Ú"] = "\u00da" + result["Û"] = "\u00db" + result["Ü"] = "\u00dc" + result["Ý"] = "\u00dd" + result["Þ"] = "\u00de" + result["ß"] = "\u00df" + result["à"] = "\u00e0" + result["á"] = "\u00e1" + result["â"] = "\u00e2" + result["ã"] = "\u00e3" + result["ä"] = "\u00e4" + result["å"] = "\u00e5" + result["æ"] = "\u00e6" + result["ç"] = "\u00e7" + result["è"] = "\u00e8" + result["é"] = "\u00e9" + result["ê"] = "\u00ea" + result["ë"] = "\u00eb" + result["ì"] = "\u00ec" + result["í"] = "\u00ed" + result["î"] = "\u00ee" + result["ï"] = "\u00ef" + result["ð"] = "\u00f0" + result["ñ"] = "\u00f1" + result["ò"] = "\u00f2" + result["ó"] = "\u00f3" + result["ô"] = "\u00f4" + result["õ"] = "\u00f5" + result["ö"] = "\u00f6" + result["÷"] = "\u00f7" + result["ø"] = "\u00f8" + result["ù"] = "\u00f9" + result["ú"] = "\u00fa" + result["û"] = "\u00fb" + result["ü"] = "\u00fc" + result["ý"] = "\u00fd" + result["þ"] = "\u00fe" + result["ÿ"] = "\u00ff" + result["ƒ"] = "\u0192" + result["Α"] = "\u0391" + result["Β"] = "\u0392" + result["Γ"] = "\u0393" + result["Δ"] = "\u0394" + result["Ε"] = "\u0395" + result["Ζ"] = "\u0396" + result["Η"] = "\u0397" + result["Θ"] = "\u0398" + result["Ι"] = "\u0399" + result["Κ"] = "\u039a" + result["Λ"] = "\u039b" + result["Μ"] = "\u039c" + result["Ν"] = "\u039d" + result["Ξ"] = "\u039e" + result["Ο"] = "\u039f" + result["Π"] = "\u03a0" + result["Ρ"] = "\u03a1" + result["Σ"] = "\u03a3" + result["Τ"] = "\u03a4" + result["Υ"] = "\u03a5" + result["Φ"] = "\u03a6" + result["Χ"] = "\u03a7" + result["Ψ"] = "\u03a8" + result["Ω"] = "\u03a9" + result["α"] = "\u03b1" + result["β"] = "\u03b2" + result["γ"] = "\u03b3" + result["δ"] = "\u03b4" + result["ε"] = "\u03b5" + result["ζ"] = "\u03b6" + result["η"] = "\u03b7" + result["θ"] = "\u03b8" + result["ι"] = "\u03b9" + result["κ"] = "\u03ba" + result["λ"] = "\u03bb" + result["μ"] = "\u03bc" + result["ν"] = "\u03bd" + result["ξ"] = "\u03be" + result["ο"] = "\u03bf" + result["π"] = "\u03c0" + result["ρ"] = "\u03c1" + result["ς"] = "\u03c2" + result["σ"] = "\u03c3" + result["τ"] = "\u03c4" + result["υ"] = "\u03c5" + result["φ"] = "\u03c6" + result["χ"] = "\u03c7" + result["ψ"] = "\u03c8" + result["ω"] = "\u03c9" + result["ϑ"] = "\u03d1" + result["ϒ"] = "\u03d2" + result["ϖ"] = "\u03d6" + result["•"] = "\u2022" + result["…"] = "\u2026" + result["′"] = "\u2032" + result["″"] = "\u2033" + result["‾"] = "\u203e" + result["⁄"] = "\u2044" + result["℘"] = "\u2118" + result["ℑ"] = "\u2111" + result["ℜ"] = "\u211c" + result["™"] = "\u2122" + result["ℵ"] = "\u2135" + result["←"] = "\u2190" + result["↑"] = "\u2191" + result["→"] = "\u2192" + result["↓"] = "\u2193" + result["↔"] = "\u2194" + result["↵"] = "\u21b5" + result["⇐"] = "\u21d0" + result["⇑"] = "\u21d1" + result["⇒"] = "\u21d2" + result["⇓"] = "\u21d3" + result["⇔"] = "\u21d4" + result["∀"] = "\u2200" + result["∂"] = "\u2202" + result["∃"] = "\u2203" + result["∅"] = "\u2205" + result["∇"] = "\u2207" + result["∈"] = "\u2208" + result["∉"] = "\u2209" + result["∋"] = "\u220b" + result["∏"] = "\u220f" + result["∑"] = "\u2211" + result["−"] = "\u2212" + result["∗"] = "\u2217" + result["√"] = "\u221a" + result["∝"] = "\u221d" + result["∞"] = "\u221e" + result["∠"] = "\u2220" + result["∧"] = "\u2227" + result["∨"] = "\u2228" + result["∩"] = "\u2229" + result["∪"] = "\u222a" + result["∫"] = "\u222b" + result["∴"] = "\u2234" + result["∼"] = "\u223c" + result["≅"] = "\u2245" + result["≈"] = "\u2248" + result["≠"] = "\u2260" + result["≡"] = "\u2261" + result["≤"] = "\u2264" + result["≥"] = "\u2265" + result["⊂"] = "\u2282" + result["⊃"] = "\u2283" + result["⊄"] = "\u2284" + result["⊆"] = "\u2286" + result["⊇"] = "\u2287" + result["⊕"] = "\u2295" + result["⊗"] = "\u2297" + result["⊥"] = "\u22a5" + result["⋅"] = "\u22c5" + result["⌈"] = "\u2308" + result["⌉"] = "\u2309" + result["⌊"] = "\u230a" + result["⌋"] = "\u230b" + result["⟨"] = "\u2329" + result["⟩"] = "\u232a" + result["◊"] = "\u25ca" + result["♠"] = "\u2660" + result["♣"] = "\u2663" + result["♥"] = "\u2665" + result["♦"] = "\u2666" +// + result +} + diff --git a/mirai-core-utils/src/commonTest/kotlin/net/mamoe/mirai/utils/HtmlEscapeTest.kt b/mirai-core-utils/src/commonTest/kotlin/net/mamoe/mirai/utils/HtmlEscapeTest.kt new file mode 100644 index 000000000..2ce85b733 --- /dev/null +++ b/mirai-core-utils/src/commonTest/kotlin/net/mamoe/mirai/utils/HtmlEscapeTest.kt @@ -0,0 +1,25 @@ +/* + * Copyright 2019-2021 Mamoe Technologies and contributors. + * + * 此源代码的使用受 GNU AFFERO GENERAL PUBLIC LICENSE version 3 许可证的约束, 可以在以下链接找到该许可证. + * Use of this source code is governed by the GNU AGPLv3 license that can be found through the following link. + * + * https://github.com/mamoe/mirai/blob/dev/LICENSE + */ + +package net.mamoe.mirai.utils + +import kotlin.test.Test +import kotlin.test.assertEquals + +internal class HtmlEscapeTest { + @Test + fun testDecode() { + val ALL = "Ͻ∏«ü⋅Φ∃≤ç‾∉ΗöêÞÒÔª⌉î⊄μ≥ß∧í>¯ëßÓ»κ÷♣ßζÍ∅〉η″∞\ÒêØ⌈ÜÏôλ♠èφÌ°ºÞ〈òÕ∏<™←⊥Α⌉õéÝμݙ˺∠ï∅òæΗ©Υþ♥îΕ←Ü∩Ê∗∅∪⇔ñðΔ℘↑ç…Â′‾Ζÿ♦óÉÿíÛâ∨⇑¤÷¶þε<→♥⊗ƒü∗ð″Ρ³Ε§þ∇ìÚ±Τè¥↓ýτñ≈¨ΠΔ∈〈ýë↑õ®ÛÏϖáÓ∫>Ê∇óΙψ∫ÎΖÐυ·≥∼òà∝∠∋∈û⊥ϖÐ⊆ΛøΗ♠⊂¨åéΨ∉∞⊂ñ£ℵÇ⟨¾¢Í♣öäΝØΑΘÌ⊕⊃Ú⊃ð⌊⊂↵⌈«ς∂ΞΙ◊ÙνÍ©↓¯β∩ì≠⌉↑∋⊄ο∗½Β∼∪ΡξÊν♥îâÖΜ®ο≡ℜ→Æ∀ÃΓ∫κξδïã⊇×Ëƒμ¿­↓ρ⋅Ο⊆æ¼Δ√àÔÈå⁄ø<ìľ¬↔ΓΩ∂⊇ 〉ÕëªÉ⁄ùíΕáκ•Á≤㡦ςéÁ׵ų⇓⇒χλυΖιý≥σ² αλ¶èªÌÖÄ⇔ ♦∑σϒφτ♦Κθϖβ→τ⇐ñΠΘùç⌊¿Õ∼≈&ϒϑγ⇓ℑσ™ΛηυΡ⇑γÔ°Àø´æ⌋ùℵ…⊄′≠ςζ⇒⌋⌈åÓÁÙ÷√∃≡⊃◊≡ρ¬⊗Û⇓∨∴¾öä§Ò∏Àϑó&⇐ℵ∧ℜ′Ñ⁄ÿπ⊆⇐∝∩ϒõãÑπ⇑⊕ƒ″‾⊇Ð∪Ü−á∞⊗ρôâ℘∑ι←⇒⌊∇∀♠ä◊θ↔¿¥©θúΦê>½Ο±Γ∀¡ü↵Ú≅∈Χ¨ιûℜÖû²∂¦ΞϑΒ´×↔¸Π­•»¯¹Ψ£§ℑú℘∃−♣εÇΩΝΑ∧λωÀ⊥ΧΒ≠πàΩ⇔°¤Þ¸¦¼δχΨΜ∨Μℑ∠δφ⊕Ý¡¥úŹɭ·αο≅γ⌋ËΧ∉ΛÅôΙΝΚΣ嶤³•ÆѺ®χ¢ÈΣ∴ïξβµΦΚ&ΤΘ…ÈΤ↵Ξ·Ç«µ£²ζÃÙψωναΥΣ⟩∑∝∋√⋅≤Ο¬Ø ¹Æ´¢ωÄΥ≅¸η≈∴−ψμ" + val RESP = "\u00cf\u00bd\u220f\u00ab\u00fc\u22c5\u03a6\u2203\u2264\u00e7\u203e\u2209\u0397\u00f6\u00ea\u00de\u00d2\u00d4\u00aa\u2309\u00ee\u2284\u00ce\u00bc\u2265\u00df\u2227\u00ed\u003e\u00af\u00eb\u00df\u00d3\u00bb\u03ba\u00f7\u2663\u00df\u03b6\u00cd\u2205\u232a\u03b7\u2033\u221e\u005c\u00d2\u00ea\u00d8\u2308\u00dc\u00cf\u00f4\u03bb\u2660\u00e8\u03c6\u00cc\u00b0\u00ba\u00de\u2329\u00f2\u00d5\u220f\u003c\u2122\u2190\u22a5\u0391\u2309\u00f5\u00e9\u00dd\u03bc\u00dd\u2122\u00cb\u00ba\u2220\u00ef\u2205\u00f2\u00e6\u0397\u00a9\u03a5\u00fe\u2665\u00ee\u0395\u2190\u00dc\u2229\u00ca\u2217\u2205\u222a\u21d4\u00f1\u00f0\u0394\u2118\u2191\u00e7\u2026\u00c2\u2032\u203e\u0396\u00ff\u2666\u00f3\u00c9\u00ff\u00ed\u00db\u00e2\u2228\u21d1\u00a4\u00f7\u00b6\u00fe\u03b5\u003c\u2192\u2665\u2297\u0192\u00fc\u2217\u00f0\u2033\u03a1\u00b3\u0395\u00a7\u00fe\u2207\u00ec\u00da\u00b1\u03a4\u00e8\u00a5\u2193\u00fd\u03c4\u00f1\u2248\u00a8\u03a0\u0394\u2208\u2329\u00fd\u00eb\u2191\u00f5\u00ae\u00db\u00cf\u03d6\u00e1\u00d3\u222b\u003e\u00ca\u2207\u00f3\u0399\u03c8\u222b\u00ce\u0396\u00d0\u03c5\u00b7\u2265\u223c\u00f2\u00e0\u221d\u2220\u220b\u2208\u00fb\u22a5\u03d6\u00d0\u2286\u039b\u00f8\u0397\u2660\u2282\u00a8\u00e5\u00e9\u03a8\u2209\u221e\u2282\u00f1\u00a3\u2135\u00c7\u2329\u00be\u00a2\u00cd\u2663\u00f6\u00e4\u039d\u00d8\u0391\u0398\u00cc\u2295\u2283\u00da\u2283\u00f0\u230a\u2282\u21b5\u2308\u00ab\u03c2\u2202\u039e\u0399\u25ca\u00d9\u03bd\u00cd\u00a9\u2193\u00af\u03b2\u2229\u00ec\u2260\u2309\u2191\u220b\u2284\u03bf\u2217\u00bd\u0392\u223c\u222a\u03a1\u03be\u00ca\u03bd\u2665\u00ee\u00e2\u00d6\u039c\u00ae\u03bf\u2261\u211c\u2192\u00c6\u2200\u00c3\u0393\u222b\u03ba\u03be\u03b4\u00ef\u00e3\u2287\u00d7\u00cb\u0192\u03bc\u00bf\u00ad\u2193\u03c1\u22c5\u039f\u2286\u00e6\u00bc\u0394\u221a\u00e0\u00d4\u00c8\u00e5\u2044\u00f8\u003c\u00ec\u00c4\u00be\u00ac\u2194\u0393\u03a9\u2202\u2287\u00a0\u232a\u00d5\u00eb\u00aa\u00c9\u2044\u00f9\u00ed\u0395\u00e1\u03ba\u2022\u00c1\u2264\u00e3\u00a1\u00a6\u03c2\u00e9\u00c1\u00d7\u00b5\u00c5\u00b3\u21d3\u21d2\u03c7\u03bb\u03c5\u0396\u03b9\u00fd\u2265\u03c3\u00b2\u00a0\u03b1\u03bb\u00c2\u00b6\u00e8\u00aa\u00cc\u00d6\u00c4\u21d4\u000a\u2666\u2211\u03c3\u03d2\u03c6\u03c4\u2666\u039a\u03b8\u03d6\u03b2\u2192\u03c4\u21d0\u00c3\u00b1\u03a0\u0398\u00f9\u00e7\u230a\u00bf\u00d5\u223c\u2248\u0026\u03d2\u03d1\u03b3\u21d3\u2111\u03c3\u2122\u039b\u03b7\u03c5\u03a1\u21d1\u03b3\u00d4\u00c2\u00b0\u00c0\u00f8\u00b4\u00e6\u230b\u00f9\u2135\u2026\u2284\u2032\u2260\u03c2\u03b6\u21d2\u230b\u2308\u00e5\u00d3\u00c1\u00d9\u00f7\u221a\u2203\u2261\u2283\u25ca\u2261\u03c1\u00ac\u2297\u00db\u21d3\u2228\u2234\u00be\u00f6\u00e4\u00a7\u00d2\u220f\u00c0\u03d1\u00f3\u0026\u21d0\u2135\u2227\u211c\u2032\u00d1\u2044\u00ff\u03c0\u2286\u21d0\u221d\u2229\u03d2\u00f5\u00e3\u00d1\u03c0\u21d1\u2295\u0192\u2033\u203e\u2287\u00d0\u222a\u00dc\u2212\u00e1\u221e\u2297\u03c1\u00f4\u00e2\u2118\u2211\u03b9\u2190\u21d2\u230a\u2207\u2200\u2660\u00e4\u25ca\u03b8\u2194\u00bf\u00a5\u00a9\u03b8\u00fa\u03a6\u00ea\u003e\u00bd\u039f\u00b1\u0393\u2200\u00a1\u00fc\u21b5\u00da\u2245\u2208\u03a7\u00a8\u03b9\u00fb\u211c\u00d6\u00fb\u00b2\u2202\u00a6\u039e\u03d1\u0392\u00b4\u00d7\u2194\u00b8\u03a0\u00ad\u2022\u00bb\u00af\u00b9\u03a8\u00a3\u00a7\u2111\u00fa\u2118\u2203\u2212\u2663\u03b5\u00c7\u03a9\u039d\u0391\u2227\u00ce\u00bb\u03c9\u00c0\u22a5\u03a7\u0392\u2260\u03c0\u00e0\u03a9\u21d4\u00b0\u00a4\u00de\u00b8\u00a6\u00bc\u03b4\u03c7\u03a8\u039c\u2228\u039c\u2111\u2220\u03b4\u03c6\u2295\u00dd\u00a1\u00a5\u00fa\u00c5\u00b9\u00c9\u00ad\u00b7\u03b1\u03bf\u2245\u03b3\u230b\u00cb\u03a7\u2209\u039b\u00c5\u00f4\u0399\u039d\u039a\u03a3\u03b5\u00b6\u00a4\u00b3\u2022\u00c6\u00d1\u00ba\u00ae\u03c7\u00a2\u00c8\u03a3\u2234\u00ef\u03be\u03b2\u00b5\u03a6\u039a\u0026\u03a4\u0398\u2026\u00c8\u03a4\u21b5\u039e\u00b7\u00c7\u00ab\u00b5\u00a3\u00b2\u03b6\u00c3\u00d9\u03c8\u03c9\u03bd\u03b1\u03a5\u03a3\u232a\u2211\u221d\u220b\u221a\u22c5\u2264\u039f\u00ac\u00d8\u00a0\u00b9\u00c6\u00b4\u00a2\u03c9\u00c4\u03a5\u2245\u00b8\u03b7\u2248\u2234\u2212\u03c8\u03bc" + assertEquals( + RESP, + ALL.decodeHtmlEscape() + ) + } +} \ No newline at end of file