From 855a3e54d30f26c02d3ebded4b1c8280aa0db5fb Mon Sep 17 00:00:00 2001 From: KingToolbox Date: Fri, 17 Jul 2020 02:57:05 +0800 Subject: [PATCH] Add iterator to search gap buffer and wildcard matching, whole word matching. --- src/Onigmo/enc/ascii.c | 5 + src/Onigmo/enc/big5.c | 48 ++ src/Onigmo/enc/cp1251.c | 16 + src/Onigmo/enc/euc_jp.c | 70 ++ src/Onigmo/enc/euc_kr.c | 49 ++ src/Onigmo/enc/euc_tw.c | 44 ++ src/Onigmo/enc/gb18030.c | 362 +++++++++ src/Onigmo/enc/iso8859_1.c | 23 + src/Onigmo/enc/iso8859_10.c | 23 + src/Onigmo/enc/iso8859_11.c | 5 + src/Onigmo/enc/iso8859_13.c | 23 + src/Onigmo/enc/iso8859_14.c | 23 + src/Onigmo/enc/iso8859_15.c | 23 + src/Onigmo/enc/iso8859_16.c | 23 + src/Onigmo/enc/iso8859_2.c | 23 + src/Onigmo/enc/iso8859_3.c | 23 + src/Onigmo/enc/iso8859_4.c | 23 + src/Onigmo/enc/iso8859_5.c | 16 + src/Onigmo/enc/iso8859_6.c | 5 + src/Onigmo/enc/iso8859_7.c | 16 + src/Onigmo/enc/iso8859_8.c | 5 + src/Onigmo/enc/iso8859_9.c | 23 + src/Onigmo/enc/koi8.c | 16 + src/Onigmo/enc/koi8_r.c | 16 + src/Onigmo/enc/sjis.c | 82 +- src/Onigmo/enc/unicode.c | 84 +- src/Onigmo/enc/unicode/name2ctype.h | 106 +++ src/Onigmo/enc/utf16_be.c | 93 +++ src/Onigmo/enc/utf16_le.c | 92 +++ src/Onigmo/enc/utf32_be.c | 81 ++ src/Onigmo/enc/utf32_le.c | 84 +- src/Onigmo/enc/utf8.c | 104 +++ src/Onigmo/oniggnu.h | 4 +- src/Onigmo/onigposix.h | 2 +- src/Onigmo/oniguruma.h | 79 +- src/Onigmo/regcomp.c | 10 +- src/Onigmo/regenc.c | 172 +++++ src/Onigmo/regenc.h | 12 +- src/Onigmo/regexec.c | 1098 ++++++++++++++------------- src/Onigmo/reggnu.c | 14 +- src/Onigmo/regint.h | 23 +- src/Onigmo/regparse.c | 48 +- src/Onigmo/regparse.h | 4 + src/Onigmo/regposerr.c | 4 +- src/Onigmo/regposix.c | 28 +- src/Onigmo/regsyntax.c | 16 + src/Onigmo/sample/crnl.c | 21 +- src/Onigmo/sample/encode.c | 31 +- src/Onigmo/sample/listcap.c | 17 +- src/Onigmo/sample/names.c | 13 +- src/Onigmo/sample/posix.c | 7 +- src/Onigmo/sample/simple.c | 13 +- src/Onigmo/sample/sql.c | 13 +- 
src/Onigmo/sample/syntax.c | 13 +- src/Onigmo/testc.c | 7 +- src/Onigmo/testu.c | 7 +- src/Onigmo/win32/testc.c | 7 +- src/README.md | 7 + 58 files changed, 2656 insertions(+), 643 deletions(-) diff --git a/src/Onigmo/enc/ascii.c b/src/Onigmo/enc/ascii.c index 6009a5f..9b63d92 100644 --- a/src/Onigmo/enc/ascii.c +++ b/src/Onigmo/enc/ascii.c @@ -32,20 +32,25 @@ OnigEncodingType OnigEncodingASCII = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "US-ASCII", /* name */ 1, /* max byte length */ 1, /* min byte length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, onigenc_ascii_mbc_case_fold, + onigenc_ascii_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, onigenc_ascii_is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/big5.c b/src/Onigmo/enc/big5.c index 19fd45f..420bf99 100644 --- a/src/Onigmo/enc/big5.c +++ b/src/Onigmo/enc/big5.c @@ -54,12 +54,24 @@ big5_mbc_enc_len(const UChar* p) return EncLen_BIG5[*p]; } +static int +big5_mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_BIG5[ONIG_CHARAT(p)]; +} + static OnigCodePoint big5_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_BIG5, p, end); } +static OnigCodePoint +big5_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + return onigenc_mbn_mbc_to_code_se(it, ONIG_ENCODING_BIG5, p, end); +} + static int big5_code_to_mbc(OnigCodePoint code, UChar *buf) { @@ -74,6 +86,14 @@ big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } +static int 
+big5_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, OnigPosition end, + UChar* lower) +{ + return onigenc_mbn_mbc_case_fold_se(it, ONIG_ENCODING_BIG5, flag, + pp, end, lower); +} + #if 0 static int big5_is_mbc_ambiguous(OnigCaseFoldType flag, @@ -134,6 +154,29 @@ big5_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(p + ((s - p) & ~1)); } +static OnigPosition +big5_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + OnigPosition p; + int len; + + if (s <= start) return s; + p = s; + + if (BIG5_ISMB_TRAIL(ONIG_CHARAT(p))) { + while (p > start) { + if (! BIG5_ISMB_FIRST(ONIG_CHARAT(--p))) { + p++; + break; + } + } + } + len = enclen_se(it, ONIG_ENCODING_BIG5, p); + if (p + len > s) return p; + p += len; + return (p + ((s - p) & ~1)); +} + static int big5_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) { @@ -144,20 +187,25 @@ big5_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) OnigEncodingType OnigEncodingBIG5 = { big5_mbc_enc_len, + big5_mbc_enc_len_se, "Big5", /* name */ 2, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, big5_mbc_to_code, + big5_mbc_to_code_se, onigenc_mb2_code_to_mbclen, big5_code_to_mbc, big5_mbc_case_fold, + big5_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, big5_is_code_ctype, onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, + big5_left_adjust_char_head_se, big5_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/cp1251.c b/src/Onigmo/enc/cp1251.c index 79f7fb9..4cb0769 100644 --- a/src/Onigmo/enc/cp1251.c +++ b/src/Onigmo/enc/cp1251.c @@ -115,6 +115,17 @@ cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } +static int +cp1251_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* 
pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + *lower = ENC_CP1251_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + static int cp1251_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -182,20 +193,25 @@ cp1251_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingCP1251 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "CP1251", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, cp1251_mbc_case_fold, + cp1251_mbc_case_fold_se, cp1251_apply_all_case_fold, cp1251_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, cp1251_is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/euc_jp.c b/src/Onigmo/enc/euc_jp.c index 9e5664e..8a29ac4 100644 --- a/src/Onigmo/enc/euc_jp.c +++ b/src/Onigmo/enc/euc_jp.c @@ -148,6 +148,12 @@ mbc_enc_len(const UChar* p) return EncLen_EUCJP[*p]; } +static int +mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_EUCJP[ONIG_CHARAT(p)]; +} + static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end) { @@ -166,6 +172,24 @@ mbc_to_code(const UChar* p, const UChar* end) return n; } +static OnigCodePoint +mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + int c, i, len; + OnigCodePoint n; + + len = mbc_enc_len_se(it, p); + n = (OnigCodePoint )ONIG_CHARAT(p++); + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = ONIG_CHARAT(p++); + n <<= 8; n += c; + } + return n; +} + static int code_to_mbclen(OnigCodePoint code) { @@ -310,6 +334,28 @@ mbc_case_fold(OnigCaseFoldType 
flag ARG_UNUSED, } } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (ONIGENC_IS_MBC_ASCII_SE(c)) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c); + (*pp)++; + return 1; + } + else { + OnigCodePoint code; + int len; + + code = get_lower_case(mbc_to_code_se(it, *pp, end)); + len = code_to_mbc(code, lower); + (*pp) += len; + return len; /* return byte length of converted char to lower */ + } +} + static UChar* left_adjust_char_head(const UChar* start, const UChar* s) { @@ -329,6 +375,25 @@ left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(p + ((s - p) & ~1)); } +static OnigPosition +left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + /* In this encoding + mb-trail bytes doesn't mix with single bytes. + */ + OnigPosition p; + int len; + + if (s <= start) return s; + p = s; + + while (!eucjp_islead(ONIG_CHARAT(p)) && p > start) p--; + len = mbc_enc_len_se(it, p); + if (p + len > s) return p; + p += len; + return (p + ((s - p) & ~1)); +} + static int is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) { @@ -512,20 +577,25 @@ get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, OnigEncodingType OnigEncodingEUC_JP = { mbc_enc_len, + mbc_enc_len_se, "EUC-JP", /* name */ 3, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, mbc_to_code, + mbc_to_code_se, code_to_mbclen, code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, property_name_to_ctype, is_code_ctype, get_ctype_code_range, left_adjust_char_head, + left_adjust_char_head_se, is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/euc_kr.c b/src/Onigmo/enc/euc_kr.c index 19d1121..44b53e8 100644 --- a/src/Onigmo/enc/euc_kr.c +++ b/src/Onigmo/enc/euc_kr.c @@ -54,12 +54,24 @@ 
euckr_mbc_enc_len(const UChar* p) return EncLen_EUCKR[*p]; } +static int +euckr_mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_EUCKR[ONIG_CHARAT(p)]; +} + static OnigCodePoint euckr_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_KR, p, end); } +static OnigCodePoint +euckr_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + return onigenc_mbn_mbc_to_code_se(it, ONIG_ENCODING_EUC_KR, p, end); +} + static int euckr_code_to_mbc(OnigCodePoint code, UChar *buf) { @@ -74,6 +86,14 @@ euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } +static int +euckr_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, OnigPosition end, + UChar* lower) +{ + return onigenc_mbn_mbc_case_fold_se(it, ONIG_ENCODING_EUC_KR, flag, + pp, end, lower); +} + #if 0 static int euckr_is_mbc_ambiguous(OnigCaseFoldType flag, @@ -110,6 +130,25 @@ euckr_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(p + ((s - p) & ~1)); } +static OnigPosition +euckr_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + /* Assumed in this encoding, + mb-trail bytes don't mix with single bytes. 
+ */ + OnigPosition p; + int len; + + if (s <= start) return s; + p = s; + + while (!euckr_islead(ONIG_CHARAT(p)) && p > start) p--; + len = enclen_se(it, ONIG_ENCODING_EUC_KR, p); + if (p + len > s) return p; + p += len; + return (p + ((s - p) & ~1)); +} + static int euckr_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) { @@ -120,20 +159,25 @@ euckr_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) OnigEncodingType OnigEncodingEUC_KR = { euckr_mbc_enc_len, + euckr_mbc_enc_len_se, "EUC-KR", /* name */ 2, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, euckr_mbc_to_code, + euckr_mbc_to_code_se, onigenc_mb2_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, + euckr_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, euckr_is_code_ctype, onigenc_not_support_get_ctype_code_range, euckr_left_adjust_char_head, + euckr_left_adjust_char_head_se, euckr_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; @@ -141,20 +185,25 @@ OnigEncodingType OnigEncodingEUC_KR = { /* Same with OnigEncodingEUC_KR except the name */ OnigEncodingType OnigEncodingEUC_CN = { euckr_mbc_enc_len, + euckr_mbc_enc_len_se, "EUC-CN", /* name */ 2, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, euckr_mbc_to_code, + euckr_mbc_to_code_se, onigenc_mb2_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, + euckr_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, euckr_is_code_ctype, onigenc_not_support_get_ctype_code_range, euckr_left_adjust_char_head, + euckr_left_adjust_char_head_se, euckr_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/euc_tw.c b/src/Onigmo/enc/euc_tw.c index dd3e62e..4d7d5ce 100644 --- a/src/Onigmo/enc/euc_tw.c +++ 
b/src/Onigmo/enc/euc_tw.c @@ -54,12 +54,24 @@ euctw_mbc_enc_len(const UChar* p) return EncLen_EUCTW[*p]; } +static int +euctw_mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_EUCTW[ONIG_CHARAT(p)]; +} + static OnigCodePoint euctw_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_EUC_TW, p, end); } +static OnigCodePoint +euctw_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + return onigenc_mbn_mbc_to_code_se(it, ONIG_ENCODING_EUC_TW, p, end); +} + static int euctw_code_to_mbc(OnigCodePoint code, UChar *buf) { @@ -74,6 +86,14 @@ euctw_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } +static int +euctw_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, OnigPosition end, + UChar* lower) +{ + return onigenc_mbn_mbc_case_fold_se(it, ONIG_ENCODING_EUC_TW, flag, + pp, end, lower); +} + static int euctw_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -101,6 +121,25 @@ euctw_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(p + ((s - p) & ~1)); } +static OnigPosition +euctw_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + /* Assumed in this encoding, + mb-trail bytes don't mix with single bytes. 
+ */ + OnigPosition p; + int len; + + if (s <= start) return s; + p = s; + + while (!euctw_islead(ONIG_CHARAT(p)) && p > start) p--; + len = enclen_se(it, ONIG_ENCODING_EUC_TW, p); + if (p + len > s) return p; + p += len; + return (p + ((s - p) & ~1)); +} + static int euctw_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) { @@ -111,20 +150,25 @@ euctw_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) OnigEncodingType OnigEncodingEUC_TW = { euctw_mbc_enc_len, + euctw_mbc_enc_len_se, "EUC-TW", /* name */ 4, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, euctw_mbc_to_code, + euctw_mbc_to_code_se, onigenc_mb4_code_to_mbclen, euctw_code_to_mbc, euctw_mbc_case_fold, + euctw_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, euctw_is_code_ctype, onigenc_not_support_get_ctype_code_range, euctw_left_adjust_char_head, + euctw_left_adjust_char_head_se, euctw_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/gb18030.c b/src/Onigmo/enc/gb18030.c index 0b33e77..795fc80 100644 --- a/src/Onigmo/enc/gb18030.c +++ b/src/Onigmo/enc/gb18030.c @@ -75,12 +75,34 @@ gb18030_mbc_enc_len(const UChar* p) return 2; } +static int +gb18030_mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + UChar c0, c1; + + c0 = ONIG_CHARAT(p); + if (GB18030_MAP[c0] != CM) + return 1; + c1 = ONIG_CHARAT(p+1); + if (GB18030_MAP[c1] == C4) + return 4; + if (GB18030_MAP[c1] == C1) + return 1; /* illegal sequence */ + return 2; +} + static OnigCodePoint gb18030_mbc_to_code(const UChar* p, const UChar* end) { return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end); } +static OnigCodePoint +gb18030_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + return onigenc_mbn_mbc_to_code_se(it, ONIG_ENCODING_GB18030, p, end); +} + static int gb18030_code_to_mbc(OnigCodePoint code, UChar 
*buf) { @@ -95,6 +117,14 @@ gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } +static int +gb18030_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, OnigPosition end, + UChar* lower) +{ + return onigenc_mbn_mbc_case_fold_se(it, ONIG_ENCODING_GB18030, flag, + pp, end, lower); +} + #if 0 static int gb18030_is_mbc_ambiguous(OnigCaseFoldType flag, @@ -469,6 +499,333 @@ gb18030_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )s; /* never come here. (escape warning) */ } +static OnigPosition +gb18030_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + OnigPosition p; + UChar c; + enum state state = S_START; + + DEBUG_GB18030(("----------------\n")); + for (p = s; p >= start; p--) { + c = ONIG_CHARAT(p); + DEBUG_GB18030(("state %d --(%02x)-->\n", state, c)); + switch (state) { + case S_START: + switch (GB18030_MAP[c]) { + case C1: + return s; + case C2: + state = S_one_C2; /* C2 */ + break; + case C4: + state = S_one_C4; /* C4 */ + break; + case CM: + state = S_one_CM; /* CM */ + break; + } + break; + case S_one_C2: /* C2 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return s; + case CM: + state = S_odd_CM_one_CX; /* CM C2 */ + break; + } + break; + case S_one_C4: /* C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return s; + case CM: + state = S_one_CMC4; + break; + } + break; + case S_one_CM: /* CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + return s; + case C4: + state = S_odd_C4CM; + break; + case CM: + state = S_odd_CM_one_CX; /* CM CM */ + break; + } + break; + + case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 1); + case CM: + state = S_even_CM_one_CX; + break; + } + break; + case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + 
case C4: + return s; + case CM: + state = S_odd_CM_one_CX; + break; + } + break; + + case S_one_CMC4: /* CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + return (s - 1); + case C4: + state = S_one_C4_odd_CMC4; /* C4 CM C4 */ + break; + case CM: + state = S_even_CM_one_CX; /* CM CM C4 */ + break; + } + break; + case S_odd_CMC4: /* CM C4 CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + return (s - 1); + case C4: + state = S_one_C4_odd_CMC4; + break; + case CM: + state = S_odd_CM_odd_CMC4; + break; + } + break; + case S_one_C4_odd_CMC4: /* C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 1); + case CM: + state = S_even_CMC4; /* CM C4 CM C4 */ + break; + } + break; + case S_even_CMC4: /* CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + return (s - 3); + case C4: + state = S_one_C4_even_CMC4; + break; + case CM: + state = S_odd_CM_even_CMC4; + break; + } + break; + case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 3); + case CM: + state = S_odd_CMC4; + break; + } + break; + + case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 3); + case CM: + state = S_even_CM_odd_CMC4; + break; + } + break; + case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 1); + case CM: + state = S_odd_CM_odd_CMC4; + break; + } + break; + + case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 1); + case CM: + state = S_even_CM_even_CMC4; + break; + } + break; + case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 3); + case CM: + state = S_odd_CM_even_CMC4; + break; + } + break; + + case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/ + switch 
(GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return s; + case CM: + state = S_one_CM_odd_C4CM; /* CM C4 CM */ + break; + } + break; + case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + return (s - 2); /* |CM C4 CM */ + case C4: + state = S_even_C4CM; + break; + case CM: + state = S_even_CM_odd_C4CM; + break; + } + break; + case S_even_C4CM: /* C4 CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 2); /* C4|CM C4 CM */ + case CM: + state = S_one_CM_even_C4CM; + break; + } + break; + case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + return (s - 0); /*|CM C4 CM C4|CM */ + case C4: + state = S_odd_C4CM; + break; + case CM: + state = S_even_CM_even_C4CM; + break; + } + break; + + case S_even_CM_odd_C4CM: /* CM CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 0); /* |CM CM|C4|CM */ + case CM: + state = S_odd_CM_odd_C4CM; + break; + } + break; + case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 2); /* |CM CM|CM C4 CM */ + case CM: + state = S_even_CM_odd_C4CM; + break; + } + break; + + case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 2); /* |CM CM|C4|CM C4 CM */ + case CM: + state = S_odd_CM_even_C4CM; + break; + } + break; + case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */ + switch (GB18030_MAP[c]) { + case C1: + case C2: + case C4: + return (s - 0); /* |CM CM|CM C4 CM C4|CM */ + case CM: + state = S_even_CM_even_C4CM; + break; + } + break; + } + } + + DEBUG_GB18030(("state %d\n", state)); + switch (state) { + case S_START: return (s - 0); + case S_one_C2: return (s - 0); + case S_one_C4: return (s - 0); + case S_one_CM: return (s - 0); + + case S_odd_CM_one_CX: return (s - 1); + case S_even_CM_one_CX: return (s - 0); + + case 
S_one_CMC4: return (s - 1); + case S_odd_CMC4: return (s - 1); + case S_one_C4_odd_CMC4: return (s - 1); + case S_even_CMC4: return (s - 3); + case S_one_C4_even_CMC4: return (s - 3); + + case S_odd_CM_odd_CMC4: return (s - 3); + case S_even_CM_odd_CMC4: return (s - 1); + + case S_odd_CM_even_CMC4: return (s - 1); + case S_even_CM_even_CMC4: return (s - 3); + + case S_odd_C4CM: return (s - 0); + case S_one_CM_odd_C4CM: return (s - 2); + case S_even_C4CM: return (s - 2); + case S_one_CM_even_C4CM: return (s - 0); + + case S_even_CM_odd_C4CM: return (s - 0); + case S_odd_CM_odd_C4CM: return (s - 2); + case S_even_CM_even_C4CM: return (s - 2); + case S_odd_CM_even_C4CM: return (s - 0); + } + + return s; /* never come here. (escape warning) */ +} + static int gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) { @@ -477,20 +834,25 @@ gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) OnigEncodingType OnigEncodingGB18030 = { gb18030_mbc_enc_len, + gb18030_mbc_enc_len_se, "GB18030", /* name */ 4, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, gb18030_mbc_to_code, + gb18030_mbc_to_code_se, onigenc_mb4_code_to_mbclen, gb18030_code_to_mbc, gb18030_mbc_case_fold, + gb18030_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, gb18030_is_code_ctype, onigenc_not_support_get_ctype_code_range, gb18030_left_adjust_char_head, + gb18030_left_adjust_char_head_se, gb18030_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_1.c b/src/Onigmo/enc/iso8859_1.c index 5b0a6fd..057c6ab 100644 --- a/src/Onigmo/enc/iso8859_1.c +++ b/src/Onigmo/enc/iso8859_1.c @@ -216,6 +216,24 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, + OnigPosition end ARG_UNUSED, UChar* 
lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, @@ -254,20 +272,25 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype) OnigEncodingType OnigEncodingISO_8859_1 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-1", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_10.c b/src/Onigmo/enc/iso8859_10.c index d90bfbe..90ebac2 100644 --- a/src/Onigmo/enc/iso8859_10.c +++ b/src/Onigmo/enc/iso8859_10.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_10_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -221,20 +239,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_10 = { onigenc_single_byte_mbc_enc_len, + 
onigenc_single_byte_mbc_enc_len_se, "ISO-8859-10", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_11.c b/src/Onigmo/enc/iso8859_11.c index f543ab2..3f2cc81 100644 --- a/src/Onigmo/enc/iso8859_11.c +++ b/src/Onigmo/enc/iso8859_11.c @@ -78,20 +78,25 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype) OnigEncodingType OnigEncodingISO_8859_11 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-11", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, onigenc_ascii_mbc_case_fold, + onigenc_ascii_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_13.c b/src/Onigmo/enc/iso8859_13.c index 7dd7d55..52d40f8 100644 --- a/src/Onigmo/enc/iso8859_13.c +++ b/src/Onigmo/enc/iso8859_13.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + 
OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_13_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -210,20 +228,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_13 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-13", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_14.c b/src/Onigmo/enc/iso8859_14.c index d407624..6415599 100644 --- a/src/Onigmo/enc/iso8859_14.c +++ b/src/Onigmo/enc/iso8859_14.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_14_TO_LOWER_CASE(c); + (*pp)++; + return 1; /* return byte length of converted char to lower */ +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, @@ 
-223,20 +241,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_14 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-14", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_15.c b/src/Onigmo/enc/iso8859_15.c index cdd8cce..c637a50 100644 --- a/src/Onigmo/enc/iso8859_15.c +++ b/src/Onigmo/enc/iso8859_15.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_15_TO_LOWER_CASE(c); + (*pp)++; + return 1; /* return byte length of converted char to lower */ +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, @@ -217,20 +235,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_15 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-15", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, 
onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_16.c b/src/Onigmo/enc/iso8859_16.c index b2ef5ae..454c2cc 100644 --- a/src/Onigmo/enc/iso8859_16.c +++ b/src/Onigmo/enc/iso8859_16.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_16_TO_LOWER_CASE(c); + (*pp)++; + return 1; /* return byte length of converted char to lower */ +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -219,20 +237,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_16 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-16", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, 
onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_2.c b/src/Onigmo/enc/iso8859_2.c index 1a8ab68..ea1792b 100644 --- a/src/Onigmo/enc/iso8859_2.c +++ b/src/Onigmo/enc/iso8859_2.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_2_TO_LOWER_CASE(c); + (*pp)++; + return 1; /* return byte length of converted char to lower */ +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -217,20 +235,25 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype) OnigEncodingType OnigEncodingISO_8859_2 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-2", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_3.c b/src/Onigmo/enc/iso8859_3.c index 2188d4e..686112e 100644 --- a/src/Onigmo/enc/iso8859_3.c +++ b/src/Onigmo/enc/iso8859_3.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, 
OnigCaseFoldType flag, OnigPosition* pp, + OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_3_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -217,20 +235,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_3 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-3", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_4.c b/src/Onigmo/enc/iso8859_4.c index baa9442..a8718c3 100644 --- a/src/Onigmo/enc/iso8859_4.c +++ b/src/Onigmo/enc/iso8859_4.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_4_TO_LOWER_CASE(c); + (*pp)++; + return 1; /* return byte length of converted char to lower */ +} + #if 0 static int 
is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -219,20 +237,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_4 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-4", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_5.c b/src/Onigmo/enc/iso8859_5.c index 90f0f6d..d1276be 100644 --- a/src/Onigmo/enc/iso8859_5.c +++ b/src/Onigmo/enc/iso8859_5.c @@ -114,6 +114,17 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + *lower = ENC_ISO_8859_5_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -208,20 +219,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_5 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-5", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, 
get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_6.c b/src/Onigmo/enc/iso8859_6.c index a59a63c..f87c744 100644 --- a/src/Onigmo/enc/iso8859_6.c +++ b/src/Onigmo/enc/iso8859_6.c @@ -78,20 +78,25 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype) OnigEncodingType OnigEncodingISO_8859_6 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-6", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, onigenc_ascii_mbc_case_fold, + onigenc_ascii_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_7.c b/src/Onigmo/enc/iso8859_7.c index 1000238..bd30d87 100644 --- a/src/Onigmo/enc/iso8859_7.c +++ b/src/Onigmo/enc/iso8859_7.c @@ -114,6 +114,17 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + *lower = ENC_ISO_8859_7_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, @@ -204,20 +215,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_7 = { 
onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-7", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_8.c b/src/Onigmo/enc/iso8859_8.c index 81b3988..9649503 100644 --- a/src/Onigmo/enc/iso8859_8.c +++ b/src/Onigmo/enc/iso8859_8.c @@ -78,20 +78,25 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype) OnigEncodingType OnigEncodingISO_8859_8 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-8", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, onigenc_ascii_mbc_case_fold, + onigenc_ascii_mbc_case_fold_se, onigenc_ascii_apply_all_case_fold, onigenc_ascii_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/iso8859_9.c b/src/Onigmo/enc/iso8859_9.c index 827046a..8db464c 100644 --- a/src/Onigmo/enc/iso8859_9.c +++ b/src/Onigmo/enc/iso8859_9.c @@ -121,6 +121,24 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } +static int +mbc_case_fold_se(OnigIterator* it, 
OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (c == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { + *lower++ = 's'; + *lower = 's'; + (*pp)++; + return 2; + } + + *lower = ENC_ISO_8859_9_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -210,20 +228,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingISO_8859_9 = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "ISO-8859-9", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/koi8.c b/src/Onigmo/enc/koi8.c index 5967ec1..ad76c29 100644 --- a/src/Onigmo/enc/koi8.c +++ b/src/Onigmo/enc/koi8.c @@ -115,6 +115,17 @@ koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } +static int +koi8_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + *lower = ENC_KOI8_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int koi8_is_mbc_ambiguous(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end) @@ -232,20 +243,25 @@ koi8_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingKOI8 = { onigenc_single_byte_mbc_enc_len, + 
onigenc_single_byte_mbc_enc_len_se, "KOI8", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, koi8_mbc_case_fold, + koi8_mbc_case_fold_se, koi8_apply_all_case_fold, koi8_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, koi8_is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/koi8_r.c b/src/Onigmo/enc/koi8_r.c index f8c9f27..38f7f90 100644 --- a/src/Onigmo/enc/koi8_r.c +++ b/src/Onigmo/enc/koi8_r.c @@ -114,6 +114,17 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } +static int +koi8_r_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end ARG_UNUSED, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + *lower = ENC_KOI8_R_TO_LOWER_CASE(c); + (*pp)++; + return 1; +} + #if 0 static int koi8_r_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -194,20 +205,25 @@ koi8_r_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingKOI8_R = { onigenc_single_byte_mbc_enc_len, + onigenc_single_byte_mbc_enc_len_se, "KOI8-R", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, onigenc_single_byte_mbc_to_code, + onigenc_single_byte_mbc_to_code_se, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, koi8_r_mbc_case_fold, + koi8_r_mbc_case_fold_se, koi8_r_apply_all_case_fold, koi8_r_get_case_fold_codes_by_str, onigenc_minimum_property_name_to_ctype, koi8_r_is_code_ctype, onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, + 
onigenc_single_byte_left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/sjis.c b/src/Onigmo/enc/sjis.c index 4eb0aaf..d56150d 100644 --- a/src/Onigmo/enc/sjis.c +++ b/src/Onigmo/enc/sjis.c @@ -174,6 +174,12 @@ mbc_enc_len(const UChar* p) return EncLen_SJIS[*p]; } +static int +mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_SJIS[ONIG_CHARAT(p)]; +} + static int code_to_mbclen(OnigCodePoint code) { @@ -212,6 +218,25 @@ mbc_to_code(const UChar* p, const UChar* end) return n; } +static OnigCodePoint +mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + int c, i, len; + OnigCodePoint n; + + len = mbc_enc_len_se(it, p); + c = ONIG_CHARAT(p++); + n = c; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = ONIG_CHARAT(p++); + n <<= 8; n += c; + } + return n; +} + static int code_to_mbc(OnigCodePoint code, UChar *buf) { @@ -309,7 +334,7 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, static int mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, - const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) + const UChar** pp, const UChar* end, UChar* lower) { const UChar* p = *pp; @@ -329,6 +354,28 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, } } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end, UChar* lower) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (ONIGENC_IS_MBC_ASCII_SE(c)) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c); + (*pp)++; + return 1; + } + else { + OnigCodePoint code; + int len; + + code = get_lower_case(mbc_to_code_se(it, *pp, end)); + len = code_to_mbc(code, lower); + (*pp) += len; + return len; /* return byte length of converted char to lower */ + } +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, @@ -377,6 +424,29 @@ left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(p + ((s - p) & ~1)); } 
+static OnigPosition +left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + OnigPosition p; + int len; + + if (s <= start) return s; + p = s; + + if (SJIS_ISMB_TRAIL(ONIG_CHARAT(p))) { + while (p > start) { + if (! SJIS_ISMB_FIRST(ONIG_CHARAT(--p))) { + p++; + break; + } + } + } + len = mbc_enc_len_se(it, p); + if (p + len > s) return p; + p += len; + return (p + ((s - p) & ~1)); +} + static int is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED) { @@ -531,40 +601,50 @@ get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, #ifdef ENC_CP932 OnigEncodingType OnigEncodingCP932 = { mbc_enc_len, + mbc_enc_len_se, "CP932", /* name */ 2, /* max byte length */ 1, /* min byte length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, mbc_to_code, + mbc_to_code_se, code_to_mbclen, code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, property_name_to_ctype, is_code_ctype, get_ctype_code_range, left_adjust_char_head, + left_adjust_char_head_se, is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; #else OnigEncodingType OnigEncodingSJIS = { mbc_enc_len, + mbc_enc_len_se, "Shift_JIS", /* name */ 2, /* max byte length */ 1, /* min byte length */ onigenc_is_mbc_newline_0x0a, + onigenc_is_mbc_newline_0x0a_se, mbc_to_code, + mbc_to_code_se, code_to_mbclen, code_to_mbc, mbc_case_fold, + mbc_case_fold_se, apply_all_case_fold, get_case_fold_codes_by_str, property_name_to_ctype, is_code_ctype, get_ctype_code_range, left_adjust_char_head, + left_adjust_char_head_se, is_allowed_reverse_match, ONIGENC_FLAG_NONE, }; diff --git a/src/Onigmo/enc/unicode.c b/src/Onigmo/enc/unicode.c index 2b53eac..babf937 100644 --- a/src/Onigmo/enc/unicode.c +++ b/src/Onigmo/enc/unicode.c @@ -108,6 +108,7 @@ typedef struct { #define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) #define CODE_RANGES_NUM numberof(CodeRanges) +#define CODE_SCRIPTS_NUM numberof(CodeScripts) extern int 
onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype) @@ -148,6 +149,17 @@ onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, return onigenc_unicode_ctype_code_range(ctype, ranges); } +extern const OnigCodePoint* +onigenc_unicode_code_script(OnigCodePoint code) +{ + for (int ctype = 0; ctype < CODE_SCRIPTS_NUM; ctype++) { + if (onig_is_in_code_range((UChar*) CodeScripts[ctype], code)) { + return CodeScripts[ctype]; + } + } + return 0; +} + #include "st.h" #define PROPERTY_NAME_MAX_SIZE (MAX_WORD_LENGTH + 1) @@ -220,12 +232,19 @@ static struct st_hash_type type_code3_hash = { code3_hash, }; - -static st_table* FoldTable; /* fold-1, fold-2, fold-3 */ -static st_table* Unfold1Table; -static st_table* Unfold2Table; -static st_table* Unfold3Table; -static int CaseFoldInited = 0; +#ifdef USE_SHARED_UNICODE_TABLE + st_table* FoldTable; /* fold-1, fold-2, fold-3 */ + st_table* Unfold1Table; + st_table* Unfold2Table; + st_table* Unfold3Table; + int CaseFoldInited = 0; +#else + static st_table* FoldTable; /* fold-1, fold-2, fold-3 */ + static st_table* Unfold1Table; + static st_table* Unfold2Table; + static st_table* Unfold3Table; + static int CaseFoldInited = 0; +#endif //USE_SHARED_UNICODE_TABLE static int init_case_fold_table(void) { @@ -338,6 +357,59 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, return len; } +extern int +onigenc_unicode_mbc_case_fold_se(OnigIterator* it, OnigEncoding enc, + OnigCaseFoldType flag ARG_UNUSED, OnigPosition* pp, OnigPosition end, + UChar* fold) +{ + CodePointList3 *to; + OnigCodePoint code; + int i, len, rlen; + OnigPosition p = *pp; + + if (CaseFoldInited == 0) init_case_fold_table(); + + code = ONIGENC_MBC_TO_CODE_SE(it, enc, p, end); + len = enclen_se(it, enc, p); + *pp += len; + +#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI + if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { + if (code == 0x0049) { + return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold); + } + else if (code == 0x0130) { + return 
ONIGENC_CODE_TO_MBC(enc, 0x0069, fold); + } + } +#endif + + if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) { + if (to->n == 1) { + return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold); + } +#if 0 + /* NO NEEDS TO CHECK */ + else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { +#else + else { +#endif + rlen = 0; + for (i = 0; i < to->n; i++) { + len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold); + fold += len; + rlen += len; + } + return rlen; + } + } + + for (i = 0; i < len; i++) { + *fold++ = ONIG_CHARAT(p++); + } + return len; +} + extern int onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg) diff --git a/src/Onigmo/enc/unicode/name2ctype.h b/src/Onigmo/enc/unicode/name2ctype.h index 2e80edf..2b516e2 100644 --- a/src/Onigmo/enc/unicode/name2ctype.h +++ b/src/Onigmo/enc/unicode/name2ctype.h @@ -25940,6 +25940,112 @@ static const OnigCodePoint* const CodeRanges[] = { CR_In_No_Block, #endif /* USE_UNICODE_PROPERTIES */ }; + +static const OnigCodePoint* const CodeScripts[] = { + CR_Common, + CR_Latin, + CR_Greek, + CR_Cyrillic, + CR_Armenian, + CR_Hebrew, + CR_Arabic, + CR_Syriac, + CR_Thaana, + CR_Devanagari, + CR_Bengali, + CR_Gurmukhi, + CR_Gujarati, + CR_Oriya, + CR_Tamil, + CR_Telugu, + CR_Kannada, + CR_Malayalam, + CR_Sinhala, + CR_Thai, + CR_Lao, + CR_Tibetan, + CR_Myanmar, + CR_Georgian, + CR_Hangul, + CR_Ethiopic, + CR_Cherokee, + CR_Canadian_Aboriginal, + CR_Ogham, + CR_Runic, + CR_Khmer, + CR_Mongolian, + CR_Hiragana, + CR_Katakana, + CR_Bopomofo, + CR_Han, + CR_Yi, + CR_Old_Italic, + CR_Gothic, + CR_Deseret, + CR_Inherited, + CR_Tagalog, + CR_Hanunoo, + CR_Buhid, + CR_Tagbanwa, + CR_Limbu, + CR_Tai_Le, + CR_Linear_B, + CR_Ugaritic, + CR_Shavian, + CR_Osmanya, + CR_Cypriot, + CR_Braille, + CR_Buginese, + CR_Coptic, + CR_New_Tai_Lue, + CR_Glagolitic, + CR_Tifinagh, + CR_Syloti_Nagri, + CR_Old_Persian, + CR_Kharoshthi, + CR_Balinese, + CR_Cuneiform, + CR_Phoenician, + CR_Phags_Pa, + 
CR_Nko, + CR_Sundanese, + CR_Lepcha, + CR_Ol_Chiki, + CR_Vai, + CR_Saurashtra, + CR_Kayah_Li, + CR_Rejang, + CR_Lycian, + CR_Carian, + CR_Lydian, + CR_Cham, + CR_Tai_Tham, + CR_Tai_Viet, + CR_Avestan, + CR_Egyptian_Hieroglyphs, + CR_Samaritan, + CR_Lisu, + CR_Bamum, + CR_Javanese, + CR_Meetei_Mayek, + CR_Imperial_Aramaic, + CR_Old_South_Arabian, + CR_Inscriptional_Parthian, + CR_Inscriptional_Pahlavi, + CR_Old_Turkic, + CR_Kaithi, + CR_Batak, + CR_Brahmi, + CR_Mandaic, + CR_Chakma, + CR_Meroitic_Cursive, + CR_Meroitic_Hieroglyphs, + CR_Miao, + CR_Sharada, + CR_Sora_Sompeng, + CR_Takri +}; + struct uniname2ctype_struct { int name, ctype; }; diff --git a/src/Onigmo/enc/utf16_be.c b/src/Onigmo/enc/utf16_be.c index 3ccc73e..80a83a7 100644 --- a/src/Onigmo/enc/utf16_be.c +++ b/src/Onigmo/enc/utf16_be.c @@ -54,6 +54,12 @@ utf16be_mbc_enc_len(const UChar* p) return EncLen_UTF16[*p]; } +static int +utf16be_mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_UTF16[ONIG_CHARAT(p)]; +} + static int utf16be_is_mbc_newline(const UChar* p, const UChar* end) { @@ -71,6 +77,26 @@ utf16be_is_mbc_newline(const UChar* p, const UChar* end) return 0; } +static int +utf16be_is_mbc_newline_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + if (p + 1 < end) { + const UChar c0 = ONIG_CHARAT(p); + const UChar c1 = ONIG_CHARAT(p+1); + + if (c1 == 0x0a && c0 == 0x00) + return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((c1 == 0x0b || c1 == 0x0c || c1 == 0x0d || c1 == 0x85) + && c0 == 0x00) + return 1; + if (c0 == 0x20 && (c1 == 0x29 || c1 == 0x28)) + return 1; +#endif + } + return 0; +} + static OnigCodePoint utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { @@ -87,6 +113,24 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) return code; } +static OnigCodePoint +utf16be_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end ARG_UNUSED) +{ + OnigCodePoint code; + const UChar c0 = ONIG_CHARAT(p); + const UChar c1 = 
ONIG_CHARAT(p+1); + + if (UTF16_IS_SURROGATE_FIRST(c0)) { + code = ((((c0 - 0xd8) << 2) + ((c1 & 0xc0) >> 6) + 1) << 16) + + ((((c1 & 0x3f) << 2) + (ONIG_CHARAT(p+2) - 0xdc)) << 8) + + ONIG_CHARAT(p+3); + } + else { + code = c0 * 256 + c1; + } + return code; +} + static int utf16be_code_to_mbclen(OnigCodePoint code) { @@ -145,6 +189,35 @@ utf16be_mbc_case_fold(OnigCaseFoldType flag, pp, end, fold); } +static int +utf16be_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end, UChar* fold) +{ + const UChar c0 = ONIG_CHARAT(*pp); + const UChar c1 = ONIG_CHARAT(*pp+1); + + if (ONIGENC_IS_ASCII_CODE(c1) && c0 == 0) { +#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI + if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { + if (c1 == 0x49) { + *fold++ = 0x01; + *fold = 0x31; + (*pp) += 2; + return 2; + } + } +#endif + + *fold++ = 0; + *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c1); + *pp += 2; + return 2; + } + else + return onigenc_unicode_mbc_case_fold_se(it, ONIG_ENCODING_UTF16_BE, flag, + pp, end, fold); +} + #if 0 static int utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -194,6 +267,21 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )s; } +static OnigPosition +utf16be_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + if (s <= start) return s; + + if ((s - start) % 2 == 1) { + s--; + } + + if (UTF16_IS_SURROGATE_SECOND(ONIG_CHARAT(s)) && s > start + 1) + s -= 2; + + return s; +} + static int utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) @@ -204,20 +292,25 @@ utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingUTF16_BE = { utf16be_mbc_enc_len, + utf16be_mbc_enc_len_se, "UTF-16BE", /* name */ 4, /* max byte length */ 2, /* min byte length */ utf16be_is_mbc_newline, + utf16be_is_mbc_newline_se, 
utf16be_mbc_to_code, + utf16be_mbc_to_code_se, utf16be_code_to_mbclen, utf16be_code_to_mbc, utf16be_mbc_case_fold, + utf16be_mbc_case_fold_se, onigenc_unicode_apply_all_case_fold, utf16be_get_case_fold_codes_by_str, onigenc_unicode_property_name_to_ctype, onigenc_unicode_is_code_ctype, onigenc_utf16_32_get_ctype_code_range, utf16be_left_adjust_char_head, + utf16be_left_adjust_char_head_se, onigenc_always_false_is_allowed_reverse_match, ONIGENC_FLAG_UNICODE, }; diff --git a/src/Onigmo/enc/utf16_le.c b/src/Onigmo/enc/utf16_le.c index 1be3f9e..3dfb7bf 100644 --- a/src/Onigmo/enc/utf16_le.c +++ b/src/Onigmo/enc/utf16_le.c @@ -60,6 +60,12 @@ utf16le_mbc_enc_len(const UChar* p) return EncLen_UTF16[*(p+1)]; } +static int +utf16le_mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_UTF16[ONIG_CHARAT(p+1)]; +} + static int utf16le_is_mbc_newline(const UChar* p, const UChar* end) { @@ -77,6 +83,26 @@ utf16le_is_mbc_newline(const UChar* p, const UChar* end) return 0; } +static int +utf16le_is_mbc_newline_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + if (p + 1 < end) { + const UChar c0 = ONIG_CHARAT(p); + const UChar c1 = ONIG_CHARAT(p+1); + + if (c0 == 0x0a && c1 == 0x00) + return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((c0 == 0x0b || c0 == 0x0c || c0 == 0x0d || c0 == 0x85) + && c1 == 0x00) + return 1; + if (c1 == 0x20 && (c0 == 0x29 || c0 == 0x28)) + return 1; +#endif + } + return 0; +} + static OnigCodePoint utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { @@ -95,6 +121,24 @@ utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) return code; } +static OnigCodePoint +utf16le_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end ARG_UNUSED) +{ + OnigCodePoint code; + const UChar c0 = ONIG_CHARAT(p); + const UChar c1 = ONIG_CHARAT(p+1); + + if (UTF16_IS_SURROGATE_FIRST(c1)) { + code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16) + + ((((c0 & 0x3f) << 2) + (ONIG_CHARAT(p+3) - 0xdc)) << 8) 
+ + ONIG_CHARAT(p+2); + } + else { + code = c1 * 256 + c0; + } + return code; +} + static int utf16le_code_to_mbc(OnigCodePoint code, UChar *buf) { @@ -147,6 +191,34 @@ utf16le_mbc_case_fold(OnigCaseFoldType flag, fold); } +static int +utf16le_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end, UChar* fold) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (ONIGENC_IS_ASCII_CODE(c) && ONIG_CHARAT(*pp+1) == 0) { +#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI + if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { + if (c == 0x49) { + *fold++ = 0x31; + *fold = 0x01; + (*pp) += 2; + return 2; + } + } +#endif + + *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c); + *fold = 0; + *pp += 2; + return 2; + } + else + return onigenc_unicode_mbc_case_fold_se(it, ONIG_ENCODING_UTF16_LE, flag, pp, end, + fold); +} + #if 0 static int utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, @@ -195,6 +267,21 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )s; } +static OnigPosition +utf16le_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + if (s <= start) return s; + + if ((s - start) % 2 == 1) { + s--; + } + + if (UTF16_IS_SURROGATE_SECOND(ONIG_CHARAT(s+1)) && s > start + 1) + s -= 2; + + return s; +} + static int utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) @@ -205,20 +292,25 @@ utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingUTF16_LE = { utf16le_mbc_enc_len, + utf16le_mbc_enc_len_se, "UTF-16LE", /* name */ 4, /* max byte length */ 2, /* min byte length */ utf16le_is_mbc_newline, + utf16le_is_mbc_newline_se, utf16le_mbc_to_code, + utf16le_mbc_to_code_se, utf16le_code_to_mbclen, utf16le_code_to_mbc, utf16le_mbc_case_fold, + utf16le_mbc_case_fold_se, onigenc_unicode_apply_all_case_fold, utf16le_get_case_fold_codes_by_str, 
onigenc_unicode_property_name_to_ctype, onigenc_unicode_is_code_ctype, onigenc_utf16_32_get_ctype_code_range, utf16le_left_adjust_char_head, + utf16le_left_adjust_char_head_se, onigenc_always_false_is_allowed_reverse_match, ONIGENC_FLAG_UNICODE, }; diff --git a/src/Onigmo/enc/utf32_be.c b/src/Onigmo/enc/utf32_be.c index 13eacbd..e33a0ac 100644 --- a/src/Onigmo/enc/utf32_be.c +++ b/src/Onigmo/enc/utf32_be.c @@ -35,6 +35,12 @@ utf32be_mbc_enc_len(const UChar* p ARG_UNUSED) return 4; } +static int +utf32be_mbc_enc_len_se(OnigIterator* it ARG_UNUSED, OnigPosition p ARG_UNUSED) +{ + return 4; +} + static int utf32be_is_mbc_newline(const UChar* p, const UChar* end) { @@ -53,12 +59,39 @@ utf32be_is_mbc_newline(const UChar* p, const UChar* end) return 0; } +static int +utf32be_is_mbc_newline_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + if (p + 3 < end) { + const UChar c2 = ONIG_CHARAT(p+2); + const UChar c3 = ONIG_CHARAT(p+3); + + if (c3 == 0x0a && c2 == 0 && ONIG_CHARAT(p+1) == 0 && ONIG_CHARAT(p) == 0) + return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((c3 == 0x0b || c3 == 0x0c || c3 == 0x0d || c3 == 0x85) + && c2 == 0 && ONIG_CHARAT(p+1) == 0 && ONIG_CHARAT(p) == 0x00) + return 1; + if (c2 == 0x20 && (c3 == 0x29 || c3 == 0x28) + && ONIG_CHARAT(p+1) == 0 && ONIG_CHARAT(p) == 0) + return 1; +#endif + } + return 0; +} + static OnigCodePoint utf32be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { return (OnigCodePoint )(((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]); } +static OnigCodePoint +utf32be_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end ARG_UNUSED) +{ + return (OnigCodePoint )(((ONIG_CHARAT(p) * 256 + ONIG_CHARAT(p+1)) * 256 + ONIG_CHARAT(p+2)) * 256 + ONIG_CHARAT(p+3)); +} + static int utf32be_code_to_mbclen(OnigCodePoint code ARG_UNUSED) { @@ -108,6 +141,38 @@ utf32be_mbc_case_fold(OnigCaseFoldType flag, fold); } +static int +utf32be_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* 
pp, OnigPosition end, UChar* fold) +{ + OnigPosition p = *pp; + const UChar c3 = ONIG_CHARAT(p+3); + + if (ONIGENC_IS_ASCII_CODE(c3) && ONIG_CHARAT(p+2) == 0 && ONIG_CHARAT(p+1) == 0 && ONIG_CHARAT(p) == 0) { + *fold++ = 0; + *fold++ = 0; + +#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI + if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { + if (c3 == 0x49) { + *fold++ = 0x01; + *fold = 0x31; + (*pp) += 4; + return 4; + } + } +#endif + + *fold++ = 0; + *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c3); + *pp += 4; + return 4; + } + else + return onigenc_unicode_mbc_case_fold_se(it, ONIG_ENCODING_UTF32_BE, flag, pp, end, + fold); +} + #if 0 static int utf32be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -152,6 +217,17 @@ utf32be_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(s - rem); } +static OnigPosition +utf32be_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + int rem; + + if (s <= start) return s; + + rem = (int )((s - start) % 4); + return (s - rem); +} + static int utf32be_get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) @@ -162,20 +238,25 @@ utf32be_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingUTF32_BE = { utf32be_mbc_enc_len, + utf32be_mbc_enc_len_se, "UTF-32BE", /* name */ 4, /* max byte length */ 4, /* min byte length */ utf32be_is_mbc_newline, + utf32be_is_mbc_newline_se, utf32be_mbc_to_code, + utf32be_mbc_to_code_se, utf32be_code_to_mbclen, utf32be_code_to_mbc, utf32be_mbc_case_fold, + utf32be_mbc_case_fold_se, onigenc_unicode_apply_all_case_fold, utf32be_get_case_fold_codes_by_str, onigenc_unicode_property_name_to_ctype, onigenc_unicode_is_code_ctype, onigenc_utf16_32_get_ctype_code_range, utf32be_left_adjust_char_head, + utf32be_left_adjust_char_head_se, onigenc_always_false_is_allowed_reverse_match, ONIGENC_FLAG_UNICODE, }; diff --git 
a/src/Onigmo/enc/utf32_le.c b/src/Onigmo/enc/utf32_le.c index 405dad8..c8b3d94 100644 --- a/src/Onigmo/enc/utf32_le.c +++ b/src/Onigmo/enc/utf32_le.c @@ -35,6 +35,12 @@ utf32le_mbc_enc_len(const UChar* p ARG_UNUSED) return 4; } +static int +utf32le_mbc_enc_len_se(OnigIterator* it ARG_UNUSED, OnigPosition p ARG_UNUSED) +{ + return 4; +} + static int utf32le_is_mbc_newline(const UChar* p, const UChar* end) { @@ -43,7 +49,7 @@ utf32le_is_mbc_newline(const UChar* p, const UChar* end) return 1; #ifdef USE_UNICODE_ALL_LINE_TERMINATORS if ((*p == 0x0b ||*p == 0x0c ||*p == 0x0d || *p == 0x85) - && *(p+1) == 0x00 && (p+2) == 0x00 && *(p+3) == 0x00) + && *(p+1) == 0x00 && *(p+2) == 0x00 && *(p+3) == 0x00) return 1; if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28) && *(p+2) == 0x00 && *(p+3) == 0x00) @@ -53,12 +59,39 @@ utf32le_is_mbc_newline(const UChar* p, const UChar* end) return 0; } +static int +utf32le_is_mbc_newline_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + if (p + 3 < end) { + const UChar c0 = ONIG_CHARAT(p); + const UChar c1 = ONIG_CHARAT(p+1); + + if (c0 == 0x0a && c1 == 0 && ONIG_CHARAT(p+2) == 0 && ONIG_CHARAT(p+3) == 0) + return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((c0 == 0x0b || c0 == 0x0c || c0 == 0x0d || c0 == 0x85) + && c1 == 0x00 && ONIG_CHARAT(p+2) == 0x00 && ONIG_CHARAT(p+3) == 0x00) + return 1; + if (c1 == 0x20 && (c0 == 0x29 || c0 == 0x28) + && ONIG_CHARAT(p+2) == 0x00 && ONIG_CHARAT(p+3) == 0x00) + return 1; +#endif + } + return 0; +} + static OnigCodePoint utf32le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { return (OnigCodePoint )(((p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); } +static OnigCodePoint +utf32le_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end ARG_UNUSED) +{ + return (OnigCodePoint )(((ONIG_CHARAT(p+3) * 256 + ONIG_CHARAT(p+2)) * 256 + ONIG_CHARAT(p+1)) * 256 + ONIG_CHARAT(p)); +} + static int utf32le_code_to_mbclen(OnigCodePoint code ARG_UNUSED) { @@ -109,6 +142,39 @@ 
utf32le_mbc_case_fold(OnigCaseFoldType flag, fold); } +static int +utf32le_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, + OnigPosition* pp, OnigPosition end, UChar* fold) +{ /* ASCII fast path writes 4 bytes to fold[] and advances *pp by 4; anything else is delegated to onigenc_unicode_mbc_case_fold_se */ + const OnigPosition p = *pp; + const UChar c = ONIG_CHARAT(p); + + if (ONIGENC_IS_ASCII_CODE(c) && ONIG_CHARAT(p+1) == 0 && ONIG_CHARAT(p+2) == 0 && ONIG_CHARAT(p+3) == 0) { +#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI + if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0 && c == 0x49) { + /* Turkish/Azeri 'I' folds to U+0131, stored little-endian */ + *fold++ = 0x31; + *fold++ = 0x01; + } + else +#endif + { + /* every other ASCII character: low byte is the lower-cased letter */ + *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c); + *fold++ = 0; + } + /* the two high-order bytes are zero on both paths */ + + *fold++ = 0; + *fold = 0; + *pp += 4; + return 4; + } + else + return onigenc_unicode_mbc_case_fold_se(it, ONIG_ENCODING_UTF32_LE, flag, pp, end, + fold); +} + #if 0 static int utf32le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -152,6 +218,17 @@ utf32le_left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )(s - rem); } +static OnigPosition +utf32le_left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + int rem; + + if (s <= start) return s; + + rem = (int )((s - start) % 4); + return (s - rem); +} + static int utf32le_get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) @@ -162,20 +239,25 @@ utf32le_get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingUTF32_LE = { utf32le_mbc_enc_len, + utf32le_mbc_enc_len_se, "UTF-32LE", /* name */ 4, /* max byte length */ 4, /* min byte length */ utf32le_is_mbc_newline, + utf32le_is_mbc_newline_se, utf32le_mbc_to_code, + utf32le_mbc_to_code_se, utf32le_code_to_mbclen, utf32le_code_to_mbc, utf32le_mbc_case_fold, + utf32le_mbc_case_fold_se, onigenc_unicode_apply_all_case_fold, utf32le_get_case_fold_codes_by_str, onigenc_unicode_property_name_to_ctype, onigenc_unicode_is_code_ctype,
onigenc_utf16_32_get_ctype_code_range, utf32le_left_adjust_char_head, + utf32le_left_adjust_char_head_se, onigenc_always_false_is_allowed_reverse_match, ONIGENC_FLAG_UNICODE, }; diff --git a/src/Onigmo/enc/utf8.c b/src/Onigmo/enc/utf8.c index 4c923ed..ec3ecd4 100644 --- a/src/Onigmo/enc/utf8.c +++ b/src/Onigmo/enc/utf8.c @@ -65,6 +65,12 @@ mbc_enc_len(const UChar* p) return EncLen_UTF8[*p]; } +static int +mbc_enc_len_se(OnigIterator* it, OnigPosition p) +{ + return EncLen_UTF8[ONIG_CHARAT(p)]; +} + static int is_mbc_newline(const UChar* p, const UChar* end) { @@ -88,6 +94,32 @@ is_mbc_newline(const UChar* p, const UChar* end) return 0; } +static int +is_mbc_newline_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + if (p < end) { + const UChar c0 = ONIG_CHARAT(p); + if (c0 == 0x0a) return 1; + +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if (c0 == 0x0b || c0 == 0x0c || c0 == 0x0d) return 1; + if (p + 1 < end) { + const UChar c1 = ONIG_CHARAT(p+1); + if (c1 == 0x85 && c0 == 0xc2) /* U+0085 */ + return 1; + if (p + 2 < end) { + const UChar c2 = ONIG_CHARAT(p+2); + if ((c2 == 0xa8 || c2 == 0xa9) + && c1 == 0x80 && c0 == 0xe2) /* U+2028, U+2029 */ + return 1; + } + } +#endif + } + + return 0; +} + static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { @@ -115,6 +147,33 @@ mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) } } +static OnigCodePoint +mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end ARG_UNUSED) +{ + int c, len; + OnigCodePoint n; + + len = mbc_enc_len_se(it, p); + c = ONIG_CHARAT(p++); + if (len > 1) { + len--; + n = c & ((1 << (6 - len)) - 1); + while (len--) { + c = ONIG_CHARAT(p++); + n = (n << 6) | (c & ((1 << 6) - 1)); + } + return n; + } + else { +#ifdef USE_INVALID_CODE_SCHEME + if (c > 0xfd) { + return ((c == 0xfe) ? 
INVALID_CODE_FE : INVALID_CODE_FF); + } +#endif + return (OnigCodePoint )c; + } +} + static int code_to_mbclen(OnigCodePoint code) { @@ -217,6 +276,34 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, } } +static int +mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, + OnigPosition end, UChar* fold) +{ + const UChar c = ONIG_CHARAT(*pp); + + if (ONIGENC_IS_MBC_ASCII_SE(c)) { +#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI + if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { + if (c == 0x49) { + *fold++ = 0xc4; + *fold = 0xb1; + (*pp)++; + return 2; + } + } +#endif + + *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c); + (*pp)++; + return 1; /* return byte length of converted char to lower */ + } + else { + return onigenc_unicode_mbc_case_fold_se(it, ONIG_ENCODING_UTF8, flag, + pp, end, fold); + } +} + #if 0 static int is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) @@ -275,6 +362,18 @@ left_adjust_char_head(const UChar* start, const UChar* s) return (UChar* )p; } +static OnigPosition +left_adjust_char_head_se(OnigIterator* it, OnigPosition start, OnigPosition s) +{ + OnigPosition p; + + if (s <= start) return s; + p = s; + + while (!utf8_islead(ONIG_CHARAT(p)) && p > start) p--; + return p; +} + static int get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) @@ -285,20 +384,25 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, OnigEncodingType OnigEncodingUTF8 = { mbc_enc_len, + mbc_enc_len_se, "UTF-8", /* name */ 6, /* max byte length */ 1, /* min byte length */ is_mbc_newline, + is_mbc_newline_se, mbc_to_code, + mbc_to_code_se, code_to_mbclen, code_to_mbc, mbc_case_fold, + mbc_case_fold_se, onigenc_unicode_apply_all_case_fold, get_case_fold_codes_by_str, onigenc_unicode_property_name_to_ctype, onigenc_unicode_is_code_ctype, get_ctype_code_range, left_adjust_char_head, + left_adjust_char_head_se, onigenc_always_true_is_allowed_reverse_match, 
ONIGENC_FLAG_UNICODE, }; diff --git a/src/Onigmo/oniggnu.h b/src/Onigmo/oniggnu.h index 3da9f23..db381f7 100644 --- a/src/Onigmo/oniggnu.h +++ b/src/Onigmo/oniggnu.h @@ -68,9 +68,9 @@ void re_free_pattern P_((struct re_pattern_buffer*)); ONIG_EXTERN int re_adjust_startpos P_((struct re_pattern_buffer*, const char*, int, int, int)); ONIG_EXTERN -int re_search P_((struct re_pattern_buffer*, const char*, int, int, int, struct re_registers*)); +int re_search P_((OnigIterator* it, struct re_pattern_buffer*, OnigPosition, OnigPosition, OnigPosition, OnigPosition, struct re_registers*)); ONIG_EXTERN -int re_match P_((struct re_pattern_buffer*, const char *, int, int, struct re_registers*)); +int re_match P_((OnigIterator* it, struct re_pattern_buffer*, OnigPosition, OnigPosition, OnigPosition, struct re_registers*)); ONIG_EXTERN void re_set_casetable P_((const char*)); ONIG_EXTERN diff --git a/src/Onigmo/onigposix.h b/src/Onigmo/onigposix.h index 25aa8f2..6e121df 100644 --- a/src/Onigmo/onigposix.h +++ b/src/Onigmo/onigposix.h @@ -152,7 +152,7 @@ ONIG_EXTERN const char* onig_copyright P_((void)); ONIG_EXTERN int regcomp P_((regex_t* reg, const char* pat, int options)); -ONIG_EXTERN int regexec P_((regex_t* reg, const char* str, size_t nmatch, regmatch_t* matches, int options)); +ONIG_EXTERN OnigPosition regexec P_((OnigIterator* it, regex_t* reg, OnigPosition str, size_t nmatch, regmatch_t* matches, int options)); ONIG_EXTERN void regfree P_((regex_t* reg)); ONIG_EXTERN size_t regerror P_((int code, const regex_t* reg, char* buf, size_t size)); diff --git a/src/Onigmo/oniguruma.h b/src/Onigmo/oniguruma.h index daa40ee..bd6176d 100644 --- a/src/Onigmo/oniguruma.h +++ b/src/Onigmo/oniguruma.h @@ -102,11 +102,31 @@ extern "C" { typedef unsigned char OnigUChar; typedef unsigned int OnigCodePoint; typedef unsigned int OnigCtype; -typedef size_t OnigDistance; -typedef ptrdiff_t OnigPosition; +typedef size_t OnigDistance; #define ONIG_INFINITE_DISTANCE ~((OnigDistance )0) +#if 
defined(_MSC_VER) +typedef __int64 OnigPosition; +#else +typedef long long OnigPosition; +#endif +#define ONIG_BADPOS (-1) /* sentinel: no valid position */ +#define ONIG_IS_BADPOS(p) ((p) == ONIG_BADPOS) +#define ONIG_IS_NOT_BADPOS(p) ((p) != ONIG_BADPOS) + +typedef OnigUChar (*OnigCharAtFunc)(OnigPosition pos, const void* ptr); /* returns the byte at pos; ptr is the iterator's opaque context */ +typedef struct OnigIteratorStruct { + OnigCharAtFunc at; + const void* ptr; +} OnigIterator; + +#define ONIG_CHARAT(pos) (it->at((pos), it->ptr)) /* expects an OnigIterator* named it in the enclosing scope */ + +/* Iterator API */ +ONIG_EXTERN +OnigUChar onig_default_charat P_((OnigPosition pos, const void* ptr)); + typedef unsigned int OnigCaseFoldType; /* case fold flag */ ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag; @@ -148,20 +168,25 @@ typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, i typedef struct OnigEncodingTypeST { int (*mbc_enc_len)(const OnigUChar* p); + int (*mbc_enc_len_se)(OnigIterator* it, OnigPosition p); const char* name; int max_enc_len; int min_enc_len; int (*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end); + int (*is_mbc_newline_se)(OnigIterator* it, OnigPosition p, OnigPosition end); OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end); + OnigCodePoint (*mbc_to_code_se)(OnigIterator* it, OnigPosition p, OnigPosition end); int (*code_to_mbclen)(OnigCodePoint code); int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf); int (*mbc_case_fold)(OnigCaseFoldType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to); + int (*mbc_case_fold_se)(OnigIterator* it, OnigCaseFoldType flag, OnigPosition* pp, OnigPosition end, OnigUChar* to); int (*apply_all_case_fold)(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg); int (*get_case_fold_codes_by_str)(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem acs[]); int (*property_name_to_ctype)(struct OnigEncodingTypeST* enc, OnigUChar* p, OnigUChar* end); int (*is_code_ctype)(OnigCodePoint code, OnigCtype ctype); int (*get_ctype_code_range)(OnigCtype ctype, OnigCodePoint*
sb_out, const OnigCodePoint* ranges[]); OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p); + OnigPosition (*left_adjust_char_head_se)(OnigIterator* it, OnigPosition start, OnigPosition p); int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end); unsigned int flags; } OnigEncodingType; @@ -270,37 +295,57 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030; #define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) #define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) #define ONIGENC_IS_MBC_HEAD(enc,p) (ONIGENC_MBC_ENC_LEN(enc,p) != 1) +#define ONIGENC_IS_MBC_HEAD_SE(it,enc,p) (ONIGENC_MBC_ENC_LEN_SE(it,enc,p) != 1) #define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) +#define ONIGENC_IS_MBC_ASCII_SE(c) (c < 128) #define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) #define ONIGENC_IS_MBC_WORD(enc,s,end) \ ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE(enc,s,end)) +#define ONIGENC_IS_MBC_WORD_SE(it,enc,s,end) \ + ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE_SE(it,enc,s,end)) +#define ONIGENC_IS_MBC_SINGLEBYTE_SE(it,enc,s,end) \ + (ONIGENC_MBC_TO_CODE_SE(it,enc,s,end) <= 0xFF) #define ONIGENC_IS_MBC_ASCII_WORD(enc,s,end) \ onigenc_ascii_is_code_ctype( \ ONIGENC_MBC_TO_CODE(enc,s,end),ONIGENC_CTYPE_WORD) +#define ONIGENC_IS_MBC_ASCII_WORD_SE(it,enc,s,end) \ + onigenc_ascii_is_code_ctype( \ + ONIGENC_MBC_TO_CODE_SE(it,enc,s,end),ONIGENC_CTYPE_WORD) #define ONIGENC_IS_UNICODE(enc) ((enc)->flags & ONIGENC_FLAG_UNICODE) +#define ONIGENC_SCRIPT(enc,s,end) (onigenc_unicode_code_script(ONIGENC_MBC_TO_CODE(enc,s,end))) +#define ONIGENC_SCRIPT_SE(it,enc,s,end) (onigenc_unicode_code_script(ONIGENC_MBC_TO_CODE_SE(it,enc,s,end))) #define ONIGENC_NAME(enc) ((enc)->name) #define ONIGENC_MBC_CASE_FOLD(enc,flag,pp,end,buf) \ (enc)->mbc_case_fold(flag,(const OnigUChar** )pp,end,buf) +#define ONIGENC_MBC_CASE_FOLD_SE(it,enc,flag,pp,end,buf) \ + (enc)->mbc_case_fold_se(it,flag,(OnigPosition* )pp,end,buf) #define 
ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ (enc)->is_allowed_reverse_match(s,end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ (enc)->left_adjust_char_head(start, s) +#define ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it,enc,start,s) \ + (enc)->left_adjust_char_head_se(it, start, s) #define ONIGENC_APPLY_ALL_CASE_FOLD(enc,case_fold_flag,f,arg) \ (enc)->apply_all_case_fold(case_fold_flag,f,arg) #define ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc,case_fold_flag,p,end,acs) \ (enc)->get_case_fold_codes_by_str(case_fold_flag,p,end,acs) #define ONIGENC_STEP_BACK(enc,start,s,n) \ onigenc_step_back((enc),(start),(s),(n)) +#define ONIGENC_STEP_BACK_SE(it,enc,start,s,n) \ + onigenc_step_back_se((it),(enc),(start),(s),(n)) #define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p) +#define ONIGENC_MBC_ENC_LEN_SE(it,enc,p) (enc)->mbc_enc_len_se((it),(p)) #define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) #define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) #define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end)) +#define ONIGENC_IS_MBC_NEWLINE_SE(it,enc,p,end) (enc)->is_mbc_newline_se((it),(p),(end)) #define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end)) +#define ONIGENC_MBC_TO_CODE_SE(it,enc,p,end) (enc)->mbc_to_code_se((it),(p),(end)) #define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code) #define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf) #define ONIGENC_PROPERTY_NAME_TO_CTYPE(enc,p,end) \ @@ -342,7 +387,8 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030; ONIG_EXTERN OnigUChar* onigenc_step_back P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, int n)); - +ONIG_EXTERN +OnigPosition onigenc_step_back_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s, int n)); /* encoding API */ ONIG_EXTERN @@ -356,18 +402,29 @@ void onigenc_set_default_caseconv_table P_((const OnigUChar* table)); ONIG_EXTERN OnigUChar* 
onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar** prev)); ONIG_EXTERN +OnigPosition onigenc_get_right_adjust_char_head_with_prev_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s, OnigPosition* prev)); +ONIG_EXTERN OnigUChar* onigenc_get_prev_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s)); ONIG_EXTERN +OnigPosition onigenc_get_prev_char_head_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s)); +ONIG_EXTERN OnigUChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s)); ONIG_EXTERN +OnigPosition onigenc_get_left_adjust_char_head_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s)); +ONIG_EXTERN OnigUChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s)); ONIG_EXTERN +OnigPosition onigenc_get_right_adjust_char_head_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s)); +ONIG_EXTERN int onigenc_strlen P_((OnigEncoding enc, const OnigUChar* p, const OnigUChar* end)); ONIG_EXTERN +int onigenc_strlen_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition p, OnigPosition end)); +ONIG_EXTERN int onigenc_strlen_null P_((OnigEncoding enc, const OnigUChar* p)); ONIG_EXTERN int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p)); - +ONIG_EXTERN +OnigPosition onigenc_str_bytelen_null_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition p)); /* PART: regular expression */ @@ -406,7 +463,9 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_WORD_BOUND_ALL_RANGE (ONIG_OPTION_POSIX_BRACKET_ALL_RANGE << 1) /* options (newline) */ #define ONIG_OPTION_NEWLINE_CRLF (ONIG_OPTION_WORD_BOUND_ALL_RANGE << 1) -#define ONIG_OPTION_MAXBIT ONIG_OPTION_NEWLINE_CRLF /* limit */ +// options (whole word) +#define SE_ONIG_OPTION_WHOLEWORD 
(ONIG_OPTION_NEWLINE_CRLF << 1) +#define ONIG_OPTION_MAXBIT SE_ONIG_OPTION_WHOLEWORD /* limit */ #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) #define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) @@ -422,6 +481,7 @@ typedef struct { } OnigSyntaxType; ONIG_EXTERN OnigSyntaxType OnigSyntaxASIS; +ONIG_EXTERN OnigSyntaxType OnigSyntaxWildChar; ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixBasic; ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixExtended; ONIG_EXTERN OnigSyntaxType OnigSyntaxEmacs; @@ -436,6 +496,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxPython; /* predefined syntaxes (see regsyntax.c) */ #define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS) +#define ONIG_SYNTAX_WILDCHAR (&OnigSyntaxWildChar) #define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) #define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) #define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) @@ -775,11 +836,13 @@ int onig_recompile P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pat ONIG_EXTERN int onig_recompile_deluxe P_((OnigRegex reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN -OnigPosition onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); +OnigPosition onig_search P_((OnigIterator* it, OnigRegex, OnigPosition str, OnigPosition end, OnigPosition start, OnigPosition range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN -OnigPosition onig_search_gpos P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* global_pos, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); +OnigPosition onig_search_gpos P_((OnigIterator* it, OnigRegex, OnigPosition str, OnigPosition end, OnigPosition global_pos, OnigPosition start, OnigPosition range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN -OnigPosition onig_match P_((OnigRegex, 
const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); +OnigPosition onig_match P_((OnigIterator* it, OnigRegex, OnigPosition str, OnigPosition end, OnigPosition at, OnigRegion* region, OnigOptionType option)); +ONIG_EXTERN +OnigPosition onig_match_gpos P_((OnigIterator* it, OnigRegex, OnigPosition str, OnigPosition end, OnigPosition global_pos, OnigPosition at, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN diff --git a/src/Onigmo/regcomp.c b/src/Onigmo/regcomp.c index 382546a..6bc0f8b 100644 --- a/src/Onigmo/regcomp.c +++ b/src/Onigmo/regcomp.c @@ -5791,9 +5791,13 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, scan_env.mem_nodes_dynamic = (Node** )NULL; } + r = IS_WHOLEWORD(reg->options) ? add_opcode(reg, OP_WORD_BEGIN) : 0; /* whole-word option: emit OP_WORD_BEGIN ahead of the pattern and keep its status */ + if (r == 0) /* compile the tree only when the prefix opcode was emitted successfully */ r = compile_tree(root, reg); if (r == 0) { - r = add_opcode(reg, OP_END); + if (IS_WHOLEWORD(reg->options)) + r = add_opcode(reg, OP_WORD_END); + if (r == 0) r = add_opcode(reg, OP_END); /* do not mask an OP_WORD_END failure */ #ifdef USE_SUBEXP_CALL if (scan_env.num_call > 0) { r = unset_addr_list_fix(&uslist, reg); @@ -5996,6 +6000,10 @@ onig_end(void) onig_free_node_list(); #endif +#ifdef USE_SHARED_UNICODE_TABLE + onig_free_shared_unicode_table(); +#endif + onig_inited = 0; THREAD_ATOMIC_END; diff --git a/src/Onigmo/regenc.c b/src/Onigmo/regenc.c index 78fe0fc..8083b97 100644 --- a/src/Onigmo/regenc.c +++ b/src/Onigmo/regenc.c @@ -61,6 +61,16 @@ onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const U return p; } +extern OnigPosition +onigenc_get_right_adjust_char_head_se(OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s) +{ + OnigPosition p = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, start, s); + if (p < s) { + p += enclen_se(it, enc, p); + } + return p; +} + extern UChar* onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, const UChar* start, const UChar* s, const UChar** prev) @@ -77,6
+87,22 @@ onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, return p; } +extern OnigPosition +onigenc_get_right_adjust_char_head_with_prev_se(OnigIterator* it, OnigEncoding enc, + OnigPosition start, OnigPosition s, OnigPosition* prev) +{ + OnigPosition p = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, start, s); + + if (p < s) { + if (prev) *prev = p; + p += enclen_se(it, enc, p); + } + else { + if (prev) *prev = ONIG_BADPOS; /* Sorry */ + } + return p; +} + extern UChar* onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s) { @@ -86,6 +112,15 @@ onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s) return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1); } +extern OnigPosition +onigenc_get_prev_char_head_se(OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s) +{ + if (s <= start) + return ONIG_BADPOS; + + return ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, start, s - 1); +} + extern UChar* onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) { @@ -98,6 +133,18 @@ onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) return (UChar* )s; } +extern OnigPosition +onigenc_step_back_se(OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s, int n) +{ + while (ONIG_IS_NOT_BADPOS(s) && n-- > 0) { + if (s <= start) + return ONIG_BADPOS; + + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, start, s - 1); + } + return s; +} + extern UChar* onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n) { @@ -121,6 +168,18 @@ onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end) return n; } +extern int +onigenc_strlen_se(OnigIterator* it, OnigEncoding enc, OnigPosition p, OnigPosition end) +{ + int n = 0; + + while (p < end) { + p += ONIGENC_MBC_ENC_LEN_SE(it, enc, p); + n++; + } + return n; +} + extern int onigenc_strlen_null(OnigEncoding enc, const UChar* s) { @@ -170,6 +229,30 @@ 
onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) } } +extern OnigPosition +onigenc_str_bytelen_null_se(OnigIterator* it, OnigEncoding enc, OnigPosition s) +{ + OnigPosition start = s; + OnigPosition p = s; + + while (1) { + if (ONIG_CHARAT(p) == '\0') { + OnigPosition q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return p - start; + q = p + 1; + while (len > 1) { + if (ONIG_CHARAT(q) != '\0') break; + q++; + len--; + } + if (len == 1) return p - start; + } + p += ONIGENC_MBC_ENC_LEN_SE(it, enc, p); + } +} + const UChar OnigEncAsciiToLowerCaseTable[] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', @@ -362,6 +445,12 @@ onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UC return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); } +extern OnigPosition +onigenc_get_left_adjust_char_head_se(OnigIterator* it, OnigEncoding enc, OnigPosition start, OnigPosition s) +{ + return ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, start, s); +} + const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = { { 0x41, 0x61 }, { 0x42, 0x62 }, @@ -570,6 +659,15 @@ onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end) return 0; } +extern int +onigenc_is_mbc_newline_0x0a_se(OnigIterator* it, OnigPosition p, OnigPosition end) +{ + if (p < end) { + if (ONIG_CHARAT(p) == 0x0a) return 1; + } + return 0; +} + /* for single byte encodings */ extern int onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p, @@ -581,6 +679,17 @@ onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p, return 1; /* return byte length of converted char to lower */ } +/* for single byte encodings */ +extern int +onigenc_ascii_mbc_case_fold_se(OnigIterator* it, OnigCaseFoldType flag ARG_UNUSED, OnigPosition* p, + OnigPosition end ARG_UNUSED, UChar* lower) +{ + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(ONIG_CHARAT(*p)); + + (*p)++; + return 1; /* return 
byte length of converted char to lower */ +} + #if 0 extern int onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag, @@ -599,12 +708,24 @@ onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED) return 1; } +extern int +onigenc_single_byte_mbc_enc_len_se(OnigIterator* it ARG_UNUSED, OnigPosition p ARG_UNUSED) +{ + return 1; +} + extern OnigCodePoint onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { return (OnigCodePoint )(*p); } +extern OnigCodePoint +onigenc_single_byte_mbc_to_code_se(OnigIterator* it, OnigPosition p, OnigPosition end ARG_UNUSED) +{ + return (OnigCodePoint )(ONIG_CHARAT(p)); +} + extern int onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED) { @@ -625,6 +746,13 @@ onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED, return (UChar* )s; } +extern OnigPosition +onigenc_single_byte_left_adjust_char_head_se(OnigIterator* it ARG_UNUSED, OnigPosition start ARG_UNUSED, + OnigPosition s) +{ + return s; +} + extern int onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED, const UChar* end ARG_UNUSED) @@ -666,6 +794,24 @@ onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end) return n; } +extern OnigCodePoint +onigenc_mbn_mbc_to_code_se(OnigIterator* it, OnigEncoding enc, OnigPosition p, OnigPosition end) +{ + int c, i, len; + OnigCodePoint n; + + len = enclen_se(it, enc, p); + n = (OnigCodePoint )(ONIG_CHARAT(p++)); + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = ONIG_CHARAT(p++); + n <<= 8; n += c; + } + return n; +} + extern int onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end ARG_UNUSED, @@ -691,6 +837,32 @@ onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, } } +extern int +onigenc_mbn_mbc_case_fold_se(OnigIterator* it, OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, + OnigPosition* pp, OnigPosition end ARG_UNUSED, + 
UChar* lower) +{ + int len; + OnigPosition p = *pp; + const UChar c = ONIG_CHARAT(*pp); + + if (ONIGENC_IS_MBC_ASCII_SE(c)) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c); + (*pp)++; + return 1; + } + else { + int i; + + len = enclen_se(it, enc, p); + for (i = 0; i < len; i++) { + *lower++ = ONIG_CHARAT(p++); + } + (*pp) += len; + return len; /* return byte length of converted to lower char */ + } +} + #if 0 extern int onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, diff --git a/src/Onigmo/regenc.h b/src/Onigmo/regenc.h index e10fbde..bc34825 100644 --- a/src/Onigmo/regenc.h +++ b/src/Onigmo/regenc.h @@ -73,6 +73,7 @@ typedef struct { #define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val) #define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) +#define enclen_se(it,enc,p) ONIGENC_MBC_ENC_LEN_SE(it,enc,p) /* character types bit flag */ #define BIT_CTYPE_NEWLINE (1<< ONIGENC_CTYPE_NEWLINE) @@ -107,7 +108,7 @@ typedef struct { #define USE_CRNL_AS_LINE_TERMINATOR #define USE_UNICODE_PROPERTIES /* #define USE_UNICODE_CASE_FOLD_TURKISH_AZERI */ -/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTS #18 */ +#define USE_UNICODE_ALL_LINE_TERMINATORS /* see Unicode.org UTS #18 */ #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII @@ -119,22 +120,29 @@ ONIG_EXTERN int onigenc_apply_all_case_fold_with_map P_((int map_size, const Oni ONIG_EXTERN int onigenc_get_case_fold_codes_by_str_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])); ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[])); ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end)); +ONIG_EXTERN int onigenc_is_mbc_newline_0x0a_se P_((OnigIterator* it, OnigPosition p, OnigPosition end)); /* methods for single byte encoding */ 
ONIG_EXTERN int onigenc_ascii_mbc_case_fold P_((OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_ascii_mbc_case_fold_se P_((OnigIterator* it, OnigCaseFoldType flag, OnigPosition* p, OnigPosition end, UChar* lower)); ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((const UChar* p)); +ONIG_EXTERN int onigenc_single_byte_mbc_enc_len_se P_((OnigIterator* it, OnigPosition p)); ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((const UChar* p, const UChar* end)); +ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code_se P_((OnigIterator* it, OnigPosition p, OnigPosition end)); ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf)); ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s)); +ONIG_EXTERN OnigPosition onigenc_single_byte_left_adjust_char_head_se P_((OnigIterator* it, OnigPosition start, OnigPosition s)); ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((const UChar* s, const UChar* end)); ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s, const UChar* end)); ONIG_EXTERN int onigenc_ascii_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); /* methods for multi byte encoding */ ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); +ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code_se P_((OnigIterator* it, OnigEncoding enc, OnigPosition p, OnigPosition end)); ONIG_EXTERN int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_mbn_mbc_case_fold_se P_((OnigIterator* it, OnigEncoding enc, OnigCaseFoldType flag, OnigPosition* p, OnigPosition end, UChar* lower)); ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int 
onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); ONIG_EXTERN int onigenc_minimum_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); @@ -146,11 +154,13 @@ ONIG_EXTERN int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint co /* in enc/unicode.c */ +ONIG_EXTERN const OnigCodePoint* onigenc_unicode_code_script P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_utf16_32_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint* ranges[])); ONIG_EXTERN int onigenc_unicode_ctype_code_range P_((int ctype, const OnigCodePoint* ranges[])); ONIG_EXTERN int onigenc_unicode_get_case_fold_codes_by_str P_((OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])); ONIG_EXTERN int onigenc_unicode_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** pp, const UChar* end, UChar* fold)); +ONIG_EXTERN int onigenc_unicode_mbc_case_fold_se P_((OnigIterator* it, OnigEncoding enc, OnigCaseFoldType flag, OnigPosition* pp, OnigPosition end, UChar* fold)); ONIG_EXTERN int onigenc_unicode_apply_all_case_fold P_((OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg)); diff --git a/src/Onigmo/regexec.c b/src/Onigmo/regexec.c index e094f6c..c5fa86a 100644 --- a/src/Onigmo/regexec.c +++ b/src/Onigmo/regexec.c @@ -30,23 +30,23 @@ #include "regint.h" -#define USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE +//#define USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE #ifdef USE_CRNL_AS_LINE_TERMINATOR -#define ONIGENC_IS_MBC_CRNL(enc,p,end) \ - (ONIGENC_MBC_TO_CODE(enc,p,end) == 13 && \ - ONIGENC_MBC_TO_CODE(enc,(p+enclen(enc,p)),end) == 10) -#define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ - is_mbc_newline_ex((enc),(p),(start),(end),(option),(check_prev)) +#define ONIGENC_IS_MBC_CRNL_SE(it,enc,p,end) \ + 
(ONIGENC_MBC_TO_CODE_SE(it,enc,p,end) == 13 && \ + ONIGENC_MBC_TO_CODE_SE(it,enc,(p+enclen_se(it,enc,p)),end) == 10) +#define ONIGENC_IS_MBC_NEWLINE_EX_SE(it, enc,p,start,end,option,check_prev) \ + is_mbc_newline_ex_se((it), (enc),(p),(start),(end),(option),(check_prev)) static int -is_mbc_newline_ex(OnigEncoding enc, const UChar *p, const UChar *start, - const UChar *end, OnigOptionType option, int check_prev) +is_mbc_newline_ex_se(OnigIterator* it, OnigEncoding enc, OnigPosition p, OnigPosition start, + OnigPosition end, OnigOptionType option, int check_prev) { if (IS_NEWLINE_CRLF(option)) { - if (ONIGENC_MBC_TO_CODE(enc, p, end) == 0x0a) { + if (ONIGENC_MBC_TO_CODE_SE(it, enc, p, end) == 0x0a) { if (check_prev) { - const UChar *prev = onigenc_get_prev_char_head(enc, start, p); - if ((prev != NULL) && ONIGENC_MBC_TO_CODE(enc, prev, end) == 0x0d) + OnigPosition prev = onigenc_get_prev_char_head_se(it, enc, start, p); + if ((ONIG_IS_NOT_BADPOS(prev)) && ONIGENC_MBC_TO_CODE_SE(it, enc, prev, end) == 0x0d) return 0; else return 1; @@ -55,23 +55,23 @@ is_mbc_newline_ex(OnigEncoding enc, const UChar *p, const UChar *start, return 1; } else { - const UChar *pnext = p + enclen(enc, p); + OnigPosition pnext = p + enclen_se(it, enc, p); if (pnext < end && - ONIGENC_MBC_TO_CODE(enc, p, end) == 0x0d && - ONIGENC_MBC_TO_CODE(enc, pnext, end) == 0x0a) + ONIGENC_MBC_TO_CODE_SE(it, enc, p, end) == 0x0d && + ONIGENC_MBC_TO_CODE_SE(it, enc, pnext, end) == 0x0a) return 1; - if (ONIGENC_IS_MBC_NEWLINE(enc, p, end)) + if (ONIGENC_IS_MBC_NEWLINE_SE(it, enc, p, end)) return 1; return 0; } } else { - return ONIGENC_IS_MBC_NEWLINE(enc, p, end); + return ONIGENC_IS_MBC_NEWLINE_SE(it, enc, p, end); } } #else /* USE_CRNL_AS_LINE_TERMINATOR */ -#define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ - ONIGENC_IS_MBC_NEWLINE((enc), (p), (end)) +#define ONIGENC_IS_MBC_NEWLINE_EX_SE(it, enc,p,start,end,option,check_prev) \ + ONIGENC_IS_MBC_NEWLINE_SE((it), (enc), (p), (end)) 
#endif /* USE_CRNL_AS_LINE_TERMINATOR */ #ifdef USE_CAPTURE_HISTORY @@ -915,7 +915,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, break;\ }\ else {\ - UChar* endp;\ + OnigPosition endp;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ @@ -925,7 +925,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\ endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\ else\ - endp = (UChar* )k->u.mem.end;\ + endp = k->u.mem.end;\ if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\ (isnull) = 0; break;\ }\ @@ -956,7 +956,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, break;\ }\ else {\ - UChar* endp;\ + OnigPosition endp;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ @@ -966,7 +966,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\ endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\ else\ - endp = (UChar* )k->u.mem.end;\ + endp = k->u.mem.end;\ if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\ (isnull) = 0; break;\ }\ @@ -1026,32 +1026,32 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ } while(0) - -#define STRING_CMP(s1,s2,len) do {\ +#define STRING_CMP_SE(it,s1,s2,len) do {\ while (len-- > 0) {\ - if (*s1++ != *s2++) goto fail;\ + if (ONIG_CHARAT(s1++) != ONIG_CHARAT(s2++)) goto fail;\ }\ } while(0) -#define STRING_CMP_IC(case_fold_flag,s1,ps2,len) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ +#define STRING_CMP_IC_SE(it, case_fold_flag,s1,ps2,len) do {\ + if (string_cmp_ic_se(it, encode, case_fold_flag, s1, ps2, len) == 0) \ goto fail; \ } while(0) -static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, - UChar* s1, UChar** ps2, OnigDistance mblen) +static int string_cmp_ic_se(OnigIterator* it, OnigEncoding enc, int case_fold_flag, + OnigPosition s1, OnigPosition* ps2, 
OnigPosition mblen) { UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *p1, *p2, *end1, *s2, *end2; + UChar* p1, *p2; + OnigPosition end1, s2, end2; int len1, len2; s2 = *ps2; end1 = s1 + mblen; end2 = s2 + mblen; while (s1 < end1) { - len1 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s1, end1, buf1); - len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, end2, buf2); + len1 = ONIGENC_MBC_CASE_FOLD_SE(it, enc, case_fold_flag, &s1, end1, buf1); + len2 = ONIGENC_MBC_CASE_FOLD_SE(it, enc, case_fold_flag, &s2, end2, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -1066,17 +1066,17 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, return 1; } -#define STRING_CMP_VALUE(s1,s2,len,is_fail) do {\ +#define STRING_CMP_VALUE_SE(it, s1,s2,len,is_fail) do {\ is_fail = 0;\ while (len-- > 0) {\ - if (*s1++ != *s2++) {\ + if (ONIG_CHARAT(s1++) != ONIG_CHARAT(s2++)) {\ is_fail = 1; break;\ }\ }\ } while(0) -#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,is_fail) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ +#define STRING_CMP_VALUE_IC_SE(it, case_fold_flag,s1,ps2,len,is_fail) do {\ + if (string_cmp_ic_se(it, encode, case_fold_flag, s1, ps2, len) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -1100,7 +1100,7 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, #ifdef USE_CAPTURE_HISTORY static int make_capture_history_tree(OnigCaptureTreeNode* node, OnigStackType** kp, - OnigStackType* stk_top, UChar* str, regex_t* reg) + OnigStackType* stk_top, OnigPosition str, regex_t* reg) { int n, r; OnigCaptureTreeNode* child; @@ -1155,12 +1155,12 @@ static int mem_is_in_memp(int mem, int num, UChar* memp) return 0; } -static int backref_match_at_nested_level(regex_t* reg +static int backref_match_at_nested_level(OnigIterator* it, regex_t* reg , OnigStackType* top, OnigStackType* stk_base , int ignore_case, int case_fold_flag - , int nest, int mem_num, UChar* memp, UChar** 
s, const UChar* send) + , int nest, int mem_num, UChar* memp, OnigPosition* s, OnigPosition send) { - UChar *ss, *p, *pstart, *pend = NULL_UCHARP; + OnigPosition ss, p, pstart, pend = ONIG_BADPOS; int level; OnigStackType* k; @@ -1178,19 +1178,19 @@ static int backref_match_at_nested_level(regex_t* reg if (k->type == STK_MEM_START) { if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { pstart = k->u.mem.pstr; - if (pend != NULL_UCHARP) { + if (pend != ONIG_BADPOS) { if (pend - pstart > send - *s) return 0; /* or goto next_mem; */ p = pstart; ss = *s; if (ignore_case != 0) { - if (string_cmp_ic(reg->enc, case_fold_flag, + if (string_cmp_ic_se(it, reg->enc, case_fold_flag, pstart, &ss, pend - pstart) == 0) return 0; /* or goto next_mem; */ } else { while (p < pend) { - if (*p++ != *ss++) return 0; /* or goto next_mem; */ + if (ONIG_CHARAT(p++) != ONIG_CHARAT(ss++)) return 0; /* or goto next_mem; */ } } @@ -1304,25 +1304,25 @@ typedef struct { /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. */ static OnigPosition -match_at(regex_t* reg, const UChar* str, const UChar* end, +match_at(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE - const UChar* right_range, + const OnigPosition right_range, #endif - const UChar* sstart, UChar* sprev, OnigMatchArg* msa) + const OnigPosition sstart, OnigPosition sprev, OnigMatchArg* msa) { static UChar FinishCode[] = { OP_FINISH }; int i, num_mem, pop_level; - ptrdiff_t n, best_len; + OnigPosition n, best_len; LengthType tlen, tlen2; MemNumType mem; RelAddrType addr; OnigOptionType option = reg->options; OnigEncoding encode = reg->enc; OnigCaseFoldType case_fold_flag = reg->case_fold_flag; - UChar *s, *q, *sbegin; + OnigPosition s, q, sbegin; UChar *p = reg->p; - UChar *pkeep; + OnigPosition pkeep; char *alloca_base; OnigStackType *stk_alloc, *stk_base, *stk, *stk_end; OnigStackType *stkp; /* used as any purpose. 
*/ @@ -1370,27 +1370,28 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif /* USE_SUBEXP_CALL */ #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "match_at: str: %d (%p), end: %d (%p), start: %d (%p), sprev: %d (%p)\n", - (int )str, str, (int )end, end, (int )sstart, sstart, (int )sprev, sprev); + fprintf(stderr, "match_at: ptr: %p, str: %d, end: %d, start: %d, sprev: %d\n", + it->ptr, (int )str, (int )end, (int )sstart, (int )sprev); fprintf(stderr, "size: %d, start offset: %d\n", (int )(end - str), (int )(sstart - str)); #endif STACK_PUSH_ENSURED(STK_ALT, FinishCode); /* bottom stack */ best_len = ONIG_MISMATCH; - s = (UChar* )sstart; - pkeep = (UChar* )sstart; + s = sstart; + pkeep = sstart; while (1) { #ifdef ONIG_DEBUG_MATCH - if (s) { - UChar *q, *bp, buf[50]; + if (ONIG_IS_NOT_BADPOS(s)) { + OnigPosition q; + UChar *bp, buf[50]; int len; fprintf(stderr, "%4d> \"", (int )(s - str)); bp = buf; if (*p != OP_FINISH) { /* s may not be a valid pointer if OP_FINISH. */ for (i = 0, q = s; i < 7 && q < end; i++) { - len = enclen(encode, q); - while (len-- > 0) *bp++ = *q++; + len = enclen_se(it, encode, q); + while (len-- > 0) *bp++ = ONIG_CHARAT(q++); } } if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } @@ -1413,7 +1414,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (IS_FIND_LONGEST(option)) { if (n > msa->best_len) { msa->best_len = n; - msa->best_s = (UChar* )sstart; + msa->best_s = sstart; } else goto end_best_len; @@ -1433,11 +1434,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (BIT_STATUS_AT(reg->bt_mem_start, i)) rmt[i].rm_so = (regoff_t )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); else - rmt[i].rm_so = (regoff_t )((UChar* )((void* )(mem_start_stk[i])) - str); + rmt[i].rm_so = (regoff_t )((OnigPosition)mem_start_stk[i] - str); rmt[i].rm_eo = (regoff_t )((BIT_STATUS_AT(reg->bt_mem_end, i) ? 
STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str); + : (OnigPosition)mem_end_stk[i]) - str); } else { rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; @@ -1453,11 +1454,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (BIT_STATUS_AT(reg->bt_mem_start, i)) region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; else - region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; + region->beg[i] = (OnigPosition)mem_start_stk[i] - str; region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str; + : (OnigPosition)mem_end_stk[i]) - str; } else { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; @@ -1484,7 +1485,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, stkp = stk_base; r = make_capture_history_tree(region->history_root, &stkp, - stk, (UChar* )str, reg); + stk, str, reg); if (r < 0) { best_len = r; /* error code */ goto finish; @@ -1519,10 +1520,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACT1: MOP_IN(OP_EXACT1); #if 0 DATA_ENSURE(1); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; #endif - if (*p != *s++) goto fail; + if (*p != ONIG_CHARAT(s++)) goto fail; DATA_ENSURE(0); p++; MOP_OUT; @@ -1534,7 +1535,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; DATA_ENSURE(1); - len = ONIGENC_MBC_CASE_FOLD(encode, + len = ONIGENC_MBC_CASE_FOLD_SE(it, encode, /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ case_fold_flag, &s, end, lowbuf); @@ -1552,9 +1553,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACT2: MOP_IN(OP_EXACT2); DATA_ENSURE(2); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; sprev = s; p++; s++; MOP_OUT; @@ -1563,11 +1564,11 @@ match_at(regex_t* reg, const UChar* 
str, const UChar* end, case OP_EXACT3: MOP_IN(OP_EXACT3); DATA_ENSURE(3); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; sprev = s; p++; s++; MOP_OUT; @@ -1576,13 +1577,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACT4: MOP_IN(OP_EXACT4); DATA_ENSURE(4); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; sprev = s; p++; s++; MOP_OUT; @@ -1591,15 +1592,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACT5: MOP_IN(OP_EXACT5); DATA_ENSURE(5); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; sprev = s; p++; s++; MOP_OUT; @@ -1610,7 +1611,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); DATA_ENSURE(tlen); while (tlen-- > 0) { - if (*p++ != *s++) goto fail; + if (*p++ != ONIG_CHARAT(s++)) goto fail; } sprev = s - 1; MOP_OUT; @@ -1628,7 +1629,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, while (p < endp) { sprev = s; DATA_ENSURE(1); - len = ONIGENC_MBC_CASE_FOLD(encode, + len = ONIGENC_MBC_CASE_FOLD_SE(it, encode, /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ case_fold_flag, &s, end, lowbuf); @@ -1647,23 +1648,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACTMB2N1: MOP_IN(OP_EXACTMB2N1); DATA_ENSURE(2); - 
if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; MOP_OUT; break; case OP_EXACTMB2N2: MOP_IN(OP_EXACTMB2N2); DATA_ENSURE(4); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; sprev = s; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; MOP_OUT; continue; @@ -1671,18 +1672,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACTMB2N3: MOP_IN(OP_EXACTMB2N3); DATA_ENSURE(6); - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; sprev = s; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; MOP_OUT; continue; @@ -1692,9 +1693,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); DATA_ENSURE(tlen * 2); while (tlen-- > 0) { - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; } sprev = s - 2; @@ -1706,11 +1707,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); DATA_ENSURE(tlen * 3); while (tlen-- > 0) { - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; } sprev = s - 3; @@ -1724,7 +1725,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen2 *= tlen; 
DATA_ENSURE(tlen2); while (tlen2-- > 0) { - if (*p != *s) goto fail; + if (*p != ONIG_CHARAT(s)) goto fail; p++; s++; } sprev = s - tlen; @@ -1734,33 +1735,33 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS: MOP_IN(OP_CCLASS); DATA_ENSURE(1); - if (BITSET_AT(((BitSetRef )p), *s) == 0) goto fail; + if (BITSET_AT(((BitSetRef )p), ONIG_CHARAT(s)) == 0) goto fail; p += SIZE_BITSET; - s += enclen(encode, s); /* OP_CCLASS can match mb-code. \D, \S */ + s += enclen_se(it, encode, s); /* OP_CCLASS can match mb-code. \D, \S */ MOP_OUT; break; case OP_CCLASS_MB: MOP_IN(OP_CCLASS_MB); - if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; + if (! ONIGENC_IS_MBC_HEAD_SE(it, encode, s)) goto fail; cclass_mb: GET_LENGTH_INC(tlen, p); { OnigCodePoint code; - UChar *ss; + OnigPosition ss; int mb_len; DATA_ENSURE(1); - mb_len = enclen(encode, s); + mb_len = enclen_se(it, encode, s); DATA_ENSURE(mb_len); ss = s; s += mb_len; - code = ONIGENC_MBC_TO_CODE(encode, ss, s); + code = ONIGENC_MBC_TO_CODE_SE(it, encode, ss, s); #ifdef PLATFORM_UNALIGNED_WORD_ACCESS if (! onig_is_in_code_range(p, code)) goto fail; #else - q = p; + UChar* q = p; ALIGNMENT_RIGHT(q); if (! 
onig_is_in_code_range(q, code)) goto fail; #endif @@ -1771,12 +1772,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS_MIX: MOP_IN(OP_CCLASS_MIX); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, s)) { + if (ONIGENC_IS_MBC_HEAD_SE(it, encode, s)) { p += SIZE_BITSET; goto cclass_mb; } else { - if (BITSET_AT(((BitSetRef )p), *s) == 0) + if (BITSET_AT(((BitSetRef )p), ONIG_CHARAT(s)) == 0) goto fail; p += SIZE_BITSET; @@ -1789,15 +1790,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS_NOT: MOP_IN(OP_CCLASS_NOT); DATA_ENSURE(1); - if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail; + if (BITSET_AT(((BitSetRef )p), ONIG_CHARAT(s)) != 0) goto fail; p += SIZE_BITSET; - s += enclen(encode, s); + s += enclen_se(it, encode, s); MOP_OUT; break; case OP_CCLASS_MB_NOT: MOP_IN(OP_CCLASS_MB_NOT); DATA_ENSURE(1); - if (! ONIGENC_IS_MBC_HEAD(encode, s)) { + if (! ONIGENC_IS_MBC_HEAD_SE(it, encode, s)) { s++; GET_LENGTH_INC(tlen, p); p += tlen; @@ -1808,24 +1809,24 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); { OnigCodePoint code; - UChar *ss; - int mb_len = enclen(encode, s); + OnigPosition ss; + int mb_len = enclen_se(it, encode, s); if (! 
DATA_ENSURE_CHECK(mb_len)) { DATA_ENSURE(1); - s = (UChar* )end; + s = end; p += tlen; goto cc_mb_not_success; } ss = s; s += mb_len; - code = ONIGENC_MBC_TO_CODE(encode, ss, s); + code = ONIGENC_MBC_TO_CODE_SE(it, encode, ss, s); #ifdef PLATFORM_UNALIGNED_WORD_ACCESS if (onig_is_in_code_range(p, code)) goto fail; #else - q = p; + UChar* q = p; ALIGNMENT_RIGHT(q); if (onig_is_in_code_range(q, code)) goto fail; #endif @@ -1838,12 +1839,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS_MIX_NOT: MOP_IN(OP_CCLASS_MIX_NOT); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_HEAD(encode, s)) { + if (ONIGENC_IS_MBC_HEAD_SE(it, encode, s)) { p += SIZE_BITSET; goto cclass_mb_not; } else { - if (BITSET_AT(((BitSetRef )p), *s) != 0) + if (BITSET_AT(((BitSetRef )p), ONIG_CHARAT(s)) != 0) goto fail; p += SIZE_BITSET; @@ -1859,15 +1860,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, OnigCodePoint code; void *node; int mb_len; - UChar *ss; + OnigPosition ss; DATA_ENSURE(1); GET_POINTER_INC(node, p); - mb_len = enclen(encode, s); + mb_len = enclen_se(it, encode, s); ss = s; s += mb_len; DATA_ENSURE(0); - code = ONIGENC_MBC_TO_CODE(encode, ss, s); + code = ONIGENC_MBC_TO_CODE_SE(it, encode, ss, s); if (onig_is_code_in_cc_len(mb_len, code, node) == 0) goto fail; } MOP_OUT; @@ -1875,16 +1876,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR: MOP_IN(OP_ANYCHAR); DATA_ENSURE(1); - n = enclen(encode, s); + n = enclen_se(it, encode, s); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, s, str, end, option, 0)) goto fail; s += n; MOP_OUT; break; case OP_ANYCHAR_ML: MOP_IN(OP_ANYCHAR_ML); DATA_ENSURE(1); - n = enclen(encode, s); + n = enclen_se(it, encode, s); DATA_ENSURE(n); s += n; MOP_OUT; @@ -1893,9 +1894,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_STAR: MOP_IN(OP_ANYCHAR_STAR); while 
(DATA_ENSURE_CHECK1) { STACK_PUSH_ALT(p, s, sprev, pkeep); - n = enclen(encode, s); + n = enclen_se(it, encode, s); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, s, str, end, option, 0)) goto fail; sprev = s; s += n; } @@ -1905,7 +1906,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_ML_STAR: MOP_IN(OP_ANYCHAR_ML_STAR); while (DATA_ENSURE_CHECK1) { STACK_PUSH_ALT(p, s, sprev, pkeep); - n = enclen(encode, s); + n = enclen_se(it, encode, s); if (n > 1) { DATA_ENSURE(n); sprev = s; @@ -1921,12 +1922,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_STAR_PEEK_NEXT: MOP_IN(OP_ANYCHAR_STAR_PEEK_NEXT); while (DATA_ENSURE_CHECK1) { - if (*p == *s) { + if (*p == ONIG_CHARAT(s)) { STACK_PUSH_ALT(p + 1, s, sprev, pkeep); } - n = enclen(encode, s); + n = enclen_se(it, encode, s); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, s, str, end, option, 0)) goto fail; sprev = s; s += n; } @@ -1936,10 +1937,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_ML_STAR_PEEK_NEXT:MOP_IN(OP_ANYCHAR_ML_STAR_PEEK_NEXT); while (DATA_ENSURE_CHECK1) { - if (*p == *s) { + if (*p == ONIG_CHARAT(s)) { STACK_PUSH_ALT(p + 1, s, sprev, pkeep); } - n = enclen(encode, s); + n = enclen_se(it, encode, s); if (n > 1) { DATA_ENSURE(n); sprev = s; @@ -1962,9 +1963,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (scv) goto fail; STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem, pkeep); - n = enclen(encode, s); + n = enclen_se(it, encode, s); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, s, str, end, option, 0)) goto fail; sprev = s; s += n; } @@ -1980,7 +1981,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if 
(scv) goto fail; STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem, pkeep); - n = enclen(encode, s); + n = enclen_se(it, encode, s); if (n > 1) { DATA_ENSURE(n); sprev = s; @@ -1997,53 +1998,53 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_WORD: MOP_IN(OP_WORD); DATA_ENSURE(1); - if (! ONIGENC_IS_MBC_WORD(encode, s, end)) + if (! ONIGENC_IS_MBC_WORD_SE(it, encode, s, end)) goto fail; - s += enclen(encode, s); + s += enclen_se(it, encode, s); MOP_OUT; break; case OP_ASCII_WORD: MOP_IN(OP_ASCII_WORD); DATA_ENSURE(1); - if (! ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + if (! ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end)) goto fail; - s += enclen(encode, s); + s += enclen_se(it, encode, s); MOP_OUT; break; case OP_NOT_WORD: MOP_IN(OP_NOT_WORD); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_WORD(encode, s, end)) + if (ONIGENC_IS_MBC_WORD_SE(it, encode, s, end)) goto fail; - s += enclen(encode, s); + s += enclen_se(it, encode, s); MOP_OUT; break; case OP_NOT_ASCII_WORD: MOP_IN(OP_NOT_ASCII_WORD); DATA_ENSURE(1); - if (ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + if (ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end)) goto fail; - s += enclen(encode, s); + s += enclen_se(it, encode, s); MOP_OUT; break; case OP_WORD_BOUND: MOP_IN(OP_WORD_BOUND); if (ON_STR_BEGIN(s)) { DATA_ENSURE(1); - if (! ONIGENC_IS_MBC_WORD(encode, s, end)) + if (! ONIGENC_IS_MBC_WORD_SE(it, encode, s, end)) goto fail; } else if (ON_STR_END(s)) { - if (! ONIGENC_IS_MBC_WORD(encode, sprev, end)) + if (! ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end)) goto fail; } else { - if (ONIGENC_IS_MBC_WORD(encode, s, end) - == ONIGENC_IS_MBC_WORD(encode, sprev, end)) + if (ONIGENC_IS_MBC_WORD_SE(it, encode, s, end) + == ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end)) goto fail; } MOP_OUT; @@ -2053,16 +2054,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ASCII_WORD_BOUND: MOP_IN(OP_ASCII_WORD_BOUND); if (ON_STR_BEGIN(s)) { DATA_ENSURE(1); - if (! 
ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + if (! ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end)) goto fail; } else if (ON_STR_END(s)) { - if (! ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + if (! ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, sprev, end)) goto fail; } else { - if (ONIGENC_IS_MBC_ASCII_WORD(encode, s, end) - == ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + if (ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end) + == ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, sprev, end)) goto fail; } MOP_OUT; @@ -2071,16 +2072,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_NOT_WORD_BOUND: MOP_IN(OP_NOT_WORD_BOUND); if (ON_STR_BEGIN(s)) { - if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) + if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD_SE(it, encode, s, end)) goto fail; } else if (ON_STR_END(s)) { - if (ONIGENC_IS_MBC_WORD(encode, sprev, end)) + if (ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end)) goto fail; } else { - if (ONIGENC_IS_MBC_WORD(encode, s, end) - != ONIGENC_IS_MBC_WORD(encode, sprev, end)) + if (ONIGENC_IS_MBC_WORD_SE(it, encode, s, end) + != ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end)) goto fail; } MOP_OUT; @@ -2089,16 +2090,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_NOT_ASCII_WORD_BOUND: MOP_IN(OP_NOT_ASCII_WORD_BOUND); if (ON_STR_BEGIN(s)) { - if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end)) goto fail; } else if (ON_STR_END(s)) { - if (ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + if (ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, sprev, end)) goto fail; } else { - if (ONIGENC_IS_MBC_ASCII_WORD(encode, s, end) - != ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + if (ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end) + != ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, sprev, end)) goto fail; } MOP_OUT; @@ -2107,18 +2108,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef 
USE_WORD_BEGIN_END case OP_WORD_BEGIN: MOP_IN(OP_WORD_BEGIN); - if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) { - if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) { + if (DATA_ENSURE_CHECK1 /*&& ONIGENC_IS_MBC_WORD_SE(it, encode, s, end)*/) { + if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end) || !ONIGENC_IS_MBC_WORD_SE(it, encode, s, end)) { MOP_OUT; continue; + } else if (ONIGENC_SCRIPT_SE(it, encode, sprev, end) != ONIGENC_SCRIPT_SE(it, encode, s, end)) { + if (!ONIGENC_IS_MBC_SINGLEBYTE_SE(it, encode, sprev, end) || !ONIGENC_IS_MBC_SINGLEBYTE_SE(it, encode, s, end)) { + MOP_OUT; + continue; + } } } goto fail; break; case OP_ASCII_WORD_BEGIN: MOP_IN(OP_ASCII_WORD_BEGIN); - if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) { - if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) { + if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end)) { + if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, sprev, end)) { MOP_OUT; continue; } @@ -2127,18 +2133,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; case OP_WORD_END: MOP_IN(OP_WORD_END); - if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_WORD(encode, sprev, end)) { - if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) { + if (!ON_STR_BEGIN(s) /*&& ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end)*/) { + if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD_SE(it, encode, s, end) || !ONIGENC_IS_MBC_WORD_SE(it, encode, sprev, end)) { MOP_OUT; continue; + } else if (ONIGENC_SCRIPT_SE(it, encode, s, end) != ONIGENC_SCRIPT_SE(it, encode, sprev, end)) { + if (!ONIGENC_IS_MBC_SINGLEBYTE_SE(it, encode, s, end) || !ONIGENC_IS_MBC_SINGLEBYTE_SE(it, encode, sprev, end)) { + MOP_OUT; + continue; + } } } goto fail; break; case OP_ASCII_WORD_END: MOP_IN(OP_ASCII_WORD_END); - if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) { - if (ON_STR_END(s) || !ONIGENC_IS_MBC_ASCII_WORD(encode, s, 
end)) { + if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, sprev, end)) { + if (ON_STR_END(s) || !ONIGENC_IS_MBC_ASCII_WORD_SE(it, encode, s, end)) { MOP_OUT; continue; } @@ -2168,12 +2179,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; continue; } - else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) + else if (ONIGENC_IS_MBC_NEWLINE_SE(it, encode, sprev, end) #ifdef USE_CRNL_AS_LINE_TERMINATOR && !(IS_NEWLINE_CRLF(option) - && ONIGENC_IS_MBC_CRNL(encode, sprev, end)) + && ONIGENC_IS_MBC_CRNL_SE(it,encode, sprev, end)) #endif - && !ON_STR_END(s)) { + /*&& !ON_STR_END(s)*/) { MOP_OUT; continue; } @@ -2183,7 +2194,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_END_LINE: MOP_IN(OP_END_LINE); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE_EX(encode, sprev, str, end, option, 1)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, sprev, str, end, option, 1)) { #endif if (IS_NOTEOL(msa->options)) goto fail; MOP_OUT; @@ -2192,7 +2203,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif } - else if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 1)) { + else if (ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, s, str, end, option, 1)) { MOP_OUT; continue; } @@ -2202,7 +2213,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_SEMI_END_BUF: MOP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE_EX(encode, sprev, str, end, option, 1)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, sprev, str, end, option, 1)) { #endif if (IS_NOTEOL(msa->options)) goto fail; MOP_OUT; @@ -2211,16 +2222,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif } - else if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 1)) { - UChar* ss = s + enclen(encode, s); + else if 
(ONIGENC_IS_MBC_NEWLINE_EX_SE(it, encode, s, str, end, option, 1)) { + OnigPosition ss = s + enclen_se(it, encode, s); if (ON_STR_END(ss)) { MOP_OUT; continue; } #ifdef USE_CRNL_AS_LINE_TERMINATOR else if (IS_NEWLINE_CRLF(option) - && ONIGENC_IS_MBC_CRNL(encode, s, end)) { - ss += enclen(encode, ss); + && ONIGENC_IS_MBC_CRNL_SE(it, encode, s, end)) { + ss += enclen_se(it, encode, ss); if (ON_STR_END(ss)) { MOP_OUT; continue; @@ -2256,7 +2267,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_MEMORY_START: MOP_IN(OP_MEMORY_START); GET_MEMNUM_INC(mem, p); - mem_start_stk[mem] = (OnigStackIndex )((void* )s); + mem_start_stk[mem] = (OnigStackIndex)(s); MOP_OUT; continue; break; @@ -2270,7 +2281,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_MEMORY_END: MOP_IN(OP_MEMORY_END); GET_MEMNUM_INC(mem, p); - mem_end_stk[mem] = (OnigStackIndex )((void* )s); + mem_end_stk[mem] = (OnigStackIndex)(s); MOP_OUT; continue; break; @@ -2293,13 +2304,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_MEMORY_END_REC: MOP_IN(OP_MEMORY_END_REC); GET_MEMNUM_INC(mem, p); - mem_end_stk[mem] = (OnigStackIndex )((void* )s); + mem_end_stk[mem] = (OnigStackIndex)s; STACK_GET_MEM_START(mem, stkp); if (BIT_STATUS_AT(reg->bt_mem_start, mem)) mem_start_stk[mem] = GET_STACK_INDEX(stkp); else - mem_start_stk[mem] = (OnigStackIndex )((void* )stkp->u.mem.pstr); + mem_start_stk[mem] = (OnigStackIndex )(stkp->u.mem.pstr); STACK_PUSH_MEM_END_MARK(mem); MOP_OUT; @@ -2322,7 +2333,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, backref: { int len; - UChar *pstart, *pend; + OnigPosition pstart, pend; /* if you want to remove following line, you should check in parse and compile time. 
*/ @@ -2333,16 +2344,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (BIT_STATUS_AT(reg->bt_mem_start, mem)) pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; else - pstart = (UChar* )((void* )mem_start_stk[mem]); + pstart = (OnigPosition)mem_start_stk[mem]; pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + : (OnigPosition)mem_end_stk[mem]); n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP(pstart, s, n); - while (sprev + (len = enclen(encode, sprev)) < s) + STRING_CMP_SE(it, pstart, s, n); + while (sprev + (len = enclen_se(it, encode, sprev)) < s) sprev += len; MOP_OUT; @@ -2354,7 +2365,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_MEMNUM_INC(mem, p); { int len; - UChar *pstart, *pend; + OnigPosition pstart, pend; /* if you want to remove following line, you should check in parse and compile time. */ @@ -2365,16 +2376,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (BIT_STATUS_AT(reg->bt_mem_start, mem)) pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; else - pstart = (UChar* )((void* )mem_start_stk[mem]); + pstart = (OnigPosition)(mem_start_stk[mem]); pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + : (OnigPosition)(mem_end_stk[mem])); n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) + STRING_CMP_IC_SE(it, case_fold_flag, pstart, &s, n); + while (sprev + (len = enclen_se(it, encode, sprev)) < s) sprev += len; MOP_OUT; @@ -2385,7 +2396,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_BACKREF_MULTI: MOP_IN(OP_BACKREF_MULTI); { int len, is_fail; - UChar *pstart, *pend, *swork; + OnigPosition pstart, pend, swork; GET_LENGTH_INC(tlen, p); for (i = 0; i < tlen; i++) { @@ -2397,19 +2408,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (BIT_STATUS_AT(reg->bt_mem_start, mem)) pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; else - pstart = (UChar* )((void* )mem_start_stk[mem]); + pstart = (OnigPosition)(mem_start_stk[mem]); pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + : (OnigPosition)(mem_end_stk[mem])); n = pend - pstart; DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE(pstart, swork, n, is_fail); + STRING_CMP_VALUE_SE(it, pstart, swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) + while (sprev + (len = enclen_se(it, encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -2424,7 +2435,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_BACKREF_MULTI_IC: MOP_IN(OP_BACKREF_MULTI_IC); { int len, is_fail; - UChar *pstart, *pend, *swork; + OnigPosition pstart, pend, swork; GET_LENGTH_INC(tlen, p); for (i = 0; i < tlen; i++) { @@ -2436,19 +2447,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (BIT_STATUS_AT(reg->bt_mem_start, mem)) pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; else - pstart = (UChar* )((void* )mem_start_stk[mem]); + pstart = 
(OnigPosition)(mem_start_stk[mem]); pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + : (OnigPosition)(mem_end_stk[mem])); n = pend - pstart; DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + STRING_CMP_VALUE_IC_SE(it, case_fold_flag, pstart, &swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) + while (sprev + (len = enclen_se(it, encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * (tlen - i - 1)); @@ -2472,9 +2483,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); sprev = s; - if (backref_match_at_nested_level(reg, stk, stk_base, ic + if (backref_match_at_nested_level(it, reg, stk, stk_base, ic , case_fold_flag, (int )level, (int )tlen, p, &s, end)) { - while (sprev + (len = enclen(encode, sprev)) < s) + while (sprev + (len = enclen_se(it, encode, sprev)) < s) sprev += len; p += (SIZE_MEMNUM * tlen); @@ -2520,8 +2531,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_NULL_CHECK(isnull, mem, s); if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d (%p)\n", - (int )mem, (int )s, s); + fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d, ptr: %p)\n", + (int )mem, (int )s, it->ptr); #endif null_check_found: /* empty loop founded, skip next instruction */ @@ -2555,8 +2566,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_NULL_CHECK_MEMST(isnull, mem, s, reg); if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d (%p)\n", - (int )mem, (int )s, s); + fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d, ptr: %p)\n", + (int )mem, (int )s, it->ptr); #endif if (isnull == -1) goto fail; goto null_check_found; @@ -2581,8 +2592,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif if (isnull) { #ifdef 
ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d (%p)\n", - (int )mem, (int )s, s); + fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d ptr: %p)\n", + (int )mem, (int )s, it->ptr); #endif if (isnull == -1) goto fail; goto null_check_found; @@ -2656,7 +2667,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH_OR_JUMP_EXACT1: MOP_IN(OP_PUSH_OR_JUMP_EXACT1); GET_RELADDR_INC(addr, p); - if (*p == *s && DATA_ENSURE_CHECK1) { + if (*p == ONIG_CHARAT(s) && DATA_ENSURE_CHECK1) { p++; STACK_PUSH_ALT(p + addr, s, sprev, pkeep); MOP_OUT; @@ -2669,7 +2680,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH_IF_PEEK_NEXT: MOP_IN(OP_PUSH_IF_PEEK_NEXT); GET_RELADDR_INC(addr, p); - if (*p == *s) { + if (*p == ONIG_CHARAT(s)) { p++; STACK_PUSH_ALT(p + addr, s, sprev, pkeep); MOP_OUT; @@ -2821,9 +2832,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_LOOK_BEHIND: MOP_IN(OP_LOOK_BEHIND); GET_LENGTH_INC(tlen, p); - s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); - if (IS_NULL(s)) goto fail; - sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + s = ONIGENC_STEP_BACK_SE(it, encode, str, s, (int )tlen); + if (ONIG_IS_BADPOS(s)) goto fail; + sprev = onigenc_get_prev_char_head_se(it, encode, str, s); MOP_OUT; continue; break; @@ -2831,8 +2842,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH_LOOK_BEHIND_NOT: MOP_IN(OP_PUSH_LOOK_BEHIND_NOT); GET_RELADDR_INC(addr, p); GET_LENGTH_INC(tlen, p); - q = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); - if (IS_NULL(q)) { + q = ONIGENC_STEP_BACK_SE(it, encode, str, s, (int )tlen); + if (ONIG_IS_BADPOS(q)) { /* too short case -> success. ex. /(? 
text_range) end = text_range; - s = (UChar* )text; + s = text; if (enc->max_enc_len == enc->min_enc_len) { int n = enc->max_enc_len; while (s < end) { - if (*s == *target) { + if (ONIG_CHARAT(s) == *target) { p = s + 1; t = target + 1; - if (target_end == t || memcmp(t, p, target_end - t) == 0) + if (target_end == t || memcmp_se(it, t, target_end, p) == 0) return s; } s += n; } - return (UChar* )NULL; + return ONIG_BADPOS; } while (s < end) { - if (*s == *target) { + if (ONIG_CHARAT(s) == *target) { p = s + 1; t = target + 1; - if (target_end == t || memcmp(t, p, target_end - t) == 0) + if (target_end == t || memcmp_se(it, t, target_end, p) == 0) return s; } - s += enclen(enc, s); + s += enclen_se(it, enc, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } static int -str_lower_case_match(OnigEncoding enc, int case_fold_flag, +str_lower_case_match_se(OnigIterator* it, OnigEncoding enc, int case_fold_flag, const UChar* t, const UChar* tend, - const UChar* p, const UChar* end) + OnigPosition p, OnigPosition end) { int lowlen; UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; while (t < tend) { - lowlen = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, lowbuf); + lowlen = ONIGENC_MBC_CASE_FOLD_SE(it, enc, case_fold_flag, &p, end, lowbuf); q = lowbuf; while (lowlen > 0) { if (*t++ != *q++) return 0; @@ -2993,103 +3017,105 @@ str_lower_case_match(OnigEncoding enc, int case_fold_flag, return 1; } -static UChar* -slow_search_ic(OnigEncoding enc, int case_fold_flag, +static OnigPosition +slow_search_ic_se(OnigIterator* it, OnigEncoding enc, int case_fold_flag, UChar* target, UChar* target_end, - const UChar* text, const UChar* text_end, UChar* text_range) + OnigPosition text, OnigPosition text_end, OnigPosition text_range) { - UChar *s, *end; + OnigPosition s, end; - end = (UChar* )text_end; + end = text_end; end -= target_end - target - 1; if (end > text_range) end = text_range; - s = (UChar* )text; + s = text; while (s < end) { - if (str_lower_case_match(enc, 
case_fold_flag, target, target_end, + if (str_lower_case_match_se(it, enc, case_fold_flag, target, target_end, s, text_end)) return s; - s += enclen(enc, s); + s += enclen_se(it, enc, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } -static UChar* -slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, - const UChar* text, const UChar* adjust_text, - const UChar* text_end, const UChar* text_start) +static OnigPosition +slow_search_backward_se(OnigIterator* it, OnigEncoding enc, UChar* target, UChar* target_end, + OnigPosition text, OnigPosition adjust_text, + OnigPosition text_end, OnigPosition text_start) { - UChar *t, *p, *s; + UChar* t; + OnigPosition p, s; - s = (UChar* )text_end; + s = text_end; s -= (target_end - target); if (s > text_start) - s = (UChar* )text_start; + s = text_start; else - s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, adjust_text, s); while (s >= text) { - if (*s == *target) { + if (ONIG_CHARAT(s) == *target) { p = s + 1; t = target + 1; while (t < target_end) { - if (*t != *p++) + if (*t != ONIG_CHARAT(p++)) break; t++; } if (t == target_end) return s; } - s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s); + s = onigenc_get_prev_char_head_se(it, enc, adjust_text, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } -static UChar* -slow_search_backward_ic(OnigEncoding enc, int case_fold_flag, +static OnigPosition +slow_search_backward_ic_se(OnigIterator* it, OnigEncoding enc, int case_fold_flag, UChar* target, UChar* target_end, - const UChar* text, const UChar* adjust_text, - const UChar* text_end, const UChar* text_start) + OnigPosition text, OnigPosition adjust_text, + OnigPosition text_end, OnigPosition text_start) { - UChar *s; + OnigPosition s; - s = (UChar* )text_end; + s = text_end; s -= (target_end - target); if (s > text_start) - s = (UChar* )text_start; + s = text_start; else - s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); + s 
= ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, enc, adjust_text, s); while (s >= text) { - if (str_lower_case_match(enc, case_fold_flag, + if (str_lower_case_match_se(it, enc, case_fold_flag, target, target_end, s, text_end)) return s; - s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s); + s = onigenc_get_prev_char_head_se(it, enc, adjust_text, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } #ifndef USE_SUNDAY_QUICK_SEARCH /* Boyer-Moore-Horspool search applied to a multibyte string */ -static UChar* -bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) +static OnigPosition +bm_search_notrev_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, + OnigPosition text_range) { - const UChar *s, *se, *t, *p, *end; + const UChar *t; + OnigPosition s, se, p, end; const UChar *tail; ptrdiff_t skip, tlen1; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); + fprintf(stderr, "bm_search_notrev_se: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); #endif tail = target_end - 1; @@ -3104,42 +3130,43 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, while (s < end) { p = se = s + tlen1; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )s; + while (ONIG_CHARAT(p) == *t) { + if (t == target) return s; p--; t--; } - skip = reg->map[*se]; - t = s; + skip = reg->map[ONIG_CHARAT(se)]; + p = s; do { - s += enclen(reg->enc, s); - } while ((s - t) < skip && s < end); + s += enclen_se(it, reg->enc, s); + } while ((s - p) < skip && s < end); } } else { while (s < end) { p = se = s + tlen1; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )s; + while (ONIG_CHARAT(p) == *t) { 
+ if (t == target) return s; p--; t--; } skip = reg->int_map[*se]; - t = s; + p = s; do { - s += enclen(reg->enc, s); - } while ((s - t) < skip && s < end); + s += enclen_se(it, reg->enc, s); + } while ((s - p) < skip && s < end); } } - return (UChar* )NULL; + return ONIG_BADPOS; } /* Boyer-Moore-Horspool search */ -static UChar* -bm_search(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, const UChar* text_range) +static OnigPosition +bm_search_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, OnigPosition text_range) { - const UChar *s, *t, *p, *end; + const UChar *t; + OnigPosition s, p, end; const UChar *tail; end = text_range + (target_end - target) - 1; @@ -3152,42 +3179,42 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, while (s < end) { p = s; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )p; + while (ONIG_CHARAT(p) == *t) { + if (t == target) return p; p--; t--; } - s += reg->map[*s]; + s += reg->map[ONIG_CHARAT(s)]; } } else { /* see int_map[] */ while (s < end) { p = s; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )p; + while (ONIG_CHARAT(p) == *t) { + if (t == target) return p; p--; t--; } s += reg->int_map[*s]; } } - return (UChar* )NULL; + return ONIG_BADPOS; } /* Boyer-Moore-Horspool search applied to a multibyte string (ignore case) */ -static UChar* -bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) +static OnigPosition +bm_search_notrev_ic(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, + OnigPosition text_range) { - const UChar *s, *se, *t, *end; + OnigPosition s, se, t, end; const UChar *tail; ptrdiff_t skip, tlen1; OnigEncoding enc = reg->enc; int case_fold_flag = 
reg->case_fold_flag; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); + fprintf(stderr, "bm_search_notrev_ic: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); #endif tail = target_end - 1; @@ -3203,11 +3230,11 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, se = s + tlen1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, s, se + 1)) - return (UChar* )s; - skip = reg->map[*se]; + return s; + skip = reg->map[ONIG_CHARAT(se)]; t = s; do { - s += enclen(reg->enc, s); + s += enclen_se(it, reg->enc, s); } while ((s - t) < skip && s < end); } } @@ -3216,31 +3243,31 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, se = s + tlen1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, s, se + 1)) - return (UChar* )s; - skip = reg->int_map[*se]; + return s; + skip = reg->int_map[ONIG_CHARAT(se)]; t = s; do { - s += enclen(reg->enc, s); + s += enclen_se(it, reg->enc, s); } while ((s - t) < skip && s < end); } } - return (UChar* )NULL; + return ONIG_BADPOS; } /* Boyer-Moore-Horspool search (ignore case) */ -static UChar* -bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, const UChar* text_range) +static OnigPosition +bm_search_ic_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, OnigPosition text_range) { - const UChar *s, *p, *end; + OnigPosition s, p, end; const UChar *tail; OnigEncoding enc = reg->enc; int case_fold_flag = reg->case_fold_flag; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); + 
fprintf(stderr, "bm_search_ic_se: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); #endif end = text_range + (target_end - target) - 1; @@ -3254,8 +3281,8 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, p = s - (target_end - target) + 1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, p, s + 1)) - return (UChar* )p; - s += reg->map[*s]; + return p; + s += reg->map[ONIG_CHARAT(s)]; } } else { /* see int_map[] */ @@ -3263,30 +3290,31 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, p = s - (target_end - target) + 1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, p, s + 1)) - return (UChar* )p; - s += reg->int_map[*s]; + return p; + s += reg->int_map[ONIG_CHARAT(s)]; } } - return (UChar* )NULL; + return ONIG_BADPOS; } #else /* USE_SUNDAY_QUICK_SEARCH */ /* Sunday's quick search applied to a multibyte string */ -static UChar* -bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) +static OnigPosition +bm_search_notrev_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, + OnigPosition text_range) { - const UChar *s, *se, *t, *p, *end; + const UChar *t; + OnigPosition s, se, p, end; const UChar *tail; ptrdiff_t skip, tlen1; OnigEncoding enc = reg->enc; - int (*mbc_enc_len)(const OnigUChar* p) = enc->mbc_enc_len; + int (*mbc_enc_len_se)(OnigIterator* it, OnigPosition p) = enc->mbc_enc_len_se; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); + fprintf(stderr, "bm_search_notrev_se: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); #endif tail = target_end - 1; @@ -3301,44 +3329,45 @@ 
bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, while (s < end) { p = se = s + tlen1; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )s; + while (ONIG_CHARAT(p) == *t) { + if (t == target) return s; p--; t--; } if (s + 1 >= end) break; - skip = reg->map[se[1]]; - t = s; + skip = reg->map[ONIG_CHARAT(se + 1)]; + p = s; do { - s += mbc_enc_len(s); - } while ((s - t) < skip && s < end); + s += mbc_enc_len_se(it, s); + } while ((s - p) < skip && s < end); } } else { while (s < end) { p = se = s + tlen1; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )s; + while (ONIG_CHARAT(p) == *t) { + if (t == target) return s; p--; t--; } if (s + 1 >= end) break; - skip = reg->int_map[se[1]]; - t = s; + skip = reg->int_map[ONIG_CHARAT(se + 1)]; + p = s; do { - s += mbc_enc_len(s); - } while ((s - t) < skip && s < end); + s += mbc_enc_len_se(it, s); + } while ((s - p) < skip && s < end); } } - return (UChar* )NULL; + return ONIG_BADPOS; } /* Sunday's quick search */ -static UChar* -bm_search(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, const UChar* text_range) +static OnigPosition +bm_search_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, OnigPosition text_range) { - const UChar *s, *t, *p, *end; + const UChar *t; + OnigPosition s, p, end; const UChar *tail; ptrdiff_t tlen1; @@ -3353,45 +3382,46 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, while (s < end) { p = s; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )p; + while (ONIG_CHARAT(p) == *t) { + if (t == target) return p; p--; t--; } if (s + 1 >= end) break; - s += reg->map[s[1]]; + s += reg->map[ONIG_CHARAT(s + 1)]; } } else { /* see int_map[] */ while (s < end) { p = s; t = tail; - while (*p == *t) { - if (t == target) return (UChar* )p; + while (ONIG_CHARAT(p) == *t) { + if (t == 
target) return p; p--; t--; } if (s + 1 >= end) break; - s += reg->int_map[s[1]]; + s += reg->int_map[ONIG_CHARAT(s + 1)]; } } - return (UChar* )NULL; + return ONIG_BADPOS; } /* Sunday's quick search applied to a multibyte string (ignore case) */ -static UChar* -bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) +static OnigPosition +bm_search_notrev_ic_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, const OnigPosition text_end, + OnigPosition text_range) { - const UChar *s, *se, *t, *end; + OnigPosition t; + OnigPosition s, se, end; const UChar *tail; ptrdiff_t skip, tlen1; OnigEncoding enc = reg->enc; - int (*mbc_enc_len)(const OnigUChar* p) = enc->mbc_enc_len; + int (*mbc_enc_len_se)(OnigIterator* it, OnigPosition p) = enc->mbc_enc_len_se; int case_fold_flag = reg->case_fold_flag; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); + fprintf(stderr, "bm_search_notrev_ic: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); #endif tail = target_end - 1; @@ -3405,49 +3435,49 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, if (IS_NULL(reg->int_map)) { while (s < end) { se = s + tlen1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, + if (str_lower_case_match_se(it, enc, case_fold_flag, target, target_end, s, se + 1)) - return (UChar* )s; + return s; if (s + 1 >= end) break; - skip = reg->map[se[1]]; + skip = reg->map[ONIG_CHARAT(se + 1)]; t = s; do { - s += mbc_enc_len(s); + s += mbc_enc_len_se(it, s); } while ((s - t) < skip && s < end); } } else { while (s < end) { se = s + tlen1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, + if 
(str_lower_case_match_se(it, enc, case_fold_flag, target, target_end, s, se + 1)) - return (UChar* )s; + return s; if (s + 1 >= end) break; - skip = reg->int_map[se[1]]; + skip = reg->int_map[ONIG_CHARAT(se + 1)]; t = s; do { - s += mbc_enc_len(s); + s += mbc_enc_len_se(it, s); } while ((s - t) < skip && s < end); } } - return (UChar* )NULL; + return ONIG_BADPOS; } /* Sunday's quick search (ignore case) */ -static UChar* -bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, const UChar* text_range) +static OnigPosition +bm_search_ic_se(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition text_end, OnigPosition text_range) { - const UChar *s, *p, *end; + OnigPosition s, p, end; const UChar *tail; ptrdiff_t tlen1; OnigEncoding enc = reg->enc; int case_fold_flag = reg->case_fold_flag; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); + fprintf(stderr, "bm_search_ic_se: text: %d, text_end: %d, text_range: %d\n", + (int )text, (int )text_end, (int )text_range); #endif tail = target_end - 1; @@ -3460,24 +3490,24 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, if (IS_NULL(reg->int_map)) { while (s < end) { p = s - tlen1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, + if (str_lower_case_match_se(it, enc, case_fold_flag, target, target_end, p, s + 1)) - return (UChar* )p; + return p; if (s + 1 >= end) break; - s += reg->map[s[1]]; + s += reg->map[ONIG_CHARAT(s + 1)]; } } else { /* see int_map[] */ while (s < end) { p = s - tlen1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, + if (str_lower_case_match_se(it, enc, case_fold_flag, target, target_end, p, s + 1)) - return (UChar* )p; + return p; if (s + 1 >= end) break; - s += 
reg->int_map[s[1]]; + s += reg->int_map[ONIG_CHARAT(s + 1)]; } } - return (UChar* )NULL; + return ONIG_BADPOS; } #endif /* USE_SUNDAY_QUICK_SEARCH */ @@ -3502,70 +3532,78 @@ set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED, return 0; } -static UChar* -bm_search_backward(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* adjust_text, - const UChar* text_end, const UChar* text_start) +static OnigPosition +bm_search_backward(OnigIterator* it, regex_t* reg, const UChar* target, const UChar* target_end, + OnigPosition text, OnigPosition adjust_text, + OnigPosition text_end, OnigPosition text_start) { - const UChar *s, *t, *p; + const UChar *t; + OnigPosition s, p; s = text_end - (target_end - target); if (text_start < s) s = text_start; else - s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, adjust_text, s); + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, reg->enc, adjust_text, s); while (s >= text) { p = s; t = target; - while (t < target_end && *p == *t) { + while (t < target_end && ONIG_CHARAT(p) == *t) { p++; t++; } if (t == target_end) - return (UChar* )s; + return s; - s -= reg->int_map_backward[*s]; - s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, adjust_text, s); + s -= reg->int_map_backward[ONIG_CHARAT(s)]; + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, reg->enc, adjust_text, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } -static UChar* -map_search(OnigEncoding enc, UChar map[], - const UChar* text, const UChar* text_range) +static OnigPosition +map_search_se(OnigIterator* it, OnigEncoding enc, UChar map[], + OnigPosition text, OnigPosition text_range) { - const UChar *s = text; + OnigPosition s = text; while (s < text_range) { - if (map[*s]) return (UChar* )s; + if (map[ONIG_CHARAT(s)]) return s; - s += enclen(enc, s); + s += enclen_se(it, enc, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } -static UChar* -map_search_backward(OnigEncoding enc, UChar map[], - const UChar* text, const UChar* 
adjust_text, - const UChar* text_start) +static OnigPosition +map_search_backward(OnigIterator* it, OnigEncoding enc, UChar map[], + OnigPosition text, OnigPosition adjust_text, + OnigPosition text_start) { - const UChar *s = text_start; + OnigPosition s = text_start; while (s >= text) { - if (map[*s]) return (UChar* )s; + if (map[ONIG_CHARAT(s)]) return s; - s = onigenc_get_prev_char_head(enc, adjust_text, s); + s = onigenc_get_prev_char_head_se(it, enc, adjust_text, s); } - return (UChar* )NULL; + return ONIG_BADPOS; } extern OnigPosition -onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, +onig_match(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, OnigPosition at, OnigRegion* region, OnigOptionType option) { - ptrdiff_t r; - UChar *prev; + return onig_match_gpos(it, reg, str, end, at, at, region, option); +} + +extern OnigPosition +onig_match_gpos(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, OnigPosition global_pos, OnigPosition at, OnigRegion* region, + OnigOptionType option) +{ + OnigPosition r; + OnigPosition prev; OnigMatchArg msa; #if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) @@ -3593,7 +3631,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On THREAD_ATOMIC_END; #endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ - MATCH_ARG_INIT(msa, option, region, at, at); + MATCH_ARG_INIT(msa, option, region, at, global_pos); #ifdef USE_COMBINATION_EXPLOSION_CHECK { int offset = at - str; @@ -3612,8 +3650,8 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On r = 0; if (r == 0) { - prev = (UChar* )onigenc_get_prev_char_head(reg->enc, str, at); - r = match_at(reg, str, end, + prev = onigenc_get_prev_char_head_se(it, reg->enc, str, at); + r = match_at(it, reg, str, end, #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE end, #endif @@ -3626,14 +3664,14 @@ onig_match(regex_t* reg, const 
UChar* str, const UChar* end, const UChar* at, On } static int -forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, - UChar* range, UChar** low, UChar** high, UChar** low_prev) +forward_search_range(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, OnigPosition s, + OnigPosition range, OnigPosition* low, OnigPosition* high, OnigPosition* low_prev) { - UChar *p, *pprev = (UChar* )NULL; + OnigPosition p, pprev = ONIG_BADPOS; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "forward_search_range: str: %d (%p), end: %d (%p), s: %d (%p), range: %d (%p)\n", - (int )str, str, (int )end, end, (int )s, s, (int )range, range); + fprintf(stderr, "forward_search_range: ptr: %p, str: %d, end: %d, s: %d, range: %d\n", + it->ptr, (int )str, (int )end, (int )s, (int )range); #endif p = s; @@ -3642,59 +3680,59 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, p += reg->dmin; } else { - UChar *q = p + reg->dmin; - while (p < q) p += enclen(reg->enc, p); + OnigPosition q = p + reg->dmin; + while (p < q) p += enclen_se(it, reg->enc, p); } } retry: switch (reg->optimize) { case ONIG_OPTIMIZE_EXACT: - p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); + p = slow_search_se(it, reg->enc, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_IC: - p = slow_search_ic(reg->enc, reg->case_fold_flag, + p = slow_search_ic_se(it, reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM: - p = bm_search(reg, reg->exact, reg->exact_end, p, end, range); + p = bm_search_se(it, reg, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM_NOT_REV: - p = bm_search_notrev(reg, reg->exact, reg->exact_end, p, end, range); + p = bm_search_notrev_se(it, reg, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM_IC: - p = bm_search_ic(reg, reg->exact, reg->exact_end, p, end, range); + p = 
bm_search_ic_se(it, reg, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC: - p = bm_search_notrev_ic(reg, reg->exact, reg->exact_end, p, end, range); + p = bm_search_notrev_ic_se(it, reg, reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_MAP: - p = map_search(reg->enc, reg->map, p, range); + p = map_search_se(it, reg->enc, reg->map, p, range); break; } - if (p && p < range) { + if (ONIG_IS_NOT_BADPOS(p) && p < range) { if (p - reg->dmin < s) { retry_gate: pprev = p; - p += enclen(reg->enc, p); + p += enclen_se(it, reg->enc, p); goto retry; } if (reg->sub_anchor) { - UChar* prev; + OnigPosition prev; switch (reg->sub_anchor) { case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); - if (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0)) + prev = onigenc_get_prev_char_head_se(it, reg->enc, + (ONIG_IS_NOT_BADPOS(pprev) ? pprev : str), p); + if (!ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, prev, str, end, reg->options, 0)) goto retry_gate; } break; @@ -3702,13 +3740,13 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, case ANCHOR_END_LINE: if (ON_STR_END(p)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - prev = (UChar* )onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); - if (prev && ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 1)) + prev = onigenc_get_prev_char_head_se(it, reg->enc, + (ONIG_IS_NOT_BADPOS(pprev) ? pprev : str), p); + if (prev && ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, prev, str, end, reg->options, 1)) goto retry_gate; #endif } - else if (! ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, p, str, end, reg->options, 1)) + else if (! 
ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, p, str, end, reg->options, 1)) goto retry_gate; break; } @@ -3718,26 +3756,26 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, *low = p; if (low_prev) { if (*low > s) - *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); + *low_prev = onigenc_get_prev_char_head_se(it, reg->enc, s, p); else - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); + *low_prev = onigenc_get_prev_char_head_se(it, reg->enc, + (ONIG_IS_NOT_BADPOS(pprev) ? pprev : str), p); } } else { if (reg->dmax != ONIG_INFINITE_DISTANCE) { *low = p - reg->dmax; if (*low > s) { - *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, - *low, (const UChar** )low_prev); - if (low_prev && IS_NULL(*low_prev)) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : s), *low); + *low = onigenc_get_right_adjust_char_head_with_prev_se(it, reg->enc, s, + *low, low_prev); + if (low_prev && ONIG_IS_BADPOS(*low_prev)) + *low_prev = onigenc_get_prev_char_head_se(it, reg->enc, + (ONIG_IS_NOT_BADPOS(pprev) ? pprev : s), *low); } else { if (low_prev) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), *low); + *low_prev = onigenc_get_prev_char_head_se(it, reg->enc, + (ONIG_IS_NOT_BADPOS(pprev) ? 
pprev : str), *low); } } } @@ -3761,12 +3799,12 @@ static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, #define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 static int -backward_search_range(regex_t* reg, const UChar* str, const UChar* end, - UChar* s, const UChar* range, UChar* adjrange, - UChar** low, UChar** high) +backward_search_range_se(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, + OnigPosition s, OnigPosition range, OnigPosition adjrange, + OnigPosition* low, OnigPosition* high) { int r; - UChar *p; + OnigPosition p; range += reg->dmin; p = s; @@ -3775,14 +3813,14 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, switch (reg->optimize) { case ONIG_OPTIMIZE_EXACT: exact_method: - p = slow_search_backward(reg->enc, reg->exact, reg->exact_end, + p = slow_search_backward_se(it, reg->enc, reg->exact, reg->exact_end, range, adjrange, end, p); break; case ONIG_OPTIMIZE_EXACT_IC: case ONIG_OPTIMIZE_EXACT_BM_IC: case ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC: - p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, + p = slow_search_backward_ic_se(it, reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, range, adjrange, end, p); break; @@ -3797,24 +3835,24 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, &(reg->int_map_backward)); if (r) return r; } - p = bm_search_backward(reg, reg->exact, reg->exact_end, range, adjrange, + p = bm_search_backward(it, reg, reg->exact, reg->exact_end, range, adjrange, end, p); break; case ONIG_OPTIMIZE_MAP: - p = map_search_backward(reg->enc, reg->map, range, adjrange, p); + p = map_search_backward(it, reg->enc, reg->map, range, adjrange, p); break; } - if (p) { + if (ONIG_IS_NOT_BADPOS(p)) { if (reg->sub_anchor) { - UChar* prev; + OnigPosition prev; switch (reg->sub_anchor) { case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, str, p); - if (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, 
reg->options, 0)) { + prev = onigenc_get_prev_char_head_se(it, reg->enc, str, p); + if (!ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, prev, str, end, reg->options, 0)) { p = prev; goto retry; } @@ -3824,17 +3862,17 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, case ANCHOR_END_LINE: if (ON_STR_END(p)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); - if (IS_NULL(prev)) goto fail; - if (ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 1)) { + prev = onigenc_get_prev_char_head_se(it, reg->enc, adjrange, p); + if (ONIG_IS_BADPOS(prev)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, prev, str, end, reg->options, 1)) { p = prev; goto retry; } #endif } - else if (! ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, p, str, end, reg->options, 1)) { - p = onigenc_get_prev_char_head(reg->enc, adjrange, p); - if (IS_NULL(p)) goto fail; + else if (! ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, p, str, end, reg->options, 1)) { + p = onigenc_get_prev_char_head_se(it, reg->enc, adjrange, p); + if (ONIG_IS_BADPOS(p)) goto fail; goto retry; } break; @@ -3845,42 +3883,42 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, if (reg->dmax != ONIG_INFINITE_DISTANCE) { *low = p - reg->dmax; *high = p - reg->dmin; - *high = onigenc_get_right_adjust_char_head(reg->enc, adjrange, *high); + *high = onigenc_get_right_adjust_char_head_se(it, reg->enc, adjrange, *high); } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: low: %d, high: %d\n", - (int )(*low - str), (int )(*high - str)); + fprintf(stderr, "backward_search_range_se: low: %d, high: %d\n", + (int )(*low), (int )(*high)); #endif return 1; /* success */ } fail: #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: fail.\n"); + fprintf(stderr, "backward_search_range_se: fail.\n"); #endif return 0; /* fail */ } extern OnigPosition -onig_search(regex_t* reg, const UChar* str, const 
UChar* end, - const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option) +onig_search(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, + OnigPosition start, OnigPosition range, OnigRegion* region, OnigOptionType option) { - return onig_search_gpos(reg, str, end, start, start, range, region, option); + return onig_search_gpos(it, reg, str, end, start, start, range, region, option); } extern OnigPosition -onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, - const UChar* global_pos, - const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option) +onig_search_gpos(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end, + OnigPosition global_pos, + OnigPosition start, OnigPosition range, OnigRegion* region, OnigOptionType option) { - ptrdiff_t r; - UChar *s, *prev; + OnigPosition r; + OnigPosition s, prev; OnigMatchArg msa; #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE - const UChar *orig_start = start; - const UChar *orig_range = range; + OnigPosition orig_start = start; + OnigPosition orig_range = range; #endif #if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) @@ -3910,8 +3948,8 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, - "onig_search (entry point): str: %d (%p), end: %d, start: %d, range: %d\n", - (int )str, str, (int )(end - str), (int )(start - str), (int )(range - str)); + "onig_search (entry point): ptr: %p, str: %d , end: %d, start: %d, range: %d\n", + it->ptr, (int )str, (int )end, (int )start, (int )range); #endif if (region @@ -3929,7 +3967,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE #define MATCH_AND_RETURN_CHECK(upper_range) \ - r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ + r = match_at(it, reg, str, end, (upper_range), s, prev, 
&msa); \ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ if (! IS_FIND_LONGEST(reg->options)) {\ @@ -3940,7 +3978,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } #else #define MATCH_AND_RETURN_CHECK(upper_range) \ - r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ + r = match_at(it, reg, str, end, (upper_range), s, prev, &msa); \ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ goto match;\ @@ -3951,7 +3989,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, #else #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE #define MATCH_AND_RETURN_CHECK(none) \ - r = match_at(reg, str, end, s, prev, &msa);\ + r = match_at(it, reg, str, end, s, prev, &msa);\ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ if (! IS_FIND_LONGEST(reg->options)) {\ @@ -3962,7 +4000,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } #else #define MATCH_AND_RETURN_CHECK(none) \ - r = match_at(reg, str, end, s, prev, &msa);\ + r = match_at(it, reg, str, end, s, prev, &msa);\ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ goto match;\ @@ -3975,7 +4013,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, /* anchor optimize: resume search range */ if (reg->anchor != 0 && str < end) { - UChar *min_semi_end, *max_semi_end; + OnigPosition min_semi_end, max_semi_end; if (reg->anchor & ANCHOR_BEGIN_POSITION) { /* search start-position only */ @@ -4001,7 +4039,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } } else if (reg->anchor & ANCHOR_END_BUF) { - min_semi_end = max_semi_end = (UChar* )end; + min_semi_end = max_semi_end = end; end_buf: if ((OnigDistance )(max_semi_end - str) < reg->anchor_dmin) @@ -4011,7 +4049,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, if ((OnigDistance )(min_semi_end - start) > reg->anchor_dmax) { start = min_semi_end - reg->anchor_dmax; if (start < end) - start = onigenc_get_right_adjust_char_head(reg->enc, str, start); + start = 
onigenc_get_right_adjust_char_head_se(it, reg->enc, str, start); } if ((OnigDistance )(max_semi_end - (range - 1)) < reg->anchor_dmin) { range = max_semi_end - reg->anchor_dmin + 1; @@ -4027,23 +4065,23 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } if ((OnigDistance )(max_semi_end - start) < reg->anchor_dmin) { start = max_semi_end - reg->anchor_dmin; - start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + start = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, reg->enc, str, start); } if (range > start) goto mismatch_no_msa; } } else if (reg->anchor & ANCHOR_SEMI_END_BUF) { - UChar* pre_end = ONIGENC_STEP_BACK(reg->enc, str, end, 1); + OnigPosition pre_end = ONIGENC_STEP_BACK_SE(it, reg->enc, str, end, 1); - max_semi_end = (UChar* )end; - if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) { + max_semi_end = end; + if (ONIGENC_IS_MBC_NEWLINE_SE(it, reg->enc, pre_end, end)) { min_semi_end = pre_end; #ifdef USE_CRNL_AS_LINE_TERMINATOR - pre_end = ONIGENC_STEP_BACK(reg->enc, str, pre_end, 1); - if (IS_NOT_NULL(pre_end) && + pre_end = ONIGENC_STEP_BACK_SE(it, reg->enc, str, pre_end, 1); + if (ONIG_IS_NOT_BADPOS(pre_end) && IS_NEWLINE_CRLF(reg->options) && - ONIGENC_IS_MBC_CRNL(reg->enc, pre_end, end)) { + ONIGENC_IS_MBC_CRNL_SE(it, reg->enc, pre_end, end)) { min_semi_end = pre_end; } #endif @@ -4052,7 +4090,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } } else { - min_semi_end = (UChar* )end; + min_semi_end = end; goto end_buf; } } @@ -4063,7 +4101,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } } else if (str == end) { /* empty string */ - static const UChar* address_for_empty_string = (UChar* )""; + static OnigPosition address_for_empty_string = 0; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search: empty string.\n"); @@ -4071,8 +4109,8 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, if (reg->threshold_len == 0) { start = end = str = address_for_empty_string; - s = 
(UChar* )start; - prev = (UChar* )NULL; + s = start; + prev = ONIG_BADPOS; MATCH_ARG_INIT(msa, option, region, start, start); #ifdef USE_COMBINATION_EXPLOSION_CHECK @@ -4087,7 +4125,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search(apply anchor): end: %d, start: %d, range: %d\n", - (int )(end - str), (int )(start - str), (int )(range - str)); + (int )end, (int )start, (int )range); #endif MATCH_ARG_INIT(msa, option, region, start, global_pos); @@ -4098,23 +4136,23 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } #endif - s = (UChar* )start; + s = start; if (range > start) { /* forward search */ if (s > str) - prev = onigenc_get_prev_char_head(reg->enc, str, s); + prev = onigenc_get_prev_char_head_se(it, reg->enc, str, s); else - prev = (UChar* )NULL; + prev = ONIG_BADPOS; if (reg->optimize != ONIG_OPTIMIZE_NONE) { - UChar *sch_range, *low, *high, *low_prev; + OnigPosition sch_range, low, high, low_prev; - sch_range = (UChar* )range; + sch_range = range; if (reg->dmax != 0) { if (reg->dmax == ONIG_INFINITE_DISTANCE) - sch_range = (UChar* )end; + sch_range = end; else { sch_range += reg->dmax; - if (sch_range > end) sch_range = (UChar* )end; + if (sch_range > end) sch_range = end; } } @@ -4123,7 +4161,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, if (reg->dmax != ONIG_INFINITE_DISTANCE) { do { - if (! forward_search_range(reg, str, end, s, sch_range, + if (! forward_search_range(it, reg, str, end, s, sch_range, &low, &high, &low_prev)) goto mismatch; if (s < low) { s = low; @@ -4132,14 +4170,14 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, while (s <= high) { MATCH_AND_RETURN_CHECK(orig_range); prev = s; - s += enclen(reg->enc, s); + s += enclen_se(it, reg->enc, s); } } while (s < range); goto mismatch; } else { /* check only. */ - if (! 
forward_search_range(reg, str, end, s, sch_range, - &low, &high, (UChar** )NULL)) goto mismatch; + if (! forward_search_range(it, reg, str, end, s, sch_range, + &low, &high, (OnigPosition* )NULL)) goto mismatch; if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { do { @@ -4147,13 +4185,13 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, msa.gpos = s; /* move \G position */ MATCH_AND_RETURN_CHECK(orig_range); prev = s; - s += enclen(reg->enc, s); + s += enclen_se(it, reg->enc, s); if ((reg->anchor & ANCHOR_LOOK_BEHIND) == 0) { - while (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0) + while (!ONIGENC_IS_MBC_NEWLINE_EX_SE(it, reg->enc, prev, str, end, reg->options, 0) && s < range) { prev = s; - s += enclen(reg->enc, s); + s += enclen_se(it, reg->enc, s); } } } while (s < range); @@ -4165,7 +4203,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, do { MATCH_AND_RETURN_CHECK(orig_range); prev = s; - s += enclen(reg->enc, s); + s += enclen_se(it, reg->enc, s); } while (s < range); if (s == range) { /* because empty match with /$/. 
*/ @@ -4175,23 +4213,23 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, else { /* backward search */ #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE if (orig_start < end) - orig_start += enclen(reg->enc, orig_start); /* is upper range */ + orig_start += enclen_se(it, reg->enc, orig_start); /* is upper range */ #endif if (reg->optimize != ONIG_OPTIMIZE_NONE) { - UChar *low, *high, *adjrange, *sch_start; + OnigPosition low, high, adjrange, sch_start; if (range < end) - adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, reg->enc, str, range); else - adjrange = (UChar* )end; + adjrange = end; if (reg->dmax != ONIG_INFINITE_DISTANCE && (end - range) >= reg->threshold_len) { do { sch_start = s + reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - if (backward_search_range(reg, str, end, sch_start, range, adjrange, + if (sch_start > end) sch_start = end; + if (backward_search_range_se(it, reg, str, end, sch_start, range, adjrange, &low, &high) <= 0) goto mismatch; @@ -4199,7 +4237,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, s = high; while (s >= low) { - prev = onigenc_get_prev_char_head(reg->enc, str, s); + prev = onigenc_get_prev_char_head_se(it, reg->enc, str, s); MATCH_AND_RETURN_CHECK(orig_start); s = prev; } @@ -4212,22 +4250,22 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, sch_start = s; if (reg->dmax != 0) { if (reg->dmax == ONIG_INFINITE_DISTANCE) - sch_start = (UChar* )end; + sch_start = end; else { sch_start += reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; + if (sch_start > end) sch_start = end; else - sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, + sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD_SE(it, reg->enc, start, sch_start); } } - if (backward_search_range(reg, str, end, sch_start, range, adjrange, + if (backward_search_range_se(it, reg, str, end, sch_start, range, adjrange, &low, &high) 
<= 0) goto mismatch; } } do { - prev = onigenc_get_prev_char_head(reg->enc, str, s); + prev = onigenc_get_prev_char_head_se(it, reg->enc, str, s); MATCH_AND_RETURN_CHECK(orig_start); s = prev; } while (s >= range); @@ -4333,3 +4371,7 @@ onig_copy_encoding(OnigEncoding to, OnigEncoding from) *to = *from; } +extern UChar +onig_default_charat (OnigPosition pos, const void* ptr) { + return ptr ? *((const UChar*)(ptr) + pos) : '\0'; +} diff --git a/src/Onigmo/reggnu.c b/src/Onigmo/reggnu.c index 33a4e4f..a12b311 100644 --- a/src/Onigmo/reggnu.c +++ b/src/Onigmo/reggnu.c @@ -62,20 +62,20 @@ re_adjust_startpos(regex_t* reg, const char* string, int size, } extern int -re_match(regex_t* reg, const char* str, int size, int pos, +re_match(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition size, OnigPosition pos, struct re_registers* regs) { - return (int )onig_match(reg, (UChar* )str, (UChar* )(str + size), - (UChar* )(str + pos), regs, ONIG_OPTION_NONE); + return (int )onig_match(it, reg, str, (str + size), + (str + pos), regs, ONIG_OPTION_NONE); } extern int -re_search(regex_t* bufp, const char* string, int size, int startpos, int range, +re_search(OnigIterator* it, regex_t* bufp, OnigPosition str, OnigPosition size, OnigPosition startpos, OnigPosition range, struct re_registers* regs) { - return (int )onig_search(bufp, (UChar* )string, (UChar* )(string + size), - (UChar* )(string + startpos), - (UChar* )(string + startpos + range), + return (int )onig_search(it, bufp, str, (str + size), + (str + startpos), + (str + startpos + range), regs, ONIG_OPTION_NONE); } diff --git a/src/Onigmo/regint.h b/src/Onigmo/regint.h index e293031..7e3478e 100644 --- a/src/Onigmo/regint.h +++ b/src/Onigmo/regint.h @@ -76,6 +76,8 @@ #define USE_SHARED_CCLASS_TABLE #define USE_SUNDAY_QUICK_SEARCH +#define USE_SHARED_UNICODE_TABLE + #define INIT_MATCH_STACK_SIZE 160 #define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */ @@ -336,6 +338,7 @@ typedef unsigned int BitStatusType; 
#define IS_POSIX_BRACKET_ALL_RANGE(option) ((option) & ONIG_OPTION_POSIX_BRACKET_ALL_RANGE) #define IS_WORD_BOUND_ALL_RANGE(option) ((option) & ONIG_OPTION_WORD_BOUND_ALL_RANGE) #define IS_NEWLINE_CRLF(option) ((option) & ONIG_OPTION_NEWLINE_CRLF) +#define IS_WHOLEWORD(option) ((option) & SE_ONIG_OPTION_WHOLEWORD) /* OP_SET_OPTION is required for these options. #define IS_DYNAMIC_OPTION(option) \ @@ -743,19 +746,19 @@ typedef struct { BBuf* mbuf; /* multi-byte info or NULL */ } CClassNode; -typedef intptr_t OnigStackIndex; +typedef OnigPosition OnigStackIndex; typedef struct _OnigStackType { unsigned int type; union { struct { UChar *pcode; /* byte code position */ - UChar *pstr; /* string position */ - UChar *pstr_prev; /* previous char position of pstr */ + OnigPosition pstr; /* string position */ + OnigPosition pstr_prev; /* previous char position of pstr */ #ifdef USE_COMBINATION_EXPLOSION_CHECK unsigned int state_check; #endif - UChar *pkeep; /* keep pattern position */ + OnigPosition pkeep; /* keep pattern position */ } state; struct { int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ @@ -767,20 +770,20 @@ typedef struct _OnigStackType { } repeat_inc; struct { int num; /* memory num */ - UChar *pstr; /* start/end position */ + OnigPosition pstr; /* start/end position */ /* Following information is set, if this stack type is MEM-START */ OnigStackIndex start; /* prev. info (for backtrack "(...)*" ) */ OnigStackIndex end; /* prev. 
info (for backtrack "(...)*" ) */ } mem; struct { int num; /* null check id */ - UChar *pstr; /* start position */ + OnigPosition pstr; /* start position */ } null_check; #ifdef USE_SUBEXP_CALL struct { UChar *ret_addr; /* byte code position */ int num; /* null check id */ - UChar *pstr; /* string position */ + OnigPosition pstr; /* string position */ } call_frame; #endif } u; @@ -791,11 +794,11 @@ typedef struct { size_t stack_n; OnigOptionType options; OnigRegion* region; - const UChar* start; /* search start position */ - const UChar* gpos; /* global position (for \G: BEGIN_POSITION) */ + OnigPosition start; /* search start position */ + OnigPosition gpos; /* global position (for \G: BEGIN_POSITION) */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE OnigPosition best_len; /* for ONIG_OPTION_FIND_LONGEST */ - UChar* best_s; + OnigPosition best_s; #endif #ifdef USE_COMBINATION_EXPLOSION_CHECK void* state_check_buff; diff --git a/src/Onigmo/regparse.c b/src/Onigmo/regparse.c index b326f17..469e8bc 100644 --- a/src/Onigmo/regparse.c +++ b/src/Onigmo/regparse.c @@ -35,7 +35,6 @@ #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS - OnigSyntaxType OnigSyntaxRuby = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | @@ -64,7 +63,7 @@ OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) - , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | + , ( /*ONIG_OPTION_ASCII_RANGE |*/ ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | ONIG_OPTION_WORD_BOUND_ALL_RANGE ) , { @@ -5341,6 +5340,51 @@ onig_free_shared_cclass_table(void) #endif /* USE_SHARED_CCLASS_TABLE */ +#ifdef USE_SHARED_UNICODE_TABLE + +extern st_table* FoldTable; /* fold-1, fold-2, fold-3 */ +extern st_table* Unfold1Table; +extern st_table* Unfold2Table; +extern st_table* Unfold3Table; +extern int CaseFoldInited; + +static int 
+i_free_shared_unicode_table(st_str_end_key* key, Node* node, void* arg ARG_UNUSED) +{ + if (IS_NOT_NULL(key)) xfree(key); + return ST_DELETE; +} + +extern int +onig_free_shared_unicode_table(void) +{ + THREAD_ATOMIC_START; + if (IS_NOT_NULL(FoldTable)) { + onig_st_free_table(FoldTable); + FoldTable = NULL; + } + + if (IS_NOT_NULL(Unfold1Table)) { + onig_st_free_table(Unfold1Table); + Unfold1Table = NULL; + } + + if (IS_NOT_NULL(Unfold2Table)) { + onig_st_free_table(Unfold2Table); + Unfold2Table = NULL; + } + + if (IS_NOT_NULL(Unfold3Table)) { + onig_st_free_table(Unfold3Table); + Unfold3Table = NULL; + } + CaseFoldInited = 0; + + THREAD_ATOMIC_END; + return 0; +} + +#endif // USE_SHARED_UNICODE_TABLE #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS static int diff --git a/src/Onigmo/regparse.h b/src/Onigmo/regparse.h index bb584ad..1a1196b 100644 --- a/src/Onigmo/regparse.h +++ b/src/Onigmo/regparse.h @@ -347,6 +347,10 @@ extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_make_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); extern int onig_free_shared_cclass_table P_((void)); +#ifdef USE_SHARED_UNICODE_TABLE +extern int onig_free_shared_unicode_table P_((void)); +#endif //USE_SHARED_UNICODE_TABLE + #ifdef ONIG_DEBUG #ifdef USE_NAMED_GROUP extern int onig_print_names(FILE*, regex_t*); diff --git a/src/Onigmo/regposerr.c b/src/Onigmo/regposerr.c index 56f75ab..e6ffcc0 100644 --- a/src/Onigmo/regposerr.c +++ b/src/Onigmo/regposerr.c @@ -27,7 +27,9 @@ * SUCH DAMAGE. 
*/ -#include "config.h" +#define regex_t onig_regex_t +#include "regint.h" +#undef regex_t #include "onigposix.h" #ifdef HAVE_STRING_H diff --git a/src/Onigmo/regposix.c b/src/Onigmo/regposix.c index 41c69ef..30f1155 100644 --- a/src/Onigmo/regposix.c +++ b/src/Onigmo/regposix.c @@ -48,6 +48,18 @@ } \ } while(0) +/* #define ENC_STRING_LEN_SE(enc,s,len) len = strlen(s) */ +#define ENC_STRING_LEN_SE(enc,s,len) do { \ + if (ONIGENC_MBC_MINLEN(enc) == 1) { \ + OnigPosition tmps = s; \ + while (ONIG_CHARAT(tmps) != 0) tmps++; \ + len = tmps - s; \ + } \ + else { \ + len = onigenc_str_bytelen_null_se(it, enc, s); \ + } \ +} while(0) + typedef struct { int onig_err; int posix_err; @@ -163,12 +175,12 @@ regcomp(regex_t* reg, const char* pattern, int posix_options) return 0; } -extern int -regexec(regex_t* reg, const char* str, size_t nmatch, +extern OnigPosition +regexec(OnigIterator* it, regex_t* reg, OnigPosition str, size_t nmatch, regmatch_t pmatch[], int posix_options) { - int r, i, len; - UChar* end; + OnigPosition r, i, len; + OnigPosition end; regmatch_t* pm; OnigOptionType options; @@ -190,9 +202,9 @@ regexec(regex_t* reg, const char* str, size_t nmatch, pm = pmatch; } - ENC_STRING_LEN(ONIG_C(reg)->enc, str, len); - end = (UChar* )(str + len); - r = (int )onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end, + ENC_STRING_LEN_SE(ONIG_C(reg)->enc, str, len); + end = str + len; + r = onig_search(it, ONIG_C(reg), str, end, str, end, (OnigRegion* )pm, options); if (r >= 0) { @@ -207,7 +219,7 @@ regexec(regex_t* reg, const char* str, size_t nmatch, pmatch[i].rm_so = pmatch[i].rm_eo = ONIG_REGION_NOTPOS; } else { - r = onig2posix_error_code(r); + r = onig2posix_error_code((int)r); } if (pm != pmatch && pm != NULL) diff --git a/src/Onigmo/regsyntax.c b/src/Onigmo/regsyntax.c index ec8c9eb..cf03592 100644 --- a/src/Onigmo/regsyntax.c +++ b/src/Onigmo/regsyntax.c @@ -46,6 +46,22 @@ OnigSyntaxType OnigSyntaxASIS = { } }; +OnigSyntaxType OnigSyntaxWildChar = { + 
ONIG_SYN_OP_VARIABLE_META_CHARACTERS + , 0 + , 0 + , ONIG_OPTION_SINGLELINE + , + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )'?' /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )'*' /* anychar anytime */ + } +}; + OnigSyntaxType OnigSyntaxPosixBasic = { ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_BRACE_INTERVAL ) diff --git a/src/Onigmo/sample/crnl.c b/src/Onigmo/sample/crnl.c index ffff915..089f213 100644 --- a/src/Onigmo/sample/crnl.c +++ b/src/Onigmo/sample/crnl.c @@ -9,12 +9,12 @@ #include #include "oniguruma.h" -/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ +#define USE_UNICODE_ALL_LINE_TERMINATORS static int nfail = 0; -static void result(int no, int from, int to, - int expected_from, int expected_to) +static void result(int no, OnigPosition from, OnigPosition to, + OnigPosition expected_from, OnigPosition expected_to) { fprintf(stderr, "%3d: ", no); if (from == expected_from && to == expected_to) { @@ -22,7 +22,7 @@ static void result(int no, int from, int to, } else { fprintf(stderr, "Fail: expected: (%d-%d), result: (%d-%d)\n", - expected_from, expected_to, from, to); + (int)expected_from, (int)expected_to, (int)from, (int)to); nfail++; } @@ -32,12 +32,13 @@ static int x0(int no, char* pattern_arg, char* str_arg, int start_offset, int expected_from, int expected_to, int backward) { - int r; - unsigned char *start, *range, *end; + OnigPosition r; + OnigPosition start, range, end; regex_t* reg; OnigErrorInfo einfo; OnigRegion *region; UChar *pattern, *str; + OnigIterator it = {onig_default_charat, str_arg}; pattern = (UChar* )pattern_arg; str = (UChar* )str_arg; @@ -53,16 +54,16 @@ x0(int no, char* pattern_arg, char* str_arg, region = onig_region_new(); - end = str + strlen((char* )str); + end = strlen((char* )str); if 
(backward) { start = end + start_offset; - range = str; + range = 0; } else { - start = str + start_offset; + start = start_offset; range = end; } - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + r = onig_search(&it, reg, 0, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0 || r == ONIG_MISMATCH) { result(no, region->beg[0], region->end[0], expected_from, expected_to); } diff --git a/src/Onigmo/sample/encode.c b/src/Onigmo/sample/encode.c index 55aed3f..aee4511 100644 --- a/src/Onigmo/sample/encode.c +++ b/src/Onigmo/sample/encode.c @@ -5,24 +5,24 @@ #include "oniguruma.h" static int -search(regex_t* reg, unsigned char* str, unsigned char* end) +search(OnigIterator* it, regex_t* reg, OnigPosition str, OnigPosition end) { - int r; - unsigned char *start, *range; + OnigPosition r; + OnigPosition start, range; OnigRegion *region; region = onig_region_new(); start = str; range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + r = onig_search(it, reg, str, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { int i; - fprintf(stderr, "match at %d (%s)\n", r, + fprintf(stderr, "match at %d (%s)\n", (int)r, ONIGENC_NAME(onig_get_encoding(reg))); for (i = 0; i < region->num_regs; i++) { - fprintf(stderr, "%d: (%ld-%ld)\n", i, region->beg[i], region->end[i]); + fprintf(stderr, "%d: (%ld-%ld)\n", i, (int)region->beg[i], (int)region->end[i]); } } else if (r == ONIG_MISMATCH) { @@ -45,13 +45,15 @@ static int exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) { - int r; - unsigned char *end; + OnigPosition r; + OnigPosition end; regex_t* reg; OnigErrorInfo einfo; UChar* pattern = (UChar* )apattern; UChar* str = (UChar* )astr; + OnigIterator it = {onig_default_charat, str}; + onig_init(); r = onig_new(®, pattern, pattern + onigenc_str_bytelen_null(enc, pattern), options, enc, ONIG_SYNTAX_DEFAULT, &einfo); @@ -62,8 +64,8 @@ exec(OnigEncoding enc, OnigOptionType options, return -1; } - 
end = str + onigenc_str_bytelen_null(enc, str); - r = search(reg, str, end); + end = onigenc_str_bytelen_null(enc, str); + r = search(&it, reg, 0, end); onig_free(reg); onig_end(); @@ -84,13 +86,14 @@ static int exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, OnigOptionType options, char* apattern, char* astr) { - int r; - unsigned char *end; + OnigPosition r; + OnigPosition end; regex_t* reg; OnigCompileInfo ci; OnigErrorInfo einfo; UChar* pattern = (UChar* )apattern; UChar* str = (UChar* )astr; + OnigIterator it = {onig_default_charat, str}; ci.num_of_elements = 5; ci.pattern_enc = pattern_enc; @@ -109,8 +112,8 @@ exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, return -1; } - end = str + onigenc_str_bytelen_null(str_enc, str); - r = search(reg, str, end); + end = onigenc_str_bytelen_null(str_enc, str); + r = search(&it, reg, 0, end); onig_free(reg); onig_end(); diff --git a/src/Onigmo/sample/listcap.c b/src/Onigmo/sample/listcap.c index 547eff9..0bff374 100644 --- a/src/Onigmo/sample/listcap.c +++ b/src/Onigmo/sample/listcap.c @@ -20,18 +20,19 @@ node_callback(int group, OnigPosition beg, OnigPosition end, int level, for (i = 0; i < level * 2; i++) fputc(' ', stderr); - fprintf(stderr, "%d: (%ld-%ld)\n", group, beg, end); + fprintf(stderr, "%d: (%ld-%ld)\n", group, (int)beg, (int)end); return 0; } extern int ex(unsigned char* str, unsigned char* pattern, OnigSyntaxType* syntax) { - int r; - unsigned char *start, *range, *end; + OnigPosition r; + OnigPosition start, range, end; regex_t* reg; OnigErrorInfo einfo; OnigRegion *region; + OnigIterator it = {onig_default_charat, str}; r = onig_new(®, pattern, pattern + strlen((char* )pattern), ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo); @@ -48,16 +49,16 @@ extern int ex(unsigned char* str, unsigned char* pattern, region = onig_region_new(); - end = str + strlen((char* )str); - start = str; + end = strlen((char* )str); + start = 0; range = end; - r = onig_search(reg, str, end, 
start, range, region, ONIG_OPTION_NONE); + r = onig_search(&it, reg, 0, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { int i; - fprintf(stderr, "match at %d\n", r); + fprintf(stderr, "match at %d\n", (int)r); for (i = 0; i < region->num_regs; i++) { - fprintf(stderr, "%d: (%ld-%ld)\n", i, region->beg[i], region->end[i]); + fprintf(stderr, "%d: (%ld-%ld)\n", i, (long)region->beg[i], (long)region->end[i]); } fprintf(stderr, "\n"); diff --git a/src/Onigmo/sample/names.c b/src/Onigmo/sample/names.c index f6ca2f8..ea125af 100644 --- a/src/Onigmo/sample/names.c +++ b/src/Onigmo/sample/names.c @@ -19,21 +19,22 @@ name_callback(const UChar* name, const UChar* name_end, ref = onig_name_to_backref_number(reg, name, name_end, region); s = (ref == gn ? "*" : ""); fprintf(stderr, "%s (%d): ", name, gn); - fprintf(stderr, "(%ld-%ld) %s\n", region->beg[gn], region->end[gn], s); + fprintf(stderr, "(%ld-%ld) %s\n", (long)region->beg[gn], (long)region->end[gn], s); } return 0; /* 0: continue */ } extern int main(int argc, char* argv[]) { - int r; - unsigned char *start, *range, *end; + OnigPosition r; + OnigPosition start, range, end; regex_t* reg; OnigErrorInfo einfo; OnigRegion *region; static UChar* pattern = (UChar* )"(?a*)(?b*)(?c*)"; static UChar* str = (UChar* )"aaabbbbcc"; + OnigIterator it = {onig_default_charat, str}; r = onig_new(&reg, pattern, pattern + strlen((char* )pattern), ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, ONIG_SYNTAX_DEFAULT, &einfo); @@ -48,10 +49,10 @@ extern int main(int argc, char* argv[]) region = onig_region_new(); - end = str + strlen((char* )str); - start = str; + end = strlen((char* )str); + start = 0; range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + r = onig_search(&it, reg, 0, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { fprintf(stderr, "match at %d\n\n", r); r = onig_foreach_name(reg, name_callback, (void* )region); diff --git a/src/Onigmo/sample/posix.c b/src/Onigmo/sample/posix.c
index d24ee35..d4b6364 100644 --- a/src/Onigmo/sample/posix.c +++ b/src/Onigmo/sample/posix.c @@ -2,6 +2,10 @@ * posix.c */ #include <stdio.h> +#define regex_t onig_regex_t +#include "regint.h" +#undef regex_t + #include "onigposix.h" typedef unsigned char UChar; @@ -11,8 +15,9 @@ static int x(regex_t* reg, unsigned char* pattern, unsigned char* str) int r, i; char buf[200]; regmatch_t pmatch[20]; + OnigIterator it = {onig_default_charat, str}; - r = regexec(reg, (char* )str, reg->re_nsub + 1, pmatch, 0); + r = (int)regexec(&it, reg, 0, reg->re_nsub + 1, pmatch, 0); if (r != 0 && r != REG_NOMATCH) { regerror(r, reg, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); diff --git a/src/Onigmo/sample/simple.c b/src/Onigmo/sample/simple.c index 19c92a1..31fe5f1 100644 --- a/src/Onigmo/sample/simple.c +++ b/src/Onigmo/sample/simple.c @@ -7,14 +7,15 @@ extern int main(int argc, char* argv[]) { - int r; - unsigned char *start, *range, *end; + OnigPosition r; + OnigPosition start, range, end; regex_t* reg; OnigErrorInfo einfo; OnigRegion *region; static UChar* pattern = (UChar* )"a(.*)b|[e-f]+"; static UChar* str = (UChar* )"zzzzaffffffffb"; + OnigIterator it = {onig_default_charat, str}; r = onig_new(&reg, pattern, pattern + strlen((char* )pattern), ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, ONIG_SYNTAX_DEFAULT, &einfo); @@ -27,16 +28,16 @@ extern int main(int argc, char* argv[]) region = onig_region_new(); - end = str + strlen((char* )str); - start = str; + end = strlen((char* )str); + start = 0; range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + r = onig_search(&it, reg, 0, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { int i; fprintf(stderr, "match at %d\n", r); for (i = 0; i < region->num_regs; i++) { - fprintf(stderr, "%d: (%ld-%ld)\n", i, region->beg[i], region->end[i]); + fprintf(stderr, "%d: (%ld-%ld)\n", i, (long)region->beg[i], (long)region->end[i]); } } else if (r == ONIG_MISMATCH) { diff --git a/src/Onigmo/sample/sql.c
b/src/Onigmo/sample/sql.c index 15ae762..8a91652 100644 --- a/src/Onigmo/sample/sql.c +++ b/src/Onigmo/sample/sql.c @@ -9,14 +9,15 @@ extern int main(int argc, char* argv[]) { static OnigSyntaxType SQLSyntax; - int r; - unsigned char *start, *range, *end; + OnigPosition r; + OnigPosition start, range, end; regex_t* reg; OnigErrorInfo einfo; OnigRegion *region; static UChar* pattern = (UChar* )"\\_%\\\\__zz"; static UChar* str = (UChar* )"a_abcabcabc\\ppzz"; + OnigIterator it = {onig_default_charat, str}; onig_set_syntax_op (&SQLSyntax, ONIG_SYN_OP_VARIABLE_META_CHARACTERS); onig_set_syntax_op2 (&SQLSyntax, 0); @@ -44,16 +45,16 @@ extern int main(int argc, char* argv[]) region = onig_region_new(); - end = str + strlen((char* )str); - start = str; + end = strlen((char* )str); + start = 0; range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + r = onig_search(&it, reg, 0, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { int i; fprintf(stderr, "match at %d\n", r); for (i = 0; i < region->num_regs; i++) { - fprintf(stderr, "%d: (%ld-%ld)\n", i, region->beg[i], region->end[i]); + fprintf(stderr, "%d: (%ld-%ld)\n", i, (long)region->beg[i], (long)region->end[i]); } } else if (r == ONIG_MISMATCH) { diff --git a/src/Onigmo/sample/syntax.c b/src/Onigmo/sample/syntax.c index 1384978..b1f4105 100644 --- a/src/Onigmo/sample/syntax.c +++ b/src/Onigmo/sample/syntax.c @@ -8,13 +8,14 @@ extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) { - int r; - unsigned char *start, *range, *end; + OnigPosition r; + OnigPosition start, range, end; regex_t* reg; OnigErrorInfo einfo; OnigRegion *region; UChar* pattern = (UChar* )apattern; UChar* str = (UChar* )astr; + OnigIterator it = {onig_default_charat, str}; r = onig_new(&reg, pattern, pattern + strlen((char* )pattern), ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo); @@ -27,16 +28,16 @@ extern int exec(OnigSyntaxType* syntax, region = onig_region_new(); - end = str +
strlen((char* )str); - start = str; + end = strlen((char* )str); + start = 0; range = end; - r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + r = onig_search(&it, reg, 0, end, start, range, region, ONIG_OPTION_NONE); if (r >= 0) { int i; fprintf(stderr, "match at %d\n", r); for (i = 0; i < region->num_regs; i++) { - fprintf(stderr, "%d: (%ld-%ld)\n", i, region->beg[i], region->end[i]); + fprintf(stderr, "%d: (%ld-%ld)\n", i, (long)region->beg[i], (long)region->end[i]); } } else if (r == ONIG_MISMATCH) { diff --git a/src/Onigmo/testc.c b/src/Onigmo/testc.c index 4d8ef7a..94f0129 100644 --- a/src/Onigmo/testc.c +++ b/src/Onigmo/testc.c @@ -33,7 +33,7 @@ static OnigRegion* region; static void xx(char* pattern, char* str, int from, int to, int mem, int not) { - int r; + OnigPosition r; #ifdef POSIX_TEST regex_t reg; @@ -89,6 +89,7 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) regex_t* reg; OnigErrorInfo einfo; OnigSyntaxType syn = *ONIG_SYNTAX_DEFAULT; + OnigIterator it = {onig_default_charat, str}; /* ONIG_OPTION_OFF(syn.options, ONIG_OPTION_ASCII_RANGE); */ @@ -102,8 +103,8 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) return ; } - r = onig_search(reg, (UChar* )str, (UChar* )(str + SLEN(str)), - (UChar* )str, (UChar* )(str + SLEN(str)), + r = onig_search(&it, reg, 0, SLEN(str), + 0, SLEN(str), region, ONIG_OPTION_NONE); if (r < ONIG_MISMATCH) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; diff --git a/src/Onigmo/testu.c b/src/Onigmo/testu.c index 0024d55..2405c8f 100644 --- a/src/Onigmo/testu.c +++ b/src/Onigmo/testu.c @@ -57,7 +57,7 @@ static void uconv(char* from, char* to, int len) static void xx(char* pattern, char* str, int from, int to, int mem, int not) { - int r; + OnigPosition r; char cpat[4000], cstr[4000]; #ifdef POSIX_TEST @@ -118,6 +118,7 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) OnigCompileInfo ci; OnigErrorInfo einfo; OnigSyntaxType
syn = *ONIG_SYNTAX_DEFAULT; + OnigIterator it = {onig_default_charat, str}; /* ONIG_OPTION_OFF(syn.options, ONIG_OPTION_ASCII_RANGE); */ @@ -148,8 +149,8 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) return ; } - r = onig_search(reg, (UChar* )str, (UChar* )(str + ulen(str)), - (UChar* )str, (UChar* )(str + ulen(str)), + r = onig_search(&it, reg, 0, ulen(str), + 0, ulen(str), region, ONIG_OPTION_NONE); if (r < ONIG_MISMATCH) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; diff --git a/src/Onigmo/win32/testc.c b/src/Onigmo/win32/testc.c index c8f484a..b315fe1 100644 --- a/src/Onigmo/win32/testc.c +++ b/src/Onigmo/win32/testc.c @@ -33,7 +33,7 @@ static OnigRegion* region; static void xx(char* pattern, char* str, int from, int to, int mem, int not) { - int r; + OnigPosition r; #ifdef POSIX_TEST regex_t reg; @@ -89,6 +89,7 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) regex_t* reg; OnigErrorInfo einfo; OnigSyntaxType syn = *ONIG_SYNTAX_DEFAULT; + OnigIterator it = {onig_default_charat, str}; /* ONIG_OPTION_OFF(syn.options, ONIG_OPTION_ASCII_RANGE); */ @@ -102,8 +103,8 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) return ; } - r = onig_search(reg, (UChar* )str, (UChar* )(str + SLEN(str)), - (UChar* )str, (UChar* )(str + SLEN(str)), + r = onig_search(&it, reg, 0, SLEN(str), + 0, SLEN(str), region, ONIG_OPTION_NONE); if (r < ONIG_MISMATCH) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; diff --git a/src/README.md b/src/README.md index e69de29..4c964ef 100644 --- a/src/README.md +++ b/src/README.md @@ -0,0 +1,7 @@ +# Components + +Below is a list of (some) WindTerm components in alphabetical order, along with a brief description of each. + +## Onigmo + +An improved version of Onigmo 5.13.5. In particular, the addition of an iterator makes it possible to match against a gap buffer or non-adjacent memory blocks. Please refer to the sample files for usage examples. \ No newline at end of file