diff options
Diffstat (limited to 'ext/mbstring/oniguruma')
61 files changed, 5363 insertions, 1126 deletions
diff --git a/ext/mbstring/oniguruma/AUTHORS b/ext/mbstring/oniguruma/AUTHORS new file mode 100644 index 000000000..93167bd43 --- /dev/null +++ b/ext/mbstring/oniguruma/AUTHORS @@ -0,0 +1 @@ +sndgk393 AT ybb DOT ne DOT jp (K.Kosako) diff --git a/ext/mbstring/oniguruma/COPYING b/ext/mbstring/oniguruma/COPYING index ed3fa53b2..4d321bb93 100644 --- a/ext/mbstring/oniguruma/COPYING +++ b/ext/mbstring/oniguruma/COPYING @@ -1,4 +1,4 @@ -OniGuruma LICENSE +Oniguruma LICENSE ----------------- When this software is partly used or it is distributed with Ruby, @@ -6,7 +6,7 @@ this of Ruby follows the license of Ruby. It follows the BSD license in the case of the one except for it. /*- - * Copyright (c) 2002-2004 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/ext/mbstring/oniguruma/HISTORY b/ext/mbstring/oniguruma/HISTORY index c648c5455..6c824a697 100644 --- a/ext/mbstring/oniguruma/HISTORY +++ b/ext/mbstring/oniguruma/HISTORY @@ -1,5 +1,364 @@ History +2006/09/19: Version 4.4.4 + +2006/09/19: [test] success in ruby 1.9.0 (2006-08-22) [i686-linux]. +2006/09/19: [impl] (thanks KOYAMA Tetsuji) + HAVE_STDARG_PROTOTYPES was not defined in Mac OS X + by Xcode 2.4(gcc 4.0.1) problem. [php-dev 1312] etc... + +2006/09/15: Version 4.4.3 + +2006/09/15: [test] success in ruby 1.9.0 (2006-08-22) [i686-linux]. +2006/09/15: [bug] (thanks Allan Odgaard) + out of range access in bm_search_notrev(). + (p < s) + +2006/09/08: Version 4.4.2 + +2006/09/08: [test] success in ruby 1.9.0 (2006-08-22) [i686-linux]. +2006/09/08: [bug] (thanks K.Takata) + out of range access in bm_search_notrev(). +2006/09/04: [spec] (thanks K.Takata) + allow look-behind in negative look-behind. + ex. /(?<!(?<=a)b|c)d/ + +2006/08/29: Version 4.4.1 + +2006/08/29: [test] success in ruby 1.9.0 (2006-08-22) [i686-linux]. 
+2006/08/29: [dist] (thanks Seiji Masugata) + add configure option --enable-combination-explosion-check + +2006/08/25: Version 4.4.0 + +2006/08/25: [test] success in ruby 1.9.0 (2006-08-22) [i686-linux]. +2006/08/25: [impl] add_state_check_num() should be enclosed in + ifdef USE_COMBINATION_EXPLOSION_CHECK. +2006/08/23: [spec] config USE_COMBINATION_EXPLOSION_CHECK is enabled + in Ruby mode only. +2006/08/22: [impl] remove last line comma in enum OpCode. +2006/08/22: [impl] remove OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT and + OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT. +2006/08/22: [impl] remove OP_BACKREF3. + +2006/08/21: Version 4.3.1 + +2006/08/21: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/21: [impl] change stack type values + and re-define STK_MASK_TO_VOID_TARGET etc... +2006/08/21: [impl] set repeat_range[].upper to 0x7fffffff as infinite. +2006/08/21: [impl] add STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE. +2006/08/21: [impl] reduce (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} +2006/08/21: [impl] reduce (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} + if backreference is not used. +2006/08/17: [bug] should check scan_env.num_call > 0 for backrefed pattern + in combination explosion check. + +2006/08/17: Version 4.3.0 + +2006/08/17: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/17: [new] add config USE_COMBINATION_EXPLOSION_CHECK. + check /(.+)*/, /(\s*foo\s*)*/ etc... + [API] add num_comb_exp_check member in regex_t. + [dist] change LTVERSION value to "1:0:0" in configure.in. +2006/08/15: [bug] OP_REPEAT_INC process in match_at(). + should check repeat-count >= range-upper and + range-upper may be infinite. + +2006/08/11: Version 4.2.3 + +2006/08/11: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/10: [impl] remove double call in set_qualifier(). +2006/08/10: [impl] remove by_number member in QualifierNode. +2006/08/09: [impl] remove a comma at the end of enum ReduceType + for escape warning on Mac OS X. 
+2006/08/07: [impl] remove warning in regcomp.c. +2006/08/07: [spec] move definition of USE_BACKREF_AT_LEVEL into NOT_RUBY. + +2006/08/03: Version 4.2.2 + +2006/08/03: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/03: [bug] (thanks Hiroyuki Yamamoto) + segmentation fault in regexec(). (POSIX API) +2006/08/02: [bug] combination of \G in look-ahead/look-behind and other + anchors(\A, \z, \Z) cause invalid result. + ex. /(?!\G)a\z/.match("ba") + start arg. of MATCH_ARG_INIT() should be original + arg. of onig_search(). + +2006/07/31: Version 4.2.1 + +2006/07/31: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/07/31: [bug] (thanks Kimura Minoru) + re-implement bm_search_notrev(). +2006/07/31: [impl] bm_search_notrev() refactoring. +2006/07/31: [bug] (thanks Kimura Minoru) + fix incomplete multibyte string in exact info. +2006/07/31: [impl] (thanks Seiji Masugata) + remove cast in va_init_list() for Intel C Compiler. + +2006/07/18: Version 4.2.0 + +2006/07/18: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/07/18: [new] (thanks Wolfgang Nadasi-Donner) + add back reference with nest level. + \k<name+n>, \k<name-n> +2006/07/11: [impl] change long to unsigned long for ONIG_OPTION_XXX + and ONIG_SYN_XXX number literals. + +2006/07/03: Version 4.1.2 + +2006/07/03: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/07/03: [spec] (thanks Wolfgang Nadasi-Donner) + allow \G in look-behind. + add ANCHOR_BEGIN_POSITION flag in setup_tree(). +2006/06/12: [impl] (thanks matz) + fix cast from char* to const char* + in onig_snprintf_with_pattern(). + fix cast from char* to const char* + for PopularQStr[] and ReduceQStr[]. + +2006/05/22: Version 4.1.1 + +2006/05/22: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/05/22: [impl] add position string argument to STACK_BASE_CHECK(). +2006/05/22: [bug] (thanks NARUSE, Yui) + add STK_NULL_CHECK_END to IS_TO_VOID_TARGET(). + ex. 
core dump in + /(?<pare>\(([^\(\)]++|\g<pare>)*+\))/.match('((a))') + +2006/05/15: Version 4.1.0 + +2006/05/15: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/05/15: [impl] thread atomic changes for onig_end() and + onig_free_node_list(). +2006/05/15: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/05/15: [dist] update API, API.ja, FAQ, FAQ.ja. +2006/05/15: [spec] remove onig_recompile(), onig_recompile_deluxe() + and re_recompile_pattern(). + add config USE_RECOMPILE_API. +2006/05/15: [impl] improved thread safe implementation of onig_search() + and onig_match(). + +2006/05/11: Version 4.0.4 + +2006/05/11: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/05/11: [bug] (thanks Yuji Kaneda) + dead-lock in onig_end(). +2006/05/11: [dist] update index.html. + +2006/05/08: Version 4.0.3 + +2006/05/08: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/05/08: [bug] (thanks Allan Odgaard) + Segmentation fault in backward search. + ex. /^\t.*$/ +2006/04/18: [dist] update index.html. +2006/04/05: [dist] update index.html. +2006/03/24: [dist] update doc/RE, doc/RE.ja. + +2006/03/23: Version 4.0.2 + +2006/03/22: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. +2006/03/22: [impl] add both of ONIG_OPTION_DONT_CAPTURE_GROUP + and ONIG_OPTION_CAPTURE_GROUP check. +2006/03/22: [spec] add error code ONIGERR_INVALID_COMBINATION_OF_OPTIONS. +2006/03/22: [impl] remove USE_NAMED_GROUP condition from + ONIG_OPTION_DONT_CAPTURE_GROUP check in parse_effect(). +2006/03/22: [new] add API onig_noname_group_capture_is_active(). +2006/03/01: [spec] rename regex object type from regex_t to OnigRegexType. + add typedef OnigRegexType regex_t + unless ONIG_ESCAPE_REGEX_T_COLLISION is defined. +2006/02/27: [spec] change ONIG_MAX_MULTI_BYTE_RANGES_NUM from 1000 + to 10000. (for docdiff program) +2006/02/17: [dist] change COPYING year 2005 -> 2006. 
+ +2006/02/07: Version 4.0.1 + +2006/02/07: [test] success in ruby 1.9.0 (2005-11-28) [i686-linux]. +2006/02/07: [bug] memory leaks in onig_free_shared_cclass_table(). +2006/02/03: [ruby] add -m 0644 option to install command in "make 19". +2006/02/03: [impl] rename ANCHOR_ANYCHAR_STAR_PL to ANCHOR_ANYCHAR_STAR_ML. + change from IS_POSIXLINE() to IS_MULTILINE() + for ANCHOR_ANYCHAR_START/_ML decision + in optimize_node_left(). +2006/01/26: [dist] update index.html for Oniguruma 2.5.3. +2006/01/25: [dist] update URL in index.html. + +2006/01/24: Version 4.0.0 + +2006/01/24: [test] success in ruby 1.9.0 (2005-11-28) [i386-cygwin]. +2006/01/24: [test] success in ruby 1.9.0 (2005-11-28) [i686-linux]. +2006/01/24: [dist] remove warnings from sample/encode.c. +2006/01/24: [dist] change install description in README(.ja). +2006/01/24: [dist] remove re.c.XXX.patch from distribution and CVS. +2006/01/24: [dist] --- support shared library --- + use GNU libtool/automake. + change configure.in and add Makefile.am, sample/Makefile.am. + add AUTHORS file. +2006/01/24: [dist] test programs return exit code -1 when test fails. +2006/01/24: [bug] (thanks KIMURA Koichi) + invalid syntax definition in ONIG_SYNTAX_GREP. + ONIG_SYN_OP_BRACE_INTERVAL + -> ONIG_SYN_OP_ESC_BRACE_INTERVAL +2006/01/23: [dist] fix configure.in for onig-config. +2006/01/19: [new] add new config USE_UNICODE_ALL_LINE_TERMINATORS. + (U+000d, U+0085, U+2028, U+2029) +2005/12/29: [dist] change pmatch array size to 25 in testconv.rb. +2005/12/26: [dist] fix name in test.rb. +2005/12/26: [dist] update index.html for 2.5.1. + +2005/11/29: Version 3.9.1 + +2005/11/29: [test] success in ruby 1.9.0 (2005-11-28) [i686-linux]. +2005/11/24: [test] success in ruby 1.9.0 (2005-08-09) [i686-linux]. +2005/11/21: [test] success in ruby 1.9.0 (2005-11-20) [i386-cygwin]. +2005/11/21: [bug] (thanks Allan Odgaard) + utf-8 character comments in extended mode leads + invalid result. + ex. 
/(?x)(?<= # <any-utf-8 multibyte char>o\n~) / + fix onigenc_unicode_is_code_ctype() and + utf8_is_code_ctype(). +2005/11/20: [bug] (thanks MATSUMOTO Satoshi) (thanks Isao Sonobe) + begin-line anchor and BM search optimization leads + invalid result in UTF-16/32. + fix in set_optimize_exact_info(). + +2005/11/20: Version 3.9.0 + +2005/11/20: [test] success in ruby 1.9.0 (2005-11-20) [i386-cygwin]. +2005/11/20: [test] success in ruby 1.9.0 (2005-10-18) [i386-cygwin]. +2005/11/20: [new] add new config USE_CRNL_AS_LINE_TERMINATOR. + (!!! NO SUPPORT experimental option !!!) +2005/11/15: [bug] (thanks Allan Odgaard) + tok->escape was not cleared in fetch_token_in_cc(). + ex. [\s&&[^\n]] makes wrong result. +2005/10/18: [impl] (thanks nobu) + change sjis_mbc_enc_len() + and node_new_cclass_by_codepoint_range() scope to static. +2005/09/05: [dist] remove link to MultiFind. +2005/09/01: [dist] add link to yagrep. + +2005/08/23: Version 3.8.9 + +2005/08/23: [test] success in ruby 1.9.0 (2005-08-09) [i686-linux]. +2005/08/23: [inst] fix Makefile.in for make ctest/ptest. + +2005/08/23: Version 3.8.8 + +2005/08/23: [test] success in ruby 1.9.0 (2005-08-09) [i686-linux]. +2005/08/23: [impl] split is_code_in_cc() from onig_is_code_in_cc(). +2005/08/23: [impl] should check DATA_ENSURE() at OP_CCLASS_NODE in match_at(). +2005/08/23: [impl] (thanks akr) + add ONIG_OPTION_MAXBIT for escape conflict with + Ruby's option. +2005/08/22: [impl] escape GCC 4.0 warnings for testc.c. +2005/08/22: [bug] (thanks nobu, matz) [ruby-dev:26840] + UTF-8 0xFE, 0xFF handling bug in code_is_in_cclass_node(). + abort on /\S*/ =~ "\xfe" +2005/08/22: [impl] escape GCC 4.0 warnings for sample/*.c. +2005/08/22: [impl] fix testconvu.rb. +2005/08/22: [impl] escape GCC 4.0 warnings. + +2005/08/09: Version 3.8.7 + +2005/08/09: [test] success in ruby 1.9.0 (2005-08-09) [i686-linux]. +2005/08/09: [bug] (thanks Allan Odgaard) + should not call enc_len() for s == range + in onig_search(). 
+2005/08/01: [dist] add mkdir $prefix, mkdir $exec_prefix to make install. + +2005/07/27: Version 3.8.6 + +2005/07/27: [test] success in ruby 1.9.0 (2005-07-26) [i686-linux]. +2005/07/27: [impl] update onig-config.in. +2005/07/26: [new] (thanks Yen-Ju Chen) + add Oniguruma configuration check program. + (onig-config.in) + +2005/07/14: Version 3.8.5 + +2005/07/14: [test] success in ruby 1.9.0 (2005-07-14) [i686-linux]. +2005/07/11: [test] success in ruby 1.9.0 (2005-07-04) [i686-linux]. +2005/07/11: [bug] (thanks nobu) [ruby-dev:26505] + invalid handling for /\c\x/ and /\C-\x/. + fix fetch_escaped_value(). +2005/07/05: [impl] (thanks Alexey Zakhlestine) + escape GCC 4.0 warnings. + +2005/07/01: Version 3.8.4 + +2005/07/01: [test] success in ruby 1.9.0 (2005-07-01) [i686-linux]. +2005/06/30: [test] success in ruby 1.9.0 (2005-06-28) [i686-linux]. +2005/06/30: [dist] add GB 18030 test to sample/encode.c. +2005/06/30: [impl] escape warning of gb18030_left_adjust_char_head(). +2005/06/30: [new] (contributed by KUBO Takehiro) + add new character encoding ONIG_ENCODING_GB18030. +2005/06/30: [bug] invalid ctype check for multibyte encodings. + ("graph", "print") + fix onigenc_mb2/4_is_code_ctype(), + eucjp_is_code_ctype() and sjis_is_code_ctype(). +2005/06/30: [bug] invalid conversion from code point to mbc in + onigenc_mb4_code_to_mbc(). + +2005/06/28: Version 3.8.3 + +2005/06/28: [test] success in ruby 1.9.0 (2005-06-28) [i686-linux]. +2005/06/27: [test] success in ruby 1.9.0 (2005-05-31) [i686-linux]. +2005/06/27: [bug] (thanks Wolfgang Nadasi-Donner) + invalid check for never ending recursion. + lower zero quantifier should be treated as + a non-recursive call alternative. + ex. /(?<bal>[^()]*(\(\g<bal>\)[^()]*)*)/ +2005/06/15: [impl] add divide_ambig_string_node_sub(). +2005/06/15: [dist] add a test to sample/encode.c. +2005/06/10: [new] add ONIG_SYNTAX_PERL_NG. 
(Perl + named group) + +2005/06/01: Version 3.8.2 + +2005/06/01: [test] success in ruby 1.9.0 (2005-05-31) [i686-linux]. +2005/05/31: [dist] add doc/FAQ and doc/FAQ.ja. +2005/05/31: [impl] minor change in node_new(). +2005/05/30: [test] success in ruby 1.9.0 (2005-05-11) [i686-linux]. +2005/05/30: [bug] (thanks Allan Odgaard) + FreeNodeList null check should be on thread-atomic + in node_new(). + +2005/05/11: Version 3.8.1 + +2005/05/11: [test] success in ruby 1.9.0 (2005-05-11) [i386-mswin32]. +2005/05/11: [dist] update win32/Makefile (make 19). +2005/05/11: [test] success in ruby 1.9.0 (2005-05-11) [i686-linux]. +2005/05/06: [test] success in ruby 1.9.0 (2005-05-06) [i686-linux]. +2005/05/06: [impl] (thanks nobu) [ruby-core:4815] + add #ifdef USE_VARIABLE_META_CHARS to goto label. +2005/04/25: [test] success in ruby 1.9.0 (2005-04-25) [i686-linux]. +2005/04/25: [impl] change DEFAULT_WARN_FUNCTION and DEFAULT_VERB_WARN_FUNCTION + to onig_rb_warn() and onig_rb_warning(). + +2005/04/15: Version 3.8.0 + +2005/04/15: [test] success in ruby 1.9.0 (2005-04-14) [i686-linux]. +2005/04/01: [test] success in ruby 1.9.0 (2005-03-24) [i686-linux]. +2005/04/01: [impl] (thanks Joe Orton) + (thanks Moriyoshi Koizumi) + many const-ification to many *.[ch] files. + +2005/03/25: Version 3.7.2 + +2005/03/25: [test] success in ruby 1.9.0 (2005-03-24) [i686-linux]. +2005/03/23: [test] success in ruby 1.9.0 (2005-03-20) [i686-linux]. +2005/03/23: [test] success in ruby 1.9.0 (2005-03-08) [i686-linux]. +2005/03/23: [new] add ONIG_SYNTAX_ASIS. +2005/03/23: [new] add ONIG_SYN_OP2_INEFFECTIVE_ESCAPE. +2005/03/09: [spec] rename MBCTYPE_XXX to RE_MBCTYPE_XXX. (GNU API) +2005/03/08: [test] success in ruby 1.9.0 (2005-03-08) [i686-linux]. +2005/03/08: [impl] (thanks matz) [ruby-dev:25783] + should not allocate memory for key data in st.c. + move st_*_strend() functions from st.c. fixed some + potential memory leaks. 
+ (imported from Ruby 1.9 2005-03-08) + 2005/03/07: Version 3.7.1 2005/03/07: [test] success in ruby 1.9.0 (2005-03-07) [i686-linux]. @@ -24,7 +383,7 @@ History remove reggnu.c from make 19. 2005/02/19: [dist] update doc/API and doc/API.ja. 2005/02/19: [test] success in ruby 1.9.0 (2005-02-19) [i386-cygwin]. -2005/02/19: [impl] (thanks Alexey Zakhlestin) +2005/02/19: [impl] (thanks Alexey Zakhlestine) change UChar* to const UChar* in oniguruma.h, regenc.h and regparse.h. 2005/02/13: [impl] change UChar* to const UChar* in oniguruma.h and @@ -1366,8 +1725,30 @@ svn copy http://localhost/repos/trunk/oniguruma http://localhost/repos/branches/ <create tag> svn copy http://localhost/repos/trunk/oniguruma http://localhost/repos/tags/oniguruma/X.X.X -m "onigdXXXXXXXX" -<show all tags> + +<CVS: show all tags> cvs history -T -<add tag> +<CVS: add tag> cvs rtag "VERSION_X_X_X" oniguruma + + +<GNU Autotools: bootstrap> +* write Makefile.am and configure.in. +> aclocal +> libtoolize +> automake --foreign --add-missing +> autoconf +> configure --with-rubydir=... 
CFLAGS="-O2 -Wall" + + +<GNU libtool: version management> + + VERSION = current:revision:age + + current: interface number (from 0) + revision: implementation number of same interface (from 0) + age: number of supported previous interfaces + (if current only supported then age == 0) + +//END diff --git a/ext/mbstring/oniguruma/README b/ext/mbstring/oniguruma/README index dc4fb3b64..f2cc7c981 100644 --- a/ext/mbstring/oniguruma/README +++ b/ext/mbstring/oniguruma/README @@ -1,4 +1,4 @@ -README 2005/02/04 +README 2006/05/15 Oniguruma ---- (C) K.Kosako <sndgk393 AT ybb DOT ne DOT jp> @@ -14,11 +14,12 @@ Supported character encodings: ASCII, UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, EUC-JP, EUC-TW, EUC-KR, EUC-CN, - Shift_JIS, Big5, KOI8-R, KOI8 (*), + Shift_JIS, Big5, GB 18030, KOI8-R, KOI8, ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 +* GB 18030: contributed by KUBO Takehiro * KOI8 is not included in library archive by default setup. (need to edit Makefile if you want to use it.) ------------------------------------------------------------ @@ -31,15 +32,20 @@ Install 2. make 3. make install - library file: libonig.a + * uninstall - test (ASCII/EUC-JP) + make uninstall - make ctest + * test (ASCII/EUC-JP) - uninstall + make atest - make uninstall + * configuration check + + onig-config --cflags + onig-config --libs + onig-config --prefix + onig-config --exec-prefix @@ -73,8 +79,21 @@ Regular Expressions Usage - Include oniguruma.h in your program. (native API) - See doc/API for native API. + Include oniguruma.h in your program. (Oniguruma API) + See doc/API for Oniguruma API. + + If you want to disable UChar type (== unsigned char) definition + in oniguruma.h, define ONIG_ESCAPE_UCHAR_COLLISION and then + include oniguruma.h. 
+ + If you want to disable regex_t type definition in oniguruma.h, + define ONIG_ESCAPE_REGEX_T_COLLISION and then include oniguruma.h. + + Example of the compiling/linking command line in Unix or Cygwin, + (prefix == /usr/local case) + + cc sample.c -L/usr/local/lib -lonig + If you want to use static link library(onig_s.lib) in Win32, add option -DONIG_EXTERN=extern to C compiler. @@ -83,19 +102,20 @@ Usage Sample Programs - sample/simple.c example of the minimum (native API) + sample/simple.c example of the minimum (Oniguruma API) sample/names.c example of the named group callback. sample/encode.c example of some encodings. sample/listcap.c example of the capture history. sample/posix.c POSIX API sample. sample/sql.c example of the variable meta characters. (SQL-like pattern matching) - sample/syntax.c Perl and Java syntax test. + sample/syntax.c Perl, Java and ASIS syntax test. Source Files oniguruma.h Oniguruma API header file. (public) + onig-config.in configuration check program template. regenc.h character encodings framework header file. regint.h internal definitions @@ -125,9 +145,10 @@ Source Files enc/euc_tw.c EUC-TW encoding. enc/euc_kr.c EUC-KR, EUC-CN encoding. enc/sjis.c Shift_JIS encoding. - enc/big5.c Big5 encoding. - enc/koi8.c KOI8 encoding. - enc/koi8_r.c KOI8-R encoding. + enc/big5.c Big5 encoding. + enc/gb18030.c GB 18030 encoding (contributed by KUBO Takehiro) + enc/koi8.c KOI8 encoding. + enc/koi8_r.c KOI8-R encoding. enc/iso8859_1.c ISO-8859-1 encoding. (Latin-1) enc/iso8859_2.c ISO-8859-2 encoding. (Latin-2) enc/iso8859_3.c ISO-8859-3 encoding. (Latin-3) @@ -159,7 +180,6 @@ Source Files API differences with Japanized GNU regex(version 0.12) of Ruby 1.8/1.6 + re_compile_fastmap() is removed. - + re_recompile_pattern() is added. + re_alloc_pattern() is added. @@ -169,7 +189,6 @@ ToDo ? Unicode Property. ? ambig-flag Katakana <-> Hiragana. ? add ONIG_OPTION_NOTBOS/NOTEOS. (\A, \z, \Z) - ? add ONIG_SYNTAX_ASIS. ?? \X (== \PM\pM*) ?? 
implement syntax behavior ONIG_SYN_CONTEXT_INDEP_ANCHORS. ?? variable line separator. diff --git a/ext/mbstring/oniguruma/README.ja b/ext/mbstring/oniguruma/README.ja index 44553abfe..2394e958f 100644 --- a/ext/mbstring/oniguruma/README.ja +++ b/ext/mbstring/oniguruma/README.ja @@ -1,4 +1,4 @@ -README.ja 2005/02/04 +README.ja 2006/05/15 µ´¼Ö ---- (C) K.Kosako <sndgk393 AT ybb DOT ne DOT jp> @@ -14,11 +14,12 @@ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ ASCII, UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, EUC-JP, EUC-TW, EUC-KR, EUC-CN, - Shift_JIS, Big5, KOI8-R, KOI8 (*), + Shift_JIS, Big5, GB 18030, KOI8-R, KOI8, ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 +* GB 18030: µ×ÊÝ·òÍλáÄó¶¡ * KOI8¤Ï¥Ç¥Õ¥©¥ë¥È¤Î¥»¥Ã¥È¥¢¥Ã¥×¤Ç¤Ï¥é¥¤¥Ö¥é¥ê¤ÎÃæ¤Ë´Þ¤Þ¤ì¤Ê¤¤¡£ (ɬÍפǤ¢¤ì¤ÐMakefile¤òÊÔ½¸¤¹¤ë¤³¤È) ------------------------------------------------------------ @@ -31,15 +32,21 @@ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ 2. make 3. 
make install - ¥é¥¤¥Ö¥é¥ê¥Õ¥¡¥¤¥ë: libonig.a + ¥¢¥ó¥¤¥ó¥¹¥È¡¼¥ë + + make uninstall ưºî¥Æ¥¹¥È (ASCII/EUC-JP) - make ctest + make atest - ¥¢¥ó¥¤¥ó¥¹¥È¡¼¥ë - make uninstall + ¹½À®³Îǧ + + onig-config --cflags + onig-config --libs + onig-config --prefix + onig-config --exec-prefix @@ -71,8 +78,28 @@ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ »ÈÍÑÊýË¡ - »ÈÍѤ¹¤ë¥×¥í¥°¥é¥à¤Ç¡¢oniguruma.h¤ò¥¤¥ó¥¯¥ë¡¼¥É¤¹¤ë(Native API¤Î¾ì¹ç)¡£ - Native API¤Ë¤Ä¤¤¤Æ¤Ï¡¢doc/API.ja¤ò»²¾È¡£ + »ÈÍѤ¹¤ë¥×¥í¥°¥é¥à¤Ç¡¢oniguruma.h¤ò¥¤¥ó¥¯¥ë¡¼¥É¤¹¤ë(Oniguruma API¤Î¾ì¹ç)¡£ + Oniguruma API¤Ë¤Ä¤¤¤Æ¤Ï¡¢doc/API.ja¤ò»²¾È¡£ + + oniguruma.h¤ÇÄêµÁ¤µ¤ì¤Æ¤¤¤ë·¿Ì¾UChar(== unsigned char)¤ò̵¸ú¤Ë¤·¤¿¤¤¾ì¹ç + ¤Ë¤Ï¡¢ONIG_ESCAPE_UCHAR_COLLISION¤òdefine¤·¤Æ¤«¤éoniguruma.h¤ò¥¤¥ó¥¯¥ë¡¼¥É + ¤¹¤ë¤³¤È¡£¤³¤Î¤È¤¤Ë¤ÏUChar¤ÏÄêµÁ¤µ¤ì¤º¡¢OnigUChar¤È¤¤¤¦Ì¾Á°¤ÎÄêµÁ¤Î¤ß¤¬ + ͸ú¤Ë¤Ê¤ë¡£ + + oniguruma.h¤ÇÄêµÁ¤µ¤ì¤Æ¤¤¤ë·¿Ì¾regex_t¤ò̵¸ú¤Ë¤·¤¿¤¤¾ì¹ç¤Ë¤Ï¡¢ + ONIG_ESCAPE_REGEX_T_COLLISION¤òdefine¤·¤Æ¤«¤éoniguruma.h¤ò¥¤¥ó¥¯¥ë¡¼¥É + ¤¹¤ë¤³¤È¡£¤³¤Î¤È¤¤Ë¤Ïregex_t¤ÏÄêµÁ¤µ¤ì¤º¡¢OnigRegexType, OnigRegex¤È¤¤¤¦ + ̾Á°¤ÎÄêµÁ¤Î¤ß¤¬Í¸ú¤Ë¤Ê¤ë¡£ + + Unix/Cygwin¾å¤Ç¥³¥ó¥Ñ¥¤¥ë¡¢¥ê¥ó¥¯¤¹¤ë¾ì¹ç¤ÎÎã¡§ + (prefix¤¬/usr/local¤Î¤È¤) + cc sample.c -L/usr/local/lib -lonig + + GNU libtool¤ò»ÈÍѤ·¤Æ¤¤¤ë¤Î¤Ç¡¢¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤¬¶¦Í¥é¥¤¥Ö¥é¥ê¤ò¥µ¥Ý¡¼¥È¤·¤Æ + ¤¤¤ì¤Ð¡¢»ÈÍѤǤ¤ë¤è¤¦¤Ë¤Ê¤Ã¤Æ¤¤¤ë¡£ + ÀÅۥ饤¥Ö¥é¥ê¤È¶¦Í¥é¥¤¥Ö¥é¥ê¤Î¤É¤Á¤é¤ò»ÈÍѤ¹¤ë¤«¤ò»ØÄꤹ¤ëÊýË¡¡¢¼Â¹Ô»þÅÀ¤Ç¤Î + ´Ä¶ÀßÄêÊýË¡¤Ë¤Ä¤Æ¤Ï¡¢¼«Ê¬¤ÇÄ´¤Ù¤Æ²¼¤µ¤¤¡£ + Win32¤Ç¥¹¥¿¥Æ¥£¥Ã¥¯¥ê¥ó¥¯¥é¥¤¥Ö¥é¥ê(onig_s.lib)¤ò¥ê¥ó¥¯¤¹¤ë¾ì¹ç¤Ë¤Ï¡¢ ¥³¥ó¥Ñ¥¤¥ë¤¹¤ë¤È¤¤Ë -DONIG_EXTERN=extern ¤ò¥³¥ó¥Ñ¥¤¥ë°ú¿ô¤ËÄɲ乤뤳¤È¡£ @@ -80,18 +107,19 @@ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ »ÈÍÑÎã¥×¥í¥°¥é¥à - sample/simple.c ºÇ¾®Îã (native API) + sample/simple.c ºÇ¾®Îã (Oniguruma API) sample/names.c ̾Á°ÉÕ¤¥°¥ë¡¼¥×¥³¡¼¥ë¥Ð¥Ã¥¯»ÈÍÑÎã sample/encode.c ´ö¤Ä¤«¤Îʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°»ÈÍÑÎã sample/listcap.c Êá³ÍÍúÎòµ¡Ç½¤Î»ÈÍÑÎã sample/posix.c POSIX API»ÈÍÑÎã sample/sql.c ²ÄÊѥ᥿ʸ»úµ¡Ç½»ÈÍÑÎã (SQL-like ¥Ñ¥¿¡¼¥ó) - sample/syntax.c 
Perl¤ÈJavaʸˡ¤Î¥Æ¥¹¥È + sample/syntax.c Perl¡¢Java¡¢ASISʸˡ¤Î¥Æ¥¹¥È ¥½¡¼¥¹¥Õ¥¡¥¤¥ë oniguruma.h µ´¼ÖAPI¥Ø¥Ã¥À (¸ø³«) + onig-config.in onig-config¥×¥í¥°¥é¥à ¥Æ¥ó¥×¥ì¡¼¥È regenc.h ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°ÏÈÁȤߥإåÀ regint.h ÆâÉôÀë¸À @@ -122,6 +150,7 @@ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ enc/euc_kr.c EUC-KR, EUC-CN ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° enc/sjis.c Shift_JIS ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° enc/big5.c Big5 ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + enc/gb18030.c GB 18030 ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° (µ×ÊÝ·òÍλá Äó¶¡) enc/koi8.c KOI8 ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° enc/koi8_r.c KOI8-R ¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° enc/iso8859_1.c ISO-8859-1 (Latin-1) @@ -155,7 +184,6 @@ http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/ Ruby 1.8/1.6¤ÎÆüËܸ첽GNU regex¤È¤ÎAPI¤Î°ã¤¤ + re_compile_fastmap() ¤Ïºï½ü¤µ¤ì¤¿¡£ - + re_recompile_pattern() ¤¬Äɲ䵤줿¡£ + re_alloc_pattern() ¤¬Äɲ䵤줿¡£ @@ -165,7 +193,6 @@ Ruby 1.8/1.6¤ÎÆüËܸ첽GNU regex¤È¤ÎAPI¤Î°ã¤¤ ? Unicode¥×¥í¥Ñ¥Æ¥£ ? ambig-flag Katakana <-> Hiragana ? ONIG_OPTION_NOTBOS/NOTEOSÄɲà (\A, \z, \Z) - ? ONIG_SYNTAX_ASISÄɲà ?? \X (== \PM\pM*) ?? ʸˡÍ×ÁÇ ONIG_SYN_CONTEXT_INDEP_ANCHORS¤Î¼ÂÁõ ?? ²þ¹Ôʸ»ú(ʸ»úÎó)¤òÊѹ¹¤Ç¤¤ë @@ -174,4 +201,4 @@ Ruby 1.8/1.6¤ÎÆüËܸ첽GNU regex¤È¤ÎAPI¤Î°ã¤¤ and I'm thankful to Akinori MUSHA. -Mail Address: K.Kosako <sndgk393 AT ybb DOT ne DOT jp> +¥¢¥É¥ì¥¹: K.Kosako <sndgk393 AT ybb DOT ne DOT jp> diff --git a/ext/mbstring/oniguruma/config.h.in b/ext/mbstring/oniguruma/config.h.in index 5ca2056fb..4a2fc28d8 100644 --- a/ext/mbstring/oniguruma/config.h.in +++ b/ext/mbstring/oniguruma/config.h.in @@ -1,69 +1,108 @@ -/* config.h.in. Generated automatically from configure.in by autoheader. */ +/* config.h.in. Generated from configure.in by autoheader. */ -/* Define if using alloca.c. */ -#undef C_ALLOCA - -/* Define to empty if the keyword does not work. */ -#undef const - -/* Define to one of _getb67, GETB67, getb67 for Cray-2 and Cray-YMP systems. - This function is required for alloca.c support on those systems. 
*/ +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ #undef CRAY_STACKSEG_END -/* Define if you have alloca, as a function or macro. */ +/* Define to 1 if using `alloca.c'. */ +#undef C_ALLOCA + +/* Define to 1 if you have `alloca', as a function or macro. */ #undef HAVE_ALLOCA -/* Define if you have <alloca.h> and it should be used (not on Ultrix). */ +/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). + */ #undef HAVE_ALLOCA_H -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at run-time. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown - */ -#undef STACK_DIRECTION +/* Define to 1 if you have the <dlfcn.h> header file. */ +#undef HAVE_DLFCN_H -/* Define if you have the ANSI C header files. */ -#undef STDC_HEADERS +/* Define to 1 if you have the <inttypes.h> header file. */ +#undef HAVE_INTTYPES_H -/* Define if you can safely include both <sys/time.h> and <time.h>. */ -#undef TIME_WITH_SYS_TIME +/* Define to 1 if you have the <memory.h> header file. */ +#undef HAVE_MEMORY_H -/* The number of bytes in a int. */ -#undef SIZEOF_INT +/* Define if compiler supports prototypes */ +#undef HAVE_PROTOTYPES -/* The number of bytes in a long. */ -#undef SIZEOF_LONG +/* Define if compiler supports stdarg prototypes */ +#undef HAVE_STDARG_PROTOTYPES -/* The number of bytes in a short. */ -#undef SIZEOF_SHORT +/* Define to 1 if you have the <stdint.h> header file. */ +#undef HAVE_STDINT_H -/* Define if you have the <stdlib.h> header file. */ +/* Define to 1 if you have the <stdlib.h> header file. */ #undef HAVE_STDLIB_H -/* Define if you have the <string.h> header file. */ +/* Define to 1 if you have the <strings.h> header file. 
*/ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the <string.h> header file. */ #undef HAVE_STRING_H -/* Define if you have the <strings.h> header file. */ -#undef HAVE_STRINGS_H +/* Define to 1 if you have the <sys/stat.h> header file. */ +#undef HAVE_SYS_STAT_H -/* Define if you have the <sys/types.h> header file. */ -#undef HAVE_SYS_TYPES_H +/* Define to 1 if you have the <sys/times.h> header file. */ +#undef HAVE_SYS_TIMES_H -/* Define if you have the <sys/time.h> header file. */ +/* Define to 1 if you have the <sys/time.h> header file. */ #undef HAVE_SYS_TIME_H -/* Define if you have the <sys/times.h> header file. */ -#undef HAVE_SYS_TIMES_H +/* Define to 1 if you have the <sys/types.h> header file. */ +#undef HAVE_SYS_TYPES_H -/* Define if you have the <unistd.h> header file. */ +/* Define to 1 if you have the <unistd.h> header file. */ #undef HAVE_UNISTD_H -/* Define if you have the function argument prototype */ -#undef HAVE_PROTOTYPES +/* Name of package */ +#undef PACKAGE -/* Define if you have the variable length prototypes and stdarg.h */ -#undef HAVE_STDARG_PROTOTYPES +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* The size of a `int', as computed by sizeof. */ +#undef SIZEOF_INT + +/* The size of a `long', as computed by sizeof. */ +#undef SIZEOF_LONG + +/* The size of a `short', as computed by sizeof. */ +#undef SIZEOF_SHORT +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at run-time. 
+ STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ +#undef TIME_WITH_SYS_TIME + +/* Define if combination explosion check */ +#undef USE_COMBINATION_EXPLOSION_CHECK + +/* Version number of package */ +#undef VERSION + +/* Define to empty if `const' does not conform to ANSI C. */ +#undef const diff --git a/ext/mbstring/oniguruma/doc/API b/ext/mbstring/oniguruma/doc/API new file mode 100644 index 000000000..7374f65bd --- /dev/null +++ b/ext/mbstring/oniguruma/doc/API @@ -0,0 +1,586 @@ +Oniguruma API Version 4.1.0 2006/05/15 + +#include <oniguruma.h> + + +# int onig_init(void) + + Initialize library. + + You don't have to call it explicitly, because it is called in onig_new(). + + +# int onig_error_code_to_str(UChar* err_buf, int err_code, ...) + + Get error message string. + If this function is used for onig_new(), + don't call this after the pattern argument of onig_new() is freed. + + normal return: error message string length + + arguments + 1 err_buf: error message string buffer. + (required size: ONIG_MAX_ERROR_MESSAGE_LEN) + 2 err_code: error code returned by other API functions. + 3 err_info (optional): error info returned by onig_new(). + + +# void onig_set_warn_func(OnigWarnFunc func) + + Set warning function. + + WARNING: + '[', '-', ']' in character class without escape. + ']' in pattern without escape. + + arguments + 1 func: function pointer. void (*func)(char* warning_message) + + +# void onig_set_verb_warn_func(OnigWarnFunc func) + + Set verbose warning function. + + WARNING: + redundant nested repeat operator. + + arguments + 1 func: function pointer. 
void (*func)(char* warning_message) + + +# int onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, + OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, + OnigErrorInfo* err_info) + + Create a regex object. + + normal return: ONIG_NORMAL + + arguments + 1 reg: return regex object's address. + 2 pattern: regex pattern string. + 3 pattern_end: terminate address of pattern. (pattern + pattern length) + 4 option: compile time options. + + ONIG_OPTION_NONE no option + ONIG_OPTION_SINGLELINE '^' -> '\A', '$' -> '\z', '\Z' -> '\z' + ONIG_OPTION_MULTILINE '.' match with newline + ONIG_OPTION_IGNORECASE ambiguity match on + ONIG_OPTION_EXTEND extended pattern form + ONIG_OPTION_FIND_LONGEST find longest match + ONIG_OPTION_FIND_NOT_EMPTY ignore empty match + ONIG_OPTION_NEGATE_SINGLELINE + clear ONIG_OPTION_SINGLELINE which is enabled on + ONIG_SYNTAX_POSIX_BASIC, ONIG_SYNTAX_POSIX_EXTENDED, + ONIG_SYNTAX_PERL, ONIG_SYNTAX_PERL_NG, ONIG_SYNTAX_JAVA + + ONIG_OPTION_DONT_CAPTURE_GROUP only named group captured. + ONIG_OPTION_CAPTURE_GROUP named and no-named group captured. + + 5 enc: character encoding. 
+ + ONIG_ENCODING_ASCII ASCII + ONIG_ENCODING_ISO_8859_1 ISO 8859-1 + ONIG_ENCODING_ISO_8859_2 ISO 8859-2 + ONIG_ENCODING_ISO_8859_3 ISO 8859-3 + ONIG_ENCODING_ISO_8859_4 ISO 8859-4 + ONIG_ENCODING_ISO_8859_5 ISO 8859-5 + ONIG_ENCODING_ISO_8859_6 ISO 8859-6 + ONIG_ENCODING_ISO_8859_7 ISO 8859-7 + ONIG_ENCODING_ISO_8859_8 ISO 8859-8 + ONIG_ENCODING_ISO_8859_9 ISO 8859-9 + ONIG_ENCODING_ISO_8859_10 ISO 8859-10 + ONIG_ENCODING_ISO_8859_11 ISO 8859-11 + ONIG_ENCODING_ISO_8859_13 ISO 8859-13 + ONIG_ENCODING_ISO_8859_14 ISO 8859-14 + ONIG_ENCODING_ISO_8859_15 ISO 8859-15 + ONIG_ENCODING_ISO_8859_16 ISO 8859-16 + ONIG_ENCODING_UTF8 UTF-8 + ONIG_ENCODING_UTF16_BE UTF-16BE + ONIG_ENCODING_UTF16_LE UTF-16LE + ONIG_ENCODING_UTF32_BE UTF-32BE + ONIG_ENCODING_UTF32_LE UTF-32LE + ONIG_ENCODING_EUC_JP EUC-JP + ONIG_ENCODING_EUC_TW EUC-TW + ONIG_ENCODING_EUC_KR EUC-KR + ONIG_ENCODING_EUC_CN EUC-CN + ONIG_ENCODING_SJIS Shift_JIS + ONIG_ENCODING_KOI8 KOI8 + ONIG_ENCODING_KOI8_R KOI8-R + ONIG_ENCODING_BIG5 Big5 + ONIG_ENCODING_GB18030 GB 18030 + + or any OnigEncodingType data address defined by user. + + 6 syntax: address of pattern syntax definition. + + ONIG_SYNTAX_ASIS plain text + ONIG_SYNTAX_POSIX_BASIC POSIX Basic RE + ONIG_SYNTAX_POSIX_EXTENDED POSIX Extended RE + ONIG_SYNTAX_EMACS Emacs + ONIG_SYNTAX_GREP grep + ONIG_SYNTAX_GNU_REGEX GNU regex + ONIG_SYNTAX_JAVA Java (Sun java.util.regex) + ONIG_SYNTAX_PERL Perl + ONIG_SYNTAX_PERL_NG Perl + named group + ONIG_SYNTAX_RUBY Ruby + ONIG_SYNTAX_DEFAULT default (== Ruby) + onig_set_default_syntax() + + or any OnigSyntaxType data address defined by user. + + 7 err_info: address for return optional error info. + Use this value as 3rd argument of onig_error_code_to_str(). + + + +# int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, + OnigCompileInfo* ci, OnigErrorInfo* einfo) + + Create a regex object. + This function is deluxe version of onig_new(). 
+ + normal return: ONIG_NORMAL + + arguments + 1 reg: return address of regex object. + 2 pattern: regex pattern string. + 3 pattern_end: terminate address of pattern. (pattern + pattern length) + 4 ci: compile time info. + + ci->num_of_elements: number of elements in ci. (current version: 5) + ci->pattern_enc: pattern string character encoding. + ci->target_enc: target string character encoding. + ci->syntax: address of pattern syntax definition. + ci->option: compile time option. + ci->ambig_flag: character matching ambiguity bit flag for + ONIG_OPTION_IGNORECASE mode. + + ONIGENC_AMBIGUOUS_MATCH_NONE: exact + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE: ignore case for ASCII + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE: ignore case for non-ASCII + ONIGENC_AMBIGUOUS_MATCH_COMPOUND: grapheme cluster as a char + ONIGENC_AMBIGUOUS_MATCH_FULL: all ambiguity on + ONIGENC_AMBIGUOUS_MATCH_DEFAULT: (ASCII | NONASCII) + onig_set_default_ambig_flag() + + 5 err_info: address for return optional error info. + Use this value as 3rd argument of onig_error_code_to_str(). + + + Different character encoding combination is allowed for + the following cases only. + + pattern_enc: ASCII, ISO_8859_1 + target_enc: UTF16_BE, UTF16_LE, UTF32_BE, UTF32_LE + + pattern_enc: UTF16_BE/LE + target_enc: UTF16_LE/BE + + pattern_enc: UTF32_BE/LE + target_enc: UTF32_LE/BE + + +# void onig_free(regex_t* reg) + + Free memory used by regex object. + + arguments + 1 reg: regex object. + + +# int onig_search(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, + const UChar* range, OnigRegion* region, OnigOptionType option) + + Search string and return search result and matching region. + + normal return: match position offset (i.e. 
p - str >= 0) + not found: ONIG_MISMATCH (< 0) + + arguments + 1 reg: regex object + 2 str: target string + 3 end: terminate address of target string + 4 start: search start address of target string + 5 range: search terminate address of target string + in forward search (start <= searched string head < range) + in backward search (range <= searched string head <= start) + 6 region: address for return group match range info (NULL is allowed) + 7 option: search time option + + ONIG_OPTION_NOTBOL string head(str) isn't considered as begin of line + ONIG_OPTION_NOTEOL string end (end) isn't considered as end of line + ONIG_OPTION_POSIX_REGION region argument is regmatch_t[] of POSIX API. + + +# int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, + OnigRegion* region, OnigOptionType option) + + Match string and return result and matching region. + + normal return: match length (>= 0) + not match: ONIG_MISMATCH ( < 0) + + arguments + 1 reg: regex object + 2 str: target string + 3 end: terminate address of target string + 4 at: match address of target string + 5 region: address for return group match range info (NULL is allowed) + 6 option: search time option + + ONIG_OPTION_NOTBOL string head(str) isn't considered as begin of line + ONIG_OPTION_NOTEOL string end (end) isn't considered as end of line + ONIG_OPTION_POSIX_REGION region argument is regmatch_t[] type of POSIX API. + + +# OnigRegion* onig_region_new(void) + + Create a region. + + +# void onig_region_free(OnigRegion* region, int free_self) + + Free memory used by region. + + arguments + 1 region: target region + 2 free_self: [1: free all, 0: free memory used in region but not self] + + +# void onig_region_copy(OnigRegion* to, OnigRegion* from) + + Copy contents of region. + + arguments + 1 to: target region + 2 from: source region + + +# void onig_region_clear(OnigRegion* region) + + Clear contents of region. 
+ + arguments + 1 region: target region + + +# int onig_region_resize(OnigRegion* region, int n) + + Resize group range area of region. + + normal return: ONIG_NORMAL + + arguments + 1 region: target region + 2 n: new size + + +# int onig_name_to_group_numbers(regex_t* reg, const UChar* name, const UChar* name_end, + int** num_list) + + Return the group number list of the name. + Named subexp is defined by (?<name>....). + + normal return: number of groups for the name. + (ex. /(?<x>..)(?<x>..)/ ==> 2) + name not found: -1 + + arguments + 1 reg: regex object. + 2 name: group name. + 3 name_end: terminate address of group name. + 4 num_list: return list of group number. + + +# int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, + OnigRegion *region) + + Return the group number corresponding to the named backref (\k<name>). + If two or more regions for the groups of the name are effective, + the greatest number among them is returned. + + normal return: group number. + + arguments + 1 reg: regex object. + 2 name: group name. + 3 name_end: terminate address of group name. + 4 region: search/match result region. + + +# int onig_foreach_name(regex_t* reg, + int (*func)(const UChar*, const UChar*, int,int*,regex_t*,void*), + void* arg) + + Iterate function call for all names. + + normal return: 0 + error: func's return value. + + arguments + 1 reg: regex object. + 2 func: callback function. + func(name, name_end, <number of groups>, <group number's list>, + reg, arg); + if func does not return 0, then iteration is stopped. + 3 arg: argument for func. + + +# int onig_number_of_names(regex_t* reg) + + Return the number of names defined in the pattern. + Multiple definitions of one name are counted as one. + + arguments + 1 reg: regex object. 
+ + +# OnigEncoding onig_get_encoding(regex_t* reg) +# OnigOptionType onig_get_options(regex_t* reg) +# OnigAmbigType onig_get_ambig_flag(regex_t* reg) +# OnigSyntaxType* onig_get_syntax(regex_t* reg) + + Return a value of the regex object. + + arguments + 1 reg: regex object. + + +# int onig_number_of_captures(regex_t* reg) + + Return the number of capture groups in the pattern. + + arguments + 1 reg: regex object. + + +# int onig_number_of_capture_histories(regex_t* reg) + + Return the number of capture histories defined in the pattern. + + You can't use capture history if ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY + is disabled in the pattern syntax. (It is disabled in the default syntax.) + + arguments + 1 reg: regex object. + + + +# OnigCaptureTreeNode* onig_get_capture_tree(OnigRegion* region) + + Return the root node of capture history data tree. + + This value is undefined if matching has failed. + + arguments + 1 region: matching result. + + +# int onig_capture_tree_traverse(OnigRegion* region, int at, + int(*func)(int,int,int,int,int,void*), void* arg) + + Traverse and callback in capture history data tree. + + normal return: 0 + error: callback func's return value. + + arguments + 1 region: match region data. + 2 at: callback position. + + ONIG_TRAVERSE_CALLBACK_AT_FIRST: callback first, then traverse children. + ONIG_TRAVERSE_CALLBACK_AT_LAST: traverse children first, then callback. + ONIG_TRAVERSE_CALLBACK_AT_BOTH: callback first, then traverse children, + and at last callback again. + + 3 func: callback function. + if func does not return 0, then traverse is stopped. + + int func(int group, int beg, int end, int level, int at, + void* arg) + + group: group number + beg: capture start position + end: capture end position + level: nest level (from 0) + at: callback position + ONIG_TRAVERSE_CALLBACK_AT_FIRST + ONIG_TRAVERSE_CALLBACK_AT_LAST + arg: optional callback argument + + 4 arg: optional callback argument. 
+ + +# int onig_noname_group_capture_is_active(regex_t* reg) + + Return noname group capture activity. + + active: 1 + inactive: 0 + + arguments + 1 reg: regex object. + + if option ONIG_OPTION_DONT_CAPTURE_GROUP == ON + --> inactive + + if the regex pattern have named group + and syntax ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP == ON + and option ONIG_OPTION_CAPTURE_GROUP == OFF + --> inactive + + else --> active + + +# UChar* onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s) + + Return previous character head address. + + arguments + 1 enc: character encoding + 2 start: string address + 3 s: target address of string + + +# UChar* onigenc_get_left_adjust_char_head(OnigEncoding enc, + const UChar* start, const UChar* s) + + Return left-adjusted head address of a character. + + arguments + 1 enc: character encoding + 2 start: string address + 3 s: target address of string + + +# UChar* onigenc_get_right_adjust_char_head(OnigEncoding enc, + const UChar* start, const UChar* s) + + Return right-adjusted head address of a character. + + arguments + 1 enc: character encoding + 2 start: string address + 3 s: target address of string + + +# int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) +# int onigenc_strlen_null(OnigEncoding enc, const UChar* s) + + Return number of characters in the string. + + +# int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) + + Return number of bytes in the string. + + +# int onig_set_default_syntax(OnigSyntaxType* syntax) + + Set default syntax. + + arguments + 1 syntax: address of pattern syntax definition. + + +# void onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from) + + Copy syntax. + + arguments + 1 to: destination address. + 2 from: source address. 
+ + +# unsigned int onig_get_syntax_op(OnigSyntaxType* syntax) +# unsigned int onig_get_syntax_op2(OnigSyntaxType* syntax) +# unsigned int onig_get_syntax_behavior(OnigSyntaxType* syntax) +# OnigOptionType onig_get_syntax_options(OnigSyntaxType* syntax) + +# void onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op) +# void onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2) +# void onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior) +# void onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) + + Get/Set elements of the syntax. + + arguments + 1 syntax: syntax + 2 op, op2, behavior, options: value of element. + + +# void onig_copy_encoding(OnigEncoding to, OnigEncoding from) + + Copy encoding. + + arguments + 1 to: destination address. + 2 from: source address. + + +# int onig_set_meta_char(OnigEncoding enc, unsigned int what, + OnigCodePoint code) + + Set a variable meta character to the code point value. + Except for the escape character, this meta character specification + does not work unless ONIG_SYN_OP_VARIABLE_META_CHARACTERS is enabled + in the syntax. (It is not enabled in the built-in syntaxes.) + + normal return: ONIG_NORMAL + + arguments + 1 enc: target encoding + 2 what: specifies which meta character it is. + + ONIG_META_CHAR_ESCAPE + ONIG_META_CHAR_ANYCHAR + ONIG_META_CHAR_ANYTIME + ONIG_META_CHAR_ZERO_OR_ONE_TIME + ONIG_META_CHAR_ONE_OR_MORE_TIME + ONIG_META_CHAR_ANYCHAR_ANYTIME + + 3 code: meta character or ONIG_INEFFECTIVE_META_CHAR. + + +# OnigAmbigType onig_get_default_ambig_flag() + + Get default ambig flag. + + +# int onig_set_default_ambig_flag(OnigAmbigType ambig_flag) + + Set default ambig flag. + + 1 ambig_flag: ambiguity flag + + +# unsigned int onig_get_match_stack_limit_size(void) + + Return the maximum match stack size. + (default: 0 == unlimited) + + +# int onig_set_match_stack_limit_size(unsigned int size) + + Set the maximum match stack size. 
+ (size = 0: unlimited) + + normal return: ONIG_NORMAL + + +# int onig_end(void) + + The use of this library is finished. + + normal return: ONIG_NORMAL + + It is not allowed to use regex objects which created + before onig_end() call. + + +# const char* onig_version(void) + + Return version string. (ex. "2.2.8") + +// END diff --git a/ext/mbstring/oniguruma/doc/API.ja b/ext/mbstring/oniguruma/doc/API.ja new file mode 100644 index 000000000..2682da480 --- /dev/null +++ b/ext/mbstring/oniguruma/doc/API.ja @@ -0,0 +1,593 @@ +µ´¼Ö¥¤¥ó¥¿¡¼¥Õ¥§¡¼¥¹ Version 4.1.0 2006/05/15 + +#include <oniguruma.h> + + +# int onig_init(void) + + ¥é¥¤¥Ö¥é¥ê¤Î½é´ü²½ + + onig_new()¤ÎÃæ¤Ç¸Æ¤Ó½Ð¤µ¤ì¤ë¤Î¤Ç¡¢¤³¤Î´Ø¿ô¤òÌÀ¼¨Åª¤Ë¸Æ¤Ó½Ð¤µ¤Ê¤¯¤Æ¤â¤è¤¤¡£ + + +# int onig_error_code_to_str(UChar* err_buf, int err_code, ...) + + ¥¨¥é¡¼¥á¥Ã¥»¡¼¥¸¤ò¼èÆÀ¤¹¤ë¡£ + + ¤³¤Î´Ø¿ô¤ò¡¢onig_new()¤Î·ë²Ì¤ËÂФ·¤Æ¸Æ¤Ó½Ð¤¹¾ì¹ç¤Ë¤Ï¡¢onig_new()¤Îpattern°ú¿ô¤ò + ¥á¥â¥ê²òÊü¤¹¤ë¤è¤ê¤âÁ°¤Ë¸Æ¤Ó½Ð¤µ¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ¥¨¥é¡¼¥á¥Ã¥»¡¼¥¸Ê¸»úÎó¤Î¥Ð¥¤¥ÈĹ + + °ú¿ô + 1 err_buf: ¥¨¥é¡¼¥á¥Ã¥»¡¼¥¸¤ò³ÊǼ¤¹¤ëÎΰè + (ɬÍפʥµ¥¤¥º: ONIG_MAX_ERROR_MESSAGE_LEN) + 2 err_code: ¥¨¥é¡¼¥³¡¼¥É + 3 err_info (optional): onig_new()¤Îerr_info + + +# void onig_set_warn_func(OnigWarnFunc func) + + ·Ù¹ðÄÌÃδؿô¤ò¥»¥Ã¥È¤¹¤ë¡£ + + ·Ù¹ð: + '[', '-', ']' in character class without escape. + ']' in pattern without escape. + + °ú¿ô + 1 func: ·Ù¹ð´Ø¿ô void (*func)(char* warning_message) + + +# void onig_set_verb_warn_func(OnigWarnFunc func) + + ¾ÜºÙ·Ù¹ðÄÌÃδؿô¤ò¥»¥Ã¥È¤¹¤ë¡£ + + ¾ÜºÙ·Ù¹ð: + redundant nested repeat operator. 
+ + °ú¿ô + 1 func: ¾ÜºÙ·Ù¹ð´Ø¿ô void (*func)(char* warning_message) + + +# int onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, + OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, + OnigErrorInfo* err_info) + + Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È(regex)¤òºîÀ®¤¹¤ë¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ONIG_NORMAL + + °ú¿ô + 1 reg: ºîÀ®¤µ¤ì¤¿Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤òÊÖ¤¹¥¢¥É¥ì¥¹ + 2 pattern: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸ»úÎó + 3 pattern_end: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸ»úÎó¤Î½ªÃ¼¥¢¥É¥ì¥¹(pattern + pattern length) + 4 option: Àµµ¬É½¸½¥³¥ó¥Ñ¥¤¥ë»þ¥ª¥×¥·¥ç¥ó + + ONIG_OPTION_NONE ¥ª¥×¥·¥ç¥ó¤Ê¤· + ONIG_OPTION_SINGLELINE '^' -> '\A', '$' -> '\z', '\Z' -> '\z' + ONIG_OPTION_MULTILINE '.'¤¬²þ¹Ô¤Ë¥Þ¥Ã¥Á¤¹¤ë + ONIG_OPTION_IGNORECASE Û£Ëæ¥Þ¥Ã¥Á ¥ª¥ó + ONIG_OPTION_EXTEND ¥Ñ¥¿¡¼¥ó³ÈÄ¥·Á¼° + ONIG_OPTION_FIND_LONGEST ºÇĹ¥Þ¥Ã¥Á + ONIG_OPTION_FIND_NOT_EMPTY ¶õ¥Þ¥Ã¥Á¤ò̵»ë + ONIG_OPTION_NEGATE_SINGLELINE + ONIG_SYNTAX_POSIX_BASIC, ONIG_SYNTAX_POSIX_EXTENDED, + ONIG_SYNTAX_PERL, ONIG_SYNTAX_PERL_NG, ONIG_SYNTAX_JAVA¤Ç + ¥Ç¥Õ¥©¥ë¥È¤Ç͸ú¤ÊONIG_OPTION_SINGLELINE¤ò¥¯¥ê¥¢¤¹¤ë¡£ + + ONIG_OPTION_DONT_CAPTURE_GROUP ̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤Î¤ßÊá³Í + ONIG_OPTION_CAPTURE_GROUP ̾Á°Ìµ¤·Êá³Í¼°½¸¹ç¤âÊá³Í + + 5 enc: ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + + ONIG_ENCODING_ASCII ASCII + ONIG_ENCODING_ISO_8859_1 ISO 8859-1 + ONIG_ENCODING_ISO_8859_2 ISO 8859-2 + ONIG_ENCODING_ISO_8859_3 ISO 8859-3 + ONIG_ENCODING_ISO_8859_4 ISO 8859-4 + ONIG_ENCODING_ISO_8859_5 ISO 8859-5 + ONIG_ENCODING_ISO_8859_6 ISO 8859-6 + ONIG_ENCODING_ISO_8859_7 ISO 8859-7 + ONIG_ENCODING_ISO_8859_8 ISO 8859-8 + ONIG_ENCODING_ISO_8859_9 ISO 8859-9 + ONIG_ENCODING_ISO_8859_10 ISO 8859-10 + ONIG_ENCODING_ISO_8859_11 ISO 8859-11 + ONIG_ENCODING_ISO_8859_13 ISO 8859-13 + ONIG_ENCODING_ISO_8859_14 ISO 8859-14 + ONIG_ENCODING_ISO_8859_15 ISO 8859-15 + ONIG_ENCODING_ISO_8859_16 ISO 8859-16 + ONIG_ENCODING_UTF8 UTF-8 + ONIG_ENCODING_UTF16_BE UTF-16BE + ONIG_ENCODING_UTF16_LE UTF-16LE + ONIG_ENCODING_UTF32_BE UTF-32BE + ONIG_ENCODING_UTF32_LE UTF-32LE + ONIG_ENCODING_EUC_JP EUC-JP + 
ONIG_ENCODING_EUC_TW EUC-TW + ONIG_ENCODING_EUC_KR EUC-KR + ONIG_ENCODING_EUC_CN EUC-CN + ONIG_ENCODING_SJIS Shift_JIS + ONIG_ENCODING_KOI8 KOI8 + ONIG_ENCODING_KOI8_R KOI8-R + ONIG_ENCODING_BIG5 Big5 + ONIG_ENCODING_GB18030 GB 18030 + + ¤Þ¤¿¤Ï¡¢¥æ¡¼¥¶¤¬ÄêµÁ¤·¤¿OnigEncodingType¥Ç¡¼¥¿¤Î¥¢¥É¥ì¥¹ + + 6 syntax: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡÄêµÁ + + ONIG_SYNTAX_ASIS plain text + ONIG_SYNTAX_POSIX_BASIC POSIX Basic RE + ONIG_SYNTAX_POSIX_EXTENDED POSIX Extended RE + ONIG_SYNTAX_EMACS Emacs + ONIG_SYNTAX_GREP grep + ONIG_SYNTAX_GNU_REGEX GNU regex + ONIG_SYNTAX_JAVA Java (Sun java.util.regex) + ONIG_SYNTAX_PERL Perl + ONIG_SYNTAX_PERL_NG Perl + ̾Á°ÉÕ¤Êá³Í¼°½¸¹ç + ONIG_SYNTAX_RUBY Ruby + ONIG_SYNTAX_DEFAULT default (== Ruby) + onig_set_default_syntax() + + ¤Þ¤¿¤Ï¡¢¥æ¡¼¥¶¤¬ÄêµÁ¤·¤¿OnigSyntaxType¥Ç¡¼¥¿¤Î¥¢¥É¥ì¥¹ + + 7 err_info: ¥¨¥é¡¼¾ðÊó¤òÊÖ¤¹¤¿¤á¤Î¥¢¥É¥ì¥¹ + onig_error_code_to_str()¤Î»°ÈÖÌܤΰú¿ô¤È¤·¤Æ»ÈÍѤ¹¤ë + + +# int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, + OnigCompileInfo* ci, OnigErrorInfo* einfo) + + Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È(regex)¤òºîÀ®¤¹¤ë¡£ + ¤³¤Î´Ø¿ô¤Ï¡¢onig_new()¤Î¥Ç¥é¥Ã¥¯¥¹ÈÇ¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ONIG_NORMAL + + °ú¿ô + 1 reg: ºîÀ®¤µ¤ì¤¿Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤òÊÖ¤¹¥¢¥É¥ì¥¹ + 2 pattern: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸ»úÎó + 3 pattern_end: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸ»úÎó¤Î½ªÃ¼¥¢¥É¥ì¥¹(pattern + pattern length) + 4 ci: ¥³¥ó¥Ñ¥¤¥ë¾ðÊó + + ci->num_of_elements: ci¤ÎÍ×ÁÇ¿ô (¸½ºß¤ÎÈǤǤÏ: 5) + ci->pattern_enc: ¥Ñ¥¿¡¼¥óʸ»úÎó¤Îʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + ci->target_enc: ÂоÝʸ»úÎó¤Îʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + ci->syntax: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡÄêµÁ + ci->option: Àµµ¬É½¸½¥³¥ó¥Ñ¥¤¥ë»þ¥ª¥×¥·¥ç¥ó + ci->ambig_flag: ONIG_OPTION_IGNORECASE¥â¡¼¥É¤Ç¤Î + ʸ»úÛ£Ëæ¥Þ¥Ã¥Á»ØÄê¥Ó¥Ã¥È¥Õ¥é¥° + + ONIGENC_AMBIGUOUS_MATCH_NONE: Û£ËæÌµ¤· + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE: ASCII¤ÎÂçʸ»ú¾®Ê¸»ú + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE: ASCII°Ê³°¤ÎÂçʸ»ú¾®Ê¸»ú + ONIGENC_AMBIGUOUS_MATCH_COMPOUND: ¹çÀ®Ê¸»ú + ONIGENC_AMBIGUOUS_MATCH_FULL: Á´¤Æ¤ÎÛ£Ëæ¥Õ¥é¥°Í¸ú + ONIGENC_AMBIGUOUS_MATCH_DEFAULT: (ASCII | NONASCII) + 
onig_set_default_ambig_flag() + + 5 err_info: ¥¨¥é¡¼¾ðÊó¤òÊÖ¤¹¤¿¤á¤Î¥¢¥É¥ì¥¹ + onig_error_code_to_str()¤Î»°ÈÖÌܤΰú¿ô¤È¤·¤Æ»ÈÍѤ¹¤ë + + + °Û¤Ê¤ëʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°¤ÎÁȤ߹ç¤ï¤»¤Ï¡¢°Ê²¼¤Î¾ì¹ç¤Ë¤Î¤ßµö¤µ¤ì¤ë¡£ + + pattern_enc: ASCII, ISO_8859_1 + target_enc: UTF16_BE, UTF16_LE, UTF32_BE, UTF32_LE + + pattern_enc: UTF16_BE/LE + target_enc: UTF16_LE/BE + + pattern_enc: UTF32_BE/LE + target_enc: UTF32_LE/BE + + +# void onig_free(regex_t* reg) + + Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤Î¥á¥â¥ê¤ò²òÊü¤¹¤ë¡£ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + + + +# int onig_search(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, + const UChar* range, OnigRegion* region, OnigOptionType option) + + Àµµ¬É½¸½¤Çʸ»úÎó¤ò¸¡º÷¤·¡¢¸¡º÷·ë²Ì¤È¥Þ¥Ã¥ÁÎΰè¤òÊÖ¤¹¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ¥Þ¥Ã¥Á°ÌÃÖ (p - str >= 0) + ¸¡º÷¼ºÇÔ: ONIG_MISMATCH (< 0) + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + 2 str: ¸¡º÷ÂоÝʸ»úÎó + 3 end: ¸¡º÷ÂоÝʸ»úÎó¤Î½ªÃ¼¥¢¥É¥ì¥¹ + 4 start: ¸¡º÷ÂоÝʸ»úÎó¤Î¸¡º÷ÀèÆ¬°ÌÃÖ³«»Ï¥¢¥É¥ì¥¹ + 5 range: ¸¡º÷ÂоÝʸ»úÎó¤Î¸¡º÷ÀèÆ¬°ÌÃÖ½ªÃ¼¥¢¥É¥ì¥¹ + Á°Êýõº÷ (start <= õº÷¤µ¤ì¤ëʸ»úÎó¤ÎÀèÆ¬ < range) + ¸åÊýõº÷ (range <= õº÷¤µ¤ì¤ëʸ»úÎó¤ÎÀèÆ¬ <= start) + 6 region: ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region) (NULL¤âµö¤µ¤ì¤ë) + 7 option: ¸¡º÷»þ¥ª¥×¥·¥ç¥ó + + ONIG_OPTION_NOTBOL ʸ»úÎó¤ÎÀèÆ¬(str)¤ò¹ÔƬ¤È´ÇÐö¤µ¤Ê¤¤ + ONIG_OPTION_NOTEOL ʸ»úÎó¤Î½ªÃ¼(end)¤ò¹ÔËö¤È´ÇÐö¤µ¤Ê¤¤ + ONIG_OPTION_POSIX_REGION region°ú¿ô¤òPOSIX API¤Îregmatch_t[]¤Ë¤¹¤ë + + +# int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, + OnigRegion* region, OnigOptionType option) + + ʸ»úÎó¤Î»ØÄê°ÌÃ֤ǥޥåÁ¥ó¥°¤ò¹Ô¤¤¡¢·ë²Ì¤È¥Þ¥Ã¥ÁÎΰè¤òÊÖ¤¹¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ¥Þ¥Ã¥Á¤·¤¿¥Ð¥¤¥ÈĹ (>= 0) + not match: ONIG_MISMATCH ( < 0) + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + 2 str: ¸¡º÷ÂоÝʸ»úÎó + 3 end: ¸¡º÷ÂоÝʸ»úÎó¤Î½ªÃ¼¥¢¥É¥ì¥¹ + 4 at: ¸¡º÷ÂоÝʸ»úÎó¤Î¸¡º÷¥¢¥É¥ì¥¹ + 5 region: ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region) (NULL¤âµö¤µ¤ì¤ë) + 6 option: ¸¡º÷»þ¥ª¥×¥·¥ç¥ó + + ONIG_OPTION_NOTBOL ʸ»úÎó¤ÎÀèÆ¬(str)¤ò¹ÔƬ¤È´ÇÐö¤µ¤Ê¤¤ + ONIG_OPTION_NOTEOL ʸ»úÎó¤Î½ªÃ¼(end)¤ò¹ÔËö¤È´ÇÐö¤µ¤Ê¤¤ + ONIG_OPTION_POSIX_REGION 
region°ú¿ô¤òPOSIX API¤Îregmatch_t[]¤Ë¤¹¤ë + + +# OnigRegion* onig_region_new(void) + + ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region)¤òºîÀ®¤¹¤ë¡£ + + +# void onig_region_free(OnigRegion* region, int free_self) + + ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region)¤Ç»ÈÍѤµ¤ì¤Æ¤¤¤ë¥á¥â¥ê¤ò²òÊü¤¹¤ë¡£ + + °ú¿ô + 1 region: ¥Þ¥Ã¥ÁÎΰè¾ðÊ󥪥֥¸¥§¥¯¥È + 2 free_self: [1: region¼«¿È¤ò´Þ¤á¤ÆÁ´¤Æ²òÊü, 0: region¼«¿È¤Ï²òÊü¤·¤Ê¤¤] + + +# void onig_region_copy(OnigRegion* to, OnigRegion* from) + + ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region)¤òÊ£À½¤¹¤ë¡£ + + °ú¿ô + 1 to: ÂоÝÎΰè + 2 from: ¸µÎΰè + + +# void onig_region_clear(OnigRegion* region) + + ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region)¤ÎÃæÌ£¤ò¥¯¥ê¥¢¤¹¤ë¡£ + + °ú¿ô + 1 region: ÂоÝÎΰè + + +# int onig_region_resize(OnigRegion* region, int n) + + ¥Þ¥Ã¥ÁÎΰè¾ðÊó(region)¤ÎÊá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)¿ô¤òÊѹ¹¤¹¤ë¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ONIG_NORMAL + + °ú¿ô + 1 region: ÂоÝÎΰè + 2 n: ¿·¤·¤¤¥µ¥¤¥º + + +# int onig_name_to_group_numbers(regex_t* reg, const UChar* name, const UChar* name_end, + int** num_list) + + »ØÄꤷ¤¿Ì¾Á°¤ËÂФ¹¤ë̾Á°ÉÕ¤Êá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)¤Î + ¥°¥ë¡¼¥×ÈÖ¹æ¥ê¥¹¥È¤òÊÖ¤¹¡£ + ̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤Ï¡¢(?<name>....)¤Ë¤è¤Ã¤ÆÄêµÁ¤Ç¤¤ë¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: »ØÄꤵ¤ì¤¿Ì¾Á°¤ËÂФ¹¤ë¥°¥ë¡¼¥×¿ô + (Îã /(?<x>..)(?<x>..)/ ==> 2) + ̾Á°¤ËÂФ¹¤ë¥°¥ë¡¼¥×¤¬Â¸ºß¤·¤Ê¤¤: -1 + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + 2 name: Êá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)̾ + 3 name_end: Êá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)̾¤Î½ªÃ¼¥¢¥É¥ì¥¹ + 4 num_list: ÈÖ¹æ¥ê¥¹¥È¤òÊÖ¤¹¥¢¥É¥ì¥¹ + + +# int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, + OnigRegion *region) + + »ØÄꤵ¤ì¤¿Ì¾Á°¤Î¸åÊý»²¾È(\k<name>)¤ËÂФ¹¤ëÊá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)¤ÎÈÖ¹æ¤òÊÖ¤¹¡£ + ̾Á°¤ËÂФ·¤Æ¡¢Ê£¿ô¤Î¥Þ¥Ã¥ÁÎΰ褬͸ú¤Ç¤¢¤ì¤Ð¡¢¤½¤ÎÃæ¤ÎºÇÂç¤ÎÈÖ¹æ¤òÊÖ¤¹¡£ + ̾Á°¤ËÂФ¹¤ëÊá³Í¼°½¸¹ç¤¬°ì¸Ä¤·¤«¤Ê¤¤¤È¤¤Ë¤Ï¡¢Âбþ¤¹¤ë¥Þ¥Ã¥ÁÎΰ褬͸ú¤« + ¤É¤¦¤«¤Ë´Ø·¸¤Ê¤¯¡¢¤½¤ÎÈÖ¹æ¤òÊÖ¤¹¡£(½¾¤Ã¤Æ¡¢region¤Ë¤ÏNULL¤òÅϤ·¤Æ¤â¤è¤¤¡£) + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ÈÖ¹æ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + 2 name: Êá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)̾ + 3 name_end: Êá³Í¼°½¸¹ç(¥°¥ë¡¼¥×)̾¤Î½ªÃ¼¥¢¥É¥ì¥¹ + 4 region: search/match·ë²Ì¤Î¥Þ¥Ã¥ÁÎΰè + + +# int onig_foreach_name(regex_t* reg, + int 
(*func)(const UChar*, const UChar*, int,int*,regex_t*,void*), + void* arg) + + Á´¤Æ¤Î̾Á°¤ËÂФ·¤Æ¥³¡¼¥ë¥Ð¥Ã¥¯´Ø¿ô¸Æ¤Ó½Ð¤·¤ò¼Â¹Ô¤¹¤ë¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: 0 + ¥¨¥é¡¼: ¥³¡¼¥ë¥Ð¥Ã¥¯´Ø¿ô¤ÎÌá¤êÃÍ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + 2 func: ¥³¡¼¥ë¥Ð¥Ã¥¯´Ø¿ô + func(name, name_end, <number of groups>, <group number's list>, + reg, arg); + + func¤¬0°Ê³°¤ÎÃͤòÊÖ¤¹¤È¡¢¤½¤ì°Ê¹ß¤Î¥³¡¼¥ë¥Ð¥Ã¥¯¤Ï¹Ô¤Ê¤ï¤º¤Ë + ½ªÎ»¤¹¤ë¡£ + + 3 arg: func¤ËÂФ¹¤ëÄɲðú¿ô + + +# int onig_number_of_names(regex_t* reg) + + ¥Ñ¥¿¡¼¥óÃæ¤ÇÄêµÁ¤µ¤ì¤¿Ì¾Á°¤Î¿ô¤òÊÖ¤¹¡£ + °ì¸Ä¤Î̾Á°¤Î¿½ÅÄêµÁ¤Ï°ì¸Ä¤È´ÇÐö¤¹¡£ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + + +# OnigEncoding onig_get_encoding(regex_t* reg) +# OnigOptionType onig_get_options(regex_t* reg) +# OnigAmbigType onig_get_ambig_flag(regex_t* reg) +# OnigSyntaxType* onig_get_syntax(regex_t* reg) + + Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤ËÂФ·¤Æ¡¢Âбþ¤¹¤ëÃͤòÊÖ¤¹¡£ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + + +# int onig_number_of_captures(regex_t* reg) + + ¥Ñ¥¿¡¼¥óÃæ¤ÇÄêµÁ¤µ¤ì¤¿Êá³Í¥°¥ë¡¼¥×¤Î¿ô¤òÊÖ¤¹¡£ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + + +# int onig_number_of_capture_histories(regex_t* reg) + + ¥Ñ¥¿¡¼¥óÃæ¤ÇÄêµÁ¤µ¤ì¤¿Êá³ÍÍúÎò(?@...)¤Î¿ô¤òÊÖ¤¹¡£ + + »ÈÍѤ¹¤ëʸˡ¤ÇÊá³ÍÍúÎòµ¡Ç½¤¬Í¸ú(ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY) + ¤Ç¤Ê¤±¤ì¤Ð¡¢Êá³ÍÍúÎòµ¡Ç½¤Ï»ÈÍѤǤ¤Ê¤¤¡£ + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + + +# OnigCaptureTreeNode* onig_get_capture_tree(OnigRegion* region) + + Êá³ÍÍúÎò¥Ç¡¼¥¿¤Î¥ë¡¼¥È¥Î¡¼¥É¤òÊÖ¤¹¡£ + + ¥Þ¥Ã¥Á¤¬¼ºÇÔ¤·¤Æ¤¤¤ë¾ì¹ç¤Ë¤Ï¡¢¤³¤ÎÃͤÏÉÔÄê¤Ç¤¢¤ë¡£ + + °ú¿ô + 1 region: ¥Þ¥Ã¥ÁÎΰè + + +# int onig_capture_tree_traverse(OnigRegion* region, int at, + int(*func)(int,int,int,int,int,void*), void* arg) + + Êá³ÍÍúÎò¥Ç¡¼¥¿ÌÚ¤ò½ä²ó¤·¤Æ¥³¡¼¥ë¥Ð¥Ã¥¯¤¹¤ë¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: 0 + ¥¨¥é¡¼: ¥³¡¼¥ë¥Ð¥Ã¥¯´Ø¿ô¤ÎÌá¤êÃÍ + + °ú¿ô + 1 region: ¥Þ¥Ã¥ÁÎΰè + 2 at: ¥³¡¼¥ë¥Ð¥Ã¥¯¤ò¹Ô¤Ê¤¦¥¿¥¤¥ß¥ó¥° + + ONIG_TRAVERSE_CALLBACK_AT_FIRST: + ºÇ½é¤Ë¥³¡¼¥ë¥Ð¥Ã¥¯¤·¤Æ¡¢»Ò¥Î¡¼¥É¤ò½ä²ó + ONIG_TRAVERSE_CALLBACK_AT_LAST: + »Ò¥Î¡¼¥É¤ò½ä²ó¤·¤Æ¡¢¥³¡¼¥ë¥Ð¥Ã¥¯ + ONIG_TRAVERSE_CALLBACK_AT_BOTH: + 
ºÇ½é¤Ë¥³¡¼¥ë¥Ð¥Ã¥¯¤·¤Æ¡¢»Ò¥Î¡¼¥É¤ò½ä²ó¡¢ºÇ¸å¤Ë¤â¤¦°ìÅÙ¥³¡¼¥ë¥Ð¥Ã¥¯ + + 3 func: ¥³¡¼¥ë¥Ð¥Ã¥¯´Ø¿ô + func¤¬0°Ê³°¤ÎÃͤòÊÖ¤¹¤È¡¢¤½¤ì°Ê¹ß¤Î½ä²ó¤Ï¹Ô¤Ê¤ï¤º¤Ë + ½ªÎ»¤¹¤ë¡£ + + int func(int group, int beg, int end, int level, int at, + void* arg) + group: ¥°¥ë¡¼¥×ÈÖ¹æ + beg: ¥Þ¥Ã¥Á³«»Ï°ÌÃÖ + end ¥Þ¥Ã¥Á½ªÎ»°ÌÃÖ + level: ¥Í¥¹¥È¥ì¥Ù¥ë (0¤«¤é) + at: ¥³¡¼¥ë¥Ð¥Ã¥¯¤¬¸Æ¤Ó½Ð¤µ¤ì¤¿¥¿¥¤¥ß¥ó¥° + ONIG_TRAVERSE_CALLBACK_AT_FIRST + ONIG_TRAVERSE_CALLBACK_AT_LAST + arg: Äɲðú¿ô + + 4 arg; func¤ËÂФ¹¤ëÄɲðú¿ô + + +# int onig_noname_group_capture_is_active(regex_t* reg) + + ̾Á°¤Ê¤·¼°½¸¹ç¤ÎÊá³Íµ¡Ç½¤¬Í¸ú¤«¤É¤¦¤«¤òÊÖ¤¹¡£ + + ͸ú: 1 + ̵¸ú: 0 + + °ú¿ô + 1 reg: Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + + + ¥ª¥×¥·¥ç¥ó¤ÎONIG_OPTION_DONT_CAPTURE_GROUP¤¬ON --> ̵¸ú + + ¥Ñ¥¿¡¼¥ó¤¬Ì¾Á°¤Ä¤¼°½¸¹ç¤ò»ÈÍѤ·¤Æ¤¤¤ë + AND »ÈÍÑʸˡ¤Ç¡¢ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP¤¬ON + AND ¥ª¥×¥·¥ç¥ó¤ÎONIG_OPTION_CAPTURE_GROUP¤¬OFF + --> ̵¸ú + + ¾åµ°Ê³°¤Î¾ì¹ç --> ͸ú + + +# UChar* onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s) + + ʸ»ú°ì¸ÄʬÁ°¤Îʸ»úÎó°ÌÃÖ¤òÊÖ¤¹¡£ + + °ú¿ô + 1 enc: ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + 2 start: ʸ»úÎó¤ÎÀèÆ¬¥¢¥É¥ì¥¹ + 3 s: ʸ»úÎóÃæ¤Î°ÌÃÖ + + +# UChar* onigenc_get_left_adjust_char_head(OnigEncoding enc, + const UChar* start, const UChar* s) + + ʸ»ú¤ÎÀèÆ¬¥Ð¥¤¥È°ÌÃ֤ˤʤë¤è¤¦¤Ëº¸Â¦¤ËÄ´À°¤·¤¿¥¢¥É¥ì¥¹¤òÊÖ¤¹¡£ + + °ú¿ô + 1 enc: ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + 2 start: ʸ»úÎó¤ÎÀèÆ¬¥¢¥É¥ì¥¹ + 3 s: ʸ»úÎóÃæ¤Î°ÌÃÖ + + +# UChar* onigenc_get_right_adjust_char_head(OnigEncoding enc, + const UChar* start, const UChar* s) + + ʸ»ú¤ÎÀèÆ¬¥Ð¥¤¥È°ÌÃ֤ˤʤë¤è¤¦¤Ë±¦Â¦¤ËÄ´À°¤·¤¿¥¢¥É¥ì¥¹¤òÊÖ¤¹¡£ + + °ú¿ô + 1 enc: ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + 2 start: ʸ»úÎó¤ÎÀèÆ¬¥¢¥É¥ì¥¹ + 3 s: ʸ»úÎóÃæ¤Î°ÌÃÖ + + +# int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) +# int onigenc_strlen_null(OnigEncoding enc, const UChar* s) + + ʸ»úÎó¤Îʸ»ú¿ô¤òÊÖ¤¹¡£ + + +# int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) + + ʸ»úÎó¤Î¥Ð¥¤¥È¿ô¤òÊÖ¤¹¡£ + + +# int onig_set_default_syntax(OnigSyntaxType* syntax) + + ¥Ç¥Õ¥©¥ë¥È¤ÎÀµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ¤ò¥»¥Ã¥È¤¹¤ë¡£ + + 
°ú¿ô + 1 syntax: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ + + +# void onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from) + + Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ¤ò¥³¥Ô¡¼¤¹¤ë¡£ + + °ú¿ô + 1 to: ÂÐ¾Ý + 2 from: ¸µ + + +# unsigned int onig_get_syntax_op(OnigSyntaxType* syntax) +# unsigned int onig_get_syntax_op2(OnigSyntaxType* syntax) +# unsigned int onig_get_syntax_behavior(OnigSyntaxType* syntax) +# OnigOptionType onig_get_syntax_options(OnigSyntaxType* syntax) + +# void onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op) +# void onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2) +# void onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior) +# void onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) + + Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ¤ÎÍ×ÁǤò»²¾È/¼èÆÀ¤¹¤ë¡£ + + °ú¿ô + 1 syntax: Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ + 2 op, op2, behavior, options: Í×ÁǤÎÃÍ + + +# void onig_copy_encoding(OnigEncoding to, OnigOnigEncoding from) + + ʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥°¤ò¥³¥Ô¡¼¤¹¤ë¡£ + + °ú¿ô + 1 to: ÂÐ¾Ý + 2 from: ¸µ + + +# int onig_set_meta_char(OnigEncoding enc, unsigned int what, + OnigCodePoint code) + + ¥á¥¿Ê¸»ú¤ò»ØÄꤷ¤¿¥³¡¼¥É¥Ý¥¤¥ó¥ÈÃͤ˥»¥Ã¥È¤¹¤ë¡£ + ONIG_SYN_OP_VARIABLE_META_CHARACTERS¤¬Àµµ¬É½¸½¥Ñ¥¿¡¼¥óʸˡ¤Ç͸ú¤Ë + ¤Ê¤Ã¤Æ¤¤¤Ê¤¤¾ì¹ç¤Ë¤Ï¡¢¥¨¥¹¥±¡¼¥×ʸ»ú¤ò½ü¤¤¤Æ¡¢¤³¤³¤Ç»ØÄꤷ¤¿¥á¥¿Ê¸»ú¤Ï + µ¡Ç½¤·¤Ê¤¤¡£(Áȹþ¤ß¤Îʸˡ¤Ç¤Ï͸ú¤Ë¤·¤Æ¤¤¤Ê¤¤¡£) + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ONIG_NORMAL + + °ú¿ô + 1 enc: ÂоÝʸ»ú¥¨¥ó¥³¡¼¥Ç¥£¥ó¥° + 2 what: ¥á¥¿Ê¸»úµ¡Ç½¤Î»ØÄê + + ONIG_META_CHAR_ESCAPE + ONIG_META_CHAR_ANYCHAR + ONIG_META_CHAR_ANYTIME + ONIG_META_CHAR_ZERO_OR_ONE_TIME + ONIG_META_CHAR_ONE_OR_MORE_TIME + ONIG_META_CHAR_ANYCHAR_ANYTIME + + 3 code: ¥á¥¿Ê¸»ú¤Î¥³¡¼¥É¥Ý¥¤¥ó¥È ¤Þ¤¿¤Ï ONIG_INEFFECTIVE_META_CHAR. 
+ + +# OnigAmbigType onig_get_default_ambig_flag() + + ¥Ç¥Õ¥©¥ë¥È¤ÎÛ£Ëæ¥Þ¥Ã¥Á¥Õ¥é¥°¤ò¼èÆÀ¤¹¤ë¡£ + + +# int onig_set_default_ambig_flag(OnigAmbigType ambig_flag) + + ¥Ç¥Õ¥©¥ë¥È¤ÎÛ£Ëæ¥Þ¥Ã¥Á¥Õ¥é¥°¤ò¥»¥Ã¥È¤¹¤ë¡£ + + °ú¿ô + 1 ambig_flag: Û£Ëæ¥Þ¥Ã¥Á¥Õ¥é¥° + + +# unsigned int onig_get_match_stack_limit_size(void) + + ¥Þ¥Ã¥Á¥¹¥¿¥Ã¥¯¥µ¥¤¥º¤ÎºÇÂçÃͤòÊÖ¤¹¡£ + (¥Ç¥Õ¥©¥ë¥È: 0 == ̵À©¸Â) + + +# int onig_set_match_stack_limit_size(unsigned int size) + + ¥Þ¥Ã¥Á¥¹¥¿¥Ã¥¯¥µ¥¤¥º¤ÎºÇÂçÃͤò»ØÄꤹ¤ë¡£ + (size = 0: ̵À©¸Â) + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ONIG_NORMAL + + +# int onig_end(void) + + ¥é¥¤¥Ö¥é¥ê¤Î»ÈÍѤò½ªÎ»¤¹¤ë¡£ + + Àµ¾ï½ªÎ»Ìá¤êÃÍ: ONIG_NORMAL + + onig_init()¤òºÆÅٸƤӽФ·¤Æ¤â¡¢°ÊÁ°¤ËºîÀ®¤·¤¿Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È + ¤ò»ÈÍѤ¹¤ë¤³¤È¤Ï¤Ç¤¤Ê¤¤¡£ + + +# const char* onig_version(void) + + ¥Ð¡¼¥¸¥ç¥óʸ»úÎó¤òÊÖ¤¹¡£(Îã "2.2.8") + +// END diff --git a/ext/mbstring/oniguruma/doc/FAQ b/ext/mbstring/oniguruma/doc/FAQ new file mode 100644 index 000000000..1621a359e --- /dev/null +++ b/ext/mbstring/oniguruma/doc/FAQ @@ -0,0 +1,33 @@ +FAQ 2006/05/15 + +1. Lognest match + + You can execute longest match by using ONIG_OPTION_FIND_LONGEST option + in onig_new(). + + +2. Thread safe + + In order to make thread safe, which of (A) or (B) must be done. + + (A) Oniguruma Layer + + Define the macro below at NOT_RUBY case in oniguruma/regint.h. + + USE_MULTI_THREAD_SYSTEM + THREAD_ATOMIC_START + THREAD_ATOMIC_END + THREAD_PASS + + (B) Application Layer + + The plural threads should not do simultaneously that making + new regexp objects or re-compiling objects or freeing objects, + even if these objects are differ. + + +3. Mailing list + + There is no mailing list about Oniguruma. + +// END diff --git a/ext/mbstring/oniguruma/doc/FAQ.ja b/ext/mbstring/oniguruma/doc/FAQ.ja new file mode 100644 index 000000000..5f61b0955 --- /dev/null +++ b/ext/mbstring/oniguruma/doc/FAQ.ja @@ -0,0 +1,115 @@ +FAQ 2006/05/15 + +1. ºÇĹ¥Þ¥Ã¥Á + + onig_new()¤ÎÃæ¤Ç¡¢ONIG_OPTION_FIND_LONGEST¥ª¥×¥·¥ç¥ó + ¤ò»ÈÍѤ¹¤ì¤ÐºÇĹ¥Þ¥Ã¥Á¤Ë¤Ê¤ë¡£ + + +2. 
¥¹¥ì¥Ã¥É¥»¡¼¥Õ + + ¥¹¥ì¥Ã¥É¥»¡¼¥Õ¤Ë¤¹¤ë¤Ë¤Ï¡¢°Ê²¼¤Î(A)¤È(B)¤Î¤É¤Á¤é¤«¤ò¹Ô¤Ê¤¨¤Ð + ¤è¤¤¡£ + + (A) Oniguruma Layer + + oniguruma/regint.h¤ÎÃæ¤ÎNOT_RUBY¤ÎÉôʬ¤Î°Ê²¼¤Î¥Þ¥¯¥í¤òÄêµÁ¤¹¤ë¡£ + + USE_MULTI_THREAD_SYSTEM + THREAD_ATOMIC_START + THREAD_ATOMIC_END + THREAD_PASS + + (B) Application Layer + + Ʊ»þ¤ËÊ£¿ô¤Î¥¹¥ì¥Ã¥É¤¬¡¢Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤òºîÀ®¤¹¤ë¡¢ + ¤Þ¤¿¤Ï²òÊü¤¹¤ë¡¢¤³¤È¤ò¹Ô¤Ê¤Ã¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£ + ¤½¤ì¤é¤Î¥ª¥Ö¥¸¥§¥¯¥È¤¬Á´¤¯Ê̤Τâ¤Î¤Ç¤¢¤Ã¤Æ¤â¡£ + + ¤â¤¦¾¯¤·¾Ü¤·¤¤ÀâÌÀ¤Ï¡¢¤³¤Î¥É¥¥å¥á¥ó¥È¤ÎÃæ¤Î + "¥¹¥ì¥Ã¥É¥»¡¼¥Õ¤Ë´Ø¤¹¤ëÊäÂ"¤Ë½ñ¤¤¤Æ¤ª¤¤¤¿¡£ + + +3. ¥á¡¼¥ê¥ó¥°¥ê¥¹¥È + + µ´¼Ö¤Ë´Ø¤¹¤ë¥á¡¼¥ê¥ó¥°¥ê¥¹¥È¤Ï¸ºß¤·¤Ê¤¤¡£ + +//END + + + +¥¹¥ì¥Ã¥É¥»¡¼¥Õ¤Ë´Ø¤¹¤ëÊä + +¥¹¥ì¥Ã¥É¥»¡¼¥Õ¤Ë¤¹¤ë¤Ë¤Ï¡¢¸ÄÊ̤Υ¢¥×¥ê¥±¡¼¥·¥ç¥ó¤ÎÃæ¤Ç¹Ô¤¦¤«¡¢ +Oniguruma¥é¥¤¥Ö¥é¥ê¤ÎÃæ¤Ç¹Ô¤¦¤«¡¢¤É¤Á¤é¤«¤òÁª¤Ö¤³¤È¤¬¤Ç¤¤Þ¤¹¡£ +(Oniguruma¤ò»ÈÍѤ¹¤ë¦¤ÇÂн褹¤ë¤«¡¢Oniguruma¤ËÂн褵¤»¤ë¤« +¤É¤Á¤é¤«ÊÒÊý¤Ç¹Ô¤¦É¬Íפ¬¤¢¤ë¤È¤¤¤¦¤³¤È¤Ç¤¹¡£) + +¤³¤ì¤é¤ÎÊýË¡¤Ë¤Ä¤¤¤Æ¡¢°Ê²¼(A)¤È(B)¤ÇÀâÌÀ¤·¤Þ¤¹¡£ + +¥Þ¥ë¥Á¥¹¥ì¥Ã¥ÉAPI¤Ï¡¢¤½¤ì¤¾¤ì¤Î¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤Ë¤è¤Ã¤Æ¤â +°Û¤Ê¤ê¤Þ¤¹¤Î¤Ç¡¢°Ê²¼¤ÎÀâÌÀ¤ÎÃæ¤Ç¶ñÂÎŪ¤Ë²¿¤ò¸Æ¤Ö¤Î¤«¤ò +½ñ¤¯¤³¤È¤Ï̵Íý¤Ç¤¹¡£¼ÂºÝ¤Ë»ÈÍѤµ¤ì¤ë¥Þ¥ë¥Á¥¹¥ì¥Ã¥ÉAPI¤Ç¡¢ +Âбþ¤¹¤ëµ¡Ç½¤Î¤â¤Î¤ò»ØÄꤷ¤Æ¤¯¤À¤µ¤¤¡£ + +(A) Oniguruma¤ÎÃæ¤ÇÂбþ¤¹¤ë¾ì¹ç + +oniguruma/regint.h¤ÎÃæ¤ÎNOT_RUBY¤Ç°Ï¤Þ¤ì¤Æ¤¤¤ëÉôʬ¤ÎÃæ¤Ç +°Ê²¼¤Î¥Þ¥¯¥í¤òÄêµÁ¤·¤ÆºÆ¥³¥ó¥Ñ¥¤¥ë¤·¤Æ¤¯¤À¤µ¤¤¡£ + +USE_MULTI_THREAD_SYSTEM + + ñ¤Ë͸ú¤Ë¤¹¤ì¤Ð¤è¤¤¤Ç¤¹¡£ + +THREAD_ATOMIC_START +THREAD_ATOMIC_END + + THREAD_ATOMIC_START¤«¤éTHREAD_ATOMIC_END¤Ç°Ï¤Þ¤ì¤¿ + ¥×¥í¥°¥é¥à¤Î¥³¡¼¥ÉÉôʬ¤ò¤¢¤ë¥¹¥ì¥Ã¥É¤¬¼Â¹ÔÃæ¤Ë¡¢Â¾¤Î + ¥¹¥ì¥Ã¥É¤Ë¼Â¹Ô¸¢¤¬°Üư¤·¤Ê¤¤¤³¤È¤òÊݾ㤹¤ë¤â¤Î¤ËÄêµÁ + ¤·¤Æ¤¯¤À¤µ¤¤¡£ + (̾Á°¤ÎÄ̤ꡢ°Ï¤Þ¤ì¤¿¥³¡¼¥ÉÉôʬ¤ò¥¹¥ì¥Ã¥É¥¢¥È¥ß¥Ã¥¯¤Ë + ¤¹¤ë¤È¤¤¤¦°ÕÌ£) + +THREAD_PASS + + ¤³¤ì¤ò¼Â¹Ô¤·¤¿¥¹¥ì¥Ã¥É¤«¤é¡¢Â¾¤Î¥¹¥ì¥Ã¥É¤Ë¼Â¹Ô¸¢¤ò°Ñ¾ù + ¤¹¤ë¤â¤Î¤ËÄêµÁ¤ò¤·¤Æ¤¯¤À¤µ¤¤¡£(ºÆ¥¹¥±¥¸¥å¡¼¥ë¤ò¸Æ¤Ó½Ð¤¹ + ¤È¤¤¤¦°ÕÌ£) + Âбþ¤¹¤ëµ¡Ç½¤¬Á´¤¯¤Ê¤±¤ì¤Ð¡¢¶õÄêµÁ¤Ë¤·¤Æ¤¯¤À¤µ¤¤¡£ + +(»²¹ÍÎã) +Ruby¤Î¾ì¹ç¤òÎã¤Ë¤¹¤ë¤È¡¢ +Ruby¤Ï¼«Ê¬¼«¿È¤ÇÆÈ¼«¤Î¥¹¥ì¥Ã¥Éµ¡Ç½¤ò¼ÂÁõ¤·¤Æ¤¤¤Þ¤¹¡£ +¤½¤Îµ¡Ç½¤ò»ÈÍѤ¹¤ë¤È¡¢°Ê²¼¤Î¤è¤¦¤ËÄêµÁ¤¹¤ì¤Ð¤è¤¤¤³¤È¤Ë +¤Ê¤ê¤Þ¤¹¡£ + +#define 
USE_MULTI_THREAD_SYSTEM +#define THREAD_ATOMIC_START DEFER_INTS +#define THREAD_ATOMIC_END ENABLE_INTS +#define THREAD_PASS rb_thread_schedule() + +Ruby¤Î¾ì¹ç¡¢¥¿¥¤¥Þ³ä¤ê¹þ¤ß¤ò»ÈÍѤ·¤Æ¡¢¥¹¥ì¥Ã¥É¤ÎÀÚ¤êÂØ¤¨¤ò +¹Ô¤Ã¤Æ¤¤¤Þ¤¹¡£DEFER_INTS¤Ï³ä¤ê¹þ¤ß¥Ï¥ó¥É¥é¤Î¼Â¹Ô¤ò°ì»þŪ¤Ë +»ß¤á¤ë¤¿¤á¤Î¥Þ¥¯¥í¤Ç¤¹¡£ENABLE_INTS¥Þ¥¯¥í¤Ç³ä¤ê¹þ¤ß¥Ï¥ó¥É¥é +¤Î¼Â¹Ô¤òµö²Ä¤·¤Þ¤¹¡£ +¤³¤ì¤Ë¤è¤Ã¤Æ¡¢THREAD_ATOMIC_START¤«¤éTHREAD_ATOMIC_END +¤Ç°Ï¤Þ¤ì¤¿Éôʬ¤Î¼Â¹ÔÃæ¤Ë¡¢Â¾¤Î¥¹¥ì¥Ã¥É¤Ë¼Â¹Ô¸¢¤¬°Üư¤·¤Þ¤»¤ó¡£ + + +(B) ¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¤ÎÃæ¤ÇÂбþ¤¹¤ë¾ì¹ç + +°Ê²¼¤òÊݾ㤹¤ë¤è¤¦¤Ë¡¢¥¹¥ì¥Ã¥É¤Î¼Â¹Ô¤òÀ©¸æ¤·¤Æ¤¯¤À¤µ¤¤¡£ + +Ʊ»þ¤ËÊ£¿ô¤Î¥¹¥ì¥Ã¥É¤¬¡¢Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤òºîÀ®¤¹¤ë¡¢¤Þ¤¿¤Ï²òÊü¤¹¤ë¡¢¤³¤È¤ò +¹Ô¤Ê¤Ã¤Æ¤Ï¤Ê¤é¤Ê¤¤¡£¤½¤ì¤é¤Î¥ª¥Ö¥¸¥§¥¯¥È¤¬Á´¤¯Ê̤Τâ¤Î¤Ç¤¢¤Ã¤Æ¤â¡£ + +onig_new(), onig_new_deluxe(), onig_free()¤Î¤É¤ì¤«¤Î¸Æ¤Ó½Ð¤·¤ò¡¢ +Ê£¿ô¤Î¥¹¥ì¥Ã¥É¤¬Æ±»þ¤Ë¼Â¹Ô¤¹¤ë¤³¤È¤òÈò¤±¤Æ¤¯¤À¤µ¤¤¡£Æ±»þ¤Ç¤Ê¤±¤ì¤ÐÊ̤ˤ«¤Þ¤¤¤Þ¤»¤ó¡£ + +¤³¤ì¤Ï²¿¸ÎɬÍפʤΤ«¤È¤¤¤¦¤È¡¢Àµµ¬É½¸½¥ª¥Ö¥¸¥§¥¯¥È¤òºîÀ®¤¹¤ë +²áÄø¤Ç¡¢ÆâÉô¤Ç¶¦Ä̤˻²¾È¤¹¤ë¥Æ¡¼¥Ö¥ë¤¬¤¢¤ê¤Þ¤¹¡£ +¤³¤Î¥Æ¡¼¥Ö¥ë¤ËÂФ·¤Æ¤Î¥Ç¡¼¥¿ÅÐÏ¿½èÍý¤¬Ê£¿ô¤Î¥¹¥ì¥Ã¥É¤Ç¾×ÆÍ¤·¤Æ +°Û¾ï¤Ê¾õÂ֤ˤʤé¤Ê¤¤¤¿¤á¤ËɬÍפǤ¹¡£ + +// END diff --git a/ext/mbstring/oniguruma/doc/RE b/ext/mbstring/oniguruma/doc/RE new file mode 100644 index 000000000..5a2783d16 --- /dev/null +++ b/ext/mbstring/oniguruma/doc/RE @@ -0,0 +1,412 @@ +Oniguruma Regular Expressions Version 4.3.0 2006/08/17 + +syntax: ONIG_SYNTAX_RUBY (default) + + +1. Syntax elements + + \ escape (enable or disable meta character meaning) + | alternation + (...) group + [...] character class + + +2. 
Characters + + \t horizontal tab (0x09) + \v vertical tab (0x0B) + \n newline (0x0A) + \r return (0x0D) + \b back space (0x08) + \f form feed (0x0C) + \a bell (0x07) + \e escape (0x1B) + \nnn octal char (encoded byte value) + \xHH hexadecimal char (encoded byte value) + \x{7HHHHHHH} wide hexadecimal char (character code point value) + \cx control char (character code point value) + \C-x control char (character code point value) + \M-x meta (x|0x80) (character code point value) + \M-\C-x meta control char (character code point value) + + (* \b is effective in character class [...] only) + + +3. Character types + + . any character (except newline) + + \w word character + + Not Unicode: + alphanumeric, "_" and multibyte char. + + Unicode: + General_Category -- (Letter|Mark|Number|Connector_Punctuation) + + \W non word char + + \s whitespace char + + Not Unicode: + \t, \n, \v, \f, \r, \x20 + + Unicode: + 0009, 000A, 000B, 000C, 000D, 0085(NEL), + General_Category -- Line_Separator + -- Paragraph_Separator + -- Space_Separator + + \S non whitespace char + + \d decimal digit char + + Unicode: General_Category -- Decimal_Number + + \D non decimal digit char + + \h hexadecimal digit char [0-9a-fA-F] + + \H non hexadecimal digit char + + +4. Quantifier + + greedy + + ? 1 or 0 times + * 0 or more times + + 1 or more times + {n,m} at least n but not more than m times + {n,} at least n times + {,n} at least 0 but not more than n times ({0,n}) + {n} n times + + reluctant + + ?? 1 or 0 times + *? 0 or more times + +? 1 or more times + {n,m}? at least n but not more than m times + {n,}? at least n times + {,n}? at least 0 but not more than n times (== {0,n}?) + + possessive (greedy and does not backtrack after repeated) + + ?+ 1 or 0 times + *+ 0 or more times + ++ 1 or more times + + ({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only) + + ex. /a*+/ === /(?>a*)/ + + +5. 
Anchors + + ^ beginning of the line + $ end of the line + \b word boundary + \B not word boundary + \A beginning of string + \Z end of string, or before newline at the end + \z end of string + \G matching start position (*) + + * Ruby Regexp: + previous end-of-match position + (This specification is not related to this library.) + + +6. Character class + + ^... negative class (lowest precedence operator) + x-y range from x to y + [...] set (character class in character class) + ..&&.. intersection (low precedence at the next of ^) + + ex. [a-w&&[^c-g]z] ==> ([a-w] AND ([^c-g] OR z)) ==> [abh-w] + + * If you want to use '[', '-', ']' as a normal character + in a character class, you should escape these characters by '\'. + + + POSIX bracket ([:xxxxx:], negate [:^xxxxx:]) + + Not Unicode Case: + + alnum alphabet or digit char + alpha alphabet + ascii code value: [0 - 127] + blank \t, \x20 + cntrl + digit 0-9 + graph include all of multibyte encoded characters + lower + print include all of multibyte encoded characters + punct + space \t, \n, \v, \f, \r, \x20 + upper + xdigit 0-9, a-f, A-F + + + Unicode Case: + + alnum Letter | Mark | Decimal_Number + alpha Letter | Mark + ascii 0000 - 007F + blank Space_Separator | 0009 + cntrl Control | Format | Unassigned | Private_Use | Surrogate + digit Decimal_Number + graph [[:^space:]] && ^Control && ^Unassigned && ^Surrogate + lower Lowercase_Letter + print [[:graph:]] | [[:space:]] + punct Connector_Punctuation | Dash_Punctuation | Close_Punctuation | + Final_Punctuation | Initial_Punctuation | Other_Punctuation | + Open_Punctuation + space Space_Separator | Line_Separator | Paragraph_Separator | + 0009 | 000A | 000B | 000C | 000D | 0085 + upper Uppercase_Letter + xdigit 0030 - 0039 | 0041 - 0046 | 0061 - 0066 + (0-9, a-f, A-F) + + +7. Extended groups + + (?#...) comment + + (?imx-imx) option on/off + i: ignore case + m: multi-line (dot(.) 
match newline) + x: extended form + (?imx-imx:subexp) option on/off for subexp + + (?:subexp) not captured group + (subexp) captured group + + (?=subexp) look-ahead + (?!subexp) negative look-ahead + (?<=subexp) look-behind + (?<!subexp) negative look-behind + + Subexp of look-behind must be fixed character length. + But different character length is allowed in top level + alternatives only. + ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed. + + In negative-look-behind, captured group isn't allowed, + but shy group(?:) is allowed. + + (?>subexp) atomic group + don't backtrack in subexp. + + (?<name>subexp) define named group + (All characters of the name must be a word character. + And first character must not be a digit or upper case) + + Not only a name but a number is assigned like a captured + group. + + Assigning the same name as two or more subexps is allowed. + In this case, a subexp call can not be performed although + the back reference is possible. + + +8. Back reference + + \n back reference by group number (n >= 1) + \k<name> back reference by group name + + In the back reference by the multiplex definition name, + a subexp with a large number is referred to preferentially. + (When not matched, a group of the small number is referred to.) + + * Back reference by group number is forbidden if named group is defined + in the pattern and ONIG_OPTION_CAPTURE_GROUP is not set. + + + back reference with nest level + + (This function is disabled in Ruby 1.9.) + + \k<name+n> n: 0, 1, 2, ... + \k<name-n> n: 0, 1, 2, ... + + Designate relative nest level from back reference position. + + ex 1. + + /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer") + + ex 2. 
+ + r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED) + (?<element> \g<stag> \g<content>* \g<etag> ){0} + (?<stag> < \g<name> \s* > ){0} + (?<name> [a-zA-Z_:]+ ){0} + (?<content> [^<&]+ (\g<element> | [^<&]+)* ){0} + (?<etag> </ \k<name+1> >){0} + \g<element> + __REGEXP__ + + p r.match('<foo>f<bar>bbb</bar>f</foo>').captures + + + +9. Subexp call ("Tanaka Akira special") + + \g<name> call by group name + \g<n> call by group number (n >= 1) + + * left-most recursive call is not allowed. + ex. (?<name>a|\g<name>b) => error + (?<name>a|b\g<name>c) => OK + + * Call by group number is forbidden if named group is defined in the pattern + and ONIG_OPTION_CAPTURE_GROUP is not set. + + * If the option status of called group is different from calling position + then the group's option is effective. + + ex. (?-i:\g<name>)(?i:(?<name>a)){0} matches "A" + + +10. Captured group + + Behavior of the unnamed group (...) changes with the following conditions. + (But named group is not changed.) + + case 1. /.../ (named group is not used, no option) + + (...) is treated as a captured group. + + case 2. /.../g (named group is not used, 'g' option) + + (...) is treated as a non-captured group (?:...). + + case 3. /..(?<name>..)../ (named group is used, no option) + + (...) is treated as a non-captured group (?:...). + numbered-backref/call is not allowed. + + case 4. /..(?<name>..)../G (named group is used, 'G' option) + + (...) is treated as a captured group. + numbered-backref/call is allowed. + + where + g: ONIG_OPTION_DONT_CAPTURE_GROUP + G: ONIG_OPTION_CAPTURE_GROUP + + ('g' and 'G' options were discussed on the ruby-dev ML) + + These options are not implemented at the Ruby level. + + +----------------------------- +A-1. Syntax-dependent options + + + ONIG_SYNTAX_RUBY + (?m): dot(.) match newline + + + ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA + (?s): dot(.) match newline + (?m): ^ match after newline, $ match before newline + + +A-2. 
Original extensions + + + hexadecimal digit char type \h, \H + + named group (?<name>...) + + named backref \k<name> + + subexp call \g<name>, \g<group-num> + + +A-3. Features lacking compared with Perl 5.8.0 + + + [:word:] + + \N{name} + + \l,\u,\L,\U, \X, \C + + (?{code}) + + (??{code}) + + (?(condition)yes-pat|no-pat) + + * \Q...\E + This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA. + + * \p{property}, \P{property} + This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA. + Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower, + Print, Punct, Space, Upper, XDigit, ASCII are supported. + + Prefix 'Is' of property name is allowed in ONIG_SYNTAX_PERL only. + ex. \p{IsXDigit}. + + Negation operator of property is supported in ONIG_SYNTAX_PERL only. + \p{^...}, \P{^...} + + +A-4. Differences from Japanized GNU regex(version 0.12) of Ruby + + + add hexadecimal digit char type (\h, \H) + + add look-behind + (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern) + + add possessive quantifier. ?+, *+, ++ + + add operations in character class. [], && + ('[' must be escaped as a usual char in character class.) + + add named group and subexp call. + + octal or hexadecimal number sequence can be treated as + a multibyte code char in character class if multibyte encoding + is specified. + (ex. [\xa1\xa2], [\xa1\xa7-\xa4\xa1]) + + allow the range of single byte char and multibyte char in character + class. + ex. /[a-<<any EUC-JP character>>]/ in EUC-JP encoding. + + effect range of isolated option is to next ')'. + ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b). + + isolated option is not transparent to previous pattern. + ex. a(?i)* is a syntax error pattern. + + an incomplete left brace is allowed as a usual string. + ex. /{/, /({)/, /a{2,3/ etc... + + negative POSIX bracket [:^xxxx:] is supported. + + POSIX bracket [:ascii:] is added. + + repeat of look-ahead is not allowed. + ex. 
/(?=a)*/, /(?!b){5}/ + + Ignore case option is effective to numbered character. + ex. /\x61/i =~ "A" + + In the range quantifier, the number of the minimum is omissible. + /a{,n}/ == /a{0,n}/ + The simultanious abbreviation of the number of times of the minimum + and the maximum is not allowed. (/a{,}/) + + /a{n}?/ is not a non-greedy operator. + /a{n}?/ == /(?:a{n})?/ + + invalid back reference is checked and cause error. + /\1/, /(a)\2/ + + Zero-length match in infinite repeat stops the repeat, + then changes of the capture group status are checked as stop condition. + /(?:()|())*\1\2/ =~ "" + /(?:\1a|())*/ =~ "a" + + +A-5. Disabled functions by default syntax + + + capture history + + (?@...) and (?@<name>...) + + ex. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>] + + see sample/listcap.c file. + + +A-6. Problems + + + Invalid encoding byte sequence is not checked in UTF-8. + + * Invalid first byte is treated as a character. + /./u =~ "\xa3" + + * Incomplete byte sequence is not checked. + /\w+/ =~ "a\xf3\x8ec" + +// END diff --git a/ext/mbstring/oniguruma/doc/RE.ja b/ext/mbstring/oniguruma/doc/RE.ja new file mode 100644 index 000000000..51681715c --- /dev/null +++ b/ext/mbstring/oniguruma/doc/RE.ja @@ -0,0 +1,424 @@ +µ´¼Ö Àµµ¬É½¸½ Version 4.3.0 2006/08/17 + +»ÈÍÑʸˡ: ONIG_SYNTAX_RUBY (´ûÄêÃÍ) + + +1. ´ðËÜÍ×ÁÇ + + \ ÂàÈò½¤¾þ (¥¨¥¹¥±¡¼¥×) Àµµ¬É½¸½µ¹æ¤Î͸ú/̵¸ú¤ÎÀ©¸æ + | ÁªÂò»Ò + (...) ¼°½¸¹ç (¥°¥ë¡¼¥×) + [...] ʸ»ú½¸¹ç (ʸ»ú¥¯¥é¥¹) + + +2. ʸ»ú + + \t ¿åÊ¿¥¿¥Ö (0x09) + \v ¿âľ¥¿¥Ö (0x0B) + \n ²þ¹Ô (0x0A) + \r Éüµ¢ (0x0D) + \b ¸åÂà¶õÇò (0x08) + \f ²þÊÇ (0x0C) + \a ¾â (0x07) + \e ÂàÈò½¤¾þ (0x1B) + \nnn Ȭ¿Ê¿ôɽ¸½ É乿²½¥Ð¥¤¥ÈÃÍ(¤Î°ìÉô) + \xHH ½½Ï»¿Ê¿ôɽ¸½ É乿²½¥Ð¥¤¥ÈÃÍ(¤Î°ìÉô) + \x{7HHHHHHH} ³ÈÄ¥½½Ï»¿Ê¿ôɽ¸½ ¥³¡¼¥É¥Ý¥¤¥ó¥ÈÃÍ + \cx À©¸æÊ¸»úɽ¸½ ¥³¡¼¥É¥Ý¥¤¥ó¥ÈÃÍ + \C-x À©¸æÊ¸»úɽ¸½ ¥³¡¼¥É¥Ý¥¤¥ó¥ÈÃÍ + \M-x Ķ (x|0x80) ¥³¡¼¥É¥Ý¥¤¥ó¥ÈÃÍ + \M-\C-x Ķ + À©¸æÊ¸»úɽ¸½ ¥³¡¼¥É¥Ý¥¤¥ó¥ÈÃÍ + + ¢¨ \b¤Ï¡¢Ê¸»ú½¸¹çÆâ¤Ç¤Î¤ß͸ú + + +3. ʸ»ú¼ï + + . 
Ǥ°Õʸ»ú (²þ¹Ô¤ò½ü¤¯) + + \w ñ¸ì¹½À®Ê¸»ú + + Unicode°Ê³°¤Î¾ì¹ç: + ±Ñ¿ô»ú, "_" ¤ª¤è¤Ó ¿¥Ð¥¤¥Èʸ»ú¡£ + + Unicode¤Î¾ì¹ç: + General_Category -- (Letter|Mark|Number|Connector_Punctuation) + + \W Èóñ¸ì¹½À®Ê¸»ú + + \s ¶õÇòʸ»ú + + Unicode°Ê³°¤Î¾ì¹ç: + \t, \n, \v, \f, \r, \x20 + + Unicode¤Î¾ì¹ç: + 0009, 000A, 000B, 000C, 000D, 0085(NEL), + General_Category -- Line_Separator + -- Paragraph_Separator + -- Space_Separator + + \S Èó¶õÇòʸ»ú + + \d 10¿Ê¿ô»ú + + Unicode¤Î¾ì¹ç: General_Category -- Decimal_Number + + \D Èó10¿Ê¿ô»ú + + \h 16¿Ê¿ô»ú [0-9a-fA-F] + + \H Èó16¿Ê¿ô»ú + + + +4. ÎÌ»ØÄê»Ò + + ÍßÄ¥¤ê + + ? °ì²ó¤Þ¤¿¤ÏÎí²ó + * Îí²ó°Ê¾å + + °ì²ó°Ê¾å + {n,m} n²ó°Ê¾åm²ó°Ê²¼ + {n,} n²ó°Ê¾å + {,n} Îí²ó°Ê¾ån²ó°Ê²¼ ({0,n}) + {n} n²ó + + ̵Íß + + ?? °ì²ó¤Þ¤¿¤ÏÎí²ó + *? Îí²ó°Ê¾å + +? °ì²ó°Ê¾å + {n,m}? n²ó°Ê¾åm²ó°Ê²¼ + {n,}? n²ó°Ê¾å + {,n}? Îí²ó°Ê¾ån²ó°Ê²¼ (== {0,n}?) + + ¶¯Íß (ÍßÄ¥¤ê¤Ç¡¢·«¤êÊÖ¤·¤ËÀ®¸ù¤·¤¿¸å¤Ï²ó¿ô¤ò¸º¤é¤¹¤è¤¦¤Ê¸åÂàºÆ»î¹Ô¤ò¤·¤Ê¤¤) + + ?+ °ì²ó¤Þ¤¿¤ÏÎí²ó + *+ Îí²ó°Ê¾å + ++ °ì²ó°Ê¾å + + ({n,m}+, {n,}+, {n}+ ¤Ï¡¢ONIG_SYNTAX_JAVA¤Ç¤Î¤ß¶¯ÍߤʻØÄê»Ò) + + Îã. /a*+/ === /(?>a*)/ + + +5. ÉÅ + + ^ ¹ÔƬ + $ ¹ÔËö + \b ñ¸ì¶³¦ + \B Èóñ¸ì¶³¦ + \A ʸ»úÎóÀèÆ¬ + \Z ʸ»úÎóËöÈø¡¢¤Þ¤¿¤Ïʸ»úÎóËöÈø¤Î²þ¹Ô¤ÎľÁ° + \z ʸ»úÎóËöÈø + \G ¾È¹ç³«»Ï°ÌÃÖ(*) + + * Ruby Regexp: + Á°²ó¾È¹çÀ®¸ùËöÈø°ÌÃÖ + (¤³¤Î»ÅÍͤÏRuby¤Î¼ÂÁõ¤Ë´Ø¤¹¤ë¤â¤Î¤Ç¤¢¤ê¡¢ + Àµµ¬É½¸½¥é¥¤¥Ö¥é¥ê¤È¤Ï̵´Ø·¸) + + +6. ʸ»ú½¸¹ç + + ^... ÈÝÄê (ºÇÄãÍ¥ÀèÅٱ黻»Ò) + x-y ÈÏ°Ï (x¤«¤éy¤Þ¤Ç) + [...] ½¸¹ç (ʸ»ú½¸¹çÆâʸ»ú½¸¹ç) + ..&&.. Àѱ黻 (^¤Î¼¡¤ËÍ¥ÀèÅÙ¤¬Ä㤤±é»»»Ò) + + Îã. 
[a-w&&[^c-g]z] ==> ([a-w] and ([^c-g] or z)) ==> [abh-w] + + ¢¨ '[', '-', ']'¤ò¡¢Ê¸»ú½¸¹çÆâ¤ÇÄ̾ïʸ»ú¤Î°ÕÌ£¤Ç»ÈÍѤ·¤¿¤¤¾ì¹ç¤Ë¤Ï¡¢ + ¤³¤ì¤é¤Îʸ»ú¤ò'\'¤ÇÂàÈò½¤¾þ¤·¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ + + + POSIX¥Ö¥é¥±¥Ã¥È ([:xxxxx:], ÈÝÄê [:^xxxxx:]) + + Unicode°Ê³°¤Î¾ì¹ç: + + alnum ±Ñ¿ô»ú + alpha ±Ñ»ú + ascii 0 - 127 + blank \t, \x20 + cntrl + digit 0-9 + graph ¿¥Ð¥¤¥Èʸ»úÁ´Éô¤ò´Þ¤à + lower + print ¿¥Ð¥¤¥Èʸ»úÁ´Éô¤ò´Þ¤à + punct + space \t, \n, \v, \f, \r, \x20 + upper + xdigit 0-9, a-f, A-F + + Unicode¤Î¾ì¹ç: + + alnum Letter | Mark | Decimal_Number + alpha Letter | Mark + ascii 0000 - 007F + blank Space_Separator | 0009 + cntrl Control | Format | Unassigned | Private_Use | Surrogate + digit Decimal_Number + graph [[:^space:]] && ^Control && ^Unassigned && ^Surrogate + lower Lowercase_Letter + print [[:graph:]] | [[:space:]] + punct Connector_Punctuation | Dash_Punctuation | Close_Punctuation | + Final_Punctuation | Initial_Punctuation | Other_Punctuation | + Open_Punctuation + space Space_Separator | Line_Separator | Paragraph_Separator | + 0009 | 000A | 000B | 000C | 000D | 0085 + upper Uppercase_Letter + xdigit 0030 - 0039 | 0041 - 0046 | 0061 - 0066 + (0-9, a-f, A-F) + + +7. ³ÈÄ¥¼°½¸¹ç + + (?#...) Ãí¼á + (?imx-imx) ¸ÉΩ¥ª¥×¥·¥ç¥ó + i: Âçʸ»ú¾®Ê¸»ú¾È¹ç + m: Ê£¿ô¹Ô + x: ³ÈÄ¥·Á¼° + (?imx-imx:¼°) ¼°¥ª¥×¥·¥ç¥ó + + (¼°) Êá³Í¼°½¸¹ç + (?:¼°) ÈóÊá³Í¼°½¸¹ç + + (?=¼°) ÀèÆÉ¤ß + (?!¼°) ÈÝÄêÀèÆÉ¤ß + (?<=¼°) Ìá¤êÆÉ¤ß + (?<!¼°) ÈÝÄêÌá¤êÆÉ¤ß + + Ìá¤êÆÉ¤ß¤Î¼°¤Ï¸ÇÄêʸ»úĹ¤Ç¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ + ¤·¤«¤·¡¢ºÇ¾å°Ì¤ÎÁªÂò»Ò¤À¤±¤Ï°Û¤Ê¤Ã¤¿Ê¸»úŤ¬µö¤µ¤ì¤ë¡£ + Îã. (?<=a|bc) ¤Ïµö²Ä. (?<=aaa(?:b|cd)) ¤ÏÉÔµö²Ä + + ÈÝÄêÌá¤êÆÉ¤ß¤Ç¤Ï¡¢Êá³Í¼°½¸¹ç¤Ïµö¤µ¤ì¤Ê¤¤¤¬¡¢ + ÈóÊá³Í¼°½¸¹ç¤Ïµö¤µ¤ì¤ë¡£ + + (?>¼°) ¸¶»ÒŪ¼°½¸¹ç + ¼°Á´ÂΤòÄ̲ᤷ¤¿¤È¤¡¢¼°¤ÎÃæ¤Ç¤Î¸åÂàºÆ»î¹Ô¤ò¹Ô¤Ê¤ï¤Ê¤¤ + + (?<name>¼°) ̾Á°ÉÕ¤Êá³Í¼°½¸¹ç + ¼°½¸¹ç¤Ë̾Á°¤ò³ä¤êÅö¤Æ¤ë(ÄêµÁ¤¹¤ë)¡£ + (̾Á°¤Ïñ¸ì¹½À®Ê¸»ú¤Ç¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡£ºÇ½é¤Îʸ»ú¤Ï + ±ÑÂçʸ»ú¤Ç¤¢¤Ã¤Æ¤Ï¤¤¤±¤Ê¤¤¡£) + + ̾Á°¤À¤±¤Ç¤Ê¤¯¡¢Êá³Í¼°½¸¹ç¤ÈƱÍͤËÈÖ¹æ¤â³ä¤êÅö¤Æ¤é¤ì¤ë¡£ + ÈÖ¹æ»ØÄ꤬¶Ø»ß¤µ¤ì¤Æ¤¤¤Ê¤¤¾õÂÖ (10. 
Êá³Í¼°½¸¹ç ¤ò»²¾È) + ¤Î¤È¤¤Ï¡¢Ì¾Á°¤ò»È¤ï¤Ê¤¤¤ÇÈÖ¹æ¤Ç¤â»²¾È¤Ç¤¤ë¡£ + + Ê£¿ô¤Î¼°½¸¹ç¤ËƱ¤¸Ì¾Á°¤òÍ¿¤¨¤ë¤³¤È¤Ïµö¤µ¤ì¤Æ¤¤¤ë¡£ + ¤³¤Î¾ì¹ç¤Ë¤Ï¡¢¤³¤Î̾Á°¤ò»ÈÍѤ·¤¿¸åÊý»²¾È¤Ï²Äǽ¤Ç¤¢¤ë¤¬¡¢ + Éôʬ¼°¸Æ½Ð¤·¤Ï¤Ç¤¤Ê¤¤¡£ + + +8. ¸åÊý»²¾È + + \n ÈÖ¹æ»ØÄ껲¾È (n >= 1) + \k<name> ̾Á°»ØÄ껲¾È + + ̾Á°»ØÄ껲¾È¤Ç¡¢¤½¤Î̾Á°¤¬Ê£¿ô¤Î¼°½¸¹ç¤Ç¿½ÅÄêµÁ¤µ¤ì¤Æ¤¤¤ë¾ì¹ç¤Ë¤Ï¡¢ + ÈÖ¹æ¤ÎÂ礤¤¼°½¸¹ç¤«¤éÍ¥ÀèŪ¤Ë»²¾È¤µ¤ì¤ë¡£ + (¥Þ¥Ã¥Á¤·¤Ê¤¤¤È¤¤Ë¤ÏÈÖ¹æ¤Î¾®¤µ¤¤¼°½¸¹ç¤¬»²¾È¤µ¤ì¤ë) + + ¢¨ ÈÖ¹æ»ØÄ껲¾È¤Ï¡¢Ì¾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤¬ÄêµÁ¤µ¤ì¡¢ + ¤«¤Ä ONIG_OPTION_CAPTURE_GROUP¤¬»ØÄꤵ¤ì¤Æ¤¤¤Ê¤¤¾ì¹ç¤Ë¤Ï¡¢ + ¶Ø»ß¤µ¤ì¤ë¡£(10. Êá³Í¼°½¸¹ç ¤ò»²¾È) + + + ¥Í¥¹¥È¥ì¥Ù¥ëÉÕ¤¸åÊý»²¾È + + ¤³¤Îµ¡Ç½¤Ï¸½ºß¡¢Ruby 1.9¤Ç¤Ï̵¸ú¤Ë¤·¤Æ¤¤¤ë¡£ + + \k<name+n> n: 0, 1, 2, ... + \k<name-n> n: 0, 1, 2, ... + + ¸åÊý»²¾È¤Î°ÌÃÖ¤«¤éÁêÂÐŪ¤ÊÉôʬ¼°¸Æ½Ð¤·¥Í¥¹¥È¥ì¥Ù¥ë¤ò»ØÄꤷ¤Æ¡¢¤½¤Î¥ì¥Ù¥ë¤Ç¤Î + Êá³ÍÃͤò»²¾È¤¹¤ë¡£ + + Îã-1. + + /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer") + + Îã-2. + + r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED) + (?<element> \g<stag> \g<content>* \g<etag> ){0} + (?<stag> < \g<name> \s* > ){0} + (?<name> [a-zA-Z_:]+ ){0} + (?<content> [^<&]+ (\g<element> | [^<&]+)* ){0} + (?<etag> </ \k<name+1> >){0} + \g<element> + __REGEXP__ + + p r.match('<foo>f<bar>bbb</bar>f</foo>').captures + + + +9. Éôʬ¼°¸Æ½Ð¤· ("ÅÄÃæÅ¯¥¹¥Ú¥·¥ã¥ë") + + \g<name> ̾Á°»ØÄê¸Æ½Ð¤· + \g<n> ÈÖ¹æ»ØÄê¸Æ½Ð¤· (n >= 1) + + ¢¨ ºÇº¸°ÌÃ֤ǤκƵ¢¸Æ½Ð¤·¤Ï¶Ø»ß¤µ¤ì¤ë¡£ + Îã. (?<name>a|\g<name>b) => error + (?<name>a|b\g<name>c) => OK + + ¢¨ ÈÖ¹æ»ØÄê¸Æ½Ð¤·¤Ï¡¢Ì¾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤¬ÄêµÁ¤µ¤ì¡¢ + ¤«¤Ä ONIG_OPTION_CAPTURE_GROUP¤¬»ØÄꤵ¤ì¤Æ¤¤¤Ê¤¤¾ì¹ç¤Ë¤Ï¡¢ + ¶Ø»ß¤µ¤ì¤ë¡£ (10. Êá³Í¼°½¸¹ç ¤ò»²¾È) + + ¢¨ ¸Æ¤Ó½Ð¤µ¤ì¤¿¼°½¸¹ç¤Î¥ª¥×¥·¥ç¥ó¾õÂÖ¤¬¸Æ½Ð¤·Â¦¤Î¥ª¥×¥·¥ç¥ó¾õÂ֤ȰۤʤäƤ¤¤ë + ¤È¤¡¢¸Æ¤Ó½Ð¤µ¤ì¤¿Â¦¤Î¥ª¥×¥·¥ç¥ó¾õÂÖ¤¬Í¸ú¤Ç¤¢¤ë¡£ + + Îã. (?-i:\g<name>)(?i:(?<name>a)){0} ¤Ï "A" ¤Ë¾È¹çÀ®¸ù¤¹¤ë¡£ + + +10. Êá³Í¼°½¸¹ç + + Êá³Í¼°½¸¹ç(...)¤Ï¡¢°Ê²¼¤Î¾ò·ï¤Ë±þ¤¸¤Æ¿¶Éñ¤¬ÊѲ½¤¹¤ë¡£ + (̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤ÏÊѲ½¤·¤Ê¤¤) + + case 1. /.../ (̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤ÏÉÔ»ÈÍÑ¡¢¥ª¥×¥·¥ç¥ó¤Ê¤·) + + (...) 
¤Ï¡¢Êá³Í¼°½¸¹ç¤È¤·¤Æ°·¤ï¤ì¤ë¡£ + + case 2. /.../g (̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤ÏÉÔ»ÈÍÑ¡¢¥ª¥×¥·¥ç¥ó 'g'¤ò»ØÄê) + + (...) ¤Ï¡¢ÈóÊá³Í¼°½¸¹ç¤È¤·¤Æ°·¤ï¤ì¤ë¡£ + + case 3. /..(?<name>..)../ (̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤Ï»ÈÍÑ¡¢¥ª¥×¥·¥ç¥ó¤Ê¤·) + + (...) ¤Ï¡¢ÈóÊá³Í¼°½¸¹ç¤È¤·¤Æ°·¤ï¤ì¤ë¡£ + ÈÖ¹æ»ØÄ껲¾È/¸Æ¤Ó½Ð¤·¤ÏÉÔµö²Ä¡£ + + case 4. /..(?<name>..)../G (̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤Ï»ÈÍÑ¡¢¥ª¥×¥·¥ç¥ó 'G'¤ò»ØÄê) + + (...) ¤Ï¡¢Êá³Í¼°½¸¹ç¤È¤·¤Æ°·¤ï¤ì¤ë¡£ + ÈÖ¹æ»ØÄ껲¾È/¸Æ¤Ó½Ð¤·¤Ïµö²Ä¡£ + + ⤷ + g: ONIG_OPTION_DONT_CAPTURE_GROUP + G: ONIG_OPTION_CAPTURE_GROUP + ('g'¤È'G'¥ª¥×¥·¥ç¥ó¤Ï¡¢ruby-dev ML¤ÇµÄÏÀ¤µ¤ì¤¿¡£) + + ¤³¤ì¤é¤Î¿¶Éñ¤Î°ÕÌ£¤Ï¡¢ + ̾Á°ÉÕ¤Êá³Í¤È̾Á°Ìµ¤·Êá³Í¤òƱ»þ¤Ë»ÈÍѤ¹¤ëɬÁ³À¤Î¤¢¤ë¾ìÌ̤Ͼ¯¤Ê¤¤¤Ç¤¢¤í¤¦ + ¤È¤¤¤¦Íýͳ¤«¤é¹Í¤¨¤é¤ì¤¿¤â¤Î¤Ç¤¢¤ë¡£ + ¤³¤ì¤é¤Î¥ª¥×¥·¥ç¥ó¤Ë¤Ä¤¤¤Æ¤Ï¡¢Ruby¤Ç¤Ï¸½ºß¼ÂÁõ¤µ¤ì¤Æ¤¤¤Ê¤¤¡£ + + +----------------------------- +Êäµ 1. ʸˡ°Í¸¥ª¥×¥·¥ç¥ó + + + ONIG_SYNTAX_RUBY + (?m): ½ª»ßÉäµ¹æ(.)¤Ï²þ¹Ô¤È¾È¹çÀ®¸ù + + + ONIG_SYNTAX_PERL ¤È ONIG_SYNTAX_JAVA + (?s): ½ª»ßÉäµ¹æ(.)¤Ï²þ¹Ô¤È¾È¹çÀ®¸ù + (?m): ^ ¤Ï²þ¹Ô¤Îľ¸å¤Ë¾È¹ç¤¹¤ë¡¢$ ¤Ï²þ¹Ô¤ÎľÁ°¤Ë¾È¹ç¤¹¤ë + + +Êäµ 2. ÆÈ¼«³ÈÄ¥µ¡Ç½ + + + 16¿Ê¿ô¿ô»ú¡¢Èó16¿Ê¿ô»ú \h, \H + + ̾Á°ÉÕ¤Êá³Í¼°½¸¹ç (?<name>...) + + ̾Á°»ØÄê¸åÊý»²¾È \k<name> + + Éôʬ¼°¸Æ½Ð¤· \g<name>, \g<group-num> + + +Êäµ 3. Perl 5.8.0¤ÈÈæ³Ó¤·¤ÆÂ¸ºß¤·¤Ê¤¤µ¡Ç½ + + + [:word:] + + \N{name} + + \l,\u,\L,\U, \X, \C + + (?{code}) + + (??{code}) + + (?(condition)yes-pat|no-pat) + + * \Q...\E + ⤷ONIG_SYNTAX_PERL¤ÈONIG_SYNTAX_JAVA¤Ç¤Ï͸ú + + * \p{property}, \P{property} + ⤷ONIG_SYNTAX_PERL¤ÈONIG_SYNTAX_JAVA¤Ç¤Ï͸ú + Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower, + Print, Punct, Space, Upper, XDigit, ASCII¤¬»ØÄê¤Ç¤¤ë¡£ + + ÆÃÀ̾¤ÎÁ°¤Ë 'Is'Á°ÃÖ»ì¤ò»ÈÍѤ¹¤ë¤³¤È¤Ï¡¢ONIG_SYNTAX_PERL¤Ç¤Î¤ß + µö¤µ¤ì¤Æ¤¤¤ë¡£ + ex. \p{IsXDigit}. + + ÆÃÀ¤ÎÈÝÄê±é»»»Ò¤Ï¡¢ONIG_SYNTAX_PERL¤Ç¤Î¤ßµö¤µ¤ì¤Æ¤¤¤ë¡£ + \p{^...}, \P{^...} + + +Êäµ 4. 
Ruby¤ÎÆüËܸ첽 GNU regex(version 0.12)¤È¤Î°ã¤¤ + + + 16¿Ê¿ô»ú¥¿¥¤¥×Äɲà (\h, \H) + + Ìá¤êÆÉ¤ßµ¡Ç½¤òÄɲà + + ¶¯Íߤʷ«¤êÊÖ¤·»ØÄê»Ò¤òÄɲà (?+, *+, ++) + + ʸ»ú½¸¹ç¤ÎÃæ¤Î±é»»»Ò¤òÄɲà ([...], &&) + ('[' ¤Ï¡¢Ê¸»ú½¸¹ç¤ÎÃæ¤ÇÄ̾ï¤Îʸ»ú¤È¤·¤Æ»ÈÍѤ¹¤ë¤È¤¤Ë¤Ï + ÂàÈò½¤¾þ¤·¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤) + + ̾Á°ÉÕ¤Êá³Í¼°½¸¹ç¤È¡¢Éôʬ¼°¸Æ½Ð¤·µ¡Ç½Äɲà + + ¿¥Ð¥¤¥Èʸ»ú¥³¡¼¥É¤¬»ØÄꤵ¤ì¤Æ¤¤¤ë¤È¤¡¢ + ʸ»ú½¸¹ç¤ÎÃæ¤ÇȬ¿Ê¿ô¤Þ¤¿¤Ï½½Ï»¿Ê¿ôɽ¸½¤ÎϢ³¤Ï¡¢Â¿¥Ð¥¤¥ÈÉä¹ç¤Çɽ¸½¤µ¤ì¤¿ + °ì¸Ä¤Îʸ»ú¤È²ò¼á¤µ¤ì¤ë + (Îã. [\xa1\xa2], [\xa1\xa7-\xa4\xa1]) + + ʸ»ú½¸¹ç¤ÎÃæ¤Ç¡¢°ì¥Ð¥¤¥Èʸ»ú¤È¿¥Ð¥¤¥Èʸ»ú¤ÎÈϰϻØÄê¤Ïµö¤µ¤ì¤ë¡£ + ex. /[a-¤¢]/ + + ¸ÉΩ¥ª¥×¥·¥ç¥ó¤Î͸úÈϰϤϡ¢¤½¤Î¸ÉΩ¥ª¥×¥·¥ç¥ó¤ò´Þ¤ó¤Ç¤¤¤ë¼°½¸¹ç¤Î + ½ª¤ï¤ê¤Þ¤Ç¤Ç¤¢¤ë + Îã. (?:(?i)a|b) ¤Ï (?:(?i:a|b)) ¤È²ò¼á¤µ¤ì¤ë¡¢(?:(?i:a)|b)¤Ç¤Ï¤Ê¤¤ + + ¸ÉΩ¥ª¥×¥·¥ç¥ó¤Ï¤½¤ÎÁ°¤Î¼°¤ËÂФ·¤ÆÆ©²áŪ¤Ç¤Ï¤Ê¤¤ + Îã. /a(?i)*/ ¤Ïʸˡ¥¨¥é¡¼¤È¤Ê¤ë + + ÉÔ´°Á´¤Ê·«¤êÊÖ¤·ÈϰϻØÄê»Ò¤ÏÄ̾ï¤Îʸ»úÎó¤È¤·¤Æµö²Ä¤µ¤ì¤ë + Îã. /{/, /({)/, /a{2,3/ + + ÈÝÄêŪPOSIX¥Ö¥é¥±¥Ã¥È [:^xxxx:] ¤òÄɲà + + POSIX¥Ö¥é¥±¥Ã¥È [:ascii:] ¤òÄɲà + + ÀèÆÉ¤ß¤Î·«¤êÊÖ¤·¤ÏÉÔµö²Ä + Îã. /(?=a)*/, /(?!b){5}/ + + ¿ôÃͤǻØÄꤵ¤ì¤¿Ê¸»ú¤ËÂФ·¤Æ¤â¡¢Âçʸ»ú¾®Ê¸»ú¾È¹ç¥ª¥×¥·¥ç¥ó¤Ï͸ú + Îã. /\x61/i =~ "A" + + ·«¤êÊÖ¤·²ó¿ô»ØÄê¤Ç¡¢ºÇÄã²ó¿ô¤Î¾Êά(0²ó)¤¬¤Ç¤¤ë + /a{,n}/ == /a{0,n}/ + ºÇÄã²ó¿ô¤ÈºÇÂç²ó¿ô¤ÎƱ»þ¾Êά¤Ïµö¤µ¤ì¤Ê¤¤¡£(/a{,}/) + + /a{n}?/¤Ï̵Íߤʱ黻»Ò¤Ç¤Ï¤Ê¤¤¡£ + /a{n}?/ == /(?:a{n})?/ + + ̵¸ú¤Ê¸åÊý»²¾È¤ò¥Á¥§¥Ã¥¯¤·¤Æ¥¨¥é¡¼¤Ë¤¹¤ë¡£ + /\1/, /(a)\2/ + + ̵¸Â·«¤êÊÖ¤·¤ÎÃæ¤Ç¡¢Ä¹¤µÎí¤Ç¤Î¾È¹çÀ®¸ù¤Ï·«¤êÊÖ¤·¤òÃæÃǤµ¤»¤ë¤¬¡¢ + ¤³¤Î¤È¤¡¢ÃæÃǤ¹¤Ù¤¤«¤É¤¦¤«¤ÎȽÄê¤È¤·¤Æ¡¢Êá³Í¼°½¸¹ç¤ÎÊá³Í¾õÂ֤Π+ ÊѲ½¤Þ¤Ç¹Íθ¤·¤Æ¤¤¤ë + /(?:()|())*\1\2/ =~ "" + /(?:\1a|())*/ =~ "a" + + + +Êäµ 5. ¼ÂÁõ¤µ¤ì¤Æ¤¤¤ë¤¬¡¢´ûÄêÃͤǤÏ͸ú¤Ë¤·¤Æ¤¤¤Ê¤¤µ¡Ç½ + + + Êá³ÍÍúÎò»²¾È + + (?@...) ¤È (?@<name>...) + + Îã. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>] + + »ÈÍÑÊýË¡¤Ï¡¢sample/listcap.c¤ò»²¾È + + ͸ú¤Ë¤·¤Æ¤¤¤Ê¤¤Íýͳ¤Ï¡¢¤É¤ÎÄøÅÙÌò¤ËΩ¤Ä¤«¤Ï¤Ã¤¤ê¤·¤Ê¤¤¤¿¤á¡£ + + +Êäµ 6. 
ÌäÂêÅÀ + + + UTF-8¤Ç¡¢¥Ð¥¤¥ÈÃͤ¬Å¬Àµ¤Ê²Á¤«¤É¤¦¤«¤Î¥Á¥§¥Ã¥¯¤Ï¹Ô¤Ê¤Ã¤Æ¤¤¤Ê¤¤¡£ + + * ÀèÆ¬¥Ð¥¤¥È¤È¤·¤ÆÉÔÀµ¤Ê¥Ð¥¤¥È¤ò°ìʸ»ú¤È¤ß¤Ê¤¹ + /./u =~ "\xa3" + + * ÉÔ´°Á´¤Ê¥Ð¥¤¥È¥·¡¼¥±¥ó¥¹¤Î¥Á¥§¥Ã¥¯¤ò¤·¤Ê¤¤ + /\w+/ =~ "a\xf3\x8ec" + + ¤³¤ì¤òÄ´¤Ù¤ë¤³¤È¤Ï²Äǽ¤Ç¤Ï¤¢¤ë¤¬¡¢ÃÙ¤¯¤Ê¤ë¤Î¤Ç¹Ô¤Ê¤ï¤Ê¤¤¡£ + +½ª¤ê diff --git a/ext/mbstring/oniguruma/enc/big5.c b/ext/mbstring/oniguruma/enc/big5.c index 763872e96..86792666a 100644 --- a/ext/mbstring/oniguruma/enc/big5.c +++ b/ext/mbstring/oniguruma/enc/big5.c @@ -29,7 +29,7 @@ #include "regenc.h" -static int EncLen_BIG5[] = { +static const int EncLen_BIG5[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, diff --git a/ext/mbstring/oniguruma/enc/euc_jp.c b/ext/mbstring/oniguruma/enc/euc_jp.c index 5f13e33eb..71c81ee9f 100644 --- a/ext/mbstring/oniguruma/enc/euc_jp.c +++ b/ext/mbstring/oniguruma/enc/euc_jp.c @@ -31,7 +31,7 @@ #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) -static int EncLen_EUCJP[] = { +static const int EncLen_EUCJP[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -158,20 +158,16 @@ eucjp_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) static int eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (eucjp_code_to_mbclen(code) > 1 ? 
TRUE : FALSE); - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; + } } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } static UChar* diff --git a/ext/mbstring/oniguruma/enc/euc_kr.c b/ext/mbstring/oniguruma/enc/euc_kr.c index c1e83b7e6..57bf80153 100644 --- a/ext/mbstring/oniguruma/enc/euc_kr.c +++ b/ext/mbstring/oniguruma/enc/euc_kr.c @@ -29,7 +29,7 @@ #include "regenc.h" -static int EncLen_EUCKR[] = { +static const int EncLen_EUCKR[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, diff --git a/ext/mbstring/oniguruma/enc/euc_tw.c b/ext/mbstring/oniguruma/enc/euc_tw.c index 4e5851a45..6f396e75e 100644 --- a/ext/mbstring/oniguruma/enc/euc_tw.c +++ b/ext/mbstring/oniguruma/enc/euc_tw.c @@ -29,7 +29,7 @@ #include "regenc.h" -static int EncLen_EUCTW[] = { +static const int EncLen_EUCTW[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, diff --git a/ext/mbstring/oniguruma/enc/gb18030.c b/ext/mbstring/oniguruma/enc/gb18030.c new file mode 100644 index 000000000..01995ea09 --- /dev/null +++ b/ext/mbstring/oniguruma/enc/gb18030.c @@ -0,0 +1,501 @@ +/********************************************************************** + gb18030.c - Oniguruma (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2005 KUBO Takehiro <kubo AT jiubao DOT org> + * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "regenc.h" + +#if 1 +#define DEBUG_GB18030(arg) +#else +#define DEBUG_GB18030(arg) printf arg +#endif + +enum { + C1, /* one-byte char */ + C2, /* one-byte or second of two-byte char */ + C4, /* one-byte or second or fourth of four-byte char */ + CM /* first of two- or four-byte char or second of two-byte char */ +}; + +static const char GB18030_MAP[] = { + C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, + C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, + C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, + C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1, + C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, + C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, + C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, + C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1, + C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, 
CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, + CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1 +}; + +static int +gb18030_mbc_enc_len(const UChar* p) +{ + if (GB18030_MAP[*p] != CM) + return 1; + p++; + if (GB18030_MAP[*p] == C4) + return 4; + if (GB18030_MAP[*p] == C1) + return 1; /* illegal sequence */ + return 2; +} + +static OnigCodePoint +gb18030_mbc_to_code(const UChar* p, const UChar* end) +{ + return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end); +} + +static int +gb18030_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf); +} + +static int +gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, + UChar* lower) +{ + return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag, + pp, end, lower); +} + +static int +gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +{ + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end); +} + +static int +gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) +{ + return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype); +} + +enum state { + S_START, + S_one_C2, + S_one_C4, + S_one_CM, + + S_odd_CM_one_CX, + S_even_CM_one_CX, + + /* CMC4 : pair of "CM C4" */ + S_one_CMC4, + S_odd_CMC4, + S_one_C4_odd_CMC4, + S_even_CMC4, + S_one_C4_even_CMC4, + + S_odd_CM_odd_CMC4, + S_even_CM_odd_CMC4, + + S_odd_CM_even_CMC4, + S_even_CM_even_CMC4, + + /* C4CM : pair of "C4 CM" */ + S_odd_C4CM, + S_one_CM_odd_C4CM, + S_even_C4CM, + S_one_CM_even_C4CM, + + S_even_CM_odd_C4CM, + S_odd_CM_odd_C4CM, + S_even_CM_even_C4CM, + S_odd_CM_even_C4CM, +}; + 
+static UChar* +gb18030_left_adjust_char_head(const UChar* start, const UChar* s) +{ + const UChar *p; + enum state state = S_START; + + DEBUG_GB18030(("----------------\n")); + for (p = s; p >= start; p--) { + DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p)); + switch (state) { + case S_START: + switch (GB18030_MAP[*p]) { + case C1: + return (UChar *)s; + case C2: + state = S_one_C2; /* C2 */ + break; + case C4: + state = S_one_C4; /* C4 */ + break; + case CM: + state = S_one_CM; /* CM */ + break; + } + break; + case S_one_C2: /* C2 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)s; + case CM: + state = S_odd_CM_one_CX; /* CM C2 */ + break; + } + break; + case S_one_C4: /* C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)s; + case CM: + state = S_one_CMC4; + break; + } + break; + case S_one_CM: /* CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + return (UChar *)s; + case C4: + state = S_odd_C4CM; + break; + case CM: + state = S_odd_CM_one_CX; /* CM CM */ + break; + } + break; + + case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 1); + case CM: + state = S_even_CM_one_CX; + break; + } + break; + case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)s; + case CM: + state = S_odd_CM_one_CX; + break; + } + break; + + case S_one_CMC4: /* CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + return (UChar *)(s - 1); + case C4: + state = S_one_C4_odd_CMC4; /* C4 CM C4 */ + break; + case CM: + state = S_even_CM_one_CX; /* CM CM C4 */ + break; + } + break; + case S_odd_CMC4: /* CM C4 CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + return (UChar *)(s - 1); + case C4: + state = S_one_C4_odd_CMC4; + break; + case CM: + state = S_odd_CM_odd_CMC4; + break; + } + break; + case 
S_one_C4_odd_CMC4: /* C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 1); + case CM: + state = S_even_CMC4; /* CM C4 CM C4 */ + break; + } + break; + case S_even_CMC4: /* CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + return (UChar *)(s - 3); + case C4: + state = S_one_C4_even_CMC4; + break; + case CM: + state = S_odd_CM_even_CMC4; + break; + } + break; + case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 3); + case CM: + state = S_odd_CMC4; + break; + } + break; + + case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 3); + case CM: + state = S_even_CM_odd_CMC4; + break; + } + break; + case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 1); + case CM: + state = S_odd_CM_odd_CMC4; + break; + } + break; + + case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 1); + case CM: + state = S_even_CM_even_CMC4; + break; + } + break; + case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 3); + case CM: + state = S_odd_CM_even_CMC4; + break; + } + break; + + case S_odd_C4CM: /* C4 CM */ /* C4 CM C4 CM C4 CM*/ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)s; + case CM: + state = S_one_CM_odd_C4CM; /* CM C4 CM */ + break; + } + break; + case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + return (UChar *)(s - 2); /* |CM C4 CM */ + case C4: + state = S_even_C4CM; + break; + case CM: + state = S_even_CM_odd_C4CM; + break; + } + break; + case S_even_C4CM: /* C4 CM C4 CM */ + switch (GB18030_MAP[*p]) { + case 
C1: + case C2: + case C4: + return (UChar *)(s - 2); /* C4|CM C4 CM */ + case CM: + state = S_one_CM_even_C4CM; + break; + } + break; + case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + return (UChar *)(s - 0); /*|CM C4 CM C4|CM */ + case C4: + state = S_odd_C4CM; + break; + case CM: + state = S_even_CM_even_C4CM; + break; + } + break; + + case S_even_CM_odd_C4CM: /* CM CM C4 CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 0); /* |CM CM|C4|CM */ + case CM: + state = S_odd_CM_odd_C4CM; + break; + } + break; + case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 2); /* |CM CM|CM C4 CM */ + case CM: + state = S_even_CM_odd_C4CM; + break; + } + break; + + case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */ + case CM: + state = S_odd_CM_even_C4CM; + break; + } + break; + case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */ + switch (GB18030_MAP[*p]) { + case C1: + case C2: + case C4: + return (UChar *)(s - 0); /* |CM CM|CM C4 CM C4|CM */ + case CM: + state = S_even_CM_even_C4CM; + break; + } + break; + } + } + + DEBUG_GB18030(("state %d\n", state)); + switch (state) { + case S_START: return (UChar *)(s - 0); + case S_one_C2: return (UChar *)(s - 0); + case S_one_C4: return (UChar *)(s - 0); + case S_one_CM: return (UChar *)(s - 0); + + case S_odd_CM_one_CX: return (UChar *)(s - 1); + case S_even_CM_one_CX: return (UChar *)(s - 0); + + case S_one_CMC4: return (UChar *)(s - 1); + case S_odd_CMC4: return (UChar *)(s - 1); + case S_one_C4_odd_CMC4: return (UChar *)(s - 1); + case S_even_CMC4: return (UChar *)(s - 3); + case S_one_C4_even_CMC4: return (UChar *)(s - 3); + + case S_odd_CM_odd_CMC4: return (UChar *)(s - 3); + case S_even_CM_odd_CMC4: return (UChar *)(s - 1); + + case S_odd_CM_even_CMC4: 
return (UChar *)(s - 1); + case S_even_CM_even_CMC4: return (UChar *)(s - 3); + + case S_odd_C4CM: return (UChar *)(s - 0); + case S_one_CM_odd_C4CM: return (UChar *)(s - 2); + case S_even_C4CM: return (UChar *)(s - 2); + case S_one_CM_even_C4CM: return (UChar *)(s - 0); + + case S_even_CM_odd_C4CM: return (UChar *)(s - 0); + case S_odd_CM_odd_C4CM: return (UChar *)(s - 2); + case S_even_CM_even_C4CM: return (UChar *)(s - 2); + case S_odd_CM_even_C4CM: return (UChar *)(s - 0); + } + + return (UChar* )s; /* never come here. (escape warning) */ +} + +static int +gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end) +{ + return GB18030_MAP[*s] == C1 ? TRUE : FALSE; +} + +OnigEncodingType OnigEncodingGB18030 = { + gb18030_mbc_enc_len, + "GB18030", /* name */ + 4, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' 
*/ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, + gb18030_mbc_to_code, + onigenc_mb4_code_to_mbclen, + gb18030_code_to_mbc, + gb18030_mbc_to_normalize, + gb18030_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + gb18030_is_code_ctype, + onigenc_not_support_get_ctype_code_range, + gb18030_left_adjust_char_head, + gb18030_is_allowed_reverse_match +}; diff --git a/ext/mbstring/oniguruma/enc/iso8859_1.c b/ext/mbstring/oniguruma/enc/iso8859_1.c index 53ad52ee1..4dd708d84 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_1.c +++ b/ext/mbstring/oniguruma/enc/iso8859_1.c @@ -32,7 +32,7 @@ #define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ ((EncISO_8859_1_CtypeTable[code] & ctype) != 0) -static unsigned short EncISO_8859_1_CtypeTable[256] = { +static const unsigned short EncISO_8859_1_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, diff --git a/ext/mbstring/oniguruma/enc/iso8859_10.c b/ext/mbstring/oniguruma/enc/iso8859_10.c index a9331cebf..e317f4975 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_10.c +++ b/ext/mbstring/oniguruma/enc/iso8859_10.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_10_CTYPE(code,ctype) \ ((EncISO_8859_10_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_10_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_10_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_10_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short 
EncISO_8859_10_CtypeTable[256] = { +static const unsigned short EncISO_8859_10_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_10_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_10_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xb1 }, { 0xa2, 0xb2 }, { 0xa3, 0xb3 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_11.c b/ext/mbstring/oniguruma/enc/iso8859_11.c index bb1098807..6afaa27f4 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_11.c +++ b/ext/mbstring/oniguruma/enc/iso8859_11.c @@ -32,7 +32,7 @@ #define ENC_IS_ISO_8859_11_CTYPE(code,ctype) \ ((EncISO_8859_11_CtypeTable[code] & ctype) != 0) -static unsigned short EncISO_8859_11_CtypeTable[256] = { +static const unsigned short EncISO_8859_11_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, diff --git a/ext/mbstring/oniguruma/enc/iso8859_13.c b/ext/mbstring/oniguruma/enc/iso8859_13.c index 827ca508e..abd764452 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_13.c +++ b/ext/mbstring/oniguruma/enc/iso8859_13.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_13_CTYPE(code,ctype) \ ((EncISO_8859_13_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_13_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_13_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_13_ToLowerCaseTable[256] = 
{ '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_13_CtypeTable[256] = { +static const unsigned short EncISO_8859_13_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_13_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_13_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xc0, 0xe0 }, { 0xc1, 0xe1 }, { 0xc2, 0xe2 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_14.c b/ext/mbstring/oniguruma/enc/iso8859_14.c index 4fe5ab29d..d76771a1c 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_14.c +++ b/ext/mbstring/oniguruma/enc/iso8859_14.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_14_CTYPE(code,ctype) \ ((EncISO_8859_14_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_14_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_14_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_14_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_14_CtypeTable[256] = { +static const unsigned short EncISO_8859_14_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_14_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_14_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** 
ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xa2 }, { 0xa2, 0xa1 }, { 0xa4, 0xa5 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_15.c b/ext/mbstring/oniguruma/enc/iso8859_15.c index 1a8bd7b4c..d6611ed29 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_15.c +++ b/ext/mbstring/oniguruma/enc/iso8859_15.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_15_CTYPE(code,ctype) \ ((EncISO_8859_15_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_15_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_15_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_15_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_15_CtypeTable[256] = { +static const unsigned short EncISO_8859_15_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_15_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_15_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa6, 0xa8 }, { 0xa8, 0xa6 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_16.c b/ext/mbstring/oniguruma/enc/iso8859_16.c index e283db17c..23b868065 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_16.c +++ b/ext/mbstring/oniguruma/enc/iso8859_16.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_16_CTYPE(code,ctype) \ ((EncISO_8859_16_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_16_ToLowerCaseTable[256] = { +static const 
UChar EncISO_8859_16_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_16_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_16_CtypeTable[256] = { +static const unsigned short EncISO_8859_16_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_16_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_16_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xa2 }, { 0xa2, 0xa1 }, { 0xa3, 0xb3 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_2.c b/ext/mbstring/oniguruma/enc/iso8859_2.c index e86415b9c..5f21ff78a 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_2.c +++ b/ext/mbstring/oniguruma/enc/iso8859_2.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_2_CTYPE(code,ctype) \ ((EncISO_8859_2_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_2_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_2_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_2_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_2_CtypeTable[256] = { +static const unsigned short EncISO_8859_2_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 
0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -177,9 +177,9 @@ iso_8859_2_is_mbc_ambiguous(OnigAmbigType flag, static int iso_8859_2_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xb1 }, { 0xa3, 0xb3 }, { 0xa5, 0xb5 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_3.c b/ext/mbstring/oniguruma/enc/iso8859_3.c index 76d2bec8a..9ac3dab17 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_3.c +++ b/ext/mbstring/oniguruma/enc/iso8859_3.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_3_CTYPE(code,ctype) \ ((EncISO_8859_3_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_3_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_3_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_3_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_3_CtypeTable[256] = { +static const unsigned short EncISO_8859_3_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_3_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_3_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xb1 }, { 0xa6, 0xb6 }, { 0xa9, 0xb9 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_4.c b/ext/mbstring/oniguruma/enc/iso8859_4.c index 
756900672..c54a2fa14 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_4.c +++ b/ext/mbstring/oniguruma/enc/iso8859_4.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_4_CTYPE(code,ctype) \ ((EncISO_8859_4_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_4_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_4_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_4_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_4_CtypeTable[256] = { +static const unsigned short EncISO_8859_4_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_4_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_4_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xb1 }, { 0xa3, 0xb3 }, { 0xa5, 0xb5 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_5.c b/ext/mbstring/oniguruma/enc/iso8859_5.c index 2f7677b3e..5b941e2eb 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_5.c +++ b/ext/mbstring/oniguruma/enc/iso8859_5.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_5_CTYPE(code,ctype) \ ((EncISO_8859_5_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_5_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_5_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar 
EncISO_8859_5_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_5_CtypeTable[256] = { +static const unsigned short EncISO_8859_5_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -152,9 +152,9 @@ iso_8859_5_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_5_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xa1, 0xf1 }, { 0xa2, 0xf2 }, { 0xa3, 0xf3 }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_6.c b/ext/mbstring/oniguruma/enc/iso8859_6.c index 0fcb9e8b8..bb5515d30 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_6.c +++ b/ext/mbstring/oniguruma/enc/iso8859_6.c @@ -32,7 +32,7 @@ #define ENC_IS_ISO_8859_6_CTYPE(code,ctype) \ ((EncISO_8859_6_CtypeTable[code] & ctype) != 0) -static unsigned short EncISO_8859_6_CtypeTable[256] = { +static const unsigned short EncISO_8859_6_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, diff --git a/ext/mbstring/oniguruma/enc/iso8859_7.c b/ext/mbstring/oniguruma/enc/iso8859_7.c index 8b2cb9ec5..2529dae66 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_7.c +++ b/ext/mbstring/oniguruma/enc/iso8859_7.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_7_CTYPE(code,ctype) \ ((EncISO_8859_7_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_7_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_7_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', 
'\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_7_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_7_CtypeTable[256] = { +static const unsigned short EncISO_8859_7_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -159,9 +159,9 @@ iso_8859_7_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_7_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xb6, 0xdc }, { 0xb8, 0xdd }, { 0xb9, 0xde }, diff --git a/ext/mbstring/oniguruma/enc/iso8859_8.c b/ext/mbstring/oniguruma/enc/iso8859_8.c index 3c95b9b13..d7f0fc594 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_8.c +++ b/ext/mbstring/oniguruma/enc/iso8859_8.c @@ -32,7 +32,7 @@ #define ENC_IS_ISO_8859_8_CTYPE(code,ctype) \ ((EncISO_8859_8_CtypeTable[code] & ctype) != 0) -static unsigned short EncISO_8859_8_CtypeTable[256] = { +static const unsigned short EncISO_8859_8_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, diff --git a/ext/mbstring/oniguruma/enc/iso8859_9.c b/ext/mbstring/oniguruma/enc/iso8859_9.c index 1b061ff6e..f4bcac1ae 100644 --- a/ext/mbstring/oniguruma/enc/iso8859_9.c +++ b/ext/mbstring/oniguruma/enc/iso8859_9.c @@ -33,7 +33,7 @@ #define ENC_IS_ISO_8859_9_CTYPE(code,ctype) \ ((EncISO_8859_9_CtypeTable[code] & ctype) != 0) -static UChar EncISO_8859_9_ToLowerCaseTable[256] = { +static const UChar EncISO_8859_9_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', 
'\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncISO_8859_9_ToLowerCaseTable[256] = { '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' }; -static unsigned short EncISO_8859_9_CtypeTable[256] = { +static const unsigned short EncISO_8859_9_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -186,9 +186,9 @@ iso_8859_9_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int iso_8859_9_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xc0, 0xe0 }, { 0xc1, 0xe1 }, { 0xc2, 0xe2 }, diff --git a/ext/mbstring/oniguruma/enc/koi8.c b/ext/mbstring/oniguruma/enc/koi8.c index f8a5a1da6..27f97f307 100644 --- a/ext/mbstring/oniguruma/enc/koi8.c +++ b/ext/mbstring/oniguruma/enc/koi8.c @@ -33,7 +33,7 @@ #define ENC_IS_KOI8_CTYPE(code,ctype) \ ((EncKOI8_CtypeTable[code] & ctype) != 0) -static UChar EncKOI8_ToLowerCaseTable[256] = { +static const UChar EncKOI8_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncKOI8_ToLowerCaseTable[256] = { '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337' }; -static unsigned short EncKOI8_CtypeTable[256] = { +static const unsigned short EncKOI8_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -105,9 +105,9 @@ static unsigned short EncKOI8_CtypeTable[256] = { 
static int koi8_mbc_to_normalize(OnigAmbigType flag, - const UChar** pp, const UChar* end, UChar* lower) + const OnigUChar** pp, const OnigUChar* end, OnigUChar* lower) { - UChar* p = (UChar *)*pp; + const OnigUChar* p = *pp; if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && ONIGENC_IS_MBC_ASCII(p)) || @@ -123,9 +123,9 @@ koi8_mbc_to_normalize(OnigAmbigType flag, } static int -koi8_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) +koi8_is_mbc_ambiguous(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end) { - UChar* p = (UChar *)*pp; + const OnigUChar* p = *pp; (*pp)++; if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && @@ -151,9 +151,9 @@ koi8_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int koi8_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xc0, 0xe0 }, { 0xc1, 0xe1 }, { 0xc2, 0xe2 }, diff --git a/ext/mbstring/oniguruma/enc/koi8_r.c b/ext/mbstring/oniguruma/enc/koi8_r.c index 7c626df61..d2a4440f2 100644 --- a/ext/mbstring/oniguruma/enc/koi8_r.c +++ b/ext/mbstring/oniguruma/enc/koi8_r.c @@ -33,7 +33,7 @@ #define ENC_IS_KOI8_R_CTYPE(code,ctype) \ ((EncKOI8_R_CtypeTable[code] & ctype) != 0) -static UChar EncKOI8_R_ToLowerCaseTable[256] = { +static const UChar EncKOI8_R_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -68,7 +68,7 @@ static UChar EncKOI8_R_ToLowerCaseTable[256] = { '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337' }; -static unsigned short EncKOI8_R_CtypeTable[256] = { +static const unsigned short EncKOI8_R_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -150,9 +150,9 @@ koi8_r_is_code_ctype(OnigCodePoint code, unsigned int ctype) static int koi8_r_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xc0, 0xe0 }, { 0xc1, 0xe1 }, { 0xc2, 0xe2 }, diff --git a/ext/mbstring/oniguruma/enc/sjis.c b/ext/mbstring/oniguruma/enc/sjis.c index e13407bcc..f7d7d5226 100644 --- a/ext/mbstring/oniguruma/enc/sjis.c +++ b/ext/mbstring/oniguruma/enc/sjis.c @@ -29,7 +29,7 @@ #include "regenc.h" -static int EncLen_SJIS[] = { +static const int EncLen_SJIS[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -76,7 +76,7 @@ sjis_mbc_enc_len(const UChar* p) return EncLen_SJIS[*p]; } -extern int +static int sjis_code_to_mbclen(OnigCodePoint code) { if (code < 256) { @@ -167,21 +167,16 @@ sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) static int sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (sjis_code_to_mbclen(code) > 1 ? 
TRUE : FALSE); } - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } static UChar* diff --git a/ext/mbstring/oniguruma/enc/unicode.c b/ext/mbstring/oniguruma/enc/unicode.c index e3be9450a..a8cf53901 100644 --- a/ext/mbstring/oniguruma/enc/unicode.c +++ b/ext/mbstring/oniguruma/enc/unicode.c @@ -30,7 +30,7 @@ #include "regenc.h" -unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { +const unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x228c, 0x2289, 0x2288, 0x2288, 0x2288, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -65,7 +65,7 @@ unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 }; -static OnigCodePoint CRAlnum[] = { +static const OnigCodePoint CRAlnum[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 414, #else @@ -490,7 +490,7 @@ static OnigCodePoint CRAlnum[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRAlnum */ -static OnigCodePoint CRAlpha[] = { +static const OnigCodePoint CRAlpha[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 396, #else @@ -897,7 +897,7 @@ static OnigCodePoint CRAlpha[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRAlpha */ -static OnigCodePoint CRBlank[] = { +static const OnigCodePoint CRBlank[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 9, #else @@ -917,7 +917,7 @@ static OnigCodePoint CRBlank[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRBlank */ -static OnigCodePoint CRCntrl[] = { +static const OnigCodePoint CRCntrl[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 19, #else @@ -947,7 +947,7 @@ static OnigCodePoint CRCntrl[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRCntrl */ -static OnigCodePoint CRDigit[] = { +static const OnigCodePoint CRDigit[] = { #ifdef 
USE_UNICODE_FULL_RANGE_CTYPE 23, #else @@ -981,7 +981,7 @@ static OnigCodePoint CRDigit[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRDigit */ -static OnigCodePoint CRGraph[] = { +static const OnigCodePoint CRGraph[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 405, #else @@ -1397,7 +1397,7 @@ static OnigCodePoint CRGraph[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRGraph */ -static OnigCodePoint CRLower[] = { +static const OnigCodePoint CRLower[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 424, #else @@ -1832,7 +1832,7 @@ static OnigCodePoint CRLower[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRLower */ -static OnigCodePoint CRPrint[] = { +static const OnigCodePoint CRPrint[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 405, #else @@ -2248,7 +2248,7 @@ static OnigCodePoint CRPrint[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRPrint */ -static OnigCodePoint CRPunct[] = { +static const OnigCodePoint CRPunct[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 86, #else @@ -2345,7 +2345,7 @@ static OnigCodePoint CRPunct[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRPunct */ -static OnigCodePoint CRSpace[] = { +static const OnigCodePoint CRSpace[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 11, #else @@ -2367,7 +2367,7 @@ static OnigCodePoint CRSpace[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRSpace */ -static OnigCodePoint CRUpper[] = { +static const OnigCodePoint CRUpper[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 421, #else @@ -2799,7 +2799,7 @@ static OnigCodePoint CRUpper[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of CRUpper */ -static OnigCodePoint CRXDigit[] = { +static const OnigCodePoint CRXDigit[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 3, #else @@ -2810,7 +2810,7 @@ static OnigCodePoint CRXDigit[] = { 0x0061, 0x0066 }; -static OnigCodePoint CRASCII[] = { +static const OnigCodePoint CRASCII[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 1, #else @@ -2819,7 +2819,7 @@ static 
OnigCodePoint CRASCII[] = { 0x0000, 0x007f }; -static OnigCodePoint CRWord[] = { +static const OnigCodePoint CRWord[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 436, #else @@ -3320,6 +3320,9 @@ onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype) case ONIGENC_CTYPE_ALNUM: return onig_is_in_code_range((UChar* )CRAlnum, code); break; + case ONIGENC_CTYPE_NEWLINE: + return FALSE; + break; default: return ONIGENCERR_TYPE_BUG; @@ -3337,9 +3340,9 @@ onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype) extern int onigenc_unicode_get_ctype_code_range(int ctype, - OnigCodePoint* sbr[], OnigCodePoint* mbr[]) + const OnigCodePoint* sbr[], const OnigCodePoint* mbr[]) { - static OnigCodePoint EmptyRange[] = { 0 }; + static const OnigCodePoint EmptyRange[] = { 0 }; #define CR_SET(list) do { \ *mbr = list; \ diff --git a/ext/mbstring/oniguruma/enc/utf16_be.c b/ext/mbstring/oniguruma/enc/utf16_be.c index ad33ddbee..0dd2832f7 100755 --- a/ext/mbstring/oniguruma/enc/utf16_be.c +++ b/ext/mbstring/oniguruma/enc/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -32,7 +32,7 @@ #define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb) #define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf) -static int EncLen_UTF16[] = { +static const int EncLen_UTF16[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -63,6 +63,12 @@ utf16be_is_mbc_newline(const UChar* p, const UChar* end) if (p + 1 < end) { if (*(p+1) == 0x0a && *p == 0x00) return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((*(p+1) == 0x0d || *(p+1) == 0x85) && *p == 0x00) + return 1; + if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28)) + return 1; +#endif } return 0; } diff --git a/ext/mbstring/oniguruma/enc/utf16_le.c b/ext/mbstring/oniguruma/enc/utf16_le.c index db892dcd1..93cc6138a 100755 --- a/ext/mbstring/oniguruma/enc/utf16_le.c +++ b/ext/mbstring/oniguruma/enc/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -32,7 +32,7 @@ #define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb) #define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf) -static int EncLen_UTF16[] = { +static const int EncLen_UTF16[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -69,6 +69,12 @@ utf16le_is_mbc_newline(const UChar* p, const UChar* end) if (p + 1 < end) { if (*p == 0x0a && *(p+1) == 0x00) return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00) + return 1; + if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)) + return 1; +#endif } return 0; } diff --git a/ext/mbstring/oniguruma/enc/utf32_be.c b/ext/mbstring/oniguruma/enc/utf32_be.c index 60feb040b..36b477286 100755 --- a/ext/mbstring/oniguruma/enc/utf32_be.c +++ b/ext/mbstring/oniguruma/enc/utf32_be.c @@ -2,7 +2,7 @@ utf32_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -41,6 +41,14 @@ utf32be_is_mbc_newline(const UChar* p, const UChar* end) if (p + 3 < end) { if (*(p+3) == 0x0a && *(p+2) == 0 && *(p+1) == 0 && *p == 0) return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((*(p+3) == 0x0d || *(p+3) == 0x85) + && *(p+2) == 0 && *(p+1) == 0 && *p == 0x00) + return 1; + if (*(p+2) == 0x20 && (*(p+3) == 0x29 || *(p+3) == 0x28) + && *(p+1) == 0 && *p == 0) + return 1; +#endif } return 0; } diff --git a/ext/mbstring/oniguruma/enc/utf32_le.c b/ext/mbstring/oniguruma/enc/utf32_le.c index bba9689f7..1e9487d1d 100755 --- a/ext/mbstring/oniguruma/enc/utf32_le.c +++ b/ext/mbstring/oniguruma/enc/utf32_le.c @@ -2,7 +2,7 @@ utf32_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -41,6 +41,14 @@ utf32le_is_mbc_newline(const UChar* p, const UChar* end) if (p + 3 < end) { if (*p == 0x0a && *(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) return 1; +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00 + && *(p+2) == 0x00 && *(p+3) == 0x00) + return 1; + if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28) + && *(p+2) == 0x00 && *(p+3) == 0x00) + return 1; +#endif } return 0; } diff --git a/ext/mbstring/oniguruma/enc/utf8.c b/ext/mbstring/oniguruma/enc/utf8.c index 592bebfe8..0e816176b 100644 --- a/ext/mbstring/oniguruma/enc/utf8.c +++ b/ext/mbstring/oniguruma/enc/utf8.c @@ -2,7 +2,7 @@ utf8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -40,7 +40,7 @@ #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) -static int EncLen_UTF8[] = { +static const int EncLen_UTF8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -65,6 +65,29 @@ utf8_mbc_enc_len(const UChar* p) return EncLen_UTF8[*p]; } +static int +utf8_is_mbc_newline(const UChar* p, const UChar* end) +{ + if (p < end) { + if (*p == 0x0a) return 1; + +#ifdef USE_UNICODE_ALL_LINE_TERMINATORS + if (*p == 0x0d) return 1; + if (p + 1 < end) { + if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ + return 1; + if (p + 2 < end) { + if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) + && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ + return 1; + } + } +#endif + } + + return 0; +} + static OnigCodePoint utf8_mbc_to_code(const UChar* p, const UChar* end) { @@ -307,16 +330,16 @@ 
utf8_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end) } -static OnigCodePoint EmptyRange[] = { 0 }; +static const OnigCodePoint EmptyRange[] = { 0 }; -static OnigCodePoint SBAlnum[] = { +static const OnigCodePoint SBAlnum[] = { 3, 0x0030, 0x0039, 0x0041, 0x005a, 0x0061, 0x007a }; -static OnigCodePoint MBAlnum[] = { +static const OnigCodePoint MBAlnum[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 411, #else @@ -738,13 +761,13 @@ static OnigCodePoint MBAlnum[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBAlnum */ -static OnigCodePoint SBAlpha[] = { +static const OnigCodePoint SBAlpha[] = { 2, 0x0041, 0x005a, 0x0061, 0x007a }; -static OnigCodePoint MBAlpha[] = { +static const OnigCodePoint MBAlpha[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 394, #else @@ -1149,13 +1172,13 @@ static OnigCodePoint MBAlpha[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBAlpha */ -static OnigCodePoint SBBlank[] = { +static const OnigCodePoint SBBlank[] = { 2, 0x0009, 0x0009, 0x0020, 0x0020 }; -static OnigCodePoint MBBlank[] = { +static const OnigCodePoint MBBlank[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 7, #else @@ -1173,13 +1196,13 @@ static OnigCodePoint MBBlank[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBBlank */ -static OnigCodePoint SBCntrl[] = { +static const OnigCodePoint SBCntrl[] = { 2, 0x0000, 0x001f, 0x007f, 0x007f }; -static OnigCodePoint MBCntrl[] = { +static const OnigCodePoint MBCntrl[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 18, #else @@ -1208,12 +1231,12 @@ static OnigCodePoint MBCntrl[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBCntrl */ -static OnigCodePoint SBDigit[] = { +static const OnigCodePoint SBDigit[] = { 1, 0x0030, 0x0039 }; -static OnigCodePoint MBDigit[] = { +static const OnigCodePoint MBDigit[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 22, #else @@ -1245,12 +1268,12 @@ static OnigCodePoint MBDigit[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBDigit */ 
-static OnigCodePoint SBGraph[] = { +static const OnigCodePoint SBGraph[] = { 1, 0x0021, 0x007e }; -static OnigCodePoint MBGraph[] = { +static const OnigCodePoint MBGraph[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 404, #else @@ -1665,12 +1688,12 @@ static OnigCodePoint MBGraph[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBGraph */ -static OnigCodePoint SBLower[] = { +static const OnigCodePoint SBLower[] = { 1, 0x0061, 0x007a }; -static OnigCodePoint MBLower[] = { +static const OnigCodePoint MBLower[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 423, #else @@ -2104,13 +2127,13 @@ static OnigCodePoint MBLower[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBLower */ -static OnigCodePoint SBPrint[] = { +static const OnigCodePoint SBPrint[] = { 2, 0x0009, 0x000d, 0x0020, 0x007e }; -static OnigCodePoint MBPrint[] = { +static const OnigCodePoint MBPrint[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 403, #else @@ -2524,7 +2547,7 @@ static OnigCodePoint MBPrint[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBPrint */ -static OnigCodePoint SBPunct[] = { +static const OnigCodePoint SBPunct[] = { 9, 0x0021, 0x0023, 0x0025, 0x002a, @@ -2537,7 +2560,7 @@ static OnigCodePoint SBPunct[] = { 0x007d, 0x007d }; /* end of SBPunct */ -static OnigCodePoint MBPunct[] = { +static const OnigCodePoint MBPunct[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 77, #else @@ -2625,13 +2648,13 @@ static OnigCodePoint MBPunct[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBPunct */ -static OnigCodePoint SBSpace[] = { +static const OnigCodePoint SBSpace[] = { 2, 0x0009, 0x000d, 0x0020, 0x0020 }; -static OnigCodePoint MBSpace[] = { +static const OnigCodePoint MBSpace[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 9, #else @@ -2651,12 +2674,12 @@ static OnigCodePoint MBSpace[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBSpace */ -static OnigCodePoint SBUpper[] = { +static const OnigCodePoint SBUpper[] = { 1, 0x0041, 0x005a }; -static 
OnigCodePoint MBUpper[] = { +static const OnigCodePoint MBUpper[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 420, #else @@ -3087,19 +3110,19 @@ static OnigCodePoint MBUpper[] = { #endif /* USE_UNICODE_FULL_RANGE_CTYPE */ }; /* end of MBUpper */ -static OnigCodePoint SBXDigit[] = { +static const OnigCodePoint SBXDigit[] = { 3, 0x0030, 0x0039, 0x0041, 0x0046, 0x0061, 0x0066 }; -static OnigCodePoint SBASCII[] = { +static const OnigCodePoint SBASCII[] = { 1, 0x0000, 0x007f }; -static OnigCodePoint SBWord[] = { +static const OnigCodePoint SBWord[] = { 4, 0x0030, 0x0039, 0x0041, 0x005a, @@ -3107,7 +3130,7 @@ static OnigCodePoint SBWord[] = { 0x0061, 0x007a }; -static OnigCodePoint MBWord[] = { +static const OnigCodePoint MBWord[] = { #ifdef USE_UNICODE_FULL_RANGE_CTYPE 432, #else @@ -3554,7 +3577,7 @@ static OnigCodePoint MBWord[] = { static int utf8_get_ctype_code_range(int ctype, - OnigCodePoint* sbr[], OnigCodePoint* mbr[]) + const OnigCodePoint* sbr[], const OnigCodePoint* mbr[]) { #define CR_SET(sbl,mbl) do { \ *sbr = sbl; \ @@ -3622,7 +3645,7 @@ static int utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { #ifdef USE_UNICODE_FULL_RANGE_CTYPE - OnigCodePoint *range; + const OnigCodePoint *range; #endif if (code < 256) { @@ -3674,6 +3697,9 @@ utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) case ONIGENC_CTYPE_ALNUM: range = MBAlnum; break; + case ONIGENC_CTYPE_NEWLINE: + return FALSE; + break; default: return ONIGENCERR_TYPE_BUG; @@ -3723,7 +3749,7 @@ OnigEncodingType OnigEncodingUTF8 = { , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - onigenc_is_mbc_newline_0x0a, + utf8_is_mbc_newline, utf8_mbc_to_code, utf8_code_to_mbclen, utf8_code_to_mbc, diff --git a/ext/mbstring/oniguruma/index.html b/ext/mbstring/oniguruma/index.html index 02e844c36..fbf4fc095 100755 --- a/ext/mbstring/oniguruma/index.html +++ b/ext/mbstring/oniguruma/index.html @@ -5,18 +5,10 @@ 
</head> <body BGCOLOR="#ffffff" VLINK="#808040" TEXT="#696969"> -<!-- -<a href="http://miuras.net/matsushita.html"> -<img src="anti_matsushita.PNG" height="46" width="266"> -</a> ---> -<a href="http://miuras.net/matsushita.html">M</a> -<a href="http://www.micropac.co.jp/nec/">N</a> - <h1>Oniguruma</h1> <p> -2005/03/07 (C) K.Kosako +2006/09/19 (C) K.Kosako </p> <p> @@ -29,10 +21,13 @@ The characteristics of this library is that different character encoding <dt><b>Supported character encodings:</b><br> ASCII, UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE,<br> EUC-JP, EUC-TW, EUC-KR, EUC-CN,<br> -Shift_JIS, Big5, KOI8-R, KOI8,<br> +Shift_JIS, Big5, GB 18030, KOI8-R, KOI8,<br> ISO-8859-1, ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5,<br> ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10,<br> -ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 +ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16<br> +<font color="red"> +(GB 18030 encoding was contributed by KUBO Takehiro) +</font> </p> </dl> <p> @@ -42,8 +37,8 @@ ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16 <dt><b>What's new</b> </font> <ul> -<li>released Version 3.7.1 (2005/03/07) -<li>released Version 2.4.2 (2005/03/05) +<li>Version 4.4.4 released. (2006/09/19) +<li>Version 2.5.7 released. (2006/07/28) </ul> </dl> @@ -75,23 +70,24 @@ It follows the BSD license in the case of the one except for it. 
<dt><b>Download:</b> <ul> -<li> <a href="archive/onigd20050307.tar.gz">Latest release version 3.7.1</a> (2005/03/07) <a href="HISTORY_3X.txt">Change Log</a> -<li> <a href="archive/onigd20050219.tar.gz">3.7.0</a> (2005/02/19) -<li> <a href="archive/onigd20050204.tar.gz">3.6.0</a> (2005/02/04) -<li> <a href="archive/onigd2_4_2.tar.gz">Latest release version 2.4.2</a> (2005/03/05) <a href="HISTORY_2X.txt">Change Log</a> -<li> <a href="archive/onigd2_4_1.tar.gz">2.4.1</a> (2005/01/05) -<li> <a href="archive/onigd2_4_0.tar.gz">2.4.0</a> (2004/12/01) +<li> <a href="archive/onig-4.4.4.tar.gz">Latest release version 4.4.4</a> (2006/09/19) <a href="HISTORY_4X.txt">Change Log</a> +<li> <a href="archive/onig-4.4.3.tar.gz">4.4.3</a> (2006/09/15) +<li> <a href="archive/onig-4.4.2.tar.gz">4.4.2</a> (2006/09/08) +<li> <a href="archive/onig-4.4.1.tar.gz">4.4.1</a> (2006/08/29) +<li> <a href="archive/onigd2_5_7.tar.gz">Latest release version 2.5.7</a> (2006/07/28) <a href="HISTORY_2X.txt">Change Log</a> +<li> <a href="archive/onigd2_5_6.tar.gz">2.5.6</a> (2006/05/29) +<li> <a href="archive/onigd2_5_5.tar.gz">2.5.5</a> (2006/05/08) </ul> <br> <font color="red"> -* 3.X.X supports UTF-16/UTF-32, Ruby 1.9.X.<br> -* 2.X.X does not support UTF-16/UTF-32, supports Ruby 1.6/1.8. +* 4.X.X supports UTF-16/UTF-32, Ruby 1.9.X.<br> +* 2.X.X does not support UTF-16/UTF-32, supports Ruby 1.6/1.8.[2-4] </font> <br> <br> -<dt><b>Documents:</b> (version 3.7.1) +<dt><b>Documents:</b> (version 4.4.4) <ul> <li> <a href="doc/RE.txt">Regular Expressions</a> <a href="doc/RE.ja.txt">(Japanese: EUC-JP)</a> @@ -112,23 +108,39 @@ It follows the BSD license in the case of the one except for it. 
<li> <a href="http://www.ruby-lang.org/cgi-bin/cvsweb.cgi/oniguruma/">Oniguruma in Ruby CVS</a> (old version) <li> <a href="http://raa.ruby-lang.org/project/oniguruma/">Oniguruma in RAA</a> (Ruby Application Archive) <li> <a href="http://www.freebsd.org/cgi/cvsweb.cgi/ports/devel/oniguruma/">FreeBSD ports</a> -<li> <a href="http://www.halbiz.com/osaru/cnregex.html">cnRegex 4D Plugin</a> (Japanese page) +<li> <a href="http://www.softantenna.com/lib/1953/index.html">SoftAntenna > Lib > Oniguruma</a> (Japanese page) +<li> <a href="http://homepage3.nifty.com/k-takata/mysoft/bregonig.html">bregonig.dll</a> (Japanese page) +<li> <a href="http://www.halbiz.com/osaru/cnregex.html">cnRegex 4D Plugin (Mac OS X)</a> (Japanese page) +<li> <a href="http://kmaebashi.com/">new script language crowbar</a> (Japanese page) +<li> <a href="http://homepage2.nifty.com/Km/onig.htm">Delphi interface (Win32)</a> (Japanese page) +<li> <a href="http://pyxis-project.net/ensemble/">Ensemble (Mac OS X)</a> (Japanese page) <li> <a href="http://www.tom.sfc.keio.ac.jp/~sakai/d/?date=20050209">GHC patch</a> Masahiro Sakai (Japanese Blog) -<li> <a href="http://www.gyazsquare.com/gyazmail/index.php">GyazMail</a> -<li> <a href="http://www.artman21.net/">Jedit X</a> +<li> <a href="http://www.gyazsquare.com/gyazmail/index.php">GyazMail (Mac OS X)</a> +<li> <a href="http://www.artman21.net/">Jedit X (Mac OS X)</a> <li> <a href="http://www.chitora.jp/lhaz.html">Lhaz</a> (Japanese page) +<li> <a href="http://limechat.net/">LimeChat</a> (Japanese page) <li> <a href="http://www.irori.org/tool/mregexp.html">mregexp</a> (Japanese page) -<li> <a href="http://www.trinity-site.net/wiki/index.php?MultiFind">MultiFind</a> (Japanese page) <li> <a href="http://ochusha.sourceforge.jp/">Ochusha</a> (Japanese page) -<li> <a href="http://www-gauge.scphys.kyoto-u.ac.jp/~sonobe/OgreKit/index.html">OgreKit</a> Regular Expression Framework for Cocoa (Japanese page) -<li> <a href 
="http://www.kanetaka.net/4dapi/wiki4d.dll/4dcgi/wiki.cgi?plugins-oniguruma">OnigRegexp</a> (Japanese page) -<li> <a href ="http://www.moriq.com/onig/">Oniguruma / FireBird (Win32)</a> -<li> <a href ="http://openspace.timedia.co.jp/~yasuyuki/wiliki/wiliki.cgi?Oniguruma-mysqld&l=jp">Oniguruma-mysqld</a> -<li> <a href ="http://www.kt.rim.or.jp/~kbk/sed/index.html">Onigsed (Win32)</a> (Japanese page) +<li> <a href="http://www8.ocn.ne.jp/%7esonoisa/OgreKit/index.html">OgreKit (Mac OS X)</a> Regular Expression Framework for Cocoa (Japanese page) +<li> <a href="http://www.kanetaka.net/4dapi/wiki4d.dll/4dcgi/wiki.cgi?plugins-oniguruma">OnigRegexp</a> (Japanese page) +<li> <a href="http://www.moriq.com/onig/">Oniguruma / FireBird (Win32)</a> +<li> <a href="http://openspace.timedia.co.jp/~yasuyuki/wiliki/wiliki.cgi?Oniguruma-mysqld&l=jp">Oniguruma-mysqld</a> +<li> <a href="http://www.void.in/wiki/OnigPP">OnigPP</a> (Japanese page) +<li> <a href="http://www.kt.rim.or.jp/~kbk/sed/index.html">Onigsed (Win32)</a> (Japanese page) +<li> <a href="http://www.kt.rim.or.jp/~kbk/yagrep/index.html">yagrep (Win32)</a> (Japanese page) <li> <a href="http://www.php.gr.jp/">Japan PHP User Group</a> PHP 5.0 mb_ereg (Japanese page) +<li> <a href="http://yatsu.info/wiki/Pufui/">Pufui (Mac OS X)</a> (Japanese page) +<li> <a href="http://harumune.s56.xrea.com/assari/index.php?RSSTyping">RSSTyping</a> (Japanese page) <li> <a href="http://www.ruby-lang.org/">Ruby</a> -<li> <a href="http://quux.s74.xrea.com/">SevenFour</a> (Japanese page) -<li> <a href="http://www8.ocn.ne.jp/~sonoisa/TiddlyWikiPod/">TiddlyWikiPod</a> +<li> <a href="http://tobysoft.net/wiki/index.php?Ruby%2Fruby-win32-oniguruma">ruby-win32-oniguruma</a> (Japanese page) +<li> <a href="http://quux.s74.xrea.com/">SevenFour (Mac OS X)</a> (Japanese page) +<li> <a href="http://storklab.cyber-ninja.jp/">Stork Lab. 
Products (Mac OS X)</a> (Japanese page) +<li> <a href="http://sourceforge.jp/projects/ttssh2/">TeraTerm</a> +<li> <a href="http://macromates.com/">TextMate (Mac OS X)</a> +<li> <a href="http://www8.ocn.ne.jp/~sonoisa/TiddlyWikiPod/">TiddlyWikiPod (Mac OS X)</a> +<li> <a href="http://www.cyanworks.net/mac.html">TunesTEXT (Mac OS X)</a> +<li> <a href="http://sourceforge.jp/projects/frogger/">XML parser</a> +<li> <a href="http://www.yokkasoft.net/">YokkaSoft</a> </ul> <br> @@ -138,41 +150,41 @@ It follows the BSD license in the case of the one except for it. <li> <a href="http://www.perldoc.com/perl5.8.0/pod/perlre.html">Perl regular expressions</a> <li> <a href="http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html">java.util.regex.Pattern (J2SE 1.4.2)</a> <li> <a href="http://www.opengroup.org/onlinepubs/007908799/xbd/re.html">The Open Group</a> -<li> <a href="http://www.pcre.org/">PCRE</a> -<!-- -<li> <a href="http://www.jajakarta.org/regexp/">Jakarta Project Regexp</a> (Japanese page) -<li> <a href="http://www.jajakarta.org/oro/">Jakarta Project ORO</a> (Japanese page) ---> -<li> <a href="http://www.kt.rim.or.jp/~kbk/regex/regex.html">Regular expressions memo</a> (Japanese page) -<li> <a href="http://www.din.or.jp/~ohzaki/regex.htm">Regular expressions technique</a> (Japanese page) <li> <a href="http://regex.info/">Mastering Regular Expressions</a> +<li> <a href="http://www.unicode.org/">Unicode Home Page</a> </ul> <br> -<!-- -<dt><b>ToDo:</b> +<dt><b>Resources:</b> <ul> -<li> support character types for all code point range. +<li> <a href="http://www.kt.rim.or.jp/~kbk/regex/regex.html">Regular expressions memo</a> (Japanese page) +<li> <a href="http://www.din.or.jp/~ohzaki/regex.htm">Regular expressions technique</a> (Japanese page) +<li> <a href="http://staff.aist.go.jp/tanaka-akira/textprocess/">"Text Processing" Lecture documents (Tanaka Akira)</a> (Japanese page) </ul> ---> + +<br> </dl> <p> and I'm thankful to Akinori MUSHA. 
</p> -<!-- <hr> -<font color="red"> -2004-06-14<br> -To: "Greg A. Woods"<br> -I can't send mail to you. (rejected)<br> -Please set the nmatch argument of regexec() to 1, -and use Oniguruma 3.7.1 or 2.4.2.<br> -The nmatch argument should be array size of a pmatch.<br> -But I don't know whether this problem is related to the crash -that you reported. -</font> ---> +<dl> +<dt><b>Other Libraries:</b> +<ul> +<li> <a href="http://www.boost.org/libs/regex/doc/">Boost.Regex</a> +<li> <a href="http://www.pcre.org/">PCRE</a> +<li> <a href="http://arglist.com/regex/">A copy of Henry Spencer's</a> +<li> <a href="http://re2c.org/">re2c</a> +<li> <a href="http://tiny-rex.sourceforge.net/">T-Rex</a> +<li> <a href="http://laurikari.net/tre/">TRE</a> +<li> <a href="http://www.cacas.org/java/gnu/regexp/">gnu.regexp for Java</a> +<li> <a href="http://jakarta.apache.org/regexp/index.html">Jakarta Project Regexp</a> +<li> <a href="http://jakarta.apache.org/oro/">Jakarta Project ORO</a> +</ul> +</dl> + <hr> +<a href="../">Back to Home</a> </body> </html> diff --git a/ext/mbstring/oniguruma/onigcmpt200.h b/ext/mbstring/oniguruma/onigcmpt200.h index 4c029304b..d9b141914 100644 --- a/ext/mbstring/oniguruma/onigcmpt200.h +++ b/ext/mbstring/oniguruma/onigcmpt200.h @@ -29,6 +29,12 @@ #define REGCODE_EUCJP REG_ENCODING_EUC_JP #define REGCODE_SJIS REG_ENCODING_SJIS +/* Don't use REGCODE_XXXX. 
(obsoleted) */ +#define MBCTYPE_ASCII RE_MBCTYPE_ASCII +#define MBCTYPE_EUC RE_MBCTYPE_EUC +#define MBCTYPE_SJIS RE_MBCTYPE_SJIS +#define MBCTYPE_UTF8 RE_MBCTYPE_UTF8 + typedef unsigned char* RegTransTableType; #define RegOptionType OnigOptionType #define RegDistance OnigDistance diff --git a/ext/mbstring/oniguruma/oniggnu.h b/ext/mbstring/oniguruma/oniggnu.h index b203f6c8a..3da9f235c 100644 --- a/ext/mbstring/oniguruma/oniggnu.h +++ b/ext/mbstring/oniguruma/oniggnu.h @@ -35,10 +35,10 @@ extern "C" { #endif -#define MBCTYPE_ASCII 0 -#define MBCTYPE_EUC 1 -#define MBCTYPE_SJIS 2 -#define MBCTYPE_UTF8 3 +#define RE_MBCTYPE_ASCII 0 +#define RE_MBCTYPE_EUC 1 +#define RE_MBCTYPE_SJIS 2 +#define RE_MBCTYPE_UTF8 3 /* GNU regex options */ #ifndef RE_NREGS diff --git a/ext/mbstring/oniguruma/oniguruma.h b/ext/mbstring/oniguruma/oniguruma.h index 279035610..a0107cbe3 100644 --- a/ext/mbstring/oniguruma/oniguruma.h +++ b/ext/mbstring/oniguruma/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" { #endif #define ONIGURUMA -#define ONIGURUMA_VERSION_MAJOR 3 -#define ONIGURUMA_VERSION_MINOR 7 -#define ONIGURUMA_VERSION_TEENY 1 +#define ONIGURUMA_VERSION_MAJOR 4 +#define ONIGURUMA_VERSION_MINOR 4 +#define ONIGURUMA_VERSION_TEENY 4 #ifdef __cplusplus # ifndef HAVE_PROTOTYPES @@ -49,6 +49,13 @@ extern "C" { # endif #endif +/* escape Mac OS X/Xcode 2.4/gcc 4.0.1 problem */ +#if defined(__APPLE__) && defined(__GNUC__) && __GNUC__ >= 4 +# ifndef HAVE_STDARG_PROTOTYPES +# define HAVE_STDARG_PROTOTYPES 1 +# endif +#endif + #ifndef P_ #if defined(__STDC__) || defined(_WIN32) # define P_(args) args @@ -167,10 +174,10 @@ typedef struct { int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf); int (*mbc_to_normalize)(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to); int (*is_mbc_ambiguous)(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end); - int (*get_all_pair_ambig_codes)(OnigAmbigType flag, OnigPairAmbigCodes** acs); - int (*get_all_comp_ambig_codes)(OnigAmbigType flag, OnigCompAmbigCodes** acs); + int (*get_all_pair_ambig_codes)(OnigAmbigType flag, const OnigPairAmbigCodes** acs); + int (*get_all_comp_ambig_codes)(OnigAmbigType flag, const OnigCompAmbigCodes** acs); int (*is_code_ctype)(OnigCodePoint code, unsigned int ctype); - int (*get_ctype_code_range)(int ctype, OnigCodePoint* sb_range[], OnigCodePoint* mb_range[]); + int (*get_ctype_code_range)(int ctype, const OnigCodePoint* sb_range[], const OnigCodePoint* mb_range[]); OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p); int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end); } OnigEncodingType; @@ -206,6 +213,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingSJIS; ONIG_EXTERN OnigEncodingType OnigEncodingKOI8; ONIG_EXTERN OnigEncodingType OnigEncodingKOI8_R; ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; +ONIG_EXTERN 
OnigEncodingType OnigEncodingGB18030; #define ONIG_ENCODING_ASCII (&OnigEncodingASCII) #define ONIG_ENCODING_ISO_8859_1 (&OnigEncodingISO_8859_1) @@ -236,6 +244,7 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; #define ONIG_ENCODING_KOI8 (&OnigEncodingKOI8) #define ONIG_ENCODING_KOI8_R (&OnigEncodingKOI8_R) #define ONIG_ENCODING_BIG5 (&OnigEncodingBIG5) +#define ONIG_ENCODING_GB18030 (&OnigEncodingGB18030) #endif /* else RUBY && M17N */ @@ -448,7 +457,7 @@ int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p)); #define ONIG_NREGION 10 #define ONIG_MAX_BACKREF_NUM 1000 #define ONIG_MAX_REPEAT_NUM 100000 -#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 1000 +#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 10000 /* constants */ #define ONIG_MAX_ERROR_MESSAGE_LEN 90 @@ -457,8 +466,8 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE /* options */ -#define ONIG_OPTION_NONE 0 -#define ONIG_OPTION_IGNORECASE 1L +#define ONIG_OPTION_NONE 0U +#define ONIG_OPTION_IGNORECASE 1U #define ONIG_OPTION_EXTEND (ONIG_OPTION_IGNORECASE << 1) #define ONIG_OPTION_MULTILINE (ONIG_OPTION_EXTEND << 1) #define ONIG_OPTION_SINGLELINE (ONIG_OPTION_MULTILINE << 1) @@ -471,6 +480,7 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1) #define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) #define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) +#define ONIG_OPTION_MAXBIT ONIG_OPTION_POSIX_REGION /* limit */ #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) #define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) @@ -484,6 +494,7 @@ typedef struct { OnigOptionType options; /* default option */ } OnigSyntaxType; +ONIG_EXTERN OnigSyntaxType OnigSyntaxASIS; ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixBasic; ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixExtended; ONIG_EXTERN OnigSyntaxType OnigSyntaxEmacs; @@ -491,9 +502,11 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxGrep; ONIG_EXTERN OnigSyntaxType 
OnigSyntaxGnuRegex; ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG; ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; /* predefined syntaxes (see regsyntax.c) */ +#define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS) #define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) #define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) #define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) @@ -501,6 +514,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; #define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex) #define ONIG_SYNTAX_JAVA (&OnigSyntaxJava) #define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) +#define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG) #define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) /* default syntax */ @@ -508,80 +522,81 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYNTAX_DEFAULT OnigDefaultSyntax /* syntax (operators) */ -#define ONIG_SYN_OP_VARIABLE_META_CHARACTERS (1<<0) -#define ONIG_SYN_OP_DOT_ANYCHAR (1<<1) /* . */ -#define ONIG_SYN_OP_ASTERISK_ZERO_INF (1<<2) /* * */ -#define ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (1<<3) -#define ONIG_SYN_OP_PLUS_ONE_INF (1<<4) /* + */ -#define ONIG_SYN_OP_ESC_PLUS_ONE_INF (1<<5) -#define ONIG_SYN_OP_QMARK_ZERO_ONE (1<<6) /* ? */ -#define ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (1<<7) -#define ONIG_SYN_OP_BRACE_INTERVAL (1<<8) /* {lower,upper} */ -#define ONIG_SYN_OP_ESC_BRACE_INTERVAL (1<<9) /* \{lower,upper\} */ -#define ONIG_SYN_OP_VBAR_ALT (1<<10) /* | */ -#define ONIG_SYN_OP_ESC_VBAR_ALT (1<<11) /* \| */ -#define ONIG_SYN_OP_LPAREN_SUBEXP (1<<12) /* (...) */ -#define ONIG_SYN_OP_ESC_LPAREN_SUBEXP (1<<13) /* \(...\) */ -#define ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (1<<14) /* \A, \Z, \z */ -#define ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (1<<15) /* \G */ -#define ONIG_SYN_OP_DECIMAL_BACKREF (1<<16) /* \num */ -#define ONIG_SYN_OP_BRACKET_CC (1<<17) /* [...] */ -#define ONIG_SYN_OP_ESC_W_WORD (1<<18) /* \w, \W */ -#define ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (1<<19) /* \<. 
\> */ -#define ONIG_SYN_OP_ESC_B_WORD_BOUND (1<<20) /* \b, \B */ -#define ONIG_SYN_OP_ESC_S_WHITE_SPACE (1<<21) /* \s, \S */ -#define ONIG_SYN_OP_ESC_D_DIGIT (1<<22) /* \d, \D */ -#define ONIG_SYN_OP_LINE_ANCHOR (1<<23) /* ^, $ */ -#define ONIG_SYN_OP_POSIX_BRACKET (1<<24) /* [:xxxx:] */ -#define ONIG_SYN_OP_QMARK_NON_GREEDY (1<<25) /* ??,*?,+?,{n,m}? */ -#define ONIG_SYN_OP_ESC_CONTROL_CHARS (1<<26) /* \n,\r,\t,\a ... */ -#define ONIG_SYN_OP_ESC_C_CONTROL (1<<27) /* \cx */ -#define ONIG_SYN_OP_ESC_OCTAL3 (1<<28) /* \OOO */ -#define ONIG_SYN_OP_ESC_X_HEX2 (1<<29) /* \xHH */ -#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1<<30) /* \x{7HHHHHHH} */ - -#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1<<0) /* \Q...\E */ -#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1<<1) /* (?...) */ -#define ONIG_SYN_OP2_OPTION_PERL (1<<2) /* (?imsx),(?-imsx) */ -#define ONIG_SYN_OP2_OPTION_RUBY (1<<3) /* (?imx), (?-imx) */ -#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1<<4) /* ?+,*+,++ */ -#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1<<5) /* {n,m}+ */ -#define ONIG_SYN_OP2_CCLASS_SET_OP (1<<6) /* [...&&..[..]..] */ -#define ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (1<<7) /* (?<name>...) */ -#define ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (1<<8) /* \k<name> */ -#define ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (1<<9) /* \g<name>, \g<n> */ -#define ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (1<<10) /* (?@..),(?@<x>..) 
*/ -#define ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1<<11) /* \C-x */ -#define ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (1<<12) /* \M-x */ -#define ONIG_SYN_OP2_ESC_V_VTAB (1<<13) /* \v as VTAB */ -#define ONIG_SYN_OP2_ESC_U_HEX4 (1<<14) /* \uHHHH */ -#define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1<<15) /* \`, \' */ -#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */ -#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1<<17) /* \p{^..}, \P{^..} */ -#define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1<<18) /* \p{IsXDigit} */ -#define ONIG_SYN_OP2_ESC_H_XDIGIT (1<<19) /* \h, \H */ +#define ONIG_SYN_OP_VARIABLE_META_CHARACTERS (1U<<0) +#define ONIG_SYN_OP_DOT_ANYCHAR (1U<<1) /* . */ +#define ONIG_SYN_OP_ASTERISK_ZERO_INF (1U<<2) /* * */ +#define ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (1U<<3) +#define ONIG_SYN_OP_PLUS_ONE_INF (1U<<4) /* + */ +#define ONIG_SYN_OP_ESC_PLUS_ONE_INF (1U<<5) +#define ONIG_SYN_OP_QMARK_ZERO_ONE (1U<<6) /* ? */ +#define ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (1U<<7) +#define ONIG_SYN_OP_BRACE_INTERVAL (1U<<8) /* {lower,upper} */ +#define ONIG_SYN_OP_ESC_BRACE_INTERVAL (1U<<9) /* \{lower,upper\} */ +#define ONIG_SYN_OP_VBAR_ALT (1U<<10) /* | */ +#define ONIG_SYN_OP_ESC_VBAR_ALT (1U<<11) /* \| */ +#define ONIG_SYN_OP_LPAREN_SUBEXP (1U<<12) /* (...) */ +#define ONIG_SYN_OP_ESC_LPAREN_SUBEXP (1U<<13) /* \(...\) */ +#define ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (1U<<14) /* \A, \Z, \z */ +#define ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (1U<<15) /* \G */ +#define ONIG_SYN_OP_DECIMAL_BACKREF (1U<<16) /* \num */ +#define ONIG_SYN_OP_BRACKET_CC (1U<<17) /* [...] */ +#define ONIG_SYN_OP_ESC_W_WORD (1U<<18) /* \w, \W */ +#define ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (1U<<19) /* \<. 
\> */ +#define ONIG_SYN_OP_ESC_B_WORD_BOUND (1U<<20) /* \b, \B */ +#define ONIG_SYN_OP_ESC_S_WHITE_SPACE (1U<<21) /* \s, \S */ +#define ONIG_SYN_OP_ESC_D_DIGIT (1U<<22) /* \d, \D */ +#define ONIG_SYN_OP_LINE_ANCHOR (1U<<23) /* ^, $ */ +#define ONIG_SYN_OP_POSIX_BRACKET (1U<<24) /* [:xxxx:] */ +#define ONIG_SYN_OP_QMARK_NON_GREEDY (1U<<25) /* ??,*?,+?,{n,m}? */ +#define ONIG_SYN_OP_ESC_CONTROL_CHARS (1U<<26) /* \n,\r,\t,\a ... */ +#define ONIG_SYN_OP_ESC_C_CONTROL (1U<<27) /* \cx */ +#define ONIG_SYN_OP_ESC_OCTAL3 (1U<<28) /* \OOO */ +#define ONIG_SYN_OP_ESC_X_HEX2 (1U<<29) /* \xHH */ +#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1U<<30) /* \x{7HHHHHHH} */ + +#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1U<<0) /* \Q...\E */ +#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1U<<1) /* (?...) */ +#define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsx),(?-imsx) */ +#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imx), (?-imx) */ +#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1U<<4) /* ?+,*+,++ */ +#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1U<<5) /* {n,m}+ */ +#define ONIG_SYN_OP2_CCLASS_SET_OP (1U<<6) /* [...&&..[..]..] */ +#define ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (1U<<7) /* (?<name>...) */ +#define ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (1U<<8) /* \k<name> */ +#define ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (1U<<9) /* \g<name>, \g<n> */ +#define ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (1U<<10) /* (?@..),(?@<x>..) 
*/ +#define ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1U<<11) /* \C-x */ +#define ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (1U<<12) /* \M-x */ +#define ONIG_SYN_OP2_ESC_V_VTAB (1U<<13) /* \v as VTAB */ +#define ONIG_SYN_OP2_ESC_U_HEX4 (1U<<14) /* \uHHHH */ +#define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1U<<15) /* \`, \' */ +#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1U<<16) /* \p{...}, \P{...} */ +#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1U<<17) /* \p{^..}, \P{^..} */ +#define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1U<<18) /* \p{IsXDigit} */ +#define ONIG_SYN_OP2_ESC_H_XDIGIT (1U<<19) /* \h, \H */ +#define ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (1U<<20) /* \ */ /* syntax (behavior) */ -#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */ -#define ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (1<<0) /* ?, *, +, {n,m} */ -#define ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (1<<1) /* error or ignore */ -#define ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1<<2) /* ...)... */ -#define ONIG_SYN_ALLOW_INVALID_INTERVAL (1<<3) /* {??? */ -#define ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (1<<4) /* {,n} => {0,n} */ -#define ONIG_SYN_STRICT_CHECK_BACKREF (1<<5) /* /(\1)/,/\1()/ ..*/ -#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */ -#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1<<7) /* see doc/RE */ -#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1<<8) /* (?<x>)(?<x>) */ -#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1<<9) /* a{n}?=(?:a{n})? */ +#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ +#define ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (1U<<0) /* ?, *, +, {n,m} */ +#define ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (1U<<1) /* error or ignore */ +#define ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1U<<2) /* ...)... */ +#define ONIG_SYN_ALLOW_INVALID_INTERVAL (1U<<3) /* {??? 
*/ +#define ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (1U<<4) /* {,n} => {0,n} */ +#define ONIG_SYN_STRICT_CHECK_BACKREF (1U<<5) /* /(\1)/,/\1()/ ..*/ +#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1U<<6) /* (?<=a|bc) */ +#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1U<<7) /* see doc/RE */ +#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?<x>)(?<x>) */ +#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ /* syntax (behavior) in char class [...] */ -#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<20) /* [^...] */ -#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1<<21) /* [..\w..] etc.. */ -#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1<<22) -#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ +#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */ +#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) +#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ /* syntax (behavior) warning */ -#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1<<24) /* [,-,] */ -#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1<<25) /* (?:a*)+ */ +#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ +#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ /* meta character specifiers (onig_set_meta_char()) */ #define ONIG_META_CHAR_ESCAPE 0 @@ -660,6 +675,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 #define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 #define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402 +#define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403 /* errors related to thread */ #define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 @@ -735,6 +751,7 @@ typedef struct re_pattern_buffer { int num_mem; /* used memory(...) 
num counted from 1 */ int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ int num_null_check; /* OP_NULL_CHECK_START/END id counter */ + int num_comb_exp_check; /* combination explosion check */ int num_call; /* number of subexp call */ unsigned int capture_history; /* (?@...) flag (1-31) */ unsigned int bt_mem_start; /* need backtrack flag */ @@ -766,7 +783,13 @@ typedef struct re_pattern_buffer { /* regex_t link chain */ struct re_pattern_buffer* chain; /* escape compile-conflict */ -} regex_t; +} OnigRegexType; + +typedef OnigRegexType* OnigRegex; + +#ifndef ONIG_ESCAPE_REGEX_T_COLLISION + typedef OnigRegexType regex_t; +#endif typedef struct { @@ -788,19 +811,19 @@ void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN -int onig_new P_((regex_t**, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +int onig_new P_((OnigRegex*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_new_deluxe P_((regex_t** reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +int onig_new_deluxe P_((OnigRegex* reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN -void onig_free P_((regex_t*)); +void onig_free P_((OnigRegex)); ONIG_EXTERN -int onig_recompile P_((regex_t*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +int onig_recompile P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_recompile_deluxe P_((regex_t* reg, const OnigUChar* pattern, const OnigUChar* 
pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); +int onig_recompile_deluxe P_((OnigRegex reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); ONIG_EXTERN -int onig_search P_((regex_t*, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); +int onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN -int onig_match P_((regex_t*, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); +int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN @@ -816,29 +839,31 @@ int onig_region_resize P_((OnigRegion* region, int n)); ONIG_EXTERN int onig_region_set P_((OnigRegion* region, int at, int beg, int end)); ONIG_EXTERN -int onig_name_to_group_numbers P_((regex_t* reg, const OnigUChar* name, const OnigUChar* name_end, int** nums)); +int onig_name_to_group_numbers P_((OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, int** nums)); ONIG_EXTERN -int onig_name_to_backref_number P_((regex_t* reg, const OnigUChar* name, const OnigUChar* name_end, OnigRegion *region)); +int onig_name_to_backref_number P_((OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, OnigRegion *region)); ONIG_EXTERN -int onig_foreach_name P_((regex_t* reg, int (*func)(const OnigUChar*, const OnigUChar*,int,int*,regex_t*,void*), void* arg)); +int onig_foreach_name P_((OnigRegex reg, int (*func)(const OnigUChar*, const OnigUChar*,int,int*,OnigRegex,void*), void* arg)); ONIG_EXTERN -int onig_number_of_names P_((regex_t* reg)); +int onig_number_of_names P_((OnigRegex reg)); ONIG_EXTERN -int onig_number_of_captures P_((regex_t* 
reg)); +int onig_number_of_captures P_((OnigRegex reg)); ONIG_EXTERN -int onig_number_of_capture_histories P_((regex_t* reg)); +int onig_number_of_capture_histories P_((OnigRegex reg)); ONIG_EXTERN OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region)); ONIG_EXTERN int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg)); ONIG_EXTERN -OnigEncoding onig_get_encoding P_((regex_t* reg)); +int onig_noname_group_capture_is_active P_((OnigRegex reg)); +ONIG_EXTERN +OnigEncoding onig_get_encoding P_((OnigRegex reg)); ONIG_EXTERN -OnigOptionType onig_get_options P_((regex_t* reg)); +OnigOptionType onig_get_options P_((OnigRegex reg)); ONIG_EXTERN -OnigAmbigType onig_get_ambig_flag P_((regex_t* reg)); +OnigAmbigType onig_get_ambig_flag P_((OnigRegex reg)); ONIG_EXTERN -OnigSyntaxType* onig_get_syntax P_((regex_t* reg)); +OnigSyntaxType* onig_get_syntax P_((OnigRegex reg)); ONIG_EXTERN int onig_set_default_syntax P_((OnigSyntaxType* syntax)); ONIG_EXTERN diff --git a/ext/mbstring/oniguruma/regcomp.c b/ext/mbstring/oniguruma/regcomp.c index a2315fcec..9b862657d 100644 --- a/ext/mbstring/oniguruma/regcomp.c +++ b/ext/mbstring/oniguruma/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -186,6 +186,17 @@ add_opcode(regex_t* reg, int opcode) return 0; } +#ifdef USE_COMBINATION_EXPLOSION_CHECK +static int +add_state_check_num(regex_t* reg, int num) +{ + StateCheckNumType n = (StateCheckNumType )num; + + BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM); + return 0; +} +#endif + static int add_rel_addr(regex_t* reg, int addr) { @@ -644,7 +655,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) } p[id].lower = lower; - p[id].upper = upper; + p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); return 0; } @@ -684,7 +695,254 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, return r; } +static int +is_anychar_star_qualifier(QualifierNode* qn) +{ + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + NTYPE(qn->target) == N_ANYCHAR) + return 1; + else + return 0; +} + #define QUALIFIER_EXPAND_LIMIT_SIZE 50 +#define CKN_ON (ckn > 0) + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +static int +compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int len, mod_tlen, cklen; + int ckn; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); + + cklen = (CKN_ON ? 
SIZE_STATE_CHECK_NUM: 0); + + /* anychar repeat */ + if (NTYPE(qn->target) == N_ANYCHAR) { + if (qn->greedy && infinite) { + if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen; + else + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen; + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && qn->lower <= 1) { + if (qn->greedy) { + if (qn->lower == 1) + len = SIZE_OP_JUMP; + else + len = 0; + + len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP; + } + else { + if (qn->lower == 0) + len = SIZE_OP_JUMP; + else + len = 0; + + len += mod_tlen + SIZE_OP_PUSH + cklen; + } + } + else if (qn->upper == 0) { + if (qn->is_refered != 0) /* /(?<n>..){0}/ */ + len = SIZE_OP_JUMP + tlen; + else + len = 0; + } + else if (qn->upper == 1 && qn->greedy) { + if (qn->lower == 0) { + if (CKN_ON) { + len = SIZE_OP_STATE_CHECK_PUSH + tlen; + } + else { + len = SIZE_OP_PUSH + tlen; + } + } + else { + len = tlen; + } + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen; + } + else { + len = SIZE_OP_REPEAT_INC + + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; + if (CKN_ON) + len += SIZE_OP_STATE_CHECK; + } + + return len; +} + +static int +compile_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int r, mod_tlen; + int ckn; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + ckn = ((reg->num_comb_exp_check > 0) ? 
qn->comb_exp_check_num : 0); + + if (is_anychar_star_qualifier(qn)) { + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) { + if (IS_MULTILINE(reg->options)) + r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); + else + r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); + if (r) return r; + if (CKN_ON) { + r = add_state_check_num(reg, ckn); + if (r) return r; + } + + return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); + } + else { + if (IS_MULTILINE(reg->options)) { + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_ML_STAR + : OP_ANYCHAR_ML_STAR)); + } + else { + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_STAR + : OP_ANYCHAR_STAR)); + } + if (r) return r; + if (CKN_ON) + r = add_state_check_num(reg, ckn); + + return r; + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && qn->lower <= 1) { + if (qn->greedy) { + if (qn->lower == 1) { + r = add_opcode_rel_addr(reg, OP_JUMP, + (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)); + if (r) return r; + } + + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); + } + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + (int )SIZE_OP_JUMP + + (int )(CKN_ON ? 
SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH))); + } + else { + if (qn->lower == 0) { + r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen); + if (r) return r; + } + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, + -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP)); + } + else + r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH)); + } + } + else if (qn->upper == 0) { + if (qn->is_refered != 0) { /* /(?<n>..){0}/ */ + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else + r = 0; + } + else if (qn->upper == 1 && qn->greedy) { + if (qn->lower == 0) { + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, tlen); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, tlen); + } + if (r) return r; + } + + r = compile_tree(qn->target, reg); + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' 
*/ + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, SIZE_OP_JUMP); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); + } + + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else { + r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg); + if (CKN_ON) { + if (r) return r; + r = add_opcode(reg, OP_STATE_CHECK); + if (r) return r; + r = add_state_check_num(reg, ckn); + } + } + return r; +} + +#else /* USE_COMBINATION_EXPLOSION_CHECK */ static int compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) @@ -752,16 +1010,6 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) } static int -is_anychar_star_qualifier(QualifierNode* qn) -{ - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && - NTYPE(qn->target) == N_ANYCHAR) - return 1; - else - return 0; -} - -static int compile_qualifier_node(QualifierNode* qn, regex_t* reg) { int i, r, mod_tlen; @@ -887,6 +1135,7 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) } return r; } +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ static int compile_length_option_node(EffectNode* node, regex_t* reg) @@ -1268,8 +1517,15 @@ compile_length_tree(Node* node, regex_t* reg) { BackrefNode* br = &(NBACKREF(node)); +#ifdef USE_BACKREF_AT_LEVEL + if (IS_BACKREF_NEST_LEVEL(br)) { + r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH + + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); + } + else +#endif if (br->back_num == 1) { - r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 3) + r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2) ? 
SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM)); } else { @@ -1381,9 +1637,21 @@ compile_tree(Node* node, regex_t* reg) case N_BACKREF: { - int i; BackrefNode* br = &(NBACKREF(node)); +#ifdef USE_BACKREF_AT_LEVEL + if (IS_BACKREF_NEST_LEVEL(br)) { + r = add_opcode(reg, OP_BACKREF_AT_LEVEL); + if (r) return r; + r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE)); + if (r) return r; + r = add_length(reg, br->nest_level); + if (r) return r; + + goto add_bacref_mems; + } + else +#endif if (br->back_num == 1) { n = br->back_static[0]; if (IS_IGNORECASE(reg->options)) { @@ -1395,7 +1663,6 @@ compile_tree(Node* node, regex_t* reg) switch (n) { case 1: r = add_opcode(reg, OP_BACKREF1); break; case 2: r = add_opcode(reg, OP_BACKREF2); break; - case 3: r = add_opcode(reg, OP_BACKREF3); break; default: r = add_opcode(reg, OP_BACKREFN); if (r) return r; @@ -1405,17 +1672,21 @@ compile_tree(Node* node, regex_t* reg) } } else { + int i; int* p; if (IS_IGNORECASE(reg->options)) { - add_opcode(reg, OP_BACKREF_MULTI_IC); + r = add_opcode(reg, OP_BACKREF_MULTI_IC); } else { - add_opcode(reg, OP_BACKREF_MULTI); + r = add_opcode(reg, OP_BACKREF_MULTI); } - if (r) return r; - add_length(reg, br->back_num); + +#ifdef USE_BACKREF_AT_LEVEL + add_bacref_mems: +#endif + r = add_length(reg, br->back_num); if (r) return r; p = BACKREFS_P(br); for (i = br->back_num - 1; i >= 0; i--) { @@ -2120,29 +2391,6 @@ get_char_length_tree(Node* node, regex_t* reg, int* len) return get_char_length_tree1(node, reg, len, 0); } -extern int -onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) -{ - int found; - - if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) { - if (IS_NULL(cc->mbuf)) { - found = 0; - } - else { - found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); - } - } - else { - found = (BITSET_AT(cc->bs, code) == 0 ? 
0 : 1); - } - - if (IS_CCLASS_NOT(cc)) - return !found; - else - return found; -} - /* x is not included y ==> 1 : 0 */ static int is_not_included(Node* x, Node* y, regex_t* reg) @@ -2516,6 +2764,9 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) case N_QUALIFIER: r = subexp_inf_recursive_check(NQUALIFIER(node).target, env, head); + if (r == RECURSION_EXIST) { + if (NQUALIFIER(node).lower == 0) r = 0; + } break; case N_ANCHOR: @@ -2943,15 +3194,55 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } + +static int +divide_ambig_string_node_sub(regex_t* reg, int prev_ambig, + UChar* prev_start, UChar* prev, + UChar* end, Node*** tailp, Node** root) +{ + UChar *tmp, *wp; + Node* snode; + + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < prev) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, prev); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (*tailp == (Node** )0) { + *root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*root, ONIGERR_MEMORY); + *tailp = &(NCONS(*root).right); + } + else { + **tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(**tailp, ONIGERR_MEMORY); + *tailp = &(NCONS(**tailp).right); + } + + return 0; +} + static int divide_ambig_string_node(Node* node, regex_t* reg) { StrNode* sn = &NSTRING(node); int ambig, prev_ambig; UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp; - Node *snode; Node *root = NULL_NODE; Node **tailp = (Node** )0; + int r; start = prev_start = p = sn->s; end = sn->end; @@ -2964,33 +3255,9 @@ divide_ambig_string_node(Node* node, regex_t* reg) if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end))) { - if (prev_ambig != 0) { - tmp = 
prev_start; - wp = prev_start; - while (tmp < prev) { - wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, - &tmp, end, wp); - } - snode = onig_node_new_str(prev_start, wp); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - NSTRING_SET_AMBIG(snode); - if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); - } - else { - snode = onig_node_new_str(prev_start, prev); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - } - - if (tailp == (Node** )0) { - root = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); - tailp = &(NCONS(root).right); - } - else { - *tailp = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); - tailp = &(NCONS(*tailp).right); - } + r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, prev, + end, &tailp, &root); + if (r != 0) return r; prev_ambig = ambig; prev_start = prev; @@ -3011,41 +3278,157 @@ divide_ambig_string_node(Node* node, regex_t* reg) } } else { - if (prev_ambig != 0) { - tmp = prev_start; - wp = prev_start; - while (tmp < end) { - wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, - &tmp, end, wp); - } - snode = onig_node_new_str(prev_start, wp); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); - NSTRING_SET_AMBIG(snode); - if (wp != end) NSTRING_SET_AMBIG_REDUCE(snode); + r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, end, + end, &tailp, &root); + if (r != 0) return r; + + swap_node(node, root); + onig_node_str_clear(root); /* should be after swap! 
*/ + onig_node_free(root); /* free original string node */ + } + + return 0; +} + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +#define CEC_THRES_NUM_BIG_REPEAT 512 +#define CEC_INFINITE_NUM 0x7fffffff + +#define CEC_IN_INFINITE_REPEAT (1<<0) +#define CEC_IN_FINITE_REPEAT (1<<1) +#define CEC_CONT_BIG_REPEAT (1<<2) + +static int +setup_comb_exp_check(Node* node, int state, ScanEnv* env) +{ + int type; + int r = state; + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + Node* prev = NULL_NODE; + do { + r = setup_comb_exp_check(NCONS(node).left, r, env); + prev = NCONS(node).left; + } while (r >= 0 && IS_NOT_NULL(node = NCONS(node).right)); } - else { - snode = onig_node_new_str(prev_start, end); - CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + break; + + case N_ALT: + { + int ret; + do { + ret = setup_comb_exp_check(NCONS(node).left, state, env); + r |= ret; + } while (ret >= 0 && IS_NOT_NULL(node = NCONS(node).right)); } + break; + + case N_QUALIFIER: + { + int child_state = state; + int add_state = 0; + QualifierNode* qn = &(NQUALIFIER(node)); + Node* target = qn->target; + int var_num; + + if (! IS_REPEAT_INFINITE(qn->upper)) { + if (qn->upper > 1) { + /* {0,1}, {1,1} are allowed */ + child_state |= CEC_IN_FINITE_REPEAT; + + /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */ + if (env->backrefed_mem == 0) { + if (NTYPE(qn->target) == N_EFFECT) { + EffectNode* en = &(NEFFECT(qn->target)); + if (en->type == EFFECT_MEMORY) { + if (NTYPE(en->target) == N_QUALIFIER) { + QualifierNode* q = &(NQUALIFIER(en->target)); + if (IS_REPEAT_INFINITE(q->upper) + && q->greedy == qn->greedy) { + qn->upper = (qn->lower == 0 ? 
1 : qn->lower); + if (qn->upper == 1) + child_state = state; + } + } + } + } + } + } + } + + if (state & CEC_IN_FINITE_REPEAT) { + qn->comb_exp_check_num = -1; + } + else { + if (IS_REPEAT_INFINITE(qn->upper)) { + var_num = CEC_INFINITE_NUM; + child_state |= CEC_IN_INFINITE_REPEAT; + } + else { + var_num = qn->upper - qn->lower; + } - if (tailp == (Node** )0) { - root = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); - tailp = &(NCONS(node).right); + if (var_num >= CEC_THRES_NUM_BIG_REPEAT) + add_state |= CEC_CONT_BIG_REPEAT; + + if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) || + ((state & CEC_CONT_BIG_REPEAT) != 0 && + var_num >= CEC_THRES_NUM_BIG_REPEAT)) { + if (qn->comb_exp_check_num == 0) { + env->num_comb_exp_check++; + qn->comb_exp_check_num = env->num_comb_exp_check; + if (env->curr_max_regnum > env->comb_exp_max_regnum) + env->comb_exp_max_regnum = env->curr_max_regnum; + } + } + } + + r = setup_comb_exp_check(target, child_state, env); + r |= add_state; } - else { - *tailp = onig_node_new_list(snode, NULL); - CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); - tailp = &(NCONS(*tailp).right); + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + switch (en->type) { + case EFFECT_MEMORY: + { + if (env->curr_max_regnum < en->regnum) + env->curr_max_regnum = en->regnum; + + r = setup_comb_exp_check(en->target, state, env); + } + break; + + default: + r = setup_comb_exp_check(en->target, state, env); + break; + } } + break; - swap_node(node, root); - onig_node_str_clear(root); /* should be after swap! 
*/ - onig_node_free(root); /* free original string node */ +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&(NCALL(node)))) + env->has_recursion = 1; + else + r = setup_comb_exp_check(NCALL(node).target, state, env); + break; +#endif + + default: + break; } - return 0; + return r; } +#endif #define IN_ALT (1<<0) #define IN_NOT (1<<1) @@ -3116,6 +3499,11 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; BIT_STATUS_ON_AT(env->backrefed_mem, p[i]); BIT_STATUS_ON_AT(env->bt_mem_start, p[i]); +#ifdef USE_BACKREF_AT_LEVEL + if (IS_BACKREF_NEST_LEVEL(br)) { + BIT_STATUS_ON_AT(env->bt_mem_end, p[i]); + } +#endif SET_EFFECT_STATUS(nodes[p[i]], NST_MEM_BACKREFED); } } @@ -3263,11 +3651,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) #define ALLOWED_EFFECT_IN_LB_NOT 0 #define ALLOWED_ANCHOR_IN_LB \ -( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) +( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION ) #define ALLOWED_ANCHOR_IN_LB_NOT \ -( ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) - /* can't allow all anchors, because \G in look-behind through Search(). - ex. /(?<=\G)zz/.match("azz") => success. 
*/ +( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION ) case ANCHOR_LOOK_BEHIND: { @@ -3383,7 +3769,7 @@ typedef struct { static int map_position_value(OnigEncoding enc, int i) { - static short int ByteValTable[] = { + static const short int ByteValTable[] = { 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, @@ -3408,7 +3794,7 @@ static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static short int dist_vals[] = { + static const short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, @@ -3604,9 +3990,10 @@ copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from) } static void -concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) +concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc) { - int i, n; + int i, j, len; + UChar *p, *end; OptAncInfo tanc; if (! to->ignore_case && add->ignore_case) { @@ -3615,11 +4002,17 @@ concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) to->ignore_case = 1; } - for (i = to->len, n = 0; n < add->len && i < OPT_EXACT_MAXLEN; i++, n++) - to->s[i] = add->s[n]; + p = add->s; + end = p + add->len; + for (i = to->len; p < end; ) { + len = enc_len(enc, p); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len && p < end; j++) + to->s[i++] = *p++; + } to->len = i; - to->reach_end = (n == add->len ? add->reach_end : 0); + to->reach_end = (p == end ? add->reach_end : 0); concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1); if (! 
to->reach_end) tanc.right_anchor = 0; @@ -3634,15 +4027,10 @@ concat_opt_exact_info_str(OptExactInfo* to, UChar *p; for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { - if (raw) { + len = enc_len(enc, p); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len && p < end; j++) to->s[i++] = *p++; - } - else { - len = enc_len(enc, p); - if (i + len > OPT_EXACT_MAXLEN) break; - for (j = 0; j < len; j++) - to->s[i++] = *p++; - } } to->len = i; @@ -3711,7 +4099,7 @@ select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt) static void clear_opt_map_info(OptMapInfo* map) { - static OptMapInfo clean_info = { + static const OptMapInfo clean_info = { {0, 0}, {0, 0}, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -3758,8 +4146,8 @@ add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, int i, j, n, len; UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN]; OnigCodePoint code, ccode; - OnigCompAmbigCodes* ccs; - OnigPairAmbigCodes* pccs; + const OnigCompAmbigCodes* ccs; + const OnigPairAmbigCodes* pccs; OnigAmbigType amb; add_char_opt_map_info(map, p[0], enc); @@ -3907,11 +4295,11 @@ concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add) if (add->exb.len > 0) { if (exb_reach) { - concat_opt_exact_info(&to->exb, &add->exb); + concat_opt_exact_info(&to->exb, &add->exb, enc); clear_opt_exact_info(&add->exb); } else if (exm_reach) { - concat_opt_exact_info(&to->exm, &add->exb); + concat_opt_exact_info(&to->exm, &add->exb, enc); clear_opt_exact_info(&add->exb); } } @@ -4197,8 +4585,8 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { if (env->mmd.max == 0 && NTYPE(qn->target) == N_ANYCHAR && qn->greedy) { - if (IS_POSIXLINE(env->options)) - add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_PL); + if (IS_MULTILINE(env->options)) + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML); else add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); } @@ -4210,7 
+4598,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (nopt.exb.reach_end) { for (i = 2; i < qn->lower && ! is_full_opt_exact_info(&opt->exb); i++) { - concat_opt_exact_info(&opt->exb, &nopt.exb); + concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc); } if (i < qn->lower) { opt->exb.reach_end = 0; @@ -4316,10 +4704,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); reg->exact_end = reg->exact + e->len; - if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) - allow_reverse = 1; - else - allow_reverse = + allow_reverse = ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { @@ -4391,7 +4776,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) if (r) return r; reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF | - ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL); + ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML); reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF); @@ -4503,7 +4888,7 @@ print_anchor(FILE* f, int anchor) q = 1; fprintf(f, "anychar-star"); } - if (anchor & ANCHOR_ANYCHAR_STAR_PL) { + if (anchor & ANCHOR_ANYCHAR_STAR_ML) { if (q) fprintf(f, ", "); fprintf(f, "anychar-star-pl"); } @@ -4514,8 +4899,8 @@ print_anchor(FILE* f, int anchor) static void print_optimize_info(FILE* f, regex_t* reg) { - static char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", - "EXACT_IC", "MAP" }; + static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", + "EXACT_IC", "MAP" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); @@ -4624,7 +5009,6 @@ onig_chain_reduce(regex_t* reg) { regex_t *head, *prev; - THREAD_ATOMIC_START; prev = reg; head = prev->chain; if (IS_NOT_NULL(head)) { @@ -4636,7 +5020,6 @@ onig_chain_reduce(regex_t* reg) prev->chain = 
(regex_t* )NULL; REGEX_TRANSFER(reg, head); } - THREAD_ATOMIC_END; } #if 0 @@ -4753,6 +5136,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->num_null_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (OnigRepeatRange* )NULL; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + reg->num_comb_exp_check = 0; +#endif r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; @@ -4806,6 +5192,33 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->bt_mem_end |= reg->capture_history; } +#ifdef USE_COMBINATION_EXPLOSION_CHECK + if (scan_env.backrefed_mem == 0 +#ifdef USE_SUBEXP_CALL + || scan_env.num_call == 0 +#endif + ) { + setup_comb_exp_check(root, 0, &scan_env); +#ifdef USE_SUBEXP_CALL + if (scan_env.has_recursion != 0) { + scan_env.num_comb_exp_check = 0; + } + else +#endif + if (scan_env.comb_exp_max_regnum > 0) { + int i; + for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) { + if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) { + scan_env.num_comb_exp_check = 0; + break; + } + } + } + } + + reg->num_comb_exp_check = scan_env.num_comb_exp_check; +#endif + clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE r = set_optimize_info_from_tree(root, reg, &scan_env); @@ -4875,6 +5288,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, return r; } +#ifdef USE_RECOMPILE_API extern int onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, @@ -4893,6 +5307,7 @@ onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } return 0; } +#endif static int onig_inited = 0; @@ -4906,6 +5321,11 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, if (ONIGENC_IS_UNDEF(enc)) return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED; + if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) + == 
(ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) { + return ONIGERR_INVALID_COMBINATION_OF_OPTIONS; + } + *reg = (regex_t* )xmalloc(sizeof(regex_t)); if (IS_NULL(*reg)) return ONIGERR_MEMORY; (*reg)->state = ONIG_STATE_MODIFY; @@ -4991,14 +5411,14 @@ onig_end() onig_print_statistics(stderr); #endif -#ifdef USE_RECYCLE_NODE - onig_free_node_list(); -#endif - #ifdef USE_SHARED_CCLASS_TABLE onig_free_shared_cclass_table(); #endif +#ifdef USE_RECYCLE_NODE + onig_free_node_list(); +#endif + onig_inited = 0; THREAD_ATOMIC_END; @@ -5008,6 +5428,16 @@ onig_end() #ifdef ONIG_DEBUG +/* arguments type */ +#define ARG_SPECIAL -1 +#define ARG_NON 0 +#define ARG_RELADDR 1 +#define ARG_ABSADDR 2 +#define ARG_LENGTH 3 +#define ARG_MEMNUM 4 +#define ARG_OPTION 5 +#define ARG_STATE_CHECK 6 + OnigOpInfoType OnigOpInfo[] = { { OP_FINISH, "finish", ARG_NON }, { OP_END, "end", ARG_NON }, @@ -5038,62 +5468,68 @@ OnigOpInfoType OnigOpInfo[] = { { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON }, { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL }, - { OP_WORD, "word", ARG_NON }, - { OP_NOT_WORD, "not-word", ARG_NON }, - { OP_WORD_SB, "word-sb", ARG_NON }, - { OP_WORD_MB, "word-mb", ARG_NON }, - { OP_WORD_BOUND, "word-bound", ARG_NON }, - { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, - { OP_WORD_BEGIN, "word-begin", ARG_NON }, - { OP_WORD_END, "word-end", ARG_NON }, - { OP_BEGIN_BUF, "begin-buf", ARG_NON }, - { OP_END_BUF, "end-buf", ARG_NON }, - { OP_BEGIN_LINE, "begin-line", ARG_NON }, - { OP_END_LINE, "end-line", ARG_NON }, - { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, - { OP_BEGIN_POSITION, "begin-position", ARG_NON }, - { OP_BACKREF1, "backref1", ARG_NON }, - { OP_BACKREF2, "backref2", ARG_NON }, - { OP_BACKREF3, "backref3", ARG_NON }, - { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, - { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, - { 
OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, - { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, - { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, - { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, - { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, - { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, - { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, - { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, - { OP_SET_OPTION, "set-option", ARG_OPTION }, - { OP_FAIL, "fail", ARG_NON }, - { OP_JUMP, "jump", ARG_RELADDR }, - { OP_PUSH, "push", ARG_RELADDR }, - { OP_POP, "pop", ARG_NON }, - { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, - { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, - { OP_REPEAT, "repeat", ARG_SPECIAL }, - { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, - { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, - { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, - { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, - { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, - { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, - { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, - { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, - { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, - { OP_PUSH_POS, "push-pos", ARG_NON }, - { OP_POP_POS, "pop-pos", ARG_NON }, - { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, - { OP_FAIL_POS, "fail-pos", ARG_NON }, - { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, - { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, - { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, + { OP_WORD, "word", ARG_NON }, + { OP_NOT_WORD, "not-word", ARG_NON }, + { OP_WORD_SB, "word-sb", ARG_NON }, + { OP_WORD_MB, "word-mb", ARG_NON }, + { OP_WORD_BOUND, "word-bound", ARG_NON }, + { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, + { OP_WORD_BEGIN, "word-begin", ARG_NON }, + { OP_WORD_END, "word-end", ARG_NON }, + { OP_BEGIN_BUF, "begin-buf", ARG_NON }, + { OP_END_BUF, "end-buf", 
ARG_NON }, + { OP_BEGIN_LINE, "begin-line", ARG_NON }, + { OP_END_LINE, "end-line", ARG_NON }, + { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, + { OP_BEGIN_POSITION, "begin-position", ARG_NON }, + { OP_BACKREF1, "backref1", ARG_NON }, + { OP_BACKREF2, "backref2", ARG_NON }, + { OP_BACKREFN, "backrefn", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, + { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, + { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL }, + { OP_BACKREF_AT_LEVEL, "backref_at_level", ARG_SPECIAL }, + { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, + { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, + { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, + { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, + { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, + { OP_SET_OPTION, "set-option", ARG_OPTION }, + { OP_FAIL, "fail", ARG_NON }, + { OP_JUMP, "jump", ARG_RELADDR }, + { OP_PUSH, "push", ARG_RELADDR }, + { OP_POP, "pop", ARG_NON }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, + { OP_REPEAT, "repeat", ARG_SPECIAL }, + { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, + { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, + { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, + { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM }, + { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, + { OP_PUSH_POS, "push-pos", ARG_NON }, + { OP_POP_POS, "pop-pos", ARG_NON }, + { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, + { OP_FAIL_POS, "fail-pos", ARG_NON }, + { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, + { 
OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, + { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, - { OP_CALL, "call", ARG_ABSADDR }, - { OP_RETURN, "return", ARG_NON }, + { OP_CALL, "call", ARG_ABSADDR }, + { OP_RETURN, "return", ARG_NON }, + { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL }, + { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL }, + { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_ML_STAR, + "state-check-anychar-ml*", ARG_STATE_CHECK }, { -1, "", ARG_NON } }; @@ -5152,6 +5588,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, RelAddrType addr; LengthType len; MemNumType mem; + StateCheckNumType scn; OnigCodePoint code; UChar *q; @@ -5186,6 +5623,12 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, fprintf(f, ":%d", option); } break; + + case ARG_STATE_CHECK: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + fprintf(f, ":%d", scn); + break; } } else { @@ -5312,6 +5755,26 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, } break; + case OP_BACKREF_AT_LEVEL: + { + OnigOptionType option; + LengthType level; + + GET_OPTION_INC(option, bp); + fprintf(f, ":%d", option); + GET_LENGTH_INC(level, bp); + fprintf(f, ":%d", level); + + fputs(" ", f); + GET_LENGTH_INC(len, bp); + for (i = 0; i < len; i++) { + GET_MEMNUM_INC(mem, bp); + if (i > 0) fputs(", ", f); + fprintf(f, "%d", mem); + } + } + break; + case OP_REPEAT: case OP_REPEAT_NG: { @@ -5343,6 +5806,15 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, fprintf(f, ":%d:(%d)", len, addr); break; + case OP_STATE_CHECK_PUSH: + case OP_STATE_CHECK_PUSH_OR_JUMP: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + addr = *((RelAddrType* )bp); + bp += 
SIZE_RELADDR; + fprintf(f, ":%d:(%d)", scn, addr); + break; + default: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", *--bp); diff --git a/ext/mbstring/oniguruma/regenc.c b/ext/mbstring/oniguruma/regenc.c index a767ca60b..bbbf1a2f9 100644 --- a/ext/mbstring/oniguruma/regenc.c +++ b/ext/mbstring/oniguruma/regenc.c @@ -175,7 +175,7 @@ onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) #define USE_APPLICATION_TO_LOWER_CASE_TABLE -unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { +const unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x228c, 0x2289, 0x2288, 0x2288, 0x2288, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -251,7 +251,7 @@ static const UChar BuiltInAsciiToLowerCaseTable[] = { #endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */ #ifdef USE_UPPER_CASE_TABLE -UChar OnigEncAsciiToUpperCaseTable[256] = { +const UChar OnigEncAsciiToUpperCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -287,7 +287,7 @@ UChar OnigEncAsciiToUpperCaseTable[256] = { }; #endif -unsigned short OnigEncAsciiCtypeTable[256] = { +const unsigned short OnigEncAsciiCtypeTable[256] = { 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, @@ -323,7 +323,7 @@ unsigned short OnigEncAsciiCtypeTable[256] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; -UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { +const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', 
'\024', '\025', '\026', '\027', @@ -359,7 +359,7 @@ UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { }; #ifdef USE_UPPER_CASE_TABLE -UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { +const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', @@ -417,7 +417,7 @@ onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UC return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); } -OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { +const OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { { 0x41, 0x61 }, { 0x42, 0x62 }, { 0x43, 0x63 }, @@ -475,7 +475,7 @@ OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { extern int onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { *ccs = OnigAsciiPairAmbigCodes; @@ -488,16 +488,16 @@ onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, extern int onigenc_nothing_get_all_comp_ambig_codes(OnigAmbigType flag, - OnigCompAmbigCodes** ccs) + const OnigCompAmbigCodes** ccs) { return 0; } extern int onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, - OnigPairAmbigCodes** ccs) + const OnigPairAmbigCodes** ccs) { - static OnigPairAmbigCodes cc[] = { + static const OnigPairAmbigCodes cc[] = { { 0xc0, 0xe0 }, { 0xc1, 0xe1 }, { 0xc2, 0xe2 }, @@ -577,9 +577,9 @@ onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, extern int onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, - OnigCompAmbigCodes** ccs) + const OnigCompAmbigCodes** ccs) { - static OnigCompAmbigCodes folds[] = { + static const OnigCompAmbigCodes folds[] = { { 2, 0xdf, {{ 2, { 0x53, 0x53 } }, { 2, { 0x73, 0x73} } } } }; @@ -593,7 +593,7 @@ onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, extern int 
onigenc_not_support_get_ctype_code_range(int ctype, - OnigCodePoint* sbr[], OnigCodePoint* mbr[]) + const OnigCodePoint* sbr[], const OnigCodePoint* mbr[]) { return ONIG_NO_SUPPORT_CONFIG; } @@ -830,10 +830,10 @@ onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) if ((code & 0xff000000) != 0) { *p++ = (UChar )((code >> 24) & 0xff); } - if ((code & 0xff0000) != 0) { + if ((code & 0xff0000) != 0 || p != buf) { *p++ = (UChar )((code >> 16) & 0xff); } - if ((code & 0xff00) != 0) { + if ((code & 0xff00) != 0 || p != buf) { *p++ = (UChar )((code >> 8) & 0xff); } *p++ = (UChar )(code & 0xff); @@ -849,40 +849,32 @@ extern int onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; + } } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } extern int onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { - if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + if ((ctype & (ONIGENC_CTYPE_WORD | + ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) { return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? 
TRUE : FALSE); - - ctype &= ~ONIGENC_CTYPE_WORD; - if (ctype == 0) return FALSE; + } } - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else - return FALSE; + return FALSE; } extern int diff --git a/ext/mbstring/oniguruma/regenc.h b/ext/mbstring/oniguruma/regenc.h index 510455146..58ee3e7f2 100644 --- a/ext/mbstring/oniguruma/regenc.h +++ b/ext/mbstring/oniguruma/regenc.h @@ -4,7 +4,7 @@ regenc.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -65,15 +65,17 @@ #else /* ONIG_RUBY_M17N */ #define USE_UNICODE_FULL_RANGE_CTYPE +/* following must not use with USE_CRNL_AS_LINE_TERMINATOR */ +/* #define USE_UNICODE_ALL_LINE_TERMINATORS */ /* see Unicode.org UTF#18 */ #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII /* for encoding system implementation (internal) */ -ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); -ONIG_EXTERN int onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); -ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); -ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); -ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, OnigCodePoint* sbr[], OnigCodePoint* mbr[])); +ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, const OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, const OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, const 
OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, const OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, const OnigCodePoint* sbr[], const OnigCodePoint* mbr[])); ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end)); /* methods for single byte encoding */ @@ -105,7 +107,7 @@ ONIG_EXTERN int onigenc_get_all_fold_match_code_ss_0xdf P_((OnigCodePoint** code /* in enc/unicode.c */ ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); -ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, OnigCodePoint* sbr[], OnigCodePoint* mbr[])); +ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, const OnigCodePoint* sbr[], const OnigCodePoint* mbr[])); #define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ @@ -115,10 +117,10 @@ ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, OnigCodePoin #define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ ((OnigEnc_Unicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) -ONIG_EXTERN UChar OnigEncISO_8859_1_ToLowerCaseTable[]; -ONIG_EXTERN UChar OnigEncISO_8859_1_ToUpperCaseTable[]; -ONIG_EXTERN unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; -ONIG_EXTERN OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; +ONIG_EXTERN const UChar OnigEncISO_8859_1_ToLowerCaseTable[]; +ONIG_EXTERN const UChar OnigEncISO_8859_1_ToUpperCaseTable[]; +ONIG_EXTERN const unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; +ONIG_EXTERN const OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; #endif /* is not ONIG_RUBY_M17N */ @@ -133,7 +135,7 @@ extern int onig_is_in_code_range P_((const UChar* p, OnigCodePoint code)); ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; ONIG_EXTERN const UChar* OnigEncAsciiToLowerCaseTable; ONIG_EXTERN const UChar OnigEncAsciiToUpperCaseTable[]; -ONIG_EXTERN unsigned short OnigEncAsciiCtypeTable[]; +ONIG_EXTERN 
const unsigned short OnigEncAsciiCtypeTable[]; #define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] #define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c] diff --git a/ext/mbstring/oniguruma/regerror.c b/ext/mbstring/oniguruma/regerror.c index 560b5e12c..ad73b76c3 100644 --- a/ext/mbstring/oniguruma/regerror.c +++ b/ext/mbstring/oniguruma/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,12 +38,12 @@ #define va_init_list(a,b) va_start(a) #endif -extern char* +extern UChar* onig_error_code_to_format(int code) { char *p; - if (code >= 0) return (char* )0; + if (code >= 0) return (UChar* )0; switch (code) { case ONIG_MISMATCH: @@ -170,6 +170,8 @@ onig_error_code_to_format(int code) p = "invalid character property name {%n}"; break; case ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION: p = "not supported encoding combination"; break; + case ONIGERR_INVALID_COMBINATION_OF_OPTIONS: + p = "invalid combination of options"; break; case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT: p = "over thread pass limit count"; break; @@ -177,7 +179,7 @@ onig_error_code_to_format(int code) p = "undefined error code"; break; } - return p; + return (UChar* )p; } @@ -256,36 +258,36 @@ onig_error_code_to_str(s, code, va_alist) void #ifdef HAVE_STDARG_PROTOTYPES -onig_snprintf_with_pattern(char buf[], int bufsize, OnigEncoding enc, - char* pat, char* pat_end, char *fmt, ...) +onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, + UChar* pat, UChar* pat_end, const UChar *fmt, ...) 
#else onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) - char buf[]; + UChar buf[]; int bufsize; OnigEncoding enc; - char* pat; - char* pat_end; - const char *fmt; + UChar* pat; + UChar* pat_end; + const UChar *fmt; va_dcl #endif { int n, need, len; UChar *p, *s, *bp; - char bs[6]; + UChar bs[6]; va_list args; va_init_list(args, fmt); - n = vsnprintf(buf, bufsize, fmt, args); + n = vsnprintf((char* )buf, bufsize, (const char* )fmt, args); va_end(args); need = (pat_end - pat) * 4 + 4; if (n + need < bufsize) { - strcat(buf, ": /"); + strcat((char* )buf, ": /"); s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf); p = pat; - while (p < (UChar* )pat_end) { + while (p < pat_end) { if (*p == MC_ESC(enc)) { *s++ = *p++; len = enc_len(enc, p); @@ -304,7 +306,7 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) int blen; while (len-- > 0) { - sprintf(bs, "\\%03o", *p++ & 0377); + sprintf((char* )bs, "\\%03o", *p++ & 0377); blen = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); bp = bs; while (blen-- > 0) *s++ = *bp++; @@ -313,7 +315,7 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) } else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && !ONIGENC_IS_CODE_SPACE(enc, *p)) { - sprintf(bs, "\\%03o", *p++ & 0377); + sprintf((char* )bs, "\\%03o", *p++ & 0377); len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); bp = bs; while (len-- > 0) *s++ = *bp++; diff --git a/ext/mbstring/oniguruma/regexec.c b/ext/mbstring/oniguruma/regexec.c index 25d97773f..769ed30c9 100644 --- a/ext/mbstring/oniguruma/regexec.c +++ b/ext/mbstring/oniguruma/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,6 +29,12 @@ #include "regint.h" +#ifdef USE_CRNL_AS_LINE_TERMINATOR +#define ONIGENC_IS_MBC_CRNL(enc,p,end) \ + (ONIGENC_MBC_TO_CODE(enc,p,end) == 13 && \ + ONIGENC_IS_MBC_NEWLINE(enc,(p+enc_len(enc,p)),end)) +#endif + #ifdef USE_CAPTURE_HISTORY static void history_tree_free(OnigCaptureTreeNode* node); @@ -300,6 +306,9 @@ typedef struct _StackType { UChar *pcode; /* byte code position */ UChar *pstr; /* string position */ UChar *pstr_prev; /* previous char position of pstr */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + unsigned int state_check; +#endif } state; struct { int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ @@ -333,28 +342,28 @@ typedef struct _StackType { /* stack type */ /* used by normal-POP */ #define STK_ALT 0x0001 -#define STK_LOOK_BEHIND_NOT 0x0003 -#define STK_POS_NOT 0x0005 -/* avoided by normal-POP, but value should be small */ -#define STK_NULL_CHECK_START 0x0100 +#define STK_LOOK_BEHIND_NOT 0x0002 +#define STK_POS_NOT 0x0003 /* handled by normal-POP */ -#define STK_MEM_START 0x0200 -#define STK_MEM_END 0x0300 -#define STK_REPEAT_INC 0x0400 +#define STK_MEM_START 0x0100 +#define STK_MEM_END 0x8200 +#define STK_REPEAT_INC 0x0300 +#define STK_STATE_CHECK_MARK 0x1000 /* avoided by normal-POP */ +#define STK_NULL_CHECK_START 0x3000 +#define STK_NULL_CHECK_END 0x5000 /* for recursive call */ +#define STK_MEM_END_MARK 0x8400 #define STK_POS 0x0500 /* used when POP-POS */ #define STK_STOP_BT 0x0600 /* mark for "(?>...)" */ #define STK_REPEAT 0x0700 #define STK_CALL_FRAME 0x0800 #define STK_RETURN 0x0900 -#define STK_MEM_END_MARK 0x0a00 -#define STK_VOID 0x0b00 /* for fill a blank */ -#define STK_NULL_CHECK_END 0x0c00 /* for recursive call */ +#define STK_VOID 0x0a00 /* for fill a blank */ /* stack type check mask */ -#define STK_MASK_POP_USED 0x00ff -#define IS_TO_VOID_TARGET(stk) \ - (((stk)->type & STK_MASK_POP_USED) || (stk)->type == STK_NULL_CHECK_START) +#define 
STK_MASK_POP_USED 0x00ff +#define STK_MASK_TO_VOID_TARGET 0x10ff +#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ typedef struct { void* stack_p; @@ -362,6 +371,10 @@ typedef struct { OnigOptionType options; OnigRegion* region; const UChar* start; /* search start position (for \G: BEGIN_POSITION) */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + void* state_check_buff; + int state_check_buff_size; +#endif } MatchArg; #define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ @@ -371,7 +384,37 @@ typedef struct { (msa).start = (arg_start);\ } while (0) -#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +#define STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE 16 + +#define STATE_CHECK_BUFF_INIT(msa, str_len, state_num) do { \ + (msa).state_check_buff = (void* )0;\ + (msa).state_check_buff_size = 0;\ + if ((state_num) > 0 && str_len >= STATE_CHECK_STRING_THRESHOLD_LEN) {\ + int size = ((int )((str_len) + 1) * (state_num) + 7) / 8;\ + (msa).state_check_buff_size = size; \ + if (size > 0 && size < STATE_CHECK_BUFF_MAX_SIZE) {\ + if (size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) \ + (msa).state_check_buff = (void* )xmalloc(size);\ + else \ + (msa).state_check_buff = (void* )xalloca(size);\ + xmemset((msa).state_check_buff, 0, (size_t )size);\ + }\ + }\ +} while (0) + +#define MATCH_ARG_FREE(msa) do {\ + if ((msa).stack_p) xfree((msa).stack_p);\ + if ((msa).state_check_buff_size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) { \ + if ((msa).state_check_buff) xfree((msa).state_check_buff);\ + }\ +} while (0); +#else +#define STATE_CHECK_BUFF_INIT(msa, str_len, state_num) +#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +#endif + #define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\ @@ -465,26 +508,88 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, #define STACK_AT(index) (stk_base + (index)) #define GET_STACK_INDEX(stk) ((stk) - stk_base) +#define 
STACK_PUSH_TYPE(stack_type) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + STACK_INC;\ +} while(0) + +#define IS_TO_VOID_TARGET(stk) (((stk)->type & STK_MASK_TO_VOID_TARGET) != 0) + +#ifdef USE_COMBINATION_EXPLOSION_CHECK +#define STATE_CHECK_POS(s,snum) \ + (((s) - str) * num_comb_exp_check + ((snum) - 1)) +#define STATE_CHECK_VAL(v,snum) do {\ + if (state_check_buff != NULL) {\ + int x = STATE_CHECK_POS(s,snum);\ + (v) = state_check_buff[x/8] & (1<<(x%8));\ + }\ + else (v) = 0;\ +} while(0) + + +#define ELSE_IF_STATE_CHECK_MARK(stk) \ + else if ((stk)->type == STK_STATE_CHECK_MARK) { \ + int x = STATE_CHECK_POS(stk->u.state.pstr, stk->u.state.state_check);\ + state_check_buff[x/8] |= (1<<(x%8)); \ + } + #define STACK_PUSH(stack_type,pat,s,sprev) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ stk->u.state.pstr_prev = (sprev);\ + stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) #define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ + stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) -#define STACK_PUSH_TYPE(stack_type) do {\ +#define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum) do {\ STACK_ENSURE(1);\ + stk->type = STK_ALT;\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + stk->u.state.state_check = ((state_check_buff != NULL) ? 
(snum) : 0);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_STATE_CHECK(s,snum) do {\ + if (state_check_buff != NULL) {\ + STACK_ENSURE(1);\ + stk->type = STK_STATE_CHECK_MARK;\ + stk->u.state.pstr = (s);\ + stk->u.state.state_check = (snum);\ + STACK_INC;\ + }\ +} while(0) + +#else /* USE_COMBINATION_EXPLOSION_CHECK */ + +#define ELSE_IF_STATE_CHECK_MARK(stk) + +#define STACK_PUSH(stack_type,pat,s,sprev) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ STACK_INC;\ } while(0) +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_POS(s,sprev) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev) @@ -544,7 +649,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, k = stk;\ while (k > stk_base) {\ k--;\ - if ((k->type == STK_MEM_END_MARK || k->type == STK_MEM_END) \ + if ((k->type & STK_MASK_MEM_END_OR_MARK) != 0 \ && k->u.mem.num == (mnum)) {\ level++;\ }\ @@ -603,15 +708,18 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, #ifdef ONIG_DEBUG -#define STACK_BASE_CHECK(p) \ - if ((p) < stk_base) goto stack_error; +#define STACK_BASE_CHECK(p, at) \ + if ((p) < stk_base) {\ + fprintf(stderr, "at %s\n", at);\ + goto stack_error;\ + } #else -#define STACK_BASE_CHECK(p) +#define STACK_BASE_CHECK(p, at) #endif #define STACK_POP_ONE do {\ stk--;\ - STACK_BASE_CHECK(stk); \ + STACK_BASE_CHECK(stk, "STACK_POP_ONE"); \ } while(0) #define STACK_POP do {\ @@ -619,25 +727,27 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, case STACK_POP_LEVEL_FREE:\ while (1) {\ stk--;\ - STACK_BASE_CHECK(stk); \ + STACK_BASE_CHECK(stk, "STACK_POP"); \ if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ 
case STACK_POP_LEVEL_MEM_START:\ while (1) {\ stk--;\ - STACK_BASE_CHECK(stk); \ + STACK_BASE_CHECK(stk, "STACK_POP 2"); \ if ((stk->type & STK_MASK_POP_USED) != 0) break;\ else if (stk->type == STK_MEM_START) {\ mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ default:\ while (1) {\ stk--;\ - STACK_BASE_CHECK(stk); \ + STACK_BASE_CHECK(stk, "STACK_POP 3"); \ if ((stk->type & STK_MASK_POP_USED) != 0) break;\ else if (stk->type == STK_MEM_START) {\ mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ @@ -650,6 +760,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ }\ @@ -658,7 +769,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, #define STACK_POP_TIL_POS_NOT do {\ while (1) {\ stk--;\ - STACK_BASE_CHECK(stk); \ + STACK_BASE_CHECK(stk, "STACK_POP_TIL_POS_NOT"); \ if (stk->type == STK_POS_NOT) break;\ else if (stk->type == STK_MEM_START) {\ mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ @@ -671,13 +782,14 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ } while(0) #define STACK_POP_TIL_LOOK_BEHIND_NOT do {\ while (1) {\ stk--;\ - STACK_BASE_CHECK(stk); \ + STACK_BASE_CHECK(stk, "STACK_POP_TIL_LOOK_BEHIND_NOT"); \ if (stk->type == STK_LOOK_BEHIND_NOT) break;\ else if (stk->type == STK_MEM_START) {\ mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ @@ -690,6 +802,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ } while(0) @@ -697,7 +810,7 @@ stack_double(StackType** arg_stk_base, 
StackType** arg_stk_end, k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_POS_END"); \ if (IS_TO_VOID_TARGET(k)) {\ k->type = STK_VOID;\ }\ @@ -712,7 +825,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType *k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_STOP_BT_END"); \ if (IS_TO_VOID_TARGET(k)) {\ k->type = STK_VOID;\ }\ @@ -727,7 +840,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType* k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_NULL_CHECK"); \ if (k->type == STK_NULL_CHECK_START) {\ if (k->u.null_check.num == (id)) {\ (isnull) = (k->u.null_check.pstr == (s));\ @@ -742,7 +855,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType* k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_NULL_CHECK_REC"); \ if (k->type == STK_NULL_CHECK_START) {\ if (k->u.null_check.num == (id)) {\ if (level == 0) {\ @@ -762,7 +875,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType* k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_NULL_CHECK_MEMST"); \ if (k->type == STK_NULL_CHECK_START) {\ if (k->u.null_check.num == (id)) {\ if (k->u.null_check.pstr != (s)) {\ @@ -802,7 +915,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType* k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_NULL_CHECK_MEMST_REC"); \ if (k->type == STK_NULL_CHECK_START) {\ if (k->u.null_check.num == (id)) {\ if (level == 0) {\ @@ -850,7 +963,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_GET_REPEAT"); \ if (k->type == STK_REPEAT) {\ if (level == 0) {\ if (k->u.repeat.num == (id)) {\ @@ -868,7 +981,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, StackType* k 
= stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k); \ + STACK_BASE_CHECK(k, "STACK_RETURN"); \ if (k->type == STK_CALL_FRAME) {\ if (level == 0) {\ (addr) = k->u.call_frame.ret_addr;\ @@ -937,6 +1050,7 @@ static int string_cmp_ic(OnigEncoding enc, int ambig_flag, is_fail = 0; \ } while(0) + #define ON_STR_BEGIN(s) ((s) == str) #define ON_STR_END(s) ((s) == end) #define IS_EMPTY_STR (str == end) @@ -988,6 +1102,77 @@ make_capture_history_tree(OnigCaptureTreeNode* node, StackType** kp, } #endif +#ifdef USE_BACKREF_AT_LEVEL +static int mem_is_in_memp(int mem, int num, UChar* memp) +{ + int i; + MemNumType m; + + for (i = 0; i < num; i++) { + GET_MEMNUM_INC(m, memp); + if (mem == (int )m) return 1; + } + return 0; +} + +static int backref_match_at_nested_level(regex_t* reg + , StackType* top, StackType* stk_base + , int ignore_case, int ambig_flag + , int nest, int mem_num, UChar* memp, UChar** s, const UChar* send) +{ + UChar *ss, *p, *pstart, *pend = NULL_UCHARP; + int level; + StackType* k; + + level = 0; + k = top; + k--; + while (k >= stk_base) { + if (k->type == STK_CALL_FRAME) { + level--; + } + else if (k->type == STK_RETURN) { + level++; + } + else if (level == nest) { + if (k->type == STK_MEM_START) { + if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { + pstart = k->u.mem.pstr; + if (pend != NULL_UCHARP) { + if (pend - pstart > send - *s) return 0; /* or goto next_mem; */ + p = pstart; + ss = *s; + + if (ignore_case != 0) { + if (string_cmp_ic(reg->enc, ambig_flag, + pstart, &ss, (int )(pend - pstart)) == 0) + return 0; /* or goto next_mem; */ + } + else { + while (p < pend) { + if (*p++ != *ss++) return 0; /* or goto next_mem; */ + } + } + + *s = ss; + return 1; + } + } + } + else if (k->type == STK_MEM_END) { + if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { + pend = k->u.mem.pstr; + } + } + } + k--; + } + + return 0; +} +#endif /* USE_BACKREF_AT_LEVEL */ + + #ifdef RUBY_PLATFORM typedef struct { @@ -1003,7 +1188,7 @@ trap_ensure(VALUE arg) TrapEnsureArg* 
ta = (TrapEnsureArg* )arg; if (ta->state == 0) { /* trap_exec() is not normal return */ - ONIG_STATE_DEC(ta->reg); + ONIG_STATE_DEC_THREAD(ta->reg); if (! IS_NULL(ta->msa->stack_p) && ta->stk_base != ta->msa->stack_p) xfree(ta->stk_base); @@ -1165,27 +1350,43 @@ onig_is_in_code_range(const UChar* p, OnigCodePoint code) } static int -code_is_in_cclass_node(void* node, OnigCodePoint code, int enclen) +is_code_in_cc(int enclen, OnigCodePoint code, CClassNode* cc) { - unsigned int in_cc; - CClassNode* cc = (CClassNode* )node; + int found; - if (enclen == 1) { - in_cc = BITSET_AT(cc->bs, code); + if (enclen > 1 || (code >= SINGLE_BYTE_SIZE)) { + if (IS_NULL(cc->mbuf)) { + found = 0; + } + else { + found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); + } } else { - UChar* p = ((BBuf* )(cc->mbuf))->p; - in_cc = onig_is_in_code_range(p, code); + found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); } - if (IS_CCLASS_NOT(cc)) { - return (in_cc ? 0 : 1); + if (IS_CCLASS_NOT(cc)) + return !found; + else + return found; +} + +extern int +onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) +{ + int len; + + if (ONIGENC_MBC_MINLEN(enc) > 1) { + len = 2; } else { - return (in_cc ? 
1 : 0); + len = ONIGENC_CODE_TO_MBCLEN(enc, code); } + return is_code_in_cc(len, code, cc); } + /* matching region of POSIX API */ typedef int regoff_t; @@ -1217,6 +1418,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, StackIndex si; StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int scv; + unsigned char* state_check_buff = msa->state_check_buff; + int num_comb_exp_check = reg->num_comb_exp_check; +#endif n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); @@ -1739,8 +1945,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, mb_len = enc_len(encode, s); ss = s; s += mb_len; + DATA_ENSURE(0); code = ONIGENC_MBC_TO_CODE(encode, ss, s); - if (code_is_in_cclass_node(node, code, mb_len) == 0) goto fail; + if (is_code_in_cc(mb_len, code, node) == 0) goto fail; } STAT_OP_OUT; break; @@ -1826,6 +2033,47 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; break; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + case OP_STATE_CHECK_ANYCHAR_STAR: STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_STAR); + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; + } + STAT_OP_OUT; + break; + + case OP_STATE_CHECK_ANYCHAR_ML_STAR: + STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_ML_STAR); + + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enc_len(encode, s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + STAT_OP_OUT; + break; +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ + case OP_WORD: STAT_OP_IN(OP_WORD); 
DATA_ENSURE(1); if (! ONIGENC_IS_MBC_WORD(encode, s, end)) @@ -1946,6 +2194,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; continue; } +#ifdef USE_CRNL_AS_LINE_TERMINATOR + else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { + STAT_OP_OUT; + continue; + } +#endif goto fail; break; @@ -1966,6 +2220,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; continue; } +#ifdef USE_CRNL_AS_LINE_TERMINATOR + else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { + UChar* ss = s + enc_len(encode, s); + if (ON_STR_END(ss + enc_len(encode, ss))) { + STAT_OP_OUT; + continue; + } + } +#endif goto fail; break; @@ -2041,11 +2304,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, goto backref; break; - case OP_BACKREF3: STAT_OP_IN(OP_BACKREF3); - mem = 3; - goto backref; - break; - case OP_BACKREFN: STAT_OP_IN(OP_BACKREFN); GET_MEMNUM_INC(mem, p); backref: @@ -2188,6 +2446,35 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, continue; } break; + +#ifdef USE_BACKREF_AT_LEVEL + case OP_BACKREF_AT_LEVEL: + { + int len; + OnigOptionType ic; + LengthType level; + + GET_OPTION_INC(ic, p); + GET_LENGTH_INC(level, p); + GET_LENGTH_INC(tlen, p); + + sprev = s; + if (backref_match_at_nested_level(reg, stk, stk_base, ic, ambig_flag + , (int )level, (int )tlen, p, &s, end)) { + while (sprev + (len = enc_len(encode, sprev)) < s) + sprev += len; + + p += (SIZE_MEMNUM * tlen); + } + else + goto fail; + + STAT_OP_OUT; + continue; + } + + break; +#endif case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH); GET_OPTION_INC(option, p); @@ -2309,6 +2596,43 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, continue; break; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + case OP_STATE_CHECK_PUSH: STAT_OP_IN(OP_STATE_CHECK_PUSH); + GET_STATE_CHECK_NUM_INC(mem, p); + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + 
GET_RELADDR_INC(addr, p); + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + STAT_OP_OUT; + continue; + break; + + case OP_STATE_CHECK_PUSH_OR_JUMP: STAT_OP_IN(OP_STATE_CHECK_PUSH_OR_JUMP); + GET_STATE_CHECK_NUM_INC(mem, p); + GET_RELADDR_INC(addr, p); + STATE_CHECK_VAL(scv, mem); + if (scv) { + p += addr; + } + else { + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + } + STAT_OP_OUT; + continue; + break; + + case OP_STATE_CHECK: STAT_OP_IN(OP_STATE_CHECK); + GET_STATE_CHECK_NUM_INC(mem, p); + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_STATE_CHECK(s, mem); + STAT_OP_OUT; + continue; + break; +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ + case OP_POP: STAT_OP_IN(OP_POP); STACK_POP_ONE; STAT_OP_OUT; @@ -2383,7 +2707,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, repeat_inc: stkp->u.repeat.count++; - if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. 
*/ } else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { @@ -2413,8 +2737,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, repeat_inc_ng: stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper || - IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) { + if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { UChar* pcode = stkp->u.repeat.pcode; @@ -2543,6 +2866,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, p = stk->u.state.pcode; s = stk->u.state.pstr; sprev = stk->u.state.pstr_prev; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + if (stk->u.state.state_check != 0) { + stk->type = STK_STATE_CHECK_MARK; + stk++; + } +#endif + STAT_OP_OUT; continue; break; @@ -2727,66 +3058,56 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, const UChar* text, const UChar* text_end, const UChar* text_range) { - const UChar *s, *t, *p, *end; + const UChar *s, *se, *t, *p, *end; const UChar *tail; - int skip; + int skip, tlen1; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev: text: %d, text_end: %d, text_range: %d\n", (int )text, (int )text_end, (int )text_range); #endif - end = text_range + (target_end - target) - 1; - if (end > text_end) - end = text_end; - tail = target_end - 1; + tlen1 = tail - target; + end = text_range; + if (end + tlen1 > text_end) + end = text_end - tlen1; + s = text; - while ((s - text) < target_end - target) { - s += enc_len(reg->enc, s); - } - s--; /* set to text check tail position. 
*/ if (IS_NULL(reg->int_map)) { while (s < end) { - p = s; + p = se = s + tlen1; t = tail; while (t >= target && *p == *t) { - p--; t--; + p--; t--; } - if (t < target) return (UChar* )(p + 1); + if (t < target) return (UChar* )s; - skip = reg->map[*s]; - p = s + 1; - if (p >= text_end) return (UChar* )NULL; - t = p; + skip = reg->map[*se]; + t = s; do { - p += enc_len(reg->enc, p); - } while ((p - t) < skip && p < text_end); - - s += (p - t); + s += enc_len(reg->enc, s); + } while ((s - t) < skip && s < end); } } else { while (s < end) { - p = s; + p = se = s + tlen1; t = tail; while (t >= target && *p == *t) { - p--; t--; + p--; t--; } - if (t < target) return (UChar* )(p + 1); + if (t < target) return (UChar* )s; - skip = reg->int_map[*s]; - p = s + 1; - if (p >= text_end) return (UChar* )NULL; - t = p; + skip = reg->int_map[*se]; + t = s; do { - p += enc_len(reg->enc, p); - } while ((p - t) < skip && p < text_end); - - s += (p - t); + s += enc_len(reg->enc, s); + } while ((s - t) < skip && s < end); } } + return (UChar* )NULL; } @@ -2915,7 +3236,9 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On UChar *prev; MatchArg msa; -#ifdef USE_MULTI_THREAD_SYSTEM +#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) + start: + THREAD_ATOMIC_START; if (ONIG_STATE(reg) >= ONIG_STATE_NORMAL) { ONIG_STATE_INC(reg); if (IS_NOT_NULL(reg->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { @@ -2924,17 +3247,22 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On } } else { - int n = 0; + int n; + + THREAD_ATOMIC_END; + n = 0; while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { if (++n > THREAD_PASS_LIMIT_COUNT) return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; THREAD_PASS; } - ONIG_STATE_INC(reg); + goto start; } -#endif /* USE_MULTI_THREAD_SYSTEM */ + THREAD_ATOMIC_END; +#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ MATCH_ARG_INIT(msa, option, region, at); + STATE_CHECK_BUFF_INIT(msa, end - str, 
reg->num_comb_exp_check); if (region #ifdef USE_POSIX_REGION_OPTION @@ -2952,7 +3280,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On } MATCH_ARG_FREE(msa); - ONIG_STATE_DEC(reg); + ONIG_STATE_DEC_THREAD(reg); return r; } @@ -3029,7 +3357,11 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } - else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) + else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) +#ifdef USE_CRNL_AS_LINE_TERMINATOR + && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) +#endif + ) goto retry_gate; break; } @@ -3132,7 +3464,7 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, switch (reg->sub_anchor) { case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); + prev = onigenc_get_prev_char_head(reg->enc, str, p); if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { p = prev; goto retry; @@ -3149,7 +3481,11 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, goto retry; } } - else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) { + else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) +#ifdef USE_CRNL_AS_LINE_TERMINATOR + && ! 
ONIGENC_IS_MBC_CRNL(reg->enc, p, end) +#endif + ) { p = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(p)) goto fail; goto retry; @@ -3187,8 +3523,11 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, int r; UChar *s, *prev; MatchArg msa; + const UChar *orig_start = start; -#ifdef USE_MULTI_THREAD_SYSTEM +#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) + start: + THREAD_ATOMIC_START; if (ONIG_STATE(reg) >= ONIG_STATE_NORMAL) { ONIG_STATE_INC(reg); if (IS_NOT_NULL(reg->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { @@ -3197,15 +3536,19 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, } } else { - int n = 0; + int n; + + THREAD_ATOMIC_END; + n = 0; while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { if (++n > THREAD_PASS_LIMIT_COUNT) return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; THREAD_PASS; } - ONIG_STATE_INC(reg); + goto start; } -#endif /* USE_MULTI_THREAD_SYSTEM */ + THREAD_ATOMIC_END; +#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, @@ -3305,12 +3648,12 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, goto end_buf; } } - else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_PL)) { + else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_ML)) { goto begin_position; } } else if (str == end) { /* empty string */ - static const UChar* address_for_empty_string = ""; + static const UChar* address_for_empty_string = (UChar* )""; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search: empty string.\n"); @@ -3322,6 +3665,10 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, prev = (UChar* )NULL; MATCH_ARG_INIT(msa, option, region, start); +#ifdef USE_COMBINATION_EXPLOSION_CHECK + msa.state_check_buff = (void* )0; + msa.state_check_buff_size = 0; +#endif MATCH_AND_RETURN_CHECK; goto mismatch; } @@ -3333,7 +3680,8 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, (int )(end - str), (int )(start - str), (int )(range - str)); #endif - 
MATCH_ARG_INIT(msa, option, region, start); + MATCH_ARG_INIT(msa, option, region, orig_start); + STATE_CHECK_BUFF_INIT(msa, end - str, reg->num_comb_exp_check); s = (UChar* )start; if (range > start) { /* forward search */ @@ -3398,7 +3746,11 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, MATCH_AND_RETURN_CHECK; prev = s; s += enc_len(reg->enc, s); - } while (s <= range); /* exec s == range, because empty match with /$/. */ + } while (s < range); + + if (s == range) { /* because empty match with /$/. */ + MATCH_AND_RETURN_CHECK; + } } else { /* backward search */ if (reg->optimize != ONIG_OPTIMIZE_NONE) { @@ -3461,7 +3813,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, finish: MATCH_ARG_FREE(msa); - ONIG_STATE_DEC(reg); + ONIG_STATE_DEC_THREAD(reg); /* If result is mismatch and no FIND_NOT_EMPTY option, then the region is not setted in match_at(). */ @@ -3482,7 +3834,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, mismatch_no_msa: r = ONIG_MISMATCH; finish_no_msa: - ONIG_STATE_DEC(reg); + ONIG_STATE_DEC_THREAD(reg); #ifdef ONIG_DEBUG if (r != ONIG_MISMATCH) fprintf(stderr, "onig_search: error %d\n", r); @@ -3490,7 +3842,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, return r; match: - ONIG_STATE_DEC(reg); + ONIG_STATE_DEC_THREAD(reg); MATCH_ARG_FREE(msa); return s - str; } diff --git a/ext/mbstring/oniguruma/regext.c b/ext/mbstring/oniguruma/regext.c index 6839708be..f5ad1f35a 100755 --- a/ext/mbstring/oniguruma/regext.c +++ b/ext/mbstring/oniguruma/regext.c @@ -2,7 +2,7 @@ regext.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -194,6 +194,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, return r; } +#ifdef USE_RECOMPILE_API extern int onig_recompile_deluxe(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) @@ -211,3 +212,4 @@ onig_recompile_deluxe(regex_t* reg, const UChar* pattern, const UChar* pattern_e } return 0; } +#endif diff --git a/ext/mbstring/oniguruma/reggnu.c b/ext/mbstring/oniguruma/reggnu.c index 70e8582ff..248957c9d 100644 --- a/ext/mbstring/oniguruma/reggnu.c +++ b/ext/mbstring/oniguruma/reggnu.c @@ -2,7 +2,7 @@ reggnu.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -93,6 +93,7 @@ re_compile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) return r; } +#ifdef USE_RECOMPILE_API extern int re_recompile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) { @@ -113,6 +114,7 @@ re_recompile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) } return r; } +#endif extern void re_free_pattern(regex_t* reg) @@ -151,16 +153,16 @@ re_mbcinit(int mb_code) OnigEncoding enc; switch (mb_code) { - case MBCTYPE_ASCII: + case RE_MBCTYPE_ASCII: enc = ONIG_ENCODING_ASCII; break; - case MBCTYPE_EUC: + case RE_MBCTYPE_EUC: enc = ONIG_ENCODING_EUC_JP; break; - case MBCTYPE_SJIS: + case RE_MBCTYPE_SJIS: enc = ONIG_ENCODING_SJIS; break; - case MBCTYPE_UTF8: + case RE_MBCTYPE_UTF8: enc = ONIG_ENCODING_UTF8; break; default: diff --git a/ext/mbstring/oniguruma/regint.h b/ext/mbstring/oniguruma/regint.h index 2bd514b7c..c06bf5763 100644 --- a/ext/mbstring/oniguruma/regint.h +++ 
b/ext/mbstring/oniguruma/regint.h @@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -62,6 +62,12 @@ #define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR +/* #define USE_RECOMPILE_API */ +/* treat \r\n as line terminator. + !!! NO SUPPORT !!! + use this configuration on your own responsibility */ +/* #define USE_CRNL_AS_LINE_TERMINATOR */ + /* internal config */ #define USE_RECYCLE_NODE #define USE_OP_PUSH_OR_JUMP_EXACT @@ -75,10 +81,12 @@ /* interface to external system */ #ifdef NOT_RUBY /* given from Makefile */ #include "config.h" +#define USE_BACKREF_AT_LEVEL #define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS #define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ #define USE_POSIX_REGION_OPTION /* needed for POSIX API support */ +/* #define USE_COMBINATION_EXPLOSION_CHECK */ /* (X*)* */ /* #define USE_MULTI_THREAD_SYSTEM */ #define THREAD_ATOMIC_START /* depend on thread system */ #define THREAD_ATOMIC_END /* depend on thread system */ @@ -93,7 +101,9 @@ #include "version.h" #include "rubysig.h" /* for DEFER_INTS, ENABLE_INTS */ +#define USE_COMBINATION_EXPLOSION_CHECK /* (X*)* */ #define USE_MULTI_THREAD_SYSTEM + #define THREAD_ATOMIC_START DEFER_INTS #define THREAD_ATOMIC_END ENABLE_INTS #define THREAD_PASS rb_thread_schedule() @@ -105,11 +115,14 @@ }\ } while (0) -#define DEFAULT_WARN_FUNCTION rb_warn -#define DEFAULT_VERB_WARN_FUNCTION rb_warning +#define DEFAULT_WARN_FUNCTION onig_rb_warn +#define DEFAULT_VERB_WARN_FUNCTION onig_rb_warning #endif /* else NOT_RUBY */ 
+#define STATE_CHECK_STRING_THRESHOLD_LEN 7 +#define STATE_CHECK_BUFF_MAX_SIZE 0x08000000 + #define THREAD_PASS_LIMIT_COUNT 8 #define xmemset memset #define xmemcpy memcpy @@ -124,13 +137,26 @@ #endif -#ifdef USE_MULTI_THREAD_SYSTEM -#define ONIG_STATE_INC(reg) (reg)->state++ -#define ONIG_STATE_DEC(reg) (reg)->state-- +#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) +#define ONIG_STATE_INC(reg) (reg)->state++ +#define ONIG_STATE_DEC(reg) (reg)->state-- + +#define ONIG_STATE_INC_THREAD(reg) do {\ + THREAD_ATOMIC_START;\ + (reg)->state++;\ + THREAD_ATOMIC_END;\ +} while(0) +#define ONIG_STATE_DEC_THREAD(reg) do {\ + THREAD_ATOMIC_START;\ + (reg)->state--;\ + THREAD_ATOMIC_END;\ +} while(0) #else -#define ONIG_STATE_INC(reg) /* Nothing */ -#define ONIG_STATE_DEC(reg) /* Nothing */ -#endif /* USE_MULTI_THREAD_SYSTEM */ +#define ONIG_STATE_INC(reg) /* Nothing */ +#define ONIG_STATE_DEC(reg) /* Nothing */ +#define ONIG_STATE_INC_THREAD(reg) /* Nothing */ +#define ONIG_STATE_DEC_THREAD(reg) /* Nothing */ +#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ #define onig_st_is_member st_is_member @@ -518,7 +544,7 @@ typedef struct _BBuf { #define ANCHOR_LOOK_BEHIND_NOT (1<<13) #define ANCHOR_ANYCHAR_STAR (1<<14) /* ".*" optimize info */ -#define ANCHOR_ANYCHAR_STAR_PL (1<<15) /* ".*" optimize info (posix-line) */ +#define ANCHOR_ANYCHAR_STAR_ML (1<<15) /* ".*" optimize info (multi-line) */ /* operation code */ enum OpCode { @@ -574,11 +600,11 @@ enum OpCode { OP_BACKREF1, OP_BACKREF2, - OP_BACKREF3, OP_BACKREFN, OP_BACKREFN_IC, OP_BACKREF_MULTI, OP_BACKREF_MULTI_IC, + OP_BACKREF_AT_LEVEL, /* \k<xxx+n>, \k<xxx-n> */ OP_MEMORY_START, OP_MEMORY_START_PUSH, /* push back-tracker to stack */ @@ -618,34 +644,33 @@ enum OpCode { OP_FAIL_LOOK_BEHIND_NOT, /* (?<!...) 
end */ OP_CALL, /* \g<name> */ - OP_RETURN -}; + OP_RETURN, -/* arguments type */ -#define ARG_SPECIAL -1 -#define ARG_NON 0 -#define ARG_RELADDR 1 -#define ARG_ABSADDR 2 -#define ARG_LENGTH 3 -#define ARG_MEMNUM 4 -#define ARG_OPTION 5 + OP_STATE_CHECK_PUSH, /* combination explosion check and push */ + OP_STATE_CHECK_PUSH_OR_JUMP, /* check ok -> push, else jump */ + OP_STATE_CHECK, /* check only */ + OP_STATE_CHECK_ANYCHAR_STAR, + OP_STATE_CHECK_ANYCHAR_ML_STAR +}; typedef int RelAddrType; typedef int AbsAddrType; typedef int LengthType; typedef int RepeatNumType; typedef short int MemNumType; +typedef short int StateCheckNumType; typedef void* PointerType; -#define SIZE_OPCODE 1 -#define SIZE_RELADDR sizeof(RelAddrType) -#define SIZE_ABSADDR sizeof(AbsAddrType) -#define SIZE_LENGTH sizeof(LengthType) -#define SIZE_MEMNUM sizeof(MemNumType) -#define SIZE_REPEATNUM sizeof(RepeatNumType) -#define SIZE_OPTION sizeof(OnigOptionType) -#define SIZE_CODE_POINT sizeof(OnigCodePoint) -#define SIZE_POINTER sizeof(PointerType) +#define SIZE_OPCODE 1 +#define SIZE_RELADDR sizeof(RelAddrType) +#define SIZE_ABSADDR sizeof(AbsAddrType) +#define SIZE_LENGTH sizeof(LengthType) +#define SIZE_MEMNUM sizeof(MemNumType) +#define SIZE_STATE_CHECK_NUM sizeof(StateCheckNumType) +#define SIZE_REPEATNUM sizeof(RepeatNumType) +#define SIZE_OPTION sizeof(OnigOptionType) +#define SIZE_CODE_POINT sizeof(OnigCodePoint) +#define SIZE_POINTER sizeof(PointerType) #ifdef PLATFORM_UNALIGNED_WORD_ACCESS @@ -671,6 +696,7 @@ typedef void* PointerType; #define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) #define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) #define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) +#define GET_STATE_CHECK_NUM_INC(num,p) PLATFORM_GET_INC(num, p, StateCheckNumType) /* code point's address must be aligned address. 
*/ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) @@ -713,6 +739,12 @@ typedef void* PointerType; #define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) #define SIZE_OP_RETURN SIZE_OPCODE +#ifdef USE_COMBINATION_EXPLOSION_CHECK +#define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) +#define SIZE_OP_STATE_CHECK_PUSH (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) +#define SIZE_OP_STATE_CHECK_PUSH_OR_JUMP (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) +#define SIZE_OP_STATE_CHECK_ANYCHAR_STAR (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) +#endif #define MC_ESC(enc) (enc)->meta_char_table.esc #define MC_ANYCHAR(enc) (enc)->meta_char_table.anychar @@ -721,6 +753,11 @@ typedef void* PointerType; #define MC_ONE_OR_MORE_TIME(enc) (enc)->meta_char_table.one_or_more_time #define MC_ANYCHAR_ANYTIME(enc) (enc)->meta_char_table.anychar_anytime +#define IS_MC_ESC_CODE(code, enc, syn) \ + ((code) == MC_ESC(enc) && \ + !IS_SYNTAX_OP2((syn), ONIG_SYN_OP2_INEFFECTIVE_ESCAPE)) + + #define SYN_POSIX_COMMON_OP \ ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ ONIG_SYN_OP_DECIMAL_BACKREF | \ @@ -781,13 +818,14 @@ extern void onig_print_statistics P_((FILE* f)); #endif #endif -extern char* onig_error_code_to_format P_((int code)); -extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...)); +extern UChar* onig_error_code_to_format P_((int code)); +extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...)); extern int onig_bbuf_init P_((BBuf* buf, int size)); extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, OnigEncoding enc, OnigSyntaxType* syntax)); extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo)); extern void onig_chain_reduce P_((regex_t* reg)); extern void onig_chain_link_add P_((regex_t* to, 
regex_t* add)); extern void onig_transfer P_((regex_t* to, regex_t* from)); +extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); #endif /* REGINT_H */ diff --git a/ext/mbstring/oniguruma/regparse.c b/ext/mbstring/oniguruma/regparse.c index 58e122f48..407b73fc4 100644 --- a/ext/mbstring/oniguruma/regparse.c +++ b/ext/mbstring/oniguruma/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -60,6 +60,20 @@ OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; extern void onig_null_warn(const char* s) { } +#ifdef RUBY_PLATFORM +extern void +onig_rb_warn(const char* s) +{ + rb_warn(s); +} + +extern void +onig_rb_warning(const char* s) +{ + rb_warning(s); +} +#endif + #ifdef DEFAULT_WARN_FUNCTION static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; #else @@ -305,6 +319,88 @@ typedef struct { #include "st.h" +typedef struct { + unsigned char* s; + unsigned char* end; +} st_strend_key; + +static int strend_cmp(st_strend_key*, st_strend_key*); +static int strend_hash(st_strend_key*); + +static struct st_hash_type type_strend_hash = { + strend_cmp, + strend_hash, +}; + +static st_table* +onig_st_init_strend_table_with_size(int size) +{ + return onig_st_init_table_with_size(&type_strend_hash, size); +} + +static int +onig_st_lookup_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t *value) +{ + st_strend_key key; + + key.s = (unsigned char* )str_key; + key.end = (unsigned char* )end_key; + + return onig_st_lookup(table, (st_data_t )(&key), value); +} + +static int +onig_st_insert_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t value) +{ 
+ st_strend_key* key; + int result; + + key = (st_strend_key* )xmalloc(sizeof(st_strend_key)); + key->s = (unsigned char* )str_key; + key->end = (unsigned char* )end_key; + result = onig_st_insert(table, (st_data_t )key, value); + if (result) { + xfree(key); + } + return result; +} + +static int +strend_cmp(st_strend_key* x, st_strend_key* y) +{ + unsigned char *p, *q; + int c; + + if ((x->end - x->s) != (y->end - y->s)) + return 1; + + p = x->s; + q = y->s; + while (p < x->end) { + c = (int )*p - (int )*q; + if (c != 0) return c; + + p++; q++; + } + + return 0; +} + +static int +strend_hash(st_strend_key* x) +{ + int val; + unsigned char *p; + + val = 0; + p = x->s; + while (p < x->end) { + val = val * 997 + (int )*p++; + } + + return val + (val >> 5); +} + typedef st_table NameTable; typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ @@ -350,8 +446,10 @@ onig_print_names(FILE* fp, regex_t* reg) static int i_free_name_entry(UChar* key, NameEntry* e, void* arg) { - xfree(e->name); /* == key */ + xfree(e->name); if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); + xfree(key); + xfree(e); return ST_DELETE; } @@ -801,6 +899,23 @@ onig_number_of_names(regex_t* reg) } #endif /* else USE_NAMED_GROUP */ +extern int +onig_noname_group_capture_is_active(regex_t* reg) +{ + if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) + return 0; + +#ifdef USE_NAMED_GROUP + if (onig_number_of_names(reg) > 0 && + IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { + return 0; + } +#endif + + return 1; +} + #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 @@ -825,6 +940,13 @@ scan_env_clear(ScanEnv* env) for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) env->mem_nodes_static[i] = NULL_NODE; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + env->num_comb_exp_check = 0; + env->comb_exp_max_regnum = 0; + env->curr_max_regnum = 0; + env->has_recursion = 0; +#endif } static int @@ -970,13 
+1092,13 @@ onig_free_node_list() { FreeNode* n; - THREAD_ATOMIC_START; - while (FreeNodeList) { + /* THREAD_ATOMIC_START; */ + while (IS_NOT_NULL(FreeNodeList)) { n = FreeNodeList; FreeNodeList = FreeNodeList->next; xfree(n); } - THREAD_ATOMIC_END; + /* THREAD_ATOMIC_END; */ return 0; } #endif @@ -987,13 +1109,14 @@ node_new() Node* node; #ifdef USE_RECYCLE_NODE + THREAD_ATOMIC_START; if (IS_NOT_NULL(FreeNodeList)) { - THREAD_ATOMIC_START; node = (Node* )FreeNodeList; FreeNodeList = FreeNodeList->next; THREAD_ATOMIC_END; return node; } + THREAD_ATOMIC_END; #endif node = (Node* )xmalloc(sizeof(Node)); @@ -1020,9 +1143,9 @@ node_new_cclass() return node; } -extern Node* +static Node* node_new_cclass_by_codepoint_range(int not, - OnigCodePoint sbr[], OnigCodePoint mbr[]) + const OnigCodePoint sbr[], const OnigCodePoint mbr[]) { CClassNode* cc; int n, i, j; @@ -1128,7 +1251,11 @@ onig_node_new_anchor(int type) } static Node* -node_new_backref(int back_num, int* backrefs, int by_name, ScanEnv* env) +node_new_backref(int back_num, int* backrefs, int by_name, +#ifdef USE_BACKREF_AT_LEVEL + int exist_level, int nest_level, +#endif + ScanEnv* env) { int i; Node* node = node_new(); @@ -1141,6 +1268,13 @@ node_new_backref(int back_num, int* backrefs, int by_name, ScanEnv* env) if (by_name != 0) NBACKREF(node).state |= NST_NAME_REF; +#ifdef USE_BACKREF_AT_LEVEL + if (exist_level != 0) { + NBACKREF(node).state |= NST_NEST_LEVEL; + NBACKREF(node).nest_level = nest_level; + } +#endif + for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { @@ -1194,11 +1328,17 @@ node_new_qualifier(int lower, int upper, int by_number) NQUALIFIER(node).lower = lower; NQUALIFIER(node).upper = upper; NQUALIFIER(node).greedy = 1; - NQUALIFIER(node).by_number = by_number; NQUALIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY; NQUALIFIER(node).head_exact = NULL_NODE; NQUALIFIER(node).next_head_exact = NULL_NODE; 
NQUALIFIER(node).is_refered = 0; + if (by_number != 0) + NQUALIFIER(node).state |= NST_BY_NUMBER; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + NQUALIFIER(node).comb_exp_check_num = 0; +#endif + return node; } @@ -2013,7 +2153,7 @@ enum ReduceType { RQ_AQ, /* to '*?' */ RQ_QQ, /* to '??' */ RQ_P_QQ, /* to '+)??' */ - RQ_PQ_Q, /* to '+?)?' */ + RQ_PQ_Q /* to '+?)?' */ }; static enum ReduceType ReduceTypeTable[6][6] = { @@ -2125,6 +2265,10 @@ typedef struct { int ref1; int* refs; int by_name; +#ifdef USE_BACKREF_AT_LEVEL + int exist_level; + int level; /* \k<name+n> */ +#endif } backref; struct { UChar* name; @@ -2274,15 +2418,17 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) control: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH(c); - if (c == MC_ESC(enc)) { - v = fetch_escaped_value(&p, end, env); - if (v < 0) return v; - c = (OnigCodePoint )v; - } - else if (c == '?') + if (c == '?') { c = 0177; - else + } + else { + if (c == MC_ESC(enc)) { + v = fetch_escaped_value(&p, end, env); + if (v < 0) return v; + c = (OnigCodePoint )v; + } c &= 0x9f; + } break; } /* fall through */ @@ -2302,6 +2448,89 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); #ifdef USE_NAMED_GROUP +#ifdef USE_BACKREF_AT_LEVEL +/* + \k<name+n>, \k<name-n> +*/ +static int +fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end + , ScanEnv* env, int* level) +{ + int r, exist_level = 0; + OnigCodePoint c = 0; + OnigCodePoint first_code; + OnigEncoding enc = env->enc; + UChar *name_end; + UChar *p = *src; + PFETCH_READY; + + name_end = end; + r = 0; + if (PEND) { + return ONIGERR_EMPTY_GROUP_NAME; + } + else { + PFETCH(c); + first_code = c; + if (c == '>') + return ONIGERR_EMPTY_GROUP_NAME; + + if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } + } + + while (!PEND) { + name_end = p; + PFETCH(c); + if (c == '>' || c == ')' || c == '+' || c == '-') 
break; + + if (!ONIGENC_IS_CODE_WORD(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } + } + + if (c != '>') { + if (c == '+' || c == '-') { + int num; + int flag = (c == '-' ? -1 : 1); + + PFETCH(c); + if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; + PUNFETCH; + num = onig_scan_unsigned_number(&p, end, enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + *level = (num * flag); + exist_level = 1; + + PFETCH(c); + if (c == '>') + goto first_check; + } + + err: + r = ONIGERR_INVALID_GROUP_NAME; + name_end = end; + } + else { + first_check: + if (ONIGENC_IS_CODE_ASCII(first_code) && + ONIGENC_IS_CODE_UPPER(enc, first_code)) + r = ONIGERR_INVALID_GROUP_NAME; + } + + if (r == 0) { + *rname_end = name_end; + *src = p; + return (exist_level ? 1 : 0); + } + else { + onig_scan_env_set_error_string(env, r, *src, name_end); + return r; + } +} +#endif /* USE_BACKREF_AT_LEVEL */ + /* def: 0 -> define name (don't allow number name) 1 -> reference name (allow number name) @@ -2428,11 +2657,11 @@ CC_ESC_WARN(ScanEnv* env, UChar *c) if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { - char buf[WARN_BUFSIZE]; + UChar buf[WARN_BUFSIZE]; onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, - "character class has '%s' without escape", c); - (*onig_warn)(buf); + (UChar* )"character class has '%s' without escape", c); + (*onig_warn)((char* )buf); } } @@ -2442,11 +2671,11 @@ CCEND_ESC_WARN(ScanEnv* env, UChar* c) if (onig_warn == onig_null_warn) return ; if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { - char buf[WARN_BUFSIZE]; + UChar buf[WARN_BUFSIZE]; onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc, (env)->pattern, (env)->pattern_end, - "regular expression has '%s' without escape", c); - (*onig_warn)(buf); + (UChar* )"regular expression has '%s' without escape", c); + (*onig_warn)((char* )buf); } } @@ -2537,6 +2766,8 @@ 
fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR; tok->base = 0; tok->u.c = c; + tok->escaped = 0; + if (c == ']') { tok->type = TK_CC_CLOSE; } @@ -2708,7 +2939,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CC_CC_OPEN; } else { - CC_ESC_WARN(env, "["); + CC_ESC_WARN(env, (UChar* )"["); } } } @@ -2747,7 +2978,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->backp = p; PFETCH(c); - if (c == MC_ESC(enc)) { + if (IS_MC_ESC_CODE(c, enc, syn)) { if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; tok->backp = p; @@ -3012,6 +3243,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.num = 1; tok->u.backref.ref1 = num; tok->u.backref.by_name = 0; +#ifdef USE_BACKREF_AT_LEVEL + tok->u.backref.exist_level = 0; +#endif break; } @@ -3050,8 +3284,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) int* backs; prev = p; + +#ifdef USE_BACKREF_AT_LEVEL + name_end = NULL_UCHARP; /* no need. escape gcc warning. */ + r = fetch_name_with_level(&p, end, &name_end, env, &tok->u.backref.level); + if (r == 1) tok->u.backref.exist_level = 1; + else tok->u.backref.exist_level = 0; +#else r = fetch_name(&p, end, &name_end, env, 1); +#endif if (r < 0) return r; + num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); if (num <= 0) { onig_scan_env_set_error_string(env, @@ -3170,13 +3413,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) switch (c) { case '.': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; +#ifdef USE_VARIABLE_META_CHARS any_char: +#endif tok->type = TK_ANYCHAR; break; case '*': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; +#ifdef USE_VARIABLE_META_CHARS anytime: +#endif tok->type = TK_OP_REPEAT; tok->u.repeat.lower = 0; tok->u.repeat.upper = REPEAT_INFINITE; @@ -3185,7 +3432,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '+': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; +#ifdef USE_VARIABLE_META_CHARS one_or_more_time: +#endif tok->type = TK_OP_REPEAT; tok->u.repeat.lower = 1; tok->u.repeat.upper = REPEAT_INFINITE; @@ -3194,7 +3443,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '?': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; +#ifdef USE_VARIABLE_META_CHARS zero_or_one_time: +#endif tok->type = TK_OP_REPEAT; tok->u.repeat.lower = 0; tok->u.repeat.upper = 1; @@ -3271,7 +3522,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case ']': if (*src > env->pattern) /* /].../ is allowed. */ - CCEND_ESC_WARN(env, "]"); + CCEND_ESC_WARN(env, (UChar* )"]"); break; case '#': @@ -3297,14 +3548,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } +#ifdef USE_VARIABLE_META_CHARS out: +#endif *src = p; return tok->type; } static int add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc, - OnigCodePoint sbr[], OnigCodePoint mbr[]) + const OnigCodePoint sbr[], const OnigCodePoint mbr[]) { int i, r; OnigCodePoint j; @@ -3368,7 +3621,7 @@ static int add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) { int c, r; - OnigCodePoint *sbr, *mbr; + const OnigCodePoint *sbr, *mbr; OnigEncoding enc = env->enc; r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr); @@ -3506,19 +3759,19 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) #define POSIX_BRACKET_NAME_MAX_LEN 6 static PosixBracketEntryType PBS[] = { - { "alnum", ONIGENC_CTYPE_ALNUM, 5 }, - { "alpha", ONIGENC_CTYPE_ALPHA, 5 }, - { "blank", ONIGENC_CTYPE_BLANK, 5 }, - { "cntrl", ONIGENC_CTYPE_CNTRL, 
5 }, - { "digit", ONIGENC_CTYPE_DIGIT, 5 }, - { "graph", ONIGENC_CTYPE_GRAPH, 5 }, - { "lower", ONIGENC_CTYPE_LOWER, 5 }, - { "print", ONIGENC_CTYPE_PRINT, 5 }, - { "punct", ONIGENC_CTYPE_PUNCT, 5 }, - { "space", ONIGENC_CTYPE_SPACE, 5 }, - { "upper", ONIGENC_CTYPE_UPPER, 5 }, - { "xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, - { "ascii", ONIGENC_CTYPE_ASCII, 5 }, /* I don't know origin. Perl? */ + { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 }, + { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 }, + { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 }, + { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 }, + { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 }, + { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 }, + { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 }, + { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 }, + { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 }, + { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 }, + { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 }, + { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, + { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 }, { (UChar* )NULL, -1, 0 } }; @@ -3542,7 +3795,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { p = (UChar* )onigenc_step(enc, p, end, pb->len); - if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0) + if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, env); @@ -3577,19 +3830,19 @@ static int property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc) { static PosixBracketEntryType PBS[] = { - { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, - { "Alpha", ONIGENC_CTYPE_ALPHA, 5 }, - { "Blank", ONIGENC_CTYPE_BLANK, 5 }, - { "Cntrl", ONIGENC_CTYPE_CNTRL, 5 }, - { "Digit", ONIGENC_CTYPE_DIGIT, 5 }, - { "Graph", ONIGENC_CTYPE_GRAPH, 5 }, - { "Lower", ONIGENC_CTYPE_LOWER, 5 }, - { "Print", ONIGENC_CTYPE_PRINT, 5 }, - { "Punct", 
ONIGENC_CTYPE_PUNCT, 5 }, - { "Space", ONIGENC_CTYPE_SPACE, 5 }, - { "Upper", ONIGENC_CTYPE_UPPER, 5 }, - { "XDigit", ONIGENC_CTYPE_XDIGIT, 6 }, - { "ASCII", ONIGENC_CTYPE_ASCII, 5 }, + { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 }, + { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 }, + { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 }, + { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 }, + { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 }, + { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 }, + { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 }, + { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 }, + { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 }, + { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 }, + { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 }, + { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 }, + { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 }, { (UChar* )NULL, -1, 0 } }; @@ -3839,7 +4092,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, *src, env->pattern_end, 1, env->enc)) return ONIGERR_EMPTY_CHAR_CLASS; - CC_ESC_WARN(env, "]"); + CC_ESC_WARN(env, (UChar* )"]"); r = tok->type = TK_CHAR; /* allow []...] */ } @@ -3942,7 +4195,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, r = parse_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ - CC_ESC_WARN(env, "["); + CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; v = (OnigCodePoint )tok->u.c; in_israw = 0; @@ -3988,7 +4241,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, goto val_entry; } else if (r == TK_CC_AND) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; } state = CCS_RANGE; @@ -4003,12 +4256,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, fetched = 1; /* [--x] or [a&&-x] is warned. 
*/ if (r == TK_CC_RANGE || and_start != 0) - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto val_entry; } else if (state == CCS_RANGE) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto sb_char; /* [!--x] is allowed */ } else { /* CCS_COMPLETE */ @@ -4017,12 +4270,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, fetched = 1; if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { - CC_ESC_WARN(env, "-"); + CC_ESC_WARN(env, (UChar* )"-"); goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ } r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; @@ -4326,10 +4579,9 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, } } else { -#ifdef USE_NAMED_GROUP if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) goto group; -#endif + *np = node_new_effect_memory(env->option, 0); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); num = scan_env_add_mem_entry(env); @@ -4358,11 +4610,11 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return 0; } -static char* PopularQStr[] = { +static const char* PopularQStr[] = { "?", "*", "+", "??", "*?", "+?" }; -static char* ReduceQStr[] = { +static const char* ReduceQStr[] = { "", "", "*", "*?", "??", "+ and ??", "+? and ?" }; @@ -4394,15 +4646,13 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) { /* check redundant double repeat. */ /* verbose warn (?:.?)? etc... but not warn (.?)? etc... 
*/ QualifierNode* qnt = &(NQUALIFIER(target)); + int nestq_num = popular_qualifier_num(qn); + int targetq_num = popular_qualifier_num(qnt); #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR - if (qn->by_number == 0 && qnt->by_number == 0 && + if (!IS_QUALIFIER_BY_NUMBER(qn) && !IS_QUALIFIER_BY_NUMBER(qnt) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { - int nestq_num, targetq_num; - char buf[WARN_BUFSIZE]; - - nestq_num = popular_qualifier_num(qn); - targetq_num = popular_qualifier_num(qnt); + UChar buf[WARN_BUFSIZE]; switch(ReduceTypeTable[targetq_num][nestq_num]) { case RQ_ASIS: @@ -4411,9 +4661,9 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) case RQ_DEL: if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, - env->pattern, env->pattern_end, - "redundant nested repeat operator"); - (*onig_verb_warn)(buf); + env->pattern, env->pattern_end, + (UChar* )"redundant nested repeat operator"); + (*onig_verb_warn)((char* )buf); } goto warn_exit; break; @@ -4422,10 +4672,10 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) if (onig_verb_warn != onig_null_warn) { onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, env->pattern, env->pattern_end, - "nested repeat operator %s and %s was replaced with '%s'", + (UChar* )"nested repeat operator %s and %s was replaced with '%s'", PopularQStr[targetq_num], PopularQStr[nestq_num], ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); - (*onig_verb_warn)(buf); + (*onig_verb_warn)((char* )buf); } goto warn_exit; break; @@ -4434,9 +4684,17 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) warn_exit: #endif - if (popular_qualifier_num(qnt) >= 0 && popular_qualifier_num(qn) >= 0) { - onig_reduce_nested_qualifier(qnode, target); - goto q_exit; + if (targetq_num >= 0) { + if (nestq_num >= 0) { + onig_reduce_nested_qualifier(qnode, target); + goto q_exit; + } + else if (targetq_num == 1 || targetq_num == 2) { 
/* * or + */ + /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ + if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { + qn->upper = (qn->lower == 0 ? 1 : qn->lower); + } + } } } break; @@ -4457,8 +4715,8 @@ make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc, int r, i, j, k, clen, len, ncode, n; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; Node **ptail, *snode = NULL_NODE; - OnigCompAmbigCodes* ccs; - OnigCompAmbigCodeItem* ci; + const OnigCompAmbigCodes* ccs; + const OnigCompAmbigCodeItem* ci; OnigAmbigType amb; n = 0; @@ -4546,27 +4804,9 @@ static int type_cclass_hash(type_cclass_key* key) return val + (val >> 5); } -static int type_cclass_key_free(st_data_t x) -{ - xfree((void* )x); - return 0; -} - -static st_data_t type_cclass_key_clone(st_data_t x) -{ - type_cclass_key* new_key; - type_cclass_key* key = (type_cclass_key* )x; - - new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); - *new_key = *key; - return (st_data_t )new_key; -} - static struct st_hash_type type_type_cclass_hash = { type_cclass_cmp, type_cclass_hash, - type_cclass_key_free, - type_cclass_key_clone }; static st_table* OnigTypeCClassTable; @@ -4580,6 +4820,8 @@ i_free_shared_class(type_cclass_key* key, Node* node, void* arg) if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); xfree(node); } + + if (IS_NOT_NULL(key)) xfree(key); return ST_DELETE; } @@ -4588,6 +4830,8 @@ onig_free_shared_cclass_table() { if (IS_NOT_NULL(OnigTypeCClassTable)) { onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); + xfree(OnigTypeCClassTable); + OnigTypeCClassTable = NULL; } return 0; @@ -4741,7 +4985,7 @@ parse_exp(Node** np, OnigToken* tok, int term, int ctype, not; #ifdef USE_SHARED_CCLASS_TABLE - OnigCodePoint *sbr, *mbr; + const OnigCodePoint *sbr, *mbr; ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr); @@ -4823,7 +5067,7 @@ parse_exp(Node** np, OnigToken* tok, int term, if 
(IS_IGNORECASE(env->option)) { int i, n, in_cc; - OnigPairAmbigCodes* ccs; + const OnigPairAmbigCodes* ccs; BitSetRef bs = cc->bs; OnigAmbigType amb; @@ -4892,8 +5136,13 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_BACKREF: len = tok->u.backref.num; *np = node_new_backref(len, - (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), - tok->u.backref.by_name, env); + (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), + tok->u.backref.by_name, +#ifdef USE_BACKREF_AT_LEVEL + tok->u.backref.exist_level, + tok->u.backref.level, +#endif + env); CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); break; diff --git a/ext/mbstring/oniguruma/regparse.h b/ext/mbstring/oniguruma/regparse.h index 1a4ac7dea..ca62dddf7 100644 --- a/ext/mbstring/oniguruma/regparse.h +++ b/ext/mbstring/oniguruma/regparse.h @@ -4,7 +4,7 @@ regparse.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -67,7 +67,7 @@ #define CTYPE_XDIGIT (1<<6) #define CTYPE_NOT_XDIGIT (1<<7) -#define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL) +#define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML) #define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF) #define EFFECT_MEMORY (1<<0) @@ -76,7 +76,7 @@ #define NODE_STR_MARGIN 16 #define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ -#define NODE_BACKREFS_SIZE 7 +#define NODE_BACKREFS_SIZE 6 #define NSTR_RAW (1<<0) /* by backslashed number */ #define NSTR_AMBIG (1<<1) @@ -124,11 +124,13 @@ typedef struct { int lower; int upper; int greedy; - int by_number; /* {n,m} */ int target_empty_info; struct _Node* head_exact; struct _Node* next_head_exact; int is_refered; /* include called node. don't eliminate even if {0} */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int comb_exp_check_num; /* 1,2,3...: check, 0: no check */ +#endif } QualifierNode; /* status bits */ @@ -145,6 +147,8 @@ typedef struct { #define NST_NAMED_GROUP (1<<10) #define NST_NAME_REF (1<<11) #define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. 
*/ +#define NST_NEST_LEVEL (1<<13) +#define NST_BY_NUMBER (1<<14) /* {n,m} */ #define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) #define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) @@ -165,7 +169,9 @@ typedef struct { #define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0) #define IS_CALL_NAME_REF(cn) (((cn)->state & NST_NAME_REF) != 0) #define IS_BACKREF_NAME_REF(bn) (((bn)->state & NST_NAME_REF) != 0) +#define IS_BACKREF_NEST_LEVEL(bn) (((bn)->state & NST_NEST_LEVEL) != 0) #define IS_QUALIFIER_IN_REPEAT(qn) (((qn)->state & NST_IN_REPEAT) != 0) +#define IS_QUALIFIER_BY_NUMBER(qn) (((qn)->state & NST_BY_NUMBER) != 0) typedef struct { int state; @@ -212,6 +218,7 @@ typedef struct { int back_num; int back_static[NODE_BACKREFS_SIZE]; int* back_dynamic; + int nest_level; } BackrefNode; typedef struct { @@ -274,6 +281,12 @@ typedef struct { int mem_alloc; Node* mem_nodes_static[SCANENV_MEMNODES_SIZE]; Node** mem_nodes_dynamic; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int num_comb_exp_check; + int comb_exp_max_regnum; + int curr_max_regnum; + int has_recursion; +#endif } ScanEnv; @@ -290,7 +303,6 @@ typedef struct { extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); #endif -extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); diff --git a/ext/mbstring/oniguruma/regposix.c b/ext/mbstring/oniguruma/regposix.c index 34cbeb9a4..a3bacf722 100644 --- a/ext/mbstring/oniguruma/regposix.c +++ b/ext/mbstring/oniguruma/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT 
ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -55,7 +55,7 @@ typedef struct { static int onig2posix_error_code(int code) { - static O2PERR o2p[] = { + static const O2PERR o2p[] = { { ONIG_MISMATCH, REG_NOMATCH }, { ONIG_NO_SUPPORT_CONFIG, REG_EONIG_INTERNAL }, { ONIGERR_MEMORY, REG_ESPACE }, @@ -192,7 +192,7 @@ regexec(regex_t* reg, const char* str, size_t nmatch, ENC_STRING_LEN(ONIG_C(reg)->enc, str, len); end = (UChar* )(str + len); r = onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end, - (OnigRegion* )pmatch, options); + (OnigRegion* )pm, options); if (r >= 0) { r = 0; /* Match */ @@ -212,6 +212,11 @@ regexec(regex_t* reg, const char* str, size_t nmatch, if (pm != pmatch && pm != NULL) xfree(pm); +#if 0 + if (reg->re_nsub > nmatch - 1) + reg->re_nsub = (nmatch <= 1 ? 0 : nmatch - 1); +#endif + return r; } diff --git a/ext/mbstring/oniguruma/regsyntax.c b/ext/mbstring/oniguruma/regsyntax.c index a0f36b8c3..9114e39e6 100644 --- a/ext/mbstring/oniguruma/regsyntax.c +++ b/ext/mbstring/oniguruma/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2004 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,6 +29,13 @@ #include "regint.h" +OnigSyntaxType OnigSyntaxASIS = { + 0 + , ONIG_SYN_OP2_INEFFECTIVE_ESCAPE + , 0 + , ONIG_OPTION_NONE +}; + OnigSyntaxType OnigSyntaxPosixBasic = { ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_BRACE_INTERVAL ) @@ -63,7 +70,7 @@ OnigSyntaxType OnigSyntaxEmacs = { OnigSyntaxType OnigSyntaxGrep = { ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_POSIX_BRACKET | - ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | + ONIG_SYN_OP_ESC_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_VBAR_ALT | ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_ESC_PLUS_ONE_INF | ONIG_SYN_OP_ESC_QMARK_ZERO_ONE | ONIG_SYN_OP_LINE_ANCHOR | @@ -110,6 +117,28 @@ OnigSyntaxType OnigSyntaxPerl = { , ONIG_OPTION_SINGLELINE }; +/* Perl + named group */ +OnigSyntaxType OnigSyntaxPerl_NG = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | + ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | + ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | + ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | + ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS | + ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | + ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | + ONIG_SYN_OP2_ESC_G_SUBEXP_CALL ) + , ( SYN_GNU_REGEX_BV | + ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | + ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME ) + , ONIG_OPTION_SINGLELINE +}; + + extern int onig_set_default_syntax(OnigSyntaxType* syntax) diff --git a/ext/mbstring/oniguruma/regversion.c b/ext/mbstring/oniguruma/regversion.c index 5f15c10e6..5fad0cc18 100644 --- a/ext/mbstring/oniguruma/regversion.c +++ b/ext/mbstring/oniguruma/regversion.c @@ -2,7 +2,7 @@ regversion.c - Oniguruma (regular expression 
library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2006 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,7 +47,7 @@ onig_copyright(void) { static char s[58]; - sprintf(s, "Oniguruma %d.%d.%d : Copyright (C) 2002-2005 K.Kosako", + sprintf(s, "Oniguruma %d.%d.%d : Copyright (C) 2002-2006 K.Kosako", ONIGURUMA_VERSION_MAJOR, ONIGURUMA_VERSION_MINOR, ONIGURUMA_VERSION_TEENY); diff --git a/ext/mbstring/oniguruma/st.c b/ext/mbstring/oniguruma/st.c index 65c2cc58b..2324da263 100644 --- a/ext/mbstring/oniguruma/st.c +++ b/ext/mbstring/oniguruma/st.c @@ -56,8 +56,6 @@ static int numhash(long); static struct st_hash_type type_numhash = { numcmp, numhash, - st_nothing_key_free, - st_nothing_key_clone }; /* extern int strcmp(const char *, const char *); */ @@ -65,20 +63,6 @@ static int strhash(const char *); static struct st_hash_type type_strhash = { strcmp, strhash, - st_nothing_key_free, - st_nothing_key_clone -}; - -static int strend_cmp(st_strend_key*, st_strend_key*); -static int strend_hash(st_strend_key*); -static int strend_key_free(st_data_t key); -static st_data_t strend_key_clone(st_data_t x); - -static struct st_hash_type type_strend_hash = { - strend_cmp, - strend_hash, - strend_key_free, - strend_key_clone }; static void rehash(st_table *); @@ -100,7 +84,7 @@ static void rehash(st_table *); /* Table of prime numbers 2^n+a, 2<=n<=30. 
*/ -static long primes[] = { +static const long primes[] = { 8 + 3, 16 + 3, 32 + 5, @@ -228,13 +212,6 @@ st_init_strtable_with_size(size) return st_init_table_with_size(&type_strhash, size); } -st_table* -st_init_strend_table_with_size(size) - int size; -{ - return st_init_table_with_size(&type_strend_hash, size); -} - void st_free_table(table) st_table *table; @@ -246,7 +223,6 @@ st_free_table(table) ptr = table->bins[i]; while (ptr != 0) { next = ptr->next; - table->type->key_free(ptr->key); free(ptr); ptr = next; } @@ -297,21 +273,6 @@ st_lookup(table, key, value) } } -int -st_lookup_strend(table, str_key, end_key, value) - st_table *table; - const unsigned char* str_key; - const unsigned char* end_key; - st_data_t *value; -{ - st_strend_key key; - - key.s = (unsigned char* )str_key; - key.end = (unsigned char* )end_key; - - return st_lookup(table, (st_data_t )(&key), value); -} - #define ADD_DIRECT(table, key, value, hash_val, bin_pos)\ do {\ st_table_entry *entry;\ @@ -352,22 +313,6 @@ st_insert(table, key, value) } } -int -st_insert_strend(table, str_key, end_key, value) - st_table *table; - const unsigned char* str_key; - const unsigned char* end_key; - st_data_t value; -{ - st_strend_key* key; - - key = alloc(st_strend_key); - key->s = (unsigned char* )str_key; - key->end = (unsigned char* )end_key; - - return st_insert(table, (st_data_t )key, value); -} - void st_add_direct(table, key, value) st_table *table; @@ -381,21 +326,6 @@ st_add_direct(table, key, value) ADD_DIRECT(table, key, value, hash_val, bin_pos); } -void -st_add_direct_strend(table, str_key, end_key, value) - st_table *table; - const unsigned char* str_key; - const unsigned char* end_key; - st_data_t value; -{ - st_strend_key* key; - - key = alloc(st_strend_key); - key->s = (unsigned char* )str_key; - key->end = (unsigned char* )end_key; - st_add_direct(table, (st_data_t )key, value); -} - static void rehash(table) register st_table *table; @@ -455,7 +385,6 @@ st_copy(old_table) return 0; } 
*entry = *ptr; - entry->key = old_table->type->key_clone(ptr->key); entry->next = new_table->bins[i]; new_table->bins[i] = entry; ptr = ptr->next; @@ -556,7 +485,7 @@ st_cleanup_safe(table, never) table->num_entries = num_entries; } -void +int st_foreach(table, func, arg) st_table *table; int (*func)(); @@ -569,7 +498,7 @@ st_foreach(table, func, arg) for(i = 0; i < table->num_bins; i++) { last = 0; for(ptr = table->bins[i]; ptr != 0;) { - retval = (*func)(ptr->key, ptr->record, arg, 0); + retval = (*func)(ptr->key, ptr->record, arg); switch (retval) { case ST_CHECK: /* check if hash is modified during iteration */ tmp = 0; @@ -580,8 +509,7 @@ st_foreach(table, func, arg) } if (!tmp) { /* call func with error notice */ - retval = (*func)(0, 0, arg, 1); - return; + return 1; } /* fall through */ case ST_CONTINUE: @@ -589,7 +517,7 @@ st_foreach(table, func, arg) ptr = ptr->next; break; case ST_STOP: - return; + return 0; case ST_DELETE: tmp = ptr; if (last == 0) { @@ -599,12 +527,12 @@ st_foreach(table, func, arg) last->next = ptr->next; } ptr = ptr->next; - table->type->key_free(tmp->key); free(tmp); table->num_entries--; } } } + return 0; } static int @@ -659,59 +587,3 @@ numhash(n) { return n; } - -extern int -st_nothing_key_free(st_data_t key) { return 0; } - -extern st_data_t -st_nothing_key_clone(st_data_t x) { return x; } - -static int strend_cmp(st_strend_key* x, st_strend_key* y) -{ - unsigned char *p, *q; - int c; - - if ((x->end - x->s) != (y->end - y->s)) - return 1; - - p = x->s; - q = y->s; - while (p < x->end) { - c = (int )*p - (int )*q; - if (c != 0) return c; - - p++; q++; - } - - return 0; -} - -static int strend_hash(st_strend_key* x) -{ - int val; - unsigned char *p; - - val = 0; - p = x->s; - while (p < x->end) { - val = val * 997 + (int )*p++; - } - - return val + (val >> 5); -} - -static int strend_key_free(st_data_t x) -{ - xfree((void* )x); - return 0; -} - -static st_data_t strend_key_clone(st_data_t x) -{ - st_strend_key* new_key; - 
st_strend_key* key = (st_strend_key* )x; - - new_key = alloc(st_strend_key); - *new_key = *key; - return (st_data_t )new_key; -} diff --git a/ext/mbstring/oniguruma/st.h b/ext/mbstring/oniguruma/st.h index c5cc4e625..da65e7fef 100644 --- a/ext/mbstring/oniguruma/st.h +++ b/ext/mbstring/oniguruma/st.h @@ -14,8 +14,6 @@ typedef struct st_table st_table; struct st_hash_type { int (*compare)(); int (*hash)(); - int (*key_free)(); - st_data_t (*key_clone)(); }; struct st_table { @@ -25,11 +23,6 @@ struct st_table { struct st_table_entry **bins; }; -typedef struct { - unsigned char* s; - unsigned char* end; -} st_strend_key; - #define st_is_member(table,key) st_lookup(table,key,(st_data_t *)0) enum st_retval {ST_CONTINUE, ST_STOP, ST_DELETE, ST_CHECK}; @@ -51,23 +44,16 @@ st_table *st_init_numtable _((void)); st_table *st_init_numtable_with_size _((int)); st_table *st_init_strtable _((void)); st_table *st_init_strtable_with_size _((int)); -st_table *st_init_strend_table_with_size _((int)); int st_delete _((st_table *, st_data_t *, st_data_t *)); int st_delete_safe _((st_table *, st_data_t *, st_data_t *, st_data_t)); int st_insert _((st_table *, st_data_t, st_data_t)); -int st_insert_strend _((st_table *, const unsigned char*, const unsigned char*, st_data_t)); int st_lookup _((st_table *, st_data_t, st_data_t *)); -int st_lookup_strend _((st_table *, const unsigned char*, const unsigned char*, st_data_t*)); -void st_foreach _((st_table *, int (*)(ANYARGS), st_data_t)); +int st_foreach _((st_table *, int (*)(ANYARGS), st_data_t)); void st_add_direct _((st_table *, st_data_t, st_data_t)); -void st_add_direct_strend _((st_table *, const unsigned char*, const unsigned char*, st_data_t)); void st_free_table _((st_table *)); void st_cleanup_safe _((st_table *, st_data_t)); st_table *st_copy _((st_table *)); -extern st_data_t st_nothing_key_clone _((st_data_t key)); -extern int st_nothing_key_free _((st_data_t key)); - #define ST_NUMCMP ((int (*)()) 0) #define ST_NUMHASH ((int 
(*)()) -2) diff --git a/ext/mbstring/oniguruma/win32/config.h b/ext/mbstring/oniguruma/win32/config.h index 7ee9e2506..bdbdaf25c 100644 --- a/ext/mbstring/oniguruma/win32/config.h +++ b/ext/mbstring/oniguruma/win32/config.h @@ -1,84 +1,84 @@ -#define STDC_HEADERS 1
-#define HAVE_SYS_TYPES_H 1
-#define HAVE_SYS_STAT_H 1
-#define HAVE_STDLIB_H 1
-#define HAVE_STRING_H 1
-#define HAVE_MEMORY_H 1
-#define HAVE_FLOAT_H 1
-#define HAVE_OFF_T 1
-#define SIZEOF_INT 4
-#define SIZEOF_SHORT 2
-#define SIZEOF_LONG 4
-#define SIZEOF_LONG_LONG 0
-#define SIZEOF___INT64 8
-#define SIZEOF_OFF_T 4
-#define SIZEOF_VOIDP 4
-#define SIZEOF_FLOAT 4
-#define SIZEOF_DOUBLE 8
-#define HAVE_PROTOTYPES 1
-#define TOKEN_PASTE(x,y) x##y
-#define HAVE_STDARG_PROTOTYPES 1
-#ifndef NORETURN
-#if _MSC_VER > 1100
-#define NORETURN(x) __declspec(noreturn) x
-#else
-#define NORETURN(x) x
-#endif
-#endif
-#define HAVE_DECL_SYS_NERR 1
-#define STDC_HEADERS 1
-#define HAVE_STDLIB_H 1
-#define HAVE_STRING_H 1
-#define HAVE_LIMITS_H 1
-#define HAVE_FCNTL_H 1
-#define HAVE_SYS_UTIME_H 1
-#define HAVE_MEMORY_H 1
-#define uid_t int
-#define gid_t int
-#define HAVE_STRUCT_STAT_ST_RDEV 1
-#define HAVE_ST_RDEV 1
-#define GETGROUPS_T int
-#define RETSIGTYPE void
-#define HAVE_ALLOCA 1
-#define HAVE_DUP2 1
-#define HAVE_MEMCMP 1
-#define HAVE_MEMMOVE 1
-#define HAVE_MKDIR 1
-#define HAVE_STRCASECMP 1
-#define HAVE_STRNCASECMP 1
-#define HAVE_STRERROR 1
-#define HAVE_STRFTIME 1
-#define HAVE_STRCHR 1
-#define HAVE_STRSTR 1
-#define HAVE_STRTOD 1
-#define HAVE_STRTOL 1
-#define HAVE_STRTOUL 1
-#define HAVE_FLOCK 1
-#define HAVE_VSNPRINTF 1
-#define HAVE_FINITE 1
-#define HAVE_FMOD 1
-#define HAVE_FREXP 1
-#define HAVE_HYPOT 1
-#define HAVE_MODF 1
-#define HAVE_WAITPID 1
-#define HAVE_CHSIZE 1
-#define HAVE_TIMES 1
-#define HAVE__SETJMP 1
-#define HAVE_TELLDIR 1
-#define HAVE_SEEKDIR 1
-#define HAVE_MKTIME 1
-#define HAVE_COSH 1
-#define HAVE_SINH 1
-#define HAVE_TANH 1
-#define HAVE_EXECVE 1
-#define HAVE_TZNAME 1
-#define HAVE_DAYLIGHT 1
-#define SETPGRP_VOID 1
-#define inline __inline
-#define NEED_IO_SEEK_BETWEEN_RW 1
-#define RSHIFT(x,y) ((x)>>(int)y)
-#define FILE_COUNT _cnt
-#define FILE_READPTR _ptr
-#define DEFAULT_KCODE KCODE_NONE
-#define DLEXT ".so"
-#define DLEXT2 ".dll"
+#define STDC_HEADERS 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_FLOAT_H 1 +#define HAVE_OFF_T 1 +#define SIZEOF_INT 4 +#define SIZEOF_SHORT 2 +#define SIZEOF_LONG 4 +#define SIZEOF_LONG_LONG 0 +#define SIZEOF___INT64 8 +#define SIZEOF_OFF_T 4 +#define SIZEOF_VOIDP 4 +#define SIZEOF_FLOAT 4 +#define SIZEOF_DOUBLE 8 +#define HAVE_PROTOTYPES 1 +#define TOKEN_PASTE(x,y) x##y +#define HAVE_STDARG_PROTOTYPES 1 +#ifndef NORETURN +#if _MSC_VER > 1100 +#define NORETURN(x) __declspec(noreturn) x +#else +#define NORETURN(x) x +#endif +#endif +#define HAVE_DECL_SYS_NERR 1 +#define STDC_HEADERS 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_LIMITS_H 1 +#define HAVE_FCNTL_H 1 +#define HAVE_SYS_UTIME_H 1 +#define HAVE_MEMORY_H 1 +#define uid_t int +#define gid_t int +#define HAVE_STRUCT_STAT_ST_RDEV 1 +#define HAVE_ST_RDEV 1 +#define GETGROUPS_T int +#define RETSIGTYPE void +#define HAVE_ALLOCA 1 +#define HAVE_DUP2 1 +#define HAVE_MEMCMP 1 +#define HAVE_MEMMOVE 1 +#define HAVE_MKDIR 1 +#define HAVE_STRCASECMP 1 +#define HAVE_STRNCASECMP 1 +#define HAVE_STRERROR 1 +#define HAVE_STRFTIME 1 +#define HAVE_STRCHR 1 +#define HAVE_STRSTR 1 +#define HAVE_STRTOD 1 +#define HAVE_STRTOL 1 +#define HAVE_STRTOUL 1 +#define HAVE_FLOCK 1 +#define HAVE_VSNPRINTF 1 +#define HAVE_FINITE 1 +#define HAVE_FMOD 1 +#define HAVE_FREXP 1 +#define HAVE_HYPOT 1 +#define HAVE_MODF 1 +#define HAVE_WAITPID 1 +#define HAVE_CHSIZE 1 +#define HAVE_TIMES 1 +#define HAVE__SETJMP 1 +#define HAVE_TELLDIR 1 +#define HAVE_SEEKDIR 1 +#define HAVE_MKTIME 1 +#define HAVE_COSH 1 +#define HAVE_SINH 1 +#define HAVE_TANH 1 +#define HAVE_EXECVE 1 +#define HAVE_TZNAME 1 +#define HAVE_DAYLIGHT 1 +#define SETPGRP_VOID 1 +#define inline __inline +#define NEED_IO_SEEK_BETWEEN_RW 1 +#define RSHIFT(x,y) ((x)>>(int)y) +#define FILE_COUNT _cnt +#define FILE_READPTR _ptr +#define DEFAULT_KCODE KCODE_NONE +#define 
DLEXT ".so" +#define DLEXT2 ".dll" |
