diff options
author | scole <scole@pkgsrc.org> | 2020-08-17 17:43:15 +0000 |
---|---|---|
committer | scole <scole@pkgsrc.org> | 2020-08-17 17:43:15 +0000 |
commit | a6ead48ef445697d19daa8aa5904e2186a58ff5b (patch) | |
tree | b78e9b49df400db6ab70f1741374c03603b77cb8 | |
parent | 2bc86d1224d1312b1e30952b6ab3609c9d432b5c (diff) | |
download | pkgsrc-a6ead48ef445697d19daa8aa5904e2186a58ff5b.tar.gz |
Update to 0.4
- always use pkgsrc path for swath for st-swath script
- make splitting of numbers a little more consistent for st-emacs & st-icu
- add split-thai, split-thai-line, wrapper functions to emacs lisp code
-rw-r--r-- | textproc/split-thai/Makefile | 8 | ||||
-rw-r--r-- | textproc/split-thai/files/README.txt | 2 | ||||
-rwxr-xr-x | textproc/split-thai/files/st-emacs | 4 | ||||
-rw-r--r-- | textproc/split-thai/files/st-icu.cc | 44 | ||||
-rwxr-xr-x | textproc/split-thai/files/st-swath | 7 | ||||
-rw-r--r-- | textproc/split-thai/files/thai-utility.el | 31 |
6 files changed, 78 insertions, 18 deletions
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile index 738e2c98170..88b08076289 100644 --- a/textproc/split-thai/Makefile +++ b/textproc/split-thai/Makefile @@ -1,6 +1,6 @@ -# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $ +# $NetBSD: Makefile,v 1.4 2020/08/17 17:43:15 scole Exp $ -PKGNAME= split-thai-0.3 +PKGNAME= split-thai-0.4 CATEGORIES= textproc MAINTAINER= pkgsrc-users@NetBSD.org COMMENT= Utilities to split UTF-8 Thai text into words @@ -24,7 +24,8 @@ REPLACE_SH= st-swath UTF8_ENV= env LC_ALL=C.UTF-8 ST_SHARE_DIR= share/split-thai -INSTALLATION_DIRS= bin ${ST_SHARE_DIR} +ST_SHARE_BIN= bin +INSTALLATION_DIRS= ${ST_SHARE_BIN} ${ST_SHARE_DIR} ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri @@ -41,6 +42,7 @@ SUBST_STAGE.dictionary-app= pre-configure SUBST_MESSAGE.dictionary-app= Fixing dictionary paths. SUBST_FILES.dictionary-app= st-emacs st-swath SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g' +SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g' pre-extract: mkdir -p ${WRKSRC} diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt index 7480d4b4c2a..8bdd21a706a 100644 --- a/textproc/split-thai/files/README.txt +++ b/textproc/split-thai/files/README.txt @@ -66,5 +66,5 @@ SEE ALSO BUGS st-icu should also use the combined dictionary words. - st-emacs and st-icu don't always split thai numbers well. + thai text mixed with other languages may not be handled well. this file should be converted to a proper manpage. 
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs index 7b31f12a3d5..e4cda5b0151 100755 --- a/textproc/split-thai/files/st-emacs +++ b/textproc/split-thai/files/st-emacs @@ -18,7 +18,7 @@ (with-temp-buffer (insert line) (goto-char (point-min)) - (thai-break-words " ") + (split-thai-line) (buffer-string))) ;; hack to process stdin @@ -48,6 +48,6 @@ (insert (mapconcat 'identity (cdddr command-line-args) " ")) (insert "\n")) (goto-char (point-min)) - (thai-break-words " ") + (split-thai) (write-region nil nil "/dev/stdout")) (kill-emacs 0) diff --git a/textproc/split-thai/files/st-icu.cc b/textproc/split-thai/files/st-icu.cc index 8df3f3c2f2f..f3d04c05064 100644 --- a/textproc/split-thai/files/st-icu.cc +++ b/textproc/split-thai/files/st-icu.cc @@ -13,6 +13,13 @@ using namespace std; using namespace icu; +// utf-8 unicode thai values +// 0x0e1 - 0x0e5b should work for thai_rexp as well... +const UnicodeString thai_rexp = "[\\u0e00-\\u0e7f]+"; +const UnicodeString thai_consonant = "[\\u0e01-\\u0e2e]+"; +const UnicodeString thai_num_rexp = "[\\u0e50-\\u0e59]+"; +const UnicodeString thai_nonnum_rexp = "[\\u0e01-\\u0e4f\\u0e5a-\\u0e7f]+"; + void usage() { const char *progname = "st-icu"; @@ -27,11 +34,11 @@ void usage() { "returns 0 on succes, or non-zero otherwise" << endl << endl; } -// return true if string contains any thai unicode -bool contains_thai(const UnicodeString &s) { +// return true if string contains some regexp +bool matches_regexp(const UnicodeString &s, const UnicodeString &regexp) { UErrorCode status = U_ZERO_ERROR; - // matches one or more thai chars, \u0e01-\u0e5b should work too - RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status); + + RegexMatcher *matcher = new RegexMatcher(regexp, 0, status); if (U_FAILURE(status)) { // syntax errors in the regular expression @@ -46,11 +53,36 @@ bool contains_thai(const UnicodeString &s) { return false; } +// add spaces to string with thai numbers +UnicodeString 
space_thai_numbers(const UnicodeString &s) { + // return string unmodified if no numbers + if ( ! matches_regexp(s, thai_num_rexp) ) { + return s; + } + + UnicodeString rs; + UChar32 pch; + // add spaces between number and non-number + for (int i = 0 ; i < s.length(); i++) { + if ( u_isWhitespace(s[i]) ) { + rs += s[i]; + } else if ((u_isdigit(s[i]) && !u_isdigit(pch) && matches_regexp(pch, thai_rexp)) || + (u_isdigit(pch) && !u_isdigit(s[i]) && matches_regexp(s[i], thai_rexp))) { + rs += " "; + rs += s[i]; + } else { + rs += s[i]; + } + pch = s[i]; + } + return rs; +} + // split a unicode string by word boundaries. if arg contains // whitespaces, it will get consolidated to single spaces. // if string has no thai characters, return it unmodified UnicodeString split_words_consolidated(const UnicodeString &s) { - if ( ! contains_thai(s) ) { + if ( ! matches_regexp(s, thai_rexp) ) { return s; } @@ -108,6 +140,8 @@ UnicodeString split_words(const UnicodeString &s) { } if ( tempStr.length() > 0 ) rs += split_words_consolidated(tempStr); + + rs = space_thai_numbers(rs); return rs; } diff --git a/textproc/split-thai/files/st-swath b/textproc/split-thai/files/st-swath index 52d8e17acf8..861a3694196 100755 --- a/textproc/split-thai/files/st-swath +++ b/textproc/split-thai/files/st-swath @@ -6,6 +6,7 @@ # swath settings are split with ' ', longest match, unicode input, and # unicode output. see swath(1) # +swath_cmd=ST_SHARE_BIN/swath # use merged dictionary unless specified otherwise if [ -z "$SWATHDICT" ]; then @@ -16,12 +17,12 @@ if [ "$#" -eq 0 ]; then # no args, read from stdin while read line do - echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg + echo "$line" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg done < /dev/stdin exit 0 elif [ "$#" -eq 1 -a -e "$1" ]; then # one arg and arg is an existing file - swath -b ' ' -m long -u 'u,u' $dictarg < "$1" + $swath_cmd -b ' ' -m long -u 'u,u' $dictarg < "$1" exit $? 
elif [ "$#" -ge 1 ]; then # one or more args, assume it is all text @@ -34,7 +35,7 @@ elif [ "$#" -ge 1 ]; then shift done - echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg + echo "$txt" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg exit $? else echo "$0: error parsing args" diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el index 5ee04a4b9a9..a931907828e 100644 --- a/textproc/split-thai/files/thai-utility.el +++ b/textproc/split-thai/files/thai-utility.el @@ -168,15 +168,38 @@ dictionary words." (write-region nil nil lispfile)) line_count)) -(defun split-thai-line(&optional separator) +(defun split-thai-line() "Break Thai words from point to end of line by inserting a separator string at word boundaries. (wrapper for 'thai-break-words)" (interactive) - (thai-break-words (or separator " ") (line-end-position))) + (thai-break-words " " (line-end-position)) + (split-thai-numbers (point) (line-end-position))) -(defun split-thai(&optional separator) +(defun split-thai() "Break Thai words from point to end of buffer by inserting a separator string at word boundaries. (wrapper for 'thai-break-words)" (interactive) - (thai-break-words (or separator " ") (point-max))) + (thai-break-words " " (point-max)) + (split-thai-numbers (point) (point-max))) + +(defun split-thai-numbers(start_point end_point) + "helper function to separate numbers in a buffer. +'thai-break-words doesn't always split numbers properly. this may +improve tokenization somewhat." 
+ ;; xxx this really should be fixed in 'thai-word lib + (let* ( + ;; "\\([๐๑๒๓๔๕๖๗๘๙0123456789]+\\)" + (num_rexp "\\([\u0e50-\u0e59]+\\)") ;; thai numbers + (nonnum_rexp "\\([\u0e00-\u0e4f\u0e5a-\u0e7f]\\)") ;; "non-numbers" + (trailing_rexp (concat num_rexp nonnum_rexp)) + (leading_rexp (concat nonnum_rexp num_rexp))) + (save-restriction + (narrow-to-region start_point end_point) + (goto-char (point-min)) + (while (search-forward-regexp trailing_rexp nil t) + (replace-match (concat (match-string 1) " " (match-string 2)))) + (goto-char (point-min)) + (while (search-forward-regexp leading_rexp nil t) + (replace-match (concat (match-string 1) " " (match-string 2)))) + (goto-char start_point)))) |