summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorscole <scole@pkgsrc.org>2020-08-17 17:43:15 +0000
committerscole <scole@pkgsrc.org>2020-08-17 17:43:15 +0000
commita6ead48ef445697d19daa8aa5904e2186a58ff5b (patch)
treeb78e9b49df400db6ab70f1741374c03603b77cb8
parent2bc86d1224d1312b1e30952b6ab3609c9d432b5c (diff)
downloadpkgsrc-a6ead48ef445697d19daa8aa5904e2186a58ff5b.tar.gz
Update to 0.4
- always use pkgsrc path for swath for st-swath script - make splitting of numbers a little more consistent for st-emacs & st-icu - add split-thai, split-thai-line, wrapper functions to emacs lisp code
-rw-r--r--textproc/split-thai/Makefile8
-rw-r--r--textproc/split-thai/files/README.txt2
-rwxr-xr-xtextproc/split-thai/files/st-emacs4
-rw-r--r--textproc/split-thai/files/st-icu.cc44
-rwxr-xr-xtextproc/split-thai/files/st-swath7
-rw-r--r--textproc/split-thai/files/thai-utility.el31
6 files changed, 78 insertions, 18 deletions
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile
index 738e2c98170..88b08076289 100644
--- a/textproc/split-thai/Makefile
+++ b/textproc/split-thai/Makefile
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $
+# $NetBSD: Makefile,v 1.4 2020/08/17 17:43:15 scole Exp $
-PKGNAME= split-thai-0.3
+PKGNAME= split-thai-0.4
CATEGORIES= textproc
MAINTAINER= pkgsrc-users@NetBSD.org
COMMENT= Utilities to split UTF-8 Thai text into words
@@ -24,7 +24,8 @@ REPLACE_SH= st-swath
UTF8_ENV= env LC_ALL=C.UTF-8
ST_SHARE_DIR= share/split-thai
-INSTALLATION_DIRS= bin ${ST_SHARE_DIR}
+ST_SHARE_BIN= bin
+INSTALLATION_DIRS= ${ST_SHARE_BIN} ${ST_SHARE_DIR}
ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc
ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri
@@ -41,6 +42,7 @@ SUBST_STAGE.dictionary-app= pre-configure
SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
SUBST_FILES.dictionary-app= st-emacs st-swath
SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
pre-extract:
mkdir -p ${WRKSRC}
diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt
index 7480d4b4c2a..8bdd21a706a 100644
--- a/textproc/split-thai/files/README.txt
+++ b/textproc/split-thai/files/README.txt
@@ -66,5 +66,5 @@ SEE ALSO
BUGS
st-icu should also use the combined dictionary words.
- st-emacs and st-icu don't always split thai numbers well.
+ thai text mixed with other languages may not be handled well.
this file should be converted to a proper manpage.
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs
index 7b31f12a3d5..e4cda5b0151 100755
--- a/textproc/split-thai/files/st-emacs
+++ b/textproc/split-thai/files/st-emacs
@@ -18,7 +18,7 @@
(with-temp-buffer
(insert line)
(goto-char (point-min))
- (thai-break-words " ")
+ (split-thai-line)
(buffer-string)))
;; hack to process stdin
@@ -48,6 +48,6 @@
(insert (mapconcat 'identity (cdddr command-line-args) " "))
(insert "\n"))
(goto-char (point-min))
- (thai-break-words " ")
+ (split-thai)
(write-region nil nil "/dev/stdout"))
(kill-emacs 0)
diff --git a/textproc/split-thai/files/st-icu.cc b/textproc/split-thai/files/st-icu.cc
index 8df3f3c2f2f..f3d04c05064 100644
--- a/textproc/split-thai/files/st-icu.cc
+++ b/textproc/split-thai/files/st-icu.cc
@@ -13,6 +13,13 @@
using namespace std;
using namespace icu;
+// regexp patterns over thai unicode code points (U+0E00 - U+0E7F)
+// U+0E01 - U+0E5B should work for thai_rexp as well...
+const UnicodeString thai_rexp = "[\\u0e00-\\u0e7f]+"; // one or more thai chars
+const UnicodeString thai_consonant = "[\\u0e01-\\u0e2e]+";
+const UnicodeString thai_num_rexp = "[\\u0e50-\\u0e59]+"; // one or more thai digits
+const UnicodeString thai_nonnum_rexp = "[\\u0e01-\\u0e4f\\u0e5a-\\u0e7f]+"; // thai_rexp range minus the digits
+
void usage() {
const char *progname = "st-icu";
@@ -27,11 +34,11 @@ void usage() {
"returns 0 on succes, or non-zero otherwise" << endl << endl;
}
-// return true if string contains any thai unicode
-bool contains_thai(const UnicodeString &s) {
+// return true if string contains some regexp
+bool matches_regexp(const UnicodeString &s, const UnicodeString &regexp) {
UErrorCode status = U_ZERO_ERROR;
- // matches one or more thai chars, \u0e01-\u0e5b should work too
- RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+ RegexMatcher *matcher = new RegexMatcher(regexp, 0, status);
if (U_FAILURE(status)) {
// syntax errors in the regular expression
@@ -46,11 +53,36 @@ bool contains_thai(const UnicodeString &s) {
return false;
}
+// return a copy of s with a space added wherever a digit directly touches a
+// non-digit thai character (in either order); whitespace is copied through
+UnicodeString space_thai_numbers(const UnicodeString &s) {
+  // return string unmodified if it has no thai digits
+  if ( ! matches_regexp(s, thai_num_rexp) ) {
+    return s;
+  }
+
+  UnicodeString rs;
+  UChar32 pch = 0; // previous char; starts as NUL (not a digit, not thai) so it is never read uninitialized
+  for (int i = 0 ; i < s.length(); i++) {
+    if ( u_isWhitespace(s[i]) ) {
+      rs += s[i];
+    } else if ((u_isdigit(s[i]) && !u_isdigit(pch) && matches_regexp(pch, thai_rexp)) ||
+               (u_isdigit(pch) && !u_isdigit(s[i]) && matches_regexp(s[i], thai_rexp))) {
+      // digit <-> thai non-digit boundary: separate the two with a space
+      rs += " ";
+      rs += s[i];
+    } else {
+      rs += s[i];
+    }
+    pch = s[i];
+  }
+  return rs;
+}
+
// split a unicode string by word boundaries. if arg contains
// whitespaces, it will get consolidated to single spaces.
// if string has no thai characters, return it unmodified
UnicodeString split_words_consolidated(const UnicodeString &s) {
- if ( ! contains_thai(s) ) {
+ if ( ! matches_regexp(s, thai_rexp) ) {
return s;
}
@@ -108,6 +140,8 @@ UnicodeString split_words(const UnicodeString &s) {
}
if ( tempStr.length() > 0 )
rs += split_words_consolidated(tempStr);
+
+ rs = space_thai_numbers(rs);
return rs;
}
diff --git a/textproc/split-thai/files/st-swath b/textproc/split-thai/files/st-swath
index 52d8e17acf8..861a3694196 100755
--- a/textproc/split-thai/files/st-swath
+++ b/textproc/split-thai/files/st-swath
@@ -6,6 +6,7 @@
# swath settings are split with ' ', longest match, unicode input, and
# unicode output. see swath(1)
#
+swath_cmd=ST_SHARE_BIN/swath
# use merged dictionary unless specified otherwise
if [ -z "$SWATHDICT" ]; then
@@ -16,12 +17,12 @@ if [ "$#" -eq 0 ]; then
# no args, read from stdin
while read line
do
- echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+ echo "$line" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
done < /dev/stdin
exit 0
elif [ "$#" -eq 1 -a -e "$1" ]; then
# one arg and arg is an existing file
- swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+ $swath_cmd -b ' ' -m long -u 'u,u' $dictarg < "$1"
exit $?
elif [ "$#" -ge 1 ]; then
# one or more args, assume it is all text
@@ -34,7 +35,7 @@ elif [ "$#" -ge 1 ]; then
shift
done
- echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+ echo "$txt" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
exit $?
else
echo "$0: error parsing args"
diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el
index 5ee04a4b9a9..a931907828e 100644
--- a/textproc/split-thai/files/thai-utility.el
+++ b/textproc/split-thai/files/thai-utility.el
@@ -168,15 +168,38 @@ dictionary words."
(write-region nil nil lispfile))
line_count))
-(defun split-thai-line(&optional separator)
+(defun split-thai-line()
"Break Thai words from point to end of line by inserting a
separator string at word boundaries. (wrapper for 'thai-break-words)"
(interactive)
- (thai-break-words (or separator " ") (line-end-position)))
+ (thai-break-words " " (line-end-position))
+ (split-thai-numbers (point) (line-end-position)))
-(defun split-thai(&optional separator)
+(defun split-thai()
"Break Thai words from point to end of buffer by inserting a
separator string at word boundaries. (wrapper for
'thai-break-words)"
(interactive)
- (thai-break-words (or separator " ") (point-max)))
+ (thai-break-words " " (point-max))
+ (split-thai-numbers (point) (point-max)))
+
+(defun split-thai-numbers(start_point end_point)
+  "Insert a space wherever thai digits touch a thai non-digit.
+Only the text between START_POINT and END_POINT is changed, and
+point is left at START_POINT.  'thai-break-words doesn't always
+split numbers properly; this may improve tokenization somewhat."
+  ;; xxx this really should be fixed in 'thai-word lib
+  (let* ((digits "\\([\u0e50-\u0e59]+\\)") ;; run of thai digits
+         (other "\\([\u0e00-\u0e4f\u0e5a-\u0e7f]\\)") ;; one thai non-digit
+         ;; pass 1: digits followed by a non-digit
+         ;; pass 2: a non-digit followed by digits
+         (passes (list (concat digits other) (concat other digits))))
+    (save-restriction
+      (narrow-to-region start_point end_point)
+      (dolist (rexp passes)
+        ;; every pass rescans the narrowed region from its start
+        (goto-char (point-min))
+        (while (re-search-forward rexp nil t)
+          ;; rebuild the match with a space between its two groups
+          (replace-match (concat (match-string 1) " " (match-string 2)))))
+      (goto-char start_point))))