Update to 0.4

- always use pkgsrc path for swath for st-swath script
- make splitting of numbers a little more consistent for st-emacs & st-icu
- add split-thai, split-thai-line, wrapper functions to emacs lisp code
author: scole <scole@pkgsrc.org> 2020-08-17 17:43:15 +0000
committer: scole <scole@pkgsrc.org> 2020-08-17 17:43:15 +0000
commit: a6ead48ef445697d19daa8aa5904e2186a58ff5b (patch)
tree: b78e9b49df400db6ab70f1741374c03603b77cb8
parent: 2bc86d1224d1312b1e30952b6ab3609c9d432b5c (diff)
download: pkgsrc-a6ead48ef445697d19daa8aa5904e2186a58ff5b.tar.gz
6 files changed, 78 insertions, 18 deletions
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile
index 738e2c98170..88b08076289 100644
--- a/textproc/split-thai/Makefile
+++ b/textproc/split-thai/Makefile
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.3 2020/08/15 16:52:28 scole Exp $
+# $NetBSD: Makefile,v 1.4 2020/08/17 17:43:15 scole Exp $
 
-PKGNAME=	split-thai-0.3
+PKGNAME=	split-thai-0.4
 CATEGORIES=	textproc
 MAINTAINER=	pkgsrc-users@NetBSD.org
 COMMENT=	Utilities to split UTF-8 Thai text into words
@@ -24,7 +24,8 @@ REPLACE_SH=	st-swath
 UTF8_ENV=	env LC_ALL=C.UTF-8
 
 ST_SHARE_DIR=		share/split-thai
-INSTALLATION_DIRS=	bin ${ST_SHARE_DIR}
+ST_SHARE_BIN=		bin
+INSTALLATION_DIRS=	${ST_SHARE_BIN} ${ST_SHARE_DIR}
 
 ST_SHARE_FILES=		README.txt thaidict thai-dict.el thai-dict.elc
 ST_SHARE_FILES+=	thai-utility.el thai-utility.elc thaidict.tri
@@ -41,6 +42,7 @@ SUBST_STAGE.dictionary-app=	pre-configure
 SUBST_MESSAGE.dictionary-app=	Fixing dictionary paths.
 SUBST_FILES.dictionary-app=	st-emacs st-swath
 SUBST_SED.dictionary-app=	-e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+SUBST_SED.dictionary-app+=	-e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 
 pre-extract:
 	mkdir -p ${WRKSRC}
diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt
index 7480d4b4c2a..8bdd21a706a 100644
--- a/textproc/split-thai/files/README.txt
+++ b/textproc/split-thai/files/README.txt
@@ -66,5 +66,5 @@ SEE ALSO
 
 BUGS
      st-icu should also use the combined dictionary words.
-     st-emacs and st-icu don't always split thai numbers well.
+     thai text mixed with other languages may not be handled well.
      this file should be converted to a proper manpage.
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs
index 7b31f12a3d5..e4cda5b0151 100755
--- a/textproc/split-thai/files/st-emacs
+++ b/textproc/split-thai/files/st-emacs
@@ -18,7 +18,7 @@
   (with-temp-buffer
     (insert line)
     (goto-char (point-min))
-    (thai-break-words " ")
+    (split-thai-line)
     (buffer-string)))
 
 ;; hack to process stdin
@@ -48,6 +48,6 @@
     (insert (mapconcat 'identity (cdddr command-line-args) " "))
     (insert "\n"))
   (goto-char (point-min))
-  (thai-break-words " ")
+  (split-thai)
   (write-region nil nil "/dev/stdout"))
 (kill-emacs 0)
diff --git a/textproc/split-thai/files/st-icu.cc b/textproc/split-thai/files/st-icu.cc
index 8df3f3c2f2f..f3d04c05064 100644
--- a/textproc/split-thai/files/st-icu.cc
+++ b/textproc/split-thai/files/st-icu.cc
@@ -13,6 +13,13 @@
 using namespace std;
 using namespace icu;
 
+// regular expressions over thai unicode code points
+// 0x0e01 - 0x0e5b should work for thai_rexp as well...
+const UnicodeString thai_rexp = "[\\u0e00-\\u0e7f]+";
+const UnicodeString thai_consonant = "[\\u0e01-\\u0e2e]+";
+const UnicodeString thai_num_rexp = "[\\u0e50-\\u0e59]+";
+const UnicodeString thai_nonnum_rexp = "[\\u0e01-\\u0e4f\\u0e5a-\\u0e7f]+";
+
 void usage() {
  const char *progname = "st-icu";
 	
@@ -27,11 +34,11 @@ void usage() {
      "returns 0 on succes, or non-zero otherwise" << endl << endl;
 }
 
-// return true if string contains any thai unicode
-bool contains_thai(const UnicodeString &s) {
+// return true if string contains some regexp
+bool matches_regexp(const UnicodeString &s, const UnicodeString &regexp) {
 	UErrorCode status = U_ZERO_ERROR;
-	// matches one or more thai chars, \u0e01-\u0e5b should work too
-	RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+	RegexMatcher *matcher = new RegexMatcher(regexp, 0, status);
 
 	if (U_FAILURE(status)) {
 		// syntax errors in the regular expression
@@ -46,11 +53,36 @@ bool contains_thai(const UnicodeString &s) {
 		return false;
 }
 
+// insert a space wherever a digit touches a thai non-digit character
+// (in either order); whitespace in s is copied through untouched.
+// returns s unmodified when it contains no thai digits at all.
+UnicodeString space_thai_numbers(const UnicodeString &s) {
+	if ( ! matches_regexp(s, thai_num_rexp) ) {
+		return s;
+	}
+
+	UnicodeString rs;
+	// previous code unit; starts as 0 (neither a digit nor thai) so
+	// the first character is never read against an indeterminate value
+	UChar32 pch = 0;
+	for (int i = 0 ; i < s.length(); i++) {
+		// a separator is only needed on a digit/thai non-digit boundary
+		if ( !u_isWhitespace(s[i]) &&
+		     ((u_isdigit(s[i]) && !u_isdigit(pch) && matches_regexp(pch, thai_rexp)) ||
+		      (u_isdigit(pch) && !u_isdigit(s[i]) && matches_regexp(s[i], thai_rexp))) ) {
+			rs += " ";
+		}
+		rs += s[i];
+		pch = s[i];
+	}
+	return rs;
+}
+
 // split a unicode string by word boundaries.  if arg contains
 // whitespaces, it will get consolidated to single spaces.
 // if string has no thai characters, return it unmodified
 UnicodeString split_words_consolidated(const UnicodeString &s) {
-	if ( ! contains_thai(s) ) {
+	if ( ! matches_regexp(s, thai_rexp) ) {
 		return s;
 	}
 	
@@ -108,6 +140,8 @@ UnicodeString split_words(const UnicodeString &s) {
 	}
 	if ( tempStr.length() > 0 )
 		rs += split_words_consolidated(tempStr);
+
+	rs = space_thai_numbers(rs);
 	return rs;
 }
 
diff --git a/textproc/split-thai/files/st-swath b/textproc/split-thai/files/st-swath
index 52d8e17acf8..861a3694196 100755
--- a/textproc/split-thai/files/st-swath
+++ b/textproc/split-thai/files/st-swath
@@ -6,6 +6,7 @@
 # swath settings are split with ' ', longest match, unicode input, and
 # unicode output.  see swath(1)
 #
+swath_cmd=ST_SHARE_BIN/swath
 
 # use merged dictionary unless specified otherwise
 if [ -z "$SWATHDICT" ]; then
@@ -16,12 +17,12 @@ if [ "$#" -eq 0 ]; then
     # no args, read from stdin
     while read line
     do
-	echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+	echo "$line" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
     done < /dev/stdin
     exit 0 
 elif [ "$#" -eq 1 -a -e "$1" ]; then
     # one arg and arg is an existing file
-    swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+    $swath_cmd -b ' ' -m long -u 'u,u' $dictarg < "$1"
     exit $?
 elif [ "$#" -ge 1 ]; then
     # one or more args, assume it is all text
@@ -34,7 +35,7 @@ elif [ "$#" -ge 1 ]; then
 
 	shift
     done
-    echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+    echo "$txt" | $swath_cmd -b ' ' -m long -u 'u,u' $dictarg
     exit $?
 else
     echo "$0: error parsing args"
diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el
index 5ee04a4b9a9..a931907828e 100644
--- a/textproc/split-thai/files/thai-utility.el
+++ b/textproc/split-thai/files/thai-utility.el
@@ -168,15 +168,38 @@ dictionary words."
       (write-region nil nil lispfile))
     line_count))
 
-(defun split-thai-line(&optional separator)
+(defun split-thai-line()
   "Break Thai words from point to end of line by inserting a
 separator string at word boundaries. (wrapper for 'thai-break-words)"
   (interactive)
-    (thai-break-words (or separator " ") (line-end-position)))
+  (thai-break-words " " (line-end-position))
+  (split-thai-numbers (point) (line-end-position)))
 
-(defun split-thai(&optional separator)
+(defun split-thai()
   "Break Thai words from point to end of buffer by inserting a
 separator string at word boundaries. (wrapper for
 'thai-break-words)"
   (interactive)
-    (thai-break-words (or separator " ") (point-max)))
+  (thai-break-words " " (point-max))
+  (split-thai-numbers (point) (point-max)))
+
+(defun split-thai-numbers(start_point end_point)
+  "Insert a space between Thai digits and adjacent Thai non-digits
+in the region START_POINT to END_POINT, since 'thai-break-words does
+not always split numbers properly.  Leaves point at START_POINT."
+  ;; xxx this really should be fixed in 'thai-word lib
+  (let* (
+	 ;; alternative that also matches ascii digits: "\\([๐๑๒๓๔๕๖๗๘๙0123456789]+\\)"
+	 (num_rexp "\\([\u0e50-\u0e59]+\\)") ;; group: run of thai digits
+	 (nonnum_rexp "\\([\u0e00-\u0e4f\u0e5a-\u0e7f]\\)") ;; group: one thai non-digit
+	 (trailing_rexp (concat num_rexp nonnum_rexp))
+	 (leading_rexp (concat nonnum_rexp num_rexp)))
+    (save-restriction
+      (narrow-to-region start_point end_point)
+      (goto-char (point-min))
+      (while (search-forward-regexp trailing_rexp nil t) ;; digits then non-digit
+	(replace-match (concat (match-string 1) " " (match-string 2))))
+      (goto-char (point-min))
+      (while (search-forward-regexp leading_rexp nil t) ;; non-digit then digits
+	(replace-match (concat (match-string 1) " " (match-string 2))))
+      (goto-char start_point))))
author	scole <scole@pkgsrc.org>	2020-08-17 17:43:15 +0000
committer	scole <scole@pkgsrc.org>	2020-08-17 17:43:15 +0000
commit	a6ead48ef445697d19daa8aa5904e2186a58ff5b (patch)
tree	b78e9b49df400db6ab70f1741374c03603b77cb8
parent	2bc86d1224d1312b1e30952b6ab3609c9d432b5c (diff)
download	pkgsrc-a6ead48ef445697d19daa8aa5904e2186a58ff5b.tar.gz