diff options
author | scole <scole@pkgsrc.org> | 2020-08-13 20:52:08 +0000 |
---|---|---|
committer | scole <scole@pkgsrc.org> | 2020-08-13 20:52:08 +0000 |
commit | b07548647c6d46aaebfeef8c155cdf784f9e87e8 (patch) | |
tree | c06d7ce614ed4894cd6746e47a3d746758d5135b /textproc/split-thai | |
parent | a95644a520c61d223d2bfb0c621815d947551513 (diff) | |
download | pkgsrc-b07548647c6d46aaebfeef8c155cdf784f9e87e8.tar.gz |
Add split-thai 0.1, a set of utilities for splitting Thai UTF8 text by word boundaries
Diffstat (limited to 'textproc/split-thai')
-rw-r--r-- | textproc/split-thai/DESCR | 6 | ||||
-rw-r--r-- | textproc/split-thai/Makefile | 81 | ||||
-rw-r--r-- | textproc/split-thai/PLIST | 9 | ||||
-rw-r--r-- | textproc/split-thai/distinfo | 6 | ||||
-rw-r--r-- | textproc/split-thai/files/README.txt | 49 | ||||
-rwxr-xr-x | textproc/split-thai/files/st-emacs | 54 | ||||
-rw-r--r-- | textproc/split-thai/files/st-icu.cc | 195 | ||||
-rwxr-xr-x | textproc/split-thai/files/st-swath | 42 | ||||
-rw-r--r-- | textproc/split-thai/files/thai-utility.el | 97 | ||||
-rw-r--r-- | textproc/split-thai/files/thaidict.abm | 2 |
10 files changed, 541 insertions, 0 deletions
diff --git a/textproc/split-thai/DESCR b/textproc/split-thai/DESCR new file mode 100644 index 00000000000..5dba7ad4417 --- /dev/null +++ b/textproc/split-thai/DESCR @@ -0,0 +1,6 @@ +A collection of utilities to split Thai Unicode UTF-8 text by word +boundaries, also known as word tokenization. The utilities use emacs, +swath, and a c++ icu-project program. All use dictionary-based word +splitting. + +Also included is merged dictionary file of thai words. diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile new file mode 100644 index 00000000000..b332448c98b --- /dev/null +++ b/textproc/split-thai/Makefile @@ -0,0 +1,81 @@ +# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $ + +PKGNAME= split-thai-0.1 +CATEGORIES= textproc +MAINTAINER= pkgsrc-users@NetBSD.org +COMMENT= Utilities to split UTF-8 Thai text into words +LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict + +# xxx fetching a specific version of a file out of a github project +EXTRACT_SUFX= # none +GITHUB_ICU_TAG= 61607c27732906d36c5bd4d23ecc092f89f53a2b +DISTFILES= thaidict-${GITHUB_ICU_TAG}.txt +MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt + +USE_LANGUAGES= c++11 # darwin needed 11? + +USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo +BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie +DEPENDS+= emacs-[0-9]*:../../editors/emacs +DEPENDS+= swath-[0-9]*:../../textproc/swath + +REPLACE_SH= st-swath + +UTF8_ENV= env LC_ALL=C.UTF-8 + +ST_SHARE_DIR= share/split-thai +INSTALLATION_DIRS= bin ${ST_SHARE_DIR} + +# xxx REPLACE_EMACS_SCRIPT +SUBST_CLASSES+= st-emacs-app +SUBST_STAGE.st-emacs-app= pre-configure +SUBST_MESSAGE.st-emacs-app= Fixing emacs script paths. 
+SUBST_FILES.st-emacs-app= st-emacs +SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g' + +SUBST_CLASSES+= dictionary-app +SUBST_STAGE.dictionary-app= pre-configure +SUBST_MESSAGE.dictionary-app= Fixing dictionary paths. +SUBST_FILES.dictionary-app= st-emacs st-swath +SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g' + +pre-extract: + mkdir -p ${WRKSRC} + cd files && cp README.txt st-emacs st-icu.cc st-swath \ + thai-utility.el thaidict.abm ${WRKSRC} + +post-extract: + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ + -f batch-byte-compile thai-utility.el + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \ + --eval '(thai-word-table-save "emacs-dict")' + cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict + cd ${PREFIX}/share/swath && \ + ${UTF8_ENV} trietool swathdic list | \ + awk '{print $$1}' > ${WRKSRC}/swath-dict + cd ${WRKSRC} && \ + ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \ + grep -v '#' | sort | uniq > thaidict + cd ${WRKSRC} && \ + ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict +.for i in emacs-dict icu-dict swath-dict + @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i} +.endfor + @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \ + unique words in combined dictionary + +do-build: + cd ${WRKSRC} && \ + ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \ + `pkg-config --libs --cflags icu-io` + +do-install: + ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \ + ${DESTDIR}${PREFIX}/bin + ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin +.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri + ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai +.endfor + +.include "../../textproc/icu/buildlink3.mk" +.include "../../mk/bsd.pkg.mk" diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST new file mode 100644 index 00000000000..4a1bef0d833 --- /dev/null +++ b/textproc/split-thai/PLIST @@ -0,0 +1,9 @@ +@comment $NetBSD: PLIST,v 
1.1 2020/08/13 20:52:09 scole Exp $ +bin/st-emacs +bin/st-icu +bin/st-swath +share/split-thai/README.txt +share/split-thai/thai-utility.el +share/split-thai/thai-utility.elc +share/split-thai/thaidict +share/split-thai/thaidict.tri diff --git a/textproc/split-thai/distinfo b/textproc/split-thai/distinfo new file mode 100644 index 00000000000..e35ad9a15aa --- /dev/null +++ b/textproc/split-thai/distinfo @@ -0,0 +1,6 @@ +$NetBSD: distinfo,v 1.1 2020/08/13 20:52:09 scole Exp $ + +SHA1 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 2a2ad127cc279835cb4df04eb69401a0d4927774 +RMD160 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 0a6df7b7dd6ef502c5dd20020e37b2ca1a5514a2 +SHA512 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 88800fe2a453fc40f16ff54c21c852a8ea8e1496e42d5d187e5b5ac0ff58050830fc0816239e4f88cb23ed301f894d1ca52eb4676fd85c13c285cec815ae7c42 +Size (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 493044 bytes diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt new file mode 100644 index 00000000000..7b91f97fb9a --- /dev/null +++ b/textproc/split-thai/files/README.txt @@ -0,0 +1,49 @@ +This is a collection of utilities to separate Thai words by spaces +(word tokenization). They can separate stdin, files, or text as +arguments. 
It includes 3 separate utilities: + +st-emacs: emacs-script using emacs lisp thai-word library + https://www.gnu.org/software/emacs/ +st-icu: basic C++ program using the ICU library + http://site.icu-project.org/ +st-swath: sh script wrapper to simplify args to the swath program + https://linux.thai.net/projects/swath + +All scripts should be able to take a filename, stdin, or arguments as +input, e.g., : + + # st-swath แมวและหมา +or + # echo "แมวและหมา" | st-swath +or + # st-swath < thaifile.txt +or + # st-swath "แมวหมา" พ่อและแม่ + +You will most likely need to set LC_ALL or LC_CTYPE to an appropriate +unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for +them to work properly. These tools are set up to only support UTF-8 +encodings. + +Note that it is not possible to split Thai words 100% accurately +without context and meaning. These programs use dictionary-based word +splitting. + +Also included in the package is a combined thai word dictionary and +corresponding .tri file, and emacs lisp .el file for reading and +dumping out dictionary files. + +st-emacs and st-swath are set up to use the combined dictionary with +words from the emacs 'thai-word library, swath dictionary words, and +the icu thai library words. + +st-icu uses its own built in library. To customise the icu +dictionary, you apparently would have to modify + icu4c/source/data/brkitr/dictionaries/thaidict.txt +and rebuild icu library, and then rebuild the whole thing. + +There is also + +See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1) + +TODO - fix st-icu to use all the combined dictionary words. 
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs new file mode 100755 index 00000000000..eff41ab98ea --- /dev/null +++ b/textproc/split-thai/files/st-emacs @@ -0,0 +1,54 @@ +#!/bin/emacs --script +;; +;; break thai string into words separated by spaces +;; +;; - if no args, process stdin +;; - if one arg and file exists with arg name, process file +;; - else join get remainder of args and process +;; + +;;(toggle-debug-on-error) ;; debug +(require 'thai-word) + +;; load custom dictionary +(load "ST_SHARE_DIR/thai-utility" nil t) +(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict") + +;; split a thai line by spaces, return new line +(defun process-thai-line(line) + (with-temp-buffer + (insert line) + (goto-char (point-min)) + (thai-break-words " ") + (buffer-string))) + +;; hack to process stdin +(defun process-stdin() + (condition-case nil + (let (aline) + (while (setq aline (read-from-minibuffer "")) + (princ (process-thai-line aline)) + (princ "\n"))) + (error nil))) + +;; process arguments, remove "emacs -scriptload scriptname" from args, +;; join the rest by spaces +(setq args (cdddr command-line-args)) +(setq argc (length args)) + +;; no args => process stdin +(when (= 0 argc) + (process-stdin) + (kill-emacs 0)) + +;; if one arg and arg is a file, process that file +;; else process all input args joined by spaces with an added newline +(with-temp-buffer + (if (and (= 1 argc) (file-exists-p (car args))) + (insert-file-contents (car args)) + (insert (mapconcat 'identity (cdddr command-line-args) " ")) + (insert "\n")) + (goto-char (point-min)) + (thai-break-words " ") + (write-region nil nil "/dev/stdout")) +(kill-emacs 0) diff --git a/textproc/split-thai/files/st-icu.cc b/textproc/split-thai/files/st-icu.cc new file mode 100644 index 00000000000..8df3f3c2f2f --- /dev/null +++ b/textproc/split-thai/files/st-icu.cc @@ -0,0 +1,195 @@ +/* + * split up thai strings in a file, stdin or args into "words" + */ +#include <fstream> 
+#include <vector> + +#include <unicode/brkiter.h> +#include <unicode/regex.h> +#include <unicode/ucnv.h> +#include <unicode/ustream.h> +#include <unicode/ustdio.h> + +using namespace std; +using namespace icu; + +void usage() { + const char *progname = "st-icu"; + + cout << endl << + "Usage: " << progname << " [stdin|filename|thaiarg1 thaiarg2 ...]" << + endl << endl << + "This program attempts to split thai strings into thai words." << endl << + "It takes a filename, stdin, or UTF8 thai string(s) as arguments" << endl << + "and prints out the string separated by spaces." << endl << + "When no argument is given, it can read lines from stdin, and" << endl << + "separate thai words in the line by spaces." << endl << endl << + "returns 0 on succes, or non-zero otherwise" << endl << endl; +} + +// return true if string contains any thai unicode +bool contains_thai(const UnicodeString &s) { + UErrorCode status = U_ZERO_ERROR; + // matches one or more thai chars, \u0e01-\u0e5b should work too + RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status); + + if (U_FAILURE(status)) { + // syntax errors in the regular expression + cerr << "error creating RegexMatcher" << endl; + exit(1); + } + + matcher->reset(s); + if (matcher->find()) + return true; + else + return false; +} + +// split a unicode string by word boundaries. if arg contains +// whitespaces, it will get consolidated to single spaces. +// if string has no thai characters, return it unmodified +UnicodeString split_words_consolidated(const UnicodeString &s) { + if ( ! 
contains_thai(s) ) { + return s; + } + + UErrorCode status = U_ZERO_ERROR; + BreakIterator* wordBreaker = + BreakIterator::createWordInstance(Locale::getUS(), status); + if ( U_FAILURE(status) ) { + cerr << "error creating BreakIterator" << endl; + exit(1); + } + + wordBreaker->setText(s); + vector<int32_t> vbreak; + + int32_t pos = wordBreaker->first(); + while( pos != BreakIterator::DONE ) { + // cout << "boundary " << pos << endl; + vbreak.push_back(pos); + pos = wordBreaker->next(); + } + + // only one word found, trim and done + if ( vbreak.size() == 1 ) { + UnicodeString ss(s); + return ss.trim(); + } + + UnicodeString rs; + for (int i = 0 ; i < vbreak.size() - 1; i++) { + UnicodeString ss; + s.extractBetween(vbreak[i], vbreak[i+1], ss); + ss.trim(); + if ( ss != "" ) + rs += ss + " "; + } + + return rs.trim(); +} + +// split a unicode string by word boundaries trying to preserve +// original spacing +UnicodeString split_words(const UnicodeString &s) { + UnicodeString tempStr; + UnicodeString rs; + for (int i = 0 ; i < s.length() ; i++) { + if ( ! 
u_isUWhiteSpace(s[i]) ) { + tempStr += s[i]; + } else { + if ( tempStr.length() > 0 ) { + rs += split_words_consolidated(tempStr); + tempStr.remove(); + } + rs += s[i]; + } + } + if ( tempStr.length() > 0 ) + rs += split_words_consolidated(tempStr); + return rs; +} + +// split stdin +void split_stdin() { + UFILE *in = u_finit(stdin, NULL, NULL); + if ( !in ) { + cerr << "error: u_finit of stdin failed" << endl; + exit(1); + } + + UChar uch; + UnicodeString line; + while ( (uch = u_fgetc(in)) ) { + if ( uch == 0xffff ) { + break; + } else if ( uch == '\n' ) { + UnicodeString s(line); + cout << split_words(s) << endl; + line = ""; + } else { + line += uch; + } + } + + u_fclose(in); +} + +// read file line by line, spliting each line 1 at a time +void split_file(const char* filename) { + UFILE *in = u_fopen(filename, "r", NULL, NULL); + if ( !in ) { + cerr << "error: opening file " << filename << endl; + exit(1); + } + const int32_t maxLine = 1024; + UChar line[maxLine]; + while ( u_fgets(line, maxLine, in) != NULL ) { + //cout << split_words(line) << endl; + cout << split_words(line); + } + + u_fclose(in); +} + +// check if file is "readable" +bool is_readable(const char* fname) { + ifstream infile(fname); + return infile.good(); +} + +int main(int argc, char **argv) { + // utf8 for everything + ucnv_setDefaultName("UTF-8"); + + // read stdin when no args passed in + if ( argc <= 1 ) { + split_stdin(); + exit(0); + } + + // check second arg for help flag + UnicodeString arg2(argv[1]); + if ( arg2 == "-h" || arg2 == "-H" || arg2 == "-?" 
|| arg2 == "-help" ) { + usage(); + exit(0); + } + + // if only one arg and exists with arg name, process file + if ( argc == 2 && is_readable(argv[1]) ) { + split_file(argv[1]); + exit(0); + } + + // join remainder of args and process as string + UnicodeString inArgs; + for ( int i = 1 ; i < argc ; i++ ) { + UnicodeString s(argv[i]); + inArgs += s; + if ( i < (argc - 1) ) + inArgs += " "; + } + cout << split_words(inArgs) << endl; + exit(0); +} diff --git a/textproc/split-thai/files/st-swath b/textproc/split-thai/files/st-swath new file mode 100755 index 00000000000..52d8e17acf8 --- /dev/null +++ b/textproc/split-thai/files/st-swath @@ -0,0 +1,42 @@ +#!/bin/sh +# +# simple wrapper for swath to split thai text from stdin, arg, or a +# file +# +# swath settings are split with ' ', longest match, unicode input, and +# unicode output. see swath(1) +# + +# use merged dictionary unless specified otherwise +if [ -z "$SWATHDICT" ]; then + dictarg="-d ST_SHARE_DIR/thaidict.tri" +fi + +if [ "$#" -eq 0 ]; then + # no args, read from stdin + while read line + do + echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg + done < /dev/stdin + exit 0 +elif [ "$#" -eq 1 -a -e "$1" ]; then + # one arg and arg is an existing file + swath -b ' ' -m long -u 'u,u' $dictarg < "$1" + exit $? +elif [ "$#" -ge 1 ]; then + # one or more args, assume it is all text + while [ "$1" != "" ]; do + if [ -z "$txt" ]; then + txt="$1" + else + txt="$txt $1" + fi + + shift + done + echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg + exit $? 
+else + echo "$0: error parsing args" + exit 1 +fi diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el new file mode 100644 index 00000000000..914534f436f --- /dev/null +++ b/textproc/split-thai/files/thai-utility.el @@ -0,0 +1,97 @@ +(require 'mule-util) +(require 'thai-word) + +" nested-alist from mule-util looks like this: " +" '(3585 1 ;; ก word ก " +" (3591 1 ;; ง word กง " +" (3585 t ;; ก " +" (3634 t ;; า " +" (3619 1))));; ร word กงการ" +" (3585 1 ;; ก word กก " +" (3621 1)))) ;; ล word กกล " + +(defun extract-thai-na(nlist thaistr) + "helper function to reconstruct thai words from a nested alist, +uses recursion" + (let ((ucode) + (complete)) + (cond + ;; finished + ((not nlist) nil) + + ;; (3591 1 ... + ((integerp (car nlist)) + ;; xxx care about coding-system vars here? + (setq ucode (char-to-string (car nlist))) + (setq complete (cadr nlist)) + (setq thaistr (concat thaistr ucode)) + (cond + ;; t => no word at this depth + ((equal complete t) + (extract-thai-na (cddr nlist) thaistr)) + ;; 1 => word at this depth + ((equal complete 1) + (append (list thaistr) + (extract-thai-na (cddr nlist) thaistr) '())) + (t + (error "invalid parsing for complete var")))) + + ;; not finished + (t + (append (extract-thai-na (car nlist) thaistr) + (extract-thai-na (cdr nlist) thaistr) '()))))) + +(defun thai-word-table-save(filename &optional alist) + "save thai words extracted from a nested-alist table to +filename in utf8 format. default is to save 'thai-word-table if +no alist argument given." + (interactive) + (let ((thaiwords) + (elem) + (coding-system-for-read 'utf-8) + (coding-system-for-write 'utf-8) + (buffer-file-coding-system 'utf-8)) + ;; default list or not + (setq alist (or alist + thai-word-table)) + + (or (nested-alist-p alist) + (error "Invalid argument %s" alist)) + + ;; remove 'thai-words from 'thai-word-table + (setq alist (cdr alist)) + + (with-temp-buffer + ;; process per-letter list one at a time. 
could process whole + ;; list at once but maybe try to conserve memory resources + (while (setq elem (car alist)) + (setq alist (cdr alist)) + (setq thaiwords (extract-thai-na elem "")) + + (dolist (elem thaiwords) + (insert elem "\n"))) + + (sort-lines nil (point-min) (point-max)) + (write-region nil nil filename) + (buffer-string)))) + +;; 'thai-tis620 is default for emacs <= 28 +(defun thai-update-word-table-utf8 (file &optional append) + "Update Thai word table by replacing the current word list with +FILE, which is in utf-8. If called with a prefix argument, FILE +is appended instead to the current word list. Does the same as +'thai-update-word-table, except that function expects +'thai-tis620 encoding" + (interactive "FThai word table file: \nP") + (let* ((coding-system-for-read 'utf-8) + (coding-system-for-write 'utf-8) + (buffer-file-coding-system 'utf-8) + (temp_file (make-temp-file "thaiutf8_"))) + (unwind-protect + (with-temp-buffer + (insert-file-contents file) + (setq coding-system-for-write 'thai-tis620) + (write-file temp_file)) + (thai-update-word-table temp_file append) + (delete-file temp_file) + thai-word-table))) diff --git a/textproc/split-thai/files/thaidict.abm b/textproc/split-thai/files/thaidict.abm new file mode 100644 index 00000000000..c364c86918a --- /dev/null +++ b/textproc/split-thai/files/thaidict.abm @@ -0,0 +1,2 @@ +[0x002d,0x002e] +[0x0e01,0x0e5b] |