diff options
author | scole <scole@pkgsrc.org> | 2020-08-13 20:52:08 +0000 |
---|---|---|
committer | scole <scole@pkgsrc.org> | 2020-08-13 20:52:08 +0000 |
commit | b07548647c6d46aaebfeef8c155cdf784f9e87e8 (patch) | |
tree | c06d7ce614ed4894cd6746e47a3d746758d5135b /textproc/split-thai | |
parent | a95644a520c61d223d2bfb0c621815d947551513 (diff) | |
download | pkgsrc-b07548647c6d46aaebfeef8c155cdf784f9e87e8.tar.gz |
Add split-thai 0.1, a set of utilities for splitting Thai UTF8 text by word boundaries
Diffstat (limited to 'textproc/split-thai')
-rw-r--r-- | textproc/split-thai/DESCR | 6 | ||||
-rw-r--r-- | textproc/split-thai/Makefile | 81 | ||||
-rw-r--r-- | textproc/split-thai/PLIST | 9 | ||||
-rw-r--r-- | textproc/split-thai/distinfo | 6 | ||||
-rw-r--r-- | textproc/split-thai/files/README.txt | 49 | ||||
-rwxr-xr-x | textproc/split-thai/files/st-emacs | 54 | ||||
-rw-r--r-- | textproc/split-thai/files/st-icu.cc | 195 | ||||
-rwxr-xr-x | textproc/split-thai/files/st-swath | 42 | ||||
-rw-r--r-- | textproc/split-thai/files/thai-utility.el | 97 | ||||
-rw-r--r-- | textproc/split-thai/files/thaidict.abm | 2 |
10 files changed, 541 insertions, 0 deletions
diff --git a/textproc/split-thai/DESCR b/textproc/split-thai/DESCR new file mode 100644 index 00000000000..5dba7ad4417 --- /dev/null +++ b/textproc/split-thai/DESCR @@ -0,0 +1,6 @@ +A collection of utilities to split Thai Unicode UTF-8 text by word +boundaries, also known as word tokenization. The utilities use emacs, +swath, and a c++ icu-project program. All use dictionary-based word +splitting. + +Also included is merged dictionary file of thai words. diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile new file mode 100644 index 00000000000..b332448c98b --- /dev/null +++ b/textproc/split-thai/Makefile @@ -0,0 +1,81 @@ +# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $ + +PKGNAME= split-thai-0.1 +CATEGORIES= textproc +MAINTAINER= pkgsrc-users@NetBSD.org +COMMENT= Utilities to split UTF-8 Thai text into words +LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict + +# xxx fetching a specific version of a file out of a github project +EXTRACT_SUFX= # none +GITHUB_ICU_TAG= 61607c27732906d36c5bd4d23ecc092f89f53a2b +DISTFILES= thaidict-${GITHUB_ICU_TAG}.txt +MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt + +USE_LANGUAGES= c++11 # darwin needed 11? + +USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo +BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie +DEPENDS+= emacs-[0-9]*:../../editors/emacs +DEPENDS+= swath-[0-9]*:../../textproc/swath + +REPLACE_SH= st-swath + +UTF8_ENV= env LC_ALL=C.UTF-8 + +ST_SHARE_DIR= share/split-thai +INSTALLATION_DIRS= bin ${ST_SHARE_DIR} + +# xxx REPLACE_EMACS_SCRIPT +SUBST_CLASSES+= st-emacs-app +SUBST_STAGE.st-emacs-app= pre-configure +SUBST_MESSAGE.st-emacs-app= Fixing emacs script paths. 
+SUBST_FILES.st-emacs-app= st-emacs +SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g' + +SUBST_CLASSES+= dictionary-app +SUBST_STAGE.dictionary-app= pre-configure +SUBST_MESSAGE.dictionary-app= Fixing dictionary paths. +SUBST_FILES.dictionary-app= st-emacs st-swath +SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g' + +pre-extract: + mkdir -p ${WRKSRC} + cd files && cp README.txt st-emacs st-icu.cc st-swath \ + thai-utility.el thaidict.abm ${WRKSRC} + +post-extract: + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ + -f batch-byte-compile thai-utility.el + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \ + --eval '(thai-word-table-save "emacs-dict")' + cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict + cd ${PREFIX}/share/swath && \ + ${UTF8_ENV} trietool swathdic list | \ + awk '{print $$1}' > ${WRKSRC}/swath-dict + cd ${WRKSRC} && \ + ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \ + grep -v '#' | sort | uniq > thaidict + cd ${WRKSRC} && \ + ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict +.for i in emacs-dict icu-dict swath-dict + @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i} +.endfor + @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \ + unique words in combined dictionary + +do-build: + cd ${WRKSRC} && \ + ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \ + `pkg-config --libs --cflags icu-io` + +do-install: + ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \ + ${DESTDIR}${PREFIX}/bin + ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin +.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri + ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai +.endfor + +.include "../../textproc/icu/buildlink3.mk" +.include "../../mk/bsd.pkg.mk" diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST new file mode 100644 index 00000000000..4a1bef0d833 --- /dev/null +++ b/textproc/split-thai/PLIST @@ -0,0 +1,9 @@ +@comment $NetBSD: PLIST,v 
1.1 2020/08/13 20:52:09 scole Exp $ +bin/st-emacs +bin/st-icu +bin/st-swath +share/split-thai/README.txt +share/split-thai/thai-utility.el +share/split-thai/thai-utility.elc +share/split-thai/thaidict +share/split-thai/thaidict.tri diff --git a/textproc/split-thai/distinfo b/textproc/split-thai/distinfo new file mode 100644 index 00000000000..e35ad9a15aa --- /dev/null +++ b/textproc/split-thai/distinfo @@ -0,0 +1,6 @@ +$NetBSD: distinfo,v 1.1 2020/08/13 20:52:09 scole Exp $ + +SHA1 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 2a2ad127cc279835cb4df04eb69401a0d4927774 +RMD160 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 0a6df7b7dd6ef502c5dd20020e37b2ca1a5514a2 +SHA512 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 88800fe2a453fc40f16ff54c21c852a8ea8e1496e42d5d187e5b5ac0ff58050830fc0816239e4f88cb23ed301f894d1ca52eb4676fd85c13c285cec815ae7c42 +Size (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 493044 bytes diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt new file mode 100644 index 00000000000..7b91f97fb9a --- /dev/null +++ b/textproc/split-thai/files/README.txt @@ -0,0 +1,49 @@ +This is a collection of utilities to separate Thai words by spaces +(word tokenization). They can separate stdin, files, or text as +arguments. 
It includes 3 separate utilities: + +st-emacs: emacs-script using emacs lisp thai-word library + https://www.gnu.org/software/emacs/ +st-icu: basic C++ program using the ICU library + http://site.icu-project.org/ +st-swath: sh script wrapper to simplify args to the swath program + https://linux.thai.net/projects/swath + +All scripts should be able to take a filename, stdin, or arguments as +input, e.g., : + + # st-swath แมวและหมา +or + # echo "แมวและหมา" | st-swath +or + # st-swath < thaifile.txt +or + # st-swath "แมวหมา" พ่อและแม่ + +You will most likely need to set LC_ALL or LC_CTYPE to an appropriate +unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for +them to work properly. These tools are set up to only support UTF-8 +encodings. + +Note that it is not possible to split Thai words 100% accurately +without context and meaning. These programs use dictionary-based word +splitting. + +Also included in the package is a combined thai word dictionary and +corresponding .tri file, and emacs lisp .el file for reading and +dumping out dictionary files. + +st-emacs and st-swath are set up to use the combined dictionary with +words from the emacs 'thai-word library, swath dictionary words, and +the icu thai library words. + +st-icu uses its own built in library. To customise the icu +dictionary, you apparently would have to modify + icu4c/source/data/brkitr/dictionaries/thaidict.txt +and rebuild icu library, and then rebuild the whole thing. + +There is also + +See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1) + +TODO - fix st-icu to use all the combined dictionary words. 
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs new file mode 100755 index 00000000000..eff41ab98ea --- /dev/null +++ b/textproc/split-thai/files/st-emacs @@ -0,0 +1,54 @@ +#!/bin/emacs --script +;; +;; break thai string into words separated by spaces +;; +;; - if no args, process stdin +;; - if one arg and file exists with arg name, process file +;; - else join get remainder of args and process +;; + +;;(toggle-debug-on-error) ;; debug +(require 'thai-word) + +;; load custom dictionary +(load "ST_SHARE_DIR/thai-utility" nil t) +(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict") + +;; split a thai line by spaces, return new line +(defun process-thai-line(line) + (with-temp-buffer + (insert line) + (goto-char (point-min)) + (thai-break-words " ") + (buffer-string))) + +;; hack to process stdin +(defun process-stdin() + (condition-case nil + (let (aline) + (while (setq aline (read-from-minibuffer "")) + (princ (process-thai-line aline)) + (princ "\n"))) + (error nil))) + +;; process arguments, remove "emacs -scriptload scriptname" from args, +;; join the rest by spaces +(setq args (cdddr command-line-args)) +(setq argc (length args)) + +;; no args => process stdin +(when (= 0 argc) + (process-stdin) + (kill-emacs 0)) + +;; if one arg and arg is a file, process that file +;; else process all input args joined by spaces with an added newline +(with-temp-buffer + (if (and (= 1 argc) (file-exists-p (car args))) + (insert-file-contents (car args)) + (insert (mapconcat 'identity (cdddr command-line-args) " ")) + (insert "\n")) + (goto-char (point-min)) + (thai-break-words " ") + (write-region nil nil "/dev/stdout")) +(kill-emacs 0) diff --git a/textproc/split-thai/files/st-icu.cc b/textproc/split-thai/files/st-icu.cc new file mode 100644 index 00000000000..8df3f3c2f2f --- /dev/null +++ b/textproc/split-thai/files/st-icu.cc @@ -0,0 +1,195 @@ +/* + * split up thai strings in a file, stdin or args into "words" + */ +#include <fstream> 
+#include <vector> + +#include <unicode/brkiter.h> +#include <unicode/regex.h> +#include <unicode/ucnv.h> +#include <unicode/ustream.h> +#include <unicode/ustdio.h> + +using namespace std; +using namespace icu; + +void usage() { + const char *progname = "st-icu"; + + cout << endl << + "Usage: " << progname << " [stdin|filename|thaiarg1 thaiarg2 ...]" << + endl << endl << + "This program attempts to split thai strings into thai words." << endl << + "It takes a filename, stdin, or UTF8 thai string(s) as arguments" << endl << + "and prints out the string separated by spaces." << endl << + "When no argument is given, it can read lines from stdin, and" << endl << + "separate thai words in the line by spaces." << endl << endl << + "returns 0 on succes, or non-zero otherwise" << endl << endl; +} + +// return true if string contains any thai unicode +bool contains_thai(const UnicodeString &s) { + UErrorCode status = U_ZERO_ERROR; + // matches one or more thai chars, \u0e01-\u0e5b should work too + RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status); + + if (U_FAILURE(status)) { + // syntax errors in the regular expression + cerr << "error creating RegexMatcher" << endl; + exit(1); + } + + matcher->reset(s); + if (matcher->find()) + return true; + else + return false; +} + +// split a unicode string by word boundaries. if arg contains +// whitespaces, it will get consolidated to single spaces. +// if string has no thai characters, return it unmodified +UnicodeString split_words_consolidated(const UnicodeString &s) { + if ( ! 
contains_thai(s) ) { + return s; + } + + UErrorCode status = U_ZERO_ERROR; + BreakIterator* wordBreaker = + BreakIterator::createWordInstance(Locale::getUS(), status); + if ( U_FAILURE(status) ) { + cerr << "error creating BreakIterator" << endl; + exit(1); + } + + wordBreaker->setText(s); + vector<int32_t> vbreak; + + int32_t pos = wordBreaker->first(); + while( pos != BreakIterator::DONE ) { + // cout << "boundary " << pos << endl; + vbreak.push_back(pos); + pos = wordBreaker->next(); + } + + // only one word found, trim and done + if ( vbreak.size() == 1 ) { + UnicodeString ss(s); + return ss.trim(); + } + + UnicodeString rs; + for (int i = 0 ; i < vbreak.size() - 1; i++) { + UnicodeString ss; + s.extractBetween(vbreak[i], vbreak[i+1], ss); + ss.trim(); + if ( ss != "" ) + rs += ss + " "; + } + + return rs.trim(); +} + +// split a unicode string by word boundaries trying to preserve +// original spacing +UnicodeString split_words(const UnicodeString &s) { + UnicodeString tempStr; + UnicodeString rs; + for (int i = 0 ; i < s.length() ; i++) { + if ( ! 
u_isUWhiteSpace(s[i]) ) { + tempStr += s[i]; + } else { + if ( tempStr.length() > 0 ) { + rs += split_words_consolidated(tempStr); + tempStr.remove(); + } + rs += s[i]; + } + } + if ( tempStr.length() > 0 ) + rs += split_words_consolidated(tempStr); + return rs; +} + +// split stdin +void split_stdin() { + UFILE *in = u_finit(stdin, NULL, NULL); + if ( !in ) { + cerr << "error: u_finit of stdin failed" << endl; + exit(1); + } + + UChar uch; + UnicodeString line; + while ( (uch = u_fgetc(in)) ) { + if ( uch == 0xffff ) { + break; + } else if ( uch == '\n' ) { + UnicodeString s(line); + cout << split_words(s) << endl; + line = ""; + } else { + line += uch; + } + } + + u_fclose(in); +} + +// read file line by line, spliting each line 1 at a time +void split_file(const char* filename) { + UFILE *in = u_fopen(filename, "r", NULL, NULL); + if ( !in ) { + cerr << "error: opening file " << filename << endl; + exit(1); + } + const int32_t maxLine = 1024; + UChar line[maxLine]; + while ( u_fgets(line, maxLine, in) != NULL ) { + //cout << split_words(line) << endl; + cout << split_words(line); + } + + u_fclose(in); +} + +// check if file is "readable" +bool is_readable(const char* fname) { + ifstream infile(fname); + return infile.good(); +} + +int main(int argc, char **argv) { + // utf8 for everything + ucnv_setDefaultName("UTF-8"); + + // read stdin when no args passed in + if ( argc <= 1 ) { + split_stdin(); + exit(0); + } + + // check second arg for help flag + UnicodeString arg2(argv[1]); + if ( arg2 == "-h" || arg2 == "-H" || arg2 == "-?" 
|| arg2 == "-help" ) { + usage(); + exit(0); + } + + // if only one arg and exists with arg name, process file + if ( argc == 2 && is_readable(argv[1]) ) { + split_file(argv[1]); + exit(0); + } + + // join remainder of args and process as string + UnicodeString inArgs; + for ( int i = 1 ; i < argc ; i++ ) { + UnicodeString s(argv[i]); + inArgs += s; + if ( i < (argc - 1) ) + inArgs += " "; + } + cout << split_words(inArgs) << endl; + exit(0); +} diff --git a/textproc/split-thai/files/st-swath b/textproc/split-thai/files/st-swath new file mode 100755 index 00000000000..52d8e17acf8 --- /dev/null +++ b/textproc/split-thai/files/st-swath @@ -0,0 +1,42 @@ +#!/bin/sh +# +# simple wrapper for swath to split thai text from stdin, arg, or a +# file +# +# swath settings are split with ' ', longest match, unicode input, and +# unicode output. see swath(1) +# + +# use merged dictionary unless specified otherwise +if [ -z "$SWATHDICT" ]; then + dictarg="-d ST_SHARE_DIR/thaidict.tri" +fi + +if [ "$#" -eq 0 ]; then + # no args, read from stdin + while read line + do + echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg + done < /dev/stdin + exit 0 +elif [ "$#" -eq 1 -a -e "$1" ]; then + # one arg and arg is an existing file + swath -b ' ' -m long -u 'u,u' $dictarg < "$1" + exit $? +elif [ "$#" -ge 1 ]; then + # one or more args, assume it is all text + while [ "$1" != "" ]; do + if [ -z "$txt" ]; then + txt="$1" + else + txt="$txt $1" + fi + + shift + done + echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg + exit $? 
+else + echo "$0: error parsing args" + exit 1 +fi diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el new file mode 100644 index 00000000000..914534f436f --- /dev/null +++ b/textproc/split-thai/files/thai-utility.el @@ -0,0 +1,97 @@ +(require 'mule-util) +(require 'thai-word) + +" nested-alist from mule-util looks like this: " +" '(3585 1 ;; ก word ก " +" (3591 1 ;; ง word กง " +" (3585 t ;; ก " +" (3634 t ;; า " +" (3619 1))));; ร word กงการ" +" (3585 1 ;; ก word กก " +" (3621 1)))) ;; ล word กกล " + +(defun extract-thai-na(nlist thaistr) + "helper function to reconstruct thai words from a nested alist, +uses recursion" + (let ((ucode) + (complete)) + (cond + ;; finished + ((not nlist) nil) + + ;; (3591 1 ... + ((integerp (car nlist)) + ;; xxx care about coding-system vars here? + (setq ucode (char-to-string (car nlist))) + (setq complete (cadr nlist)) + (setq thaistr (concat thaistr ucode)) + (cond + ;; t => no word at this depth + ((equal complete t) + (extract-thai-na (cddr nlist) thaistr)) + ;; 1 => word at this depth + ((equal complete 1) + (append (list thaistr) + (extract-thai-na (cddr nlist) thaistr) '())) + (t + (error "invalid parsing for complete var")))) + + ;; not finished + (t + (append (extract-thai-na (car nlist) thaistr) + (extract-thai-na (cdr nlist) thaistr) '()))))) + +(defun thai-word-table-save(filename &optional alist) + "save thai words extracted from a nested-alist table to +filename in utf8 format. default is to save 'thai-word-table if +no alist argument given." + (interactive) + (let ((thaiwords) + (elem) + (coding-system-for-read 'utf-8) + (coding-system-for-write 'utf-8) + (buffer-file-coding-system 'utf-8)) + ;; default list or not + (setq alist (or alist + thai-word-table)) + + (or (nested-alist-p alist) + (error "Invalid argument %s" alist)) + + ;; remove 'thai-words from 'thai-word-table + (setq alist (cdr alist)) + + (with-temp-buffer + ;; process per-letter list one at a time. 
could process whole + ;; list at once but maybe try to conserve memory resources + (while (setq elem (car alist)) + (setq alist (cdr alist)) + (setq thaiwords (extract-thai-na elem "")) + + (dolist (elem thaiwords) + (insert elem "\n"))) + + (sort-lines nil (point-min) (point-max)) + (write-region nil nil filename) + (buffer-string)))) + +;; 'thai-tis620 is default for emacs <= 28 +(defun thai-update-word-table-utf8 (file &optional append) + "Update Thai word table by replacing the current word list with +FILE, which is in utf-8. If called with a prefix argument, FILE +is appended instead to the current word list. Does the same as +'thai-update-word-table, except that function expects +'thai-tis620 encoding" + (interactive "FThai word table file: \nP") + (let* ((coding-system-for-read 'utf-8) + (coding-system-for-write 'utf-8) + (buffer-file-coding-system 'utf-8) + (temp_file (make-temp-file "thaiutf8_"))) + (unwind-protect + (with-temp-buffer + (insert-file-contents file) + (setq coding-system-for-write 'thai-tis620) + (write-file temp_file)) + (thai-update-word-table temp_file append) + (delete-file temp_file) + thai-word-table))) diff --git a/textproc/split-thai/files/thaidict.abm b/textproc/split-thai/files/thaidict.abm new file mode 100644 index 00000000000..c364c86918a --- /dev/null +++ b/textproc/split-thai/files/thaidict.abm @@ -0,0 +1,2 @@ +[0x002d,0x002e] +[0x0e01,0x0e5b] |