diff options
author | scole <scole@pkgsrc.org> | 2020-08-14 17:31:34 +0000 |
---|---|---|
committer | scole <scole@pkgsrc.org> | 2020-08-14 17:31:34 +0000 |
commit | 1bba9897f2a766bde9f50dfb4ec805e3d9c2e94a (patch) | |
tree | b6fa4b9d9a5a5a17f9ba7c2a8d13ffded7171513 /textproc | |
parent | e887163372569eddf2bf9ebdc7a7ca9af7bdf5e1 (diff) | |
download | pkgsrc-1bba9897f2a766bde9f50dfb4ec805e3d9c2e94a.tar.gz |
Update to version 0.2
- generate the emacs dictionary once at build time, not every time the
program is run
- clean up the README
Diffstat (limited to 'textproc')
-rw-r--r-- | textproc/split-thai/Makefile | 15 |
-rw-r--r-- | textproc/split-thai/PLIST | 4 |
-rw-r--r-- | textproc/split-thai/files/README.txt | 93 |
-rwxr-xr-x | textproc/split-thai/files/st-emacs | 2 |
-rw-r--r-- | textproc/split-thai/files/thai-utility.el | 45 |
5 files changed, 115 insertions, 44 deletions
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile index b332448c98b..76ae72dd071 100644 --- a/textproc/split-thai/Makefile +++ b/textproc/split-thai/Makefile @@ -1,6 +1,6 @@ -# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $ +# $NetBSD: Makefile,v 1.2 2020/08/14 17:31:34 scole Exp $ -PKGNAME= split-thai-0.1 +PKGNAME= split-thai-0.2 CATEGORIES= textproc MAINTAINER= pkgsrc-users@NetBSD.org COMMENT= Utilities to split UTF-8 Thai text into words @@ -26,6 +26,9 @@ UTF8_ENV= env LC_ALL=C.UTF-8 ST_SHARE_DIR= share/split-thai INSTALLATION_DIRS= bin ${ST_SHARE_DIR} +ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc +ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri + # xxx REPLACE_EMACS_SCRIPT SUBST_CLASSES+= st-emacs-app SUBST_STAGE.st-emacs-app= pre-configure @@ -47,7 +50,7 @@ pre-extract: post-extract: cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ -f batch-byte-compile thai-utility.el - cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \ + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \ --eval '(thai-word-table-save "emacs-dict")' cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict cd ${PREFIX}/share/swath && \ @@ -58,6 +61,10 @@ post-extract: grep -v '#' | sort | uniq > thaidict cd ${WRKSRC} && \ ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \ + --eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")' + cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ + -f batch-byte-compile thai-dict.el .for i in emacs-dict icu-dict swath-dict @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i} .endfor @@ -73,7 +80,7 @@ do-install: ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \ ${DESTDIR}${PREFIX}/bin ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin -.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri +.for i in ${ST_SHARE_FILES} ${INSTALL_DATA} ${WRKSRC}/${i} 
${DESTDIR}${PREFIX}/share/split-thai .endfor diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST index 4a1bef0d833..14269891613 100644 --- a/textproc/split-thai/PLIST +++ b/textproc/split-thai/PLIST @@ -1,8 +1,10 @@ -@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $ +@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $ bin/st-emacs bin/st-icu bin/st-swath share/split-thai/README.txt +share/split-thai/thai-dict.el +share/split-thai/thai-dict.elc share/split-thai/thai-utility.el share/split-thai/thai-utility.elc share/split-thai/thaidict diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt index 7b91f97fb9a..7480d4b4c2a 100644 --- a/textproc/split-thai/files/README.txt +++ b/textproc/split-thai/files/README.txt @@ -1,49 +1,70 @@ -This is a collection of utilities to separate Thai words by spaces -(word tokenization). They can separate stdin, files, or text as -arguments. It includes 3 separate utilities: +NAME + st-emacs + st-icu + st-swath -st-emacs: emacs-script using emacs lisp thai-word library - https://www.gnu.org/software/emacs/ -st-icu: basic C++ program using the ICU library - http://site.icu-project.org/ -st-swath: sh script wrapper to simplfy args to the swath program - https://linux.thai.net/projects/swath +SYNOPSIS + st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank'] -All scripts should be able to take a filename, stdin, or arguments as -input, e.g., : +DESCRIPTION + This package is a collection of utilities to separate Thai words + by spaces (word tokenization). They can separate stdin, files, + or text as arguments. 
It includes 3 separate utilities: + st-emacs: emacs-script using emacs lisp thai-word library + https://www.gnu.org/software/emacs/ + st-icu: basic C++ program using the ICU library + http://site.icu-project.org/ + st-swath: sh script wrapper to simplfy args to the swath program + https://linux.thai.net/projects/swath + +EXAMPLES + split one or more text strings # st-swath แมวและหมา -or - # echo "แมวและหมา" | st-swath -or - # st-swath < thaifile.txt -or # st-swath "แมวหมา" พ่อและแม่ -You will most likely need to set LC_ALL or LC_CTYPE to an approriate -unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for -them to work properly. These tools are setup to only support UTF-8 -encodings. + read stdin + # echo "แมวและหมา" | st-swath + + read from a file + # st-swath < thaifile.txt + # st-swath somefile.txt + + They can also read directly from stdin + # st-icu + แมวหมา (typed in) + แมว หมา (output line by line) + +ENVIRONMENT + You will most likely need to set the environment variables LC_ALL + or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or + C.UTF-8. These tools are only setup to handle UTF-8 encodings. -Note that it is not possible to split Thai words 100% accurately -without context and meaning. These programs use dictionary-based word -splitting. +EXIT STATUS + 0 for success, non zero otherwise -Also included in the package is a combined thai word dictionary and -corresponding .tri file, and emacs lisp .el file for reading and -dumping out dictionary files. +NOTES + Note that it is not possible to split Thai words 100% accurately + without context and meaning. All these programs use + dictionary-based word splitting. -st-emacs and st-swath are setup to use the combined dictionary with -words from the emacs 'thai-word library, swath dictionary words, and -the icu thai library words. 
+ Also included in the package is a combined thai word dictionary + and corresponding .tri file, and emacs lisp .el files for reading + and dumping out dictionary files. -st-icu uses its own built in library. To customise the icu -dictionary, you apparently would have to modify - icu4c/source/data/brkitr/dictionaries/thaidict.txt -and rebuild icu library, and then rebuild the whole thing. + st-emacs and st-swath are setup to use the combined dictionary + with words from the emacs 'thai-word library, swath dictionary + words, and the icu thai library words. -There is also + st-icu uses its own built in library. To customise the icu + dictionary, you apparently would have to modify + icu4c/source/data/brkitr/dictionaries/thaidict.txt and then + rebuild the whole library. -See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1) +SEE ALSO + swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1) -TODO - fix st-icu to use all the combined dictionary words. +BUGS + st-icu should also use the combined dictionary words. + st-emacs and st-icu don't always split thai numbers well. + this file should be converted to a proper manpage. 
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs index eff41ab98ea..693c90ab117 100755 --- a/textproc/split-thai/files/st-emacs +++ b/textproc/split-thai/files/st-emacs @@ -12,7 +12,7 @@ ;; load custom dictionary (load "ST_SHARE_DIR/thai-utility" nil t) -(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict") +(load "ST_SHARE_DIR/thai-dict" nil t) ;; split a thai line by spaces, return new line (defun process-thai-line(line) diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el index 914534f436f..61163248260 100644 --- a/textproc/split-thai/files/thai-utility.el +++ b/textproc/split-thai/files/thai-utility.el @@ -43,8 +43,8 @@ uses recursion" (defun thai-word-table-save(filename &optional alist) "save thai words extracted from a nested-alist table to -filename in utf8 format. default is to save 'thai-word-table if -no alist argument given." +filename in utf8 format, one word per line. default is to save +'thai-word-table if no alist argument given." (interactive) (let ((thaiwords) (elem) @@ -95,3 +95,44 @@ is appended instead to the current word list. Does the same as (thai-update-word-table temp_file append) (delete-file temp_file) thai-word-table))) + +(defun thai-word-table-save-defvar(dictfile lispfile) + "read a utf8 thai dictionary file and save to a lisp file +suitable for initializing the 'thai-word-table as a \"defvar\". +Overwrites the lisp file if it exists." 
+ (interactive) + (let ((header) + (footer) + (elem) + (coding-system-for-read 'utf-8) + (coding-system-for-write 'utf-8) + (buffer-file-coding-system 'utf-8)) + (setq header (list "(defvar thai-word-table" + "(let ((table (list 'thai-words)))" + "(dolist (elt" + "'(" )) + (setq footer (list "))" + "(set-nested-alist elt 1 table))" + "table)" + "\"Nested alist of Thai words.\")" )) + (with-temp-buffer + (insert-file-contents dictfile) + (goto-char (point-min)) + ;; quote each thai word + (while (not (eobp)) + (beginning-of-line) + (insert "\"") + (end-of-line) + (insert "\"") + (forward-line 1)) + + (goto-char (point-min)) + (dolist (elem header) + (insert elem "\n")) + + (goto-char (point-max)) + (dolist (elem footer) + (insert elem "\n")) + (lisp-mode) + (indent-region (point-min) (point-max)) + (write-region nil nil lispfile)))) |