author     scole <scole@pkgsrc.org>  2020-08-14 17:31:34 +0000
committer  scole <scole@pkgsrc.org>  2020-08-14 17:31:34 +0000
commit     1bba9897f2a766bde9f50dfb4ec805e3d9c2e94a (patch)
tree       b6fa4b9d9a5a5a17f9ba7c2a8d13ffded7171513 /textproc
parent     e887163372569eddf2bf9ebdc7a7ca9af7bdf5e1 (diff)
download   pkgsrc-1bba9897f2a766bde9f50dfb4ec805e3d9c2e94a.tar.gz
Update to version 0.2
- generate the emacs dictionary once at build time, not every time the program is run
- clean up the README
Diffstat (limited to 'textproc')
-rw-r--r--  textproc/split-thai/Makefile                15
-rw-r--r--  textproc/split-thai/PLIST                    4
-rw-r--r--  textproc/split-thai/files/README.txt        93
-rwxr-xr-x  textproc/split-thai/files/st-emacs           2
-rw-r--r--  textproc/split-thai/files/thai-utility.el   45
5 files changed, 115 insertions, 44 deletions
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile
index b332448c98b..76ae72dd071 100644
--- a/textproc/split-thai/Makefile
+++ b/textproc/split-thai/Makefile
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+# $NetBSD: Makefile,v 1.2 2020/08/14 17:31:34 scole Exp $
-PKGNAME= split-thai-0.1
+PKGNAME= split-thai-0.2
CATEGORIES= textproc
MAINTAINER= pkgsrc-users@NetBSD.org
COMMENT= Utilities to split UTF-8 Thai text into words
@@ -26,6 +26,9 @@ UTF8_ENV= env LC_ALL=C.UTF-8
ST_SHARE_DIR= share/split-thai
INSTALLATION_DIRS= bin ${ST_SHARE_DIR}
+ST_SHARE_FILES= README.txt thaidict thai-dict.el thai-dict.elc
+ST_SHARE_FILES+= thai-utility.el thai-utility.elc thaidict.tri
+
# xxx REPLACE_EMACS_SCRIPT
SUBST_CLASSES+= st-emacs-app
SUBST_STAGE.st-emacs-app= pre-configure
@@ -47,7 +50,7 @@ pre-extract:
post-extract:
cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
-f batch-byte-compile thai-utility.el
- cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
--eval '(thai-word-table-save "emacs-dict")'
cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
cd ${PREFIX}/share/swath && \
@@ -58,6 +61,10 @@ post-extract:
grep -v '#' | sort | uniq > thaidict
cd ${WRKSRC} && \
${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.elc \
+ --eval '(thai-word-table-save-defvar "thaidict" "thai-dict.el")'
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
+ -f batch-byte-compile thai-dict.el
.for i in emacs-dict icu-dict swath-dict
@${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
.endfor
@@ -73,7 +80,7 @@ do-install:
${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
${DESTDIR}${PREFIX}/bin
${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
-.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+.for i in ${ST_SHARE_FILES}
${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
.endfor
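
For reference, the two new post-extract commands amount to the following
batch-mode Emacs Lisp. This is only a rough sketch: it assumes it is run
from the work directory with LC_ALL=C.UTF-8 and uses the same filenames
as the Makefile.

    ;; load the helper library, dump the merged word list as a defvar
    ;; form, then byte-compile the result so st-emacs can load it quickly
    (load "./thai-utility.elc" nil t)
    (thai-word-table-save-defvar "thaidict" "thai-dict.el")
    (byte-compile-file "thai-dict.el")   ; writes thai-dict.elc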
diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST
index 4a1bef0d833..14269891613 100644
--- a/textproc/split-thai/PLIST
+++ b/textproc/split-thai/PLIST
@@ -1,8 +1,10 @@
-@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $
bin/st-emacs
bin/st-icu
bin/st-swath
share/split-thai/README.txt
+share/split-thai/thai-dict.el
+share/split-thai/thai-dict.elc
share/split-thai/thai-utility.el
share/split-thai/thai-utility.elc
share/split-thai/thaidict
diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt
index 7b91f97fb9a..7480d4b4c2a 100644
--- a/textproc/split-thai/files/README.txt
+++ b/textproc/split-thai/files/README.txt
@@ -1,49 +1,70 @@
-This is a collection of utilities to separate Thai words by spaces
-(word tokenization). They can separate stdin, files, or text as
-arguments. It includes 3 separate utilities:
+NAME
+ st-emacs
+ st-icu
+ st-swath
-st-emacs: emacs-script using emacs lisp thai-word library
- https://www.gnu.org/software/emacs/
-st-icu: basic C++ program using the ICU library
- http://site.icu-project.org/
-st-swath: sh script wrapper to simplfy args to the swath program
- https://linux.thai.net/projects/swath
+SYNOPSIS
+ st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
-All scripts should be able to take a filename, stdin, or arguments as
-input, e.g., :
+DESCRIPTION
+ This package is a collection of utilities to separate Thai words
+ by spaces (word tokenization). They can separate stdin, files,
+ or text as arguments. It includes 3 separate utilities:
+ st-emacs: emacs-script using emacs lisp thai-word library
+ https://www.gnu.org/software/emacs/
+ st-icu: basic C++ program using the ICU library
+ http://site.icu-project.org/
+ st-swath: sh script wrapper to simplify args to the swath program
+ https://linux.thai.net/projects/swath
+
+EXAMPLES
+ split one or more text strings
# st-swath แมวและหมา
-or
- # echo "แมวและหมา" | st-swath
-or
- # st-swath < thaifile.txt
-or
# st-swath "แมวหมา" พ่อและแม่
-You will most likely need to set LC_ALL or LC_CTYPE to an approriate
-unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
-them to work properly. These tools are setup to only support UTF-8
-encodings.
+ read stdin
+ # echo "แมวและหมา" | st-swath
+
+ read from a file
+ # st-swath < thaifile.txt
+ # st-swath somefile.txt
+
+ They can also read directly from stdin
+ # st-icu
+ แมวหมา (typed in)
+ แมว หมา (output line by line)
+
+ENVIRONMENT
+ You will most likely need to set the environment variables LC_ALL
+ or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or
+ C.UTF-8. These tools are only set up to handle UTF-8 encodings.
-Note that it is not possible to split Thai words 100% accurately
-without context and meaning. These programs use dictionary-based word
-splitting.
+EXIT STATUS
+ 0 for success, non-zero otherwise
-Also included in the package is a combined thai word dictionary and
-corresponding .tri file, and emacs lisp .el file for reading and
-dumping out dictionary files.
+NOTES
+ Note that it is not possible to split Thai words 100% accurately
+ without context and meaning. All these programs use
+ dictionary-based word splitting.
-st-emacs and st-swath are setup to use the combined dictionary with
-words from the emacs 'thai-word library, swath dictionary words, and
-the icu thai library words.
+ Also included in the package is a combined thai word dictionary
+ and corresponding .tri file, and emacs lisp .el files for reading
+ and dumping out dictionary files.
-st-icu uses its own built in library. To customise the icu
-dictionary, you apparently would have to modify
- icu4c/source/data/brkitr/dictionaries/thaidict.txt
-and rebuild icu library, and then rebuild the whole thing.
+ st-emacs and st-swath are set up to use the combined dictionary
+ with words from the emacs 'thai-word library, swath dictionary
+ words, and the icu thai library words.
-There is also
+ st-icu uses its own built-in library. To customise the icu
+ dictionary, you apparently would have to modify
+ icu4c/source/data/brkitr/dictionaries/thaidict.txt and then
+ rebuild the whole library.
-See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+SEE ALSO
+ swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
-TODO - fix st-icu to use all the combined dictionary words.
+BUGS
+ st-icu should also use the combined dictionary words.
+ st-emacs and st-icu don't always split thai numbers well.
+ this file should be converted to a proper manpage.
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs
index eff41ab98ea..693c90ab117 100755
--- a/textproc/split-thai/files/st-emacs
+++ b/textproc/split-thai/files/st-emacs
@@ -12,7 +12,7 @@
;; load custom dictionary
(load "ST_SHARE_DIR/thai-utility" nil t)
-(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
+(load "ST_SHARE_DIR/thai-dict" nil t)
;; split a thai line by spaces, return new line
(defun process-thai-line(line)
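
The st-emacs change above swaps the run-time dictionary parse for a plain
load of the table precompiled at build time. If the precompiled file were
ever missing, a fallback along these lines would restore the old behaviour
(a sketch only, not part of the script; ST_SHARE_DIR is the token the
Makefile substitutes at build time):

    ;; prefer the dictionary precompiled at build time; otherwise
    ;; rebuild the word table from the plain word list at run time
    (if (or (file-readable-p "ST_SHARE_DIR/thai-dict.elc")
            (file-readable-p "ST_SHARE_DIR/thai-dict.el"))
        (load "ST_SHARE_DIR/thai-dict" nil t)
      (thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict"))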
diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el
index 914534f436f..61163248260 100644
--- a/textproc/split-thai/files/thai-utility.el
+++ b/textproc/split-thai/files/thai-utility.el
@@ -43,8 +43,8 @@ uses recursion"
(defun thai-word-table-save(filename &optional alist)
"save thai words extracted from a nested-alist table to
-filename in utf8 format. default is to save 'thai-word-table if
-no alist argument given."
+filename in utf8 format, one word per line. default is to save
+'thai-word-table if no alist argument given."
(interactive)
(let ((thaiwords)
(elem)
@@ -95,3 +95,44 @@ is appended instead to the current word list. Does the same as
(thai-update-word-table temp_file append)
(delete-file temp_file)
thai-word-table)))
+
+(defun thai-word-table-save-defvar(dictfile lispfile)
+ "read a utf8 thai dictionary file and save to a lisp file
+suitable for initializing the 'thai-word-table as a \"defvar\".
+Overwrites the lisp file if it exists."
+ (interactive)
+ (let ((header)
+ (footer)
+ (elem)
+ (coding-system-for-read 'utf-8)
+ (coding-system-for-write 'utf-8)
+ (buffer-file-coding-system 'utf-8))
+ (setq header (list "(defvar thai-word-table"
+ "(let ((table (list 'thai-words)))"
+ "(dolist (elt"
+ "'(" ))
+ (setq footer (list "))"
+ "(set-nested-alist elt 1 table))"
+ "table)"
+ "\"Nested alist of Thai words.\")" ))
+ (with-temp-buffer
+ (insert-file-contents dictfile)
+ (goto-char (point-min))
+ ;; quote each thai word
+ (while (not (eobp))
+ (beginning-of-line)
+ (insert "\"")
+ (end-of-line)
+ (insert "\"")
+ (forward-line 1))
+
+ (goto-char (point-min))
+ (dolist (elem header)
+ (insert elem "\n"))
+
+ (goto-char (point-max))
+ (dolist (elem footer)
+ (insert elem "\n"))
+ (lisp-mode)
+ (indent-region (point-min) (point-max))
+ (write-region nil nil lispfile))))
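
Given the header and footer strings above, the thai-dict.el file written by
thai-word-table-save-defvar should come out roughly like this (a sketch with
two placeholder words; the real file lists the whole merged dictionary):

    ;; approximate shape of the generated thai-dict.el
    (defvar thai-word-table
      (let ((table (list 'thai-words)))
        (dolist (elt
                 '("แมว"
                   "หมา"))
          (set-nested-alist elt 1 table))
        table)
      "Nested alist of Thai words.")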