summaryrefslogtreecommitdiff
path: root/textproc
diff options
context:
space:
mode:
authorscole <scole@pkgsrc.org>2020-08-13 20:52:08 +0000
committerscole <scole@pkgsrc.org>2020-08-13 20:52:08 +0000
commitb07548647c6d46aaebfeef8c155cdf784f9e87e8 (patch)
treec06d7ce614ed4894cd6746e47a3d746758d5135b /textproc
parenta95644a520c61d223d2bfb0c621815d947551513 (diff)
downloadpkgsrc-b07548647c6d46aaebfeef8c155cdf784f9e87e8.tar.gz
Add split-thai 0.1, a set of utilities for splitting Thai UTF8 text by word boundaries
Diffstat (limited to 'textproc')
-rw-r--r--textproc/Makefile3
-rw-r--r--textproc/split-thai/DESCR6
-rw-r--r--textproc/split-thai/Makefile81
-rw-r--r--textproc/split-thai/PLIST9
-rw-r--r--textproc/split-thai/distinfo6
-rw-r--r--textproc/split-thai/files/README.txt49
-rwxr-xr-xtextproc/split-thai/files/st-emacs54
-rw-r--r--textproc/split-thai/files/st-icu.cc195
-rwxr-xr-xtextproc/split-thai/files/st-swath42
-rw-r--r--textproc/split-thai/files/thai-utility.el97
-rw-r--r--textproc/split-thai/files/thaidict.abm2
11 files changed, 543 insertions, 1 deletions
diff --git a/textproc/Makefile b/textproc/Makefile
index 56de3c8cfc1..4bb62b1f09f 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.1164 2020/08/07 02:36:24 brook Exp $
+# $NetBSD: Makefile,v 1.1165 2020/08/13 20:52:08 scole Exp $
#
COMMENT= Text processing utilities (does not include desktop publishing)
@@ -1099,6 +1099,7 @@ SUBDIR+= soprano
SUBDIR+= sord
SUBDIR+= source-highlight
SUBDIR+= sphinxsearch
+SUBDIR+= split-thai
SUBDIR+= stardic
SUBDIR+= sub2srt
SUBDIR+= sublib
diff --git a/textproc/split-thai/DESCR b/textproc/split-thai/DESCR
new file mode 100644
index 00000000000..5dba7ad4417
--- /dev/null
+++ b/textproc/split-thai/DESCR
@@ -0,0 +1,6 @@
+A collection of utilities to split Thai Unicode UTF-8 text by word
+boundaries, also known as word tokenization. The utilities use emacs,
+swath, and a c++ icu-project program. All use dictionary-based word
+splitting.
+
+Also included is a merged dictionary file of Thai words.
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile
new file mode 100644
index 00000000000..b332448c98b
--- /dev/null
+++ b/textproc/split-thai/Makefile
@@ -0,0 +1,81 @@
+# $NetBSD: Makefile,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+PKGNAME= split-thai-0.1
+CATEGORIES= textproc
+MAINTAINER= pkgsrc-users@NetBSD.org
+COMMENT= Utilities to split UTF-8 Thai text into words
+LICENSE= public-domain AND mit AND gnu-gpl-v2 # code, icu dict, swath dict
+
+# xxx fetching a specific version of a file out of a github project
+EXTRACT_SUFX= # none
+GITHUB_ICU_TAG= 61607c27732906d36c5bd4d23ecc092f89f53a2b
+DISTFILES= thaidict-${GITHUB_ICU_TAG}.txt
+MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu4c/source/data/brkitr/dictionaries/thaidict.txt
+
+USE_LANGUAGES= c++11 # darwin needed 11?
+
+USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
+BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
+DEPENDS+= emacs-[0-9]*:../../editors/emacs
+DEPENDS+= swath-[0-9]*:../../textproc/swath
+
+REPLACE_SH= st-swath
+
+UTF8_ENV= env LC_ALL=C.UTF-8
+
+ST_SHARE_DIR= share/split-thai
+INSTALLATION_DIRS= bin ${ST_SHARE_DIR}
+
+# xxx REPLACE_EMACS_SCRIPT
+SUBST_CLASSES+= st-emacs-app
+SUBST_STAGE.st-emacs-app= pre-configure
+SUBST_MESSAGE.st-emacs-app= Fixing emacs script paths.
+SUBST_FILES.st-emacs-app= st-emacs
+SUBST_SED.st-emacs-app= -e 's,!/bin/emacs,!${PREFIX}/bin/emacs,g'
+
+SUBST_CLASSES+= dictionary-app
+SUBST_STAGE.dictionary-app= pre-configure
+SUBST_MESSAGE.dictionary-app= Fixing dictionary paths.
+SUBST_FILES.dictionary-app= st-emacs st-swath
+SUBST_SED.dictionary-app= -e 's,ST_SHARE_DIR,${PREFIX}/${ST_SHARE_DIR},g'
+
+pre-extract:
+ mkdir -p ${WRKSRC}
+ cd files && cp README.txt st-emacs st-icu.cc st-swath \
+ thai-utility.el thaidict.abm ${WRKSRC}
+
+post-extract:
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
+ -f batch-byte-compile thai-utility.el
+ cd ${WRKSRC} && ${UTF8_ENV} emacs --batch -l thai-utility.el \
+ --eval '(thai-word-table-save "emacs-dict")'
+ cp ${WRKDIR}/${DISTFILES} ${WRKSRC}/icu-dict
+ cd ${PREFIX}/share/swath && \
+ ${UTF8_ENV} trietool swathdic list | \
+ awk '{print $$1}' > ${WRKSRC}/swath-dict
+ cd ${WRKSRC} && \
+ ${UTF8_ENV} cat icu-dict swath-dict emacs-dict | \
+ grep -v '#' | sort | uniq > thaidict
+ cd ${WRKSRC} && \
+ ${UTF8_ENV} trietool thaidict add-list -e utf-8 thaidict
+.for i in emacs-dict icu-dict swath-dict
+ @${ECHO} `wc -l ${WRKSRC}/${i} | awk '{print $$1}'` words in ${i}
+.endfor
+ @${ECHO} `wc -l ${WRKSRC}/thaidict | awk '{print $$1}'` \
+ unique words in combined dictionary
+
+do-build:
+ cd ${WRKSRC} && \
+ ${CXX} ${CPPFLAGS} -o st-icu st-icu.cc \
+ `pkg-config --libs --cflags icu-io`
+
+do-install:
+ ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
+ ${DESTDIR}${PREFIX}/bin
+ ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
+.for i in README.txt thaidict thai-utility.el thai-utility.elc thaidict.tri
+ ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
+.endfor
+
+.include "../../textproc/icu/buildlink3.mk"
+.include "../../mk/bsd.pkg.mk"
diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST
new file mode 100644
index 00000000000..4a1bef0d833
--- /dev/null
+++ b/textproc/split-thai/PLIST
@@ -0,0 +1,9 @@
+@comment $NetBSD: PLIST,v 1.1 2020/08/13 20:52:09 scole Exp $
+bin/st-emacs
+bin/st-icu
+bin/st-swath
+share/split-thai/README.txt
+share/split-thai/thai-utility.el
+share/split-thai/thai-utility.elc
+share/split-thai/thaidict
+share/split-thai/thaidict.tri
diff --git a/textproc/split-thai/distinfo b/textproc/split-thai/distinfo
new file mode 100644
index 00000000000..e35ad9a15aa
--- /dev/null
+++ b/textproc/split-thai/distinfo
@@ -0,0 +1,6 @@
+$NetBSD: distinfo,v 1.1 2020/08/13 20:52:09 scole Exp $
+
+SHA1 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 2a2ad127cc279835cb4df04eb69401a0d4927774
+RMD160 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 0a6df7b7dd6ef502c5dd20020e37b2ca1a5514a2
+SHA512 (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 88800fe2a453fc40f16ff54c21c852a8ea8e1496e42d5d187e5b5ac0ff58050830fc0816239e4f88cb23ed301f894d1ca52eb4676fd85c13c285cec815ae7c42
+Size (thaidict-61607c27732906d36c5bd4d23ecc092f89f53a2b.txt) = 493044 bytes
diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt
new file mode 100644
index 00000000000..7b91f97fb9a
--- /dev/null
+++ b/textproc/split-thai/files/README.txt
@@ -0,0 +1,49 @@
+This is a collection of utilities to separate Thai words by spaces
+(word tokenization). They can separate stdin, files, or text as
+arguments. It includes 3 separate utilities:
+
+st-emacs: emacs-script using emacs lisp thai-word library
+ https://www.gnu.org/software/emacs/
+st-icu: basic C++ program using the ICU library
+ http://site.icu-project.org/
+st-swath: sh script wrapper to simplify args to the swath program
+ https://linux.thai.net/projects/swath
+
+All scripts should be able to take a filename, stdin, or arguments as
+input, e.g., :
+
+ # st-swath แมวและหมา
+or
+ # echo "แมวและหมา" | st-swath
+or
+ # st-swath < thaifile.txt
+or
+ # st-swath "แมวหมา" พ่อและแม่
+
+You will most likely need to set LC_ALL or LC_CTYPE to an appropriate
+unicode value, e.g., en_US.UTF-8 or C.UTF-8, in the environment for
+them to work properly. These tools are setup to only support UTF-8
+encodings.
+
+Note that it is not possible to split Thai words 100% accurately
+without context and meaning. These programs use dictionary-based word
+splitting.
+
+Also included in the package is a combined thai word dictionary and
+corresponding .tri file, and emacs lisp .el file for reading and
+dumping out dictionary files.
+
+st-emacs and st-swath are setup to use the combined dictionary with
+words from the emacs 'thai-word library, swath dictionary words, and
+the icu thai library words.
+
+st-icu uses its own built in library. To customise the icu
+dictionary, you apparently would have to modify
+ icu4c/source/data/brkitr/dictionaries/thaidict.txt
+and rebuild icu library, and then rebuild the whole thing.
+
+There is also
+
+See also swath(1), libthai(1), emacs(1), locale(1), uconv(1), iconv(1)
+
+TODO - fix st-icu to use all the combined dictionary words.
diff --git a/textproc/split-thai/files/st-emacs b/textproc/split-thai/files/st-emacs
new file mode 100755
index 00000000000..eff41ab98ea
--- /dev/null
+++ b/textproc/split-thai/files/st-emacs
@@ -0,0 +1,54 @@
+#!/bin/emacs --script
+;;
+;; break thai string into words separated by spaces
+;;
+;; - if no args, process stdin
+;; - if one arg and file exists with arg name, process file
+;; - else join get remainder of args and process
+;;
+
+;;(toggle-debug-on-error) ;; debug
+(require 'thai-word)
+
+;; load custom dictionary
+(load "ST_SHARE_DIR/thai-utility" nil t)
+(thai-update-word-table-utf8 "ST_SHARE_DIR/thaidict")
+
+;; split a thai line by spaces, return new line
+(defun process-thai-line(line)
+ (with-temp-buffer
+ (insert line)
+ (goto-char (point-min))
+ (thai-break-words " ")
+ (buffer-string)))
+
+;; hack to process stdin
+(defun process-stdin()
+ (condition-case nil
+ (let (aline)
+ (while (setq aline (read-from-minibuffer ""))
+ (princ (process-thai-line aline))
+ (princ "\n")))
+ (error nil)))
+
+;; process arguments, remove "emacs -scriptload scriptname" from args,
+;; join the rest by spaces
+(setq args (cdddr command-line-args))
+(setq argc (length args))
+
+;; no args => process stdin
+(when (= 0 argc)
+ (process-stdin)
+ (kill-emacs 0))
+
+;; if one arg and arg is a file, process that file
+;; else process all input args joined by spaces with an added newline
+(with-temp-buffer
+ (if (and (= 1 argc) (file-exists-p (car args)))
+ (insert-file-contents (car args))
+ (insert (mapconcat 'identity (cdddr command-line-args) " "))
+ (insert "\n"))
+ (goto-char (point-min))
+ (thai-break-words " ")
+ (write-region nil nil "/dev/stdout"))
+(kill-emacs 0)
diff --git a/textproc/split-thai/files/st-icu.cc b/textproc/split-thai/files/st-icu.cc
new file mode 100644
index 00000000000..8df3f3c2f2f
--- /dev/null
+++ b/textproc/split-thai/files/st-icu.cc
@@ -0,0 +1,195 @@
+/*
+ * split up thai strings in a file, stdin or args into "words"
+ */
+#include <fstream>
+#include <vector>
+
+#include <unicode/brkiter.h>
+#include <unicode/regex.h>
+#include <unicode/ucnv.h>
+#include <unicode/ustream.h>
+#include <unicode/ustdio.h>
+
+using namespace std;
+using namespace icu;
+
+void usage() {
+ const char *progname = "st-icu";
+
+ cout << endl <<
+ "Usage: " << progname << " [stdin|filename|thaiarg1 thaiarg2 ...]" <<
+ endl << endl <<
+ "This program attempts to split thai strings into thai words." << endl <<
+ "It takes a filename, stdin, or UTF8 thai string(s) as arguments" << endl <<
+ "and prints out the string separated by spaces." << endl <<
+ "When no argument is given, it can read lines from stdin, and" << endl <<
+ "separate thai words in the line by spaces." << endl << endl <<
+    "returns 0 on success, or non-zero otherwise" << endl << endl;
+}
+
+// return true if string contains any thai unicode
+bool contains_thai(const UnicodeString &s) {
+ UErrorCode status = U_ZERO_ERROR;
+ // matches one or more thai chars, \u0e01-\u0e5b should work too
+ RegexMatcher *matcher = new RegexMatcher("[\u0e00-\u0e7f]+", 0, status);
+
+ if (U_FAILURE(status)) {
+ // syntax errors in the regular expression
+ cerr << "error creating RegexMatcher" << endl;
+ exit(1);
+ }
+
+ matcher->reset(s);
+ if (matcher->find())
+ return true;
+ else
+ return false;
+}
+
+// split a unicode string by word boundaries. if arg contains
+// whitespaces, it will get consolidated to single spaces.
+// if string has no thai characters, return it unmodified
+UnicodeString split_words_consolidated(const UnicodeString &s) {
+ if ( ! contains_thai(s) ) {
+ return s;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ BreakIterator* wordBreaker =
+ BreakIterator::createWordInstance(Locale::getUS(), status);
+ if ( U_FAILURE(status) ) {
+ cerr << "error creating BreakIterator" << endl;
+ exit(1);
+ }
+
+ wordBreaker->setText(s);
+ vector<int32_t> vbreak;
+
+ int32_t pos = wordBreaker->first();
+ while( pos != BreakIterator::DONE ) {
+ // cout << "boundary " << pos << endl;
+ vbreak.push_back(pos);
+ pos = wordBreaker->next();
+ }
+
+ // only one word found, trim and done
+ if ( vbreak.size() == 1 ) {
+ UnicodeString ss(s);
+ return ss.trim();
+ }
+
+ UnicodeString rs;
+ for (int i = 0 ; i < vbreak.size() - 1; i++) {
+ UnicodeString ss;
+ s.extractBetween(vbreak[i], vbreak[i+1], ss);
+ ss.trim();
+ if ( ss != "" )
+ rs += ss + " ";
+ }
+
+ return rs.trim();
+}
+
+// split a unicode string by word boundaries trying to preserve
+// original spacing
+UnicodeString split_words(const UnicodeString &s) {
+ UnicodeString tempStr;
+ UnicodeString rs;
+ for (int i = 0 ; i < s.length() ; i++) {
+ if ( ! u_isUWhiteSpace(s[i]) ) {
+ tempStr += s[i];
+ } else {
+ if ( tempStr.length() > 0 ) {
+ rs += split_words_consolidated(tempStr);
+ tempStr.remove();
+ }
+ rs += s[i];
+ }
+ }
+ if ( tempStr.length() > 0 )
+ rs += split_words_consolidated(tempStr);
+ return rs;
+}
+
+// split stdin
+void split_stdin() {
+ UFILE *in = u_finit(stdin, NULL, NULL);
+ if ( !in ) {
+ cerr << "error: u_finit of stdin failed" << endl;
+ exit(1);
+ }
+
+ UChar uch;
+ UnicodeString line;
+ while ( (uch = u_fgetc(in)) ) {
+ if ( uch == 0xffff ) {
+ break;
+ } else if ( uch == '\n' ) {
+ UnicodeString s(line);
+ cout << split_words(s) << endl;
+ line = "";
+ } else {
+ line += uch;
+ }
+ }
+
+ u_fclose(in);
+}
+
+// read file line by line, splitting each line 1 at a time
+void split_file(const char* filename) {
+ UFILE *in = u_fopen(filename, "r", NULL, NULL);
+ if ( !in ) {
+ cerr << "error: opening file " << filename << endl;
+ exit(1);
+ }
+ const int32_t maxLine = 1024;
+ UChar line[maxLine];
+ while ( u_fgets(line, maxLine, in) != NULL ) {
+ //cout << split_words(line) << endl;
+ cout << split_words(line);
+ }
+
+ u_fclose(in);
+}
+
+// check if file is "readable"
+bool is_readable(const char* fname) {
+ ifstream infile(fname);
+ return infile.good();
+}
+
+int main(int argc, char **argv) {
+ // utf8 for everything
+ ucnv_setDefaultName("UTF-8");
+
+ // read stdin when no args passed in
+ if ( argc <= 1 ) {
+ split_stdin();
+ exit(0);
+ }
+
+ // check second arg for help flag
+ UnicodeString arg2(argv[1]);
+ if ( arg2 == "-h" || arg2 == "-H" || arg2 == "-?" || arg2 == "-help" ) {
+ usage();
+ exit(0);
+ }
+
+ // if only one arg and exists with arg name, process file
+ if ( argc == 2 && is_readable(argv[1]) ) {
+ split_file(argv[1]);
+ exit(0);
+ }
+
+ // join remainder of args and process as string
+ UnicodeString inArgs;
+ for ( int i = 1 ; i < argc ; i++ ) {
+ UnicodeString s(argv[i]);
+ inArgs += s;
+ if ( i < (argc - 1) )
+ inArgs += " ";
+ }
+ cout << split_words(inArgs) << endl;
+ exit(0);
+}
diff --git a/textproc/split-thai/files/st-swath b/textproc/split-thai/files/st-swath
new file mode 100755
index 00000000000..52d8e17acf8
--- /dev/null
+++ b/textproc/split-thai/files/st-swath
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# simple wrapper for swath to split thai text from stdin, arg, or a
+# file
+#
+# swath settings are split with ' ', longest match, unicode input, and
+# unicode output. see swath(1)
+#
+
+# use merged dictionary unless specified otherwise
+if [ -z "$SWATHDICT" ]; then
+ dictarg="-d ST_SHARE_DIR/thaidict.tri"
+fi
+
+if [ "$#" -eq 0 ]; then
+ # no args, read from stdin
+ while read line
+ do
+ echo "$line" | swath -b ' ' -m long -u 'u,u' $dictarg
+ done < /dev/stdin
+ exit 0
+elif [ "$#" -eq 1 -a -e "$1" ]; then
+ # one arg and arg is an existing file
+ swath -b ' ' -m long -u 'u,u' $dictarg < "$1"
+ exit $?
+elif [ "$#" -ge 1 ]; then
+ # one or more args, assume it is all text
+ while [ "$1" != "" ]; do
+ if [ -z "$txt" ]; then
+ txt="$1"
+ else
+ txt="$txt $1"
+ fi
+
+ shift
+ done
+ echo "$txt" | swath -b ' ' -m long -u 'u,u' $dictarg
+ exit $?
+else
+ echo "$0: error parsing args"
+ exit 1
+fi
diff --git a/textproc/split-thai/files/thai-utility.el b/textproc/split-thai/files/thai-utility.el
new file mode 100644
index 00000000000..914534f436f
--- /dev/null
+++ b/textproc/split-thai/files/thai-utility.el
@@ -0,0 +1,97 @@
+(require 'mule-util)
+(require 'thai-word)
+
+" nested-alist from mule-util looks like this: "
+" '(3585 1 ;; ก word ก "
+" (3591 1 ;; ง word กง "
+" (3585 t ;; ก "
+" (3634 t ;; า "
+" (3619 1))));; ร word กงการ"
+" (3585 1 ;; ก word กก "
+" (3621 1)))) ;; ล word กกล "
+
+(defun extract-thai-na(nlist thaistr)
+ "helper function to reconstruct thai words from a nested alist,
+uses recursion"
+ (let ((ucode)
+ (complete))
+ (cond
+ ;; finished
+ ((not nlist) nil)
+
+ ;; (3591 1 ...
+ ((integerp (car nlist))
+ ;; xxx care about coding-system vars here?
+ (setq ucode (char-to-string (car nlist)))
+ (setq complete (cadr nlist))
+ (setq thaistr (concat thaistr ucode))
+ (cond
+ ;; t => no word at this depth
+ ((equal complete t)
+ (extract-thai-na (cddr nlist) thaistr))
+ ;; 1 => word at this depth
+ ((equal complete 1)
+ (append (list thaistr)
+ (extract-thai-na (cddr nlist) thaistr) '()))
+ (t
+ (error "invalid parsing for complete var"))))
+
+ ;; not finished
+ (t
+ (append (extract-thai-na (car nlist) thaistr)
+ (extract-thai-na (cdr nlist) thaistr) '())))))
+
+(defun thai-word-table-save(filename &optional alist)
+ "save thai words extracted from a nested-alist table to
+filename in utf8 format. default is to save 'thai-word-table if
+no alist argument given."
+ (interactive)
+ (let ((thaiwords)
+ (elem)
+ (coding-system-for-read 'utf-8)
+ (coding-system-for-write 'utf-8)
+ (buffer-file-coding-system 'utf-8))
+ ;; default list or not
+ (setq alist (or alist
+ thai-word-table))
+
+ (or (nested-alist-p alist)
+ (error "Invalid argument %s" alist))
+
+ ;; remove 'thai-words from 'thai-word-table
+ (setq alist (cdr alist))
+
+ (with-temp-buffer
+ ;; process per-letter list one at a time. could process whole
+ ;; list at once but maybe try to conserve memory resources
+ (while (setq elem (car alist))
+ (setq alist (cdr alist))
+ (setq thaiwords (extract-thai-na elem ""))
+
+ (dolist (elem thaiwords)
+ (insert elem "\n")))
+
+ (sort-lines nil (point-min) (point-max))
+ (write-region nil nil filename)
+ (buffer-string))))
+
+;; 'thai-tis620 is default for emacs <= 28
+(defun thai-update-word-table-utf8 (file &optional append)
+ "Update Thai word table by replacing the current word list with
+FILE, which is in utf-8. If called with a prefix argument, FILE
+is appended instead to the current word list. Does the same as
+'thai-update-word-table, except that function expects
+'thai-tis620 encoding"
+ (interactive "FThai word table file: \nP")
+ (let* ((coding-system-for-read 'utf-8)
+ (coding-system-for-write 'utf-8)
+ (buffer-file-coding-system 'utf-8)
+ (temp_file (make-temp-file "thaiutf8_")))
+ (unwind-protect
+ (with-temp-buffer
+ (insert-file-contents file)
+ (setq coding-system-for-write 'thai-tis620)
+ (write-file temp_file))
+ (thai-update-word-table temp_file append)
+ (delete-file temp_file)
+ thai-word-table)))
diff --git a/textproc/split-thai/files/thaidict.abm b/textproc/split-thai/files/thaidict.abm
new file mode 100644
index 00000000000..c364c86918a
--- /dev/null
+++ b/textproc/split-thai/files/thaidict.abm
@@ -0,0 +1,2 @@
+[0x002d,0x002e]
+[0x0e01,0x0e5b]