summaryrefslogtreecommitdiff
path: root/textproc
diff options
context:
space:
mode:
authorscole <scole@pkgsrc.org>2020-08-28 16:02:42 +0000
committerscole <scole@pkgsrc.org>2020-08-28 16:02:42 +0000
commit3374708ce44e38a9a92ed3a857a057fb49dcdd7e (patch)
treedde1d59e9cc0af1817e085f41d6f9f15f29562bf /textproc
parent982fe04bf99c79bb3602999d5d8f7b00ad95cb90 (diff)
downloadpkgsrc-3374708ce44e38a9a92ed3a857a057fb49dcdd7e.tar.gz
Update to 0.8
- add 'tgrep' perl script for grepping thai words
Diffstat (limited to 'textproc')
-rw-r--r--textproc/split-thai/DESCR3
-rw-r--r--textproc/split-thai/Makefile10
-rw-r--r--textproc/split-thai/PLIST3
-rw-r--r--textproc/split-thai/files/README.txt27
-rwxr-xr-xtextproc/split-thai/files/tgrep208
5 files changed, 237 insertions, 14 deletions
diff --git a/textproc/split-thai/DESCR b/textproc/split-thai/DESCR
index 5dba7ad4417..2557d7021c5 100644
--- a/textproc/split-thai/DESCR
+++ b/textproc/split-thai/DESCR
@@ -3,4 +3,5 @@ boundaries, also known as word tokenization. The utilities use emacs,
swath, and a c++ icu-project program. All use dictionary-based word
splitting.
-Also included is merged dictionary file of thai words.
+Also included is a merged dictionary file of Thai words and a perl
+script to grep Thai UTF-8 words.
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile
index baedceb4b9c..8dfc38a1af2 100644
--- a/textproc/split-thai/Makefile
+++ b/textproc/split-thai/Makefile
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.7 2020/08/20 14:20:27 scole Exp $
+# $NetBSD: Makefile,v 1.8 2020/08/28 16:02:42 scole Exp $
-PKGNAME= split-thai-0.7
+PKGNAME= split-thai-0.8
CATEGORIES= textproc
MAINTAINER= pkgsrc-users@NetBSD.org
COMMENT= Utilities to split UTF-8 Thai text into words
@@ -15,10 +15,12 @@ MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu
USE_LANGUAGES= c++11 # darwin needed 11?
USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
+USE_TOOLS+= perl:run
BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie
DEPENDS+= emacs-[0-9]*:../../editors/emacs
DEPENDS+= swath-[0-9]*:../../textproc/swath
+REPLACE_PERL= tgrep
REPLACE_SH= st-swath
UTF8_ENV= env LC_ALL=C.UTF-8
@@ -47,7 +49,7 @@ SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
pre-extract:
mkdir -p ${WRKSRC}
cd files && cp README.txt st-emacs st-icu.cc st-swath \
- thai-utility.el thaidict.abm ${WRKSRC}
+ tgrep thai-utility.el thaidict.abm ${WRKSRC}
post-extract:
cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
@@ -80,7 +82,7 @@ do-build:
do-install:
${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
- ${DESTDIR}${PREFIX}/bin
+ ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
.for i in ${ST_SHARE_FILES}
${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST
index 14269891613..7844202bc99 100644
--- a/textproc/split-thai/PLIST
+++ b/textproc/split-thai/PLIST
@@ -1,7 +1,8 @@
-@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $
+@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $
bin/st-emacs
bin/st-icu
bin/st-swath
+bin/tgrep
share/split-thai/README.txt
share/split-thai/thai-dict.el
share/split-thai/thai-dict.elc
diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt
index 8bdd21a706a..fd138059805 100644
--- a/textproc/split-thai/files/README.txt
+++ b/textproc/split-thai/files/README.txt
@@ -2,14 +2,16 @@ NAME
st-emacs
st-icu
st-swath
+ tgrep
SYNOPSIS
st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
+ tgrep [options] PATTERN [FILE ...]
DESCRIPTION
This package is a collection of utilities to separate Thai words
by spaces (word tokenization). They can separate stdin, files,
- or text as arguments. It includes 3 separate utilities:
+ or text as arguments. It includes these utilities:
st-emacs: emacs-script using emacs lisp thai-word library
https://www.gnu.org/software/emacs/
@@ -18,30 +20,38 @@ DESCRIPTION
st-swath: sh script wrapper to simplify args to the swath program
https://linux.thai.net/projects/swath
+ tgrep: grep-like utility using perl, see "tgrep -h"
+
EXAMPLES
- split one or more text strings
+ split one or more text strings:
# st-swath แมวและหมา
# st-swath "แมวหมา" พ่อและแม่
- read stdin
+ read stdin:
# echo "แมวและหมา" | st-swath
- read from a file
+ read from a file:
# st-swath < thaifile.txt
# st-swath somefile.txt
- They can also read directly from stdin
+ They can also read directly from stdin:
# st-icu
แมวหมา (typed in)
แมว หมา (output line by line)
+ grep for thai words:
+ # tgrep แมว thaifile.txt
+
ENVIRONMENT
You will most likely need to set the environment variables LC_ALL
or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or
C.UTF-8. These tools are only setup to handle UTF-8 encodings.
+ A terminal capable of entering and displaying UTF-8, and some
+ actual UTF-8 fonts installed on the system will also be needed.
+
EXIT STATUS
- 0 for success, non zero otherwise
+ 0 for success, non zero otherwise. For tgrep, see "tgrep -h"
NOTES
Note that it is not possible to split Thai words 100% accurately
@@ -66,5 +76,6 @@ SEE ALSO
BUGS
st-icu should also use the combined dictionary words.
- thai text mixed with other languages may not be handled well.
- this file should be converted to a proper manpage.
+ thai text mixed with other languages may not be handled well when
+ splitting.
+ this file should be converted to proper manpages.
diff --git a/textproc/split-thai/files/tgrep b/textproc/split-thai/files/tgrep
new file mode 100755
index 00000000000..d722baaa62a
--- /dev/null
+++ b/textproc/split-thai/files/tgrep
@@ -0,0 +1,208 @@
+#!/bin/perl
+#
+# perl grep equivalent-wrapper supporting utf-8 and thai in particular
+#
+use warnings;
+use strict;
+use Encode;
+use Getopt::Std;
+
+use utf8;
+use open qw/:std :utf8/;
+
+# Command-line switches; Getopt::Std::getopts fills in one package
+# variable per option letter when that switch is present.
+our ( $opt_h, $opt_i, $opt_l, $opt_n, $opt_q, $opt_v );
+
+# An unknown switch is an error: show the usage text and exit with the
+# error status (2) instead of silently carrying on.
+unless ( getopts('hilnqv') ) {
+    usage();
+    exit 2;
+}
+
+# -h takes precedence over everything else
+if ( $opt_h ) {
+    usage();
+    exit 0;
+}
+
+# the first remaining argument is the pattern and is mandatory
+unless ( defined $ARGV[0] ) {
+    usage();
+    exit 1;
+}
+
+# Decode the pattern from UTF-8 so the regexp works on characters.
+# The declaration is unconditional on purpose: "my $x = ... if COND" has
+# undefined behaviour, and $ARGV[0] is known to be defined here anyway.
+my $pattern = decode('UTF-8', $ARGV[0]);
+unless ( length( $pattern ) ) {
+    # an empty pattern is treated like a missing one
+    usage();
+    exit 1;
+}
+
+# normalise the switches to plain 0/1 flags
+my $opt_filesonly  = ( defined $opt_l ? 1 : 0 );
+my $opt_ignorecase = ( defined $opt_i ? 1 : 0 );
+my $opt_linenum    = ( defined $opt_n ? 1 : 0 );
+my $opt_quiet      = ( defined $opt_q ? 1 : 0 );
+my $opt_invert     = ( defined $opt_v ? 1 : 0 );
+
+# rest of args (everything after the pattern) should be filenames
+my @files = map { decode('UTF-8', $_ ) } @ARGV[ 1 .. $#ARGV ];
+
+#
+# usage
+#
+# Print the tgrep help text (name, synopsis, character classes, options,
+# exit status, examples) to the currently selected output handle.
+# Takes no arguments; the caller chooses the exit status afterwards.
+# The heredoc is single-quoted, so "$" and "\" in the examples are
+# printed literally and never interpolated.
+#
+sub usage {
+    print <<'EOF';
+
+NAME
+ tgrep - print lines matching a pattern, supports utf-8 characters
+ and some thai character classes using perl regexp matching.
+
+SYNOPSIS
+ tgrep [options] PATTERN [FILE] [FILE2]
+
+DESCRIPTION
+ tgrep (thai grep) is similar to grep, in that it searches files or
+ stdin for lines matching a pattern. It uses perl to support utf-8
+ characters, and therefore the patterns are perl regexp patterns.
+ It supports a few simple homegrown character classes:
+
+ [:thai:] match any thai unicode value
+ [:thaiconsonant:] match thai consonant including ฤ ฦ
+ [:thaidigit:] match thai number ๐๑๒๓๔๕๖๗๘๙
+ [:thaitonemark:] match thai tonemark ่้๊๋
+ [:thaivowel:] match thai vowel symbols ะัา ำิีึืุูเแโใไๅ็
+ does not include consonants that function as vowels
+ [:thaivowelfull:] same as [:thaivowel:] plus อรวยฤฦๅ used to form
+ vowel diacritics and diphthongs
+ [:thaimisc:] match misc thai symbols ฯๆฺ฿์ํ๎๏๚๛
+
+OPTIONS
+ -h print help or usage
+
+ -i ignore case
+
+ -l suppress normal output, only print filenames that match
+
+ -n prefix each line of output with the line number of the file
+
+ -q quiet mode, don't print out matches
+
+ -v invert match or print lines not matching pattern
+
+ENVIRONMENT
+ You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
+ setting for this program to work, e.g. for csh-type shells:
+ setenv LC_CTYPE en_US.UTF-8
+
+EXIT STATUS
+ Similar to grep, returns 0 when matching line found, 1 otherwise.
+ If an error occurs, exit with 2 unless -q (quiet) option and a
+ match is found
+
+EXAMPLES
+ search for 'ก' in a utf-8 text file
+ $ tgrep ก file.txt
+
+ use perl regexp to match any line with thai utf-8 characters
+ $ tgrep '\p{InThai}' somefile.txt
+
+ use perl regexp unicode values to match thai numbers
+ $ tgrep '^[\x{0e50}-\x{0e59}]+$' other.file
+
+ match lines with a thai number
+ $ tgrep '[:thaidigit:]' afile.txt
+
+NOTES
+ grep(1) also can be used to match thai characters with unicode
+ escapes, for example
+ egrep "["$'\u0e01'-$'\u0e5b'"]" file.txt
+ would match thai unicode chars in a bash-type shell.
+
+SEE ALSO
+ grep(1), perl(1), perlre(1), locale(1), ugrep(1)
+
+BUGS
+ Only utf-8 encodings are supported.
+ The character classes used by this program ([:thai*:]) are not
+ standard or supported by other programs.
+ Quoting perl regular expression can sometimes be difficult from
+ within the shell.
+
+EOF
+}
+
+# handle convenience character classes
+#
+# Every homegrown [:name:] token found in the pattern is replaced by an
+# ordinary perl regexp fragment before the pattern is compiled.  The
+# fragments are spelled with code-point escapes so that no invisible or
+# combining character can sneak into a class; in particular the vowel
+# classes no longer contain a blank, which used to make [:thaivowel:]
+# match any line containing a space.
+my $thai_vowels = '\x{0e30}-\x{0e39}\x{0e40}-\x{0e45}\x{0e47}';
+my $thai_vowels_full = $thai_vowels
+    . '\x{0e2d}\x{0e23}\x{0e27}\x{0e22}\x{0e24}\x{0e26}';
+
+my @thai_classes = (
+    # any character of the Thai block
+    [ '[:thai:]'          => '\p{InThai}' ],
+    # consonants, U+0E01 through U+0E2E inclusive
+    [ '[:thaiconsonant:]' => '[\x{0e01}-\x{0e2e}]' ],
+    # thai digits zero through nine
+    [ '[:thaidigit:]'     => '[\x{0e50}-\x{0e59}]' ],
+    # the four tone marks
+    [ '[:thaitonemark:]'  => '[\x{0e48}-\x{0e4b}]' ],
+    # vowel symbols only
+    [ '[:thaivowel:]'     => "[$thai_vowels]" ],
+    # vowel symbols plus the consonants that help form vowels; the
+    # second spelling is accepted too so both names behave the same
+    [ '[:thaivowelfull:]' => "[$thai_vowels_full]" ],
+    [ '[:thaifullvowel:]' => "[$thai_vowels_full]" ],
+    # remaining thai signs and punctuation
+    [ '[:thaimisc:]'      =>
+        '[\x{0e2f}\x{0e46}\x{0e3f}\x{0e4c}-\x{0e4f}\x{0e3a}\x{0e5a}\x{0e5b}]' ],
+);
+
+foreach my $class ( @thai_classes ) {
+    my ( $name, $expansion ) = @{$class};
+    # \Q..\E: the token is matched literally, not as a bracket expression
+    $pattern =~ s/\Q$name\E/$expansion/g;
+}
+
+# Compile once.  A malformed user pattern is reported and treated as an
+# error (exit status 2, like grep) rather than dying with perl's default
+# status.
+my $qpattern = eval { $opt_ignorecase ? qr/$pattern/iu : qr/$pattern/u };
+unless ( defined $qpattern ) {
+    warn "Invalid pattern \"$pattern\": $@";
+    exit 2;
+}
+
+# if no file args or just "-", assume stdin
+push @files, "/dev/stdin" if ! @files;
+@files = map { $_ eq "-" ? "/dev/stdin" : $_ } @files;
+
+my $match_found    = 0;
+my $error_occurred = 0;
+my $several_files  = scalar ( @files ) > 1;
+
+# Scan every file line by line.  A file that cannot be opened or closed
+# is reported and remembered as an error, but the remaining files are
+# still searched.
+FILE: foreach my $file ( @files ) {
+    my $info;
+    # Three-argument open: the name is always taken as a plain path to
+    # read, so names starting with ">" or ending in "|" cannot change
+    # the open mode, and surrounding whitespace is kept.
+    unless ( open $info, '<', $file ) {
+        warn "Could not open $file: $!";
+        $error_occurred = 1;
+        next FILE;
+    }
+
+    # filename and line-number prefixes are never printed for stdin
+    my $is_stdin       = $file eq "/dev/stdin";
+    my $print_filename = $several_files && !$is_stdin;
+    my $print_linenum  = $opt_linenum && !$is_stdin;
+
+    my $line_num = 0;
+  LINE: while( my $line = <$info> ) {
+        $line_num += 1;
+
+        # a line is selected when it matches, or with -v when it does not
+        my $matched = ( $line =~ $qpattern ) ? 1 : 0;
+        next LINE if $matched == $opt_invert;
+
+        $match_found = 1;
+        # -q and -l only need the first hit of each file
+        last LINE if $opt_quiet;
+        if ( $opt_filesonly ) {
+            print $file, "\n";
+            last LINE;
+        }
+
+        print $file,":" if $print_filename;
+        print $line_num,":" if $print_linenum;
+        # normalise the line ending so a final unterminated line still
+        # gets a newline
+        chomp($line);
+        print $line, "\n";
+    }
+
+    unless ( close $info ) {
+        warn "Could not close $file: $!";
+        $error_occurred = 1;
+    }
+}
+
+# exit with same error codes as grep: 2 on error (unless -q and a match
+# was found), otherwise 0 for a match and 1 for no match
+if ( $error_occurred ) {
+    exit( ( $match_found && $opt_quiet ) ? 0 : 2 );
+}
+exit( $match_found ? 0 : 1 );