Update to 0.8

- add 'tgrep' perl script for grepping thai words
author: scole <scole@pkgsrc.org> 2020-08-28 16:02:42 +0000
committer: scole <scole@pkgsrc.org> 2020-08-28 16:02:42 +0000
commit: 3374708ce44e38a9a92ed3a857a057fb49dcdd7e (patch)
tree: dde1d59e9cc0af1817e085f41d6f9f15f29562bf /textproc
parent: 982fe04bf99c79bb3602999d5d8f7b00ad95cb90 (diff)
download: pkgsrc-3374708ce44e38a9a92ed3a857a057fb49dcdd7e.tar.gz
5 files changed, 237 insertions, 14 deletions
diff --git a/textproc/split-thai/DESCR b/textproc/split-thai/DESCR
index 5dba7ad4417..2557d7021c5 100644
--- a/textproc/split-thai/DESCR
+++ b/textproc/split-thai/DESCR
@@ -3,4 +3,5 @@ boundaries, also known as word tokenization.  The utilities use emacs,
 swath, and a c++ icu-project program.  All use dictionary-based word
 splitting.
 
-Also included is merged dictionary file of thai words.
+Also included is a merged dictionary file of Thai words and a perl
+script to grep Thai UTF-8 words.
diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile
index baedceb4b9c..8dfc38a1af2 100644
--- a/textproc/split-thai/Makefile
+++ b/textproc/split-thai/Makefile
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.7 2020/08/20 14:20:27 scole Exp $
+# $NetBSD: Makefile,v 1.8 2020/08/28 16:02:42 scole Exp $
 
-PKGNAME=	split-thai-0.7
+PKGNAME=	split-thai-0.8
 CATEGORIES=	textproc
 MAINTAINER=	pkgsrc-users@NetBSD.org
 COMMENT=	Utilities to split UTF-8 Thai text into words
@@ -15,10 +15,12 @@ MASTER_SITES=	-${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu
 USE_LANGUAGES=	c++11	# darwin needed 11?
 
 USE_TOOLS=	pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo
+USE_TOOLS+=	perl:run
 BUILD_DEPENDS+=	libdatrie-[0-9]*:../../devel/libdatrie
 DEPENDS+=	emacs-[0-9]*:../../editors/emacs
 DEPENDS+=	swath-[0-9]*:../../textproc/swath
 
+REPLACE_PERL=	tgrep
 REPLACE_SH=	st-swath
 
 UTF8_ENV=	env LC_ALL=C.UTF-8
@@ -47,7 +49,7 @@ SUBST_SED.dictionary-app+=	-e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g'
 pre-extract:
 	mkdir -p ${WRKSRC}
 	cd files && cp README.txt st-emacs st-icu.cc st-swath \
-		thai-utility.el thaidict.abm ${WRKSRC}
+		tgrep thai-utility.el thaidict.abm ${WRKSRC}
 
 post-extract:
 	cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \
@@ -80,7 +82,7 @@ do-build:
 
 do-install:
 	${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \
-		${DESTDIR}${PREFIX}/bin
+		${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin
 	${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin
 .for i in ${ST_SHARE_FILES}
 	${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai
diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST
index 14269891613..7844202bc99 100644
--- a/textproc/split-thai/PLIST
+++ b/textproc/split-thai/PLIST
@@ -1,7 +1,8 @@
-@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $
+@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $
 bin/st-emacs
 bin/st-icu
 bin/st-swath
+bin/tgrep
 share/split-thai/README.txt
 share/split-thai/thai-dict.el
 share/split-thai/thai-dict.elc
diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt
index 8bdd21a706a..fd138059805 100644
--- a/textproc/split-thai/files/README.txt
+++ b/textproc/split-thai/files/README.txt
@@ -2,14 +2,16 @@ NAME
      st-emacs
      st-icu
      st-swath
+     tgrep
 
 SYNOPSIS
      st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank']
+     tgrep [options] PATTERN [FILE ...]
 
 DESCRIPTION
      This package is a collection of utilities to separate Thai words
      by spaces (word tokenization).  They can separate stdin, files,
-     or text as arguments.  It includes 3 separate utilities:
+     or text as arguments.  It includes these utilities:
 
      st-emacs:  emacs-script using emacs lisp thai-word library
                 https://www.gnu.org/software/emacs/
@@ -18,30 +20,38 @@ DESCRIPTION
      st-swath:  sh script wrapper to simplfy args to the swath program
                 https://linux.thai.net/projects/swath
 
+     tgrep:     grep-like utility using perl, see "tgrep -h"
+
 EXAMPLES
-      split one or more text strings
+      split one or more text strings:
       # st-swath แมวและหมา
       # st-swath "แมวหมา" พ่อและแม่
       
-      read stdin
+      read stdin:
       # echo "แมวและหมา" | st-swath
 
-      read from a file
+      read from a file:
       # st-swath < thaifile.txt
       # st-swath somefile.txt
 
-      They can also read directly from stdin
+      They can also read directly from stdin:
       # st-icu
         แมวหมา   (typed in)
         แมว หมา  (output line by line)
 
+      grep for thai words:
+      # tgrep แมว thaifile.txt
+
 ENVIRONMENT
      You will most likely need to set the environment variables LC_ALL
      or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or
      C.UTF-8.  These tools are only setup to handle UTF-8 encodings.
 
+     A terminal capable of entering and displaying UTF-8, and some
+     actual UTF-8 fonts installed on the system will also be needed.
+     
 EXIT STATUS
-     0 for success, non zero otherwise
+     0 for success, non zero otherwise.  For tgrep, see "tgrep -h"
 
 NOTES
      Note that it is not possible to split Thai words 100% accurately
@@ -66,5 +76,6 @@ SEE ALSO
 
 BUGS
      st-icu should also use the combined dictionary words.
-     thai text mixed with other languages may not be handled well.
-     this file should be converted to a proper manpage.
+     thai text mixed with other languages may not be handled well when
+     splitting.
+     this file should be converted to proper manpages.
diff --git a/textproc/split-thai/files/tgrep b/textproc/split-thai/files/tgrep
new file mode 100755
index 00000000000..d722baaa62a
--- /dev/null
+++ b/textproc/split-thai/files/tgrep
@@ -0,0 +1,208 @@
+#!/bin/perl
+#
+# perl grep equivalent-wrapper supporting utf-8 and thai in particular
+#
+use warnings;
+use strict;
+use Encode;
+use Getopt::Std;
+
+use utf8;
+use open qw/:std :utf8/;
+
+# option flags; Getopt::Std's getopts() stores each -X into $opt_X
+our ( $opt_h, $opt_i, $opt_l, $opt_n, $opt_q, $opt_v );
+getopts('hilnqv');
+
+if ( $opt_h ) {
+    usage();
+    exit 0;
+} elsif ( ! defined $ARGV[0] ) {
+    # no pattern given
+    usage();
+    exit 1;
+}
+
+# $ARGV[0] is defined here; "my ... if COND" is undefined behaviour (perlsyn)
+my $pattern = decode('UTF-8', $ARGV[0]);
+unless ( length( $pattern ) ) {
+    usage();
+    exit 1;
+}
+
+my $opt_filesonly = ( defined $opt_l ? 1 : 0 );
+my $opt_ignorecase = ( defined $opt_i ? 1 : 0 );
+my $opt_linenum = ( defined $opt_n ? 1 : 0 );
+my $opt_quiet = ( defined $opt_q ? 1 : 0 );
+my $opt_invert = ( defined $opt_v ? 1 : 0 );
+
+# rest of args should be filenames (the first one was the pattern)
+my ( undef, @files ) = @ARGV;
+@files = map { decode('UTF-8', $_ ) } @files;
+
+# usage: print the built-in help text (name, synopsis, character classes,
+# options, exit status, examples) with a plain print.  Takes no arguments
+# and does not exit; the callers pick the exit code themselves.
+sub usage {
+    print <<'EOF';
+
+NAME
+    tgrep - print lines matching a pattern, supports utf-8 characters
+    and some thai character classes using perl regexp matching.
+
+SYNOPSIS
+    tgrep [options] PATTERN [FILE] [FILE2]
+
+DESCRIPTION
+    tgrep (thai grep) is similar to grep, in that it searches files or
+    stdin for lines matching a pattern.  It uses perl to support utf-8
+    characters, and therefore the patterns are perl regexp patterns.
+    It supports a few simple homegrown character classes:
+
+    [:thai:]          match any thai unicode value
+    [:thaiconsonant:] match thai consonant including ฤ ฦ
+    [:thaidigit:]     match thai number ๐๑๒๓๔๕๖๗๘๙ 
+    [:thaitonemark:]  match thai tonemark ่้๊๋
+    [:thaivowel:]     match thai vowel symbols ะัา ำิีึืุูเแโใไๅ็
+                      does not include consonants that function as vowels
+    [:thaivowelfull:] same as [:thaivowel:] plus อรวยฤฦๅ used to form
+                      vowel diacritics and diphthongs
+    [:thaimisc:]      match misc thai symbols ฯๆฺ฿์ํ๎๏๚๛
+
+OPTIONS
+    -h  print help or usage
+
+    -i  ignore case
+
+    -l  suppress normal output, only print filenames that match
+
+    -n  prefix each line of output with the line number of the file
+
+    -q  quiet mode, don't print out matches
+
+    -v  invert match or print lines not matching pattern
+
+ENVIRONMENT
+     You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
+     setting for this program to work, e.g. for csh-type shells:
+          setenv LC_CTYPE en_US.UTF-8
+         
+EXIT STATUS
+    Similar to grep, returns 0 when matching line found, 1 otherwise.
+    If an error occurs, exit with 2 unless -q (quiet) option and a
+    match is found
+
+EXAMPLES
+    search for 'ก' in a utf-8 text file
+    $ tgrep ก file.txt
+
+    use perl regexp to match any line with thai utf-8 characters
+    $ tgrep '\p{InThai}' somefile.txt
+
+    use perl regexp unicode values to match thai numbers
+    $ tgrep '^[\x{0e50}-\x{0e59}]+$' other.file
+
+    match lines with a thai number
+    $ tgrep '[:thaidigit:]' afile.txt
+
+NOTES
+    grep(1) also can be used to match thai characters with unicode
+    escapes, for example
+       egrep "["$'\u0e01'-$'\u0e5b'"]" file.txt
+    would match thai unicode chars in a bash-type shell.
+
+SEE ALSO
+    grep(1), perl(1), perlre(1), locale(1), ugrep(1)
+
+BUGS
+    Only utf-8 encodings are supported.
+    The character classes used by this program ([:thai*:]) are not
+    standard or supported by other programs.
+    Quoting perl regular expression can sometimes be difficult from
+    within the shell.
+
+EOF
+}
+
+# Expand the homegrown [:thai*:] classes into plain perl regexp syntax.
+# The vowel list is spelled as code points because the former literal
+# list contained a stray space, so [:thaivowel:] also matched blanks.
+my $thai_vowels = '\x{0e30}-\x{0e39}\x{0e40}-\x{0e45}\x{0e47}';
+my @thai_classes = (
+    [ 'thai',          '\p{InThai}' ],
+    # chars between ก & ฮ inclusive
+    [ 'thaiconsonant', '[\x{0e01}-\x{0e2e}]' ],
+    [ 'thaidigit',     '[๐๑๒๓๔๕๖๗๘๙]' ],
+    [ 'thaitonemark',  '[่้๊๋]' ],
+    [ 'thaivowel',     "[${thai_vowels}]" ],
+    # the spelling [:thaifullvowel:] has appeared in the help text, so
+    # both names are accepted (ๅ, U+0E45, is already in $thai_vowels)
+    [ 'thaivowelfull', "[${thai_vowels}อรวยฤฦ]" ],
+    [ 'thaifullvowel', "[${thai_vowels}อรวยฤฦ]" ],
+    [ 'thaimisc',      '[ฯๆ฿์ํ๎๏ฺ๚๛]' ],
+);
+foreach my $class ( @thai_classes ) {
+    my ( $name, $expansion ) = @{$class};
+    # quotemeta keeps "[:" from being parsed as a POSIX class opener
+    my $class_re = quotemeta("[:${name}:]");
+    $pattern =~ s/$class_re/$expansion/g;
+}
+
+# Compile the pattern once.  The obsolete /o modifier is dropped (qr//
+# already compiles once) and a bad regexp now exits 2 like other errors.
+my $qpattern = eval { $opt_ignorecase ? qr/$pattern/iu : qr/$pattern/u };
+unless ( defined $qpattern ) {
+    warn "Invalid pattern: $@";
+    exit 2;
+}
+
+# if no file args or just "-", assume stdin
+push @files, "/dev/stdin" if ! @files;
+@files = map { $_ eq "-" ? "/dev/stdin" : $_ } @files;
+
+my $match_found = 0;
+my $error_occurred = 0;
+foreach my $file ( @files ) {
+    my $info;
+    # three-argument open: a name such as ">x" or "cmd |" is only ever
+    # read as a file, never interpreted as a mode or a pipe
+    unless ( open $info, '<', $file ) {
+        warn "Could not open $file: $!";
+        $error_occurred = 1;
+        next;
+    }
+
+    my $line_num = 1;
+    my $print_filename =  ( scalar ( @files ) > 1 ) && $file ne "/dev/stdin";
+    my $print_linenum  = $opt_linenum && $file ne "/dev/stdin";
+
+    while( my $line = <$info> ) {
+        # a line is selected when "matches" and "-v given" disagree
+        my $matched = ( $line =~ $qpattern ) ? 1 : 0;
+        if ( $matched != $opt_invert ) {
+            $match_found = 1;
+            if ( $opt_quiet ) {
+                last;
+            } elsif ( $opt_filesonly ) {
+                print $file, "\n";
+                last;
+            }
+            print $file,":" if $print_filename;
+            print $line_num,":" if $print_linenum;
+            chomp($line);
+            print $line, "\n";
+        }
+        $line_num += 1;
+    }
+    unless ( close $info ) {
+        warn "Could not close $file: $!";
+        $error_occurred = 1;
+    }
+}
+
+# exit with same error codes as grep: 2 on error (unless -q found a
+# match), otherwise 0 when something matched and 1 when nothing did
+if ( $error_occurred && ! ( $match_found && $opt_quiet ) ) {
+    exit 2;
+}
+exit ( $match_found ? 0 : 1 );
author	scole <scole@pkgsrc.org>	2020-08-28 16:02:42 +0000
committer	scole <scole@pkgsrc.org>	2020-08-28 16:02:42 +0000
commit	3374708ce44e38a9a92ed3a857a057fb49dcdd7e (patch)
tree	dde1d59e9cc0af1817e085f41d6f9f15f29562bf /textproc
parent	982fe04bf99c79bb3602999d5d8f7b00ad95cb90 (diff)
download	pkgsrc-3374708ce44e38a9a92ed3a857a057fb49dcdd7e.tar.gz