From 3374708ce44e38a9a92ed3a857a057fb49dcdd7e Mon Sep 17 00:00:00 2001 From: scole Date: Fri, 28 Aug 2020 16:02:42 +0000 Subject: Update to 0.8 - add 'tgrep' perl script for grepping thai words --- textproc/split-thai/DESCR | 3 +- textproc/split-thai/Makefile | 10 +- textproc/split-thai/PLIST | 3 +- textproc/split-thai/files/README.txt | 27 +++-- textproc/split-thai/files/tgrep | 208 +++++++++++++++++++++++++++++++++++ 5 files changed, 237 insertions(+), 14 deletions(-) create mode 100755 textproc/split-thai/files/tgrep (limited to 'textproc') diff --git a/textproc/split-thai/DESCR b/textproc/split-thai/DESCR index 5dba7ad4417..2557d7021c5 100644 --- a/textproc/split-thai/DESCR +++ b/textproc/split-thai/DESCR @@ -3,4 +3,5 @@ boundaries, also known as word tokenization. The utilities use emacs, swath, and a c++ icu-project program. All use dictionary-based word splitting. -Also included is merged dictionary file of thai words. +Also included is a merged dictionary file of Thai words and a perl +script to grep Thai UTF-8 words. diff --git a/textproc/split-thai/Makefile b/textproc/split-thai/Makefile index baedceb4b9c..8dfc38a1af2 100644 --- a/textproc/split-thai/Makefile +++ b/textproc/split-thai/Makefile @@ -1,6 +1,6 @@ -# $NetBSD: Makefile,v 1.7 2020/08/20 14:20:27 scole Exp $ +# $NetBSD: Makefile,v 1.8 2020/08/28 16:02:42 scole Exp $ -PKGNAME= split-thai-0.7 +PKGNAME= split-thai-0.8 CATEGORIES= textproc MAINTAINER= pkgsrc-users@NetBSD.org COMMENT= Utilities to split UTF-8 Thai text into words @@ -15,10 +15,12 @@ MASTER_SITES= -${MASTER_SITE_GITHUB:=unicode-org/}/icu/raw/${GITHUB_ICU_TAG}/icu USE_LANGUAGES= c++11 # darwin needed 11? 
USE_TOOLS= pkg-config mkdir cp sh:run env awk cat sort uniq grep wc echo +USE_TOOLS+= perl:run BUILD_DEPENDS+= libdatrie-[0-9]*:../../devel/libdatrie DEPENDS+= emacs-[0-9]*:../../editors/emacs DEPENDS+= swath-[0-9]*:../../textproc/swath +REPLACE_PERL= tgrep REPLACE_SH= st-swath UTF8_ENV= env LC_ALL=C.UTF-8 @@ -47,7 +49,7 @@ SUBST_SED.dictionary-app+= -e 's,ST_SHARE_BIN,${PREFIX}/${ST_SHARE_BIN},g' pre-extract: mkdir -p ${WRKSRC} cd files && cp README.txt st-emacs st-icu.cc st-swath \ - thai-utility.el thaidict.abm ${WRKSRC} + tgrep thai-utility.el thaidict.abm ${WRKSRC} post-extract: cd ${WRKSRC} && ${UTF8_ENV} emacs --batch \ @@ -80,7 +82,7 @@ do-build: do-install: ${INSTALL_SCRIPT} ${WRKSRC}/st-emacs ${WRKSRC}/st-swath \ - ${DESTDIR}${PREFIX}/bin + ${WRKSRC}/tgrep ${DESTDIR}${PREFIX}/bin ${INSTALL_PROGRAM} ${WRKSRC}/st-icu ${DESTDIR}${PREFIX}/bin .for i in ${ST_SHARE_FILES} ${INSTALL_DATA} ${WRKSRC}/${i} ${DESTDIR}${PREFIX}/share/split-thai diff --git a/textproc/split-thai/PLIST b/textproc/split-thai/PLIST index 14269891613..7844202bc99 100644 --- a/textproc/split-thai/PLIST +++ b/textproc/split-thai/PLIST @@ -1,7 +1,8 @@ -@comment $NetBSD: PLIST,v 1.2 2020/08/14 17:31:34 scole Exp $ +@comment $NetBSD: PLIST,v 1.3 2020/08/28 16:02:42 scole Exp $ bin/st-emacs bin/st-icu bin/st-swath +bin/tgrep share/split-thai/README.txt share/split-thai/thai-dict.el share/split-thai/thai-dict.elc diff --git a/textproc/split-thai/files/README.txt b/textproc/split-thai/files/README.txt index 8bdd21a706a..fd138059805 100644 --- a/textproc/split-thai/files/README.txt +++ b/textproc/split-thai/files/README.txt @@ -2,14 +2,16 @@ NAME st-emacs st-icu st-swath + tgrep SYNOPSIS st-emacs|st-icu|st-swath [filename|text1 text2 ...|'blank'] + tgrep [options] FILE ... DESCRIPTION This package is a collection of utilities to separate Thai words by spaces (word tokenization). They can separate stdin, files, - or text as arguments. It includes 3 separate utilities: + or text as arguments. 
It includes these utilities: st-emacs: emacs-script using emacs lisp thai-word library https://www.gnu.org/software/emacs/ @@ -18,30 +20,38 @@ DESCRIPTION st-swath: sh script wrapper to simplfy args to the swath program https://linux.thai.net/projects/swath + tgrep: grep-like utility using perl, see "tgrep -h" + EXAMPLES - split one or more text strings + split one or more text strings: # st-swath แมวและหมา # st-swath "แมวหมา" พ่อและแม่ - read stdin + read stdin: # echo "แมวและหมา" | st-swath - read from a file + read from a file: # st-swath < thaifile.txt # st-swath somefile.txt - They can also read directly from stdin + They can also read directly from stdin: # st-icu แมวหมา (typed in) แมว หมา (output line by line) + grep for thai words: + # tgrep แมว thaifile.txt + ENVIRONMENT You will most likely need to set the environment variables LC_ALL or LC_CTYPE for proper unicode handling, e.g., en_US.UTF-8 or C.UTF-8. These tools are only setup to handle UTF-8 encodings. + A terminal capable of entering and displaying UTF-8, and some + actual UTF-8 fonts installed on the system will also be needed. + EXIT STATUS - 0 for success, non zero otherwise + 0 for success, non zero otherwise. For tgrep, see "tgrep -h" NOTES Note that it is not possible to split Thai words 100% accurately @@ -66,5 +76,6 @@ SEE ALSO BUGS st-icu should also use the combined dictionary words. - thai text mixed with other languages may not be handled well. - this file should be converted to a proper manpage. + thai text mixed with other languages may not be handled well when + splitting. + this file should be converted to proper manpages. 
diff --git a/textproc/split-thai/files/tgrep b/textproc/split-thai/files/tgrep
new file mode 100755
index 00000000000..d722baaa62a
--- /dev/null
+++ b/textproc/split-thai/files/tgrep
@@ -0,0 +1,208 @@
+#!/bin/perl
+#
+# perl grep equivalent-wrapper supporting utf-8 and thai in particular
+#
+# Prints lines of the given files (or stdin) matching a perl regexp
+# and exits like grep: 0 if a match was found, 1 if not, 2 on error.
+#
+use warnings;
+use strict;
+use Encode;
+use Getopt::Std;
+
+use utf8;
+use open qw/:std :utf8/;
+
+our ( $opt_h, $opt_i, $opt_l, $opt_n, $opt_q, $opt_v );
+
+# getopts has already warned about the unknown option when it fails
+unless ( getopts('hilnqv') ) {
+    usage();
+    exit 2;
+}
+
+if ( $opt_h ) {
+    usage();
+    exit 0;
+} elsif ( ! defined $ARGV[0] ) {
+    # no pattern given
+    usage();
+    exit 1;
+}
+
+# plain declaration, "my ... if ..." has undefined behavior (perlsyn)
+my $pattern = decode('UTF-8', $ARGV[0]);
+unless ( length( $pattern ) ) {
+    usage();
+    exit 1;
+}
+
+my $opt_filesonly = ( defined $opt_l ? 1 : 0 );
+my $opt_ignorecase = ( defined $opt_i ? 1 : 0 );
+my $opt_linenum = ( defined $opt_n ? 1 : 0 );
+my $opt_quiet = ( defined $opt_q ? 1 : 0 );
+my $opt_invert = ( defined $opt_v ? 1 : 0 );
+
+# rest of args should be filenames, kept as the raw bytes from the
+# shell so open() gets the exact name; decoded only when printed
+my @files = @ARGV;
+shift @files;
+
+#
+# usage - print the help text to stdout
+#
+sub usage {
+    print <<'EOF';
+
+NAME
+    tgrep - print lines matching a pattern, supports utf-8 characters
+    and some thai character classes using perl regexp matching.
+
+SYNOPSIS
+    tgrep [options] PATTERN [FILE] [FILE2]
+
+DESCRIPTION
+    tgrep (thai grep) is similar to grep, in that it searches files or
+    stdin for lines matching a pattern. It uses perl to support utf-8
+    characters, and therefore the patterns are perl regexp patterns.
+
+    It supports a few simple homegrown character classes:
+
+    [:thai:]          match any thai unicode value
+    [:thaiconsonant:] match thai consonant including ฤ ฦ
+    [:thaidigit:]     match thai number ๐๑๒๓๔๕๖๗๘๙
+    [:thaitonemark:]  match thai tonemark ่้๊๋
+    [:thaivowel:]     match thai vowel symbols ะัา ำิีึืุูเแโใไๅ็
+                      does not include consonants that function as vowels
+    [:thaivowelfull:] same as [:thaivowel:] plus อรวยฤฦๅ used to form
+                      vowel diacritics and diphthongs
+    [:thaimisc:]      match misc thai symbols ฯๆฺ฿์ํ๎๏๚๛
+
+OPTIONS
+    -h    print help or usage
+
+    -i    ignore case
+
+    -l    suppress normal output, only print filenames that match
+
+    -n    prefix each line of output with the line number of the file
+
+    -q    quiet mode, don't print out matches
+
+    -v    invert match or print lines not matching pattern
+
+ENVIRONMENT
+    You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
+    setting for this program to work, e.g. for csh-type shells:
+        setenv LC_CTYPE en_US.UTF-8
+
+EXIT STATUS
+    Similar to grep, returns 0 when matching line found, 1 otherwise.
+    If an error occurs, exit with 2 unless -q (quiet) option and a
+    match is found
+
+EXAMPLES
+    search for 'ก' in a utf-8 text file
+        $ tgrep ก file.txt
+
+    use perl regexp to match any line with thai utf-8 characters
+        $ tgrep '\p{InThai}' somefile.txt
+
+    use perl regexp unicode values to match thai numbers
+        $ tgrep '^[\x{0e50}-\x{0e59}]+$' other.file
+
+    match lines with a thai number
+        $ tgrep '[:thaidigit:]' afile.txt
+
+NOTES
+    grep(1) also can be used to match thai characters with unicode
+    escapes, for example
+        egrep "["$'\u0e01'-$'\u0e5b'"]" file.txt
+    would match thai unicode chars in a bash-type shell.
+
+SEE ALSO
+    grep(1), perl(1), perlre(1), locale(1), ugrep(1)
+
+BUGS
+    Only utf-8 encodings are supported.
+    The character classes used by this program ([:thai*:]) are not
+    standard or supported by other programs.
+    Quoting perl regular expression can sometimes be difficult from
+    within the shell.
+
+EOF
+}
+
+# convenience character classes, given as code points so the combining
+# marks cannot be mangled and no stray blank ends up in a class
+my $thai_vowels = '\x{0e30}-\x{0e39}\x{0e40}-\x{0e45}\x{0e47}';
+my $thai_semivowels = '\x{0e22}-\x{0e24}\x{0e26}\x{0e27}\x{0e2d}';    # ย ร ฤ ฦ ว อ
+my %thai_class = (
+    'thai'          => '\p{InThai}',
+    'thaiconsonant' => '[\x{0e01}-\x{0e2e}]',    # ก to ฮ inclusive
+    'thaidigit'     => '[\x{0e50}-\x{0e59}]',
+    'thaitonemark'  => '[\x{0e48}-\x{0e4b}]',
+    'thaivowel'     => "[$thai_vowels]",
+    'thaivowelfull' => "[$thai_vowels$thai_semivowels]",
+    'thaimisc'      => '[\x{0e2f}\x{0e3a}\x{0e3f}\x{0e46}\x{0e4c}-\x{0e4f}\x{0e5a}\x{0e5b}]',
+);
+# spelling advertised by the old help text, keep it working
+$thai_class{'thaifullvowel'} = $thai_class{'thaivowelfull'};
+
+# unknown [:thai...:] names are left untouched
+$pattern =~ s/(\[:(thai[a-z]*):\])/exists $thai_class{$2} ? $thai_class{$2} : $1/ge;
+
+# compile once, a bad regexp is an error (exit 2) like for grep
+my $qpattern = eval { $opt_ignorecase ? qr/$pattern/iu : qr/$pattern/u };
+unless ( defined $qpattern ) {
+    warn "tgrep: invalid pattern: $@";
+    exit 2;
+}
+
+# if no file args or just "-", assume stdin
+push @files, "/dev/stdin" if ! @files;
+@files = map { $_ eq "-" ? "/dev/stdin" : $_ } @files;
+
+my $match_found = 0;
+my $error_occurred = 0;
+foreach my $file ( @files ) {
+    # printable name, stdout and stderr have utf-8 layers
+    my $name = decode('UTF-8', $file);
+    # 3-arg open, a name like ">x" or "cmd |" must not be taken as
+    # a mode or a command; "use open" still supplies the utf8 layer
+    my $info;
+    unless ( open $info, '<', $file ) {
+        warn "Could not open $name: $!";
+        $error_occurred = 1;
+        next;
+    }
+
+    my $print_filename = ( scalar ( @files ) > 1 ) && $file ne "/dev/stdin";
+    my $print_linenum = $opt_linenum && $file ne "/dev/stdin";
+
+    while( my $line = <$info> ) {
+        my $matched = ( $line =~ $qpattern ) ? 1 : 0;
+        # with -v the wanted lines are the ones that did not match
+        next if $matched == $opt_invert;
+
+        $match_found = 1;
+        # -q and -l only need the first hit of each file
+        last if $opt_quiet;
+        if ( $opt_filesonly ) {
+            print $name, "\n";
+            last;
+        }
+        print $name, ":" if $print_filename;
+        print $., ":" if $print_linenum;    # $. is the current line number
+        chomp($line);
+        print $line, "\n";
+    }
+    unless ( close $info ) {
+        warn "Could not close $name: $!";
+        $error_occurred = 1;
+    }
+}
+
+# exit with same error codes as grep
+exit ( ( $match_found && $opt_quiet ) ? 0 : 2 ) if $error_occurred;
+exit ( $match_found ? 0 : 1 );
-- cgit v1.2.3