diff options
author | Igor Pashev <pashev.igor@gmail.com> | 2012-06-24 22:28:35 +0000 |
---|---|---|
committer | Igor Pashev <pashev.igor@gmail.com> | 2012-06-24 22:28:35 +0000 |
commit | 3950ffe2a485479f6561c27364d3d7df5a21d124 (patch) | |
tree | 468c6e14449d1b1e279222ec32f676b0311917d2 /src/lib/libcmd/cut.c | |
download | ksh-upstream.tar.gz |
Imported Upstream version 93u+upstream
Diffstat (limited to 'src/lib/libcmd/cut.c')
-rw-r--r-- | src/lib/libcmd/cut.c | 702 |
1 files changed, 702 insertions, 0 deletions
diff --git a/src/lib/libcmd/cut.c b/src/lib/libcmd/cut.c new file mode 100644 index 0000000..985b789 --- /dev/null +++ b/src/lib/libcmd/cut.c @@ -0,0 +1,702 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1992-2012 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped +/* + * David Korn + * AT&T Bell Laboratories + * + * cut fields or columns from fields from a file + */ + +static const char usage[] = +"[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]" +USAGE_LICENSE +"[+NAME?cut - cut out selected columns or fields of each line of a file]" +"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " + "from one or more files, contatenating them on standard output.]" +"[+?The option argument \alist\a is a comma-separated or blank-separated " + "list of positive numbers and ranges. Ranges can be of three " + "forms. The first is two positive integers separated by a hyphen " + "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " + "\ahigh\a. The second is a positive number preceded by a hyphen " + "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " + "\ahigh\a. The last is a positive number followed by a hyphen " + "(\alow\a\b-\b), which represents all fields from \alow\a to the " + "last field, inclusive. Elements in the \alist\a can be repeated, " + "can overlap, and can appear in any order. The order of the " + "output is that of the input.]" +"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" +"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " + "cuts from standard input. The start of the file is defined " + "as the current offset.]" +"[b:bytes]:[list?\bcut\b based on a list of byte counts.]" +"[c:characters]:[list?\bcut\b based on a list of character counts.]" +"[d:delimiter]:[delim?The field character for the \b-f\b option is set " + "to \adelim\a. The default is the \btab\b character.]" +"[f:fields]:[list?\bcut\b based on fields separated by the delimiter " + "character specified with the \b-d\b optiion.]" +"[n!:split?Split multibyte characters selected by the \b-b\b option.]" +"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " + "records of length \areclen\a when used with the \b-b\b or \b-c\b " + "option.]" +"[s:suppress|only-delimited?Suppress lines with no delimiter characters, " + "when used with the \b-f\b option. By default, lines with no " + "delimiters will be passsed in untouched.]" +"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " + "the \b-f\b option is set to \aldelim\a. The default is the " + "\bnewline\b character.]" +"[N!:newline?Output new-lines at end of each record when used " + "with the \b-b\b or \b-c\b option.]" +"\n" +"\n[file ...]\n" +"\n" +"[+EXIT STATUS?]{" + "[+0?All files processed successfully.]" + "[+>0?One or more files failed to open or could not be read.]" +"}" +"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" +; + +#include <cmd.h> +#include <ctype.h> + +typedef struct Delim_s +{ + char* str; + int len; + int chr; +} Delim_t; + +typedef struct Cut_s +{ + int mb; + int eob; + int cflag; + int nosplit; + int sflag; + int nlflag; + int reclen; + Delim_t wdelim; + Delim_t ldelim; + unsigned char space[UCHAR_MAX+1]; + int list[2]; /* NOTE: must be last member */ +} Cut_t; + +#define HUGE INT_MAX +#define BLOCK 8*1024 +#define C_BYTES 1 +#define C_CHARS 2 +#define C_FIELDS 4 +#define C_SUPRESS 8 +#define C_NOSPLIT 16 +#define C_NONEWLINE 32 + +#define SP_LINE 1 +#define SP_WORD 2 +#define SP_WIDE 3 + +/* + * compare the first of an array of integers + */ + +static int +mycomp(register const void* a, register const void* b) +{ + if (*((int*)a) < *((int*)b)) + return -1; + if (*((int*)a) > *((int*)b)) + return 1; + return 0; +} + +static Cut_t* +cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen) +{ + register int* lp; + register int c; + register int n = 0; + register int range = 0; + register char* cp = str; + Cut_t* cut; + + if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int)))) + error(ERROR_exit(1), "out of space"); + if (cut->mb = mbwide()) + { + memset(cut->space, 0, sizeof(cut->space) / 2); + memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2); + } + else + memset(cut->space, 0, sizeof(cut->space)); + cut->wdelim = *wdelim; + if (wdelim->len == 1) + cut->space[wdelim->chr] = SP_WORD; + cut->ldelim = *ldelim; + cut->eob = (ldelim->len == 1) ? ldelim->chr : 0; + cut->space[cut->eob] = SP_LINE; + cut->cflag = (mode&C_CHARS) && cut->mb; + cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb; + cut->sflag = (mode&C_SUPRESS) != 0; + cut->nlflag = (mode&C_NONEWLINE) != 0; + cut->reclen = reclen; + lp = cut->list; + for (;;) + switch(c = *cp++) + { + case ' ': + case '\t': + while(*cp==' ' || *cp=='\t') + cp++; + /*FALLTHROUGH*/ + case 0: + case ',': + if(range) + { + --range; + if((n = (n ? (n-range) : (HUGE-1))) < 0) + error(ERROR_exit(1),"invalid range for c/f option"); + *lp++ = range; + *lp++ = n; + } + else + { + *lp++ = --n; + *lp++ = 1; + } + if(c==0) + { + register int *dp; + *lp = HUGE; + n = 1 + (lp-cut->list)/2; + qsort(lp=cut->list,n,2*sizeof(*lp),mycomp); + /* eliminate overlapping regions */ + for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) + { + if(lp[0] <= range) + { + if(lp[1]==HUGE) + { + dp[-1] = HUGE; + break; + } + if((c = lp[0]+lp[1]-range)>0) + { + range += c; + dp[-1] += c; + } + } + else + { + range = *dp++ = lp[0]; + if(lp[1]==HUGE) + { + *dp++ = HUGE; + break; + } + range += (*dp++ = lp[1]); + } + } + *dp = HUGE; + lp = cut->list; + /* convert ranges into gaps */ + for(n=0; *lp!=HUGE; lp+=2) + { + c = *lp; + *lp -= n; + n = c+lp[1]; + } + return cut; + } + n = range = 0; + break; + + case '-': + if(range) + error(ERROR_exit(1),"bad list for c/f option"); + range = n?n:1; + n = 0; + break; + + default: + if(!isdigit(c)) + error(ERROR_exit(1),"bad list for c/f option"); + n = 10*n + (c-'0'); + break; + } + /* NOTREACHED */ +} + +/* + * cut each line of file <fdin> and put results to <fdout> using list <list> + */ + +static void +cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) +{ + register int c; + register int len; + register int ncol = 0; + register const int* lp = cut->list; + register char* bp; + register int skip; /* non-zero for don't copy */ + int must; + char* ep; + const char* xx; + + for (;;) + { + if (len = cut->reclen) + bp = sfreserve(fdin, len, -1); + else + bp = sfgetr(fdin, '\n', 0); + if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR))) + break; + len = sfvalue(fdin); + ep = bp + len; + xx = 0; + if (!(ncol = skip = *(lp = cut->list))) + ncol = *++lp; + must = 1; + do + { + if (cut->nosplit) + { + register const char* s = bp; + register int w = len < ncol ? len : ncol; + register int z; + + while (w > 0) + { + if (!(*s & 0x80)) + z = 1; + else if ((z = mbnsize(s, w)) <= 0) + { + if (s == bp && xx) + { + w += s - xx; + bp = (char*)(s = xx); + xx = 0; + continue; + } + xx = s; + if (skip) + s += w; + w = 0; + break; + } + s += z; + w -= z; + } + c = s - bp; + ncol = !w && ncol >= len; + } + else if (cut->cflag) + { + register const char* s = bp; + register int w = len; + register int z; + + while (w > 0 && ncol > 0) + { + ncol--; + if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0) + z = 1; + s += z; + w -= z; + + } + c = s - bp; + ncol = !w && (ncol || !skip); + } + else + { + if ((c = ncol) > len) + c = len; + else if (c == len && !skip) + ncol++; + ncol -= c; + } + if (!skip && c) + { + if (sfwrite(fdout, (char*)bp, c) < 0) + return; + must = 0; + } + bp += c; + if (ncol) + break; + len -= c; + ncol = *++lp; + skip = !skip; + } while (ncol != HUGE); + if (!cut->nlflag && (skip || must || cut->reclen)) + { + if (cut->ldelim.len > 1) + sfwrite(fdout, cut->ldelim.str, cut->ldelim.len); + else + sfputc(fdout, cut->ldelim.chr); + } + } +} + +/* + * cut each line of file <fdin> and put results to <fdout> using list <list> + * stream <fdin> must be line buffered + */ + +static void +cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) +{ + register unsigned char *sp = cut->space; + register unsigned char *cp; + register unsigned char *wp; + register int c, nfields; + register const int *lp = cut->list; + register unsigned char *copy; + register int nodelim, empty, inword=0; + register unsigned char *ep; + unsigned char *bp, *first; + int lastchar; + wchar_t w; + Sfio_t *fdtmp = 0; + long offset = 0; + unsigned char mb[8]; + /* process each buffer */ + while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0) + { + cp = bp; + ep = cp + --c; + if((lastchar = cp[c]) != cut->eob) + *ep = cut->eob; + /* process each line in the buffer */ + while (cp <= ep) + { + first = cp; + if (!inword) + { + nodelim = empty = 1; + copy = cp; + if (nfields = *(lp = cut->list)) + copy = 0; + else + nfields = *++lp; + } + else if (copy) + copy = cp; + inword = 0; + do + { + /* skip over non-delimiter characters */ + if (cut->mb) + for (;;) + { + switch (c = sp[*(unsigned char*)cp++]) + { + case 0: + continue; + case SP_WIDE: + wp = --cp; + while ((c = mb2wc(w, cp, ep - cp)) <= 0) + { + /* mb char possibly spanning buffer boundary -- fun stuff */ + if ((ep - cp) < mbmax()) + { + int i; + int j; + int k; + + if (lastchar != cut->eob) + { + *ep = lastchar; + if ((c = mb2wc(w, cp, ep - cp)) > 0) + break; + } + if (copy) + { + empty = 0; + if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) + goto failed; + } + for (i = 0; i <= (ep - cp); i++) + mb[i] = cp[i]; + if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0) + goto failed; + cp = bp; + ep = cp + --c; + if ((lastchar = cp[c]) != cut->eob) + *ep = cut->eob; + j = i; + k = 0; + while (j < mbmax()) + mb[j++] = cp[k++]; + if ((c = mb2wc(w, (char*)mb, j)) <= 0) + { + c = i; + w = 0; + } + first = bp = cp += c - i; + if (copy) + { + copy = bp; + if (w == cut->ldelim.chr) + lastchar = cut->ldelim.chr; + else if (w != cut->wdelim.chr) + { + empty = 0; + if (sfwrite(fdout, (char*)mb, c) < 0) + goto failed; + } + } + c = 0; + } + else + { + w = *cp; + c = 1; + } + break; + } + cp += c; + c = w; + if (c == cut->wdelim.chr) + { + c = SP_WORD; + break; + } + if (c == cut->ldelim.chr) + { + c = SP_LINE; + break; + } + continue; + default: + wp = cp - 1; + break; + } + break; + } + else + { + while (!(c = sp[*cp++])); + wp = cp - 1; + } + /* check for end-of-line */ + if (c == SP_LINE) + { + if (cp <= ep) + break; + if (lastchar == cut->ldelim.chr) + break; + /* restore cut->last character */ + if (lastchar != cut->eob) + *ep = lastchar; + inword++; + if (!sp[lastchar]) + break; + } + nodelim = 0; + if (--nfields > 0) + continue; + nfields = *++lp; + if (copy) + { + empty = 0; + if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) + goto failed; + copy = 0; + } + else + /* set to delimiter unless the first field */ + copy = empty ? cp : wp; + } while (!inword); + if (!inword) + { + if (!copy) + { + if (nodelim) + { + if (!cut->sflag) + { + if (offset) + { + sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); + sfmove(fdtmp,fdout,offset,-1); + } + copy = first; + } + } + else + sfputc(fdout,'\n'); + } + if (offset) + sfseek(fdtmp,offset=0,SEEK_SET); + } + if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0) + goto failed; + } + /* see whether to save in tmp file */ + if(inword && nodelim && !cut->sflag && (c=cp-first)>0) + { + /* copy line to tmpfile in case no fields */ + if(!fdtmp) + fdtmp = sftmp(BLOCK); + sfwrite(fdtmp,(char*)first,c); + offset +=c; + } + } + failed: + if(fdtmp) + sfclose(fdtmp); +} + +int +b_cut(int argc, char** argv, Shbltin_t* context) +{ + register char* cp = 0; + register Sfio_t* fp; + char* s; + int n; + Cut_t* cut; + int mode = 0; + Delim_t wdelim; + Delim_t ldelim; + size_t reclen = 0; + + cmdinit(argc, argv, context, ERROR_CATALOG, 0); + wdelim.chr = '\t'; + ldelim.chr = '\n'; + wdelim.len = ldelim.len = 1; + for (;;) + { + switch (optget(argv, usage)) + { + case 0: + break; + case 'b': + case 'c': + if(mode&C_FIELDS) + { + error(2, "f option already specified"); + continue; + } + cp = opt_info.arg; + if(opt_info.option[1]=='b') + mode |= C_BYTES; + else + mode |= C_CHARS; + continue; + case 'D': + ldelim.str = opt_info.arg; + if (mbwide()) + { + s = opt_info.arg; + ldelim.chr = mbchar(s); + if ((n = s - opt_info.arg) > 1) + { + ldelim.len = n; + continue; + } + } + ldelim.chr = *(unsigned char*)opt_info.arg; + ldelim.len = 1; + continue; + case 'd': + wdelim.str = opt_info.arg; + if (mbwide()) + { + s = opt_info.arg; + wdelim.chr = mbchar(s); + if ((n = s - opt_info.arg) > 1) + { + wdelim.len = n; + continue; + } + } + wdelim.chr = *(unsigned char*)opt_info.arg; + wdelim.len = 1; + continue; + case 'f': + if(mode&(C_CHARS|C_BYTES)) + { + error(2, "c option already specified"); + continue; + } + cp = opt_info.arg; + mode |= C_FIELDS; + continue; + case 'n': + mode |= C_NOSPLIT; + continue; + case 'N': + mode |= C_NONEWLINE; + continue; + case 'R': + case 'r': + if(opt_info.num>0) + reclen = opt_info.num; + continue; + case 's': + mode |= C_SUPRESS; + continue; + case ':': + error(2, "%s", opt_info.arg); + break; + case '?': + error(ERROR_usage(2), "%s", opt_info.arg); + break; + } + break; + } + argv += opt_info.index; + if (error_info.errors) + error(ERROR_usage(2), "%s",optusage(NiL)); + if(!cp) + { + error(2, "b, c or f option must be specified"); + error(ERROR_usage(2), "%s", optusage(NiL)); + } + if(!*cp) + error(3, "non-empty b, c or f option must be specified"); + if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) + error(3, "s option requires f option"); + cut = cutinit(mode, cp, &wdelim, &ldelim, reclen); + if(cp = *argv) + argv++; + do + { + if(!cp || streq(cp,"-")) + fp = sfstdin; + else if(!(fp = sfopen(NiL,cp,"r"))) + { + error(ERROR_system(0),"%s: cannot open",cp); + continue; + } + if(mode&C_FIELDS) + cutfields(cut,fp,sfstdout); + else + cutcols(cut,fp,sfstdout); + if(fp!=sfstdin) + sfclose(fp); + } while(cp = *argv++); + if (sfsync(sfstdout)) + error(ERROR_system(0), "write error"); + return error_info.errors != 0; +} |