diff options
Diffstat (limited to 'usr/src/lib/libcmd/common/cut.c')
-rw-r--r-- | usr/src/lib/libcmd/common/cut.c | 583 |
1 files changed, 388 insertions, 195 deletions
diff --git a/usr/src/lib/libcmd/common/cut.c b/usr/src/lib/libcmd/common/cut.c index 2be03c3183..abafdc5070 100644 --- a/usr/src/lib/libcmd/common/cut.c +++ b/usr/src/lib/libcmd/common/cut.c @@ -1,7 +1,7 @@ /*********************************************************************** * * * This software is part of the ast package * -* Copyright (c) 1992-2009 AT&T Intellectual Property * +* Copyright (c) 1992-2010 AT&T Intellectual Property * * and is licensed under the * * Common Public License, Version 1.0 * * by AT&T Intellectual Property * @@ -23,13 +23,11 @@ * David Korn * AT&T Bell Laboratories * - * cut [-sN] [-f flist] [-c clist] [-d delim] [-D delim] [-r reclen] [file] ... - * * cut fields or columns from fields from a file */ static const char usage[] = -"[-?\n@(#)$Id: cut (AT&T Research) 2008-04-01 $\n]" +"[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]" USAGE_LICENSE "[+NAME?cut - cut out selected columns or fields of each line of a file]" "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " @@ -49,13 +47,13 @@ USAGE_LICENSE "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " "cuts from standard input. The start of the file is defined " "as the current offset.]" -"[b:bytes]:[list?\bcut\b based on a list of bytes.]" -"[c:characters]:[list?\bcut\b based on a list of characters.]" +"[b:bytes]:[list?\bcut\b based on a list of byte counts.]" +"[c:characters]:[list?\bcut\b based on a list of character counts.]" "[d:delimiter]:[delim?The field character for the \b-f\b option is set " "to \adelim\a. The default is the \btab\b character.]" "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " "character specified with the \b-d\b optiion.]" -"[n:nosplit?Do not split characters. Currently ignored.]" +"[n!:split?Split multibyte characters selected by the \b-b\b option.]" "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " "records of length \areclen\a when used with the \b-b\b or \b-c\b " "option.]" @@ -65,7 +63,7 @@ USAGE_LICENSE "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " "the \b-f\b option is set to \aldelim\a. The default is the " "\bnewline\b character.]" -"[N:nonewline?Do not output new-lines at end of each record when used " +"[N!:newline?Output new-lines at end of each record when used " "with the \b-b\b or \b-c\b option.]" "\n" "\n[file ...]\n" @@ -80,79 +78,102 @@ USAGE_LICENSE #include <cmd.h> #include <ctype.h> -typedef struct Last_s +typedef struct Delim_s { - int seqno; - int seq; - int wdelim; - int ldelim; -} Last_t; + char* str; + int len; + int chr; +} Delim_t; typedef struct Cut_s { + int mb; + int eob; int cflag; + int nosplit; int sflag; int nlflag; - int wdelim; - int ldelim; - int seqno; int reclen; - signed char space[UCHAR_MAX]; - Last_t last; + Delim_t wdelim; + Delim_t ldelim; + unsigned char space[UCHAR_MAX+1]; int list[2]; /* NOTE: must be last member */ } Cut_t; -#define HUGE (1<<14) +#define HUGE INT_MAX #define BLOCK 8*1024 #define C_BYTES 1 #define C_CHARS 2 #define C_FIELDS 4 #define C_SUPRESS 8 -#define C_NOCHOP 16 +#define C_NOSPLIT 16 #define C_NONEWLINE 32 +#define SP_LINE 1 +#define SP_WORD 2 +#define SP_WIDE 3 + +#define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n) + /* * compare the first of an array of integers */ -static int mycomp(register const void *a,register const void *b) +static int +mycomp(register const void* a, register const void* b) { - return(*((int*)a) - *((int*)b)); + if (*((int*)a) < *((int*)b)) + return -1; + if (*((int*)a) > *((int*)b)) + return 1; + return 0; } -static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) +static Cut_t* +cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen) { - register int *lp, c, n=0; - register int range = 0; - register char *cp = str; - Cut_t *cuthdr; - if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int)))) + register int* lp; + register int c; + register int n = 0; + register int range = 0; + register char* cp = str; + Cut_t* cut; + + if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int)))) error(ERROR_exit(1), "out of space"); - memset(cuthdr->space, 0, sizeof(cuthdr->space)); - cuthdr->last.seqno = 0; - cuthdr->last.seq = 0; - cuthdr->last.wdelim = 0; - cuthdr->last.ldelim = '\n'; - cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide()); - cuthdr->sflag = ((mode&C_SUPRESS)!=0); - cuthdr->nlflag = ((mode&C_NONEWLINE)!=0); - cuthdr->wdelim = wdelim; - cuthdr->ldelim = ldelim; - cuthdr->reclen = reclen; - cuthdr->seqno = ++cuthdr->last.seqno; - lp = cuthdr->list; - while(1) switch(c= *cp++) + if (cut->mb = mbwide()) { + memset(cut->space, 0, sizeof(cut->space) / 2); + memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2); + } + else + memset(cut->space, 0, sizeof(cut->space)); + cut->wdelim = *wdelim; + if (wdelim->len == 1) + cut->space[wdelim->chr] = SP_WORD; + cut->ldelim = *ldelim; + cut->eob = (ldelim->len == 1) ? ldelim->chr : 0; + cut->space[cut->eob] = SP_LINE; + cut->cflag = (mode&C_CHARS) && cut->mb; + cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb; + cut->sflag = (mode&C_SUPRESS) != 0; + cut->nlflag = (mode&C_NONEWLINE) != 0; + cut->reclen = reclen; + lp = cut->list; + for (;;) + switch(c = *cp++) + { case ' ': case '\t': while(*cp==' ' || *cp=='\t') cp++; + /*FALLTHROUGH*/ case 0: case ',': if(range) { --range; - if((n = (n==0?HUGE:n-range)) < 0) + if((n = (n ? (n-range) : (HUGE-1))) < 0) error(ERROR_exit(1),"invalid range for c/f option"); *lp++ = range; *lp++ = n; @@ -166,8 +187,8 @@ static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) { register int *dp; *lp = HUGE; - n = 1 + (lp-cuthdr->list)/2; - qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp); + n = 1 + (lp-cut->list)/2; + qsort(lp=cut->list,n,2*sizeof(*lp),mycomp); /* eliminate overlapping regions */ for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) { @@ -196,7 +217,7 @@ static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) } } *dp = HUGE; - lp = cuthdr->list; + lp = cut->list; /* convert ranges into gaps */ for(n=0; *lp!=HUGE; lp+=2) { @@ -204,7 +225,7 @@ static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) *lp -= n; n = c+lp[1]; } - return(cuthdr); + return cut; } n = range = 0; break; @@ -220,70 +241,121 @@ static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) if(!isdigit(c)) error(ERROR_exit(1),"bad list for c/f option"); n = 10*n + (c-'0'); - } + break; + } /* NOTREACHED */ } /* - * advance <cp> by <n> multi-byte characters - */ -static int advance(const char *str, register int n, register int inlen) -{ - register int size, len=inlen; - register const char *cp=str; - while(len>0 && n-->0) - { - size = mblen(cp, len); - if(size<0) - size = 1; - cp += size; - len -= size; - - } - if(n>0) - return(inlen+1); - return(cp-str); -} - -/* * cut each line of file <fdin> and put results to <fdout> using list <list> */ -static void cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) +static void +cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) { - register int c, ncol=0,len; - register const int *lp = cuthdr->list; - register char *inp; + register int c; + register int len; + register int ncol = 0; + register const int* lp = cut->list; + register char* bp; register int skip; /* non-zero for don't copy */ - while(1) + int must; + char* ep; + const char* xx; + + for (;;) { - if(len = cuthdr->reclen) - inp = sfreserve(fdin, len, -1); + if (len = cut->reclen) + bp = sfreserve(fdin, len, -1); else - inp = sfgetr(fdin, '\n', 0); - if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR))) + bp = sfgetr(fdin, '\n', 0); + if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR))) break; len = sfvalue(fdin); - if((ncol = skip = *(lp = cuthdr->list)) == 0) + ep = bp + len; + xx = 0; + if (!(ncol = skip = *(lp = cut->list))) ncol = *++lp; - while(1) + must = 1; + do { - if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len) - c = len; - else if(c==len && !skip) - ncol++; - ncol -= c; - if(!skip && sfwrite(fdout,(char*)inp,c)<0) - return; - inp += c; - if(ncol) + if (cut->nosplit) + { + register const char* s = bp; + register int w = len < ncol ? len : ncol; + register int z; + + while (w > 0) + { + if (!(*s & 0x80)) + z = 1; + else if ((z = mblen(s, w)) <= 0) + { + if (s == bp && xx) + { + w += s - xx; + bp = (char*)(s = xx); + xx = 0; + continue; + } + xx = s; + if (skip) + s += w; + w = 0; + break; + } + s += z; + w -= z; + } + c = s - bp; + ncol = !w && ncol >= len; + } + else if (cut->cflag) + { + register const char* s = bp; + register int w = len; + register int z; + + while (w > 0 && ncol > 0) + { + ncol--; + if (!(*s & 0x80) || (z = mblen(s, w)) <= 0) + z = 1; + s += z; + w -= z; + + } + c = s - bp; + ncol = !w && (ncol || !skip); + } + else + { + if ((c = ncol) > len) + c = len; + else if (c == len && !skip) + ncol++; + ncol -= c; + } + if (!skip && c) + { + if (sfwrite(fdout, (char*)bp, c) < 0) + return; + must = 0; + } + bp += c; + if (ncol) break; len -= c; ncol = *++lp; skip = !skip; + } while (ncol != HUGE); + if (!cut->nlflag && (skip || must || cut->reclen)) + { + if (cut->ldelim.len > 1) + sfwrite(fdout, cut->ldelim.str, cut->ldelim.len); + else + sfputc(fdout, cut->ldelim.chr); } - if(!cuthdr->nlflag && (skip || cuthdr->reclen)) - sfputc(fdout,cuthdr->ldelim); } } @@ -292,93 +364,180 @@ static void cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) * stream <fdin> must be line buffered */ -#define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1)) - -static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) +static void +cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) { + register unsigned char *sp = cut->space; register unsigned char *cp; + register unsigned char *wp; register int c, nfields; - register const int *lp = cuthdr->list; + register const int *lp = cut->list; register unsigned char *copy; register int nodelim, empty, inword=0; - register unsigned char *endbuff; - unsigned char *inbuff, *first; + register unsigned char *ep; + unsigned char *bp, *first; int lastchar; + wchar_t w; Sfio_t *fdtmp = 0; long offset = 0; - if(cuthdr->seqno != cuthdr->last.seq) - { - cuthdr->space[cuthdr->last.ldelim] = 0; - cuthdr->space[cuthdr->last.wdelim] = 0; - cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1; - cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1; - cuthdr->last.seq = cuthdr->seqno; - } + unsigned char mb[8]; /* process each buffer */ - while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0) + while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0) { - cp = inbuff; - endbuff = cp + --c; - if((lastchar = cp[c]) != cuthdr->ldelim) - *endbuff = cuthdr->ldelim; + cp = bp; + ep = cp + --c; + if((lastchar = cp[c]) != cut->eob) + *ep = cut->eob; /* process each line in the buffer */ - while(cp <= endbuff) + while (cp <= ep) { first = cp; - if(!inword) + if (!inword) { nodelim = empty = 1; copy = cp; - if(nfields = *(lp = cuthdr->list)) + if (nfields = *(lp = cut->list)) copy = 0; else nfields = *++lp; } - else if(copy) + else if (copy) copy = cp; inword = 0; - while(!inword) + do { /* skip over non-delimiter characters */ - while(!(c=cuthdr->space[*cp++])); + if (cut->mb) + for (;;) + { + switch (c = sp[*(unsigned char*)cp++]) + { + case 0: + continue; + case SP_WIDE: + wp = --cp; + while ((c = mb2wc(w, cp, ep - cp)) <= 0) + { + /* mb char possibly spanning buffer boundary -- fun stuff */ + if ((ep - cp) < mbmax()) + { + int i; + int j; + int k; + + if (lastchar != cut->eob) + { + *ep = lastchar; + if ((c = mb2wc(w, cp, ep - cp)) > 0) + break; + } + if (copy) + { + empty = 0; + if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) + goto failed; + } + for (i = 0; i <= (ep - cp); i++) + mb[i] = cp[i]; + if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0) + goto failed; + cp = bp; + ep = cp + --c; + if ((lastchar = cp[c]) != cut->eob) + *ep = cut->eob; + j = i; + k = 0; + while (j < mbmax()) + mb[j++] = cp[k++]; + if ((c = mb2wc(w, (char*)mb, j)) <= 0) + { + c = i; + w = 0; + } + first = bp = cp += c - i; + if (copy) + { + copy = bp; + if (w == cut->ldelim.chr) + lastchar = cut->ldelim.chr; + else if (w != cut->wdelim.chr) + { + empty = 0; + if (sfwrite(fdout, (char*)mb, c) < 0) + goto failed; + } + } + c = 0; + } + else + { + w = *cp; + c = 1; + } + break; + } + cp += c; + c = w; + if (c == cut->wdelim.chr) + { + c = SP_WORD; + break; + } + if (c == cut->ldelim.chr) + { + c = SP_LINE; + break; + } + continue; + default: + wp = cp - 1; + break; + } + break; + } + else + { + while (!(c = sp[*cp++])); + wp = cp - 1; + } /* check for end-of-line */ - if(endline(c)) + if (c == SP_LINE) { - if(cp<=endbuff) + if (cp <= ep) break; - if((c=cuthdr->space[lastchar]),endline(c)) + if (lastchar == cut->ldelim.chr) break; - /* restore cuthdr->last. character */ - if(lastchar != cuthdr->ldelim) - *endbuff = lastchar; + /* restore cut->last character */ + if (lastchar != cut->eob) + *ep = lastchar; inword++; - if(!c) + if (!sp[lastchar]) break; } nodelim = 0; - if(--nfields >0) + if (--nfields > 0) continue; nfields = *++lp; - if(copy) + if (copy) { empty = 0; - if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0) + if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) goto failed; copy = 0; } else /* set to delimiter unless the first field */ - copy = cp -!empty; - } - if(!inword) + copy = empty ? cp : wp; + } while (!inword); + if (!inword) { - if(!copy) + if (!copy) { - if(nodelim) + if (nodelim) { - if(!cuthdr->sflag) + if (!cut->sflag) { - if(offset) + if (offset) { sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); sfmove(fdtmp,fdout,offset,-1); @@ -389,14 +548,14 @@ static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) else sfputc(fdout,'\n'); } - if(offset) + if (offset) sfseek(fdtmp,offset=0,SEEK_SET); } - if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0) + if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0) goto failed; } /* see whether to save in tmp file */ - if(inword && nodelim && !cuthdr->sflag && (c=cp-first)>0) + if(inword && nodelim && !cut->sflag && (c=cp-first)>0) { /* copy line to tmpfile in case no fields */ if(!fdtmp) @@ -405,73 +564,107 @@ static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) offset +=c; } } -failed: + failed: if(fdtmp) sfclose(fdtmp); } int -b_cut(int argc,char *argv[], void* context) +b_cut(int argc, char** argv, void* context) { - register char *cp = 0; - register Sfio_t *fp; - int n; - Cut_t *cuthdr; - int mode = 0; - int wdelim = '\t'; - int ldelim = '\n'; - size_t reclen = 0; + register char* cp = 0; + register Sfio_t* fp; + char* s; + int n; + Cut_t* cut; + int mode = 0; + Delim_t wdelim; + Delim_t ldelim; + size_t reclen = 0; cmdinit(argc, argv, context, ERROR_CATALOG, 0); - while (n = optget(argv, usage)) switch (n) + wdelim.chr = '\t'; + ldelim.chr = '\n'; + wdelim.len = ldelim.len = 1; + for (;;) { - case 'b': - case 'c': - if(mode&C_FIELDS) + switch (n = optget(argv, usage)) { - error(2, "f option already specified"); + case 0: break; - } - cp = opt_info.arg; - if(n=='b') - mode |= C_BYTES; - else - mode |= C_CHARS; - break; - case 'D': - ldelim = *(unsigned char*)opt_info.arg; - break; - case 'd': - wdelim = *(unsigned char*)opt_info.arg; - break; - case 'f': - if(mode&(C_CHARS|C_BYTES)) - { - error(2, "c option already specified"); + case 'b': + case 'c': + if(mode&C_FIELDS) + { + error(2, "f option already specified"); + continue; + } + cp = opt_info.arg; + if(n=='b') + mode |= C_BYTES; + else + mode |= C_CHARS; + continue; + case 'D': + ldelim.str = opt_info.arg; + if (mbwide()) + { + s = opt_info.arg; + ldelim.chr = mbchar(s); + if ((n = s - opt_info.arg) > 1) + { + ldelim.len = n; + continue; + } + } + ldelim.chr = *(unsigned char*)opt_info.arg; + ldelim.len = 1; + continue; + case 'd': + wdelim.str = opt_info.arg; + if (mbwide()) + { + s = opt_info.arg; + wdelim.chr = mbchar(s); + if ((n = s - opt_info.arg) > 1) + { + wdelim.len = n; + continue; + } + } + wdelim.chr = *(unsigned char*)opt_info.arg; + wdelim.len = 1; + continue; + case 'f': + if(mode&(C_CHARS|C_BYTES)) + { + error(2, "c option already specified"); + continue; + } + cp = opt_info.arg; + mode |= C_FIELDS; + continue; + case 'n': + mode |= C_NOSPLIT; + continue; + case 'N': + mode |= C_NONEWLINE; + continue; + case 'R': + case 'r': + if(opt_info.num>0) + reclen = opt_info.num; + continue; + case 's': + mode |= C_SUPRESS; + continue; + case ':': + error(2, "%s", opt_info.arg); + break; + case '?': + error(ERROR_usage(2), "%s", opt_info.arg); break; } - cp = opt_info.arg; - mode |= C_FIELDS; - break; - case 'n': - mode |= C_NOCHOP; - break; - case 'N': - mode |= C_NONEWLINE; - break; - case 'R': - case 'r': - if(opt_info.num>0) - reclen = opt_info.num; - break; - case 's': - mode |= C_SUPRESS; - break; - case ':': - error(2, "%s", opt_info.arg); - break; - case '?': - error(ERROR_usage(2), "%s", opt_info.arg); break; } argv += opt_info.index; @@ -486,7 +679,7 @@ b_cut(int argc,char *argv[], void* context) error(3, "non-empty b, c or f option must be specified"); if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) error(3, "s option requires f option"); - cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen); + cut = cutinit(mode, cp, &wdelim, &ldelim, reclen); if(cp = *argv) argv++; do @@ -499,13 +692,13 @@ b_cut(int argc,char *argv[], void* context) continue; } if(mode&C_FIELDS) - cutfields(cuthdr,fp,sfstdout); + cutfields(cut,fp,sfstdout); else - cutcols(cuthdr,fp,sfstdout); + cutcols(cut,fp,sfstdout); if(fp!=sfstdin) sfclose(fp); } while(cp = *argv++); if (sfsync(sfstdout)) error(ERROR_system(0), "write error"); - return(error_info.errors?1:0); + return error_info.errors != 0; } |