summaryrefslogtreecommitdiff
path: root/src/lib/libcmd/cut.c
diff options
context:
space:
mode:
authorIgor Pashev <pashev.igor@gmail.com>2012-06-24 22:28:35 +0000
committerIgor Pashev <pashev.igor@gmail.com>2012-06-24 22:28:35 +0000
commit3950ffe2a485479f6561c27364d3d7df5a21d124 (patch)
tree468c6e14449d1b1e279222ec32f676b0311917d2 /src/lib/libcmd/cut.c
downloadksh-upstream.tar.gz
Imported Upstream version 93u+upstream
Diffstat (limited to 'src/lib/libcmd/cut.c')
-rw-r--r--src/lib/libcmd/cut.c702
1 files changed, 702 insertions, 0 deletions
diff --git a/src/lib/libcmd/cut.c b/src/lib/libcmd/cut.c
new file mode 100644
index 0000000..985b789
--- /dev/null
+++ b/src/lib/libcmd/cut.c
@@ -0,0 +1,702 @@
+/***********************************************************************
+* *
+* This software is part of the ast package *
+* Copyright (c) 1992-2012 AT&T Intellectual Property *
+* and is licensed under the *
+* Eclipse Public License, Version 1.0 *
+* by AT&T Intellectual Property *
+* *
+* A copy of the License is available at *
+* http://www.eclipse.org/org/documents/epl-v10.html *
+* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
+* *
+* Information and Software Systems Research *
+* AT&T Research *
+* Florham Park NJ *
+* *
+* Glenn Fowler <gsf@research.att.com> *
+* David Korn <dgk@research.att.com> *
+* *
+***********************************************************************/
+#pragma prototyped
+/*
+ * David Korn
+ * AT&T Bell Laboratories
+ *
+ * cut fields or columns from each line of a file
+ */
+
+/*
+ * option description handed to optget() and optusage() by b_cut();
+ * it also carries the help text of the command
+ * the long option name "line-delimeter" is misspelled but is part of the
+ * command line interface, so it is left as it is
+ */
+static const char usage[] =
+"[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]"
+USAGE_LICENSE
+"[+NAME?cut - cut out selected columns or fields of each line of a file]"
+"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
+ "from one or more files, concatenating them on standard output.]"
+"[+?The option argument \alist\a is a comma-separated or blank-separated "
+ "list of positive numbers and ranges. Ranges can be of three "
+ "forms. The first is two positive integers separated by a hyphen "
+ "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
+ "\ahigh\a. The second is a positive number preceded by a hyphen "
+ "(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
+ "\ahigh\a. The last is a positive number followed by a hyphen "
+ "(\alow\a\b-\b), which represents all fields from \alow\a to the "
+ "last field, inclusive. Elements in the \alist\a can be repeated, "
+ "can overlap, and can appear in any order. The order of the "
+ "output is that of the input.]"
+"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
+"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
+ "cuts from standard input. The start of the file is defined "
+ "as the current offset.]"
+"[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
+"[c:characters]:[list?\bcut\b based on a list of character counts.]"
+"[d:delimiter]:[delim?The field character for the \b-f\b option is set "
+ "to \adelim\a. The default is the \btab\b character.]"
+"[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
+ "character specified with the \b-d\b option.]"
+"[n!:split?Split multibyte characters selected by the \b-b\b option.]"
+"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
+ "records of length \areclen\a when used with the \b-b\b or \b-c\b "
+ "option.]"
+"[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
+ "when used with the \b-f\b option. By default, lines with no "
+ "delimiters will be passed through untouched.]"
+"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
+ "the \b-f\b option is set to \aldelim\a. The default is the "
+ "\bnewline\b character.]"
+"[N!:newline?Output new-lines at end of each record when used "
+ "with the \b-b\b or \b-c\b option.]"
+"\n"
+"\n[file ...]\n"
+"\n"
+"[+EXIT STATUS?]{"
+ "[+0?All files processed successfully.]"
+ "[+>0?One or more files failed to open or could not be read.]"
+"}"
+"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
+;
+
+#include <cmd.h>
+#include <ctype.h>
+
+typedef struct Delim_s /* one delimiter, for fields or for lines */
+{
+ char* str; /* delimiter bytes; only assigned by -d / -D, only read when len > 1 */
+ int len; /* length of the delimiter in bytes */
+ int chr; /* the delimiter as one character value */
+} Delim_t;
+
+typedef struct Cut_s /* state built by cutinit() and read by cutcols() and cutfields() */
+{
+ int mb; /* value of mbwide(); non-zero selects the multibyte code paths */
+ int eob; /* the line delimiter when it is one byte long, otherwise 0 */
+ int cflag; /* C_CHARS selected and mb is set */
+ int nosplit; /* C_BYTES and C_NOSPLIT both selected and mb is set */
+ int sflag; /* C_SUPRESS selected */
+ int nlflag; /* C_NONEWLINE selected */
+ int reclen; /* record length used by cutcols(); 0 means delimited lines */
+ Delim_t wdelim; /* field delimiter */
+ Delim_t ldelim; /* line delimiter */
+ unsigned char space[UCHAR_MAX+1]; /* class of each byte: 0, SP_LINE, SP_WORD or SP_WIDE */
+ int list[2]; /* NOTE: must be last member; cutinit() allocates more ints right after it */
+} Cut_t;
+
+#define HUGE INT_MAX /* terminates cut->list */
+#define BLOCK (8*1024) /* size argument passed to sftmp(); parenthesized so it expands safely */
+/* mode bits collected by b_cut() and decoded by cutinit() */
+#define C_BYTES 1
+#define C_CHARS 2
+#define C_FIELDS 4
+#define C_SUPRESS 8
+#define C_NOSPLIT 16
+#define C_NONEWLINE 32
+
+/* byte classes stored in Cut_t.space[]; 0 is an ordinary byte */
+#define SP_LINE 1
+#define SP_WORD 2
+#define SP_WIDE 3
+
+/*
+ * qsort() comparison function: orders two int arrays by their first
+ * element only and yields -1, 0 or 1
+ */
+
+static int
+mycomp(register const void* a, register const void* b)
+{
+	const int	left = *(const int*)a;
+	const int	right = *(const int*)b;
+
+	return (left > right) - (left < right);
+}
+
+/*
+ * build the Cut_t descriptor used by cutcols() and cutfields()
+ *
+ *	mode	C_* bits selected on the command line
+ *	str	list operand of -b, -c or -f: numbers and ranges separated
+ *		by commas or blanks
+ *	wdelim	field delimiter, copied into the descriptor
+ *	ldelim	line delimiter, copied into the descriptor
+ *	reclen	fixed record length, 0 for none
+ *
+ * returns the descriptor, allocated with stakalloc()
+ * on return cut->list holds alternating skip and copy counts terminated
+ * by HUGE
+ * a malformed list is reported through error(ERROR_exit(1),...); this
+ * includes empty elements, the number 0, numbers that do not fit in an
+ * int and ranges whose high end is below the low end
+ */
+
+static Cut_t*
+cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
+{
+	register int*	lp;
+	register int	c;
+	register int	n = 0;
+	register int	range = 0;
+	register char*	cp = str;
+	Cut_t*		cut;
+
+	/*
+	 * every accepted element is at least one character long and two
+	 * elements are at least one separator apart, so list[2] plus
+	 * strlen(str) ints hold two ints per element and the terminator
+	 */
+	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
+		error(ERROR_exit(1), "out of space");
+	if (cut->mb = mbwide())
+	{
+		/* bytes with the high bit set may start a multibyte character */
+		memset(cut->space, 0, sizeof(cut->space) / 2);
+		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
+	}
+	else
+		memset(cut->space, 0, sizeof(cut->space));
+	cut->wdelim = *wdelim;
+	if (wdelim->len == 1)
+		cut->space[wdelim->chr] = SP_WORD;
+	cut->ldelim = *ldelim;
+	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
+	cut->space[cut->eob] = SP_LINE;
+	cut->cflag = (mode&C_CHARS) && cut->mb;
+	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
+	cut->sflag = (mode&C_SUPRESS) != 0;
+	cut->nlflag = (mode&C_NONEWLINE) != 0;
+	cut->reclen = reclen;
+	lp = cut->list;
+	/* each element is stored as a (zero based start, length) pair */
+	for (;;)
+		switch(c = *cp++)
+		{
+		case ' ':
+		case '\t':
+			while(*cp==' ' || *cp=='\t')
+				cp++;
+			/*FALLTHROUGH*/
+		case 0:
+		case ',':
+			if(range)
+			{
+				--range;
+				/* n becomes the length; a missing high end means the rest */
+				if((n = (n ? (n-range) : (HUGE-1))) <= 0)
+					error(ERROR_exit(1),"invalid range for c/f option");
+				*lp++ = range;
+				*lp++ = n;
+			}
+			else
+			{
+				/*
+				 * an empty element or 0 would give a start of -1 and
+				 * could write past the end of list[]
+				 */
+				if(!n)
+					error(ERROR_exit(1),"bad list for c/f option");
+				*lp++ = --n;
+				*lp++ = 1;
+			}
+			if(c==0)
+			{
+				register int *dp;
+				*lp = HUGE;
+				/*
+				 * sort the real pairs only: the terminator is a
+				 * single int and its key is larger than any start
+				 */
+				n = (lp-cut->list)/2;
+				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
+				/* eliminate overlapping regions */
+				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
+				{
+					if(lp[0] <= range)
+					{
+						if(lp[1]==HUGE)
+						{
+							dp[-1] = HUGE;
+							break;
+						}
+						if((c = lp[0]+lp[1]-range)>0)
+						{
+							range += c;
+							dp[-1] += c;
+						}
+					}
+					else
+					{
+						range = *dp++ = lp[0];
+						if(lp[1]==HUGE)
+						{
+							*dp++ = HUGE;
+							break;
+						}
+						range += (*dp++ = lp[1]);
+					}
+				}
+				*dp = HUGE;
+				lp = cut->list;
+				/* convert ranges into gaps */
+				for(n=0; *lp!=HUGE; lp+=2)
+				{
+					c = *lp;
+					*lp -= n;
+					n = c+lp[1];
+				}
+				return cut;
+			}
+			n = range = 0;
+			break;
+
+		case '-':
+			if(range)
+				error(ERROR_exit(1),"bad list for c/f option");
+			range = n?n:1;
+			n = 0;
+			break;
+
+		default:
+			/* the cast keeps bytes above 0x7f out of isdigit() as negative values */
+			if(!isdigit((unsigned char)c))
+				error(ERROR_exit(1),"bad list for c/f option");
+			/* refuse a number that would overflow an int */
+			if(n > (HUGE - (c-'0')) / 10)
+				error(ERROR_exit(1),"bad list for c/f option");
+			n = 10*n + (c-'0');
+			break;
+		}
+	/* NOTREACHED */
+}
+
+/*
+ * cut each record of stream <fdin> by byte or character position and
+ * put the selected pieces to <fdout>
+ *
+ * cut->list is read as alternating counts: how much to skip, how much to
+ * copy, how much to skip, ... up to the HUGE terminator
+ * a record is cut->reclen bytes when that is non-zero, otherwise it is
+ * whatever sfgetr() returns for the '\n' separator
+ * returns early, without a message, when sfwrite() fails
+ */
+
+static void
+cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
+{
+ register int c;
+ register int len;
+ register int ncol = 0;
+ register const int* lp = cut->list;
+ register char* bp;
+ register int skip; /* non-zero for don't copy */
+ int must; /* stays 1 until something of the record has been written */
+ char* ep;
+ const char* xx; /* where an undecodable multibyte sequence started, 0 for none */
+
+ for (;;)
+ {
+ if (len = cut->reclen) /* fixed length records */
+ bp = sfreserve(fdin, len, -1);
+ else
+ bp = sfgetr(fdin, '\n', 0);
+ if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR))) /* NOTE(review): SF_LASTR presumably fetches a final partial record -- confirm */
+ break;
+ len = sfvalue(fdin);
+ ep = bp + len;
+ xx = 0;
+ if (!(ncol = skip = *(lp = cut->list))) /* a leading skip count of 0 means start by copying */
+ ncol = *++lp;
+ must = 1;
+ do
+ {
+ if (cut->nosplit) /* byte counts that must not split a multibyte character */
+ {
+ register const char* s = bp;
+ register int w = len < ncol ? len : ncol;
+ register int z;
+
+ while (w > 0)
+ {
+ if (!(*s & 0x80)) /* high bit clear: taken as a one byte character */
+ z = 1;
+ else if ((z = mbnsize(s, w)) <= 0)
+ {
+ if (s == bp && xx)
+ {
+ w += s - xx;
+ bp = (char*)(s = xx);
+ xx = 0;
+ continue;
+ }
+ xx = s;
+ if (skip)
+ s += w;
+ w = 0;
+ break;
+ }
+ s += z;
+ w -= z;
+ }
+ c = s - bp;
+ ncol = !w && ncol >= len;
+ }
+ else if (cut->cflag) /* counts are characters, not bytes */
+ {
+ register const char* s = bp;
+ register int w = len;
+ register int z;
+
+ while (w > 0 && ncol > 0)
+ {
+ ncol--;
+ if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0) /* undecodable bytes count as one character each */
+ z = 1;
+ s += z;
+ w -= z;
+
+ }
+ c = s - bp;
+ ncol = !w && (ncol || !skip);
+ }
+ else /* plain byte counts */
+ {
+ if ((c = ncol) > len)
+ c = len;
+ else if (c == len && !skip)
+ ncol++;
+ ncol -= c;
+ }
+ if (!skip && c) /* c bytes at bp belong to a copied piece */
+ {
+ if (sfwrite(fdout, (char*)bp, c) < 0)
+ return;
+ must = 0;
+ }
+ bp += c;
+ if (ncol) /* non-zero here: the record is used up */
+ break;
+ len -= c;
+ ncol = *++lp;
+ skip = !skip; /* alternate between skipping and copying */
+ } while (ncol != HUGE);
+ if (!cut->nlflag && (skip || must || cut->reclen)) /* terminate the output record unless C_NONEWLINE was selected */
+ {
+ if (cut->ldelim.len > 1)
+ sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
+ else
+ sfputc(fdout, cut->ldelim.chr);
+ }
+ }
+}
+
+/*
+ * cut each line of stream <fdin> into delimiter separated fields and put
+ * the fields selected by cut->list to <fdout>
+ * stream <fdin> must be line buffered
+ *
+ * input is taken one sfreserve() buffer at a time, so a line, and in
+ * multibyte mode a single character, can continue into the next buffer
+ * the start of a line that has shown no delimiter so far is saved in a
+ * temporary stream until the rest of the line decides whether it is output
+ * a failed sfwrite(), or a failed sfreserve() in the middle of a
+ * character, abandons the stream through the <failed> label
+ */
+
+static void
+cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
+{
+ register unsigned char *sp = cut->space; /* byte class table */
+ register unsigned char *cp;
+ register unsigned char *wp; /* position of the delimiter that ended the last scan */
+ register int c, nfields;
+ register const int *lp = cut->list;
+ register unsigned char *copy; /* start of the bytes still to be written, 0 while skipping */
+ register int nodelim, empty, inword=0; /* inword: the current line continues in the next buffer */
+ register unsigned char *ep;
+ unsigned char *bp, *first;
+ int lastchar; /* real last byte of the buffer, see the sentinel below */
+ wchar_t w;
+ Sfio_t *fdtmp = 0; /* saved start of a line that has no delimiter yet */
+ long offset = 0; /* number of bytes saved in fdtmp */
+ unsigned char mb[8]; /* rebuilds a character split across buffers; NOTE(review): assumes mbmax() <= 8 -- confirm */
+ /* process each buffer */
+ while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
+ {
+ cp = bp;
+ ep = cp + --c; /* ep is the last byte of the buffer */
+ if((lastchar = cp[c]) != cut->eob)
+ *ep = cut->eob; /* sentinel: space[eob] is SP_LINE, so the scans below always stop */
+ /* process each line in the buffer */
+ while (cp <= ep)
+ {
+ first = cp;
+ if (!inword)
+ {
+ nodelim = empty = 1; /* no delimiter seen yet, nothing written yet for this line */
+ copy = cp;
+ if (nfields = *(lp = cut->list)) /* non-zero: that many leading fields are skipped */
+ copy = 0;
+ else
+ nfields = *++lp;
+ }
+ else if (copy)
+ copy = cp;
+ inword = 0;
+ do
+ {
+ /* skip over non-delimiter characters */
+ if (cut->mb)
+ for (;;)
+ {
+ switch (c = sp[*(unsigned char*)cp++])
+ {
+ case 0:
+ continue;
+ case SP_WIDE:
+ wp = --cp;
+ while ((c = mb2wc(w, cp, ep - cp)) <= 0)
+ {
+ /* mb char possibly spanning buffer boundary -- fun stuff */
+ if ((ep - cp) < mbmax())
+ {
+ int i;
+ int j;
+ int k;
+
+ if (lastchar != cut->eob)
+ {
+ *ep = lastchar; /* undo the sentinel and try once more */
+ if ((c = mb2wc(w, cp, ep - cp)) > 0)
+ break;
+ }
+ if (copy)
+ {
+ empty = 0;
+ if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
+ goto failed;
+ }
+ for (i = 0; i <= (ep - cp); i++) /* keep the tail of this buffer */
+ mb[i] = cp[i];
+ if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
+ goto failed;
+ cp = bp;
+ ep = cp + --c;
+ if ((lastchar = cp[c]) != cut->eob)
+ *ep = cut->eob;
+ j = i;
+ k = 0;
+ while (j < mbmax()) /* append the head of the new buffer */
+ mb[j++] = cp[k++];
+ if ((c = mb2wc(w, (char*)mb, j)) <= 0)
+ {
+ c = i;
+ w = 0;
+ }
+ first = bp = cp += c - i;
+ if (copy)
+ {
+ copy = bp;
+ if (w == cut->ldelim.chr)
+ lastchar = cut->ldelim.chr;
+ else if (w != cut->wdelim.chr)
+ {
+ empty = 0;
+ if (sfwrite(fdout, (char*)mb, c) < 0)
+ goto failed;
+ }
+ }
+ c = 0;
+ }
+ else
+ {
+ w = *cp; /* undecodable byte: taken as a character of its own */
+ c = 1;
+ }
+ break;
+ }
+ cp += c;
+ c = w;
+ if (c == cut->wdelim.chr)
+ {
+ c = SP_WORD;
+ break;
+ }
+ if (c == cut->ldelim.chr)
+ {
+ c = SP_LINE;
+ break;
+ }
+ continue;
+ default:
+ wp = cp - 1;
+ break;
+ }
+ break;
+ }
+ else
+ {
+ while (!(c = sp[*cp++])); /* stops on the sentinel at the latest */
+ wp = cp - 1;
+ }
+ /* check for end-of-line */
+ if (c == SP_LINE)
+ {
+ if (cp <= ep)
+ break;
+ if (lastchar == cut->ldelim.chr) /* the buffer really ends with a line delimiter */
+ break;
+ /* restore cut->last character */
+ if (lastchar != cut->eob)
+ *ep = lastchar;
+ inword++; /* the line goes on in the next buffer */
+ if (!sp[lastchar])
+ break;
+ }
+ nodelim = 0;
+ if (--nfields > 0) /* still inside the current run of fields */
+ continue;
+ nfields = *++lp; /* next count: switch between copying and skipping */
+ if (copy)
+ {
+ empty = 0;
+ if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
+ goto failed;
+ copy = 0;
+ }
+ else
+ /* set to delimiter unless the first field */
+ copy = empty ? cp : wp;
+ } while (!inword);
+ if (!inword)
+ {
+ if (!copy)
+ {
+ if (nodelim) /* the line had no field delimiter at all */
+ {
+ if (!cut->sflag) /* such lines are output whole unless suppressed */
+ {
+ if (offset)
+ {
+ sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
+ sfmove(fdtmp,fdout,offset,-1);
+ }
+ copy = first;
+ }
+ }
+ else
+ sfputc(fdout,'\n'); /* NOTE(review): writes '\n' even when cut->ldelim differs -- confirm intended */
+ }
+ if (offset)
+ sfseek(fdtmp,offset=0,SEEK_SET); /* forget what was saved for this line */
+ }
+ if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
+ goto failed;
+ }
+ /* see whether to save in tmp file */
+ if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
+ {
+ /* copy line to tmpfile in case no fields */
+ if(!fdtmp)
+ fdtmp = sftmp(BLOCK);
+ sfwrite(fdtmp,(char*)first,c);
+ offset +=c;
+ }
+ }
+ failed:
+ if(fdtmp)
+ sfclose(fdtmp);
+}
+
+int
+b_cut(int argc, char** argv, Shbltin_t* context)
+{ /*
+ * entry point of the cut command
+ *
+ * parses the options with optget(), builds the Cut_t with cutinit() and
+ * runs cutfields() (C_FIELDS) or cutcols() (otherwise) over every file
+ * operand; standard input is used when there is no operand and for "-"
+ * returns 0 when error_info.errors is 0 at the end, 1 otherwise
+ */
+ register char* cp = 0; /* the list operand first, a file name later on */
+ register Sfio_t* fp;
+ char* s;
+ int n;
+ Cut_t* cut;
+ int mode = 0; /* C_* bits */
+ Delim_t wdelim;
+ Delim_t ldelim;
+ size_t reclen = 0;
+
+ cmdinit(argc, argv, context, ERROR_CATALOG, 0);
+ wdelim.chr = '\t'; /* default field delimiter */
+ ldelim.chr = '\n'; /* default line delimiter */
+ wdelim.len = ldelim.len = 1;
+ for (;;)
+ {
+ switch (optget(argv, usage))
+ {
+ case 0: /* leaves the option loop */
+ break;
+ case 'b':
+ case 'c':
+ if(mode&C_FIELDS) /* -f excludes -b and -c */
+ {
+ error(2, "f option already specified");
+ continue;
+ }
+ cp = opt_info.arg;
+ if(opt_info.option[1]=='b') /* tells -b from -c */
+ mode |= C_BYTES;
+ else
+ mode |= C_CHARS;
+ continue;
+ case 'D':
+ ldelim.str = opt_info.arg;
+ if (mbwide())
+ {
+ s = opt_info.arg;
+ ldelim.chr = mbchar(s);
+ if ((n = s - opt_info.arg) > 1) /* multibyte delimiter: keep its length in bytes */
+ {
+ ldelim.len = n;
+ continue;
+ }
+ }
+ ldelim.chr = *(unsigned char*)opt_info.arg; /* single byte delimiter */
+ ldelim.len = 1;
+ continue;
+ case 'd':
+ wdelim.str = opt_info.arg;
+ if (mbwide())
+ {
+ s = opt_info.arg;
+ wdelim.chr = mbchar(s);
+ if ((n = s - opt_info.arg) > 1) /* multibyte delimiter: keep its length in bytes */
+ {
+ wdelim.len = n;
+ continue;
+ }
+ }
+ wdelim.chr = *(unsigned char*)opt_info.arg; /* single byte delimiter */
+ wdelim.len = 1;
+ continue;
+ case 'f':
+ if(mode&(C_CHARS|C_BYTES)) /* -b and -c exclude -f */
+ {
+ error(2, "c option already specified");
+ continue;
+ }
+ cp = opt_info.arg;
+ mode |= C_FIELDS;
+ continue;
+ case 'n':
+ mode |= C_NOSPLIT;
+ continue;
+ case 'N':
+ mode |= C_NONEWLINE;
+ continue;
+ case 'R':
+ case 'r':
+ if(opt_info.num>0) /* other values leave reclen at 0 */
+ reclen = opt_info.num;
+ continue;
+ case 's':
+ mode |= C_SUPRESS;
+ continue;
+ case ':':
+ error(2, "%s", opt_info.arg);
+ break; /* option parsing stops here; error_info.errors is checked below */
+ case '?':
+ error(ERROR_usage(2), "%s", opt_info.arg);
+ break;
+ }
+ break;
+ }
+ argv += opt_info.index;
+ if (error_info.errors)
+ error(ERROR_usage(2), "%s",optusage(NiL));
+ if(!cp) /* none of -b, -c and -f supplied a list */
+ {
+ error(2, "b, c or f option must be specified");
+ error(ERROR_usage(2), "%s", optusage(NiL));
+ }
+ if(!*cp)
+ error(3, "non-empty b, c or f option must be specified");
+ if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) /* C_SUPRESS without C_FIELDS */
+ error(3, "s option requires f option");
+ cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
+ if(cp = *argv) /* first file operand, if any */
+ argv++;
+ do
+ {
+ if(!cp || streq(cp,"-"))
+ fp = sfstdin;
+ else if(!(fp = sfopen(NiL,cp,"r")))
+ {
+ error(ERROR_system(0),"%s: cannot open",cp);
+ continue; /* on to the next operand */
+ }
+ if(mode&C_FIELDS)
+ cutfields(cut,fp,sfstdout);
+ else
+ cutcols(cut,fp,sfstdout);
+ if(fp!=sfstdin)
+ sfclose(fp);
+ } while(cp = *argv++);
+ if (sfsync(sfstdout))
+ error(ERROR_system(0), "write error");
+ return error_info.errors != 0;
+}