summaryrefslogtreecommitdiff
path: root/usr/src/lib/libcmd/common/wclib.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libcmd/common/wclib.c')
-rw-r--r--usr/src/lib/libcmd/common/wclib.c424
1 files changed, 362 insertions, 62 deletions
diff --git a/usr/src/lib/libcmd/common/wclib.c b/usr/src/lib/libcmd/common/wclib.c
index 67adb5fe15..c7ce5a338a 100644
--- a/usr/src/lib/libcmd/common/wclib.c
+++ b/usr/src/lib/libcmd/common/wclib.c
@@ -1,7 +1,7 @@
/***********************************************************************
* *
* This software is part of the ast package *
-* Copyright (c) 1992-2008 AT&T Intellectual Property *
+* Copyright (c) 1992-2009 AT&T Intellectual Property *
* and is licensed under the *
* Common Public License, Version 1.0 *
* by AT&T Intellectual Property *
@@ -34,6 +34,7 @@
#include <wchar.h>
#include <wctype.h>
+#include <lc.h>
#else
@@ -43,80 +44,197 @@
#endif
-#define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1))
-#define mbok(p,n) (((n)<1)?0:mbwide()?((*ast.mb_towc)(NiL,(char*)(p),n)>=0):1)
+#define WC_SP 0x08
+#define WC_NL 0x10
+#define WC_MB 0x20
+#define WC_ERR 0x40
-Wc_t *wc_init(int mode)
+#define eol(c) ((c)&WC_NL)
+#define mbc(c) ((c)&WC_MB)
+#define spc(c) ((c)&WC_SP)
+#define mbwc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
+
+Wc_t* wc_init(int mode)
{
register int n;
register int w;
Wc_t* wp;
- if(!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
- return(0);
- wp->mode = mode;
+ if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
+ return 0;
+ if (!mbwide())
+ wp->mb = 0;
+#if _hdr_wchar && _hdr_wctype && _lib_iswctype
+ else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
+ wp->mb = 1;
+#endif
+ else
+ wp->mb = -1;
w = mode & WC_WORDS;
- for(n=(1<<CHAR_BIT);--n >=0;)
- wp->space[n] = w ? !!isspace(n) : 0;
- wp->space['\n'] = -1;
- return(wp);
+ for (n = (1<<CHAR_BIT); --n >= 0;)
+ wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
+ wp->type['\n'] = WC_SP|WC_NL;
+ if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
+ {
+ for (n = 0; n < 64; n++)
+ {
+ wp->type[0x80+n] |= WC_MB;
+ if (n<32)
+ wp->type[0xc0+n] |= WC_MB+1;
+ else if (n<48)
+ wp->type[0xc0+n] |= WC_MB+2;
+ else if (n<56)
+ wp->type[0xc0+n] |= WC_MB+3;
+ else if (n<60)
+ wp->type[0xc0+n] |= WC_MB+4;
+ else if (n<62)
+ wp->type[0xc0+n] |= WC_MB+5;
+ }
+ wp->type[0xc0] = WC_MB|WC_ERR;
+ wp->type[0xc1] = WC_MB|WC_ERR;
+ wp->type[0xfe] = WC_MB|WC_ERR;
+ wp->type[0xff] = WC_MB|WC_ERR;
+ }
+ wp->mode = mode;
+ return wp;
+}
+
+static int invalid(const char *file, int nlines)
+{
+ error_info.file = (char*)file;
+ error_info.line = nlines;
+ error(ERROR_SYSTEM|1, "invalid multibyte character");
+ error_info.file = 0;
+ error_info.line = 0;
+ return nlines;
+}
+
+/*
+ * handle utf space characters
+ */
+
+static int chkstate(int state, register unsigned int c)
+{
+ switch(state)
+ {
+ case 1:
+ state = (c==0x9a?4:0);
+ break;
+ case 2:
+ state = ((c==0x80||c==0x81)?6+(c&1):0);
+ break;
+ case 3:
+ state = (c==0x80?5:0);
+ break;
+ case 4:
+ state = (c==0x80?10:0);
+ break;
+ case 5:
+ state = (c==0x80?10:0);
+ break;
+ case 6:
+ state = 0;
+ if(c==0xa0 || c==0xa1)
+ return(10);
+ else if((c&0xf0)== 0x80)
+ {
+ if((c&=0xf)==7)
+ return(iswspace(0x2007)?10:0);
+ if(c<=0xb)
+ return(10);
+ }
+ else if(c==0xaf && iswspace(0x202f))
+ return(10);
+ break;
+ case 7:
+ state = (c==0x9f?10:0);
+ break;
+ case 8:
+ return (iswspace(c)?10:0);
+ }
+ return state;
}
/*
* compute the line, word, and character count for file <fd>
*/
+
int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
{
- register signed char *space = wp->space;
- register unsigned char *cp;
+ register char* type = wp->type;
+ register unsigned char* cp;
register Sfoff_t nchars;
register Sfoff_t nwords;
register Sfoff_t nlines;
- register Sfoff_t eline;
- register Sfoff_t longest;
+ register Sfoff_t eline = -1;
+ register Sfoff_t longest = 0;
register ssize_t c;
- register unsigned char *endbuff;
- register int lasttype = 1;
+ register unsigned char* endbuff;
+ register int lasttype = WC_SP;
unsigned int lastchar;
- unsigned char *buff;
+ ssize_t n;
+ ssize_t o;
+ unsigned char* buff;
wchar_t x;
+ unsigned char side[32];
sfset(fd,SF_WRITE,1);
nlines = nwords = nchars = 0;
wp->longest = 0;
- if (wp->mode & (WC_LONGEST|WC_MBYTE))
+ if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
{
- longest = 0;
- eline = -1;
cp = buff = endbuff = 0;
for (;;)
{
- if (!mbok(cp, endbuff-cp))
- {
- if (buff)
- sfread(fd, buff, cp-buff);
- if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, SF_LOCKR)))
- break;
- endbuff = (cp = buff) + sfvalue(fd);
- }
- nchars++;
- x = mbchar(cp);
- if (x == -1)
+ if (cp >= endbuff || (n = mbwc(x, cp, endbuff-cp)) < 0)
{
- if (eline != nlines && !(wp->mode & WC_QUIET))
+ if ((o = endbuff-cp) < sizeof(side))
+ {
+ if (buff)
+ {
+ if (o)
+ memcpy(side, cp, o);
+ mbinit();
+ }
+ else
+ o = 0;
+ cp = side + o;
+ if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
+ {
+ if ((nchars - longest) > wp->longest)
+ wp->longest = nchars - longest;
+ break;
+ }
+ if ((c = sizeof(side) - o) > n)
+ c = n;
+ if (c)
+ memcpy(cp, buff, c);
+ endbuff = buff + n;
+ cp = side;
+ x = mbchar(cp);
+ if ((cp-side) < o)
+ {
+ cp = buff;
+ nchars += (cp-side) - 1;
+ }
+ else
+ cp = buff + (cp-side) - o;
+ }
+ else
{
- error_info.file = (char*)file;
- error_info.line = eline = nlines;
- error(ERROR_SYSTEM|1, "invalid multibyte character");
- error_info.file = 0;
- error_info.line = 0;
+ cp++;
+ x = -1;
}
+ if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
+ eline = invalid(file, nlines);
}
- else if (x == '\n')
+ else
+ cp += n ? n : 1;
+ if (x == '\n')
{
if ((nchars - longest) > wp->longest)
wp->longest = nchars - longest;
- longest = nchars;
+ longest = nchars + 1;
nlines++;
lasttype = 1;
}
@@ -127,71 +245,253 @@ int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
lasttype = 0;
nwords++;
}
+ nchars++;
+ }
+ }
+ else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
+ {
+ if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
+ {
+ while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
+ {
+ nchars += c;
+ endbuff = cp + c;
+ if (*--endbuff == '\n')
+ nlines++;
+ else
+ *endbuff = '\n';
+ for (;;)
+ if (*cp++ == '\n')
+ {
+ if (cp > endbuff)
+ break;
+ nlines++;
+ }
+ }
+ }
+ else
+ {
+ while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
+ {
+ nchars += c;
+ /* check to see whether first character terminates word */
+ if (c==1)
+ {
+ if (eol(lasttype))
+ nlines++;
+ if ((c = type[*cp]) && !lasttype)
+ nwords++;
+ lasttype = c;
+ continue;
+ }
+ if (!lasttype && type[*cp])
+ nwords++;
+ lastchar = cp[--c];
+ *(endbuff = cp+c) = '\n';
+ c = lasttype;
+ /* process each buffer */
+ for (;;)
+ {
+ /* process spaces and new-lines */
+ do
+ {
+ if (eol(c))
+ for (;;)
+ {
+ /* check for end of buffer */
+ if (cp > endbuff)
+ goto beob;
+ nlines++;
+ if (*cp != '\n')
+ break;
+ cp++;
+ }
+ } while (c = type[*cp++]);
+ /* skip over word characters */
+ while (!(c = type[*cp++]));
+ nwords++;
+ }
+ beob:
+ if ((cp -= 2) >= buff)
+ c = type[*cp];
+ else
+ c = lasttype;
+ lasttype = type[lastchar];
+ /* see if was in word */
+ if (!c && !lasttype)
+ nwords--;
+ }
+ if (eol(lasttype))
+ nlines++;
+ else if (!lasttype)
+ nwords++;
}
}
else
{
- for (;;)
+ int lineoff=0;
+ int skip=0;
+ int adjust=0;
+ int state=0;
+ int oldc;
+ int xspace;
+ int wasspace = 1;
+ unsigned char* start;
+
+ lastchar = 0;
+ start = (endbuff = side) + 1;
+ xspace = iswspace(0xa0) || iswspace(0x85);
+ while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
{
- /* fill next buffer and check for end-of-file */
- if (!(buff = (unsigned char*)sfreserve(fd, 0, 0)) || (c = sfvalue(fd)) <= 0)
- break;
- sfread(fd,(char*)(cp=buff),c);
nchars += c;
+ start = cp-lineoff;
/* check to see whether first character terminates word */
if(c==1)
{
- if(endline(lasttype))
+ if(eol(lasttype))
nlines++;
- if((c = space[*cp]) && !lasttype)
+ if((c = type[*cp]) && !lasttype)
nwords++;
lasttype = c;
+ endbuff = start;
continue;
}
- if(!lasttype && space[*cp])
- nwords++;
lastchar = cp[--c];
- cp[c] = '\n';
endbuff = cp+c;
+ cp[c] = '\n';
+ if(mbc(lasttype))
+ {
+ c = lasttype;
+ goto mbyte;
+ }
+ if(!lasttype && spc(type[*cp]))
+ nwords++;
c = lasttype;
/* process each buffer */
for (;;)
{
/* process spaces and new-lines */
- do if (endline(c))
+ spaces:
+ do
{
- for (;;)
+ if (eol(c))
{
/* check for end of buffer */
if (cp > endbuff)
goto eob;
+ if(wp->mode&WC_LONGEST)
+ {
+ if((cp-start)-adjust > longest)
+ longest = (cp-start)-adjust-1;
+ start = cp;
+ }
nlines++;
- if (*cp != '\n')
+ nchars -= adjust;
+ adjust = 0;
+ }
+ } while (spc(c = type[*cp++]));
+ wasspace=1;
+ if(mbc(c))
+ {
+ mbyte:
+ do
+ {
+ if(c&WC_ERR)
+ goto err;
+ if(skip && (c&7))
break;
- cp++;
+ if(!skip)
+ {
+ if(!(c&7))
+ {
+ skip=1;
+ break;
+ }
+ skip = (c&7);
+ adjust += skip;
+ state = 0;
+ if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
+ oldc = *cp;
+ else if(xspace && cp[-1]==0xc2)
+ {
+ state = 8;
+ oldc = *cp;
+ }
+ }
+ else
+ {
+ skip--;
+ if(state && (state=chkstate(state,oldc)))
+ {
+ if(state==10)
+ {
+ if(!wasspace)
+ nwords++;
+ wasspace = 1;
+ state=0;
+ goto spaces;
+ }
+ oldc = *cp;
+ }
+ }
+ } while (mbc(c = type[*cp++]));
+ wasspace = 0;
+ if(skip)
+ {
+ if(eol(c) && (cp > endbuff))
+ goto eob;
+ err:
+ skip = 0;
+ state = 0;
+ if(eline!=nlines && !(wp->mode & WC_QUIET))
+ eline = invalid(file, nlines);
+ while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
+ c=type[*cp++];
+ if(eol(c) && (cp > endbuff))
+ {
+ c = WC_MB|WC_ERR;
+ goto eob;
+ }
+ if(mbc(c))
+ goto mbyte;
+ else if(c&WC_SP)
+ goto spaces;
+ }
+ if(spc(c))
+ {
+ nwords++;
+ continue;
}
- } while (c = space[*cp++]);
+ }
/* skip over word characters */
- while(!(c = space[*cp++]));
+ while(!(c = type[*cp++]));
+ if(mbc(c))
+ goto mbyte;
nwords++;
}
eob:
+ lineoff = cp-start;
if((cp -= 2) >= buff)
- c = space[*cp];
+ c = type[*cp];
else
- c = lasttype;
- lasttype = space[lastchar];
+ c = lasttype;
+ lasttype = type[lastchar];
/* see if was in word */
if(!c && !lasttype)
nwords--;
}
- if(endline(lasttype))
+ if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
+ longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
+ wp->longest = longest;
+ if (eol(lasttype))
nlines++;
- else if(!lasttype)
+ else if (!lasttype)
nwords++;
+ nchars -= adjust;
}
wp->chars = nchars;
wp->words = nwords;
wp->lines = nlines;
- return(0);
+ return 0;
}
+