diff options
Diffstat (limited to 'usr/src/lib/libcmd/common/wclib.c')
-rw-r--r-- | usr/src/lib/libcmd/common/wclib.c | 424 |
1 files changed, 362 insertions, 62 deletions
diff --git a/usr/src/lib/libcmd/common/wclib.c b/usr/src/lib/libcmd/common/wclib.c index 67adb5fe15..c7ce5a338a 100644 --- a/usr/src/lib/libcmd/common/wclib.c +++ b/usr/src/lib/libcmd/common/wclib.c @@ -1,7 +1,7 @@ /*********************************************************************** * * * This software is part of the ast package * -* Copyright (c) 1992-2008 AT&T Intellectual Property * +* Copyright (c) 1992-2009 AT&T Intellectual Property * * and is licensed under the * * Common Public License, Version 1.0 * * by AT&T Intellectual Property * @@ -34,6 +34,7 @@ #include <wchar.h> #include <wctype.h> +#include <lc.h> #else @@ -43,80 +44,197 @@ #endif -#define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1)) -#define mbok(p,n) (((n)<1)?0:mbwide()?((*ast.mb_towc)(NiL,(char*)(p),n)>=0):1) +#define WC_SP 0x08 +#define WC_NL 0x10 +#define WC_MB 0x20 +#define WC_ERR 0x40 -Wc_t *wc_init(int mode) +#define eol(c) ((c)&WC_NL) +#define mbc(c) ((c)&WC_MB) +#define spc(c) ((c)&WC_SP) +#define mbwc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n) + +Wc_t* wc_init(int mode) { register int n; register int w; Wc_t* wp; - if(!(wp = (Wc_t*)stakalloc(sizeof(Wc_t)))) - return(0); - wp->mode = mode; + if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t)))) + return 0; + if (!mbwide()) + wp->mb = 0; +#if _hdr_wchar && _hdr_wctype && _lib_iswctype + else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8)) + wp->mb = 1; +#endif + else + wp->mb = -1; w = mode & WC_WORDS; - for(n=(1<<CHAR_BIT);--n >=0;) - wp->space[n] = w ? !!isspace(n) : 0; - wp->space['\n'] = -1; - return(wp); + for (n = (1<<CHAR_BIT); --n >= 0;) + wp->type[n] = (w && isspace(n)) ? WC_SP : 0; + wp->type['\n'] = WC_SP|WC_NL; + if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0) + { + for (n = 0; n < 64; n++) + { + wp->type[0x80+n] |= WC_MB; + if (n<32) + wp->type[0xc0+n] |= WC_MB+1; + else if (n<48) + wp->type[0xc0+n] |= WC_MB+2; + else if (n<56) + wp->type[0xc0+n] |= WC_MB+3; + else if (n<60) + wp->type[0xc0+n] |= WC_MB+4; + else if (n<62) + wp->type[0xc0+n] |= WC_MB+5; + } + wp->type[0xc0] = WC_MB|WC_ERR; + wp->type[0xc1] = WC_MB|WC_ERR; + wp->type[0xfe] = WC_MB|WC_ERR; + wp->type[0xff] = WC_MB|WC_ERR; + } + wp->mode = mode; + return wp; +} + +static int invalid(const char *file, int nlines) +{ + error_info.file = (char*)file; + error_info.line = nlines; + error(ERROR_SYSTEM|1, "invalid multibyte character"); + error_info.file = 0; + error_info.line = 0; + return nlines; +} + +/* + * handle utf space characters + */ + +static int chkstate(int state, register unsigned int c) +{ + switch(state) + { + case 1: + state = (c==0x9a?4:0); + break; + case 2: + state = ((c==0x80||c==0x81)?6+(c&1):0); + break; + case 3: + state = (c==0x80?5:0); + break; + case 4: + state = (c==0x80?10:0); + break; + case 5: + state = (c==0x80?10:0); + break; + case 6: + state = 0; + if(c==0xa0 || c==0xa1) + return(10); + else if((c&0xf0)== 0x80) + { + if((c&=0xf)==7) + return(iswspace(0x2007)?10:0); + if(c<=0xb) + return(10); + } + else if(c==0xaf && iswspace(0x202f)) + return(10); + break; + case 7: + state = (c==0x9f?10:0); + break; + case 8: + return (iswspace(c)?10:0); + } + return state; } /* * compute the line, word, and character count for file <fd> */ + int wc_count(Wc_t *wp, Sfio_t *fd, const char* file) { - register signed char *space = wp->space; - register unsigned char *cp; + register char* type = wp->type; + register unsigned char* cp; register Sfoff_t nchars; register Sfoff_t nwords; register Sfoff_t nlines; - register Sfoff_t eline; - register Sfoff_t longest; + register Sfoff_t eline = -1; + register Sfoff_t longest = 0; register ssize_t c; - register unsigned char *endbuff; - register int lasttype = 1; + register unsigned char* endbuff; + register int lasttype = WC_SP; unsigned int lastchar; - unsigned char *buff; + ssize_t n; + ssize_t o; + unsigned char* buff; wchar_t x; + unsigned char side[32]; sfset(fd,SF_WRITE,1); nlines = nwords = nchars = 0; wp->longest = 0; - if (wp->mode & (WC_LONGEST|WC_MBYTE)) + if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS))) { - longest = 0; - eline = -1; cp = buff = endbuff = 0; for (;;) { - if (!mbok(cp, endbuff-cp)) - { - if (buff) - sfread(fd, buff, cp-buff); - if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, SF_LOCKR))) - break; - endbuff = (cp = buff) + sfvalue(fd); - } - nchars++; - x = mbchar(cp); - if (x == -1) + if (cp >= endbuff || (n = mbwc(x, cp, endbuff-cp)) < 0) { - if (eline != nlines && !(wp->mode & WC_QUIET)) + if ((o = endbuff-cp) < sizeof(side)) + { + if (buff) + { + if (o) + memcpy(side, cp, o); + mbinit(); + } + else + o = 0; + cp = side + o; + if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0) + { + if ((nchars - longest) > wp->longest) + wp->longest = nchars - longest; + break; + } + if ((c = sizeof(side) - o) > n) + c = n; + if (c) + memcpy(cp, buff, c); + endbuff = buff + n; + cp = side; + x = mbchar(cp); + if ((cp-side) < o) + { + cp = buff; + nchars += (cp-side) - 1; + } + else + cp = buff + (cp-side) - o; + } + else { - error_info.file = (char*)file; - error_info.line = eline = nlines; - error(ERROR_SYSTEM|1, "invalid multibyte character"); - error_info.file = 0; - error_info.line = 0; + cp++; + x = -1; } + if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET)) + eline = invalid(file, nlines); } - else if (x == '\n') + else + cp += n ? n : 1; + if (x == '\n') { if ((nchars - longest) > wp->longest) wp->longest = nchars - longest; - longest = nchars; + longest = nchars + 1; nlines++; lasttype = 1; } @@ -127,71 +245,253 @@ int wc_count(Wc_t *wp, Sfio_t *fd, const char* file) lasttype = 0; nwords++; } + nchars++; + } + } + else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST))) + { + if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST))) + { + while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) + { + nchars += c; + endbuff = cp + c; + if (*--endbuff == '\n') + nlines++; + else + *endbuff = '\n'; + for (;;) + if (*cp++ == '\n') + { + if (cp > endbuff) + break; + nlines++; + } + } + } + else + { + while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) + { + nchars += c; + /* check to see whether first character terminates word */ + if (c==1) + { + if (eol(lasttype)) + nlines++; + if ((c = type[*cp]) && !lasttype) + nwords++; + lasttype = c; + continue; + } + if (!lasttype && type[*cp]) + nwords++; + lastchar = cp[--c]; + *(endbuff = cp+c) = '\n'; + c = lasttype; + /* process each buffer */ + for (;;) + { + /* process spaces and new-lines */ + do + { + if (eol(c)) + for (;;) + { + /* check for end of buffer */ + if (cp > endbuff) + goto beob; + nlines++; + if (*cp != '\n') + break; + cp++; + } + } while (c = type[*cp++]); + /* skip over word characters */ + while (!(c = type[*cp++])); + nwords++; + } + beob: + if ((cp -= 2) >= buff) + c = type[*cp]; + else + c = lasttype; + lasttype = type[lastchar]; + /* see if was in word */ + if (!c && !lasttype) + nwords--; + } + if (eol(lasttype)) + nlines++; + else if (!lasttype) + nwords++; } } else { - for (;;) + int lineoff=0; + int skip=0; + int adjust=0; + int state=0; + int oldc; + int xspace; + int wasspace = 1; + unsigned char* start; + + lastchar = 0; + start = (endbuff = side) + 1; + xspace = iswspace(0xa0) || iswspace(0x85); + while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) { - /* fill next buffer and check for end-of-file */ - if (!(buff = (unsigned char*)sfreserve(fd, 0, 0)) || (c = sfvalue(fd)) <= 0) - break; - sfread(fd,(char*)(cp=buff),c); nchars += c; + start = cp-lineoff; /* check to see whether first character terminates word */ if(c==1) { - if(endline(lasttype)) + if(eol(lasttype)) nlines++; - if((c = space[*cp]) && !lasttype) + if((c = type[*cp]) && !lasttype) nwords++; lasttype = c; + endbuff = start; continue; } - if(!lasttype && space[*cp]) - nwords++; lastchar = cp[--c]; - cp[c] = '\n'; endbuff = cp+c; + cp[c] = '\n'; + if(mbc(lasttype)) + { + c = lasttype; + goto mbyte; + } + if(!lasttype && spc(type[*cp])) + nwords++; c = lasttype; /* process each buffer */ for (;;) { /* process spaces and new-lines */ - do if (endline(c)) + spaces: + do { - for (;;) + if (eol(c)) { /* check for end of buffer */ if (cp > endbuff) goto eob; + if(wp->mode&WC_LONGEST) + { + if((cp-start)-adjust > longest) + longest = (cp-start)-adjust-1; + start = cp; + } nlines++; - if (*cp != '\n') + nchars -= adjust; + adjust = 0; + } + } while (spc(c = type[*cp++])); + wasspace=1; + if(mbc(c)) + { + mbyte: + do + { + if(c&WC_ERR) + goto err; + if(skip && (c&7)) break; - cp++; + if(!skip) + { + if(!(c&7)) + { + skip=1; + break; + } + skip = (c&7); + adjust += skip; + state = 0; + if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3))) + oldc = *cp; + else if(xspace && cp[-1]==0xc2) + { + state = 8; + oldc = *cp; + } + } + else + { + skip--; + if(state && (state=chkstate(state,oldc))) + { + if(state==10) + { + if(!wasspace) + nwords++; + wasspace = 1; + state=0; + goto spaces; + } + oldc = *cp; + } + } + } while (mbc(c = type[*cp++])); + wasspace = 0; + if(skip) + { + if(eol(c) && (cp > endbuff)) + goto eob; + err: + skip = 0; + state = 0; + if(eline!=nlines && !(wp->mode & WC_QUIET)) + eline = invalid(file, nlines); + while(mbc(c) && ((c|WC_ERR) || (c&7)==0)) + c=type[*cp++]; + if(eol(c) && (cp > endbuff)) + { + c = WC_MB|WC_ERR; + goto eob; + } + if(mbc(c)) + goto mbyte; + else if(c&WC_SP) + goto spaces; + } + if(spc(c)) + { + nwords++; + continue; } - } while (c = space[*cp++]); + } /* skip over word characters */ - while(!(c = space[*cp++])); + while(!(c = type[*cp++])); + if(mbc(c)) + goto mbyte; nwords++; } eob: + lineoff = cp-start; if((cp -= 2) >= buff) - c = space[*cp]; + c = type[*cp]; else - c = lasttype; - lasttype = space[lastchar]; + c = lasttype; + lasttype = type[lastchar]; /* see if was in word */ if(!c && !lasttype) nwords--; } - if(endline(lasttype)) + if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest) + longest = (endbuff + 1 - start) - adjust - (lastchar == '\n'); + wp->longest = longest; + if (eol(lasttype)) nlines++; - else if(!lasttype) + else if (!lasttype) nwords++; + nchars -= adjust; } wp->chars = nchars; wp->words = nwords; wp->lines = nlines; - return(0); + return 0; } + |