diff options
Diffstat (limited to 'src/lib9/utf/rune.c')
| -rw-r--r-- | src/lib9/utf/rune.c | 230 |
1 files changed, 202 insertions, 28 deletions
diff --git a/src/lib9/utf/rune.c b/src/lib9/utf/rune.c index 3d6831b02..cf98bab15 100644 --- a/src/lib9/utf/rune.c +++ b/src/lib9/utf/rune.c @@ -1,20 +1,21 @@ /* * The authors of this software are Rob Pike and Ken Thompson. * Copyright (c) 2002 by Lucent Technologies. + * Portions Copyright (c) 2009 The Go Authors. All rights reserved. * Permission to use, copy, modify, and distribute this software for any * purpose without fee is hereby granted, provided that this entire notice * is included in all copies of any software which is or includes a copy * or modification of this software and in all copies of the supporting * documentation for such software. * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. */ #include <stdarg.h> #include <string.h> -#include "plan9.h" #include "utf.h" +#include "utfdef.h" enum { @@ -23,27 +24,150 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */ - Bad = Runeerror + Bad = Runeerror, }; +/* + * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 + * This is a slower but "safe" version of the old chartorune + * that works on strings that are not necessarily null-terminated. + * + * If you know for sure that your string is null-terminated, + * chartorune will be a bit faster. + * + * It is guaranteed not to attempt to access "length" + * past the incoming pointer. This is to avoid + * possible access violations. If the string appears to be + * well-formed but incomplete (i.e., to get the whole Rune + * we'd need to read past str+length) then we'll set the Rune + * to Bad and return 0. + * + * Note that if we have decoding problems for other + * reasons, we return 1 instead of 0. + */ +int +charntorune(Rune *rune, const char *str, int length) +{ + int c, c1, c2, c3; + long l; + + /* When we're not allowed to read anything */ + if(length <= 0) { + goto badlen; + } + + /* + * one character sequence (7-bit value) + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + // If we can't read more than one character we must stop + if(length <= 1) { + goto badlen; + } + + /* + * two character sequence (11-bit value) + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + // If we can't read more than two characters we must stop + if(length <= 2) { + goto badlen; + } + + /* + * three character sequence (16-bit value) + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + if (length <= 3) + goto badlen; + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just fall through to bad. + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +badlen: + *rune = Bad; + return 0; + +} + + +/* + * This is the older "unsafe" version, which works fine on + * null-terminated strings. + */ int -chartorune(Rune *rune, char *str) +chartorune(Rune *rune, const char *str) { - int c, c1, c2; + int c, c1, c2, c3; long l; /* @@ -89,6 +213,26 @@ chartorune(Rune *rune, char *str) } /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* * bad decoding */ bad: @@ -97,9 +241,16 @@ bad: } int -runetochar(char *str, Rune *rune) +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { + *consumed = charntorune(rune, str, length); + return *rune != Runeerror || *consumed == 3; +} + +int +runetochar(char *str, const Rune *rune) { - long c; + /* Runes are signed, so convert to unsigned for range check. */ + unsigned long c; /* * one character sequence @@ -122,56 +273,79 @@ runetochar(char *str, Rune *rune) } /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int -runelen(long c) +runelen(Rune rune) { - Rune rune; char str[10]; - rune = c; return runetochar(str, &rune); } int -runenlen(Rune *r, int nrune) +runenlen(const Rune *r, int nrune) { int nb, c; nb = 0; while(nrune--) { c = *r++; - if(c <= Rune1) + if (c <= Rune1) nb++; - else - if(c <= Rune2) + else if (c <= Rune2) nb += 2; - else + else if (c <= Rune3) nb += 3; + else /* assert(c <= Rune4) */ + nb += 4; } return nb; } int -fullrune(char *str, int n) +fullrune(const char *str, int n) { - int c; - - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) + if (n > 0) { + int c = *(uchar*)str; + if (c < Tx) return 1; - if(n > 1) - if(c < T3 || n > 2) + if (n > 1) { + if (c < T3) return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } } return 0; } |
