diff options
Diffstat (limited to 'xmlstring.c')
-rw-r--r-- | xmlstring.c | 47 |
1 files changed, 27 insertions, 20 deletions
diff --git a/xmlstring.c b/xmlstring.c index 5c64053..5f239df 100644 --- a/xmlstring.c +++ b/xmlstring.c @@ -764,27 +764,34 @@ xmlCheckUTF8(const unsigned char *utf) int ix; unsigned char c; - for (ix = 0; (c = utf[ix]);) { - if (c & 0x80) { - if ((utf[ix + 1] & 0xc0) != 0x80) - return(0); - if ((c & 0xe0) == 0xe0) { - if ((utf[ix + 2] & 0xc0) != 0x80) - return(0); - if ((c & 0xf0) == 0xf0) { - if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80) - return(0); - ix += 4; - /* 4-byte code */ - } else - /* 3-byte code */ - ix += 3; - } else - /* 2-byte code */ - ix += 2; - } else - /* 1-byte code */ + /* + * utf is a string of 1, 2, 3 or 4 bytes. The valid strings + * are as follows (in "bit format"): + * 0xxxxxxx valid 1-byte + * 110xxxxx 10xxxxxx valid 2-byte + * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte + */ + for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */ + if (c & 0x80) { /* 1-byte code, starts with 10 */ ix++; + } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */ + if ((utf[ix+1] & 0xc0 ) != 0x80) + return 0; + ix += 2; + } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */ + if (((utf[ix+1] & 0xc0) != 0x80) || + ((utf[ix+2] & 0xc0) != 0x80)) + return 0; + ix += 3; + } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */ + if (((utf[ix+1] & 0xc0) != 0x80) || + ((utf[ix+2] & 0xc0) != 0x80) || + ((utf[ix+3] & 0xc0) != 0x80)) + return 0; + ix += 4; + } else /* unknown encoding */ + return 0; } return(1); } |