$NetBSD$

* Fix for htmlspecialchars():
	http://svn.php.net/viewvc?view=revision&revision=289411
	http://svn.php.net/viewvc?view=revision&revision=289554
	http://svn.php.net/viewvc?view=revision&revision=289565
	http://svn.php.net/viewvc?view=revision&revision=289567
	http://svn.php.net/viewvc?view=revision&revision=289605

* In get_next_char(), return the byte from the "no mbseq buffer"
  (mbspace <= 0) path.  The assignment to *newpos was duplicated where
  the return belonged, so that path fell through into the charset
  switch and its MB_WRITE() stores into mbseq.

--- ext/standard/html.c.orig	2008-12-31 20:17:49.000000000 +0900
+++ ext/standard/html.c
@@ -484,15 +484,31 @@ struct basic_entities_dec {
 	}							\
 	mbseq[mbpos++] = (mbchar); }
 
-#define CHECK_LEN(pos, chars_need)			\
-	if((str_len - (pos)) < chars_need) {	\
-		*status = FAILURE;					\
-		return 0;							\
+/* skip one byte and return */
+#define MB_FAILURE(pos) do {	\
+		*newpos = pos + 1;	\
+		*status = FAILURE;	\
+		return 0;	\
+	} while (0)
+
+#define CHECK_LEN(pos, chars_need)			\
+	if (chars_need < 1) {						\
+		if((str_len - (pos)) < chars_need) {	\
+			*newpos = pos;						\
+			*status = FAILURE;					\
+			return 0;							\
+		}										\
+	} else {									\
+		if((str_len - (pos)) < chars_need) {	\
+			*newpos = pos + 1;					\
+			*status = FAILURE;					\
+			return 0;							\
+		}										\
 	}
 
 /* {{{ get_next_char
  */
-inline static unsigned short get_next_char(enum entity_charset charset,
+inline static unsigned int get_next_char(enum entity_charset charset,
 		unsigned char * str,
 		int str_len,
 		int * newpos,
@@ -503,205 +519,189 @@ inline static unsigned short get_next_ch
 	int pos = *newpos;
 	int mbpos = 0;
 	int mbspace = *mbseqlen;
-	unsigned short this_char = str[pos++];
+	unsigned int this_char = 0;
 	unsigned char next_char;
 
 	*status = SUCCESS;
-	
+
 	if (mbspace <= 0) {
 		*mbseqlen = 0;
-		return this_char;
+		CHECK_LEN(pos, 1);
+		*newpos = pos + 1;
+		return str[pos];
 	}
-	
-	MB_WRITE((unsigned char)this_char);
-	
+
 	switch (charset) {
 		case cs_utf_8:
 			{
-				unsigned long utf = 0;
-				int stat = 0;
-				int more = 1;
-
-				/* unpack utf-8 encoding into a wide char.
-				 * Code stolen from the mbstring extension */
-
-				do {
-					if (this_char < 0x80) {
-						more = 0;
-						if(stat) {
-							/* we didn't finish the UTF sequence correctly */
-							*status = FAILURE;
-						}
-						break;
-					} else if (this_char < 0xc0) {
-						switch (stat) {
-							case 0x10:	/* 2, 2nd */
-							case 0x21:	/* 3, 3rd */
-							case 0x32:	/* 4, 4th */
-							case 0x43:	/* 5, 5th */
-							case 0x54:	/* 6, 6th */
-								/* last byte in sequence */
-								more = 0;
-								utf |= (this_char & 0x3f);
-								this_char = (unsigned short)utf;
-								break;
-							case 0x20:	/* 3, 2nd */
-							case 0x31:	/* 4, 3rd */
-							case 0x42:	/* 5, 4th */
-							case 0x53:	/* 6, 5th */
-								/* penultimate char */
-								utf |= ((this_char & 0x3f) << 6);
-								stat++;
-								break;
-							case 0x30:	/* 4, 2nd */
-							case 0x41:	/* 5, 3rd */
-							case 0x52:	/* 6, 4th */
-								utf |= ((this_char & 0x3f) << 12);
-								stat++;
-								break;
-							case 0x40:	/* 5, 2nd */
-							case 0x51:
-								utf |= ((this_char & 0x3f) << 18);
-								stat++;
-								break;
-							case 0x50:	/* 6, 2nd */
-								utf |= ((this_char & 0x3f) << 24);
-								stat++;
-								break;
-							default:
-								/* invalid */
-								*status = FAILURE;
-								more = 0;
-						}
-					}
-					/* lead byte */
-					else if (this_char < 0xe0) {
-						stat = 0x10;	/* 2 byte */
-						utf = (this_char & 0x1f) << 6;
-						CHECK_LEN(pos, 1);
-					} else if (this_char < 0xf0) {
-						stat = 0x20;	/* 3 byte */
-						utf = (this_char & 0xf) << 12;
-						CHECK_LEN(pos, 2);
-					} else if (this_char < 0xf8) {
-						stat = 0x30;	/* 4 byte */
-						utf = (this_char & 0x7) << 18;
-						CHECK_LEN(pos, 3);
-					} else if (this_char < 0xfc) {
-						stat = 0x40;	/* 5 byte */
-						utf = (this_char & 0x3) << 24;
-						CHECK_LEN(pos, 4);
-					} else if (this_char < 0xfe) {
-						stat = 0x50;	/* 6 byte */
-						utf = (this_char & 0x1) << 30;
-						CHECK_LEN(pos, 5);
-					} else {
-						/* invalid; bail */
-						more = 0;
-						*status = FAILURE;
-						break;
+				unsigned char c;
+				CHECK_LEN(pos, 1);
+				c = str[pos];
+				if (c < 0x80) {
+					MB_WRITE(c);
+					this_char = c;
+					pos++;
+				} else if (c < 0xc0) {
+					MB_FAILURE(pos);
+				} else if (c < 0xe0) {
+					CHECK_LEN(pos, 2);
+					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+						MB_FAILURE(pos);
 					}
-
-					if (more) {
-						this_char = str[pos++];
-						MB_WRITE((unsigned char)this_char);
+					this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
+					if (this_char < 0x80) {
+						MB_FAILURE(pos);
 					}
-				} while (more);
+					MB_WRITE((unsigned char)c);
+					MB_WRITE((unsigned char)str[pos + 1]);
+					pos += 2;
+				} else if (c < 0xf0) {
+					CHECK_LEN(pos, 3);
+					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
+					if (this_char < 0x800) {
+						MB_FAILURE(pos);
+					}
+					MB_WRITE((unsigned char)c);
+					MB_WRITE((unsigned char)str[pos + 1]);
+					MB_WRITE((unsigned char)str[pos + 2]);
+					pos += 3;
+				} else if (c < 0xf8) {
+					CHECK_LEN(pos, 4);
+					if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
+						MB_FAILURE(pos);
+					}
+					this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
+					if (this_char < 0x10000) {
+						MB_FAILURE(pos);
+					}
+					MB_WRITE((unsigned char)c);
+					MB_WRITE((unsigned char)str[pos + 1]);
+					MB_WRITE((unsigned char)str[pos + 2]);
+					MB_WRITE((unsigned char)str[pos + 3]);
+					pos += 4;
+				} else {
+					MB_FAILURE(pos);
+				}
 			}
 			break;
 		case cs_big5:
 		case cs_gb2312:
 		case cs_big5hkscs:
 			{
+				CHECK_LEN(pos, 1);
+				this_char = str[pos++];
 				/* check if this is the first of a 2-byte sequence */
-				if (this_char >= 0xa1 && this_char <= 0xfe) {
+				if (this_char >= 0x81 && this_char <= 0xfe) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if ((next_char >= 0x40 && next_char <= 0x7e) ||
 							(next_char >= 0xa1 && next_char <= 0xfe)) {
 						/* yes, this a wide char */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
+				} else {
+					MB_WRITE(this_char);
 				}
-				break;
 			}
+			break;
 		case cs_sjis:
 			{
+				CHECK_LEN(pos, 1);
+				this_char = str[pos++];
 				/* check if this is the first of a 2-byte sequence */
-				if ( (this_char >= 0x81 && this_char <= 0x9f) ||
-					 (this_char >= 0xe0 && this_char <= 0xef)
-					) {
+				if ((this_char >= 0x81 && this_char <= 0x9f) ||
+					(this_char >= 0xe0 && this_char <= 0xfc)) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if ((next_char >= 0x40 && next_char <= 0x7e) ||
 						(next_char >= 0x80 && next_char <= 0xfc))
 					{
 						/* yes, this a wide char */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
+				} else {
+					MB_WRITE(this_char);
 				}
 				break;
 			}
 		case cs_eucjp:
 			{
+				CHECK_LEN(pos, 1);
+				this_char = str[pos++];
 				/* check if this is the first of a multi-byte sequence */
 				if (this_char >= 0xa1 && this_char <= 0xfe) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if (next_char >= 0xa1 && next_char <= 0xfe) {
 						/* yes, this a jis kanji char */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
 				} else if (this_char == 0x8e) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
-					next_char = str[pos];
+					next_char = str[pos++];
 					if (next_char >= 0xa1 && next_char <= 0xdf) {
 						/* JIS X 0201 kana */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
+						this_char = (this_char << 8) | next_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
 				} else if (this_char == 0x8f) {
 					/* peek at the next two char */
 					unsigned char next2_char;
 					CHECK_LEN(pos, 2);
 					next_char = str[pos];
-					next2_char = str[pos+1];
+					next2_char = str[pos + 1];
+					pos += 2;
 					if ((next_char >= 0xa1 && next_char <= 0xfe) &&
 						(next2_char >= 0xa1 && next2_char <= 0xfe)) {
 						/* JIS X 0212 hojo-kanji */
-						this_char <<= 8;
+						MB_WRITE(this_char);
 						MB_WRITE(next_char);
-						this_char |= next_char;
-						pos++;
-						this_char <<= 8;
 						MB_WRITE(next2_char);
-						this_char |= next2_char;
-						pos++;
+						this_char = (this_char << 16) | (next_char << 8) | next2_char;
+					} else {
+						MB_FAILURE(pos);
 					}
-					
+				} else {
+					MB_WRITE(this_char);
 				}
 				break;
 			}
 		default:
+			/* single-byte charsets */
+			CHECK_LEN(pos, 1);
+			this_char = str[pos++];
+			MB_WRITE(this_char);
 			break;
 	}
 	MB_RETURN;
@@ -1132,7 +1132,7 @@ PHPAPI char *php_escape_html_entities_ex
 			unsigned char mbsequence[16];	/* allow up to 15 characters in a multibyte sequence */
 			int mbseqlen = sizeof(mbsequence);
 			int status = SUCCESS;
-			unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
+			unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
 
 			if(status == FAILURE) {
 				/* invalid MB sequence */