$NetBSD$
* Fix htmlspecialchars() handling of invalid multibyte sequences (from upstream):
http://svn.php.net/viewvc?view=revision&revision=289411
http://svn.php.net/viewvc?view=revision&revision=289554
http://svn.php.net/viewvc?view=revision&revision=289565
http://svn.php.net/viewvc?view=revision&revision=289567
http://svn.php.net/viewvc?view=revision&revision=289605
--- ext/standard/html.c.orig 2008-12-31 20:17:49.000000000 +0900
+++ ext/standard/html.c
@@ -484,15 +484,31 @@ struct basic_entities_dec {
} \
mbseq[mbpos++] = (mbchar); }
-#define CHECK_LEN(pos, chars_need) \
- if((str_len - (pos)) < chars_need) { \
- *status = FAILURE; \
- return 0; \
+/* skip one byte and return */
+#define MB_FAILURE(pos) do { \
+ *newpos = pos + 1; \
+ *status = FAILURE; \
+ return 0; \
+ } while (0)
+
+#define CHECK_LEN(pos, chars_need) \
+ if (chars_need < 1) { \
+ if((str_len - (pos)) < chars_need) { \
+ *newpos = pos; \
+ *status = FAILURE; \
+ return 0; \
+ } \
+ } else { \
+ if((str_len - (pos)) < chars_need) { \
+ *newpos = pos + 1; \
+ *status = FAILURE; \
+ return 0; \
+ } \
}
/* {{{ get_next_char
*/
-inline static unsigned short get_next_char(enum entity_charset charset,
+inline static unsigned int get_next_char(enum entity_charset charset,
unsigned char * str,
int str_len,
int * newpos,
@@ -503,205 +519,189 @@ inline static unsigned short get_next_ch
int pos = *newpos;
int mbpos = 0;
int mbspace = *mbseqlen;
- unsigned short this_char = str[pos++];
+ unsigned int this_char = 0;
unsigned char next_char;
*status = SUCCESS;
-
+
if (mbspace <= 0) {
*mbseqlen = 0;
- return this_char;
+ CHECK_LEN(pos, 1);
+ *newpos = pos + 1;
+ return str[pos]; /* no room left in mbseq: consume one byte and return it as-is */
}
-
- MB_WRITE((unsigned char)this_char);
-
+
switch (charset) {
case cs_utf_8:
{
- unsigned long utf = 0;
- int stat = 0;
- int more = 1;
-
- /* unpack utf-8 encoding into a wide char.
- * Code stolen from the mbstring extension */
-
- do {
- if (this_char < 0x80) {
- more = 0;
- if(stat) {
- /* we didn't finish the UTF sequence correctly */
- *status = FAILURE;
- }
- break;
- } else if (this_char < 0xc0) {
- switch (stat) {
- case 0x10: /* 2, 2nd */
- case 0x21: /* 3, 3rd */
- case 0x32: /* 4, 4th */
- case 0x43: /* 5, 5th */
- case 0x54: /* 6, 6th */
- /* last byte in sequence */
- more = 0;
- utf |= (this_char & 0x3f);
- this_char = (unsigned short)utf;
- break;
- case 0x20: /* 3, 2nd */
- case 0x31: /* 4, 3rd */
- case 0x42: /* 5, 4th */
- case 0x53: /* 6, 5th */
- /* penultimate char */
- utf |= ((this_char & 0x3f) << 6);
- stat++;
- break;
- case 0x30: /* 4, 2nd */
- case 0x41: /* 5, 3rd */
- case 0x52: /* 6, 4th */
- utf |= ((this_char & 0x3f) << 12);
- stat++;
- break;
- case 0x40: /* 5, 2nd */
- case 0x51:
- utf |= ((this_char & 0x3f) << 18);
- stat++;
- break;
- case 0x50: /* 6, 2nd */
- utf |= ((this_char & 0x3f) << 24);
- stat++;
- break;
- default:
- /* invalid */
- *status = FAILURE;
- more = 0;
- }
- }
- /* lead byte */
- else if (this_char < 0xe0) {
- stat = 0x10; /* 2 byte */
- utf = (this_char & 0x1f) << 6;
- CHECK_LEN(pos, 1);
- } else if (this_char < 0xf0) {
- stat = 0x20; /* 3 byte */
- utf = (this_char & 0xf) << 12;
- CHECK_LEN(pos, 2);
- } else if (this_char < 0xf8) {
- stat = 0x30; /* 4 byte */
- utf = (this_char & 0x7) << 18;
- CHECK_LEN(pos, 3);
- } else if (this_char < 0xfc) {
- stat = 0x40; /* 5 byte */
- utf = (this_char & 0x3) << 24;
- CHECK_LEN(pos, 4);
- } else if (this_char < 0xfe) {
- stat = 0x50; /* 6 byte */
- utf = (this_char & 0x1) << 30;
- CHECK_LEN(pos, 5);
- } else {
- /* invalid; bail */
- more = 0;
- *status = FAILURE;
- break;
+ unsigned char c;
+ CHECK_LEN(pos, 1);
+ c = str[pos];
+ if (c < 0x80) {
+ MB_WRITE(c);
+ this_char = c;
+ pos++;
+ } else if (c < 0xc0) {
+ MB_FAILURE(pos);
+ } else if (c < 0xe0) {
+ CHECK_LEN(pos, 2);
+ if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+ MB_FAILURE(pos);
}
-
- if (more) {
- this_char = str[pos++];
- MB_WRITE((unsigned char)this_char);
+ this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
+ if (this_char < 0x80) {
+ MB_FAILURE(pos);
}
- } while (more);
+ MB_WRITE((unsigned char)c);
+ MB_WRITE((unsigned char)str[pos + 1]);
+ pos += 2;
+ } else if (c < 0xf0) {
+ CHECK_LEN(pos, 3);
+ if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+ MB_FAILURE(pos);
+ }
+ if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
+ MB_FAILURE(pos);
+ }
+ this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
+ if (this_char < 0x800) {
+ MB_FAILURE(pos);
+ }
+ MB_WRITE((unsigned char)c);
+ MB_WRITE((unsigned char)str[pos + 1]);
+ MB_WRITE((unsigned char)str[pos + 2]);
+ pos += 3;
+ } else if (c < 0xf8) {
+ CHECK_LEN(pos, 4);
+ if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
+ MB_FAILURE(pos);
+ }
+ if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
+ MB_FAILURE(pos);
+ }
+ if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
+ MB_FAILURE(pos);
+ }
+ this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
+ if (this_char < 0x10000) {
+ MB_FAILURE(pos);
+ }
+ MB_WRITE((unsigned char)c);
+ MB_WRITE((unsigned char)str[pos + 1]);
+ MB_WRITE((unsigned char)str[pos + 2]);
+ MB_WRITE((unsigned char)str[pos + 3]);
+ pos += 4;
+ } else {
+ MB_FAILURE(pos);
+ }
}
break;
case cs_big5:
case cs_gb2312:
case cs_big5hkscs:
{
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
/* check if this is the first of a 2-byte sequence */
- if (this_char >= 0xa1 && this_char <= 0xfe) {
+ if (this_char >= 0x81 && this_char <= 0xfe) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if ((next_char >= 0x40 && next_char <= 0x7e) ||
(next_char >= 0xa1 && next_char <= 0xfe)) {
/* yes, this a wide char */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ MB_FAILURE(pos);
}
-
+ } else {
+ MB_WRITE(this_char);
}
- break;
}
+ break;
case cs_sjis:
{
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
/* check if this is the first of a 2-byte sequence */
- if ( (this_char >= 0x81 && this_char <= 0x9f) ||
- (this_char >= 0xe0 && this_char <= 0xef)
- ) {
+ if ((this_char >= 0x81 && this_char <= 0x9f) ||
+ (this_char >= 0xe0 && this_char <= 0xfc)) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if ((next_char >= 0x40 && next_char <= 0x7e) ||
(next_char >= 0x80 && next_char <= 0xfc))
{
/* yes, this a wide char */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ MB_FAILURE(pos);
}
-
+ } else {
+ MB_WRITE(this_char);
}
break;
}
case cs_eucjp:
{
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
/* check if this is the first of a multi-byte sequence */
if (this_char >= 0xa1 && this_char <= 0xfe) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if (next_char >= 0xa1 && next_char <= 0xfe) {
/* yes, this a jis kanji char */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ MB_FAILURE(pos);
}
-
} else if (this_char == 0x8e) {
/* peek at the next char */
CHECK_LEN(pos, 1);
- next_char = str[pos];
+ next_char = str[pos++];
if (next_char >= 0xa1 && next_char <= 0xdf) {
/* JIS X 0201 kana */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
+ this_char = (this_char << 8) | next_char;
+ } else {
+ MB_FAILURE(pos);
}
-
} else if (this_char == 0x8f) {
/* peek at the next two char */
unsigned char next2_char;
CHECK_LEN(pos, 2);
next_char = str[pos];
- next2_char = str[pos+1];
+ next2_char = str[pos + 1];
+ pos += 2;
if ((next_char >= 0xa1 && next_char <= 0xfe) &&
(next2_char >= 0xa1 && next2_char <= 0xfe)) {
/* JIS X 0212 hojo-kanji */
- this_char <<= 8;
+ MB_WRITE(this_char);
MB_WRITE(next_char);
- this_char |= next_char;
- pos++;
- this_char <<= 8;
MB_WRITE(next2_char);
- this_char |= next2_char;
- pos++;
+ this_char = (this_char << 16) | (next_char << 8) | next2_char;
+ } else {
+ MB_FAILURE(pos);
}
-
+ } else {
+ MB_WRITE(this_char);
}
break;
}
default:
+ /* single-byte charsets */
+ CHECK_LEN(pos, 1);
+ this_char = str[pos++];
+ MB_WRITE(this_char);
break;
}
MB_RETURN;
@@ -1132,7 +1132,7 @@ PHPAPI char *php_escape_html_entities_ex
unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
int mbseqlen = sizeof(mbsequence);
int status = SUCCESS;
- unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
+ unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
if(status == FAILURE) {
/* invalid MB sequence */