summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Joey Hess <joey@gnu.kitenet.net> 2009-05-05 15:06:34 -0400
committer Joey Hess <joey@gnu.kitenet.net> 2009-05-05 15:06:34 -0400
commit a250ae89f37849be1caf204a07d2e4e563503390 (patch)
tree 0a0f8cbaf709e4e13a574312f0d7f59382090e15
parent 0aa82b9e712f62170d7f433b9fb181cdc6a60a92 (diff)
download moreutils-a250ae89f37849be1caf204a07d2e4e563503390.tar.gz
isutf8: Reject UTF-8-encoded UTF-16 surrogates. Closes: #525301 (Thanks, Jakub Wilk and liw)
-rwxr-xr-x check-isutf8 3
-rw-r--r-- debian/changelog 2
-rw-r--r-- isutf8.c 10
3 files changed, 14 insertions, 1 deletions
diff --git a/check-isutf8 b/check-isutf8
index 3abb315..83a4eed 100755
--- a/check-isutf8
+++ b/check-isutf8
@@ -39,5 +39,8 @@ check 1 '\xc2'
check 1 '\xc2\x20'
check 1 '\x20\xc2'
check 1 '\300\200'
+check 1 '\xed\xa0\x88\xed\xbd\x85' # UTF-16 surrogates
+check 1 '\xef\xbf\xbe' # 0xFFFE
+check 1 '\xef\xbf\xbf' # 0xFFFF
exit $failed
diff --git a/debian/changelog b/debian/changelog
index aac1f3f..7b638cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -3,6 +3,8 @@ moreutils (0.35) UNRELEASED; urgency=low
* ifdata: Don't assume that all interface names are 6 characters or less,
for instance "wmaster0" is longer. Increase the limit to 20 characters.
Closes: #526654 (Thanks, Alan Pope)
+ * isutf8: Reject UTF-8-encoded UTF-16 surrogates. Closes: #525301
+ (Thanks, Jakub Wilk and liw)
-- Joey Hess <joeyh@debian.org> Sat, 02 May 2009 20:40:23 -0400
diff --git a/isutf8.c b/isutf8.c
index 4306c7d..c5f5eeb 100644
--- a/isutf8.c
+++ b/isutf8.c
@@ -127,6 +127,14 @@ static unsigned long decodeutf8(unsigned char *buf, int nbytes)
return INVALID_CHAR;
u = (u << 6) | (buf[j] & 0x3f);
}
+
+ /* Conforming UTF-8 cannot contain codes 0xd800–0xdfff (UTF-16
+ surrogates) as well as 0xfffe and 0xffff. */
+ if (u >= 0xD800 && u <= 0xDFFF)
+ return INVALID_CHAR;
+ if (u == 0xFFFE || u == 0xFFFF)
+ return INVALID_CHAR;
+
return u;
}
@@ -145,7 +153,7 @@ static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
int nbytes, nbytes2;
int c;
unsigned long code;
- unsigned long line, col, byteoff;
+ unsigned long line, col, byteoff;
nbytes = 0;
line = 1;