/* * isutf8.c - do the input files look like valid utf-8 byte streams? * * Copyright (C) 2005 Lars Wirzenius * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #define VERSION "1.1" /* * Code to indicate an invalid UTF8 character. */ enum { INVALID_CHAR = 0xffffffff }; /* * Produce shortest UTF8 encoding of a 31-bit value in 'u', returning it * in the array 'buf'. Return the number of bytes in the encoded value. * If the value is too large (more than 32 bits or would take more than * 'maxbytes' bytes), return -1. */ static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes) { static const struct { int nbytes; unsigned long max; } tab[] = { { 1, 0x0000007F }, { 2, 0x000007FF }, { 3, 0x0000FFFF }, { 4, 0x001FFFFF }, { 5, 0x03FFFFFF }, { 6, 0x7FFFFFFF }, }; static const int ntab = sizeof(tab) / sizeof(tab[0]); int i, j; if (u > tab[ntab-1].max) return -1; for (i = 0; i < ntab; ++i) { if (u <= tab[i].max) break; } assert(i < ntab); if (tab[i].nbytes > maxbytes) return -1; if (tab[i].nbytes == 1) { /* Special case */ buf[0] = u; } else { for (j = tab[i].nbytes-1; j > 0; --j) { buf[j] = 0x80 | (u & 0x3f); u >>= 6; } unsigned char mask = ~(0xFF >> tab[i].nbytes); buf[0] = mask | u; } return tab[i].nbytes; } /* * Return number of ones at the top of a byte. * * I'm pretty sure there is a fancy trick to do this without a loop, * but I'm too tired to figure it out now. --liw */ static int high_ones(int c) { int n; for (n = 0; (c & 0x80) == 0x80; c <<= 1) ++n; return n; } /* * Decode a UTF8 character from an array of bytes. Return character code. * Upon error, return INVALID_CHAR. */ static unsigned long decodeutf8(unsigned char *buf, int nbytes) { unsigned long u; int i, j; if (nbytes <= 0) return INVALID_CHAR; if (nbytes == 1) { if (buf[0] >= 0x80) return INVALID_CHAR; return buf[0]; } i = high_ones(buf[0]); if (i != nbytes) return INVALID_CHAR; u = buf[0] & (0xff >> i); for (j = 1; j < nbytes; ++j) { if ((buf[j] & 0xC0) != 0x80) return INVALID_CHAR; u = (u << 6) | (buf[j] & 0x3f); } /* Conforming UTF-8 cannot contain codes 0xd800–0xdfff (UTF-16 surrogates) as well as 0xfffe and 0xffff. */ if (u >= 0xD800 && u <= 0xDFFF) return INVALID_CHAR; if (u == 0xFFFE || u == 0xFFFF) return INVALID_CHAR; return u; } /* * Determine if the contents of an open file form a valid UTF8 byte stream. * Do this by collecting bytes for a character into a buffer and then * decode the bytes and re-encode them and compare that they are identical * to the original bytes. If any step fails, return 0 for error. If EOF * is reached, return 1 for OK. */ static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) { enum { MAX_UTF8_BYTES = 6 }; unsigned char buf[MAX_UTF8_BYTES]; unsigned char buf2[MAX_UTF8_BYTES]; int nbytes, nbytes2; int c; unsigned long code; unsigned long line, col, byteoff; nbytes = 0; line = 1; col = 1; byteoff = 0; for (;;) { c = getc(file); if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) { /* New char starts, deal with previous one. */ if (nbytes > 0) { code = decodeutf8(buf, nbytes); if (code == INVALID_CHAR) goto error; nbytes2 = encodeutf8(code, buf2, MAX_UTF8_BYTES); if (nbytes != nbytes2 || memcmp(buf, buf2, nbytes) != 0) goto error; ++col; } nbytes = 0; /* If it's UTF8, start collecting again. */ if (c != EOF && c >= 0x80) buf[nbytes++] = c; } else { /* This is a continuation byte, append to buffer. */ if (nbytes == MAX_UTF8_BYTES) goto error; buf[nbytes++] = c; } if (c == EOF) break; else if (c == '\n') { ++line; byteoff = 0; col = 1; } else ++byteoff; } if (nbytes != 0) goto error; return 1; error: if (!quiet) { printf("%s: line %lu, char %lu, byte offset %lu: " "invalid UTF-8 code\n", filename, line, col, byteoff); } return 0; } static void usage(const char *program_name) { printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", program_name); printf("Check whether input files are valid UTF-8.\n"); printf("This is version %s.\n", VERSION); } int main(int argc, char **argv) { int i, ok; FILE *file; int quiet; struct option options[] = { { "help", no_argument, NULL, 'h' }, { "quiet", no_argument, &quiet, 1 }, { 0, 0, 0, 0 } }; int opt; quiet = 0; while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) { switch (opt) { case 0: break; case 'h': usage(argv[0]); exit(0); break; case 'q': quiet = 1; break; case '?': exit(EXIT_FAILURE); default: abort(); } } if (optind == argc) ok = is_utf8_byte_stream(stdin, "stdin", quiet); else { ok = 1; for (i = optind; i < argc; ++i) { file = fopen(argv[i], "r"); if (file == NULL) { fprintf(stderr, "isutf8: %s: error %d: %s\n", argv[i], errno, strerror(errno)); ok = 0; } else { if (! is_utf8_byte_stream(file, argv[i], quiet)) ok = 0; (void) fclose(file); } } } if (ok) exit(0); exit(EXIT_FAILURE); }