/*
 * isutf8.c - do the input files look like valid utf-8 byte streams?
 * 
 * Copyright (C) 2005  Lars Wirzenius
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <getopt.h>


/* Program version string, printed as part of the usage message. */
#define VERSION "1.1"


/*
 * Code to indicate an invalid UTF8 character.
 *
 * This is a macro rather than an enumeration constant because ISO C
 * (before C23) requires enumerator values to be representable as an
 * int, which 0xffffffff is not when int is 32 bits wide or narrower.
 * The UL suffix gives the constant type unsigned long, the type in
 * which decoded character codes are returned and compared.
 */
#define INVALID_CHAR 0xffffffffUL


/*
 * Write the shortest UTF8 encoding of the value 'u' into 'buf' and
 * return how many bytes were written (1 to 6).
 *
 * Return -1, leaving 'buf' untouched, when 'u' does not fit in 31 bits
 * or when its encoding would need more than 'maxbytes' bytes.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
        /* limit[k] is the largest value that fits in k+1 encoded bytes. */
        static const unsigned long limit[] = {
            0x0000007F,
            0x000007FF,
            0x0000FFFF,
            0x001FFFFF,
            0x03FFFFFF,
            0x7FFFFFFF,
        };
        enum { NLIMITS = sizeof(limit) / sizeof(limit[0]) };
        unsigned char lead_marker;
        int len;
        int pos;

        if (u > limit[NLIMITS - 1])
                return -1;

        /* Find the shortest length whose limit accommodates 'u'. */
        len = 1;
        while (u > limit[len - 1])
                ++len;
        assert(len <= NLIMITS);

        if ((size_t) len > maxbytes)
                return -1;

        /* Single byte: the value is stored as is. */
        if (len == 1) {
                buf[0] = u;
                return len;
        }

        /* Fill continuation bytes from the end, six payload bits each. */
        for (pos = len - 1; pos > 0; --pos) {
                buf[pos] = 0x80 | (u & 0x3f);
                u >>= 6;
        }

        /* Lead byte: 'len' one bits, a zero bit, then what is left of u. */
        lead_marker = (unsigned char) ~(0xFF >> len);
        buf[0] = lead_marker | u;

        return len;
}


/*
 * Return the number of consecutive one bits at the top of a byte,
 * counting down from bit 7 until the first zero bit.
 *
 * Only the low eight bits of 'c' are examined, so the result is always
 * in the range 0..8. The count is done on an unsigned copy so that a
 * negative or very large 'c' cannot cause a signed left shift to
 * overflow.
 */
static int high_ones(int c) {
        unsigned int bits = (unsigned int) c & 0xFFu;
        int n = 0;

        while ((bits & 0x80u) != 0) {
                ++n;
                /* Bring the next lower bit up to position 7. */
                bits = (bits << 1) & 0xFFu;
        }
        return n;
}


/*
 * Decode the 'nbytes' bytes at 'buf' as a single UTF8 character and
 * return its character code.
 *
 * INVALID_CHAR is returned when 'nbytes' is not positive, when the
 * lead byte does not announce exactly 'nbytes' bytes, when a trailing
 * byte is not a continuation byte, or when the decoded code is a
 * UTF-16 surrogate (0xD800-0xDFFF), 0xFFFE or 0xFFFF. Whether the
 * sequence is the shortest possible encoding is not checked here.
 */
static unsigned long decodeutf8(unsigned char *buf, int nbytes)
{
        unsigned long code;
        int lead_len;
        int k;

        if (nbytes <= 0)
                return INVALID_CHAR;

        /* A lone byte is acceptable only if it is plain 7-bit ASCII. */
        if (nbytes == 1) {
                if (buf[0] < 0x80)
                        return buf[0];
                return INVALID_CHAR;
        }

        /* The run of one bits in the lead byte gives the total length. */
        lead_len = high_ones(buf[0]);
        if (lead_len != nbytes)
                return INVALID_CHAR;

        /* Payload bits of the lead byte sit below its length marker. */
        code = buf[0] & (0xff >> lead_len);

        k = 1;
        while (k < nbytes) {
                unsigned char b = buf[k];

                /* Continuation bytes must look like 10xxxxxx. */
                if ((b & 0xC0) != 0x80)
                        return INVALID_CHAR;
                code = (code << 6) | (b & 0x3f);
                ++k;
        }

        /* Surrogates and the codes 0xFFFE/0xFFFF are not accepted. */
        if ((code >= 0xD800 && code <= 0xDFFF) ||
            code == 0xFFFE || code == 0xFFFF)
                return INVALID_CHAR;

        return code;
}


/*
 * Determine if the contents of an open file form a valid UTF8 byte stream.
 *
 * Bytes belonging to one character are collected into a buffer. When
 * the next character starts (or at EOF) the buffered bytes are decoded,
 * re-encoded, and compared with the original bytes; a mismatch means
 * the sequence was not the shortest valid encoding of its code.
 *
 * 'file' is read until EOF; 'filename' is used only in messages.
 * Unless 'quiet' is non-zero, the location at which invalid input was
 * detected (line, character within the line, byte offset within the
 * line) is printed to standard output.
 *
 * Return 1 if the whole stream is valid UTF8. Return 0 if it is not, or
 * if reading the stream fails; a read failure is reported on stderr.
 */
static int is_utf8_byte_stream(FILE *file, const char *filename, int quiet) {
        enum { MAX_UTF8_BYTES = 6 };
        unsigned char buf[MAX_UTF8_BYTES];
        unsigned char buf2[MAX_UTF8_BYTES];
        int nbytes, nbytes2;
        int c;
        int saved_errno;
        unsigned long code;
        unsigned long line, col, byteoff;

        nbytes = 0;
        line = 1;
        col = 1;
        byteoff = 0;

        for (;;) {
                c = getc(file);

                /*
                 * EOF is also what getc() returns on a read error; do
                 * not mistake an unreadable stream for a valid one.
                 */
                if (c == EOF && ferror(file)) {
                        saved_errno = errno;
                        goto read_error;
                }

                if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
                        /* New char starts, deal with previous one. */
                        if (nbytes > 0) {
                                code = decodeutf8(buf, nbytes);
                                if (code == INVALID_CHAR)
                                        goto error;
                                nbytes2 = encodeutf8(code, buf2,
                                                     MAX_UTF8_BYTES);
                                if (nbytes != nbytes2 ||
                                    memcmp(buf, buf2, nbytes) != 0)
                                        goto error;
                                /* The multi-byte char is complete. */
                                ++col;
                        }
                        nbytes = 0;
                        /* If it's UTF8, start collecting again. */
                        if (c != EOF && c >= 0x80)
                                buf[nbytes++] = c;
                } else {
                        /* This is a continuation byte, append to buffer. */
                        if (nbytes == MAX_UTF8_BYTES)
                                goto error;
                        buf[nbytes++] = c;
                }

                if (c == EOF)
                        break;

                if (c == '\n') {
                        ++line;
                        byteoff = 0;
                        col = 1;
                } else {
                        ++byteoff;
                        /*
                         * An ASCII byte is a complete character by
                         * itself, so it advances the column right away.
                         */
                        if (c < 0x80)
                                ++col;
                }
        }

        /* Any pending character was validated when EOF was seen. */
        return 1;

error:
        if (!quiet) {
                printf("%s: line %lu, char %lu, byte offset %lu: "
                       "invalid UTF-8 code\n", filename, line, col, byteoff);
        }
        return 0;

read_error:
        fprintf(stderr, "isutf8: %s: error %d: %s\n",
                filename, saved_errno, strerror(saved_errno));
        return 0;
}


/*
 * Print the usage summary, a one-line description of the program and
 * its version to standard output. 'program_name' is shown as the
 * command name in the usage line.
 */
static void usage(const char *program_name) {
	printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n"
	       "Check whether input files are valid UTF-8.\n"
	       "This is version %s.\n",
	       program_name, VERSION);
}


/*
 * Program entry point.
 *
 * Options: -h/--help prints the usage message and exits successfully;
 * -q/--quiet suppresses the report of where invalid input was found.
 * With no file arguments standard input is checked, otherwise every
 * named file is checked in turn. The exit status is 0 when all inputs
 * could be opened and are valid UTF-8, and EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv) {
	int quiet = 0;
	struct option long_opts[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "quiet", no_argument, &quiet, 1 },
		{ 0, 0, 0, 0 }
	};
	int all_valid = 1;
	int ch;
	int idx;

	for (;;) {
		ch = getopt_long(argc, argv, "hq", long_opts, NULL);
		if (ch == -1)
			break;

		/* --quiet: getopt_long stored the flag through &quiet. */
		if (ch == 0)
			continue;

		if (ch == 'h') {
			usage(argv[0]);
			exit(0);
		}
		if (ch == 'q') {
			quiet = 1;
			continue;
		}
		if (ch == '?')
			exit(EXIT_FAILURE);

		/* getopt_long returned something not asked for. */
		abort();
	}

	if (optind == argc) {
		/* No file operands: validate standard input. */
		all_valid = is_utf8_byte_stream(stdin, "stdin", quiet);
	} else {
		for (idx = optind; idx < argc; ++idx) {
			char *path = argv[idx];
			FILE *in = fopen(path, "r");

			if (in == NULL) {
				fprintf(stderr, "isutf8: %s: error %d: %s\n",
				        path, errno, strerror(errno));
				all_valid = 0;
				continue;
			}

			/* Keep going after a failure so every file is checked. */
			if (!is_utf8_byte_stream(in, path, quiet))
				all_valid = 0;
			(void) fclose(in);
		}
	}

	exit(all_valid ? 0 : EXIT_FAILURE);
}