summaryrefslogtreecommitdiff
path: root/isutf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'isutf8.c')
-rw-r--r--isutf8.c146
1 files changed, 146 insertions, 0 deletions
diff --git a/isutf8.c b/isutf8.c
new file mode 100644
index 0000000..88bb38b
--- /dev/null
+++ b/isutf8.c
@@ -0,0 +1,146 @@
+/*
+ * isutf8.c - do the input files look like valid utf-8 byte streams?
+ *
+ * Copyright (C) 2005 Lars Wirzenius
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <getopt.h>
+
+#define VERSION "1.0"
+
+/*
+ * I'm pretty sure there is a fancy trick to do this without a loop,
+ * but I'm too tired to figure it out now. --liw
+ */
+static int high_ones(int c)
+{
+ int n;
+
+ for (n = 0; (c & 0x80) == 0x80; c <<= 1)
+ ++n;
+ return n;
+}
+
+static int is_utf8_byte_stream(FILE *file, char *filename, int quiet)
+{
+ int c, n, remaining_bytes;
+ unsigned long line, col;
+
+ remaining_bytes = 0;
+ line = 1;
+ col = 1;
+ while ((c = getc(file)) != EOF) {
+ n = high_ones(c);
+ if (remaining_bytes > 0) {
+ if (n == 1) {
+ --remaining_bytes;
+ if (remaining_bytes == 0)
+ ++col;
+ } else
+ goto error;
+ } else if (n == 0) {
+ /* 7-bit character, skip, but adjust position */
+ if (c == '\n') {
+ ++line;
+ col = 1;
+ } else
+ ++col;
+ } else if (n == 1)
+ goto error; /* wrong place for continuation byte */
+ else
+ remaining_bytes = n - 1; /* start of multi-byte sequence */
+ }
+ if (remaining_bytes > 0)
+ goto error;
+ return 1;
+
+error:
+ if (!quiet) {
+ printf("%s: line %lu, col %lu: invalid UTF-8 code\n",
+ filename, line, col);
+ }
+ return 0;
+}
+
+static void usage(const char *program_name)
+{
+ printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", program_name);
+ printf("Check whether input files are valid UTF-8.\n");
+ printf("This is version %s.\n", VERSION);
+}
+
+int main(int argc, char **argv)
+{
+ int i, ok;
+ FILE *file;
+
+ int quiet;
+ struct option options[] = {
+ { "help", no_argument, NULL, 'h' },
+ { "quiet", no_argument, &quiet, 1 },
+ };
+ int opt;
+
+ quiet = 0;
+
+ while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) {
+ switch (opt) {
+ case 0:
+ break;
+
+ case 'h':
+ usage(argv[0]);
+ exit(0);
+ break;
+
+ case 'q':
+ quiet = 1;
+ break;
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+
+ if (optind == argc)
+ ok = is_utf8_byte_stream(stdin, "stdin", quiet);
+ else {
+ ok = 1;
+ for (i = optind; i < argc; ++i) {
+ file = fopen(argv[i], "r");
+ if (file == NULL) {
+ fprintf(stderr, "isutf8: %s: error %d: %s\n",
+ argv[i], errno, strerror(errno));
+ ok = 0;
+ } else {
+ ok = is_utf8_byte_stream(file, argv[i], quiet) && ok;
+ (void) fclose(file);
+ }
+ }
+ }
+
+ if (ok)
+ exit(0);
+ exit(EXIT_FAILURE);
+}