diff options
| -rw-r--r-- | src/lib/math/asin.go | 4 | ||||
| -rw-r--r-- | src/lib/math/exp.go | 2 | ||||
| -rw-r--r-- | src/lib/math/log.go | 4 | ||||
| -rw-r--r-- | src/lib/math/main.go | 20 | ||||
| -rw-r--r-- | src/lib/math/pow.go | 4 | ||||
| -rw-r--r-- | src/lib/math/sinh.go | 4 | ||||
| -rw-r--r-- | src/lib/math/sqrt.go | 3 | ||||
| -rw-r--r-- | src/lib/math/tan.go | 2 | ||||
| -rw-r--r-- | src/lib9/utf/mkrunetype.c | 733 | ||||
| -rw-r--r-- | src/lib9/utf/rune.c | 230 | ||||
| -rw-r--r-- | src/lib9/utf/runetype.c | 1139 | ||||
| -rw-r--r-- | src/lib9/utf/utf.h | 248 | ||||
| -rw-r--r-- | src/lib9/utf/utfdef.h | 35 | ||||
| -rw-r--r-- | src/lib9/utf/utfecpy.c | 9 | ||||
| -rw-r--r-- | src/lib9/utf/utflen.c | 9 | ||||
| -rw-r--r-- | src/lib9/utf/utfnlen.c | 10 | ||||
| -rw-r--r-- | src/lib9/utf/utfrrune.c | 12 | ||||
| -rw-r--r-- | src/lib9/utf/utfrune.c | 10 | ||||
| -rw-r--r-- | src/lib9/utf/utfutf.c | 13 | ||||
| -rw-r--r-- | src/runtime/Makefile | 1 | ||||
| -rw-r--r-- | src/runtime/rune.c | 224 | ||||
| -rw-r--r-- | src/runtime/runtime.h | 2 | ||||
| -rw-r--r-- | src/runtime/string.c | 49 | ||||
| -rw-r--r-- | test/string_lit.go | 9 | 
24 files changed, 1520 insertions, 1256 deletions
| diff --git a/src/lib/math/asin.go b/src/lib/math/asin.go index 9a83e9e43..a0135f48f 100644 --- a/src/lib/math/asin.go +++ b/src/lib/math/asin.go @@ -34,7 +34,7 @@ asin(arg double)double  		sign = true;  	}  	if arg > 1 { -		panic "return sys.NaN()"; +		return sys.NaN();  	}  	temp = sqrt(1 - x*x); @@ -54,7 +54,7 @@ func  acos(arg double)double  {  	if(arg > 1 || arg < -1) { -		panic "return sys.NaN()"; +		return sys.NaN();  	}  	return pio2 - asin(arg);  } diff --git a/src/lib/math/exp.go b/src/lib/math/exp.go index 6be61afdf..b428273e5 100644 --- a/src/lib/math/exp.go +++ b/src/lib/math/exp.go @@ -40,7 +40,7 @@ exp(arg double) double  		return 0.;  	}  	if arg > maxf { -		panic "return sys.Inf(1)" +		return sys.Inf(1)  	}  	x = arg*log2e; diff --git a/src/lib/math/log.go b/src/lib/math/log.go index 1c44eb8a3..7ad809cb0 100644 --- a/src/lib/math/log.go +++ b/src/lib/math/log.go @@ -36,7 +36,7 @@ log(arg double) double  	var exp int;  	if arg <= 0 { -		panic "return sys.NaN()"; +		return sys.NaN();  	}  	exp,x = sys.frexp(arg); @@ -63,7 +63,7 @@ log10(arg double) double  {  	if arg <= 0 { -		panic "return sys.NaN()"; +		return sys.NaN();  	}  	return log(arg) * ln10o1;  } diff --git a/src/lib/math/main.go b/src/lib/math/main.go index 2fa7ea152..0006151d9 100644 --- a/src/lib/math/main.go +++ b/src/lib/math/main.go @@ -5,7 +5,25 @@  package main -import math "math" +//import math "math" +////////////////// + import math "asin" + import math "atan" + import math "atan2" + import math "exp" + import math "fabs" + import math "floor" + import math "fmod" + import math "hypot" + import math "log" + import math "pow" + import math "pow10" + import math "sin" + import math "sinh" + import math "sqrt" + import math "tan" + import math "tanh" +  const  ( diff --git a/src/lib/math/pow.go b/src/lib/math/pow.go index dba41efdc..958bb371c 100644 --- a/src/lib/math/pow.go +++ b/src/lib/math/pow.go @@ -26,14 +26,14 @@ pow(arg1,arg2 double) double  	if arg1 <= 0 {  		if(arg1 == 0) {  			if arg2 <= 0 { -				panic "return sys.NaN()"; +				return sys.NaN();  			}  			return 0;  		}  		temp = floor(arg2);  		if temp != arg2 { -			panic "return sys.NaN()"; +			panic sys.NaN();  		}  		l = long(temp); diff --git a/src/lib/math/sinh.go b/src/lib/math/sinh.go index 75f6ddd63..a475171d7 100644 --- a/src/lib/math/sinh.go +++ b/src/lib/math/sinh.go @@ -48,7 +48,7 @@ sinh(arg double) double  		temp = exp(arg)/2;  	case arg > 0.5: -//		temp = (exp(arg) - exp(-arg))/2; +		temp = (exp(arg) - exp(-arg))/2;  	default:  		argsq = arg*arg; @@ -71,5 +71,5 @@ cosh(arg double) double  	if arg > 21 {  		return exp(arg)/2;  	} -//	return (exp(arg) + exp(-arg))/2; +	return (exp(arg) + exp(-arg))/2;  } diff --git a/src/lib/math/sqrt.go b/src/lib/math/sqrt.go index c5c01584f..c1a9e8622 100644 --- a/src/lib/math/sqrt.go +++ b/src/lib/math/sqrt.go @@ -19,11 +19,10 @@ sqrt(arg double) double  	var x, temp double;  	var exp, i int; -/* BUG: NO isINF  	if sys.isInf(arg, 1) {  		return arg;  	} -*/ +  	if arg <= 0 {  		if arg < 0 {  			panic "return sys.NaN()" diff --git a/src/lib/math/tan.go b/src/lib/math/tan.go index 695352ae5..11c03009f 100644 --- a/src/lib/math/tan.go +++ b/src/lib/math/tan.go @@ -62,7 +62,7 @@ tan(arg double) double  	if flag {  		if(temp == 0) { -			panic "return sys.NaN()"; +			panic sys.NaN();  		}  		temp = 1/temp;  	} diff --git a/src/lib9/utf/mkrunetype.c b/src/lib9/utf/mkrunetype.c new file mode 100644 index 000000000..f1a9f8a77 --- /dev/null +++ b/src/lib9/utf/mkrunetype.c @@ -0,0 +1,733 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * make is(upper|lower|title|space|alpha)rune and + * to(upper|lower|title)rune from a UnicodeData.txt file. + * these can be found at unicode.org + * + * with -c, runs a check of the existing runetype functions vs. + * those extracted from UnicodeData. + * + * with -p, generates tables for pairs of chars, as well as for ranges + * and singletons. + * + * UnicodeData defines 4 fields of interest: + * 1) a category + * 2) an upper case mapping + * 3) a lower case mapping + * 4) a title case mapping + * + * toupper, tolower, and totitle are defined directly from the mapping. + * + * isalpharune(c) is true iff c is a "letter" category + * isupperrune(c) is true iff c is the target of toupperrune, + *	or is in the uppercase letter category + * similarly for islowerrune and istitlerune. + * isspacerune is true for space category chars, "C" locale white space chars, + *	and two additions: + *	0085	"next line" control char + *	feff]	"zero-width non-break space" + * isdigitrune is true iff c is a numeric-digit category. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <libgen.h> +#include "utf.h" +#include "utfdef.h" + +enum { +	/* +	 * fields in the unicode data file +	 */ +	FIELD_CODE, +	FIELD_NAME, +	FIELD_CATEGORY, +	FIELD_COMBINING, +	FIELD_BIDIR, +	FIELD_DECOMP, +	FIELD_DECIMAL_DIG, +	FIELD_DIG, +	FIELD_NUMERIC_VAL, +	FIELD_MIRRORED, +	FIELD_UNICODE_1_NAME, +	FIELD_COMMENT, +	FIELD_UPPER, +	FIELD_LOWER, +	FIELD_TITLE, +	NFIELDS, + +	MAX_LINE	= 1024, + +	TO_OFFSET	= 1 << 20, + +	NRUNES		= 1 << 21, +}; + +#define TO_DELTA(xmapped,x)	(TO_OFFSET + (xmapped) - (x)) + +static char	myisspace[NRUNES]; +static char	myisalpha[NRUNES]; +static char	myisdigit[NRUNES]; +static char	myisupper[NRUNES]; +static char	myislower[NRUNES]; +static char	myistitle[NRUNES]; + +static int	mytoupper[NRUNES]; +static int	mytolower[NRUNES]; +static int	mytotitle[NRUNES]; + +static void	check(void); +static void	mktables(char *src, int usepairs); +static void	fatal(const char *fmt, ...); +static int	mygetfields(char **fields, int nfields, char *str, const char *delim); +static int	getunicodeline(FILE *in, char **fields, char *buf); +static int	getcode(char *s); + +static void +usage(void) +{ +	fprintf(stderr, "usage: mktables [-cp] <UnicodeData.txt>\n"); +	exit(1); +} + +int +main(int argc, char *argv[]){ +	FILE *in; +	char buf[MAX_LINE], buf2[MAX_LINE]; +	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; +	char *p; +	int i, code, last, docheck, usepairs; + +	docheck = 0; +	usepairs = 0; +	ARGBEGIN{ +	case 'c': +		docheck = 1; +		break; +	case 'p': +		usepairs = 1; +		break; +	default: +		usage(); +	}ARGEND + +	if(argc != 1){ +		usage(); +	} + +	in = fopen(argv[0], "r"); +	if(in == NULL){ +		fatal("can't open %s", argv[0]); +	} + +	for(i = 0; i < NRUNES; i++){ +		mytoupper[i] = i; +		mytolower[i] = i; +		mytotitle[i] = i; +	} + +	/* +	 * make sure isspace has all of the "C" locale whitespace chars +	 */ +	myisspace['\t'] = 1; +	myisspace['\n'] = 1; +	myisspace['\r'] = 1; +	myisspace['\f'] = 1; +	myisspace['\v'] = 1; + +	/* +	 * a couple of other exceptions +	 */ +	myisspace[0x85] = 1;	/* control char, "next line" */ +	myisspace[0xfeff] = 1;	/* zero-width non-break space */ + +	last = -1; +	while(getunicodeline(in, fields, buf)){ +		code = getcode(fields[FIELD_CODE]); +                if (code >= NRUNES) +                  fatal("code-point value too big: %x", code); +		if(code <= last) +			fatal("bad code sequence: %x then %x", last, code); +		last = code; + +		/* +		 * check for ranges +		 */ +		p = fields[FIELD_CATEGORY]; +		if(strstr(fields[FIELD_NAME], ", First>") != NULL){ +			if(!getunicodeline(in, fields2, buf2)) +				fatal("range start at eof"); +			if (strstr(fields2[FIELD_NAME], ", Last>") == NULL) +				fatal("range start not followed by range end"); +			last = getcode(fields2[FIELD_CODE]); +			if(last <= code) +				fatal("range out of sequence: %x then %x", code, last); +			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) +				fatal("range with mismatched category"); +		} + +		/* +		 * set properties and conversions +		 */ +		for (; code <= last; code++){ +			if(p[0] == 'L') +				myisalpha[code] = 1; +			if(p[0] == 'Z') +				myisspace[code] = 1; + +			if(strcmp(p, "Lu") == 0) +				myisupper[code] = 1; +			if(strcmp(p, "Ll") == 0) +				myislower[code] = 1; + +			if(strcmp(p, "Lt") == 0) +				myistitle[code] = 1; + +			if(strcmp(p, "Nd") == 0) +				myisdigit[code] = 1; + +			/* +			 * when finding conversions, also need to mark +			 * upper/lower case, since some chars, like +			 * "III" (0x2162), aren't defined as letters but have a +			 * lower case mapping ("iii" (0x2172)). +			 */ +			if(fields[FIELD_UPPER][0] != '\0'){ +				mytoupper[code] = getcode(fields[FIELD_UPPER]); +			} +			if(fields[FIELD_LOWER][0] != '\0'){ +				mytolower[code] = getcode(fields[FIELD_LOWER]); +			} +			if(fields[FIELD_TITLE][0] != '\0'){ +				mytotitle[code] = getcode(fields[FIELD_TITLE]); +			} +		} +	} + +	fclose(in); + +	/* +	 * check for codes with no totitle mapping but a toupper mapping. +	 * these appear in UnicodeData-2.0.14.txt, but are almost certainly +	 * erroneous. +	 */ +	for(i = 0; i < NRUNES; i++){ +		if(mytotitle[i] == i +		&& mytoupper[i] != i +		&& !myistitle[i]) +			fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]); +	} + +	/* +	 * make sure isupper[c] is true if for some x toupper[x]  == c +	 * ditto for islower and istitle +	 */ +	for(i = 0; i < NRUNES; i++) { +		if(mytoupper[i] != i) +			myisupper[mytoupper[i]] = 1; +		if(mytolower[i] != i) +			myislower[mytolower[i]] = 1; +		if(mytotitle[i] != i) +			myistitle[mytotitle[i]] = 1; +	} + +	if(docheck){ +		check(); +	}else{ +		mktables(argv[0], usepairs); +	} +	return 0; +} + +/* + * generate a properties array for ranges, clearing those cases covered. + * if force, generate one-entry ranges for singletons. + */ +static int +mkisrange(const char* label, char* prop, int force) +{ +	int start, stop, some; + +	/* +	 * first, the ranges +	 */ +	some = 0; +	for(start = 0; start < NRUNES; ) { +		if(!prop[start]){ +			start++; +			continue; +		} + +		for(stop = start + 1; stop < NRUNES; stop++){ +			if(!prop[stop]){ +				break; +			} +			prop[stop] = 0; +		} +		if(force || stop != start + 1){ +			if(!some){ +				printf("static Rune __is%sr[] = {\n", label); +				some = 1; +			} +			prop[start] = 0; +			printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1); +		} + +		start = stop; +	} +	if(some) +		printf("};\n\n"); +	return some; +} + +/* + * generate a mapping array for pairs with a skip between, + * clearing those entries covered. + */ +static int +mkispair(const char *label, char *prop) +{ +	int start, stop, some; + +	some = 0; +	for(start = 0; start + 2 < NRUNES; ) { +		if(!prop[start]){ +			start++; +			continue; +		} + +		for(stop = start + 2; stop < NRUNES; stop += 2){ +			if(!prop[stop]){ +				break; +			} +			prop[stop] = 0; +		} +		if(stop != start + 2){ +			if(!some){ +				printf("static Rune __is%sp[] = {\n", label); +				some = 1; +			} +			prop[start] = 0; +			printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2); +		} + +		start = stop; +	} +	if(some) +		printf("};\n\n"); +	return some; +} + +/* + * generate a properties array for singletons, clearing those cases covered. + */ +static int +mkissingle(const char *label, char *prop) +{ +	int start, some; + +	some = 0; +	for(start = 0; start < NRUNES; start++) { +		if(!prop[start]){ +			continue; +		} + +		if(!some){ +			printf("static Rune __is%ss[] = {\n", label); +			some = 1; +		} +		prop[start] = 0; +		printf("\t0x%.4x,\n", start); +	} +	if(some) +		printf("};\n\n"); +	return some; +} + +/* + * generate tables and a function for is<label>rune + */ +static void +mkis(const char* label, char* prop, int usepairs) +{ +	int isr, isp, iss; + +	isr = mkisrange(label, prop, 0); +	isp = 0; +	if(usepairs) +		isp = mkispair(label, prop); +	iss = mkissingle(label, prop); + +	printf( +		"int\n" +		"is%srune(Rune c)\n" +		"{\n" +		"	Rune *p;\n" +		"\n", +		label); + +	if(isr) +		printf( +			"	p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n" +			"	if(p && c >= p[0] && c <= p[1])\n" +			"		return 1;\n", +			label, label); + +	if(isp) +		printf( +			"	p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n" +			"	if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" +			"		return 1;\n", +			label, label); + +	if(iss) +		printf( +			"	p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n" +			"	if(p && c == p[0])\n" +			"		return 1;\n", +			label, label); + + +	printf( +		"	return 0;\n" +		"}\n" +		"\n" +	); +} + +/* + * generate a mapping array for ranges, clearing those entries covered. + * if force, generate one-entry ranges for singletons. + */ +static int +mktorange(const char* label, int* map, int force) +{ +	int start, stop, delta, some; + +	some = 0; +	for(start = 0; start < NRUNES; ) { +		if(map[start] == start){ +			start++; +			continue; +		} + +		delta = TO_DELTA(map[start], start); +		if(delta != (Rune)delta) +			fatal("bad map delta %d", delta); +		for(stop = start + 1; stop < NRUNES; stop++){ +			if(TO_DELTA(map[stop], stop) != delta){ +				break; +			} +			map[stop] = stop; +		} +		if(stop != start + 1){ +			if(!some){ +				printf("static Rune __to%sr[] = {\n", label); +				some = 1; +			} +			map[start] = start; +			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta); +		} + +		start = stop; +	} +	if(some) +		printf("};\n\n"); +	return some; +} + +/* + * generate a mapping array for pairs with a skip between, + * clearing those entries covered. + */ +static int +mktopair(const char* label, int* map) +{ +	int start, stop, delta, some; + +	some = 0; +	for(start = 0; start + 2 < NRUNES; ) { +		if(map[start] == start){ +			start++; +			continue; +		} + +		delta = TO_DELTA(map[start], start); +		if(delta != (Rune)delta) +			fatal("bad map delta %d", delta); +		for(stop = start + 2; stop < NRUNES; stop += 2){ +			if(TO_DELTA(map[stop], stop) != delta){ +				break; +			} +			map[stop] = stop; +		} +		if(stop != start + 2){ +			if(!some){ +				printf("static Rune __to%sp[] = {\n", label); +				some = 1; +			} +			map[start] = start; +			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta); +		} + +		start = stop; +	} +	if(some) +		printf("};\n\n"); +	return some; +} + +/* + * generate a mapping array for singletons, clearing those entries covered. + */ +static int +mktosingle(const char* label, int* map) +{ +	int start, delta, some; + +	some = 0; +	for(start = 0; start < NRUNES; start++) { +		if(map[start] == start){ +			continue; +		} + +		delta = TO_DELTA(map[start], start); +		if(delta != (Rune)delta) +			fatal("bad map delta %d", delta); +		if(!some){ +			printf("static Rune __to%ss[] = {\n", label); +			some = 1; +		} +		map[start] = start; +		printf("\t0x%.4x, %d,\n", start, delta); +	} +	if(some) +		printf("};\n\n"); +	return some; +} + +/* + * generate tables and a function for to<label>rune + */ +static void +mkto(const char* label, int* map, int usepairs) +{ +	int tor, top, tos; + +	tor = mktorange(label, map, 0); +	top = 0; +	if(usepairs) +		top = mktopair(label, map); +	tos = mktosingle(label, map); + +	printf( +		"Rune\n" +		"to%srune(Rune c)\n" +		"{\n" +		"	Rune *p;\n" +		"\n", +		label); + +	if(tor) +		printf( +			"	p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n" +			"	if(p && c >= p[0] && c <= p[1])\n" +			"		return c + p[2] - %d;\n", +			label, label, TO_OFFSET); + +	if(top) +		printf( +			"	p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n" +			"	if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" +			"		return c + p[2] - %d;\n", +			label, label, TO_OFFSET); + +	if(tos) +		printf( +			"	p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n" +			"	if(p && c == p[0])\n" +			"		return c + p[1] - %d;\n", +			label, label, TO_OFFSET); + + +	printf( +		"	return c;\n" +		"}\n" +		"\n" +	); +} + +// Make only range tables and a function for is<label>rune. +static void +mkisronly(const char* label, char* prop) { +	mkisrange(label, prop, 1); +	printf( +		"int\n" +		"is%srune(Rune c)\n" +		"{\n" +		"	Rune *p;\n" +		"\n" +		"	p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n" +		"	if(p && c >= p[0] && c <= p[1])\n" +		"		return 1;\n" +		"	return 0;\n" +		"}\n" +		"\n", +	        label, label, label); +} + +/* + * generate the body of runetype. + * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne); + */ +static void +mktables(char *src, int usepairs) +{ +	printf("/* generated automatically by mkrunetype.c from %s */\n\n", +		basename(src)); + +	/* +	 * we special case the space and digit tables, since they are assumed +	 * to be small with several ranges. +	 */ +	mkisronly("space", myisspace); +	mkisronly("digit", myisdigit); + +	mkis("alpha", myisalpha, 0); +	mkis("upper", myisupper, usepairs); +	mkis("lower", myislower, usepairs); +	mkis("title", myistitle, usepairs); + +	mkto("upper", mytoupper, usepairs); +	mkto("lower", mytolower, usepairs); +	mkto("title", mytotitle, usepairs); +} + +/* + * find differences between the newly generated tables and current runetypes. + */ +static void +check(void) +{ +	int i; + +	for(i = 0; i < NRUNES; i++){ +		if(isdigitrune(i) != myisdigit[i]) +			fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n", +				i, isdigitrune(i), myisdigit[i]); + +		if(isspacerune(i) != myisspace[i]) +			fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n", +				i, isspacerune(i), myisspace[i]); + +		if(isupperrune(i) != myisupper[i]) +			fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n", +				i, isupperrune(i), myisupper[i]); + +		if(islowerrune(i) != myislower[i]) +			fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n", +				i, islowerrune(i), myislower[i]); + +		if(isalpharune(i) != myisalpha[i]) +			fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n", +				i, isalpharune(i), myisalpha[i]); + +		if(toupperrune(i) != mytoupper[i]) +			fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n", +				i, toupperrune(i), mytoupper[i]); + +		if(tolowerrune(i) != mytolower[i]) +			fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n", +				i, tolowerrune(i), mytolower[i]); + +		if(istitlerune(i) != myistitle[i]) +			fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n", +				i, istitlerune(i), myistitle[i]); + +		if(totitlerune(i) != mytotitle[i]) +			fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n", +				i, totitlerune(i), mytotitle[i]); + + +	} +} + +static int +mygetfields(char **fields, int nfields, char *str, const char *delim) +{ +	int nf; + +	fields[0] = str; +	nf = 1; +	if(nf >= nfields) +		return nf; + +	for(; *str; str++){ +		if(strchr(delim, *str) != NULL){ +			*str = '\0'; +			fields[nf++] = str + 1; +			if(nf >= nfields) +				break; +		} +	} +	return nf; +} + +static int +getunicodeline(FILE *in, char **fields, char *buf) +{ +	char *p; + +	if(fgets(buf, MAX_LINE, in) == NULL) +		return 0; + +	p = strchr(buf, '\n'); +	if (p == NULL) +		fatal("line too long"); +	*p = '\0'; + +	if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS) +		fatal("bad number of fields"); + +	return 1; +} + +static int +getcode(char *s) +{ +	int i, code; + +	code = 0; +        i = 0; +        /* Parse a hex number */ +	while(s[i]) { +		code <<= 4; +		if(s[i] >= '0' && s[i] <= '9') +			code += s[i] - '0'; +		else if(s[i] >= 'A' && s[i] <= 'F') +			code += s[i] - 'A' + 10; +		else +			fatal("bad code char '%c'", s[i]); +                i++; +	} +	return code; +} + +static void +fatal(const char *fmt, ...) +{ +	va_list arg; + +	fprintf(stderr, "%s: fatal error: ", argv0); +	va_start(arg, fmt); +	vfprintf(stderr, fmt, arg); +	va_end(arg); +	fprintf(stderr, "\n"); + +	exit(1); +} diff --git a/src/lib9/utf/rune.c b/src/lib9/utf/rune.c index 3d6831b02..cf98bab15 100644 --- a/src/lib9/utf/rune.c +++ b/src/lib9/utf/rune.c @@ -1,20 +1,21 @@  /*   * The authors of this software are Rob Pike and Ken Thompson.   *              Copyright (c) 2002 by Lucent Technologies. + *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.   * Permission to use, copy, modify, and distribute this software for any   * purpose without fee is hereby granted, provided that this entire notice   * is included in all copies of any software which is or includes a copy   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h"  enum  { @@ -23,27 +24,150 @@ enum  	Bit2	= 5,  	Bit3	= 4,  	Bit4	= 3, +	Bit5	= 2,  	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */  	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */  	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */  	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */  	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */ +	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */  	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */  	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */  	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */ +	Rune4	= (1<<(Bit4+3*Bitx))-1, +                                        /* 0001 1111 1111 1111 1111 1111 */  	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */  	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */ -	Bad	= Runeerror +	Bad	= Runeerror,  }; +/* + * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 + * This is a slower but "safe" version of the old chartorune + * that works on strings that are not necessarily null-terminated. + * + * If you know for sure that your string is null-terminated, + * chartorune will be a bit faster. + * + * It is guaranteed not to attempt to access "length" + * past the incoming pointer.  This is to avoid + * possible access violations.  If the string appears to be + * well-formed but incomplete (i.e., to get the whole Rune + * we'd need to read past str+length) then we'll set the Rune + * to Bad and return 0. + * + * Note that if we have decoding problems for other + * reasons, we return 1 instead of 0. + */ +int +charntorune(Rune *rune, const char *str, int length) +{ +	int c, c1, c2, c3; +	long l; + +	/* When we're not allowed to read anything */ +	if(length <= 0) { +		goto badlen; +	} + +	/* +	 * one character sequence (7-bit value) +	 *	00000-0007F => T1 +	 */ +	c = *(uchar*)str; +	if(c < Tx) { +		*rune = c; +		return 1; +	} + +	// If we can't read more than one character we must stop +	if(length <= 1) { +		goto badlen; +	} + +	/* +	 * two character sequence (11-bit value) +	 *	0080-07FF => T2 Tx +	 */ +	c1 = *(uchar*)(str+1) ^ Tx; +	if(c1 & Testx) +		goto bad; +	if(c < T3) { +		if(c < T2) +			goto bad; +		l = ((c << Bitx) | c1) & Rune2; +		if(l <= Rune1) +			goto bad; +		*rune = l; +		return 2; +	} + +	// If we can't read more than two characters we must stop +	if(length <= 2) { +		goto badlen; +	} + +	/* +	 * three character sequence (16-bit value) +	 *	0800-FFFF => T3 Tx Tx +	 */ +	c2 = *(uchar*)(str+2) ^ Tx; +	if(c2 & Testx) +		goto bad; +	if(c < T4) { +		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; +		if(l <= Rune2) +			goto bad; +		*rune = l; +		return 3; +	} + +	if (length <= 3) +		goto badlen; + +	/* +	 * four character sequence (21-bit value) +	 *	10000-1FFFFF => T4 Tx Tx Tx +	 */ +	c3 = *(uchar*)(str+3) ^ Tx; +	if (c3 & Testx) +		goto bad; +	if (c < T5) { +		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; +		if (l <= Rune3) +			goto bad; +		*rune = l; +		return 4; +	} + +	// Support for 5-byte or longer UTF-8 would go here, but +	// since we don't have that, we'll just fall through to bad. + +	/* +	 * bad decoding +	 */ +bad: +	*rune = Bad; +	return 1; +badlen: +	*rune = Bad; +	return 0; + +} + + +/* + * This is the older "unsafe" version, which works fine on + * null-terminated strings. + */  int -chartorune(Rune *rune, char *str) +chartorune(Rune *rune, const char *str)  { -	int c, c1, c2; +	int c, c1, c2, c3;  	long l;  	/* @@ -89,6 +213,26 @@ chartorune(Rune *rune, char *str)  	}  	/* +	 * four character sequence (21-bit value) +	 *	10000-1FFFFF => T4 Tx Tx Tx +	 */ +	c3 = *(uchar*)(str+3) ^ Tx; +	if (c3 & Testx) +		goto bad; +	if (c < T5) { +		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; +		if (l <= Rune3) +			goto bad; +		*rune = l; +		return 4; +	} + +	/* +	 * Support for 5-byte or longer UTF-8 would go here, but +	 * since we don't have that, we'll just fall through to bad. +	 */ + +	/*  	 * bad decoding  	 */  bad: @@ -97,9 +241,16 @@ bad:  }  int -runetochar(char *str, Rune *rune) +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { +	*consumed = charntorune(rune, str, length); +	return *rune != Runeerror || *consumed == 3; +} + +int +runetochar(char *str, const Rune *rune)  { -	long c; +	/* Runes are signed, so convert to unsigned for range check. */ +	unsigned long c;  	/*  	 * one character sequence @@ -122,56 +273,79 @@ runetochar(char *str, Rune *rune)  	}  	/* +	 * If the Rune is out of range, convert it to the error rune. +	 * Do this test here because the error rune encodes to three bytes. +	 * Doing it earlier would duplicate work, since an out of range +	 * Rune wouldn't have fit in one or two bytes. +	 */ +	if (c > Runemax) +		c = Runeerror; + +	/*  	 * three character sequence  	 *	0800-FFFF => T3 Tx Tx  	 */ -	str[0] = T3 |  (c >> 2*Bitx); -	str[1] = Tx | ((c >> 1*Bitx) & Maskx); -	str[2] = Tx |  (c & Maskx); -	return 3; +	if (c <= Rune3) { +		str[0] = T3 |  (c >> 2*Bitx); +		str[1] = Tx | ((c >> 1*Bitx) & Maskx); +		str[2] = Tx |  (c & Maskx); +		return 3; +	} + +	/* +	 * four character sequence (21-bit value) +	 *     10000-1FFFFF => T4 Tx Tx Tx +	 */ +	str[0] = T4 | (c >> 3*Bitx); +	str[1] = Tx | ((c >> 2*Bitx) & Maskx); +	str[2] = Tx | ((c >> 1*Bitx) & Maskx); +	str[3] = Tx | (c & Maskx); +	return 4;  }  int -runelen(long c) +runelen(Rune rune)  { -	Rune rune;  	char str[10]; -	rune = c;  	return runetochar(str, &rune);  }  int -runenlen(Rune *r, int nrune) +runenlen(const Rune *r, int nrune)  {  	int nb, c;  	nb = 0;  	while(nrune--) {  		c = *r++; -		if(c <= Rune1) +		if (c <= Rune1)  			nb++; -		else -		if(c <= Rune2) +		else if (c <= Rune2)  			nb += 2; -		else +		else if (c <= Rune3)  			nb += 3; +		else /* assert(c <= Rune4) */ +			nb += 4;  	}  	return nb;  }  int -fullrune(char *str, int n) +fullrune(const char *str, int n)  { -	int c; - -	if(n > 0) { -		c = *(uchar*)str; -		if(c < Tx) +	if (n > 0) { +		int c = *(uchar*)str; +		if (c < Tx)  			return 1; -		if(n > 1) -			if(c < T3 || n > 2) +		if (n > 1) { +			if (c < T3)  				return 1; +			if (n > 2) { +				if (c < T4 || n > 3) +					return 1; +			} +		}  	}  	return 0;  } diff --git a/src/lib9/utf/runetype.c b/src/lib9/utf/runetype.c index ac6d7b576..ff2dbb539 100644 --- a/src/lib9/utf/runetype.c +++ b/src/lib9/utf/runetype.c @@ -7,1037 +7,22 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */ -#include <stdarg.h> -#include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h" -/* - * alpha ranges - - *	only covers ranges not in lower||upper - */ -static -Rune	__alpha2[] = -{ -	0x00d8,	0x00f6,	/* Ø - ö */ -	0x00f8,	0x01f5,	/* ø - ǵ */ -	0x0250,	0x02a8,	/* ɐ - ʨ */ -	0x038e,	0x03a1,	/* Ύ - Ρ */ -	0x03a3,	0x03ce,	/* Σ - ώ */ -	0x03d0,	0x03d6,	/* ϐ - ϖ */ -	0x03e2,	0x03f3,	/* Ϣ - ϳ */ -	0x0490,	0x04c4,	/* Ґ - ӄ */ -	0x0561,	0x0587,	/* ա - և */ -	0x05d0,	0x05ea,	/* א - ת */ -	0x05f0,	0x05f2,	/* װ - ײ */ -	0x0621,	0x063a,	/* ء - غ */ -	0x0640,	0x064a,	/* ـ - ي */ -	0x0671,	0x06b7,	/* ٱ - ڷ */ -	0x06ba,	0x06be,	/* ں - ھ */ -	0x06c0,	0x06ce,	/* ۀ - ێ */ -	0x06d0,	0x06d3,	/* ې - ۓ */ -	0x0905,	0x0939,	/* अ - ह */ -	0x0958,	0x0961,	/* क़ - ॡ */ -	0x0985,	0x098c,	/* অ - ঌ */ -	0x098f,	0x0990,	/* এ - ঐ */ -	0x0993,	0x09a8,	/* ও - ন */ -	0x09aa,	0x09b0,	/* প - র */ -	0x09b6,	0x09b9,	/* শ - হ */ -	0x09dc,	0x09dd,	/* ড় - ঢ় */ -	0x09df,	0x09e1,	/* য় - ৡ */ -	0x09f0,	0x09f1,	/* ৰ - ৱ */ -	0x0a05,	0x0a0a,	/* ਅ - ਊ */ -	0x0a0f,	0x0a10,	/* ਏ - ਐ */ -	0x0a13,	0x0a28,	/* ਓ - ਨ */ -	0x0a2a,	0x0a30,	/* ਪ - ਰ */ -	0x0a32,	0x0a33,	/* ਲ - ਲ਼ */ -	0x0a35,	0x0a36,	/* ਵ - ਸ਼ */ -	0x0a38,	0x0a39,	/* ਸ - ਹ */ -	0x0a59,	0x0a5c,	/* ਖ਼ - ੜ */ -	0x0a85,	0x0a8b,	/* અ - ઋ */ -	0x0a8f,	0x0a91,	/* એ - ઑ */ -	0x0a93,	0x0aa8,	/* ઓ - ન */ -	0x0aaa,	0x0ab0,	/* પ - ર */ -	0x0ab2,	0x0ab3,	/* લ - ળ */ -	0x0ab5,	0x0ab9,	/* વ - હ */ -	0x0b05,	0x0b0c,	/* ଅ - ଌ */ -	0x0b0f,	0x0b10,	/* ଏ - ଐ */ -	0x0b13,	0x0b28,	/* ଓ - ନ */ -	0x0b2a,	0x0b30,	/* ପ - ର */ -	0x0b32,	0x0b33,	/* ଲ - ଳ */ -	0x0b36,	0x0b39,	/* ଶ - ହ */ -	0x0b5c,	0x0b5d,	/* ଡ଼ - ଢ଼ */ -	0x0b5f,	0x0b61,	/* ୟ - ୡ */ -	0x0b85,	0x0b8a,	/* அ - ஊ */ -	0x0b8e,	0x0b90,	/* எ - ஐ */ -	0x0b92,	0x0b95,	/* ஒ - க */ -	0x0b99,	0x0b9a,	/* ங - ச */ -	0x0b9e,	0x0b9f,	/* ஞ - ட */ -	0x0ba3,	0x0ba4,	/* ண - த */ -	0x0ba8,	0x0baa,	/* ந - ப */ -	0x0bae,	0x0bb5,	/* ம - வ */ -	0x0bb7,	0x0bb9,	/* ஷ - ஹ */ -	0x0c05,	0x0c0c,	/* అ - ఌ */ -	0x0c0e,	0x0c10,	/* ఎ - ఐ */ -	0x0c12,	0x0c28,	/* ఒ - న */ -	0x0c2a,	0x0c33,	/* ప - ళ */ -	0x0c35,	0x0c39,	/* వ - హ */ -	0x0c60,	0x0c61,	/* ౠ - ౡ */ -	0x0c85,	0x0c8c,	/* ಅ - ಌ */ -	0x0c8e,	0x0c90,	/* ಎ - ಐ */ -	0x0c92,	0x0ca8,	/* ಒ - ನ */ -	0x0caa,	0x0cb3,	/* ಪ - ಳ */ -	0x0cb5,	0x0cb9,	/* ವ - ಹ */ -	0x0ce0,	0x0ce1,	/* ೠ - ೡ */ -	0x0d05,	0x0d0c,	/* അ - ഌ */ -	0x0d0e,	0x0d10,	/* എ - ഐ */ -	0x0d12,	0x0d28,	/* ഒ - ന */ -	0x0d2a,	0x0d39,	/* പ - ഹ */ -	0x0d60,	0x0d61,	/* ൠ - ൡ */ -	0x0e01,	0x0e30,	/* ก - ะ */ -	0x0e32,	0x0e33,	/* า - ำ */ -	0x0e40,	0x0e46,	/* เ - ๆ */ -	0x0e5a,	0x0e5b,	/* ๚ - ๛ */ -	0x0e81,	0x0e82,	/* ກ - ຂ */ -	0x0e87,	0x0e88,	/* ງ - ຈ */ -	0x0e94,	0x0e97,	/* ດ - ທ */ -	0x0e99,	0x0e9f,	/* ນ - ຟ */ -	0x0ea1,	0x0ea3,	/* ມ - ຣ */ -	0x0eaa,	0x0eab,	/* ສ - ຫ */ -	0x0ead,	0x0eae,	/* ອ - ຮ */ -	0x0eb2,	0x0eb3,	/* າ - ຳ */ -	0x0ec0,	0x0ec4,	/* ເ - ໄ */ -	0x0edc,	0x0edd,	/* ໜ - ໝ */ -	0x0f18,	0x0f19,	/* ༘ - ༙ */ -	0x0f40,	0x0f47,	/* ཀ - ཇ */ -	0x0f49,	0x0f69,	/* ཉ - ཀྵ */ -	0x10d0,	0x10f6,	/* ა - ჶ */ -	0x1100,	0x1159,	/* ᄀ - ᅙ */ -	0x115f,	0x11a2,	/* ᅟ - ᆢ */ -	0x11a8,	0x11f9,	/* ᆨ - ᇹ */ -	0x1e00,	0x1e9b,	/* Ḁ - ẛ */ -	0x1f50,	0x1f57,	/* ὐ - ὗ */ -	0x1f80,	0x1fb4,	/* ᾀ - ᾴ */ -	0x1fb6,	0x1fbc,	/* ᾶ - ᾼ */ -	0x1fc2,	0x1fc4,	/* ῂ - ῄ */ -	0x1fc6,	0x1fcc,	/* ῆ - ῌ */ -	0x1fd0,	0x1fd3,	/* ῐ - ΐ */ -	0x1fd6,	0x1fdb,	/* ῖ - Ί */ -	0x1fe0,	0x1fec,	/* ῠ - Ῥ */ -	0x1ff2,	0x1ff4,	/* ῲ - ῴ */ -	0x1ff6,	0x1ffc,	/* ῶ - ῼ */ -	0x210a,	0x2113,	/* ℊ - ℓ */ -	0x2115,	0x211d,	/* ℕ - ℝ */ -	0x2120,	0x2122,	/* ℠ - ™ */ -	0x212a,	0x2131,	/* K - ℱ */ -	0x2133,	0x2138,	/* ℳ - ℸ */ -	0x3041,	0x3094,	/* ぁ - ゔ */ -	0x30a1,	0x30fa,	/* ァ - ヺ */ -	0x3105,	0x312c,	/* ㄅ - ㄬ */ -	0x3131,	0x318e,	/* ㄱ - ㆎ */ -	0x3192,	0x319f,	/* ㆒ - ㆟ */ -	0x3260,	0x327b,	/* ㉠ - ㉻ */ -	0x328a,	0x32b0,	/* ㊊ - ㊰ */ -	0x32d0,	0x32fe,	/* ㋐ - ㋾ */ -	0x3300,	0x3357,	/* ㌀ - ㍗ */ -	0x3371,	0x3376,	/* ㍱ - ㍶ */ -	0x337b,	0x3394,	/* ㍻ - ㎔ */ -	0x3399,	0x339e,	/* ㎙ - ㎞ */ -	0x33a9,	0x33ad,	/* ㎩ - ㎭ */ -	0x33b0,	0x33c1,	/* ㎰ - ㏁ */ -	0x33c3,	0x33c5,	/* ㏃ - ㏅ */ -	0x33c7,	0x33d7,	/* ㏇ - ㏗ */ -	0x33d9,	0x33dd,	/* ㏙ - ㏝ */ -	0x4e00,	0x9fff,	/* 一 - 鿿 */ -	0xac00,	0xd7a3,	/* 가 - 힣 */ -	0xf900,	0xfb06,	/* 豈 - st */ -	0xfb13,	0xfb17,	/* ﬓ - ﬗ */ -	0xfb1f,	0xfb28,	/* ײַ - ﬨ */ -	0xfb2a,	0xfb36,	/* שׁ - זּ */ -	0xfb38,	0xfb3c,	/* טּ - לּ */ -	0xfb40,	0xfb41,	/* נּ - סּ */ -	0xfb43,	0xfb44,	/* ףּ - פּ */ -	0xfb46,	0xfbb1,	/* צּ - ﮱ */ -	0xfbd3,	0xfd3d,	/* ﯓ - ﴽ */ -	0xfd50,	0xfd8f,	/* ﵐ - ﶏ */ -	0xfd92,	0xfdc7,	/* ﶒ - ﷇ */ -	0xfdf0,	0xfdf9,	/* ﷰ - ﷹ */ -	0xfe70,	0xfe72,	/* ﹰ - ﹲ */ -	0xfe76,	0xfefc,	/* ﹶ - ﻼ */ -	0xff66,	0xff6f,	/* ヲ - ッ */ -	0xff71,	0xff9d,	/* ア - ン */ -	0xffa0,	0xffbe,	/* ᅠ - ᄒ */ -	0xffc2,	0xffc7,	/* ᅡ - ᅦ */ -	0xffca,	0xffcf,	/* ᅧ - ᅬ */ -	0xffd2,	0xffd7,	/* ᅭ - ᅲ */ -	0xffda,	0xffdc,	/* ᅳ - ᅵ */ -}; - -/* - * alpha singlets - - *	only covers ranges not in lower||upper - */ -static -Rune	__alpha1[] = -{ -	0x00aa,	/* ª */ -	0x00b5,	/* µ */ -	0x00ba,	/* º */ -	0x03da,	/* Ϛ */ -	0x03dc,	/* Ϝ */ -	0x03de,	/* Ϟ */ -	0x03e0,	/* Ϡ */ -	0x06d5,	/* ە */ -	0x09b2,	/* ল */ -	0x0a5e,	/* ਫ਼ */ -	0x0a8d,	/* ઍ */ -	0x0ae0,	/* ૠ */ -	0x0b9c,	/* ஜ */ -	0x0cde,	/* ೞ */ -	0x0e4f,	/* ๏ */ -	0x0e84,	/* ຄ */ -	0x0e8a,	/* ຊ */ -	0x0e8d,	/* ຍ */ -	0x0ea5,	/* ລ */ -	0x0ea7,	/* ວ */ -	0x0eb0,	/* ະ */ -	0x0ebd,	/* ຽ */ -	0x1fbe,	/* ι */ -	0x207f,	/* ⁿ */ -	0x20a8,	/* ₨ */ -	0x2102,	/* ℂ */ -	0x2107,	/* ℇ */ -	0x2124,	/* ℤ */ -	0x2126,	/* Ω */ -	0x2128,	/* ℨ */ -	0xfb3e,	/* מּ */ -	0xfe74,	/* ﹴ */ -}; - -/* - * space ranges - */ -static -Rune	__space2[] = -{ -	0x0009,	0x000a,	/* tab and newline */ -	0x0020,	0x0020,	/* space */ -	0x00a0,	0x00a0,	/*   */ -	0x2000,	0x200b,	/*   -  */ -	0x2028,	0x2029,	/* 
 - 
 */ -	0x3000,	0x3000,	/*   */ -	0xfeff,	0xfeff,	/*  */ -}; - -/* - * lower case ranges - *	3rd col is conversion excess 500 - */ -static -Rune	__toupper2[] = -{ -	0x0061,	0x007a, 468,	/* a-z A-Z */ -	0x00e0,	0x00f6, 468,	/* à-ö À-Ö */ -	0x00f8,	0x00fe, 468,	/* ø-þ Ø-Þ */ -	0x0256,	0x0257, 295,	/* ɖ-ɗ Ɖ-Ɗ */ -	0x0258,	0x0259, 298,	/* ɘ-ə Ǝ-Ə */ -	0x028a,	0x028b, 283,	/* ʊ-ʋ Ʊ-Ʋ */ -	0x03ad,	0x03af, 463,	/* έ-ί Έ-Ί */ -	0x03b1,	0x03c1, 468,	/* α-ρ Α-Ρ */ -	0x03c3,	0x03cb, 468,	/* σ-ϋ Σ-Ϋ */ -	0x03cd,	0x03ce, 437,	/* ύ-ώ Ύ-Ώ */ -	0x0430,	0x044f, 468,	/* а-я А-Я */ -	0x0451,	0x045c, 420,	/* ё-ќ Ё-Ќ */ -	0x045e,	0x045f, 420,	/* ў-џ Ў-Џ */ -	0x0561,	0x0586, 452,	/* ա-ֆ Ա-Ֆ */ -	0x1f00,	0x1f07, 508,	/* ἀ-ἇ Ἀ-Ἇ */ -	0x1f10,	0x1f15, 508,	/* ἐ-ἕ Ἐ-Ἕ */ -	0x1f20,	0x1f27, 508,	/* ἠ-ἧ Ἠ-Ἧ */ -	0x1f30,	0x1f37, 508,	/* ἰ-ἷ Ἰ-Ἷ */ -	0x1f40,	0x1f45, 508,	/* ὀ-ὅ Ὀ-Ὅ */ -	0x1f60,	0x1f67, 508,	/* ὠ-ὧ Ὠ-Ὧ */ -	0x1f70,	0x1f71, 574,	/* ὰ-ά Ὰ-Ά */ -	0x1f72,	0x1f75, 586,	/* ὲ-ή Ὲ-Ή */ -	0x1f76,	0x1f77, 600,	/* ὶ-ί Ὶ-Ί */ -	0x1f78,	0x1f79, 628,	/* ὸ-ό Ὸ-Ό */ -	0x1f7a,	0x1f7b, 612,	/* ὺ-ύ Ὺ-Ύ */ -	0x1f7c,	0x1f7d, 626,	/* ὼ-ώ Ὼ-Ώ */ -	0x1f80,	0x1f87, 508,	/* ᾀ-ᾇ ᾈ-ᾏ */ -	0x1f90,	0x1f97, 508,	/* ᾐ-ᾗ ᾘ-ᾟ */ -	0x1fa0,	0x1fa7, 508,	/* ᾠ-ᾧ ᾨ-ᾯ */ -	0x1fb0,	0x1fb1, 508,	/* ᾰ-ᾱ Ᾰ-Ᾱ */ -	0x1fd0,	0x1fd1, 508,	/* ῐ-ῑ Ῐ-Ῑ */ -	0x1fe0,	0x1fe1, 508,	/* ῠ-ῡ Ῠ-Ῡ */ -	0x2170,	0x217f, 484,	/* ⅰ-ⅿ Ⅰ-Ⅿ */ -	0x24d0,	0x24e9, 474,	/* ⓐ-ⓩ Ⓐ-Ⓩ */ -	0xff41,	0xff5a, 468,	/* a-z A-Z */ -}; - -/* - * lower case singlets - *	2nd col is conversion excess 500 - */ -static -Rune	__toupper1[] = -{ -	0x00ff, 621,	/* ÿ Ÿ */ -	0x0101, 499,	/* ā Ā */ -	0x0103, 499,	/* ă Ă */ -	0x0105, 499,	/* ą Ą */ -	0x0107, 499,	/* ć Ć */ -	0x0109, 499,	/* ĉ Ĉ */ -	0x010b, 499,	/* ċ Ċ */ -	0x010d, 499,	/* č Č */ -	0x010f, 499,	/* ď Ď */ -	0x0111, 499,	/* đ Đ */ -	0x0113, 499,	/* ē Ē */ -	0x0115, 499,	/* ĕ Ĕ */ -	0x0117, 499,	/* ė Ė */ -	0x0119, 499,	/* ę Ę */ -	0x011b, 499,	/* ě Ě */ -	0x011d, 499,	/* ĝ Ĝ */ -	0x011f, 499,	/* ğ Ğ */ -	0x0121, 499,	/* ġ Ġ */ -	0x0123, 499,	/* ģ Ģ */ -	0x0125, 499,	/* ĥ Ĥ */ -	0x0127, 499,	/* ħ Ħ */ -	0x0129, 499,	/* ĩ Ĩ */ -	0x012b, 499,	/* ī Ī */ -	0x012d, 499,	/* ĭ Ĭ */ -	0x012f, 499,	/* į Į */ -	0x0131, 268,	/* ı I */ -	0x0133, 499,	/* ij IJ */ -	0x0135, 499,	/* ĵ Ĵ */ -	0x0137, 499,	/* ķ Ķ */ -	0x013a, 499,	/* ĺ Ĺ */ -	0x013c, 499,	/* ļ Ļ */ -	0x013e, 499,	/* ľ Ľ */ -	0x0140, 499,	/* ŀ Ŀ */ -	0x0142, 499,	/* ł Ł */ -	0x0144, 499,	/* ń Ń */ -	0x0146, 499,	/* ņ Ņ */ -	0x0148, 499,	/* ň Ň */ -	0x014b, 499,	/* ŋ Ŋ */ -	0x014d, 499,	/* ō Ō */ -	0x014f, 499,	/* ŏ Ŏ */ -	0x0151, 499,	/* ő Ő */ -	0x0153, 499,	/* œ Œ */ -	0x0155, 499,	/* ŕ Ŕ */ -	0x0157, 499,	/* ŗ Ŗ */ -	0x0159, 499,	/* ř Ř */ -	0x015b, 499,	/* ś Ś */ -	0x015d, 499,	/* ŝ Ŝ */ -	0x015f, 499,	/* ş Ş */ -	0x0161, 499,	/* š Š */ -	0x0163, 499,	/* ţ Ţ */ -	0x0165, 499,	/* ť Ť */ -	0x0167, 499,	/* ŧ Ŧ */ -	0x0169, 499,	/* ũ Ũ */ -	0x016b, 499,	/* ū Ū */ -	0x016d, 499,	/* ŭ Ŭ */ -	0x016f, 499,	/* ů Ů */ -	0x0171, 499,	/* ű Ű */ -	0x0173, 499,	/* ų Ų */ -	0x0175, 499,	/* ŵ Ŵ */ -	0x0177, 499,	/* ŷ Ŷ */ -	0x017a, 499,	/* ź Ź */ -	0x017c, 499,	/* ż Ż */ -	0x017e, 499,	/* ž Ž */ -	0x017f, 200,	/* ſ S */ -	0x0183, 499,	/* ƃ Ƃ */ -	0x0185, 499,	/* ƅ Ƅ */ -	0x0188, 499,	/* ƈ Ƈ */ -	0x018c, 499,	/* ƌ Ƌ */ -	0x0192, 499,	/* ƒ Ƒ */ -	0x0199, 499,	/* ƙ Ƙ */ -	0x01a1, 499,	/* ơ Ơ */ -	0x01a3, 499,	/* ƣ Ƣ */ -	0x01a5, 499,	/* ƥ Ƥ */ -	0x01a8, 499,	/* ƨ Ƨ */ -	0x01ad, 499,	/* ƭ Ƭ */ -	0x01b0, 499,	/* ư Ư */ -	0x01b4, 499,	/* ƴ Ƴ */ -	0x01b6, 499,	/* ƶ Ƶ */ -	0x01b9, 499,	/* ƹ Ƹ */ -	0x01bd, 499,	/* ƽ Ƽ */ -	0x01c5, 499,	/* Dž DŽ */ -	0x01c6, 498,	/* dž DŽ */ -	0x01c8, 499,	/* Lj LJ */ -	0x01c9, 498,	/* lj LJ */ -	0x01cb, 499,	/* Nj NJ */ -	0x01cc, 498,	/* nj NJ */ -	0x01ce, 499,	/* ǎ Ǎ */ -	0x01d0, 499,	/* ǐ Ǐ */ -	0x01d2, 499,	/* ǒ Ǒ */ -	0x01d4, 499,	/* ǔ Ǔ */ -	0x01d6, 499,	/* ǖ Ǖ */ -	0x01d8, 499,	/* ǘ Ǘ */ -	0x01da, 499,	/* ǚ Ǚ */ -	0x01dc, 499,	/* ǜ Ǜ */ -	0x01df, 499,	/* ǟ Ǟ */ -	0x01e1, 499,	/* ǡ Ǡ */ -	0x01e3, 499,	/* ǣ Ǣ */ -	0x01e5, 499,	/* ǥ Ǥ */ -	0x01e7, 499,	/* ǧ Ǧ */ -	0x01e9, 499,	/* ǩ Ǩ */ -	0x01eb, 499,	/* ǫ Ǫ */ -	0x01ed, 499,	/* ǭ Ǭ */ -	0x01ef, 499,	/* ǯ Ǯ */ -	0x01f2, 499,	/* Dz DZ */ -	0x01f3, 498,	/* dz DZ */ -	0x01f5, 499,	/* ǵ Ǵ */ -	0x01fb, 499,	/* ǻ Ǻ */ -	0x01fd, 499,	/* ǽ Ǽ */ -	0x01ff, 499,	/* ǿ Ǿ */ -	0x0201, 499,	/* ȁ Ȁ */ -	0x0203, 499,	/* ȃ Ȃ */ -	0x0205, 499,	/* ȅ Ȅ */ -	0x0207, 499,	/* ȇ Ȇ */ -	0x0209, 499,	/* ȉ Ȉ */ -	0x020b, 499,	/* ȋ Ȋ */ -	0x020d, 499,	/* ȍ Ȍ */ -	0x020f, 499,	/* ȏ Ȏ */ -	0x0211, 499,	/* ȑ Ȑ */ -	0x0213, 499,	/* ȓ Ȓ */ -	0x0215, 499,	/* ȕ Ȕ */ -	0x0217, 499,	/* ȗ Ȗ */ -	0x0253, 290,	/* ɓ Ɓ */ -	0x0254, 294,	/* ɔ Ɔ */ -	0x025b, 297,	/* ɛ Ɛ */ -	0x0260, 295,	/* ɠ Ɠ */ -	0x0263, 293,	/* ɣ Ɣ */ -	0x0268, 291,	/* ɨ Ɨ */ -	0x0269, 289,	/* ɩ Ɩ */ -	0x026f, 289,	/* ɯ Ɯ */ -	0x0272, 287,	/* ɲ Ɲ */ -	0x0283, 282,	/* ʃ Ʃ */ -	0x0288, 282,	/* ʈ Ʈ */ -	0x0292, 281,	/* ʒ Ʒ */ -	0x03ac, 462,	/* ά Ά */ -	0x03cc, 436,	/* ό Ό */ -	0x03d0, 438,	/* ϐ Β */ -	0x03d1, 443,	/* ϑ Θ */ -	0x03d5, 453,	/* ϕ Φ */ -	0x03d6, 446,	/* ϖ Π */ -	0x03e3, 499,	/* ϣ Ϣ */ -	0x03e5, 499,	/* ϥ Ϥ */ -	0x03e7, 499,	/* ϧ Ϧ */ -	0x03e9, 499,	/* ϩ Ϩ */ -	0x03eb, 499,	/* ϫ Ϫ */ -	0x03ed, 499,	/* ϭ Ϭ */ -	0x03ef, 499,	/* ϯ Ϯ */ -	0x03f0, 414,	/* ϰ Κ */ -	0x03f1, 420,	/* ϱ Ρ */ -	0x0461, 499,	/* ѡ Ѡ */ -	0x0463, 499,	/* ѣ Ѣ */ -	0x0465, 499,	/* ѥ Ѥ */ -	0x0467, 499,	/* ѧ Ѧ */ -	0x0469, 499,	/* ѩ Ѩ */ -	0x046b, 499,	/* ѫ Ѫ */ -	0x046d, 499,	/* ѭ Ѭ */ -	0x046f, 499,	/* ѯ Ѯ */ -	0x0471, 499,	/* ѱ Ѱ */ -	0x0473, 499,	/* ѳ Ѳ */ -	0x0475, 499,	/* ѵ Ѵ */ -	0x0477, 499,	/* ѷ Ѷ */ -	0x0479, 499,	/* ѹ Ѹ */ -	0x047b, 499,	/* ѻ Ѻ */ -	0x047d, 499,	/* ѽ Ѽ */ -	0x047f, 499,	/* ѿ Ѿ */ -	0x0481, 499,	/* ҁ Ҁ */ -	0x0491, 499,	/* ґ Ґ */ -	0x0493, 499,	/* ғ Ғ */ -	0x0495, 499,	/* ҕ Ҕ */ -	0x0497, 499,	/* җ Җ */ -	0x0499, 499,	/* ҙ Ҙ */ -	0x049b, 499,	/* қ Қ */ -	0x049d, 499,	/* ҝ Ҝ */ -	0x049f, 499,	/* ҟ Ҟ */ -	0x04a1, 499,	/* ҡ Ҡ */ -	0x04a3, 499,	/* ң Ң */ -	0x04a5, 499,	/* ҥ Ҥ */ -	0x04a7, 499,	/* ҧ Ҧ */ -	0x04a9, 499,	/* ҩ Ҩ */ -	0x04ab, 499,	/* ҫ Ҫ */ -	0x04ad, 499,	/* ҭ Ҭ */ -	0x04af, 499,	/* ү Ү */ -	0x04b1, 499,	/* ұ Ұ */ -	0x04b3, 499,	/* ҳ Ҳ */ -	0x04b5, 499,	/* ҵ Ҵ */ -	0x04b7, 499,	/* ҷ Ҷ */ -	0x04b9, 499,	/* ҹ Ҹ */ -	0x04bb, 499,	/* һ Һ */ -	0x04bd, 499,	/* ҽ Ҽ */ -	0x04bf, 499,	/* ҿ Ҿ */ -	0x04c2, 499,	/* ӂ Ӂ */ -	0x04c4, 499,	/* ӄ Ӄ */ -	0x04c8, 499,	/* ӈ Ӈ */ -	0x04cc, 499,	/* ӌ Ӌ */ -	0x04d1, 499,	/* ӑ Ӑ */ -	0x04d3, 499,	/* ӓ Ӓ */ -	0x04d5, 499,	/* ӕ Ӕ */ -	0x04d7, 499,	/* ӗ Ӗ */ -	0x04d9, 499,	/* ә Ә */ -	0x04db, 499,	/* ӛ Ӛ */ -	0x04dd, 499,	/* ӝ Ӝ */ -	0x04df, 499,	/* ӟ Ӟ */ -	0x04e1, 499,	/* ӡ Ӡ */ -	0x04e3, 499,	/* ӣ Ӣ */ -	0x04e5, 499,	/* ӥ Ӥ */ -	0x04e7, 499,	/* ӧ Ӧ */ -	0x04e9, 499,	/* ө Ө */ -	0x04eb, 499,	/* ӫ Ӫ */ -	0x04ef, 499,	/* ӯ Ӯ */ -	0x04f1, 499,	/* ӱ Ӱ */ -	0x04f3, 499,	/* ӳ Ӳ */ -	0x04f5, 499,	/* ӵ Ӵ */ -	0x04f9, 499,	/* ӹ Ӹ */ -	0x1e01, 499,	/* ḁ Ḁ */ -	0x1e03, 499,	/* ḃ Ḃ */ -	0x1e05, 499,	/* ḅ Ḅ */ -	0x1e07, 499,	/* ḇ Ḇ */ -	0x1e09, 499,	/* ḉ Ḉ */ -	0x1e0b, 499,	/* ḋ Ḋ */ -	0x1e0d, 499,	/* ḍ Ḍ */ -	0x1e0f, 499,	/* ḏ Ḏ */ -	0x1e11, 499,	/* ḑ Ḑ */ -	0x1e13, 499,	/* ḓ Ḓ */ -	0x1e15, 499,	/* ḕ Ḕ */ -	0x1e17, 499,	/* ḗ Ḗ */ -	0x1e19, 499,	/* ḙ Ḙ */ -	0x1e1b, 499,	/* ḛ Ḛ */ -	0x1e1d, 499,	/* ḝ Ḝ */ -	0x1e1f, 499,	/* ḟ Ḟ */ -	0x1e21, 499,	/* ḡ Ḡ */ -	0x1e23, 499,	/* ḣ Ḣ */ -	0x1e25, 499,	/* ḥ Ḥ */ -	0x1e27, 499,	/* ḧ Ḧ */ -	0x1e29, 499,	/* ḩ Ḩ */ -	0x1e2b, 499,	/* ḫ Ḫ */ -	0x1e2d, 499,	/* ḭ Ḭ */ -	0x1e2f, 499,	/* ḯ Ḯ */ -	0x1e31, 499,	/* ḱ Ḱ */ -	0x1e33, 499,	/* ḳ Ḳ */ -	0x1e35, 499,	/* ḵ Ḵ */ -	0x1e37, 499,	/* ḷ Ḷ */ -	0x1e39, 499,	/* ḹ Ḹ */ -	0x1e3b, 499,	/* ḻ Ḻ */ -	0x1e3d, 499,	/* ḽ Ḽ */ -	0x1e3f, 499,	/* ḿ Ḿ */ -	0x1e41, 499,	/* ṁ Ṁ */ -	0x1e43, 499,	/* ṃ Ṃ */ -	0x1e45, 499,	/* ṅ Ṅ */ -	0x1e47, 499,	/* ṇ Ṇ */ -	0x1e49, 499,	/* ṉ Ṉ */ -	0x1e4b, 499,	/* ṋ Ṋ */ -	0x1e4d, 499,	/* ṍ Ṍ */ -	0x1e4f, 499,	/* ṏ Ṏ */ -	0x1e51, 499,	/* ṑ Ṑ */ -	0x1e53, 499,	/* ṓ Ṓ */ -	0x1e55, 499,	/* ṕ Ṕ */ -	0x1e57, 499,	/* ṗ Ṗ */ -	0x1e59, 499,	/* ṙ Ṙ */ -	0x1e5b, 499,	/* ṛ Ṛ */ -	0x1e5d, 499,	/* ṝ Ṝ */ -	0x1e5f, 499,	/* ṟ Ṟ */ -	0x1e61, 499,	/* ṡ Ṡ */ -	0x1e63, 499,	/* ṣ Ṣ */ -	0x1e65, 499,	/* ṥ Ṥ */ -	0x1e67, 499,	/* ṧ Ṧ */ -	0x1e69, 499,	/* ṩ Ṩ */ -	0x1e6b, 499,	/* ṫ Ṫ */ -	0x1e6d, 499,	/* ṭ Ṭ */ -	0x1e6f, 499,	/* ṯ Ṯ */ -	0x1e71, 499,	/* ṱ Ṱ */ -	0x1e73, 499,	/* ṳ Ṳ */ -	0x1e75, 499,	/* ṵ Ṵ */ -	0x1e77, 499,	/* ṷ Ṷ */ -	0x1e79, 499,	/* ṹ Ṹ */ -	0x1e7b, 499,	/* ṻ Ṻ */ -	0x1e7d, 499,	/* ṽ Ṽ */ -	0x1e7f, 499,	/* ṿ Ṿ */ -	0x1e81, 499,	/* ẁ Ẁ */ -	0x1e83, 499,	/* ẃ Ẃ */ -	0x1e85, 499,	/* ẅ Ẅ */ -	0x1e87, 499,	/* ẇ Ẇ */ -	0x1e89, 499,	/* ẉ Ẉ */ -	0x1e8b, 499,	/* ẋ Ẋ */ -	0x1e8d, 499,	/* ẍ Ẍ */ -	0x1e8f, 499,	/* ẏ Ẏ */ -	0x1e91, 499,	/* ẑ Ẑ */ -	0x1e93, 499,	/* ẓ Ẓ */ -	0x1e95, 499,	/* ẕ Ẕ */ -	0x1ea1, 499,	/* ạ Ạ */ -	0x1ea3, 499,	/* ả Ả */ -	0x1ea5, 499,	/* ấ Ấ */ -	0x1ea7, 499,	/* ầ Ầ */ -	0x1ea9, 499,	/* ẩ Ẩ */ -	0x1eab, 499,	/* ẫ Ẫ */ -	0x1ead, 499,	/* ậ Ậ */ -	0x1eaf, 499,	/* ắ Ắ */ -	0x1eb1, 499,	/* ằ Ằ */ -	0x1eb3, 499,	/* ẳ Ẳ */ -	0x1eb5, 499,	/* ẵ Ẵ */ -	0x1eb7, 499,	/* ặ Ặ */ -	0x1eb9, 499,	/* ẹ Ẹ */ -	0x1ebb, 499,	/* ẻ Ẻ */ -	0x1ebd, 499,	/* ẽ Ẽ */ -	0x1ebf, 499,	/* ế Ế */ -	0x1ec1, 499,	/* ề Ề */ -	0x1ec3, 499,	/* ể Ể */ -	0x1ec5, 499,	/* ễ Ễ */ -	0x1ec7, 499,	/* ệ Ệ */ -	0x1ec9, 499,	/* ỉ Ỉ */ -	0x1ecb, 499,	/* ị Ị */ -	0x1ecd, 499,	/* ọ Ọ */ -	0x1ecf, 499,	/* ỏ Ỏ */ -	0x1ed1, 499,	/* ố Ố */ -	0x1ed3, 499,	/* ồ Ồ */ -	0x1ed5, 499,	/* ổ Ổ */ -	0x1ed7, 499,	/* ỗ Ỗ */ -	0x1ed9, 499,	/* ộ Ộ */ -	0x1edb, 499,	/* ớ Ớ */ -	0x1edd, 499,	/* ờ Ờ */ -	0x1edf, 499,	/* ở Ở */ -	0x1ee1, 499,	/* ỡ Ỡ */ -	0x1ee3, 499,	/* ợ Ợ */ -	0x1ee5, 499,	/* ụ Ụ */ -	0x1ee7, 499,	/* ủ Ủ */ -	0x1ee9, 499,	/* ứ Ứ */ -	0x1eeb, 499,	/* ừ Ừ */ -	0x1eed, 499,	/* ử Ử */ -	0x1eef, 499,	/* ữ Ữ */ -	0x1ef1, 499,	/* ự Ự */ -	0x1ef3, 499,	/* ỳ Ỳ */ -	0x1ef5, 499,	/* ỵ Ỵ */ -	0x1ef7, 499,	/* ỷ Ỷ */ -	0x1ef9, 499,	/* ỹ Ỹ */ -	0x1f51, 508,	/* ὑ Ὑ */ -	0x1f53, 508,	/* ὓ Ὓ */ -	0x1f55, 508,	/* ὕ Ὕ */ -	0x1f57, 508,	/* ὗ Ὗ */ -	0x1fb3, 509,	/* ᾳ ᾼ */ -	0x1fc3, 509,	/* ῃ ῌ */ -	0x1fe5, 507,	/* ῥ Ῥ */ -	0x1ff3, 509,	/* ῳ ῼ */ -}; - -/* - * upper case ranges - *	3rd col is conversion excess 500 - */ -static -Rune	__tolower2[] = -{ -	0x0041,	0x005a, 532,	/* A-Z a-z */ -	0x00c0,	0x00d6, 532,	/* À-Ö à-ö */ -	0x00d8,	0x00de, 532,	/* Ø-Þ ø-þ */ -	0x0189,	0x018a, 705,	/* Ɖ-Ɗ ɖ-ɗ */ -	0x018e,	0x018f, 702,	/* Ǝ-Ə ɘ-ə */ -	0x01b1,	0x01b2, 717,	/* Ʊ-Ʋ ʊ-ʋ */ -	0x0388,	0x038a, 537,	/* Έ-Ί έ-ί */ -	0x038e,	0x038f, 563,	/* Ύ-Ώ ύ-ώ */ -	0x0391,	0x03a1, 532,	/* Α-Ρ α-ρ */ -	0x03a3,	0x03ab, 532,	/* Σ-Ϋ σ-ϋ */ -	0x0401,	0x040c, 580,	/* Ё-Ќ ё-ќ */ -	0x040e,	0x040f, 580,	/* Ў-Џ ў-џ */ -	0x0410,	0x042f, 532,	/* А-Я а-я */ -	0x0531,	0x0556, 548,	/* Ա-Ֆ ա-ֆ */ -	0x10a0,	0x10c5, 548,	/* Ⴀ-Ⴥ ა-ჵ */ -	0x1f08,	0x1f0f, 492,	/* Ἀ-Ἇ ἀ-ἇ */ -	0x1f18,	0x1f1d, 492,	/* Ἐ-Ἕ ἐ-ἕ */ -	0x1f28,	0x1f2f, 492,	/* Ἠ-Ἧ ἠ-ἧ */ -	0x1f38,	0x1f3f, 492,	/* Ἰ-Ἷ ἰ-ἷ */ -	0x1f48,	0x1f4d, 492,	/* Ὀ-Ὅ ὀ-ὅ */ -	0x1f68,	0x1f6f, 492,	/* Ὠ-Ὧ ὠ-ὧ */ -	0x1f88,	0x1f8f, 492,	/* ᾈ-ᾏ ᾀ-ᾇ */ -	0x1f98,	0x1f9f, 492,	/* ᾘ-ᾟ ᾐ-ᾗ */ -	0x1fa8,	0x1faf, 492,	/* ᾨ-ᾯ ᾠ-ᾧ */ -	0x1fb8,	0x1fb9, 492,	/* Ᾰ-Ᾱ ᾰ-ᾱ */ -	0x1fba,	0x1fbb, 426,	/* Ὰ-Ά ὰ-ά */ -	0x1fc8,	0x1fcb, 414,	/* Ὲ-Ή ὲ-ή */ -	0x1fd8,	0x1fd9, 492,	/* Ῐ-Ῑ ῐ-ῑ */ -	0x1fda,	0x1fdb, 400,	/* Ὶ-Ί ὶ-ί */ -	0x1fe8,	0x1fe9, 492,	/* Ῠ-Ῡ ῠ-ῡ */ -	0x1fea,	0x1feb, 388,	/* Ὺ-Ύ ὺ-ύ */ -	0x1ff8,	0x1ff9, 372,	/* Ὸ-Ό ὸ-ό */ -	0x1ffa,	0x1ffb, 374,	/* Ὼ-Ώ ὼ-ώ */ -	0x2160,	0x216f, 516,	/* Ⅰ-Ⅿ ⅰ-ⅿ */ -	0x24b6,	0x24cf, 526,	/* Ⓐ-Ⓩ ⓐ-ⓩ */ -	0xff21,	0xff3a, 532,	/* A-Z a-z */ -}; - -/* - * upper case singlets - *	2nd col is conversion excess 500 - */  static -Rune	__tolower1[] = -{ -	0x0100, 501,	/* Ā ā */ -	0x0102, 501,	/* Ă ă */ -	0x0104, 501,	/* Ą ą */ -	0x0106, 501,	/* Ć ć */ -	0x0108, 501,	/* Ĉ ĉ */ -	0x010a, 501,	/* Ċ ċ */ -	0x010c, 501,	/* Č č */ -	0x010e, 501,	/* Ď ď */ -	0x0110, 501,	/* Đ đ */ -	0x0112, 501,	/* Ē ē */ -	0x0114, 501,	/* Ĕ ĕ */ -	0x0116, 501,	/* Ė ė */ -	0x0118, 501,	/* Ę ę */ -	0x011a, 501,	/* Ě ě */ -	0x011c, 501,	/* Ĝ ĝ */ -	0x011e, 501,	/* Ğ ğ */ -	0x0120, 501,	/* Ġ ġ */ -	0x0122, 501,	/* Ģ ģ */ -	0x0124, 501,	/* Ĥ ĥ */ -	0x0126, 501,	/* Ħ ħ */ -	0x0128, 501,	/* Ĩ ĩ */ -	0x012a, 501,	/* Ī ī */ -	0x012c, 501,	/* Ĭ ĭ */ -	0x012e, 501,	/* Į į */ -	0x0130, 301,	/* İ i */ -	0x0132, 501,	/* IJ ij */ -	0x0134, 501,	/* Ĵ ĵ */ -	0x0136, 501,	/* Ķ ķ */ -	0x0139, 501,	/* Ĺ ĺ */ -	0x013b, 501,	/* Ļ ļ */ -	0x013d, 501,	/* Ľ ľ */ -	0x013f, 501,	/* Ŀ ŀ */ -	0x0141, 501,	/* Ł ł */ -	0x0143, 501,	/* Ń ń */ -	0x0145, 501,	/* Ņ ņ */ -	0x0147, 501,	/* Ň ň */ -	0x014a, 501,	/* Ŋ ŋ */ -	0x014c, 501,	/* Ō ō */ -	0x014e, 501,	/* Ŏ ŏ */ -	0x0150, 501,	/* Ő ő */ -	0x0152, 501,	/* Œ œ */ -	0x0154, 501,	/* Ŕ ŕ */ -	0x0156, 501,	/* Ŗ ŗ */ -	0x0158, 501,	/* Ř ř */ -	0x015a, 501,	/* Ś ś */ -	0x015c, 501,	/* Ŝ ŝ */ -	0x015e, 501,	/* Ş ş */ -	0x0160, 501,	/* Š š */ -	0x0162, 501,	/* Ţ ţ */ -	0x0164, 501,	/* Ť ť */ -	0x0166, 501,	/* Ŧ ŧ */ -	0x0168, 501,	/* Ũ ũ */ -	0x016a, 501,	/* Ū ū */ -	0x016c, 501,	/* Ŭ ŭ */ -	0x016e, 501,	/* Ů ů */ -	0x0170, 501,	/* Ű ű */ -	0x0172, 501,	/* Ų ų */ -	0x0174, 501,	/* Ŵ ŵ */ -	0x0176, 501,	/* Ŷ ŷ */ -	0x0178, 379,	/* Ÿ ÿ */ -	0x0179, 501,	/* Ź ź */ -	0x017b, 501,	/* Ż ż */ -	0x017d, 501,	/* Ž ž */ -	0x0181, 710,	/* Ɓ ɓ */ -	0x0182, 501,	/* Ƃ ƃ */ -	0x0184, 501,	/* Ƅ ƅ */ -	0x0186, 706,	/* Ɔ ɔ */ -	0x0187, 501,	/* Ƈ ƈ */ -	0x018b, 501,	/* Ƌ ƌ */ -	0x0190, 703,	/* Ɛ ɛ */ -	0x0191, 501,	/* Ƒ ƒ */ -	0x0193, 705,	/* Ɠ ɠ */ -	0x0194, 707,	/* Ɣ ɣ */ -	0x0196, 711,	/* Ɩ ɩ */ -	0x0197, 709,	/* Ɨ ɨ */ -	0x0198, 501,	/* Ƙ ƙ */ -	0x019c, 711,	/* Ɯ ɯ */ -	0x019d, 713,	/* Ɲ ɲ */ -	0x01a0, 501,	/* Ơ ơ */ -	0x01a2, 501,	/* Ƣ ƣ */ -	0x01a4, 501,	/* Ƥ ƥ */ -	0x01a7, 501,	/* Ƨ ƨ */ -	0x01a9, 718,	/* Ʃ ʃ */ -	0x01ac, 501,	/* Ƭ ƭ */ -	0x01ae, 718,	/* Ʈ ʈ */ -	0x01af, 501,	/* Ư ư */ -	0x01b3, 501,	/* Ƴ ƴ */ -	0x01b5, 501,	/* Ƶ ƶ */ -	0x01b7, 719,	/* Ʒ ʒ */ -	0x01b8, 501,	/* Ƹ ƹ */ -	0x01bc, 501,	/* Ƽ ƽ */ -	0x01c4, 502,	/* DŽ dž */ -	0x01c5, 501,	/* Dž dž */ -	0x01c7, 502,	/* LJ lj */ -	0x01c8, 501,	/* Lj lj */ -	0x01ca, 502,	/* NJ nj */ -	0x01cb, 501,	/* Nj nj */ -	0x01cd, 501,	/* Ǎ ǎ */ -	0x01cf, 501,	/* Ǐ ǐ */ -	0x01d1, 501,	/* Ǒ ǒ */ -	0x01d3, 501,	/* Ǔ ǔ */ -	0x01d5, 501,	/* Ǖ ǖ */ -	0x01d7, 501,	/* Ǘ ǘ */ -	0x01d9, 501,	/* Ǚ ǚ */ -	0x01db, 501,	/* Ǜ ǜ */ -	0x01de, 501,	/* Ǟ ǟ */ -	0x01e0, 501,	/* Ǡ ǡ */ -	0x01e2, 501,	/* Ǣ ǣ */ -	0x01e4, 501,	/* Ǥ ǥ */ -	0x01e6, 501,	/* Ǧ ǧ */ -	0x01e8, 501,	/* Ǩ ǩ */ -	0x01ea, 501,	/* Ǫ ǫ */ -	0x01ec, 501,	/* Ǭ ǭ */ -	0x01ee, 501,	/* Ǯ ǯ */ -	0x01f1, 502,	/* DZ dz */ -	0x01f2, 501,	/* Dz dz */ -	0x01f4, 501,	/* Ǵ ǵ */ -	0x01fa, 501,	/* Ǻ ǻ */ -	0x01fc, 501,	/* Ǽ ǽ */ -	0x01fe, 501,	/* Ǿ ǿ */ -	0x0200, 501,	/* Ȁ ȁ */ -	0x0202, 501,	/* Ȃ ȃ */ -	0x0204, 501,	/* Ȅ ȅ */ -	0x0206, 501,	/* Ȇ ȇ */ -	0x0208, 501,	/* Ȉ ȉ */ -	0x020a, 501,	/* Ȋ ȋ */ -	0x020c, 501,	/* Ȍ ȍ */ -	0x020e, 501,	/* Ȏ ȏ */ -	0x0210, 501,	/* Ȑ ȑ */ -	0x0212, 501,	/* Ȓ ȓ */ -	0x0214, 501,	/* Ȕ ȕ */ -	0x0216, 501,	/* Ȗ ȗ */ -	0x0386, 538,	/* Ά ά */ -	0x038c, 564,	/* Ό ό */ -	0x03e2, 501,	/* Ϣ ϣ */ -	0x03e4, 501,	/* Ϥ ϥ */ -	0x03e6, 501,	/* Ϧ ϧ */ -	0x03e8, 501,	/* Ϩ ϩ */ -	0x03ea, 501,	/* Ϫ ϫ */ -	0x03ec, 501,	/* Ϭ ϭ */ -	0x03ee, 501,	/* Ϯ ϯ */ -	0x0460, 501,	/* Ѡ ѡ */ -	0x0462, 501,	/* Ѣ ѣ */ -	0x0464, 501,	/* Ѥ ѥ */ -	0x0466, 501,	/* Ѧ ѧ */ -	0x0468, 501,	/* Ѩ ѩ */ -	0x046a, 501,	/* Ѫ ѫ */ -	0x046c, 501,	/* Ѭ ѭ */ -	0x046e, 501,	/* Ѯ ѯ */ -	0x0470, 501,	/* Ѱ ѱ */ -	0x0472, 501,	/* Ѳ ѳ */ -	0x0474, 501,	/* Ѵ ѵ */ -	0x0476, 501,	/* Ѷ ѷ */ -	0x0478, 501,	/* Ѹ ѹ */ -	0x047a, 501,	/* Ѻ ѻ */ -	0x047c, 501,	/* Ѽ ѽ */ -	0x047e, 501,	/* Ѿ ѿ */ -	0x0480, 501,	/* Ҁ ҁ */ -	0x0490, 501,	/* Ґ ґ */ -	0x0492, 501,	/* Ғ ғ */ -	0x0494, 501,	/* Ҕ ҕ */ -	0x0496, 501,	/* Җ җ */ -	0x0498, 501,	/* Ҙ ҙ */ -	0x049a, 501,	/* Қ қ */ -	0x049c, 501,	/* Ҝ ҝ */ -	0x049e, 501,	/* Ҟ ҟ */ -	0x04a0, 501,	/* Ҡ ҡ */ -	0x04a2, 501,	/* Ң ң */ -	0x04a4, 501,	/* Ҥ ҥ */ -	0x04a6, 501,	/* Ҧ ҧ */ -	0x04a8, 501,	/* Ҩ ҩ */ -	0x04aa, 501,	/* Ҫ ҫ */ -	0x04ac, 501,	/* Ҭ ҭ */ -	0x04ae, 501,	/* Ү ү */ -	0x04b0, 501,	/* Ұ ұ */ -	0x04b2, 501,	/* Ҳ ҳ */ -	0x04b4, 501,	/* Ҵ ҵ */ -	0x04b6, 501,	/* Ҷ ҷ */ -	0x04b8, 501,	/* Ҹ ҹ */ -	0x04ba, 501,	/* Һ һ */ -	0x04bc, 501,	/* Ҽ ҽ */ -	0x04be, 501,	/* Ҿ ҿ */ -	0x04c1, 501,	/* Ӂ ӂ */ -	0x04c3, 501,	/* Ӄ ӄ */ -	0x04c7, 501,	/* Ӈ ӈ */ -	0x04cb, 501,	/* Ӌ ӌ */ -	0x04d0, 501,	/* Ӑ ӑ */ -	0x04d2, 501,	/* Ӓ ӓ */ -	0x04d4, 501,	/* Ӕ ӕ */ -	0x04d6, 501,	/* Ӗ ӗ */ -	0x04d8, 501,	/* Ә ә */ -	0x04da, 501,	/* Ӛ ӛ */ -	0x04dc, 501,	/* Ӝ ӝ */ -	0x04de, 501,	/* Ӟ ӟ */ -	0x04e0, 501,	/* Ӡ ӡ */ -	0x04e2, 501,	/* Ӣ ӣ */ -	0x04e4, 501,	/* Ӥ ӥ */ -	0x04e6, 501,	/* Ӧ ӧ */ -	0x04e8, 501,	/* Ө ө */ -	0x04ea, 501,	/* Ӫ ӫ */ -	0x04ee, 501,	/* Ӯ ӯ */ -	0x04f0, 501,	/* Ӱ ӱ */ -	0x04f2, 501,	/* Ӳ ӳ */ -	0x04f4, 501,	/* Ӵ ӵ */ -	0x04f8, 501,	/* Ӹ ӹ */ -	0x1e00, 501,	/* Ḁ ḁ */ -	0x1e02, 501,	/* Ḃ ḃ */ -	0x1e04, 501,	/* Ḅ ḅ */ -	0x1e06, 501,	/* Ḇ ḇ */ -	0x1e08, 501,	/* Ḉ ḉ */ -	0x1e0a, 501,	/* Ḋ ḋ */ -	0x1e0c, 501,	/* Ḍ ḍ */ -	0x1e0e, 501,	/* Ḏ ḏ */ -	0x1e10, 501,	/* Ḑ ḑ */ -	0x1e12, 501,	/* Ḓ ḓ */ -	0x1e14, 501,	/* Ḕ ḕ */ -	0x1e16, 501,	/* Ḗ ḗ */ -	0x1e18, 501,	/* Ḙ ḙ */ -	0x1e1a, 501,	/* Ḛ ḛ */ -	0x1e1c, 501,	/* Ḝ ḝ */ -	0x1e1e, 501,	/* Ḟ ḟ */ -	0x1e20, 501,	/* Ḡ ḡ */ -	0x1e22, 501,	/* Ḣ ḣ */ -	0x1e24, 501,	/* Ḥ ḥ */ -	0x1e26, 501,	/* Ḧ ḧ */ -	0x1e28, 501,	/* Ḩ ḩ */ -	0x1e2a, 501,	/* Ḫ ḫ */ -	0x1e2c, 501,	/* Ḭ ḭ */ -	0x1e2e, 501,	/* Ḯ ḯ */ -	0x1e30, 501,	/* Ḱ ḱ */ -	0x1e32, 501,	/* Ḳ ḳ */ -	0x1e34, 501,	/* Ḵ ḵ */ -	0x1e36, 501,	/* Ḷ ḷ */ -	0x1e38, 501,	/* Ḹ ḹ */ -	0x1e3a, 501,	/* Ḻ ḻ */ -	0x1e3c, 501,	/* Ḽ ḽ */ -	0x1e3e, 501,	/* Ḿ ḿ */ -	0x1e40, 501,	/* Ṁ ṁ */ -	0x1e42, 501,	/* Ṃ ṃ */ -	0x1e44, 501,	/* Ṅ ṅ */ -	0x1e46, 501,	/* Ṇ ṇ */ -	0x1e48, 501,	/* Ṉ ṉ */ -	0x1e4a, 501,	/* Ṋ ṋ */ -	0x1e4c, 501,	/* Ṍ ṍ */ -	0x1e4e, 501,	/* Ṏ ṏ */ -	0x1e50, 501,	/* Ṑ ṑ */ -	0x1e52, 501,	/* Ṓ ṓ */ -	0x1e54, 501,	/* Ṕ ṕ */ -	0x1e56, 501,	/* Ṗ ṗ */ -	0x1e58, 501,	/* Ṙ ṙ */ -	0x1e5a, 501,	/* Ṛ ṛ */ -	0x1e5c, 501,	/* Ṝ ṝ */ -	0x1e5e, 501,	/* Ṟ ṟ */ -	0x1e60, 501,	/* Ṡ ṡ */ -	0x1e62, 501,	/* Ṣ ṣ */ -	0x1e64, 501,	/* Ṥ ṥ */ -	0x1e66, 501,	/* Ṧ ṧ */ -	0x1e68, 501,	/* Ṩ ṩ */ -	0x1e6a, 501,	/* Ṫ ṫ */ -	0x1e6c, 501,	/* Ṭ ṭ */ -	0x1e6e, 501,	/* Ṯ ṯ */ -	0x1e70, 501,	/* Ṱ ṱ */ -	0x1e72, 501,	/* Ṳ ṳ */ -	0x1e74, 501,	/* Ṵ ṵ */ -	0x1e76, 501,	/* Ṷ ṷ */ -	0x1e78, 501,	/* Ṹ ṹ */ -	0x1e7a, 501,	/* Ṻ ṻ */ -	0x1e7c, 501,	/* Ṽ ṽ */ -	0x1e7e, 501,	/* Ṿ ṿ */ -	0x1e80, 501,	/* Ẁ ẁ */ -	0x1e82, 501,	/* Ẃ ẃ */ -	0x1e84, 501,	/* Ẅ ẅ */ -	0x1e86, 501,	/* Ẇ ẇ */ -	0x1e88, 501,	/* Ẉ ẉ */ -	0x1e8a, 501,	/* Ẋ ẋ */ -	0x1e8c, 501,	/* Ẍ ẍ */ -	0x1e8e, 501,	/* Ẏ ẏ */ -	0x1e90, 501,	/* Ẑ ẑ */ -	0x1e92, 501,	/* Ẓ ẓ */ -	0x1e94, 501,	/* Ẕ ẕ */ -	0x1ea0, 501,	/* Ạ ạ */ -	0x1ea2, 501,	/* Ả ả */ -	0x1ea4, 501,	/* Ấ ấ */ -	0x1ea6, 501,	/* Ầ ầ */ -	0x1ea8, 501,	/* Ẩ ẩ */ -	0x1eaa, 501,	/* Ẫ ẫ */ -	0x1eac, 501,	/* Ậ ậ */ -	0x1eae, 501,	/* Ắ ắ */ -	0x1eb0, 501,	/* Ằ ằ */ -	0x1eb2, 501,	/* Ẳ ẳ */ -	0x1eb4, 501,	/* Ẵ ẵ */ -	0x1eb6, 501,	/* Ặ ặ */ -	0x1eb8, 501,	/* Ẹ ẹ */ -	0x1eba, 501,	/* Ẻ ẻ */ -	0x1ebc, 501,	/* Ẽ ẽ */ -	0x1ebe, 501,	/* Ế ế */ -	0x1ec0, 501,	/* Ề ề */ -	0x1ec2, 501,	/* Ể ể */ -	0x1ec4, 501,	/* Ễ ễ */ -	0x1ec6, 501,	/* Ệ ệ */ -	0x1ec8, 501,	/* Ỉ ỉ */ -	0x1eca, 501,	/* Ị ị */ -	0x1ecc, 501,	/* Ọ ọ */ -	0x1ece, 501,	/* Ỏ ỏ */ -	0x1ed0, 501,	/* Ố ố */ -	0x1ed2, 501,	/* Ồ ồ */ -	0x1ed4, 501,	/* Ổ ổ */ -	0x1ed6, 501,	/* Ỗ ỗ */ -	0x1ed8, 501,	/* Ộ ộ */ -	0x1eda, 501,	/* Ớ ớ */ -	0x1edc, 501,	/* Ờ ờ */ -	0x1ede, 501,	/* Ở ở */ -	0x1ee0, 501,	/* Ỡ ỡ */ -	0x1ee2, 501,	/* Ợ ợ */ -	0x1ee4, 501,	/* Ụ ụ */ -	0x1ee6, 501,	/* Ủ ủ */ -	0x1ee8, 501,	/* Ứ ứ */ -	0x1eea, 501,	/* Ừ ừ */ -	0x1eec, 501,	/* Ử ử */ -	0x1eee, 501,	/* Ữ ữ */ -	0x1ef0, 501,	/* Ự ự */ -	0x1ef2, 501,	/* Ỳ ỳ */ -	0x1ef4, 501,	/* Ỵ ỵ */ -	0x1ef6, 501,	/* Ỷ ỷ */ -	0x1ef8, 501,	/* Ỹ ỹ */ -	0x1f59, 492,	/* Ὑ ὑ */ -	0x1f5b, 492,	/* Ὓ ὓ */ -	0x1f5d, 492,	/* Ὕ ὕ */ -	0x1f5f, 492,	/* Ὗ ὗ */ -	0x1fbc, 491,	/* ᾼ ᾳ */ -	0x1fcc, 491,	/* ῌ ῃ */ -	0x1fec, 493,	/* Ῥ ῥ */ -	0x1ffc, 491,	/* ῼ ῳ */ -}; - -/* - * title characters are those between - * upper and lower case. ie DZ Dz dz - */ -static -Rune	__totitle1[] = -{ -	0x01c4, 501,	/* DŽ Dž */ -	0x01c6, 499,	/* dž Dž */ -	0x01c7, 501,	/* LJ Lj */ -	0x01c9, 499,	/* lj Lj */ -	0x01ca, 501,	/* NJ Nj */ -	0x01cc, 499,	/* nj Nj */ -	0x01f1, 501,	/* DZ Dz */ -	0x01f3, 499,	/* dz Dz */ -}; - -static Rune* -bsearch(Rune c, Rune *t, int n, int ne) +Rune* +rbsearch(Rune c, Rune *t, int n, int ne)  {  	Rune *p;  	int m;  	while(n > 1) { -		m = n/2; +		m = n >> 1;  		p = t + m*ne;  		if(c >= p[0]) {  			t = p; @@ -1050,102 +35,36 @@ bsearch(Rune c, Rune *t, int n, int ne)  	return 0;  } -Rune -tolowerrune(Rune c) -{ -	Rune *p; - -	p = bsearch(c, __tolower2, nelem(__tolower2)/3, 3); -	if(p && c >= p[0] && c <= p[1]) -		return c + p[2] - 500; -	p = bsearch(c, __tolower1, nelem(__tolower1)/2, 2); -	if(p && c == p[0]) -		return c + p[1] - 500; -	return c; -} - -Rune -toupperrune(Rune c) -{ -	Rune *p; - -	p = bsearch(c, __toupper2, nelem(__toupper2)/3, 3); -	if(p && c >= p[0] && c <= p[1]) -		return c + p[2] - 500; -	p = bsearch(c, __toupper1, nelem(__toupper1)/2, 2); -	if(p && c == p[0]) -		return c + p[1] - 500; -	return c; -} - -Rune -totitlerune(Rune c) -{ -	Rune *p; - -	p = bsearch(c, __totitle1, nelem(__totitle1)/2, 2); -	if(p && c == p[0]) -		return c + p[1] - 500; -	return c; -} - -int -islowerrune(Rune c) -{ -	Rune *p; - -	p = bsearch(c, __toupper2, nelem(__toupper2)/3, 3); -	if(p && c >= p[0] && c <= p[1]) -		return 1; -	p = bsearch(c, __toupper1, nelem(__toupper1)/2, 2); -	if(p && c == p[0]) -		return 1; -	return 0; -} - -int -isupperrune(Rune c) -{ -	Rune *p; - -	p = bsearch(c, __tolower2, nelem(__tolower2)/3, 3); -	if(p && c >= p[0] && c <= p[1]) -		return 1; -	p = bsearch(c, __tolower1, nelem(__tolower1)/2, 2); -	if(p && c == p[0]) -		return 1; -	return 0; -} +/* + * The "ideographic" property is hard to extract from UnicodeData.txt, + * so it is hard coded here. + * + * It is defined in the Unicode PropList.txt file, for example + * PropList-3.0.0.txt.  Unlike the UnicodeData.txt file, the format of + * PropList changes between versions.  This property appears relatively static; + * it is the same in version 4.0.1, except that version defines some >16 bit + * chars as ideographic as well: 20000..2a6d6, and 2f800..2Fa1d. + */ +static Rune __isideographicr[] = { +	0x3006, 0x3007,			/* 3006 not in Unicode 2, in 2.1 */ +	0x3021, 0x3029, +	0x3038, 0x303a,			/* not in Unicode 2 or 2.1 */ +	0x3400, 0x4db5,			/* not in Unicode 2 or 2.1 */ +	0x4e00, 0x9fbb,			/* 0x9FA6..0x9FBB added for 4.1.0? */ +	0xf900, 0xfa2d, +        0x20000, 0x2A6D6, +        0x2F800, 0x2FA1D, +};  int -isalpharune(Rune c) +isideographicrune(Rune c)  {  	Rune *p; -	if(isupperrune(c) || islowerrune(c)) -		return 1; -	p = bsearch(c, __alpha2, nelem(__alpha2)/2, 2); +	p = rbsearch(c, __isideographicr, nelem(__isideographicr)/2, 2);  	if(p && c >= p[0] && c <= p[1])  		return 1; -	p = bsearch(c, __alpha1, nelem(__alpha1), 1); -	if(p && c == p[0]) -		return 1;  	return 0;  } -int -istitlerune(Rune c) -{ -	return isupperrune(c) && islowerrune(c); -} - -int -isspacerune(Rune c) -{ -	Rune *p; - -	p = bsearch(c, __space2, nelem(__space2)/2, 2); -	if(p && c >= p[0] && c <= p[1]) -		return 1; -	return 0; -} +#include "runetypebody-5.0.0.c" diff --git a/src/lib9/utf/utf.h b/src/lib9/utf/utf.h new file mode 100644 index 000000000..22d418436 --- /dev/null +++ b/src/lib9/utf/utf.h @@ -0,0 +1,248 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + *              Copyright (c) 1998-2002 by Lucent Technologies. + *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ + +#ifndef _UTFH_ +#define _UTFH_ 1 + +#include <stdint.h> + +typedef signed int Rune;	/* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ +  UTFmax	= 4,		/* maximum bytes per rune */ +  Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */ +  Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */ +  Runeerror	= 0xFFFD,	/* decoding error in UTF */ +  Runemax	= 0x10FFFF,	/* maximum rune value */ +}; + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * rune routines + */ + +/* + * These routines were written by Rob Pike and Ken Thompson + * and first appeared in Plan 9. + * SEE ALSO + * utf (7) + * tcs (1) +*/ + +// runetochar copies (encodes) one rune, pointed to by r, to at most +// UTFmax bytes starting at s and returns the number of bytes generated. + +int runetochar(char* s, const Rune* r); + + +// chartorune copies (decodes) at most UTFmax bytes starting at s to +// one rune, pointed to by r, and returns the number of bytes consumed. +// If the input is not exactly in UTF format, chartorune will set *r +// to Runeerror and return 1. +// +// Note: There is no special case for a "null-terminated" string. A +// string whose first byte has the value 0 is the UTF8 encoding of the +// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal +// anywhere else in a UTF sequence. + +int chartorune(Rune* r, const char* s); + + +// charntorune is like chartorune, except that it will access at most +// n bytes of s.  If the UTF sequence is incomplete within n bytes, +// charntorune will set *r to Runeerror and return 0. If it is complete +// but not in UTF format, it will set *r to Runeerror and return 1. +//  +// Added 2004-09-24 by Wei-Hwa Huang + +int charntorune(Rune* r, const char* s, int n); + +// isvalidcharntorune(str, n, r, consumed) +// is a convenience function that calls "*consumed = charntorune(r, str, n)" +// and returns an int (logically boolean) indicating whether the first +// n bytes of str was a valid and complete UTF sequence. + +int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); + +// runelen returns the number of bytes required to convert r into UTF. + +int runelen(Rune r); + + +// runenlen returns the number of bytes required to convert the n +// runes pointed to by r into UTF. + +int runenlen(const Rune* r, int n); + + +// fullrune returns 1 if the string s of length n is long enough to be +// decoded by chartorune, and 0 otherwise. This does not guarantee +// that the string contains a legal UTF encoding. This routine is used +// by programs that obtain input one byte at a time and need to know +// when a full rune has arrived. + +int fullrune(const char* s, int n); + +// The following routines are analogous to the corresponding string +// routines with "utf" substituted for "str", and "rune" substituted +// for "chr". + +// utflen returns the number of runes that are represented by the UTF +// string s. (cf. strlen) + +int utflen(const char* s); + + +// utfnlen returns the number of complete runes that are represented +// by the first n bytes of the UTF string s. If the last few bytes of +// the string contain an incompletely coded rune, utfnlen will not +// count them; in this way, it differs from utflen, which includes +// every byte of the string. (cf. strnlen) + +int utfnlen(const char* s, long n); + + +// utfrune returns a pointer to the first occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string.  The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strchr) + +const char* utfrune(const char* s, Rune r); + + +// utfrrune returns a pointer to the last occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string.  The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strrchr) + +const char* utfrrune(const char* s, Rune r); + + +// utfutf returns a pointer to the first occurrence of the UTF string +// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the +// null string, utfutf returns s1. (cf. strstr) + +const char* utfutf(const char* s1, const char* s2); + + +// utfecpy copies UTF sequences until a null sequence has been copied, +// but writes no sequences beyond es1.  If any sequences are copied, +// s1 is terminated by a null sequence, and a pointer to that sequence +// is returned.  Otherwise, the original s1 is returned. (cf. strecpy) + +char* utfecpy(char *s1, char *es1, const char *s2); + + + +// These functions are rune-string analogues of the corresponding +// functions in strcat (3). +//  +// These routines first appeared in Plan 9. +// SEE ALSO +// memmove (3) +// rune (3) +// strcat (2) +// +// BUGS: The outcome of overlapping moves varies among implementations. + +Rune* runestrcat(Rune* s1, const Rune* s2); +Rune* runestrncat(Rune* s1, const Rune* s2, long n); + +const Rune* runestrchr(const Rune* s, Rune c); + +int runestrcmp(const Rune* s1, const Rune* s2); +int runestrncmp(const Rune* s1, const Rune* s2, long n); + +Rune* runestrcpy(Rune* s1, const Rune* s2); +Rune* runestrncpy(Rune* s1, const Rune* s2, long n); +Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); + +Rune* runestrdup(const Rune* s); + +const Rune* runestrrchr(const Rune* s, Rune c); +long runestrlen(const Rune* s); +const Rune* runestrstr(const Rune* s1, const Rune* s2); + + + +// The following routines test types and modify cases for Unicode +// characters.  Unicode defines some characters as letters and +// specifies three cases: upper, lower, and title.  Mappings among the +// cases are also defined, although they are not exhaustive: some +// upper case letters have no lower case mapping, and so on.  Unicode +// also defines several character properties, a subset of which are +// checked by these routines.  These routines are based on Unicode +// version 3.0.0. +// +// NOTE: The routines are implemented in C, so the boolean functions +// (e.g., isupperrune) return 0 for false and 1 for true. +// +// +// toupperrune, tolowerrune, and totitlerune are the Unicode case +// mappings. These routines return the character unchanged if it has +// no defined mapping. + +Rune toupperrune(Rune r); +Rune tolowerrune(Rune r); +Rune totitlerune(Rune r); + + +// isupperrune tests for upper case characters, including Unicode +// upper case letters and targets of the toupper mapping. islowerrune +// and istitlerune are defined analogously.  +  +int isupperrune(Rune r); +int islowerrune(Rune r); +int istitlerune(Rune r); + + +// isalpharune tests for Unicode letters; this includes ideographs in +// addition to alphabetic characters. + +int isalpharune(Rune r); + + +// isdigitrune tests for digits. Non-digit numbers, such as Roman +// numerals, are not included. + +int isdigitrune(Rune r); + + +// isideographicrune tests for ideographic characters and numbers, as +// defined by the Unicode standard. + +int isideographicrune(Rune r); + + +// isspacerune tests for whitespace characters, including "C" locale +// whitespace, Unicode defined whitespace, and the "zero-width +// non-break space" character. + +int isspacerune(Rune r); + + +// (The comments in this file were copied from the manpage files rune.3, +// isalpharune.3, and runestrcat.3. Some formatting changes were also made +// to conform to Google style. /JRM 11/11/05) + +#ifdef	__cplusplus +} +#endif + +#endif diff --git a/src/lib9/utf/utfdef.h b/src/lib9/utf/utfdef.h index ba3749a9c..adc6d95fb 100644 --- a/src/lib9/utf/utfdef.h +++ b/src/lib9/utf/utfdef.h @@ -12,36 +12,17 @@   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */ -/* - * compiler directive on Plan 9 - */ -#ifndef USED -#define USED(x) if(x);else -#endif +#define uchar _utfuchar +#define ushort _utfushort +#define uint _utfuint +#define ulong _utfulong +#define vlong _utfvlong +#define uvlong _utfuvlong -/* - * easiest way to make sure these are defined - */ -#define uchar	_fmtuchar -#define ushort	_fmtushort -#define uint	_fmtuint -#define ulong	_fmtulong -#define vlong	_fmtvlong -#define uvlong	_fmtuvlong  typedef unsigned char		uchar;  typedef unsigned short		ushort;  typedef unsigned int		uint;  typedef unsigned long		ulong; -typedef unsigned long long	uvlong; -typedef long long		vlong; - -/* - * nil cannot be ((void*)0) on ANSI C, - * because it is used for function pointers - */ -#undef	nil -#define	nil	0 - -#undef	nelem -#define	nelem	((void*)0) +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define nil ((void*)0) diff --git a/src/lib9/utf/utfecpy.c b/src/lib9/utf/utfecpy.c index cf3535fb4..d6dc091c4 100644 --- a/src/lib9/utf/utfecpy.c +++ b/src/lib9/utf/utfecpy.c @@ -7,18 +7,17 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */ -#define _BSD_SOURCE 1	/* memccpy */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h"  char* -utfecpy(char *to, char *e, char *from) +utfecpy(char *to, char *e, const char *from)  {  	char *end; diff --git a/src/lib9/utf/utflen.c b/src/lib9/utf/utflen.c index 769805a5a..45653d540 100644 --- a/src/lib9/utf/utflen.c +++ b/src/lib9/utf/utflen.c @@ -7,17 +7,17 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h"  int -utflen(char *s) +utflen(const char *s)  {  	int c;  	long n; @@ -34,4 +34,5 @@ utflen(char *s)  			s += chartorune(&rune, s);  		n++;  	} +	return 0;  } diff --git a/src/lib9/utf/utfnlen.c b/src/lib9/utf/utfnlen.c index 668032995..d673c8290 100644 --- a/src/lib9/utf/utfnlen.c +++ b/src/lib9/utf/utfnlen.c @@ -7,22 +7,22 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h"  int -utfnlen(char *s, long m) +utfnlen(const char *s, long m)  {  	int c;  	long n;  	Rune rune; -	char *es; +	const char *es;  	es = s + m;  	for(n = 0; s < es; n++) { diff --git a/src/lib9/utf/utfrrune.c b/src/lib9/utf/utfrrune.c index cff12b5e2..c0b89f5c6 100644 --- a/src/lib9/utf/utfrrune.c +++ b/src/lib9/utf/utfrrune.c @@ -7,21 +7,22 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h" +const  char* -utfrrune(char *s, long c) +utfrrune(const char *s, Rune c)  {  	long c1;  	Rune r; -	char *s1; +	const char *s1;  	if(c < Runesync)		/* not part of utf sequence */  		return strrchr(s, c); @@ -42,4 +43,5 @@ utfrrune(char *s, long c)  			s1 = s;  		s += c1;  	} +	return 0;  } diff --git a/src/lib9/utf/utfrune.c b/src/lib9/utf/utfrune.c index 52b83599e..913783f37 100644 --- a/src/lib9/utf/utfrune.c +++ b/src/lib9/utf/utfrune.c @@ -7,17 +7,18 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h" +const  char* -utfrune(char *s, long c) +utfrune(const char *s, Rune c)  {  	long c1;  	Rune r; @@ -41,4 +42,5 @@ utfrune(char *s, long c)  			return s;  		s += n;  	} +	return 0;  } diff --git a/src/lib9/utf/utfutf.c b/src/lib9/utf/utfutf.c index 13c850208..ec4923165 100644 --- a/src/lib9/utf/utfutf.c +++ b/src/lib9/utf/utfutf.c @@ -7,24 +7,25 @@   * or modification of this software and in all copies of the supporting   * documentation for such software.   * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE - * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY   * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.   */  #include <stdarg.h>  #include <string.h> -#include "plan9.h"  #include "utf.h" +#include "utfdef.h"  /*   * Return pointer to first occurrence of s2 in s1,   * 0 if none   */ +const  char* -utfutf(char *s1, char *s2) +utfutf(const char *s1, const char *s2)  { -	char *p; +	const char *p;  	long f, n1, n2;  	Rune r; @@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)  		return strstr(s1, s2);  	n2 = strlen(s2); -	for(p=s1; p=utfrune(p, f); p+=n1) +	for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)  		if(strncmp(p, s2, n2) == 0)  			return p;  	return 0; diff --git a/src/runtime/Makefile b/src/runtime/Makefile index e3f15c836..5b7da999c 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -20,6 +20,7 @@ LIBOFILES=\  	runtime.$O\  	map.$O\  	print.$O\ +	rune.$O\  	string.$O\  	sys_file.$O\ diff --git a/src/runtime/rune.c b/src/runtime/rune.c new file mode 100644 index 000000000..2c717d7a9 --- /dev/null +++ b/src/runtime/rune.c @@ -0,0 +1,224 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + *              Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ + +/* + * This code is copied, with slight editing due to type differences, + * from a subset of ../lib9/utf/rune.c + */ + +#include "runtime.h" + +enum +{ +	Bit1	= 7, +	Bitx	= 6, +	Bit2	= 5, +	Bit3	= 4, +	Bit4	= 3, +	Bit5	= 2,  + +	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */ +	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */ +	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */ +	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */ +	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */ +	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */ + +	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */ +	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */ +	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */ +	Rune4	= (1<<(Bit4+3*Bitx))-1, +                                        /* 0001 1111 1111 1111 1111 1111 */ + +	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */ +	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */ + +	Runeerror	= 0xFFFD, +	Runeself	= 0x80, + +	Bad	= Runeerror, +	 +	Runemax	= 0x10FFFF,	/* maximum rune value */ +}; + +/* + * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 + * This is a slower but "safe" version of the old chartorune  + * that works on strings that are not necessarily null-terminated. + *  + * If you know for sure that your string is null-terminated, + * chartorune will be a bit faster. + * + * It is guaranteed not to attempt to access "length" + * past the incoming pointer.  This is to avoid + * possible access violations.  If the string appears to be + * well-formed but incomplete (i.e., to get the whole Rune + * we'd need to read past str+length) then we'll set the Rune + * to Bad and return 0. + * + * Note that if we have decoding problems for other + * reasons, we return 1 instead of 0. + */ +int32 +charntorune(int32 *rune, byte *str, int32 length) +{ +	int32 c, c1, c2, c3; +	int32 l; + +	/* When we're not allowed to read anything */ +	if(length <= 0) { +		goto badlen; +	} + +	/* +	 * one character sequence (7-bit value) +	 *	00000-0007F => T1 +	 */ +	c = *(byte*)str;  /* cast not necessary, but kept for safety */ +	if(c < Tx) { +		*rune = c; +		return 1; +	} + +	// If we can't read more than one character we must stop +	if(length <= 1) { +		goto badlen; +	} + +	/* +	 * two character sequence (11-bit value) +	 *	0080-07FF => T2 Tx +	 */ +	c1 = *(byte*)(str+1) ^ Tx; +	if(c1 & Testx) +		goto bad; +	if(c < T3) { +		if(c < T2) +			goto bad; +		l = ((c << Bitx) | c1) & Rune2; +		if(l <= Rune1) +			goto bad; +		*rune = l; +		return 2; +	} + +	// If we can't read more than two characters we must stop +	if(length <= 2) { +		goto badlen; +	} + +	/* +	 * three character sequence (16-bit value) +	 *	0800-FFFF => T3 Tx Tx +	 */ +	c2 = *(byte*)(str+2) ^ Tx; +	if(c2 & Testx) +		goto bad; +	if(c < T4) { +		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; +		if(l <= Rune2) +			goto bad; +		*rune = l; +		return 3; +	} + +	if (length <= 3) +		goto badlen; + +	/* +	 * four character sequence (21-bit value) +	 *	10000-1FFFFF => T4 Tx Tx Tx +	 */ +	c3 = *(byte*)(str+3) ^ Tx; +	if (c3 & Testx) +		goto bad; +	if (c < T5) { +		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; +		if (l <= Rune3) +			goto bad; +		*rune = l; +		return 4; +	} + +	// Support for 5-byte or longer UTF-8 would go here, but +	// since we don't have that, we'll just fall through to bad. + +	/* +	 * bad decoding +	 */ +bad: +	*rune = Bad; +	return 1; +badlen: +	*rune = Bad; +	return 0; + +} + +int32 +runetochar(byte *str, int32 rune)  /* note: in original, arg2 was pointer */ +{ +	/* Runes are signed, so convert to unsigned for range check. */ +	uint32 c; + +	/* +	 * one character sequence +	 *	00000-0007F => 00-7F +	 */ +	c = rune; +	if(c <= Rune1) { +		str[0] = c; +		return 1; +	} + +	/* +	 * two character sequence +	 *	0080-07FF => T2 Tx +	 */ +	if(c <= Rune2) { +		str[0] = T2 | (c >> 1*Bitx); +		str[1] = Tx | (c & Maskx); +		return 2; +	} + +	/* +	 * If the Rune is out of range, convert it to the error rune. +	 * Do this test here because the error rune encodes to three bytes. +	 * Doing it earlier would duplicate work, since an out of range +	 * Rune wouldn't have fit in one or two bytes. +	 */ +	if (c > Runemax) +		c = Runeerror; + +	/* +	 * three character sequence +	 *	0800-FFFF => T3 Tx Tx +	 */ +	if (c <= Rune3) { +		str[0] = T3 |  (c >> 2*Bitx); +		str[1] = Tx | ((c >> 1*Bitx) & Maskx); +		str[2] = Tx |  (c & Maskx); +		return 3; +	} + +	/* +	 * four character sequence (21-bit value) +	 *     10000-1FFFFF => T4 Tx Tx Tx +	 */ +	str[0] = T4 | (c >> 3*Bitx); +	str[1] = Tx | ((c >> 2*Bitx) & Maskx); +	str[2] = Tx | ((c >> 1*Bitx) & Maskx); +	str[3] = Tx | (c & Maskx); +	return 4; +} diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index f64353e22..dc8f88bb7 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -85,6 +85,8 @@ enum  int32 strcmp(byte*, byte*);  int32 findnull(int8*);  void	dump(byte*, int32); +int32 runetochar(byte*, int32); +int32 chartorune(uint32*, byte*);  extern string	emptystring;  extern int32 debug; diff --git a/src/runtime/string.c b/src/runtime/string.c index 9bac09184..d21273de0 100644 --- a/src/runtime/string.c +++ b/src/runtime/string.c @@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)  	FLUSH(&b);  } -/* - * this is the plan9 runetochar - * extended for 36 bits in 7 bytes - * note that it truncates to 32 bits - * through the argument passing. - */ -static int32 -runetochar(byte *str, uint32 c) -{ -	int32 i, n; -	uint32 mask, mark; - -	/* -	 * one character in 7 bits -	 */ -	if(c <= 0x07FUL) { -		str[0] = c; -		return 1; -	} - -	/* -	 * every new character picks up 5 bits -	 * one less in the first byte and -	 * six more in an extension byte -	 */ -	mask = 0x7ffUL; -	mark = 0xC0UL; -	for(n=1;; n++) { -		if(c <= mask) -			break; -		mask = (mask<<5) | 0x1fUL; -		mark = (mark>>1) | 0x80UL; -	} - -	/* -	 * lay down the bytes backwards -	 * n is the number of extension bytes -	 * mask is the max codepoint -	 * mark is the zeroth byte indicator -	 */ -	for(i=n; i>0; i--) { -		str[i] = 0x80UL | (c&0x3fUL); -		c >>= 6; -	} - -	str[0] = mark|c; -	return n+1; -} -  void  sys·intstring(int64 v, string s)  { diff --git a/test/string_lit.go b/test/string_lit.go index f4f123c9c..9ef8f931d 100644 --- a/test/string_lit.go +++ b/test/string_lit.go @@ -75,5 +75,14 @@ func main() {  	       `\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,             "backslashes 2 (backquote)");  	assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)"); + +	// test large runes. perhaps not the most logical place for this test. +	var r int32; +	r = 0x10ffff;	// largest rune value +	s = string(r); +	assert(s, "\xf4\x8f\xbf\xbf", "largest rune"); +	r = 0x10ffff + 1; +	s = string(r); +	assert(s, "\xef\xbf\xbd", "too-large rune");  	sys.exit(ecode);  } | 
