usr/src/lib/libdemangle/common/rust-v0puny.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264

/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2021 Jason King
 */

#include <inttypes.h>
#include <libcustr.h>
#include <limits.h>
#include <string.h>
#include <sys/byteorder.h>
#include "rust.h"
#include "strview.h"

/*
 * The rust v0 encoding (rust RFC 2603) uses a slightly modified
 * version of punycode to encode characters that are not ASCII.
 * The big difference is that '_' is used to separate the ASCII code points
 * from the non-ASCII code points instead of '-'.
 *
 * The decoding is taken almost directly from (IETF) RFC 3492.
 */

/*
 * Punycode parameters. The numeric values are the ones given in
 * RFC 3492 section 5; only the delimiter differs ('_' instead of '-').
 *
 * BASE		number of distinct digit values (a-z, 0-9)
 * TMIN, TMAX	lower and upper clamp for the per-digit threshold
 * SKEW, DAMP	constants used by the bias adaptation function
 * INITIAL_BIAS	bias in effect before the first code point is decoded
 * INITIAL_N	first non-basic code point (i.e. first non-ASCII value)
 * DELIMITER	separates the basic code points from the encoded deltas
 */
#define	BASE		36
#define	TMIN		1
#define	TMAX		26
#define	SKEW		38
#define	DAMP		700
#define	INITIAL_BIAS	72
#define	INITIAL_N	0x80
#define	DELIMITER	'_'

/* Forward declaration; char_val() is defined after its first use below. */
static inline uint32_t char_val(char);

/*
 * Bias adaptation function (RFC 3492 section 6.1).
 *
 * delta	difference between the current and the previous value of
 *		the decoder's running index
 * npoints	number of code points decoded so far, including the one
 *		just decoded; must be non-zero since it is used as a divisor
 * first	true when adapting after the very first delta, in which
 *		case delta is scaled down by DAMP instead of by 2
 *
 * Returns the bias to use while decoding the next delta.
 */
static size_t
rustv0_puny_adapt(size_t delta, size_t npoints, boolean_t first)
{
	const size_t threshold = ((BASE - TMIN) * TMAX) / 2;
	size_t bias;

	if (first)
		delta /= DAMP;
	else
		delta /= 2;

	delta += delta / npoints;

	/* Each division by (BASE - TMIN) contributes BASE to the result. */
	for (bias = 0; delta > threshold; bias += BASE)
		delta /= (BASE - TMIN);

	bias += ((BASE - TMIN + 1) * delta) / (delta + SKEW);

	return (bias);
}

/*
 * Decode a punycode-encoded identifier (rust v0 variant, see the top of
 * this file).
 *
 * st			demangler state; supplies the allocator ops and is
 *			passed to rust_append_utf8_c() for each decoded
 *			code point
 * src			the encoded identifier; it is consumed as it is
 *			decoded
 * repl_underscore	when true, every '_' among the basic (ASCII) code
 *			points is emitted as '-'
 *
 * Returns B_TRUE once every decoded code point has been appended
 * successfully. Returns B_FALSE if the temporary buffer cannot be
 * allocated (SET_ERROR() is also invoked on st in that case), if the
 * input is malformed (invalid digit, truncated or overflowing integer,
 * code point out of range, no non-basic code points), or if appending
 * a code point fails.
 */
boolean_t
rustv0_puny_decode(rust_state_t *restrict st, strview_t *restrict src,
    boolean_t repl_underscore)
{
	uint32_t *buf;
	size_t bufalloc; /* in units of uint32_t */
	size_t buflen;
	size_t nbasic;
	size_t i, old_i, k, w;
	size_t n = INITIAL_N;
	size_t bias = INITIAL_BIAS;
	size_t delim_idx = 0;
	boolean_t ret = B_FALSE;
	char c;

	DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(src));

	/*
	 * The decoded string should never contain more codepoints than
	 * the original string, so creating a temporary buffer large
	 * enough to hold sv_remaining(src) uint32_t's should be
	 * large enough.
	 *
	 * This also serves as a size check -- xcalloc will fail if the
	 * resulting size of the buf (sizeof (uint32_t) * bufalloc) >=
	 * SIZE_MAX. If xcalloc succeeds, we therefore know that
	 * buflen cannot overflow.
	 */
	buflen = 0;
	bufalloc = sv_remaining(src) + 1;
	buf = xcalloc(st->rs_ops, bufalloc, sizeof (uint32_t));
	if (buf == NULL) {
		SET_ERROR(st);
		return (B_FALSE);
	}

	/*
	 * Find the position of the last delimiter (if any).
	 * IETF RFC 3492 3.1 states that the delimiter is present if and only
	 * if there are a non-zero number of basic (ASCII) code points. Since
	 * the delimiter itself is a basic code point, the last one present
	 * in the original string is the actual delimiter between the basic
	 * and non-basic code points. Earlier occurrences of the delimiter
	 * are treated as normal basic code points. For plain punycode, an
	 * all ASCII string encoded with punycode would terminate with a
	 * final delimiter, and a name with all non-basic code points would
	 * not have a delimiter at all. With the rust v0 encoding, punycode
	 * encoded identifiers have a 'u' prefix prior to the identifier
	 * length (['u'] <decimal-number> <bytes>), so we should never
	 * encounter an all ASCII name that's encoded with punycode (we error
	 * on this).  For an all non-basic codepoint identifier, no delimiter
	 * will be present, and we treat that the same as the delimiter being
	 * in the first position of the string, and consume it (if present)
	 * when we transition from copying the basic code points (which there
	 * will be none in this situation) to non-basic code points.
	 */
	for (i = 0; i < src->sv_rem; i++) {
		if (src->sv_first[i] == DELIMITER) {
			delim_idx = i;
		}
	}
	VERIFY3U(delim_idx, <, bufalloc);

	/*
	 * A delimiter in the final position means nothing follows it to
	 * decode. Because delim_idx is also 0 when no delimiter was found,
	 * a one character input is rejected by this test as well.
	 */
	if (delim_idx + 1 == sv_remaining(src)) {
		DEMDEBUG("%s: encountered an all-ASCII name encoded with "
		    "punycode", __func__);
		goto done;
	}

	/* Copy all the basic characters up to the delimiter into buf */
	for (nbasic = 0; nbasic < delim_idx; nbasic++) {
		c = sv_consume_c(src);

		/* The rust prefix check should guarantee this */
		VERIFY3U(c, <, 0x80);

		/*
		 * Normal rust identifiers do not contain '-' in them.
		 * However ABI identifiers could contain a dash. Those
		 * are translated to _, and we need to replace accordingly
		 * when asked.
		 */
		if (repl_underscore && c == '_')
			c = '-';

		buf[nbasic] = c;
		buflen++;
	}
	/* nbasic is a size_t, so it must be printed with %zu */
	DEMDEBUG("%s: %zu ASCII codepoints copied", __func__, nbasic);

	/*
	 * Consume delimiter between basic and non-basic code points if present.
	 * See above for explanation why it may not be present.
	 */
	(void) sv_consume_if_c(src, DELIMITER);

	DEMDEBUG("%s: non-ASCII codepoints to decode: %.*s", __func__,
	    SV_PRINT(src));

	/*
	 * Each iteration decodes one generalized variable-length integer
	 * into i, then splits it into a code point increment (added to n)
	 * and an insertion index. The i++ in the loop header is the
	 * "increment i" step that follows each insertion in RFC 3492 6.2.
	 */
	for (i = 0; sv_remaining(src) > 0; i++) {
		VERIFY3U(i, <=, buflen);

		/*
		 * Guarantee we have enough space to insert another codepoint.
		 * Our buffer sizing above should prevent this from ever
		 * tripping, but check this out of paranoia.
		 */
		VERIFY3U(buflen, <, bufalloc - 1);

		/* decode the next codepoint */
		for (old_i = i, k = BASE, w = 1; ; k += BASE) {
			size_t t;
			uint32_t digit;

			if (sv_remaining(src) == 0)
				goto done;

			digit = char_val(sv_consume_c(src));
			if (digit >= BASE)
				goto done;

			/*
			 * RFC 3492 6.2 requires failing on overflow rather
			 * than silently wrapping. w is always >= 1, so the
			 * division is safe.
			 */
			if (digit > (SIZE_MAX - i) / w) {
				DEMDEBUG("%s: ERROR: overflow while decoding "
				    "delta", __func__);
				goto done;
			}
			i += digit * w;

			if (k <= bias)
				t = TMIN;
			else if (k >= bias + TMAX)
				t = TMAX;
			else
				t = k - bias;

			if (digit < t)
				break;

			/* t <= TMAX < BASE, so (BASE - t) is never zero */
			if (w > SIZE_MAX / (BASE - t)) {
				DEMDEBUG("%s: ERROR: overflow while decoding "
				    "delta", __func__);
				goto done;
			}
			w *= (BASE - t);
		}
		buflen++;

		bias = rustv0_puny_adapt(i - old_i, buflen,
		    (old_i == 0) ? B_TRUE : B_FALSE);

		/*
		 * n starts at INITIAL_N and is only ever updated below after
		 * passing this test, so n <= UINT32_MAX always holds and the
		 * subtraction cannot wrap. Testing before the addition
		 * catches values that would wrap a size_t (which on ILP32
		 * is no wider than uint32_t) as well as values that merely
		 * do not fit in a uint32_t.
		 */
		if (i / buflen > UINT32_MAX - n) {
			DEMDEBUG("%s: ERROR: utf8 value is out of range",
			    __func__);
			goto done;
		}
		n += i / buflen;
		i %= buflen;

		DEMDEBUG("%s: insert \\u%04zx at index %zu (len = %zu)",
		    __func__, n, i, buflen);

		/*
		 * At the top of this iteration we guaranteed that the old
		 * buflen was < bufalloc - 1, so after the increment above
		 * buflen <= bufalloc - 1. We also just reduced i into the
		 * range [0, buflen). The move below shifts slots
		 * [i, buflen) up by one (the last of those is a still
		 * unused slot), so the highest slot written is buf[buflen],
		 * which is within the allocation.
		 */
		(void) memmove(buf + i + 1, buf + i,
		    (buflen - i) * sizeof (uint32_t));

		buf[i] = (uint32_t)n;
	}

	DEMDEBUG("%s: inserted %zu non-basic code points", __func__,
	    buflen - nbasic);

	for (i = 0; i < buflen; i++) {
		if (!rust_append_utf8_c(st, buf[i]))
			goto done;
	}
	ret = B_TRUE;

done:
	xfree(st->rs_ops, buf, bufalloc * sizeof (uint32_t));
	return (ret);
}

/*
 * Convert a punycode digit character to its value: 'a'..'z' map to
 * 0..25 and '0'..'9' map to 26..35. Rust's punycode encoding always
 * uses lowercase, so we treat uppercase (and any other characters) as
 * invalid, and return BASE (36) to indicate a bad value.
 */
static inline uint32_t
char_val(char c)
{
	/*
	 * Convert via unsigned char so that a byte >= 0x80 is reported as
	 * its two hex digit value instead of being sign-extended where
	 * char is a signed type. Only used for the diagnostic below.
	 */
	uint32_t v = (unsigned char)c;

	if (ISLOWER(c)) {
		return (c - 'a');
	} else if (ISDIGIT(c)) {
		return (c - '0' + 26);
	} else {
		DEMDEBUG("%s: ERROR: invalid character 0x%02x encountered",
		    __func__, v);
		return (BASE);
	}
}