diff options
Diffstat (limited to 'usr/src/lib/libdemangle/common/rust.c')
-rw-r--r-- | usr/src/lib/libdemangle/common/rust.c | 543 |
1 files changed, 543 insertions, 0 deletions
diff --git a/usr/src/lib/libdemangle/common/rust.c b/usr/src/lib/libdemangle/common/rust.c new file mode 100644 index 0000000000..f99fe79a10 --- /dev/null +++ b/usr/src/lib/libdemangle/common/rust.c @@ -0,0 +1,543 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +#include <errno.h> +#include <libcustr.h> +#include <limits.h> +#include <string.h> +#include <sys/ctype.h> /* We want the C locale ISXXX() versions */ +#include <sys/debug.h> +#include <stdio.h> +#include <sys/sysmacros.h> + +#include "strview.h" +#include "demangle_int.h" + +/* + * Unfortunately, there is currently no official specification for the rust + * name mangling. This is an attempt to document the understanding of the + * mangling used here. It is based off examination of + * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/ + * + * A mangled rust name is: + * <prefix> <name> <hash> E + * + * <prefix> ::= _Z + * __Z + * + * <name> ::= <name-segment>+ + * + * <name-segment> ::= <len> <name-chars>{len} + * + * <len> ::= [1-9][0-9]+ + * + * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>* + * <separator> + * <special> + * + * <separator> ::= '..' # '::' + * + * <special> ::= $SP$ # ' ' + * $BP$ # '*' + * $RF$ # '&' + * $LT$ # '<' + * $GT$ # '>' + * $LP$ # '(' + * $RP$ # ')' + * $C$ # ',' + * $u7e$ # '~' + * $u20$ # ' ' + * $u27$ # '\'' + * $u3d$ # '=' + * $u5b$ # '[' + * $u5d$ # ']' + * $u7b$ # '{' + * $u7d$ # '}' + * $u3b$ # ';' + * $u2b$ # '+' + * $u22$ # '"' + * + * <hash> := <len> h <hex-digits>+ + * + * <hex-digits> := <[0-9a-f]> + */ + +typedef struct rustdem_state { + const char *rds_str; + custr_t *rds_demangled; + sysdem_ops_t *rds_ops; + int rds_error; +} rustdem_state_t; + +static const struct rust_charmap { + const char *ruc_seq; + char ruc_ch; +} rust_charmap[] = { + { "$SP$", '@' }, + { "$BP$", '*' }, + { "$RF$", '&' }, + { "$LT$", '<' }, + { "$GT$", '>' }, + { "$LP$", '(' }, + { "$RP$", ')' }, + { "$C$", ',' }, + { "$u7e$", '~' }, + { "$u20$", ' ' }, + { "$u27$", '\'' }, + { "$u3d$", '=' }, + { "$u5b$", '[' }, + { "$u5d$", ']' }, + { "$u7b$", '{' }, + { "$u7d$", '}' }, + { "$u3b$", ';' }, + { "$u2b$", '+' }, + { "$u22$", '"' } +}; +static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap); + +static void *rustdem_alloc(custr_alloc_t *, size_t); +static void rustdem_free(custr_alloc_t *, void *, size_t); + +static boolean_t rustdem_append_c(rustdem_state_t *, char); +static boolean_t rustdem_all_ascii(const strview_t *); + +static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *); +static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *); +static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *); +static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *); +static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *); +static boolean_t rustdem_add_sep(rustdem_state_t *); + +char * +rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops) +{ + rustdem_state_t st = { + .rds_str = s, + .rds_ops = ops, + }; + custr_alloc_ops_t custr_ops = { + .custr_ao_alloc = rustdem_alloc, + .custr_ao_free = rustdem_free + }; + custr_alloc_t custr_alloc = { + .cua_version = CUSTR_VERSION + }; + strview_t sv; + int ret; + + if (custr_alloc_init(&custr_alloc, &custr_ops) != 0) + return (NULL); + custr_alloc.cua_arg = &st; + + sv_init_str(&sv, s, s + slen); + + if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') { + DEMDEBUG("ERROR: string is either too small or does not end " + "with 'E'"); + errno = EINVAL; + return (NULL); + } + + if (!rustdem_parse_prefix(&st, &sv)) { + DEMDEBUG("ERROR: could not parse prefix"); + errno = EINVAL; + return (NULL); + } + DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv)); + + if (!rustdem_all_ascii(&sv)) { + /* rustdem_all_ascii() provides debug output */ + errno = EINVAL; + return (NULL); + } + + if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0) + return (NULL); + + while (sv_remaining(&sv) > 1) { + if (rustdem_parse_name(&st, &sv)) + continue; + if (st.rds_error != 0) + goto fail; + } + + if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E')) + goto fail; + + char *res = xstrdup(ops, custr_cstr(st.rds_demangled)); + if (res == NULL) { + st.rds_error = errno; + goto fail; + } + + custr_free(st.rds_demangled); + DEMDEBUG("result = '%s'", res); + return (res); + +fail: + custr_free(st.rds_demangled); + errno = st.rds_error; + return (NULL); +} + +static boolean_t +rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp) +{ + strview_t pfx; + + sv_init_sv(&pfx, svp); + + DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx)); + + if (st->rds_error != 0) + return (B_FALSE); + + if (!sv_consume_if_c(&pfx, '_')) + return (B_FALSE); + + (void) sv_consume_if_c(&pfx, '_'); + + if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N')) + return (B_FALSE); + + /* Update svp with new position */ + sv_init_sv(svp, &pfx); + return (B_TRUE); +} + +static boolean_t +rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first) +{ + strview_t sv; + strview_t name; + uint64_t len; + size_t rem; + boolean_t last = B_FALSE; + + if (st->rds_error != 0 || sv_remaining(svp) == 0) + return (B_FALSE); + + sv_init_sv(&sv, svp); + + if (!rustdem_parse_num(st, &sv, &len)) { + DEMDEBUG("ERROR: no leading length"); + st->rds_error = EINVAL; + return (B_FALSE); + } + + rem = sv_remaining(&sv); + + if (rem < len || len > SIZE_MAX) { + st->rds_error = EINVAL; + return (B_FALSE); + } + + /* Is this the last segment before the terminating E? */ + if (rem == len + 1) { + VERIFY3U(sv_peek(&sv, -1), ==, 'E'); + last = B_TRUE; + } + + if (!first && !rustdem_add_sep(st)) + return (B_FALSE); + + /* Reduce length of seg to the length we parsed */ + (void) sv_init_sv_range(&name, &sv, len); + + DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name)); + + /* + * A rust hash starts with 'h', and is the last component of a name + * before the terminating 'E' + */ + if (sv_peek(&name, 0) == 'h' && last) { + if (!rustdem_parse_hash(st, &name)) + return (B_FALSE); + goto done; + } + + while (sv_remaining(&name) > 0) { + switch (sv_peek(&name, 0)) { + case '$': + if (rustdem_parse_special(st, &name)) + continue; + break; + case '_': + if (sv_peek(&name, 1) == '$') { + /* + * Only consume/ignore '_'. Leave + * $ for next round. + */ + sv_consume_n(&name, 1); + continue; + } + break; + case '.': + /* Convert '..' to '::' */ + if (sv_peek(&name, 1) != '.') + break; + + if (!rustdem_add_sep(st)) + return (B_FALSE); + + sv_consume_n(&name, 2); + continue; + default: + break; + } + + if (custr_appendc(st->rds_demangled, + sv_consume_c(&name)) != 0) { + st->rds_error = ENOMEM; + return (B_FALSE); + } + } + +done: + DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first); + sv_consume_n(&sv, len); + sv_init_sv(svp, &sv); + return (B_TRUE); +} + +static boolean_t +rustdem_parse_name(rustdem_state_t *st, strview_t *svp) +{ + strview_t name; + boolean_t first = B_TRUE; + + if (st->rds_error != 0) + return (B_FALSE); + + sv_init_sv(&name, svp); + + if (sv_remaining(&name) == 0) + return (B_FALSE); + + while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') { + if (!rustdem_parse_name_segment(st, &name, first)) + return (B_FALSE); + first = B_FALSE; + } + + sv_init_sv(svp, &name); + return (B_TRUE); +} + +static boolean_t +rustdem_parse_hash(rustdem_state_t *st, strview_t *svp) +{ + strview_t sv; + + sv_init_sv(&sv, svp); + + VERIFY(sv_consume_if_c(&sv, 'h')); + if (!rustdem_append_c(st, 'h')) + return (B_FALSE); + + while (sv_remaining(&sv) > 0) { + char c = sv_consume_c(&sv); + + switch (c) { + /* + * The upper-case hex digits (A-F) are excluded as valid + * hash values for several reasons: + * + * 1. It would result in two different possible names for + * the same function, leading to ambiguity in linking (among + * other things). + * + * 2. It would cause potential ambiguity in parsing -- is a + * trailing 'E' part of the hash, or the terminating character + * in the mangled name? + * + * 3. No examples were able to be found in the wild where + * uppercase digits are used, and other rust demanglers all + * seem to assume the hash must contain lower-case hex digits. + */ + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': case 'a': case 'b': + case 'c': case 'd': case 'e': case 'f': + if (!rustdem_append_c(st, c)) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + } + + sv_init_sv(svp, &sv); + return (B_TRUE); +} + +/* + * A 10 digit value would imply a name 1Gb or larger in size. It seems + * unlikely to the point of absurdity any such value could every possibly + * be valid (or even have compiled properly). This also prevents the + * uint64_t conversion from possibly overflowing since the value must always + * be below 10 * UINT32_MAX. + */ +#define MAX_DIGITS 10 + +static boolean_t +rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp, + uint64_t *restrict valp) +{ + strview_t snum; + uint64_t v = 0; + size_t ndigits = 0; + char c; + + if (st->rds_error != 0) + return (B_FALSE); + + sv_init_sv(&snum, svp); + + DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum)); + + c = sv_peek(&snum, 0); + if (!ISDIGIT(c)) { + DEMDEBUG("%s: ERROR no digits in str\n", __func__); + st->rds_error = EINVAL; + return (B_FALSE); + } + + /* + * Since there is currently no official specification on rust name + * mangling, only that it has been stated that rust follows what + * C++ mangling does. In the Itanium C++ ABI (what practically + * every non-Windows C++ implementation uses these days), it + * explicitly disallows leading 0s in numeric values (except for + * substition and template indexes, which aren't relevant here). + * We enforce the same restriction -- if a rust implementation allowed + * leading zeros in numbers (basically segment lengths) it'd + * cause all sorts of ambiguity problems with names that likely lead + * to much bigger problems with linking and such, so this seems + * reasonable. + */ + if (c == '0') { + DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__); + st->rds_error = EINVAL; + return (B_FALSE); + } + + while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) { + c = sv_consume_c(&snum); + + if (!ISDIGIT(c)) + break; + + v *= 10; + v += c - '0'; + ndigits++; + } + + if (ndigits > MAX_DIGITS) { + DEMDEBUG("%s: value %llu is too large\n", __func__, v); + st->rds_error = ERANGE; + return (B_FALSE); + } + + DEMDEBUG("%s: num=%llu", __func__, v); + + *valp = v; + sv_consume_n(svp, ndigits); + return (B_TRUE); +} + +static boolean_t +rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp) +{ + if (st->rds_error != 0) + return (B_FALSE); + + if (sv_peek(svp, 0) != '$') + return (B_FALSE); + + for (size_t i = 0; i < rust_charmap_sz; i++) { + if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) { + if (!rustdem_append_c(st, rust_charmap[i].ruc_ch)) + return (B_FALSE); + return (B_TRUE); + } + } + return (B_FALSE); +} + +static boolean_t +rustdem_add_sep(rustdem_state_t *st) +{ + if (st->rds_error != 0) + return (B_FALSE); + + if (!rustdem_append_c(st, ':') || + !rustdem_append_c(st, ':')) + return (B_FALSE); + + return (B_TRUE); +} + +static boolean_t +rustdem_append_c(rustdem_state_t *st, char c) +{ + if (st->rds_error != 0) + return (B_FALSE); + + if (custr_appendc(st->rds_demangled, c) == 0) + return (B_TRUE); + + st->rds_error = errno; + return (B_FALSE); +} + +static boolean_t +rustdem_all_ascii(const strview_t *svp) +{ + strview_t p; + + sv_init_sv(&p, svp); + + while (sv_remaining(&p) > 0) { + char c = sv_consume_c(&p); + + /* + * #including <sys/ctype.h> conflicts with <ctype.h>. Since + * we want the C locale macros (ISDIGIT, etc), it also means + * we can't use isascii(3C). + */ + if ((c & 0x80) != 0) { + DEMDEBUG("%s: found non-ascii character 0x%02hhx at " + "offset %tu", __func__, c, + (ptrdiff_t)(p.sv_first - svp->sv_first)); + return (B_FALSE); + } + } + return (B_TRUE); +} + +static void * +rustdem_alloc(custr_alloc_t *cao, size_t len) +{ + rustdem_state_t *st = cao->cua_arg; + return (zalloc(st->rds_ops, len)); +} + +static void +rustdem_free(custr_alloc_t *cao, void *p, size_t len) +{ + rustdem_state_t *st = cao->cua_arg; + xfree(st->rds_ops, p, len); +} |