summaryrefslogtreecommitdiff
path: root/usr/src/lib/libdemangle/common/rust.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libdemangle/common/rust.c')
-rw-r--r--usr/src/lib/libdemangle/common/rust.c543
1 files changed, 543 insertions, 0 deletions
diff --git a/usr/src/lib/libdemangle/common/rust.c b/usr/src/lib/libdemangle/common/rust.c
new file mode 100644
index 0000000000..f99fe79a10
--- /dev/null
+++ b/usr/src/lib/libdemangle/common/rust.c
@@ -0,0 +1,543 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019, Joyent, Inc.
+ */
+
+#include <errno.h>
+#include <libcustr.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/ctype.h> /* We want the C locale ISXXX() versions */
+#include <sys/debug.h>
+#include <stdio.h>
+#include <sys/sysmacros.h>
+
+#include "strview.h"
+#include "demangle_int.h"
+
+/*
+ * Unfortunately, there is currently no official specification for the rust
+ * name mangling. This is an attempt to document the understanding of the
+ * mangling used here. It is based off examination of
+ * https://docs.rs/rustc-demangle/0.1.13/rustc_demangle/
+ *
+ * A mangled rust name is:
+ * <prefix> <name> <hash> E
+ *
+ * <prefix> ::= _Z
+ * __Z
+ *
+ * <name> ::= <name-segment>+
+ *
+ * <name-segment> ::= <len> <name-chars>{len}
+ *
+ * <len> ::= [1-9][0-9]+
+ *
+ * <name-chars> ::= <[A-Za-z]> <[A-Za-z0-9]>*
+ * <separator>
+ * <special>
+ *
+ * <separator> ::= '..' # '::'
+ *
+ * <special> ::= $SP$ # ' '
+ * $BP$ # '*'
+ * $RF$ # '&'
+ * $LT$ # '<'
+ * $GT$ # '>'
+ * $LP$ # '('
+ * $RP$ # ')'
+ * $C$ # ','
+ * $u7e$ # '~'
+ * $u20$ # ' '
+ * $u27$ # '\''
+ * $u3d$ # '='
+ * $u5b$ # '['
+ * $u5d$ # ']'
+ * $u7b$ # '{'
+ * $u7d$ # '}'
+ * $u3b$ # ';'
+ * $u2b$ # '+'
+ * $u22$ # '"'
+ *
+ * <hash> := <len> h <hex-digits>+
+ *
+ * <hex-digits> := <[0-9a-f]>
+ */
+
+typedef struct rustdem_state {
+ const char *rds_str;
+ custr_t *rds_demangled;
+ sysdem_ops_t *rds_ops;
+ int rds_error;
+} rustdem_state_t;
+
+static const struct rust_charmap {
+ const char *ruc_seq;
+ char ruc_ch;
+} rust_charmap[] = {
+ { "$SP$", '@' },
+ { "$BP$", '*' },
+ { "$RF$", '&' },
+ { "$LT$", '<' },
+ { "$GT$", '>' },
+ { "$LP$", '(' },
+ { "$RP$", ')' },
+ { "$C$", ',' },
+ { "$u7e$", '~' },
+ { "$u20$", ' ' },
+ { "$u27$", '\'' },
+ { "$u3d$", '=' },
+ { "$u5b$", '[' },
+ { "$u5d$", ']' },
+ { "$u7b$", '{' },
+ { "$u7d$", '}' },
+ { "$u3b$", ';' },
+ { "$u2b$", '+' },
+ { "$u22$", '"' }
+};
+static const size_t rust_charmap_sz = ARRAY_SIZE(rust_charmap);
+
+static void *rustdem_alloc(custr_alloc_t *, size_t);
+static void rustdem_free(custr_alloc_t *, void *, size_t);
+
+static boolean_t rustdem_append_c(rustdem_state_t *, char);
+static boolean_t rustdem_all_ascii(const strview_t *);
+
+static boolean_t rustdem_parse_prefix(rustdem_state_t *, strview_t *);
+static boolean_t rustdem_parse_name(rustdem_state_t *, strview_t *);
+static boolean_t rustdem_parse_hash(rustdem_state_t *, strview_t *);
+static boolean_t rustdem_parse_num(rustdem_state_t *, strview_t *, uint64_t *);
+static boolean_t rustdem_parse_special(rustdem_state_t *, strview_t *);
+static boolean_t rustdem_add_sep(rustdem_state_t *);
+
+char *
+rust_demangle(const char *s, size_t slen, sysdem_ops_t *ops)
+{
+ rustdem_state_t st = {
+ .rds_str = s,
+ .rds_ops = ops,
+ };
+ custr_alloc_ops_t custr_ops = {
+ .custr_ao_alloc = rustdem_alloc,
+ .custr_ao_free = rustdem_free
+ };
+ custr_alloc_t custr_alloc = {
+ .cua_version = CUSTR_VERSION
+ };
+ strview_t sv;
+ int ret;
+
+ if (custr_alloc_init(&custr_alloc, &custr_ops) != 0)
+ return (NULL);
+ custr_alloc.cua_arg = &st;
+
+ sv_init_str(&sv, s, s + slen);
+
+ if (sv_remaining(&sv) < 1 || sv_peek(&sv, -1) != 'E') {
+ DEMDEBUG("ERROR: string is either too small or does not end "
+ "with 'E'");
+ errno = EINVAL;
+ return (NULL);
+ }
+
+ if (!rustdem_parse_prefix(&st, &sv)) {
+ DEMDEBUG("ERROR: could not parse prefix");
+ errno = EINVAL;
+ return (NULL);
+ }
+ DEMDEBUG("parsed prefix; remaining='%.*s'", SV_PRINT(&sv));
+
+ if (!rustdem_all_ascii(&sv)) {
+ /* rustdem_all_ascii() provides debug output */
+ errno = EINVAL;
+ return (NULL);
+ }
+
+ if ((ret = custr_xalloc(&st.rds_demangled, &custr_alloc)) != 0)
+ return (NULL);
+
+ while (sv_remaining(&sv) > 1) {
+ if (rustdem_parse_name(&st, &sv))
+ continue;
+ if (st.rds_error != 0)
+ goto fail;
+ }
+
+ if (st.rds_error != 0 || !sv_consume_if_c(&sv, 'E'))
+ goto fail;
+
+ char *res = xstrdup(ops, custr_cstr(st.rds_demangled));
+ if (res == NULL) {
+ st.rds_error = errno;
+ goto fail;
+ }
+
+ custr_free(st.rds_demangled);
+ DEMDEBUG("result = '%s'", res);
+ return (res);
+
+fail:
+ custr_free(st.rds_demangled);
+ errno = st.rds_error;
+ return (NULL);
+}
+
+static boolean_t
+rustdem_parse_prefix(rustdem_state_t *st, strview_t *svp)
+{
+ strview_t pfx;
+
+ sv_init_sv(&pfx, svp);
+
+ DEMDEBUG("checking for '_ZN' or '__ZN' in '%.*s'", SV_PRINT(&pfx));
+
+ if (st->rds_error != 0)
+ return (B_FALSE);
+
+ if (!sv_consume_if_c(&pfx, '_'))
+ return (B_FALSE);
+
+ (void) sv_consume_if_c(&pfx, '_');
+
+ if (!sv_consume_if_c(&pfx, 'Z') || !sv_consume_if_c(&pfx, 'N'))
+ return (B_FALSE);
+
+ /* Update svp with new position */
+ sv_init_sv(svp, &pfx);
+ return (B_TRUE);
+}
+
+static boolean_t
+rustdem_parse_name_segment(rustdem_state_t *st, strview_t *svp, boolean_t first)
+{
+ strview_t sv;
+ strview_t name;
+ uint64_t len;
+ size_t rem;
+ boolean_t last = B_FALSE;
+
+ if (st->rds_error != 0 || sv_remaining(svp) == 0)
+ return (B_FALSE);
+
+ sv_init_sv(&sv, svp);
+
+ if (!rustdem_parse_num(st, &sv, &len)) {
+ DEMDEBUG("ERROR: no leading length");
+ st->rds_error = EINVAL;
+ return (B_FALSE);
+ }
+
+ rem = sv_remaining(&sv);
+
+ if (rem < len || len > SIZE_MAX) {
+ st->rds_error = EINVAL;
+ return (B_FALSE);
+ }
+
+ /* Is this the last segment before the terminating E? */
+ if (rem == len + 1) {
+ VERIFY3U(sv_peek(&sv, -1), ==, 'E');
+ last = B_TRUE;
+ }
+
+ if (!first && !rustdem_add_sep(st))
+ return (B_FALSE);
+
+ /* Reduce length of seg to the length we parsed */
+ (void) sv_init_sv_range(&name, &sv, len);
+
+ DEMDEBUG("%s: segment='%.*s'", __func__, SV_PRINT(&name));
+
+ /*
+ * A rust hash starts with 'h', and is the last component of a name
+ * before the terminating 'E'
+ */
+ if (sv_peek(&name, 0) == 'h' && last) {
+ if (!rustdem_parse_hash(st, &name))
+ return (B_FALSE);
+ goto done;
+ }
+
+ while (sv_remaining(&name) > 0) {
+ switch (sv_peek(&name, 0)) {
+ case '$':
+ if (rustdem_parse_special(st, &name))
+ continue;
+ break;
+ case '_':
+ if (sv_peek(&name, 1) == '$') {
+ /*
+ * Only consume/ignore '_'. Leave
+ * $ for next round.
+ */
+ sv_consume_n(&name, 1);
+ continue;
+ }
+ break;
+ case '.':
+ /* Convert '..' to '::' */
+ if (sv_peek(&name, 1) != '.')
+ break;
+
+ if (!rustdem_add_sep(st))
+ return (B_FALSE);
+
+ sv_consume_n(&name, 2);
+ continue;
+ default:
+ break;
+ }
+
+ if (custr_appendc(st->rds_demangled,
+ sv_consume_c(&name)) != 0) {
+ st->rds_error = ENOMEM;
+ return (B_FALSE);
+ }
+ }
+
+done:
+ DEMDEBUG("%s: consumed '%.*s'", __func__, (int)len, svp->sv_first);
+ sv_consume_n(&sv, len);
+ sv_init_sv(svp, &sv);
+ return (B_TRUE);
+}
+
+static boolean_t
+rustdem_parse_name(rustdem_state_t *st, strview_t *svp)
+{
+ strview_t name;
+ boolean_t first = B_TRUE;
+
+ if (st->rds_error != 0)
+ return (B_FALSE);
+
+ sv_init_sv(&name, svp);
+
+ if (sv_remaining(&name) == 0)
+ return (B_FALSE);
+
+ while (sv_remaining(&name) > 0 && sv_peek(&name, 0) != 'E') {
+ if (!rustdem_parse_name_segment(st, &name, first))
+ return (B_FALSE);
+ first = B_FALSE;
+ }
+
+ sv_init_sv(svp, &name);
+ return (B_TRUE);
+}
+
+static boolean_t
+rustdem_parse_hash(rustdem_state_t *st, strview_t *svp)
+{
+ strview_t sv;
+
+ sv_init_sv(&sv, svp);
+
+ VERIFY(sv_consume_if_c(&sv, 'h'));
+ if (!rustdem_append_c(st, 'h'))
+ return (B_FALSE);
+
+ while (sv_remaining(&sv) > 0) {
+ char c = sv_consume_c(&sv);
+
+ switch (c) {
+ /*
+ * The upper-case hex digits (A-F) are excluded as valid
+ * hash values for several reasons:
+ *
+ * 1. It would result in two different possible names for
+ * the same function, leading to ambiguity in linking (among
+ * other things).
+ *
+ * 2. It would cause potential ambiguity in parsing -- is a
+ * trailing 'E' part of the hash, or the terminating character
+ * in the mangled name?
+ *
+ * 3. No examples were able to be found in the wild where
+ * uppercase digits are used, and other rust demanglers all
+ * seem to assume the hash must contain lower-case hex digits.
+ */
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ case '8': case '9': case 'a': case 'b':
+ case 'c': case 'd': case 'e': case 'f':
+ if (!rustdem_append_c(st, c))
+ return (B_FALSE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ }
+
+ sv_init_sv(svp, &sv);
+ return (B_TRUE);
+}
+
+/*
+ * A 10 digit value would imply a name 1Gb or larger in size. It seems
+ * unlikely to the point of absurdity any such value could every possibly
+ * be valid (or even have compiled properly). This also prevents the
+ * uint64_t conversion from possibly overflowing since the value must always
+ * be below 10 * UINT32_MAX.
+ */
+#define MAX_DIGITS 10
+
+static boolean_t
+rustdem_parse_num(rustdem_state_t *restrict st, strview_t *restrict svp,
+ uint64_t *restrict valp)
+{
+ strview_t snum;
+ uint64_t v = 0;
+ size_t ndigits = 0;
+ char c;
+
+ if (st->rds_error != 0)
+ return (B_FALSE);
+
+ sv_init_sv(&snum, svp);
+
+ DEMDEBUG("%s: str='%.*s'", __func__, SV_PRINT(&snum));
+
+ c = sv_peek(&snum, 0);
+ if (!ISDIGIT(c)) {
+ DEMDEBUG("%s: ERROR no digits in str\n", __func__);
+ st->rds_error = EINVAL;
+ return (B_FALSE);
+ }
+
+ /*
+ * Since there is currently no official specification on rust name
+ * mangling, only that it has been stated that rust follows what
+ * C++ mangling does. In the Itanium C++ ABI (what practically
+ * every non-Windows C++ implementation uses these days), it
+ * explicitly disallows leading 0s in numeric values (except for
+ * substition and template indexes, which aren't relevant here).
+ * We enforce the same restriction -- if a rust implementation allowed
+ * leading zeros in numbers (basically segment lengths) it'd
+ * cause all sorts of ambiguity problems with names that likely lead
+ * to much bigger problems with linking and such, so this seems
+ * reasonable.
+ */
+ if (c == '0') {
+ DEMDEBUG("%s: ERROR number starts with leading 0\n", __func__);
+ st->rds_error = EINVAL;
+ return (B_FALSE);
+ }
+
+ while (sv_remaining(&snum) > 0 && ndigits <= MAX_DIGITS) {
+ c = sv_consume_c(&snum);
+
+ if (!ISDIGIT(c))
+ break;
+
+ v *= 10;
+ v += c - '0';
+ ndigits++;
+ }
+
+ if (ndigits > MAX_DIGITS) {
+ DEMDEBUG("%s: value %llu is too large\n", __func__, v);
+ st->rds_error = ERANGE;
+ return (B_FALSE);
+ }
+
+ DEMDEBUG("%s: num=%llu", __func__, v);
+
+ *valp = v;
+ sv_consume_n(svp, ndigits);
+ return (B_TRUE);
+}
+
+static boolean_t
+rustdem_parse_special(rustdem_state_t *restrict st, strview_t *restrict svp)
+{
+ if (st->rds_error != 0)
+ return (B_FALSE);
+
+ if (sv_peek(svp, 0) != '$')
+ return (B_FALSE);
+
+ for (size_t i = 0; i < rust_charmap_sz; i++) {
+ if (sv_consume_if(svp, rust_charmap[i].ruc_seq)) {
+ if (!rustdem_append_c(st, rust_charmap[i].ruc_ch))
+ return (B_FALSE);
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+rustdem_add_sep(rustdem_state_t *st)
+{
+ if (st->rds_error != 0)
+ return (B_FALSE);
+
+ if (!rustdem_append_c(st, ':') ||
+ !rustdem_append_c(st, ':'))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static boolean_t
+rustdem_append_c(rustdem_state_t *st, char c)
+{
+ if (st->rds_error != 0)
+ return (B_FALSE);
+
+ if (custr_appendc(st->rds_demangled, c) == 0)
+ return (B_TRUE);
+
+ st->rds_error = errno;
+ return (B_FALSE);
+}
+
+static boolean_t
+rustdem_all_ascii(const strview_t *svp)
+{
+ strview_t p;
+
+ sv_init_sv(&p, svp);
+
+ while (sv_remaining(&p) > 0) {
+ char c = sv_consume_c(&p);
+
+ /*
+ * #including <sys/ctype.h> conflicts with <ctype.h>. Since
+ * we want the C locale macros (ISDIGIT, etc), it also means
+ * we can't use isascii(3C).
+ */
+ if ((c & 0x80) != 0) {
+ DEMDEBUG("%s: found non-ascii character 0x%02hhx at "
+ "offset %tu", __func__, c,
+ (ptrdiff_t)(p.sv_first - svp->sv_first));
+ return (B_FALSE);
+ }
+ }
+ return (B_TRUE);
+}
+
+static void *
+rustdem_alloc(custr_alloc_t *cao, size_t len)
+{
+ rustdem_state_t *st = cao->cua_arg;
+ return (zalloc(st->rds_ops, len));
+}
+
+static void
+rustdem_free(custr_alloc_t *cao, void *p, size_t len)
+{
+ rustdem_state_t *st = cao->cua_arg;
+ xfree(st->rds_ops, p, len);
+}