diff options
Diffstat (limited to 'usr/src/common')
24 files changed, 3576 insertions, 153 deletions
diff --git a/usr/src/common/brand/lx/lx_auxv.c b/usr/src/common/brand/lx/lx_auxv.c new file mode 100644 index 0000000000..2ed5fd0517 --- /dev/null +++ b/usr/src/common/brand/lx/lx_auxv.c @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/auxv.h> +#include <sys/lx_brand.h> + +/* + * Linux does not make the distinction between 'int' and 'long' when it comes + * to the format of the aux vector. In order to properly clear the struct + * padding present in the native auxv_t in 64-bit, we employ the Linux format. + */ +struct lx_auxv { + long la_type; + long la_val; +}; + +int +lx_auxv_stol(const auxv_t *ap, auxv_t *oap, const lx_elf_data_t *edp) +{ + struct lx_auxv *loap = (struct lx_auxv *)oap; + + switch (ap->a_type) { + case AT_BASE: + loap->la_val = edp->ed_base; + break; + case AT_ENTRY: + loap->la_val = edp->ed_entry; + break; + case AT_PHDR: + loap->la_val = edp->ed_phdr; + break; + case AT_PHENT: + loap->la_val = edp->ed_phent; + break; + case AT_PHNUM: + loap->la_val = edp->ed_phnum; + break; + case AT_SUN_BRAND_LX_SYSINFO_EHDR: + loap->la_type = AT_SYSINFO_EHDR; + loap->la_val = ap->a_un.a_val; + return (0); + case AT_SUN_BRAND_LX_CLKTCK: + loap->la_type = AT_CLKTCK; + loap->la_val = ap->a_un.a_val; + return (0); + case AT_SUN_AUXFLAGS: + if ((ap->a_un.a_val & AF_SUN_SETUGID) != 0) { + loap->la_type = AT_SECURE; + loap->la_val = 1; + return (0); + } else { + return (1); + } + case AT_SUN_GID: + loap->la_type = AT_LX_EGID; + loap->la_val = ap->a_un.a_val; + return (0); + case AT_SUN_RGID: + loap->la_type = AT_LX_GID; + 
loap->la_val = ap->a_un.a_val; + return (0); + case AT_SUN_UID: + loap->la_type = AT_LX_EUID; + loap->la_val = ap->a_un.a_val; + return (0); + case AT_SUN_RUID: + loap->la_type = AT_LX_UID; + loap->la_val = ap->a_un.a_val; + return (0); + case AT_EXECFD: + case AT_PAGESZ: + case AT_FLAGS: + case AT_RANDOM: + case AT_NULL: + /* No translate needed */ + loap->la_val = ap->a_un.a_val; + break; + default: + /* All other unrecognized entries are ignored */ + return (1); + } + loap->la_type = ap->a_type; + return (0); +} diff --git a/usr/src/common/brand/lx/lx_auxv.h b/usr/src/common/brand/lx/lx_auxv.h new file mode 100644 index 0000000000..190d939f35 --- /dev/null +++ b/usr/src/common/brand/lx/lx_auxv.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_AUXV_H +#define _LX_AUXV_H + +#include <sys/auxv.h> +#include <sys/lx_brand.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int lx_auxv_stol(const auxv_t *, auxv_t *, const lx_elf_data_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUXV_H */ diff --git a/usr/src/common/brand/lx/lx_errno.c b/usr/src/common/brand/lx/lx_errno.c new file mode 100644 index 0000000000..269ed470dc --- /dev/null +++ b/usr/src/common/brand/lx/lx_errno.c @@ -0,0 +1,206 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * This file contains a mapping table and lookup function for converting + * illumos native error numbers into error numbers appropriate for Linux + * emulation. + * + * The translation table is generated by the "gen_errno", built from and + * documented in "usr/src/common/brand/lx/tools". + */ + +#include <sys/debug.h> + +const int +lx_stol_errno[] = { + 0, /* 0: No Error */ + 1, /* 1: EPERM --> 1: EPERM */ + 2, /* 2: ENOENT --> 2: ENOENT */ + 3, /* 3: ESRCH --> 3: ESRCH */ + 4, /* 4: EINTR --> 4: EINTR */ + 5, /* 5: EIO --> 5: EIO */ + 6, /* 6: ENXIO --> 6: ENXIO */ + 7, /* 7: E2BIG --> 7: E2BIG */ + 8, /* 8: ENOEXEC --> 8: ENOEXEC */ + 9, /* 9: EBADF --> 9: EBADF */ + 10, /* 10: ECHILD --> 10: ECHILD */ + 11, /* 11: EAGAIN --> 11: EAGAIN */ + 12, /* 12: ENOMEM --> 12: ENOMEM */ + 13, /* 13: EACCES --> 13: EACCES */ + 14, /* 14: EFAULT --> 14: EFAULT */ + 15, /* 15: ENOTBLK --> 15: ENOTBLK */ + 16, /* 16: EBUSY --> 16: EBUSY */ + 17, /* 17: EEXIST --> 17: EEXIST */ + 18, /* 18: EXDEV --> 18: EXDEV */ + 19, /* 19: ENODEV --> 19: ENODEV */ + 20, /* 20: ENOTDIR --> 20: ENOTDIR */ + 21, /* 21: EISDIR --> 21: EISDIR */ + 22, /* 22: EINVAL --> 22: EINVAL */ + 23, /* 23: ENFILE --> 23: ENFILE */ + 24, /* 24: EMFILE --> 24: EMFILE */ + 25, /* 25: ENOTTY --> 25: ENOTTY */ + 26, /* 26: ETXTBSY --> 26: ETXTBSY */ + 27, /* 27: EFBIG --> 27: EFBIG */ + 28, /* 28: ENOSPC --> 28: ENOSPC */ + 29, /* 29: ESPIPE --> 29: ESPIPE */ + 30, /* 30: EROFS --> 30: EROFS */ + 31, /* 31: EMLINK --> 31: EMLINK */ + 32, /* 32: EPIPE --> 32: EPIPE */ + 33, /* 33: EDOM --> 33: EDOM */ + 34, /* 34: ERANGE --> 34: ERANGE */ + 42, /* 35: ENOMSG --> 42: ENOMSG */ + 43, /* 36: EIDRM --> 43: EIDRM */ + 44, /* 37: ECHRNG --> 44: ECHRNG */ + 45, /* 38: EL2NSYNC --> 45: EL2NSYNC */ + 46, /* 39: EL3HLT --> 46: EL3HLT */ + 47, /* 40: EL3RST --> 47: EL3RST 
*/ + 48, /* 41: ELNRNG --> 48: ELNRNG */ + 49, /* 42: EUNATCH --> 49: EUNATCH */ + 50, /* 43: ENOCSI --> 50: ENOCSI */ + 51, /* 44: EL2HLT --> 51: EL2HLT */ + 35, /* 45: EDEADLK --> 35: EDEADLK */ + 37, /* 46: ENOLCK --> 37: ENOLCK */ + 125, /* 47: ECANCELED --> 125: ECANCELED */ + 38, /* 48: ENOTSUP --> 38: ENOSYS */ + 122, /* 49: EDQUOT --> 122: EDQUOT */ + 52, /* 50: EBADE --> 52: EBADE */ + 53, /* 51: EBADR --> 53: EBADR */ + 54, /* 52: EXFULL --> 54: EXFULL */ + 55, /* 53: ENOANO --> 55: ENOANO */ + 56, /* 54: EBADRQC --> 56: EBADRQC */ + 57, /* 55: EBADSLT --> 57: EBADSLT */ + 35, /* 56: EDEADLOCK --> 35: EDEADLK */ + 59, /* 57: EBFONT --> 59: EBFONT */ + 130, /* 58: EOWNERDEAD --> 130: EOWNERDEAD */ + 131, /* 59: ENOTRECOVERABLE --> 131: ENOTRECOVERABLE */ + 60, /* 60: ENOSTR --> 60: ENOSTR */ + 61, /* 61: ENODATA --> 61: ENODATA */ + 62, /* 62: ETIME --> 62: ETIME */ + 63, /* 63: ENOSR --> 63: ENOSR */ + 64, /* 64: ENONET --> 64: ENONET */ + 65, /* 65: ENOPKG --> 65: ENOPKG */ + 66, /* 66: EREMOTE --> 66: EREMOTE */ + 67, /* 67: ENOLINK --> 67: ENOLINK */ + 68, /* 68: EADV --> 68: EADV */ + 69, /* 69: ESRMNT --> 69: ESRMNT */ + 70, /* 70: ECOMM --> 70: ECOMM */ + 71, /* 71: EPROTO --> 71: EPROTO */ + -2, /* 72: ELOCKUNMAPPED --> -2: No Analogue */ + -2, /* 73: ENOTACTIVE --> -2: No Analogue */ + 72, /* 74: EMULTIHOP --> 72: EMULTIHOP */ + -1, /* 75: Unused Number */ + -1, /* 76: Unused Number */ + 74, /* 77: EBADMSG --> 74: EBADMSG */ + 36, /* 78: ENAMETOOLONG --> 36: ENAMETOOLONG */ + 75, /* 79: EOVERFLOW --> 75: EOVERFLOW */ + 76, /* 80: ENOTUNIQ --> 76: ENOTUNIQ */ + 77, /* 81: EBADFD --> 77: EBADFD */ + 78, /* 82: EREMCHG --> 78: EREMCHG */ + 79, /* 83: ELIBACC --> 79: ELIBACC */ + 80, /* 84: ELIBBAD --> 80: ELIBBAD */ + 81, /* 85: ELIBSCN --> 81: ELIBSCN */ + 82, /* 86: ELIBMAX --> 82: ELIBMAX */ + 83, /* 87: ELIBEXEC --> 83: ELIBEXEC */ + 84, /* 88: EILSEQ --> 84: EILSEQ */ + 38, /* 89: ENOSYS --> 38: ENOSYS */ + 40, /* 90: ELOOP --> 40: ELOOP */ + 
85, /* 91: ERESTART --> 85: ERESTART */ + 86, /* 92: ESTRPIPE --> 86: ESTRPIPE */ + 39, /* 93: ENOTEMPTY --> 39: ENOTEMPTY */ + 87, /* 94: EUSERS --> 87: EUSERS */ + 88, /* 95: ENOTSOCK --> 88: ENOTSOCK */ + 89, /* 96: EDESTADDRREQ --> 89: EDESTADDRREQ */ + 90, /* 97: EMSGSIZE --> 90: EMSGSIZE */ + 91, /* 98: EPROTOTYPE --> 91: EPROTOTYPE */ + 92, /* 99: ENOPROTOOPT --> 92: ENOPROTOOPT */ + -1, /* 100: Unused Number */ + -1, /* 101: Unused Number */ + -1, /* 102: Unused Number */ + -1, /* 103: Unused Number */ + -1, /* 104: Unused Number */ + -1, /* 105: Unused Number */ + -1, /* 106: Unused Number */ + -1, /* 107: Unused Number */ + -1, /* 108: Unused Number */ + -1, /* 109: Unused Number */ + -1, /* 110: Unused Number */ + -1, /* 111: Unused Number */ + -1, /* 112: Unused Number */ + -1, /* 113: Unused Number */ + -1, /* 114: Unused Number */ + -1, /* 115: Unused Number */ + -1, /* 116: Unused Number */ + -1, /* 117: Unused Number */ + -1, /* 118: Unused Number */ + -1, /* 119: Unused Number */ + 93, /* 120: EPROTONOSUPPORT --> 93: EPROTONOSUPPORT */ + 94, /* 121: ESOCKTNOSUPPORT --> 94: ESOCKTNOSUPPORT */ + 95, /* 122: EOPNOTSUPP --> 95: EOPNOTSUPP */ + 96, /* 123: EPFNOSUPPORT --> 96: EPFNOSUPPORT */ + 97, /* 124: EAFNOSUPPORT --> 97: EAFNOSUPPORT */ + 98, /* 125: EADDRINUSE --> 98: EADDRINUSE */ + 99, /* 126: EADDRNOTAVAIL --> 99: EADDRNOTAVAIL */ + 100, /* 127: ENETDOWN --> 100: ENETDOWN */ + 101, /* 128: ENETUNREACH --> 101: ENETUNREACH */ + 102, /* 129: ENETRESET --> 102: ENETRESET */ + 103, /* 130: ECONNABORTED --> 103: ECONNABORTED */ + 104, /* 131: ECONNRESET --> 104: ECONNRESET */ + 105, /* 132: ENOBUFS --> 105: ENOBUFS */ + 106, /* 133: EISCONN --> 106: EISCONN */ + 107, /* 134: ENOTCONN --> 107: ENOTCONN */ + -1, /* 135: Unused Number */ + -1, /* 136: Unused Number */ + -1, /* 137: Unused Number */ + -1, /* 138: Unused Number */ + -1, /* 139: Unused Number */ + -1, /* 140: Unused Number */ + -1, /* 141: Unused Number */ + -1, /* 142: Unused Number */ 
+ 108, /* 143: ESHUTDOWN --> 108: ESHUTDOWN */ + 109, /* 144: ETOOMANYREFS --> 109: ETOOMANYREFS */ + 110, /* 145: ETIMEDOUT --> 110: ETIMEDOUT */ + 111, /* 146: ECONNREFUSED --> 111: ECONNREFUSED */ + 112, /* 147: EHOSTDOWN --> 112: EHOSTDOWN */ + 113, /* 148: EHOSTUNREACH --> 113: EHOSTUNREACH */ + 114, /* 149: EALREADY --> 114: EALREADY */ + 115, /* 150: EINPROGRESS --> 115: EINPROGRESS */ + 116 /* 151: ESTALE --> 116: ESTALE */ +}; + +/* + * Convert an illumos native error number to a Linux error number and return + * it. If no valid conversion is possible, the function fails back to the + * value of "defval". In userland, passing a default error number of "-1" + * will abort the program if the error number could not be converted. + */ +int +lx_errno(int native_errno, int defval) +{ +#ifdef _KERNEL + VERIFY3S(defval, >=, 0); +#endif + + if (native_errno < 0 || native_errno >= (sizeof (lx_stol_errno) / + sizeof (lx_stol_errno[0])) || lx_stol_errno[native_errno] < 0) { +#ifndef _KERNEL + VERIFY3S(defval, >=, 0); +#endif + + return (defval); + } + + return (lx_stol_errno[native_errno]); +} diff --git a/usr/src/common/brand/lx/lx_errno.h b/usr/src/common/brand/lx/lx_errno.h new file mode 100644 index 0000000000..10b6b3066c --- /dev/null +++ b/usr/src/common/brand/lx/lx_errno.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _LX_ERRNO_H +#define _LX_ERRNO_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int lx_errno(int, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_ERRNO_H */ diff --git a/usr/src/common/brand/lx/lx_signum.c b/usr/src/common/brand/lx/lx_signum.c new file mode 100644 index 0000000000..9c861c282a --- /dev/null +++ b/usr/src/common/brand/lx/lx_signum.c @@ -0,0 +1,339 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/signal.h> +#include <sys/lx_siginfo.h> +#include <lx_signum.h> +#include <sys/debug.h> + +/* + * Delivering signals to a Linux process is complicated by differences in + * signal numbering, stack structure and contents, and the action taken when a + * signal handler exits. In addition, many signal-related structures, such as + * sigset_ts, vary between Solaris and Linux. + * + * The simplest transformation that must be done when sending signals is to + * translate between Linux and Solaris signal numbers. 
+ * + * These are the major signal number differences between Linux and Solaris: + * + * ==================================== + * | Number | Linux | Solaris | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a Solaris signal, nor does every Solaris + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. + * + * One mapping issue is that Linux supports 33 real time signals, with SIGRTMIN + * typically starting at or near 32 (SIGRTMIN) and proceeding to 64 (SIGRTMAX) + * (SIGRTMIN is "at or near" 32 because glibc usually "steals" one or more of + * these signals for its own internal use, adjusting SIGRTMIN and SIGRTMAX as + * needed.) Conversely, Solaris actively uses signals 32-40 for other purposes + * and supports exactly 32 real time signals, in the range 41 (SIGRTMIN) + * to 72 (SIGRTMAX). + * + * At present, attempting to translate a Linux signal equal to 63 + * will generate an error (we allow SIGRTMAX because a program + * should be able to send SIGRTMAX without getting an EINVAL, though obviously + * anything that loops through the signals from SIGRTMIN to SIGRTMAX will + * fail.) 
+ * + * Similarly, attempting to translate a native Solaris signal in the range + * 32-40 will also generate an error as we don't want to support the receipt of + * those signals from the Solaris global zone. + */ + +/* + * Linux to Solaris signal map + * + * Usage: solaris_signal = ltos_signo[lx_signal]; + */ +const int +ltos_signo[LX_NSIG + 1] = { + 0, + SIGHUP, + SIGINT, + SIGQUIT, + SIGILL, + SIGTRAP, + SIGABRT, + SIGBUS, + SIGFPE, + SIGKILL, + SIGUSR1, + SIGSEGV, + SIGUSR2, + SIGPIPE, + SIGALRM, + SIGTERM, + SIGEMT, /* 16: Linux SIGSTKFLT; use Solaris SIGEMT */ + SIGCHLD, + SIGCONT, + SIGSTOP, + SIGTSTP, + SIGTTIN, + SIGTTOU, + SIGURG, + SIGXCPU, + SIGXFSZ, + SIGVTALRM, + SIGPROF, + SIGWINCH, + SIGPOLL, + SIGPWR, + SIGSYS, + _SIGRTMIN, /* 32: Linux SIGRTMIN */ + _SIGRTMIN + 1, + _SIGRTMIN + 2, + _SIGRTMIN + 3, + _SIGRTMIN + 4, + _SIGRTMIN + 5, + _SIGRTMIN + 6, + _SIGRTMIN + 7, + _SIGRTMIN + 8, + _SIGRTMIN + 9, + _SIGRTMIN + 10, + _SIGRTMIN + 11, + _SIGRTMIN + 12, + _SIGRTMIN + 13, + _SIGRTMIN + 14, + _SIGRTMIN + 15, + _SIGRTMIN + 16, + _SIGRTMIN + 17, + _SIGRTMIN + 18, + _SIGRTMIN + 19, + _SIGRTMIN + 20, + _SIGRTMIN + 21, + _SIGRTMIN + 22, + _SIGRTMIN + 23, + _SIGRTMIN + 24, + _SIGRTMIN + 25, + _SIGRTMIN + 26, + _SIGRTMIN + 27, + _SIGRTMIN + 28, + _SIGRTMIN + 29, + _SIGRTMIN + 30, + _SIGRTMIN + 31, + _SIGRTMAX, /* 64: Linux SIGRTMAX */ +}; + +/* + * Solaris to Linux signal map + * + * Usage: lx_signal = stol_signo[solaris_signal]; + */ +const int +stol_signo[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, /* 7: Solaris SIGEMT; use for LX_SIGSTKFLT */ + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: 
Solaris SIGWAITING */ + -1, /* 33: Solaris SIGLWP */ + -1, /* 34: Solaris SIGFREEZE */ + -1, /* 35: Solaris SIGTHAW */ + -1, /* 36: Solaris SIGCANCEL */ + -1, /* 37: Solaris SIGLOST */ + -1, /* 38: Solaris SIGXRES */ + -1, /* 39: Solaris SIGJVM1 */ + -1, /* 40: Solaris SIGJVM2 */ + -1, /* 41: Solaris SIGINFO */ + LX_SIGRTMIN, /* 42: Solaris _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMIN + 31, + LX_SIGRTMAX, /* 74: Solaris _SIGRTMAX */ +}; + +/* + * Convert an illumos native signal number to a Linux signal number and return + * it. If no valid conversion is possible, the function fails back to the + * value of "defsig". In userland, passing a default signal number of "-1" + * will abort the program if the signal number could not be converted. + */ +int +lx_stol_signo(int signo, int defsig) +{ + int rval; + +#ifdef _KERNEL + VERIFY3S(defsig, >=, 0); +#endif + + if (signo < 0 || signo >= NSIG || (rval = stol_signo[signo]) < 1) { +#ifndef _KERNEL + VERIFY3S(defsig, >=, 0); +#endif + return (defsig); + } + + return (rval); +} + + +/* + * Convert a Linux signal number to an illumos signal number and return it. + * Error behavior is identical to lx_stol_signo. 
+ */ +int +lx_ltos_signo(int signo, int defsig) +{ +#ifdef _KERNEL + VERIFY3S(defsig, >=, 0); +#endif + + if (signo < 1 || signo > LX_NSIG) { +#ifndef _KERNEL + VERIFY3S(defsig, >=, 0); +#endif + return (defsig); + } + + return (ltos_signo[signo]); +} + +/* + * Convert the "status" field of a SIGCLD siginfo_t. We need to extract the + * illumos signal number and convert it to a Linux signal number while leaving + * the ptrace(2) event bits intact. In userland, passing a default signal + * number of "-1" will abort the program if the signal number could not be + * converted, as for lx_stol_signo(). + */ +int +lx_stol_status(int s, int defsig) +{ + /* + * We mask out the top bit here in case PTRACE_O_TRACESYSGOOD + * is in use and 0x80 has been ORed with the signal number. + */ + int stat = lx_stol_signo(s & 0x7f, defsig); + + /* + * We must mix in the ptrace(2) event which may be stored in + * the second byte of the status code. We also re-include the + * PTRACE_O_TRACESYSGOOD bit. + */ + return ((s & 0xff80) | stat); +} + +int +lx_stol_sigcode(int code) +{ + switch (code) { + case SI_USER: + return (LX_SI_USER); + case SI_LWP: + return (LX_SI_TKILL); + case SI_QUEUE: + return (LX_SI_QUEUE); + case SI_TIMER: + return (LX_SI_TIMER); + case SI_ASYNCIO: + return (LX_SI_ASYNCIO); + case SI_MESGQ: + return (LX_SI_MESGQ); + default: + return (code); + } +} diff --git a/usr/src/common/brand/lx/lx_signum.h b/usr/src/common/brand/lx/lx_signum.h new file mode 100644 index 0000000000..b6c5f32731 --- /dev/null +++ b/usr/src/common/brand/lx/lx_signum.h @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGNUM_H +#define _LX_SIGNUM_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +extern const int ltos_signo[]; +extern const int stol_signo[]; + +extern int lx_stol_signo(int, int); +extern int lx_ltos_signo(int, int); +extern int lx_stol_status(int, int); +extern int lx_stol_sigcode(int); + +/* + * NOTE: Linux uses different definitions for 'sigset_t's and 'sigaction_t's + * depending on whether the definition is for user space or the kernel. 
+ * + * The definitions below MUST correspond to the Linux kernel versions, + * as glibc will do the necessary translation from the Linux user + * versions. + */ +#if defined(_LP64) +#define LX_NSIG_WORDS 1 +#define LX_WSHIFT 6 +#elif defined(_ILP32) +#define LX_NSIG_WORDS 2 +#define LX_WSHIFT 5 +#else +#error "LX only supports LP64 and ILP32" +#endif + +typedef struct { + ulong_t __bits[LX_NSIG_WORDS]; +} lx_sigset_t; + +#define LX_NBITS (sizeof (ulong_t) * NBBY) +#define lx_sigmask(n) (1UL << (((n) - 1) % LX_NBITS)) +#define lx_sigword(n) (((ulong_t)((n) - 1)) >> LX_WSHIFT) +#define lx_sigismember(s, n) (lx_sigmask(n) & (s)->__bits[lx_sigword(n)]) +#define lx_sigaddset(s, n) ((s)->__bits[lx_sigword(n)] |= lx_sigmask(n)) + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGNUM_H */ diff --git a/usr/src/common/brand/lx/lx_syscall.h b/usr/src/common/brand/lx/lx_syscall.h new file mode 100644 index 0000000000..01e8b79512 --- /dev/null +++ b/usr/src/common/brand/lx/lx_syscall.h @@ -0,0 +1,123 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SYSCALL_H +#define _LX_SYSCALL_H + +#include <sys/lx_brand.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The br_scall_args field of lx_lwp_data is going to be populated with + * pointers to structs. The types of these structs should be defined in this + * header file. These are Linux specific arguments to system calls that don't + * exist in illumos. Each section should be labelled with which system call it + * belongs to. 
+ */ + +/* arguments for waitpid(2) */ +/* see comments in usr/src/lib/brand/lx/lx_brand/common/wait.c */ +#define LX_WNOTHREAD 0x20000000 /* Do not wait on siblings' children */ +#define LX_WALL 0x40000000 /* Wait on all children */ +#define LX_WCLONE 0x80000000 /* Wait only on clone children */ + +/* For arch_prctl(2) */ +#define LX_ARCH_SET_GS 0x1001 +#define LX_ARCH_SET_FS 0x1002 +#define LX_ARCH_GET_FS 0x1003 +#define LX_ARCH_GET_GS 0x1004 + +/* + * For ptrace(2): + */ +#define LX_PTRACE_TRACEME 0 +#define LX_PTRACE_PEEKTEXT 1 +#define LX_PTRACE_PEEKDATA 2 +#define LX_PTRACE_PEEKUSER 3 +#define LX_PTRACE_POKETEXT 4 +#define LX_PTRACE_POKEDATA 5 +#define LX_PTRACE_POKEUSER 6 +#define LX_PTRACE_CONT 7 +#define LX_PTRACE_KILL 8 +#define LX_PTRACE_SINGLESTEP 9 +#define LX_PTRACE_GETREGS 12 +#define LX_PTRACE_SETREGS 13 +#define LX_PTRACE_GETFPREGS 14 +#define LX_PTRACE_SETFPREGS 15 +#define LX_PTRACE_ATTACH 16 +#define LX_PTRACE_DETACH 17 +#define LX_PTRACE_GETFPXREGS 18 +#define LX_PTRACE_SETFPXREGS 19 +#define LX_PTRACE_SYSCALL 24 +#define LX_PTRACE_SETOPTIONS 0x4200 +#define LX_PTRACE_GETEVENTMSG 0x4201 +#define LX_PTRACE_GETSIGINFO 0x4202 + +/* + * For clone(2): + */ +#define LX_CSIGNAL 0x000000ff +#define LX_CLONE_VM 0x00000100 +#define LX_CLONE_FS 0x00000200 +#define LX_CLONE_FILES 0x00000400 +#define LX_CLONE_SIGHAND 0x00000800 +#define LX_CLONE_PID 0x00001000 +#define LX_CLONE_PTRACE 0x00002000 +#define LX_CLONE_VFORK 0x00004000 +#define LX_CLONE_PARENT 0x00008000 +#define LX_CLONE_THREAD 0x00010000 +#define LX_CLONE_NEWNS 0x00020000 +#define LX_CLONE_SYSVSEM 0x00040000 +#define LX_CLONE_SETTLS 0x00080000 +#define LX_CLONE_PARENT_SETTID 0x00100000 +#define LX_CLONE_CHILD_CLEARTID 0x00200000 +#define LX_CLONE_DETACH 0x00400000 +#define LX_CLONE_UNTRACED 0x00800000 +#define LX_CLONE_CHILD_SETTID 0x01000000 +#define LX_CLONE_NEWCGROUP 0x02000000 +#define LX_CLONE_NEWUTS 0x04000000 +#define LX_CLONE_NEWIPC 0x08000000 +#define LX_CLONE_NEWUSER 0x10000000 
+#define LX_CLONE_NEWPID 0x20000000 +#define LX_CLONE_NEWNET 0x40000000 +#define LX_CLONE_IO 0x80000000 + +#define SHARED_AS \ + (LX_CLONE_VM | LX_CLONE_FS | LX_CLONE_FILES | LX_CLONE_SIGHAND | \ + LX_CLONE_THREAD) + +/* + * Valid clone flags when not a full process or full thread (SHARED_AS), This + * can be expanded as additional clone-group support is added. + */ +#define LX_CLONE_GRP_SUBSET (LX_CLONE_FS) + +#define LX_IS_CLONE_GRP(X) (((X) & SHARED_AS) != 0 && \ + ((X) & SHARED_AS) != SHARED_AS && \ + (((X) & SHARED_AS) & ~LX_CLONE_GRP_SUBSET) == 0) + +#define LX_CLONE_NS_UNSUP (LX_CLONE_NEWNS | LX_CLONE_NEWCGROUP | \ + LX_CLONE_NEWUTS | LX_CLONE_NEWIPC | \ + LX_CLONE_NEWUSER | LX_CLONE_NEWPID | \ + LX_CLONE_NEWNET | LX_CLONE_IO) + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SYSCALL_H */ diff --git a/usr/src/common/brand/lx/tools/Makefile b/usr/src/common/brand/lx/tools/Makefile new file mode 100644 index 0000000000..2b5bb92251 --- /dev/null +++ b/usr/src/common/brand/lx/tools/Makefile @@ -0,0 +1,47 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +PROG = gen_errno + +include ../../../../cmd/Makefile.cmd + +OBJS = gen_errno.o + +CLOBBERFILES += $(PROG) + +NATIVECC_CFLAGS += $(CFLAGS) $(CCVERBOSE) +# As evidenced by the use of the NATIVE_ variables, gen_errno is intended +# to be able to run on the build host. We continue to link it against +# libcmdutils.so instead of libcustr.so in order to allow it to run on +# older build hosts (relying on the libcmdutil filter entries if run on +# newer hosts with libcustr.so). 
+NATIVECC_LDLIBS += -lcmdutils -lnvpair + +.KEEP_STATE: + +all: $(PROG) + +install: all + +lint: lint_PROG + +clean: + $(RM) $(OBJS) + +$(PROG): $(OBJS) + $(NATIVECC) $(NATIVECC_CFLAGS) $(NATIVECC_LDLIBS) $(OBJS) -o $@ + $(POST_PROCESS) + +include ../../../../cmd/Makefile.targ diff --git a/usr/src/common/brand/lx/tools/README.md b/usr/src/common/brand/lx/tools/README.md new file mode 100644 index 0000000000..5e4976f200 --- /dev/null +++ b/usr/src/common/brand/lx/tools/README.md @@ -0,0 +1,39 @@ +# Updating Error Number Translations + +To create an updated error number translation table, you can use the +`gen_errno` tool. This tool requires, as input: + +* the illumos native `errno.h` file +* a set of foreign operating system `errno.h` files + +The output is a set of translation table entries suitable for inclusion in a +cstyled C array. The index of the array is the native error number and the +value at each index is the translated error number for use with the foreign +operating system. + +## Example + +To generate a translation table for the LX Brand, you will require two files +from the current Linux source: + +* `include/uapi/asm-generic/errno-base.h` (low-valued, or base, error numbers) +* `include/uapi/asm-generic/errno.h` (extended error numbers) + +Assuming the files are in the current directory, you should run the tool as +follows: + + $ dmake + ... + $ ./gen_errno -F errno-base.h -F errno.h \ + -N $SRC/uts/common/sys/errno.h + 0, /* 0: No Error */ + 1, /* 1: EPERM --> 1: EPERM */ + 2, /* 2: ENOENT --> 2: ENOENT */ + 3, /* 3: ESRCH --> 3: ESRCH */ + 4, /* 4: EINTR --> 4: EINTR */ + 5, /* 5: EIO --> 5: EIO */ + 6, /* 6: ENXIO --> 6: ENXIO */ + 7, /* 7: E2BIG --> 7: E2BIG */ + ... + +The output may be used in the `$SRC/common/brand/lx/lx_errno.c` file. 
diff --git a/usr/src/common/brand/lx/tools/gen_errno.c b/usr/src/common/brand/lx/tools/gen_errno.c new file mode 100644 index 0000000000..6089fed3bd --- /dev/null +++ b/usr/src/common/brand/lx/tools/gen_errno.c @@ -0,0 +1,444 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Take the error number definitions from a foreign system and generate a + * translation table that converts illumos native error numbers to foreign + * system error numbers. + */ + +#include <ctype.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <errno.h> +#include <err.h> +#include <sys/sysmacros.h> +#include <libcustr.h> +#include <libnvpair.h> + +nvlist_t *native_errors; +nvlist_t *foreign_errors; + +struct override { + const char *ovr_from; + const char *ovr_to; +} overrides[] = { + { "ENOTSUP", "ENOSYS" }, + { 0 } +}; + +static const char * +lookup_override(const char *from) +{ + int i; + + for (i = 0; overrides[i].ovr_from != NULL; i++) { + if (strcmp(overrides[i].ovr_from, from) == 0) { + return (overrides[i].ovr_to); + } + } + + return (NULL); +} + +static int +parse_int(const char *number, int *rval) +{ + long n; + char *endpos; + + errno = 0; + if ((n = strtol(number, &endpos, 10)) == 0 && errno != 0) { + return (-1); + } + + if (endpos != NULL && *endpos != '\0') { + errno = EINVAL; + return (-1); + } + + if (n > INT_MAX || n < INT_MIN) { + errno = EOVERFLOW; + return (-1); + } + + *rval = (int)n; + return (0); +} + +static int +errnum_add(nvlist_t *nvl, const char *name, const char *number) +{ + int val; + + if (nvlist_exists(nvl, name)) 
{ + (void) fprintf(stderr, "ERROR: duplicate definition: %s -> " + "%s\n", name, number); + errno = EEXIST; + return (-1); + } + + /* + * Try and parse the error number: + */ + if (parse_int(number, &val) == 0) { + /* + * The name refers to a number. + */ + if (nvlist_add_int32(nvl, name, val) != 0) { + (void) fprintf(stderr, "ERROR: nvlist_add_int32: %s\n", + strerror(errno)); + return (-1); + } + } else { + /* + * The name refers to another definition. + */ + if (nvlist_add_string(nvl, name, number) != 0) { + (void) fprintf(stderr, "ERROR: nvlist_add_string: %s\n", + strerror(errno)); + return (-1); + } + } + + return (0); +} + +static int +errnum_max(nvlist_t *nvl) +{ + int max = 0; + nvpair_t *nvp = NULL; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + if (nvpair_type(nvp) != DATA_TYPE_INT32) { + continue; + } + + max = MAX(fnvpair_value_int32(nvp), max); + } + + return (max); +} + +static int +errname_by_num(nvlist_t *nvl, int num, const char **name) +{ + nvpair_t *nvp = NULL; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + if (nvpair_type(nvp) != DATA_TYPE_INT32) { + continue; + } + + if (fnvpair_value_int32(nvp) == num) { + *name = nvpair_name(nvp); + return (0); + } + } + + errno = ENOENT; + return (-1); +} + +static int +errno_by_name(nvlist_t *nvl, const char *name, int *rval, const char **rname) +{ + nvpair_t *nvp = NULL; + + if (nvlist_lookup_nvpair(nvl, name, &nvp) != 0) { + errno = ENOENT; + return (-1); + } + + if (nvpair_type(nvp) == DATA_TYPE_STRING) { + return (errno_by_name(nvl, fnvpair_value_string(nvp), rval, + rname)); + } else { + *rval = fnvpair_value_int32(nvp); + if (rname != NULL) { + *rname = name; + } + return (0); + } +} + +static int +process_line(const char *line, nvlist_t *nvl) +{ + custr_t *nam = NULL, *num = NULL; + const char *c = line; + + if (custr_alloc(&nam) != 0 || custr_alloc(&num) != 0) { + int en = errno; + + custr_free(nam); + custr_free(num); + + errno = en; + return (-1); + } + + /* + * Valid 
lines begin with "#define": + */ + if (*c++ != '#' || *c++ != 'd' || *c++ != 'e' || *c++ != 'f' || + *c++ != 'i' || *c++ != 'n' || *c++ != 'e') { + return (0); + } + + /* + * Eat whitespace: + */ + for (;;) { + if (*c == '\0') { + return (0); + } + + if (*c != ' ' && *c != '\t') { + break; + } + + c++; + } + + /* + * Read error number token: + */ + for (;;) { + if (*c == '\0') { + return (0); + } + + if (*c == ' ' || *c == '\t') { + break; + } + + if (custr_appendc(nam, *c) != 0) { + return (-1); + } + + c++; + } + + /* + * Eat whitespace: + */ + for (;;) { + if (*c == '\0') { + return (0); + } + + if (*c != ' ' && *c != '\t') { + break; + } + + c++; + } + + /* + * Read error number token: + */ + for (;;) { + if (*c == '\0') { + break; + } + + if (*c == ' ' || *c == '\t') { + break; + } + + if (custr_appendc(num, *c) != 0) { + return (-1); + } + + c++; + } + + return (errnum_add(nvl, custr_cstr(nam), custr_cstr(num))); +} + +static int +read_file_into_list(const char *path, nvlist_t *nvl) +{ + int rval = 0, en = 0; + FILE *f; + custr_t *cu = NULL; + + if (custr_alloc(&cu) != 0) { + return (-1); + } + + if ((f = fopen(path, "r")) == NULL) { + custr_free(cu); + return (-1); + } + + for (;;) { + int c; + + errno = 0; + switch (c = fgetc(f)) { + case '\n': + case EOF: + if (errno != 0) { + en = errno; + rval = -1; + goto out; + } + if (process_line(custr_cstr(cu), nvl) != 0) { + en = errno; + rval = -1; + goto out; + } + custr_reset(cu); + if (c == EOF) { + goto out; + } + break; + + case '\r': + case '\0': + /* + * Ignore these characters. 
+ */ + break; + + default: + if (custr_appendc(cu, c) != 0) { + en = errno; + rval = -1; + goto out; + } + break; + } + } + +out: + (void) fclose(f); + custr_free(cu); + errno = en; + return (rval); +} + +int +main(int argc, char **argv) +{ + int max; + int fval; + int c; + + if (nvlist_alloc(&native_errors, NV_UNIQUE_NAME, 0) != 0 || + nvlist_alloc(&foreign_errors, NV_UNIQUE_NAME, 0) != 0) { + err(1, "could not allocate memory"); + } + + while ((c = getopt(argc, argv, ":N:F:")) != -1) { + switch (c) { + case 'N': + if (read_file_into_list(optarg, native_errors) != 0) { + err(1, "could not read file: %s", optarg); + } + break; + + case 'F': + if (read_file_into_list(optarg, foreign_errors) != 0) { + err(1, "could not read file: %s", optarg); + } + break; + + case ':': + errx(1, "option -%c requires an operand", c); + break; + + case '?': + errx(1, "option -%c unrecognised", c); + break; + } + } + + /* + * Print an array entry for each error number: + */ + max = errnum_max(native_errors); + for (fval = 0; fval <= max; fval++) { + const char *fname; + const char *tname = NULL; + int32_t tval; + const char *msg = NULL; + const char *comma = (fval != max) ? "," : ""; + + if (errname_by_num(native_errors, fval, &fname) == -1) { + fname = NULL; + } + + if (fval == 0) { + /* + * The error number "0" is special: it means no worries. + */ + msg = "No Error"; + tval = 0; + } else if (fname == NULL) { + /* + * There is no defined name for this error number; it + * is unused. + */ + msg = "Unused Number"; + tval = -1; + } else { + /* + * Check if we want to override the name of this error + * in the foreign error number lookup: + */ + const char *oname = lookup_override(fname); + + /* + * Do the lookup: + */ + if (errno_by_name(foreign_errors, oname != NULL ? + oname : fname, &tval, &tname) != 0) { + /* + * There was no foreign error number by that + * name. 
+ */ + tname = "No Analogue"; + tval = -2; + } + } + + if (msg == NULL) { + size_t flen = strlen(fname); + size_t tlen = strlen(tname); + const char *t = flen > 7 ? "\t" : "\t\t"; + const char *tt = tlen < 7 ? "\t\t\t" : tlen < 15 ? + "\t\t" : "\t"; + + (void) fprintf(stdout, "\t%d%s\t/* %3d: %s%s--> %3d: " + "%s%s*/\n", tval, comma, fval, fname, t, tval, + tname, tt); + } else { + const char *t = "\t\t\t\t\t"; + + (void) fprintf(stdout, "\t%d%s\t/* %3d: %s%s*/\n", tval, + comma, fval, msg, t); + } + } + + (void) nvlist_free(native_errors); + (void) nvlist_free(foreign_errors); + + return (0); +} diff --git a/usr/src/common/crypto/aes/aes_modes.c b/usr/src/common/crypto/aes/aes_modes.c index b23c78d65c..8c7cc6b093 100644 --- a/usr/src/common/crypto/aes/aes_modes.c +++ b/usr/src/common/crypto/aes/aes_modes.c @@ -101,7 +101,7 @@ aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length, if (aes_ctx->ac_flags & CTR_MODE) { rv = ctr_mode_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + AES_BLOCK_LEN, aes_encrypt_block); } else if (aes_ctx->ac_flags & CCM_MODE) { rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length, out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, @@ -134,7 +134,7 @@ aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length, if (aes_ctx->ac_flags & CTR_MODE) { rv = ctr_mode_contiguous_blocks(ctx, data, length, out, - AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + AES_BLOCK_LEN, aes_encrypt_block); if (rv == CRYPTO_DATA_LEN_RANGE) rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; } else if (aes_ctx->ac_flags & CCM_MODE) { diff --git a/usr/src/common/crypto/chacha/chacha.c b/usr/src/common/crypto/chacha/chacha.c index 5f9ef3b411..0a0b09919e 100644 --- a/usr/src/common/crypto/chacha/chacha.c +++ b/usr/src/common/crypto/chacha/chacha.c @@ -1,13 +1,25 @@ /* + * This implementation of ChaCha20 comes from the initial Dan Bernstein + * implementation, including a 256-bit key, a 64-bit nonce 
and a 64-bit + * counter. This is in contrast to ChaCha20 as defined in RFC 7539, which + * defines a 256-bit key, a 96-bit nonce and a 32-bit counter. In particular, + * kernel crash dump encryption relies on the fact that our larger counter + * allows for the encryption of very large messages (many gigabytes in + * length); any change to this implementation that reduces the size of the + * counter should be mindful of this use case. + */ + +/* chacha-merged.c version 20080118 D. J. Bernstein Public domain. */ -/* $OpenBSD: chacha_private.h,v 1.2 2013/10/04 07:02:27 djm Exp $ */ +/* $OpenBSD: chacha.c,v 1.1 2013/11/21 00:45:44 djm Exp $ */ -#include <chacha.h> -#include <stddef.h> +#include "chacha.h" +#include <sys/stddef.h> +#include <sys/null.h> typedef unsigned char u8; typedef unsigned int u32; @@ -76,10 +88,10 @@ chacha_keysetup(chacha_ctx_t *x,const u8 *k,u32 kbits,u32 ivbits) } void -chacha_ivsetup(chacha_ctx_t *x,const u8 *iv) +chacha_ivsetup(chacha_ctx_t *x,const u8 *iv, const u8 *counter) { - x->chacha_input[12] = 0; - x->chacha_input[13] = 0; + x->chacha_input[12] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 0); + x->chacha_input[13] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 4); x->chacha_input[14] = U8TO32_LITTLE(iv + 0); x->chacha_input[15] = U8TO32_LITTLE(iv + 4); } diff --git a/usr/src/common/crypto/chacha/chacha.h b/usr/src/common/crypto/chacha/chacha.h index ac9993a8a4..edadca4934 100644 --- a/usr/src/common/crypto/chacha/chacha.h +++ b/usr/src/common/crypto/chacha/chacha.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _CHACHA_H @@ -27,7 +27,7 @@ * over the data and xoring it with the generated cipher. 
*/ -#include <inttypes.h> +#include <sys/inttypes.h> #ifdef __cplusplus extern "C" { @@ -39,7 +39,7 @@ typedef struct chacha_ctx { extern void chacha_keysetup(chacha_ctx_t *, const uint8_t *, uint32_t, uint32_t); -extern void chacha_ivsetup(chacha_ctx_t *, const uint8_t *); +extern void chacha_ivsetup(chacha_ctx_t *, const uint8_t *, const uint8_t *); extern void chacha_encrypt_bytes(chacha_ctx_t *, const uint8_t *, uint8_t *, uint32_t); diff --git a/usr/src/common/crypto/modes/ctr.c b/usr/src/common/crypto/modes/ctr.c index 919ed3ab53..7bf0134bb4 100644 --- a/usr/src/common/crypto/modes/ctr.c +++ b/usr/src/common/crypto/modes/ctr.c @@ -21,6 +21,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _KERNEL @@ -30,6 +32,7 @@ #include <security/cryptoki.h> #endif +#include <sys/debug.h> #include <sys/types.h> #include <modes/modes.h> #include <sys/crypto/common.h> @@ -37,164 +40,184 @@ #include <sys/byteorder.h> /* - * Encrypt and decrypt multiple blocks of data in counter mode. + * CTR (counter mode) is a stream cipher. That is, it generates a + * pseudo-random keystream that is used to XOR with the input to + * encrypt or decrypt. The pseudo-random keystream is generated by + * concatenating a nonce (supplied during initialization) with a + * counter (initialized to zero) to form an input block to the cipher + * mechanism. The resulting output of the cipher is used as a chunk + * of the pseudo-random keystream. Once all of the bytes of the + * keystream block have been used, the counter is incremented and + * the process repeats. + * + * Since this is a stream cipher, we do not accumulate input cipher + * text like we do for block modes. Instead we use ctr_ctx_t->ctr_offset + * to track the amount of bytes used in the current keystream block. 
*/ -int -ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, - crypto_data_t *out, size_t block_size, - int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct), - void (*xor_block)(uint8_t *, uint8_t *)) + +static void +ctr_new_keyblock(ctr_ctx_t *ctx, + int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct)) { - size_t remainder = length; - size_t need; - uint8_t *datap = (uint8_t *)data; - uint8_t *blockp; - uint8_t *lastp; - void *iov_or_mp; - offset_t offset; - uint8_t *out_data_1; - uint8_t *out_data_2; - size_t out_data_1_len; uint64_t lower_counter, upper_counter; - if (length + ctx->ctr_remainder_len < block_size) { - /* accumulate bytes here and return */ - bcopy(datap, - (uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len, - length); - ctx->ctr_remainder_len += length; - ctx->ctr_copy_to = datap; - return (CRYPTO_SUCCESS); + /* increment the counter */ + lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask); + lower_counter = htonll(lower_counter + 1); + lower_counter &= ctx->ctr_lower_mask; + ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) | + lower_counter; + + /* wrap around */ + if (lower_counter == 0) { + upper_counter = ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask); + upper_counter = htonll(upper_counter + 1); + upper_counter &= ctx->ctr_upper_mask; + ctx->ctr_cb[0] = (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) | + upper_counter; } - lastp = (uint8_t *)ctx->ctr_cb; - if (out != NULL) - crypto_init_ptrs(out, &iov_or_mp, &offset); - - do { - /* Unprocessed data from last call. 
*/ - if (ctx->ctr_remainder_len > 0) { - need = block_size - ctx->ctr_remainder_len; - - if (need > remainder) - return (CRYPTO_DATA_LEN_RANGE); - - bcopy(datap, &((uint8_t *)ctx->ctr_remainder) - [ctx->ctr_remainder_len], need); - - blockp = (uint8_t *)ctx->ctr_remainder; - } else { - blockp = datap; - } + /* generate the new keyblock */ + cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, + (uint8_t *)ctx->ctr_keystream); + ctx->ctr_offset = 0; +} - /* ctr_cb is the counter block */ - cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, - (uint8_t *)ctx->ctr_tmp); +#ifdef __x86 +/* + * It's not worth bothering to check for pointer alignment on X86 -- always + * try to do 32-bits at a time when enough data is available. + */ +#define TRY32(_src, _dst, _key, _keylen, _outlen) \ + ((_keylen) > 3 && (_outlen) > 3) +#else +/* + * Other platforms (e.g. SPARC) require the pointers to be aligned to + * do 32-bits at a time. + */ +#define TRY32(_src, _dst, _key, _keylen, _outlen) \ + ((_keylen) > 3 && (_outlen) > 3 && \ + IS_P2ALIGNED((_src), sizeof (uint32_t)) && \ + IS_P2ALIGNED((_dst), sizeof (uint32_t)) && \ + IS_P2ALIGNED((_key), sizeof (uint32_t))) +#endif - lastp = (uint8_t *)ctx->ctr_tmp; +/* + * XOR the input with the keystream and write the result to out. + * This requires that the amount of data in 'in' is >= outlen + * (ctr_mode_contiguous_blocks() guarantees this for us before we are + * called). As CTR mode is a stream cipher, we cannot use a cipher's + * xxx_xor_block function (e.g. aes_xor_block()) as we must handle + * arbitrary lengths of input and should not buffer/accumulate partial blocks + * between calls. + */ +static void +ctr_xor(ctr_ctx_t *ctx, uint8_t *in, uint8_t *out, size_t outlen, + size_t block_size, + int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct)) +{ + uint8_t *keyp; + size_t keyamt; + while (outlen > 0) { /* - * Increment Counter. + * This occurs once we've consumed all the bytes in the + * current block of the keystream. 
ctr_init_ctx() creates + * the initial block of the keystream, so we always start + * with a full block of key data. */ - lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask); - lower_counter = htonll(lower_counter + 1); - lower_counter &= ctx->ctr_lower_mask; - ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) | - lower_counter; - - /* wrap around */ - if (lower_counter == 0) { - upper_counter = - ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask); - upper_counter = htonll(upper_counter + 1); - upper_counter &= ctx->ctr_upper_mask; - ctx->ctr_cb[0] = - (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) | - upper_counter; + if (ctx->ctr_offset == block_size) { + ctr_new_keyblock(ctx, cipher); } + keyp = (uint8_t *)ctx->ctr_keystream + ctx->ctr_offset; + keyamt = block_size - ctx->ctr_offset; + /* - * XOR encrypted counter block with the current clear block. + * Try to process 32-bits at a time when possible. */ - xor_block(blockp, lastp); - - if (out == NULL) { - if (ctx->ctr_remainder_len > 0) { - bcopy(lastp, ctx->ctr_copy_to, - ctx->ctr_remainder_len); - bcopy(lastp + ctx->ctr_remainder_len, datap, - need); - } - } else { - crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, - &out_data_1_len, &out_data_2, block_size); - - /* copy block to where it belongs */ - bcopy(lastp, out_data_1, out_data_1_len); - if (out_data_2 != NULL) { - bcopy(lastp + out_data_1_len, out_data_2, - block_size - out_data_1_len); - } - /* update offset */ - out->cd_offset += block_size; + if (TRY32(in, out, keyp, keyamt, outlen)) { + uint32_t *in32 = (uint32_t *)in; + uint32_t *out32 = (uint32_t *)out; + uint32_t *key32 = (uint32_t *)keyp; + + do { + *out32++ = *in32++ ^ *key32++; + keyamt -= sizeof (uint32_t); + outlen -= sizeof (uint32_t); + } while (keyamt > 3 && outlen > 3); + + in = (uint8_t *)in32; + out = (uint8_t *)out32; + keyp = (uint8_t *)key32; } - /* Update pointer to next block of data to be processed. 
*/ - if (ctx->ctr_remainder_len != 0) { - datap += need; - ctx->ctr_remainder_len = 0; - } else { - datap += block_size; + while (keyamt > 0 && outlen > 0) { + *out++ = *in++ ^ *keyp++; + keyamt--; + outlen--; } - remainder = (size_t)&data[length] - (size_t)datap; - - /* Incomplete last block. */ - if (remainder > 0 && remainder < block_size) { - bcopy(datap, ctx->ctr_remainder, remainder); - ctx->ctr_remainder_len = remainder; - ctx->ctr_copy_to = datap; - goto out; - } - ctx->ctr_copy_to = NULL; - - } while (remainder > 0); - -out: - return (CRYPTO_SUCCESS); + ctx->ctr_offset = block_size - keyamt; + } } +/* + * Encrypt and decrypt multiple blocks of data in counter mode. + */ int -ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) +ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct)) { - uint8_t *lastp; - uint8_t *p; - int i; - int rv; + size_t remainder = length; + uint8_t *datap = (uint8_t *)data; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; - if (out->cd_length < ctx->ctr_remainder_len) - return (CRYPTO_DATA_LEN_RANGE); + if (block_size > sizeof (ctx->ctr_keystream)) + return (CRYPTO_ARGUMENTS_BAD); - encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, - (uint8_t *)ctx->ctr_tmp); + if (out == NULL) + return (CRYPTO_ARGUMENTS_BAD); - lastp = (uint8_t *)ctx->ctr_tmp; - p = (uint8_t *)ctx->ctr_remainder; - for (i = 0; i < ctx->ctr_remainder_len; i++) { - p[i] ^= lastp[i]; - } + /* + * This check guarantees 'out' contains sufficient space for + * the resulting output. 
+ */ + if (out->cd_offset + length > out->cd_length) + return (CRYPTO_BUFFER_TOO_SMALL); - rv = crypto_put_output_data(p, out, ctx->ctr_remainder_len); - if (rv == CRYPTO_SUCCESS) { - out->cd_offset += ctx->ctr_remainder_len; - ctx->ctr_remainder_len = 0; + crypto_init_ptrs(out, &iov_or_mp, &offset); + + /* Now XOR the output with the keystream */ + while (remainder > 0) { + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, remainder); + + /* + * crypto_get_ptrs() should guarantee this, but act as a + * safeguard in case the behavior ever changes. + */ + ASSERT3U(out_data_1_len, <=, remainder); + ctr_xor(ctx, datap, out_data_1, out_data_1_len, block_size, + cipher); + + datap += out_data_1_len; + remainder -= out_data_1_len; } - return (rv); + + out->cd_offset += length; + + return (CRYPTO_SUCCESS); } int ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb, + int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct), void (*copy_block)(uint8_t *, uint8_t *)) { uint64_t upper_mask = 0; @@ -217,6 +240,11 @@ ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb, copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb); ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0]; + + /* Generate the first block of the keystream */ + cipher(ctr_ctx->ctr_keysched, (uint8_t *)ctr_ctx->ctr_cb, + (uint8_t *)ctr_ctx->ctr_keystream); + ctr_ctx->ctr_flags |= CTR_MODE; return (CRYPTO_SUCCESS); } diff --git a/usr/src/common/crypto/modes/modes.h b/usr/src/common/crypto/modes/modes.h index 0ad18b0c25..0e8fb66c8a 100644 --- a/usr/src/common/crypto/modes/modes.h +++ b/usr/src/common/crypto/modes/modes.h @@ -23,7 +23,7 @@ * Use is subject to license terms. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _COMMON_CRYPTO_MODES_H @@ -51,6 +51,9 @@ extern "C" { #define GMAC_MODE 0x00000040 #define CMAC_MODE 0x00000080 +/* Private flag for pkcs11_softtoken */ +#define P11_DECRYPTED 0x80000000 + /* * cc_keysched: Pointer to key schedule. * @@ -130,7 +133,8 @@ typedef struct ctr_ctx { struct common_ctx ctr_common; uint64_t ctr_lower_mask; uint64_t ctr_upper_mask; - uint32_t ctr_tmp[4]; + size_t ctr_offset; + uint32_t ctr_keystream[4]; } ctr_ctx_t; /* @@ -307,8 +311,7 @@ extern int cbc_decrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t, extern int ctr_mode_contiguous_blocks(ctr_ctx_t *, char *, size_t, crypto_data_t *, size_t, - int (*cipher)(const void *, const uint8_t *, uint8_t *), - void (*xor_block)(uint8_t *, uint8_t *)); + int (*cipher)(const void *, const uint8_t *, uint8_t *)); extern int ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t, crypto_data_t *, size_t, @@ -356,15 +359,13 @@ extern int cmac_mode_final(cbc_ctx_t *, crypto_data_t *, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)); -extern int ctr_mode_final(ctr_ctx_t *, crypto_data_t *, - int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)); - extern int cbc_init_ctx(cbc_ctx_t *, char *, size_t, size_t, void (*copy_block)(uint8_t *, uint64_t *)); extern int cmac_init_ctx(cbc_ctx_t *, size_t); extern int ctr_init_ctx(ctr_ctx_t *, ulong_t, uint8_t *, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *)); extern int ccm_init_ctx(ccm_ctx_t *, char *, int, boolean_t, size_t, diff --git a/usr/src/common/dis/i386/dis_tables.c b/usr/src/common/dis/i386/dis_tables.c index 12a1112d8a..ddca678f1c 100644 --- a/usr/src/common/dis/i386/dis_tables.c +++ b/usr/src/common/dis/i386/dis_tables.c @@ -21,7 +21,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2019, Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -562,6 +562,11 @@ const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ); const instable_t dis_opPause = TNS("pause", NORM); /* + * "decode table" for wbnoinvd instruction + */ +const instable_t dis_opWbnoinvd = TNS("wbnoinvd", NORM); + +/* * Decode table for 0x0F00 opcodes */ const instable_t dis_op0F00[8] = { @@ -2660,7 +2665,7 @@ dtrace_vex_adjust(uint_t vex_byte1, uint_t mode, uint_t *reg, uint_t *r_m) */ /* ARGSUSED */ static void -dtrace_evex_mnem_adjust(dis86_t *x, instable_t *dp, uint_t vex_W, +dtrace_evex_mnem_adjust(dis86_t *x, const instable_t *dp, uint_t vex_W, uint_t evex_byte2) { #ifdef DIS_TEXT @@ -3215,7 +3220,7 @@ dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex) int dtrace_disx86(dis86_t *x, uint_t cpu_mode) { - instable_t *dp; /* decode table being used */ + const instable_t *dp; /* decode table being used */ #ifdef DIS_TEXT uint_t i; #endif @@ -3712,11 +3717,11 @@ not_avx512: if (opnd_size_prefix == 0) { /* SSSE3 MMX instructions */ dp_mmx = *dp; - dp = &dp_mmx; - dp->it_adrmode = MMOPM_66o; + dp_mmx.it_adrmode = MMOPM_66o; #ifdef DIS_MEM - dp->it_size = 8; + dp_mmx.it_size = 8; #endif + dp = &dp_mmx; } break; default: @@ -3797,11 +3802,11 @@ not_avx512: if (opnd_size_prefix == 0) { /* SSSE3 MMX instructions */ dp_mmx = *dp; - dp = &dp_mmx; - dp->it_adrmode = MM; + dp_mmx.it_adrmode = MM; #ifdef DIS_MEM - dp->it_size = 8; + dp_mmx.it_size = 8; #endif + dp = &dp_mmx; } break; case CRC32: @@ -3818,6 +3823,9 @@ not_avx512: default: goto error; } + } else if (rep_prefix == 0xf3 && opcode4 == 0 && opcode5 == 9) { + rep_prefix = 0; + dp = (instable_t *)&dis_opWbnoinvd; } else { dp = (instable_t *)&dis_op0F[opcode4][opcode5]; } diff --git a/usr/src/common/idspace/id_space.c b/usr/src/common/idspace/id_space.c new file mode 100644 index 0000000000..7d28a8f533 --- /dev/null +++ b/usr/src/common/idspace/id_space.c @@ -0,0 +1,184 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of 
the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/id_space.h> +#include <sys/debug.h> + +/* + * ID Spaces + * + * The id_space_t provides a simple implementation of a managed range of + * integer identifiers using a vmem arena. An ID space guarantees that the + * next identifier returned by an allocation is larger than the previous one, + * unless there are no larger slots remaining in the range. In this case, + * the ID space will return the first available slot in the lower part of the + * range (viewing the previous identifier as a partitioning element). If no + * slots are available, id_alloc()/id_allocff() will sleep until an + * identifier becomes available. Accordingly, id_space allocations must be + * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ + * id_allocff_nosleep() will return -1 if no slots are available or if the + * system is low on memory. If id_alloc_nosleep() fails, callers should + * not try to extend the ID space. This is to avoid making a possible + * low-memory situation worse. 
+ * + * As an ID space is designed for representing a range of id_t's, there + * is a preexisting maximal range: [0, MAXUID]. ID space requests outside + * that range will fail on a DEBUG kernel. The id_allocff*() functions + * return the first available id, and should be used when there is benefit + * to having a compact allocated range. + * + * (Presently, the id_space_t abstraction supports only direct allocations; ID + * reservation, in which an ID is allocated but placed in a internal + * dictionary for later use, should be added when a consuming subsystem + * arrives.) + * + * This code is also shared with userland. In userland, we don't have the same + * ability to have sleeping variants, so we effectively turn the normal + * versions without _nosleep into _nosleep. + */ + +#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) +#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) + +/* + * Create an arena to represent the range [low, high). + * Caller must be in a context in which VM_SLEEP is legal, + * for the kernel. Always VM_NOSLEEP in userland. + */ +id_space_t * +id_space_create(const char *name, id_t low, id_t high) +{ +#ifdef _KERNEL + int flag = VM_SLEEP; +#else + int flag = VM_NOSLEEP; +#endif + ASSERT(low >= 0); + ASSERT(low < high); + + return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, + NULL, NULL, NULL, 0, flag | VMC_IDENTIFIER)); +} + +/* + * Destroy a previously created ID space. + * No restrictions on caller's context. + */ +void +id_space_destroy(id_space_t *isp) +{ + vmem_destroy(isp); +} + +void +id_space_extend(id_space_t *isp, id_t low, id_t high) +{ +#ifdef _KERNEL + int flag = VM_SLEEP; +#else + int flag = VM_NOSLEEP; +#endif + (void) vmem_add(isp, ID_TO_ADDR(low), high - low, flag); +} + +/* + * Allocate an id_t from specified ID space. + * Caller must be in a context in which VM_SLEEP is legal. 
+ */ +id_t +id_alloc(id_space_t *isp) +{ +#ifdef _KERNEL + int flag = VM_SLEEP; +#else + int flag = VM_NOSLEEP; +#endif + return (ADDR_TO_ID(vmem_alloc(isp, 1, flag | VM_NEXTFIT))); +} + +/* + * Allocate an id_t from specified ID space. + * Returns -1 on failure (see module block comments for more information on + * failure modes). + */ +id_t +id_alloc_nosleep(id_space_t *isp) +{ + return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); +} + +/* + * Allocate an id_t from specified ID space using FIRSTFIT. + * Caller must be in a context in which VM_SLEEP is legal. + */ +id_t +id_allocff(id_space_t *isp) +{ +#ifdef _KERNEL + int flag = VM_SLEEP; +#else + int flag = VM_NOSLEEP; +#endif + return (ADDR_TO_ID(vmem_alloc(isp, 1, flag | VM_FIRSTFIT))); +} + +/* + * Allocate an id_t from specified ID space using FIRSTFIT + * Returns -1 on failure (see module block comments for more information on + * failure modes). + */ +id_t +id_allocff_nosleep(id_space_t *isp) +{ + return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); +} + +/* + * Allocate a specific identifier if possible, returning the id if + * successful, or -1 on failure. + */ +id_t +id_alloc_specific_nosleep(id_space_t *isp, id_t id) +{ + void *minaddr = ID_TO_ADDR(id); + void *maxaddr = ID_TO_ADDR(id + 1); + + /* + * Note that even though we're vmem_free()ing this later, it + * should be OK, since there's no quantum cache. + */ + return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, + minaddr, maxaddr, VM_NOSLEEP))); +} + +/* + * Free a previously allocated ID. + * No restrictions on caller's context. 
+ */ +void +id_free(id_space_t *isp, id_t id) +{ + vmem_free(isp, ID_TO_ADDR(id), 1); +} diff --git a/usr/src/common/inet/inet_hash.c b/usr/src/common/inet/inet_hash.c new file mode 100644 index 0000000000..3a511fe588 --- /dev/null +++ b/usr/src/common/inet/inet_hash.c @@ -0,0 +1,359 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. + */ + +/* + * Common routines usable by any part of the networking stack for hashing + * packets. The hashing logic originally was part of MAC, but it has more + * utility being usable by the rest of the broader system. + */ + +#include <sys/types.h> +#include <sys/mac.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <sys/dlpi.h> +#include <sys/sunndi.h> +#include <inet/ipsec_impl.h> +#include <inet/sadb.h> +#include <inet/ipsecesp.h> +#include <inet/ipsecah.h> +#include <inet/inet_hash.h> + +/* + * Determines the IPv6 header length accounting for all the optional IPv6 + * headers (hop-by-hop, destination, routing and fragment). 
The header length + * and next header value (a transport header) is captured. + * + * Returns B_FALSE if all the IP headers are not in the same mblk otherwise + * returns B_TRUE. + */ +static boolean_t +inet_pkthash_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, + uint16_t *hdr_length, uint8_t *next_hdr, ip6_frag_t **fragp) +{ + uint16_t length; + uint_t ehdrlen; + uint8_t *whereptr; + uint8_t *nexthdrp; + ip6_dest_t *desthdr; + ip6_rthdr_t *rthdr; + ip6_frag_t *fraghdr; + + if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) + return (B_FALSE); + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + length = IPV6_HDR_LEN; + whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ + + if (fragp != NULL) + *fragp = NULL; + + nexthdrp = &ip6h->ip6_nxt; + while (whereptr < endptr) { + /* Is there enough left for len + nexthdr? */ + if (whereptr + MIN_EHDR_LEN > endptr) + break; + + switch (*nexthdrp) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + /* Assumes the headers are identical for hbh and dst */ + desthdr = (ip6_dest_t *)whereptr; + ehdrlen = 8 * (desthdr->ip6d_len + 1); + if ((uchar_t *)desthdr + ehdrlen > endptr) + return (B_FALSE); + nexthdrp = &desthdr->ip6d_nxt; + break; + case IPPROTO_ROUTING: + rthdr = (ip6_rthdr_t *)whereptr; + ehdrlen = 8 * (rthdr->ip6r_len + 1); + if ((uchar_t *)rthdr + ehdrlen > endptr) + return (B_FALSE); + nexthdrp = &rthdr->ip6r_nxt; + break; + case IPPROTO_FRAGMENT: + fraghdr = (ip6_frag_t *)whereptr; + ehdrlen = sizeof (ip6_frag_t); + if ((uchar_t *)&fraghdr[1] > endptr) + return (B_FALSE); + nexthdrp = &fraghdr->ip6f_nxt; + if (fragp != NULL) + *fragp = fraghdr; + break; + case IPPROTO_NONE: + /* No next header means we're finished */ + default: + *hdr_length = length; + *next_hdr = *nexthdrp; + return (B_TRUE); + } + length += ehdrlen; + whereptr += ehdrlen; + *hdr_length = length; + *next_hdr = *nexthdrp; + } + switch (*nexthdrp) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_FRAGMENT: + 
/* + * If any known extension headers are still to be processed, + * the packet's malformed (or at least all the IP header(s) are + * not in the same mblk - and that should never happen. + */ + return (B_FALSE); + + default: + /* + * If we get here, we know that all of the IP headers were in + * the same mblk, even if the ULP header is in the next mblk. + */ + *hdr_length = length; + *next_hdr = *nexthdrp; + return (B_TRUE); + } +} + +#define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) +#define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) +#define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) +uint64_t +inet_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy) +{ + struct ether_header *ehp; + uint64_t hash = 0; + uint16_t sap; + uint_t skip_len; + uint8_t proto; + boolean_t ip_fragmented; + + /* + * We may want to have one of these per MAC type plugin in the + * future. For now supports only ethernet. + */ + if (media != DL_ETHER) + return (0L); + + /* for now we support only outbound packets */ + ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); + ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + + /* compute L2 hash */ + + ehp = (struct ether_header *)mp->b_rptr; + + if ((policy & INET_PKT_HASH_L2) != 0) { + uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; + uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; + hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); + policy &= ~INET_PKT_HASH_L2; + } + + if (policy == 0) + goto done; + + /* skip ethernet header */ + + sap = ntohs(ehp->ether_type); + if (sap == ETHERTYPE_VLAN) { + struct ether_vlan_header *evhp; + mblk_t *newmp = NULL; + + skip_len = sizeof (struct ether_vlan_header); + if (MBLKL(mp) < skip_len) { + /* the vlan tag is the payload, pull up first */ + newmp = msgpullup(mp, -1); + if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { + goto done; + } + evhp = (struct ether_vlan_header *)newmp->b_rptr; + } else { + evhp = (struct ether_vlan_header *)mp->b_rptr; + } + 
+ sap = ntohs(evhp->ether_type); + freemsg(newmp); + } else { + skip_len = sizeof (struct ether_header); + } + + /* if ethernet header is in its own mblk, skip it */ + if (MBLKL(mp) <= skip_len) { + skip_len -= MBLKL(mp); + mp = mp->b_cont; + if (mp == NULL) + goto done; + } + + sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + + /* compute IP src/dst addresses hash and skip IPv{4,6} header */ + + switch (sap) { + case ETHERTYPE_IP: { + ipha_t *iphp; + + /* + * If the header is not aligned or the header doesn't fit + * in the mblk, bail now. Note that this may cause packet + * reordering. + */ + iphp = (ipha_t *)(mp->b_rptr + skip_len); + if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || + !OK_32PTR((char *)iphp)) + goto done; + + proto = iphp->ipha_protocol; + skip_len += IPH_HDR_LENGTH(iphp); + + /* Check if the packet is fragmented. */ + ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & + IPH_OFFSET; + + /* + * For fragmented packets, use addresses in addition to + * the frag_id to generate the hash in order to get + * better distribution. + */ + if (ip_fragmented || (policy & INET_PKT_HASH_L3) != 0) { + uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); + uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); + + hash ^= (PKT_HASH_4BYTES(ip_src) ^ + PKT_HASH_4BYTES(ip_dst)); + policy &= ~INET_PKT_HASH_L3; + } + + if (ip_fragmented) { + uint8_t *identp = (uint8_t *)&iphp->ipha_ident; + hash ^= PKT_HASH_2BYTES(identp); + goto done; + } + break; + } + case ETHERTYPE_IPV6: { + ip6_t *ip6hp; + ip6_frag_t *frag = NULL; + uint16_t hdr_length; + + /* + * If the header is not aligned or the header doesn't fit + * in the mblk, bail now. Note that this may cause packet + * reordering.
+ */ + + ip6hp = (ip6_t *)(mp->b_rptr + skip_len); + if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || + !OK_32PTR((char *)ip6hp)) + goto done; + + if (!inet_pkthash_ip_hdr_length_v6(ip6hp, mp->b_wptr, + &hdr_length, &proto, &frag)) + goto done; + skip_len += hdr_length; + + /* + * For fragmented packets, use addresses in addition to + * the frag_id to generate the hash in order to get + * better distribution. Only the last four bytes of each + * IPv6 address (s6_addr8[12..15]) are folded into the hash. + */ + if (frag != NULL || (policy & INET_PKT_HASH_L3) != 0) { + uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); + uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); + + hash ^= (PKT_HASH_4BYTES(ip_src) ^ + PKT_HASH_4BYTES(ip_dst)); + policy &= ~INET_PKT_HASH_L3; + } + + if (frag != NULL) { + uint8_t *identp = (uint8_t *)&frag->ip6f_ident; + hash ^= PKT_HASH_4BYTES(identp); + goto done; + } + break; + } + default: + goto done; + } + + if (policy == 0) + goto done; + + /* if ip header is in its own mblk, skip it */ + if (MBLKL(mp) <= skip_len) { + skip_len -= MBLKL(mp); + mp = mp->b_cont; + if (mp == NULL) + goto done; + } + + /* parse ULP header */ +again: + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + /* + * These Internet Protocols are intentionally designed + * for hashing from the get-go. Port numbers are in the first + * word for transports, SPI is first for ESP.
+ */ + if (mp->b_rptr + skip_len + 4 > mp->b_wptr) + goto done; + hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); + break; + + case IPPROTO_AH: { + ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); + + /* Validate the header bounds before reading any AH field. */ + if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) + goto done; + + proto = ah->ah_nexthdr; + skip_len += AH_TOTAL_LEN(ah); + + /* if AH header is in its own mblk, skip it */ + if (MBLKL(mp) <= skip_len) { + skip_len -= MBLKL(mp); + mp = mp->b_cont; + if (mp == NULL) + goto done; + } + + goto again; + } + } + +done: + return (hash); +} diff --git a/usr/src/common/mc/imc/imc_decode.c b/usr/src/common/mc/imc/imc_decode.c new file mode 100644 index 0000000000..7e52e9795e --- /dev/null +++ b/usr/src/common/mc/imc/imc_decode.c @@ -0,0 +1,770 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Memory decoding logic. + * + * This file is part of the 'imc' driver on x86. It supports taking a physical + * address and determining what the corresponding DIMM is. This is shared + * between the kernel and userland for easier testing. + * + * For more information about the different parts of the decoding process, + * please see the file 'uts/i86pc/io/imc/imc.c'. + */ + +#include <sys/sysmacros.h> + +#ifndef _KERNEL +#include <stdint.h> +#include <strings.h> +#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU)) +#endif /* !_KERNEL */ + +#include "imc.h" + +/* + * Address ranges for decoding system addresses.
There are three ranges that + * exist on x86, traditional DOS memory (hi 640 KiB), low memory, and high + * memory. Low memory always starts at 1 MiB and high memory always starts at 4 + * GiB. The upper bounds of these ranges is based on registers on the system. + */ +#define IMC_DECODE_CONV_BASE 0UL +#define IMC_DECODE_CONV_MAX 0x00009ffffULL /* 640 KiB - 1 */ +#define IMC_DECODE_LOW_BASE 0x000100000ULL /* 1 M */ +#define IMC_DECODE_HIGH_BASE 0x100000000ULL /* 4 GiB */ + +typedef struct imc_legacy_range { + uint64_t ilr_base; + size_t ilr_len; + const char *ilr_desc; +} imc_legacy_range_t; + +/* + * These represent regions of memory that are reserved for use and will not be + * decoded by DRAM. + */ +static imc_legacy_range_t imc_legacy_ranges[] = { + { 0x00000A0000ULL, 128 * 1024, "VGA" }, + { 0x00000C0000ULL, 256 * 1024, "PAM" }, + { 0x0000F00000ULL, 1024 * 1024, "Reserved" }, + { 0x00FE000000ULL, 32 * 1024 * 1024, "Unknown" }, + { 0x00FF000000ULL, 16 * 1024 * 1024, "Firmware" }, + { 0x00FED20000ULL, 384 * 1024, "TXT" }, + { 0x00FED00000ULL, 1024 * 1024, "PCH" }, + { 0x00FEC00000ULL, 1024 * 1024, "IOAPIC" }, + { 0x00FEB80000ULL, 512 * 1024, "Reserved" }, + { 0x00FEB00000ULL, 64 * 1024, "Reserved" } +}; + +/* + * Determine whether or not this address is in one of the reserved regions or if + * it falls outside of the explicit DRAM ranges. + */ +static boolean_t +imc_decode_addr_resvd(const imc_t *imc, imc_decode_state_t *dec) +{ + uint_t i; + const imc_sad_t *sad; + + for (i = 0; i < ARRAY_SIZE(imc_legacy_ranges); i++) { + uint64_t end = imc_legacy_ranges[i].ilr_base + + imc_legacy_ranges[i].ilr_len; + + if (dec->ids_pa >= imc_legacy_ranges[i].ilr_base && + dec->ids_pa < end) { + dec->ids_fail = IMC_DECODE_F_LEGACY_RANGE; + dec->ids_fail_data = i; + return (B_TRUE); + } + } + + /* + * For checking and determining whether or not we fit in DRAM, we need + * to check against the top of low memory and the top of high memory. 
+ * While we technically have this information on a per-socket basis, we + * have to rely on the fact that both processors have the same + * information. A requirement which if not true, would lead to chaos + * depending on what socket we're running on. + */ + sad = &imc->imc_sockets[0].isock_sad; + if (sad->isad_valid != IMC_SAD_V_VALID) { + dec->ids_fail = IMC_DECODE_F_BAD_SAD; + return (B_TRUE); + } + + /* + * An address may fall into three ranges. It may fall into conventional + * memory. It may fall into low memory. It may fall into high memory. + * The conventional memory range is inclusive at the top. The others + * have been translated such that they are uniformly exclusive at the + * top. Because the bottom of conventional memory is at zero, the + * compiler will be angry if we compare against IMC_DECODE_CONV_BASE as + * it is always true. + */ + if (dec->ids_pa <= IMC_DECODE_CONV_MAX) { + return (B_FALSE); + } + + if (dec->ids_pa >= IMC_DECODE_LOW_BASE && + dec->ids_pa < sad->isad_tolm) { + return (B_FALSE); + } + + if (dec->ids_pa >= IMC_DECODE_HIGH_BASE && + dec->ids_pa < sad->isad_tohm) { + return (B_FALSE); + } + + /* + * Memory fell outside of the valid range. It's not for us. 
+ */ + dec->ids_fail = IMC_DECODE_F_OUTSIDE_DRAM; + return (B_TRUE); +} + +static uint_t +imc_decode_sad_interleave(const imc_sad_rule_t *rule, uint64_t pa) +{ + uint_t itgt = 0; + + switch (rule->isr_imode) { + case IMC_SAD_IMODE_8t6: + if (rule->isr_a7mode) { + itgt = BITX(pa, 9, 9); + itgt |= (BITX(pa, 8, 7) << 1); + } else { + itgt = BITX(pa, 8, 6); + } + break; + case IMC_SAD_IMODE_8t6XOR: + if (rule->isr_a7mode) { + itgt = BITX(pa, 9, 9); + itgt |= (BITX(pa, 8, 7) << 1); + } else { + itgt = BITX(pa, 8, 6); + } + itgt ^= BITX(pa, 18, 16); + break; + case IMC_SAD_IMODE_10t8: + itgt = BITX(pa, 10, 8); + break; + case IMC_SAD_IMODE_14t12: + itgt = BITX(pa, 14, 12); + break; + case IMC_SAD_IMODE_32t30: + itgt = BITX(pa, 32, 30); + break; + } + + return (itgt); +} + +/* + * Use the system address decoder to try and find a valid SAD entry for this + * address. We always use socket zero's SAD as the SAD rules should be the same + * between the different sockets. + */ +static boolean_t +imc_decode_sad(const imc_t *imc, imc_decode_state_t *dec) +{ + uint_t i, ileaveidx; + uint8_t ileavetgt; + uint32_t nodeid, tadid, channelid; + uint64_t base; + const imc_socket_t *socket = &imc->imc_sockets[0]; + const imc_sad_t *sad = &socket->isock_sad; + const imc_sad_rule_t *rule; + boolean_t loop = B_FALSE; + + /* + * Note, all SAD rules have been adjusted so that they are uniformly + * exclusive. + */ +start: + for (rule = NULL, i = 0, base = 0; i < sad->isad_nrules; i++) { + rule = &sad->isad_rules[i]; + + if (rule->isr_enable && dec->ids_pa >= base && + dec->ids_pa < rule->isr_limit) { + break; + } + + base = rule->isr_limit; + } + + if (rule == NULL || i == sad->isad_nrules) { + dec->ids_fail = IMC_DECODE_F_NO_SAD_RULE; + return (B_FALSE); + } + + /* + * Store the SAD rule in the decode information for debugging's sake. + */ + dec->ids_sad = sad; + dec->ids_sad_rule = rule; + + /* + * We have found a SAD rule. 
We now need to transform that into the + * corresponding target based on its mode, etc. The way we do this + * varies based on the generation. + * + * The first thing we need to do is to figure out the target in the + * interleave list. + */ + ileaveidx = imc_decode_sad_interleave(rule, dec->ids_pa); + if (ileaveidx >= rule->isr_ntargets) { + dec->ids_fail = IMC_DECODE_F_BAD_SAD_INTERLEAVE; + dec->ids_fail_data = ileaveidx; + return (B_FALSE); + } + ileavetgt = rule->isr_targets[ileaveidx]; + if (imc->imc_gen >= IMC_GEN_SKYLAKE && + IMC_SAD_ILEAVE_SKX_LOCAL(ileavetgt) == 0) { + /* + * If we're in this case, the interleave rule said we had a + * remote target. That means we need to find the correct SAD + * based on the Node ID and then do all of this over again. + */ + nodeid = IMC_SAD_ILEAVE_SKX_TARGET(ileavetgt); + + if (loop) { + dec->ids_fail = IMC_DECODE_F_SAD_SEARCH_LOOP; + return (B_FALSE); + } + + for (i = 0; i < imc->imc_nsockets; i++) { + if (imc->imc_sockets[i].isock_valid == + IMC_SOCKET_V_VALID && + imc->imc_sockets[i].isock_nodeid == nodeid) { + socket = &imc->imc_sockets[i]; + sad = &imc->imc_sockets[i].isock_sad; + loop = B_TRUE; + goto start; + } + } + + dec->ids_fail = IMC_DECODE_F_BAD_REMOTE_MC_ROUTE; + dec->ids_fail_data = nodeid; + return (B_FALSE); + } + + /* + * On some platforms we need to derive the target channel based on the + * physical address and additional rules in the SAD. If we do, do that + * here. The idea is that this may overrule the memory channel route + * table target that was determined from the SAD rule. 
+ */ + if (rule->isr_need_mod3) { + uint64_t addr; + uint8_t channel; + + switch (rule->isr_mod_mode) { + case IMC_SAD_MOD_MODE_45t6: + addr = dec->ids_pa >> 6; + break; + case IMC_SAD_MOD_MODE_45t8: + addr = dec->ids_pa >> 8; + break; + case IMC_SAD_MOD_MODE_45t12: + addr = dec->ids_pa >> 12; + break; + default: + dec->ids_fail = IMC_DECODE_F_SAD_BAD_MOD; + return (B_FALSE); + } + + switch (rule->isr_mod_type) { + case IMC_SAD_MOD_TYPE_MOD3: + channel = (addr % 3) << 1; + channel |= ileavetgt & 1; + break; + case IMC_SAD_MOD_TYPE_MOD2_01: + channel = (addr % 2) << 1; + channel |= ileavetgt & 1; + break; + case IMC_SAD_MOD_TYPE_MOD2_12: + channel = (addr % 2) << 2; + channel |= (~addr % 2) << 1; + channel |= ileavetgt & 1; + break; + case IMC_SAD_MOD_TYPE_MOD2_02: + channel = (addr % 2) << 2; + channel |= ileavetgt & 1; + break; + default: + dec->ids_fail = IMC_DECODE_F_SAD_BAD_MOD; + return (B_FALSE); + } + + ileavetgt = channel; + } + + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + /* + * Sandy Bridge systems only have a single home agent, so the + * interleave target is always the node id. + */ + nodeid = ileavetgt; + tadid = 0; + channelid = UINT32_MAX; + break; + case IMC_GEN_IVY: + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + /* + * On these generations, the interleave NodeID in the SAD + * encodes both the nodeid and the home agent ID that we care + * about. + */ + nodeid = IMC_NODEID_IVY_BRD_UPPER(ileavetgt) | + IMC_NODEID_IVY_BRD_LOWER(ileavetgt); + tadid = IMC_NODEID_IVY_BRD_HA(ileavetgt); + channelid = UINT32_MAX; + break; + case IMC_GEN_SKYLAKE: + /* + * On Skylake generation systems we take the interleave target + * and use that to look up both the memory controller and the + * physical channel in the route table. The nodeid is already + * known because its SAD rules redirect us. 
+ */ + nodeid = socket->isock_nodeid; + if (ileavetgt > IMC_SAD_ILEAVE_SKX_MAX) { + dec->ids_fail = IMC_DECODE_F_BAD_SAD_INTERLEAVE; + dec->ids_fail_data = ileavetgt; + return (B_FALSE); + } + ileavetgt = IMC_SAD_ILEAVE_SKX_TARGET(ileavetgt); + /* ileavetgt indexes ismc_mcroutes[], so it must be < nroutes. */ + if (ileavetgt >= sad->isad_mcroute.ismc_nroutes) { + dec->ids_fail = IMC_DECODE_F_BAD_SAD_INTERLEAVE; + dec->ids_fail_data = ileavetgt; + return (B_FALSE); + } + tadid = sad->isad_mcroute.ismc_mcroutes[ileavetgt].ismce_imc; + channelid = + sad->isad_mcroute.ismc_mcroutes[ileavetgt].ismce_pchannel; + break; + default: + nodeid = tadid = channelid = UINT32_MAX; + break; + } + + /* + * Map to the correct socket based on the nodeid. Make sure that we have + * a valid TAD. + */ + dec->ids_socket = NULL; + for (i = 0; i < imc->imc_nsockets; i++) { + if (imc->imc_sockets[i].isock_nodeid == nodeid) { + dec->ids_socket = &imc->imc_sockets[i]; + break; + } + } + if (dec->ids_socket == NULL) { + dec->ids_fail = IMC_DECODE_F_SAD_BAD_SOCKET; + dec->ids_fail_data = nodeid; + return (B_FALSE); + } + + if (tadid >= dec->ids_socket->isock_ntad) { + dec->ids_fail = IMC_DECODE_F_SAD_BAD_TAD; + dec->ids_fail_data = tadid; + return (B_FALSE); + } + + dec->ids_nodeid = nodeid; + dec->ids_tadid = tadid; + dec->ids_channelid = channelid; + dec->ids_tad = &dec->ids_socket->isock_tad[tadid]; + dec->ids_mc = &dec->ids_socket->isock_imcs[tadid]; + + return (B_TRUE); +} + +/* + * For Sandy Bridge through Broadwell we need to decode the memory channel that + * we're targeting. This is determined based on the number of ways that the + * socket and channel are supposed to be interleaved. The TAD has a target + * channel list sitting with the TAD rule. To figure out the appropriate index, + * the algorithm is roughly: + * + * idx = [(dec->ids_pa >> 6) / socket-ways] % channel-ways + * + * The shift by six comes from taking the number of bits that are in theory in + * the cache line size. Of course, if things were this simple, that'd be great.
+ * The first complication is a7mode / MCChanShiftUpEnable. When this is enabled, + * more cache lines are used for this. The next complication comes when the + * feature MCChanHashEn is enabled. This means that we have to hash the + * resulting address before we do the modulus based on the number of channel + * ways. + * + * The last, and most complicated problem is when the number of channel ways is + * set to three. When this is the case, the base address of the range may not + * actually start at index zero. The nominal solution is to use the offset + * that's programmed on a per-channel basis to offset the system address. + * However, to get that information we would have to know what channel we're on, + * which is what we're trying to figure out. Regretfully, proclaim that we can't + * in this case. + */ +static boolean_t +imc_decode_tad_channel(const imc_t *imc, imc_decode_state_t *dec) +{ + uint64_t index; + const imc_tad_rule_t *rule = dec->ids_tad_rule; + + index = dec->ids_pa >> 6; + if ((dec->ids_tad->itad_flags & IMC_TAD_FLAG_CHANSHIFT) != 0) { + index = index >> 1; + } + + /* + * When performing a socket way equals three comparison, this would not + * work. 
+ */ + index = index / rule->itr_sock_way; + + if ((dec->ids_tad->itad_flags & IMC_TAD_FLAG_CHANHASH) != 0) { + uint_t i; + for (i = 12; i < 28; i += 2) { + uint64_t shift = (dec->ids_pa >> i) & 0x3; + index ^= shift; + } + } + + index %= rule->itr_chan_way; + if (index >= rule->itr_ntargets) { + dec->ids_fail = IMC_DECODE_F_TAD_BAD_TARGET_INDEX; + dec->ids_fail_data = index; + return (B_FALSE); + } + + dec->ids_channelid = rule->itr_targets[index]; + return (B_TRUE); +} + +static uint_t +imc_tad_gran_to_shift(const imc_tad_t *tad, imc_tad_gran_t gran) +{ + uint_t shift = 0; + + switch (gran) { + case IMC_TAD_GRAN_64B: + shift = 6; + if ((tad->itad_flags & IMC_TAD_FLAG_CHANSHIFT) != 0) { + shift++; + } + break; + case IMC_TAD_GRAN_256B: + shift = 8; + break; + case IMC_TAD_GRAN_4KB: + shift = 12; + break; + case IMC_TAD_GRAN_1GB: + shift = 30; + break; + } + + return (shift); +} + +static boolean_t +imc_decode_tad(const imc_t *imc, imc_decode_state_t *dec) +{ + uint_t i, tadruleno; + uint_t sockshift, chanshift, sockmask, chanmask; + uint64_t off, chanaddr; + const imc_tad_t *tad = dec->ids_tad; + const imc_mc_t *mc = dec->ids_mc; + const imc_tad_rule_t *rule = NULL; + const imc_channel_t *chan; + + /* + * The first step in all of this is to determine which TAD rule applies + * for this address. + */ + for (i = 0; i < tad->itad_nrules; i++) { + rule = &tad->itad_rules[i]; + + if (dec->ids_pa >= rule->itr_base && + dec->ids_pa < rule->itr_limit) { + break; + } + } + + if (rule == NULL || i == tad->itad_nrules) { + dec->ids_fail = IMC_DECODE_F_NO_TAD_RULE; + return (B_FALSE); + } + tadruleno = i; + dec->ids_tad_rule = rule; + + /* + * Check if our TAD rule requires 3-way interleaving on the channel. We + * basically can't do that right now. For more information, see the + * comment above imc_decode_tad_channel(). 
+ */ + if (rule->itr_chan_way == 3) { + dec->ids_fail = IMC_DECODE_F_TAD_3_ILEAVE; + return (B_FALSE); + } + + /* + * On some platforms, we need to now calculate the channel index from + * this. The way that we calculate this is nominally straightforward, + * but complicated by a number of different issues. + */ + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + case IMC_GEN_IVY: + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + if (!imc_decode_tad_channel(imc, dec)) { + return (B_FALSE); + } + break; + default: + /* + * On Skylake and newer platforms we should have already decoded + * the target channel based on using the memory controller route + * table above. + */ + break; + } + + /* + * We initialize ids_channelid to UINT32_MAX, so this should make sure + * that we catch an incorrect channel as well. + */ + if (dec->ids_channelid >= mc->icn_nchannels) { + dec->ids_fail = IMC_DECODE_F_BAD_CHANNEL_ID; + dec->ids_fail_data = dec->ids_channelid; + return (B_FALSE); + } + chan = &mc->icn_channels[dec->ids_channelid]; + dec->ids_chan = chan; + + if (tadruleno >= chan->ich_ntad_offsets) { + dec->ids_fail = IMC_DECODE_F_BAD_CHANNEL_TAD_OFFSET; + dec->ids_fail_data = tadruleno; + return (B_FALSE); + } + + /* + * Now we can go ahead and calculate the channel address, which is + * roughly equal to: + * + * chan_addr = (sys_addr - off) / (chan way * sock way). + * + * The catch is that we want to preserve the low bits where possible. + * The number of bits is based on the interleaving granularities, the + * way that's calculated is based on information in the TAD rule. + * However, if a7mode is enabled on Ivy Bridge through Broadwell, then + * we need to add one to that. So we will save the smallest number of + * bits that are left after interleaving. 
+ * + * Because the interleaving occurs at different granularities, we need + * to break this into two discrete steps, one where we apply the socket + * interleaving and one where we apply the channel interleaving, + * shifting and dividing at each step. + */ + off = chan->ich_tad_offsets[tadruleno]; + if (off > dec->ids_pa) { + dec->ids_fail = IMC_DECODE_F_CHANOFF_UNDERFLOW; + return (B_FALSE); + } + chanshift = imc_tad_gran_to_shift(tad, rule->itr_chan_gran); + sockshift = imc_tad_gran_to_shift(tad, rule->itr_sock_gran); + chanmask = (1 << chanshift) - 1; + sockmask = (1 << sockshift) - 1; + + chanaddr = dec->ids_pa - off; + chanaddr >>= sockshift; + chanaddr /= rule->itr_sock_way; + chanaddr <<= sockshift; + chanaddr |= dec->ids_pa & sockmask; + chanaddr >>= chanshift; + chanaddr /= rule->itr_chan_way; + chanaddr <<= chanshift; + chanaddr |= dec->ids_pa & chanmask; + + dec->ids_chanaddr = chanaddr; + + return (B_TRUE); +} + +static boolean_t +imc_decode_rir(const imc_t *imc, imc_decode_state_t *dec) +{ + const imc_mc_t *mc = dec->ids_mc; + const imc_channel_t *chan = dec->ids_chan; + const imc_rank_ileave_t *rir = NULL; + const imc_rank_ileave_entry_t *rirtarg; + const imc_dimm_t *dimm; + uint32_t shift, index; + uint_t i, dimmid, rankid; + uint64_t mask, base, rankaddr; + + if (mc->icn_closed) { + shift = IMC_PAGE_BITS_CLOSED; + } else { + shift = IMC_PAGE_BITS_OPEN; + } + mask = (1UL << shift) - 1; + + for (i = 0, base = 0; i < chan->ich_nrankileaves; i++) { + rir = &chan->ich_rankileaves[i]; + if (rir->irle_enabled && dec->ids_chanaddr >= base && + dec->ids_chanaddr < rir->irle_limit) { + break; + } + + base = rir->irle_limit; + } + + if (rir == NULL || i == chan->ich_nrankileaves) { + dec->ids_fail = IMC_DECODE_F_NO_RIR_RULE; + return (B_FALSE); + } + dec->ids_rir = rir; + + /* + * Determine the index of the rule that we care about. 
This is done by + * shifting the address based on the open and closed page bits and then + * just modding it by the number of ways in question. + */ + index = (dec->ids_chanaddr >> shift) % rir->irle_nways; + if (index >= rir->irle_nentries) { + dec->ids_fail = IMC_DECODE_F_BAD_RIR_ILEAVE_TARGET; + dec->ids_fail_data = index; + return (B_FALSE); + } + rirtarg = &rir->irle_entries[index]; + + /* + * The rank interleaving register has information about a physical rank + * target. This is within the notion of the physical chip selects that + * exist. While the memory controller only has eight actual chip + * selects, the physical values that are programmed depend a bit on the + * underlying hardware. Effectively, in this ID space, each DIMM has + * four ranks associated with it. Even when we only have two ranks with + * each physical channel, they'll be programmed so we can simply do the + * following match: + * + * DIMM = rank id / 4 + * RANK = rank id % 4 + */ + dec->ids_physrankid = rirtarg->irle_target; + dimmid = dec->ids_physrankid / 4; + rankid = dec->ids_physrankid % 4; + + if (dimmid >= chan->ich_ndimms) { + dec->ids_fail = IMC_DECODE_F_BAD_DIMM_INDEX; + dec->ids_fail_data = dimmid; + return (B_FALSE); + } + + dimm = &chan->ich_dimms[dimmid]; + if (!dimm->idimm_present) { + dec->ids_fail = IMC_DECODE_F_DIMM_NOT_PRESENT; + return (B_FALSE); + } + dec->ids_dimmid = dimmid; + dec->ids_dimm = dimm; + + if (rankid >= dimm->idimm_nranks) { + dec->ids_fail = IMC_DECODE_F_BAD_DIMM_RANK; + dec->ids_fail_data = rankid; + return (B_FALSE); + } + dec->ids_rankid = rankid; + + /* + * Calculate the rank address. We need to divide the address by the + * number of rank ways and then or in the lower bits. 
+ */ + rankaddr = dec->ids_chanaddr; + rankaddr >>= shift; + rankaddr /= rir->irle_nways; + rankaddr <<= shift; + rankaddr |= dec->ids_chanaddr & mask; + + if (rirtarg->irle_offset > rankaddr) { + dec->ids_fail = IMC_DECODE_F_RANKOFF_UNDERFLOW; + return (B_FALSE); + } + rankaddr -= rirtarg->irle_offset; + dec->ids_rankaddr = rankaddr; + + return (B_TRUE); +} + +boolean_t +imc_decode_pa(const imc_t *imc, uint64_t pa, imc_decode_state_t *dec) +{ + bzero(dec, sizeof (*dec)); + dec->ids_pa = pa; + dec->ids_nodeid = dec->ids_tadid = dec->ids_channelid = UINT32_MAX; + + /* + * We need to rely on socket zero's information. Make sure that it both + * exists and is considered valid. + */ + if (imc->imc_nsockets < 1 || + imc->imc_sockets[0].isock_valid != IMC_SOCKET_V_VALID) { + dec->ids_fail = IMC_DECODE_F_BAD_SOCKET; + dec->ids_fail_data = 0; + return (B_FALSE); + } + + /* + * First, we need to make sure that the PA we've been given actually is + * meant to target a DRAM address. This address may fall to MMIO, MMCFG, + * be an address that's outside of DRAM, or belong to a legacy address + * range that is interposed. + */ + if (imc_decode_addr_resvd(imc, dec)) { + return (B_FALSE); + } + + /* + * Now that we have this data, we want to go through and look at the + * SAD. The SAD will point us to a specific socket and an IMC / home + * agent on that socket which will tell us which TAD we need to use. + */ + if (!imc_decode_sad(imc, dec)) { + return (B_FALSE); + } + + /* + * The decoded SAD information has pointed us a TAD. We need to use this + * to point us to the corresponding memory channel and the corresponding + * address on the channel. + */ + if (!imc_decode_tad(imc, dec)) { + return (B_FALSE); + } + + /* + * Use the rank interleaving data to determine which DIMM this is, the + * relevant rank, and the rank address. 
+ */ + if (!imc_decode_rir(imc, dec)) { + return (B_FALSE); + } + + return (B_TRUE); +} diff --git a/usr/src/common/mc/imc/imc_dump.c b/usr/src/common/mc/imc/imc_dump.c new file mode 100644 index 0000000000..05a2f72308 --- /dev/null +++ b/usr/src/common/mc/imc/imc_dump.c @@ -0,0 +1,569 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * This implements logic to allow us to dump IMC data for decoding purposes, + * such that we can later encode it elsewhere. In general, dumping is done by + * the kernel and reconstituting this data is done by user land. + */ + +#include "imc.h" + +#ifndef _KERNEL +#include <stdint.h> +#include <strings.h> +#endif /* !_KERNEL */ + + +static nvlist_t * +imc_dump_sad(imc_sad_t *sad) +{ + uint_t i; + nvlist_t *nvl; + nvlist_t *rules[IMC_MAX_SAD_RULES]; + nvlist_t *routes[IMC_MAX_SAD_MCROUTES]; + + nvl = fnvlist_alloc(); + fnvlist_add_uint32(nvl, "isad_flags", sad->isad_flags); + fnvlist_add_uint32(nvl, "isad_valid", sad->isad_valid); + fnvlist_add_uint64(nvl, "isad_tolm", sad->isad_tolm); + fnvlist_add_uint64(nvl, "isad_tohm", sad->isad_tohm); + + for (i = 0; i < sad->isad_nrules; i++) { + nvlist_t *n = fnvlist_alloc(); + imc_sad_rule_t *r = &sad->isad_rules[i]; + + fnvlist_add_boolean_value(n, "isr_enable", r->isr_enable); + fnvlist_add_boolean_value(n, "isr_a7mode", r->isr_a7mode); + fnvlist_add_boolean_value(n, "isr_need_mod3", r->isr_need_mod3); + fnvlist_add_uint64(n, "isr_limit", r->isr_limit); + fnvlist_add_uint32(n, "isr_type", r->isr_type); + fnvlist_add_uint32(n, "isr_imode", r->isr_imode); + 
fnvlist_add_uint32(n, "isr_mod_mode", r->isr_mod_mode);
+		fnvlist_add_uint32(n, "isr_mod_type", r->isr_mod_type);
+		fnvlist_add_uint8_array(n, "isr_targets", r->isr_targets,
+		    r->isr_ntargets);
+
+		rules[i] = n;
+	}
+	fnvlist_add_nvlist_array(nvl, "isad_rules", rules, sad->isad_nrules);
+	for (i = 0; i < sad->isad_nrules; i++) {
+		nvlist_free(rules[i]);
+	}
+
+	if (sad->isad_mcroute.ismc_nroutes == 0) {
+		return (nvl);
+	}
+
+	for (i = 0; i < sad->isad_mcroute.ismc_nroutes; i++) {
+		nvlist_t *r = fnvlist_alloc();
+		imc_sad_mcroute_entry_t *e =
+		    &sad->isad_mcroute.ismc_mcroutes[i];
+
+		fnvlist_add_uint8(r, "ismce_imc", e->ismce_imc);
+		fnvlist_add_uint8(r, "ismce_pchannel", e->ismce_pchannel);
+		routes[i] = r;
+	}
+	fnvlist_add_nvlist_array(nvl, "isad_mcroute", routes, i);
+	for (i = 0; i < sad->isad_mcroute.ismc_nroutes; i++) {
+		nvlist_free(routes[i]);
+	}
+
+	return (nvl);
+}
+
+/*
+ * Serialize a TAD into a newly allocated nvlist: its valid/flags words plus
+ * an "itad_rules" array with one nvlist per rule. The per-rule lists are
+ * released here once they have been added to the parent.
+ */
+static nvlist_t *
+imc_dump_tad(imc_tad_t *tad)
+{
+	uint_t i;
+	nvlist_t *nvl;
+	nvlist_t *rules[IMC_MAX_TAD_RULES];
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_uint32(nvl, "itad_valid", tad->itad_valid);
+	fnvlist_add_uint32(nvl, "itad_flags", tad->itad_flags);
+	for (i = 0; i < tad->itad_nrules; i++) {
+		nvlist_t *t = fnvlist_alloc();
+		imc_tad_rule_t *r = &tad->itad_rules[i];
+
+		fnvlist_add_uint64(t, "itr_base", r->itr_base);
+		fnvlist_add_uint64(t, "itr_limit", r->itr_limit);
+		fnvlist_add_uint8(t, "itr_sock_way", r->itr_sock_way);
+		fnvlist_add_uint8(t, "itr_chan_way", r->itr_chan_way);
+		fnvlist_add_uint32(t, "itr_sock_gran", r->itr_sock_gran);
+		fnvlist_add_uint32(t, "itr_chan_gran", r->itr_chan_gran);
+		fnvlist_add_uint8_array(t, "itr_targets", r->itr_targets,
+		    r->itr_ntargets);
+
+		rules[i] = t;
+	}
+	fnvlist_add_nvlist_array(nvl, "itad_rules", rules, tad->itad_nrules);
+	for (i = 0; i < tad->itad_nrules; i++) {
+		nvlist_free(rules[i]);
+	}
+
+	return (nvl);
+}
+
+/*
+ * Serialize a channel into a newly allocated nvlist. This covers the DIMMs
+ * ("ich_dimms"), the TAD offsets ("ich_tad_offsets") and the rank interleave
+ * rules ("ich_rankileaves"), each of which carries its own "irle_entries"
+ * array.
+ */
+static nvlist_t *
+imc_dump_channel(imc_channel_t *chan)
+{
+	uint_t i;
+	nvlist_t *nvl;
+	nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
+	nvlist_t *ranks[IMC_MAX_RANK_WAYS];
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_uint32(nvl, "ich_valid", chan->ich_valid);
+	for (i = 0; i < chan->ich_ndimms; i++) {
+		nvlist_t *d = fnvlist_alloc();
+		imc_dimm_t *dimm = &chan->ich_dimms[i];
+
+		fnvlist_add_uint32(d, "idimm_valid", dimm->idimm_valid);
+		fnvlist_add_boolean_value(d, "idimm_present",
+		    dimm->idimm_present);
+		/*
+		 * The remaining properties are only written for DIMMs that
+		 * are present; imc_restore_channel() mirrors this.
+		 */
+		if (!dimm->idimm_present)
+			goto add;
+
+		fnvlist_add_uint8(d, "idimm_nbanks", dimm->idimm_nbanks);
+		fnvlist_add_uint8(d, "idimm_nranks", dimm->idimm_nranks);
+		fnvlist_add_uint8(d, "idimm_width", dimm->idimm_width);
+		fnvlist_add_uint8(d, "idimm_density", dimm->idimm_density);
+		fnvlist_add_uint8(d, "idimm_nrows", dimm->idimm_nrows);
+		fnvlist_add_uint8(d, "idimm_ncolumns", dimm->idimm_ncolumns);
+		fnvlist_add_uint64(d, "idimm_size", dimm->idimm_size);
+add:
+		dimms[i] = d;
+	}
+	fnvlist_add_nvlist_array(nvl, "ich_dimms", dimms, i);
+	for (i = 0; i < chan->ich_ndimms; i++) {
+		nvlist_free(dimms[i]);
+	}
+
+	fnvlist_add_uint64_array(nvl, "ich_tad_offsets", chan->ich_tad_offsets,
+	    chan->ich_ntad_offsets);
+
+	for (i = 0; i < chan->ich_nrankileaves; i++) {
+		uint_t j;
+		nvlist_t *r = fnvlist_alloc();
+		nvlist_t *ileaves[IMC_MAX_RANK_INTERLEAVES];
+		imc_rank_ileave_t *rank = &chan->ich_rankileaves[i];
+
+		fnvlist_add_boolean_value(r, "irle_enabled",
+		    rank->irle_enabled);
+		fnvlist_add_uint8(r, "irle_nways", rank->irle_nways);
+		fnvlist_add_uint8(r, "irle_nwaysbits", rank->irle_nwaysbits);
+		fnvlist_add_uint64(r, "irle_limit", rank->irle_limit);
+
+		for (j = 0; j < rank->irle_nentries; j++) {
+			nvlist_t *e = fnvlist_alloc();
+
+			fnvlist_add_uint8(e, "irle_target",
+			    rank->irle_entries[j].irle_target);
+			fnvlist_add_uint64(e, "irle_offset",
+			    rank->irle_entries[j].irle_offset);
+			ileaves[j] = e;
+		}
+		fnvlist_add_nvlist_array(r, "irle_entries", ileaves, j);
+		for (j = 0; j < rank->irle_nentries; j++) {
+			nvlist_free(ileaves[j]);
+		}
+
+		ranks[i] = r;
+	}
+	fnvlist_add_nvlist_array(nvl, "ich_rankileaves", ranks, i);
+	for (i = 0; i < chan->ich_nrankileaves; i++) {
+		nvlist_free(ranks[i]);
+	}
+
+	return (nvl);
+}
+
+/*
+ * Serialize a memory controller into a newly allocated nvlist: its ecc,
+ * lockstep and closed flags, its DIMM type, and an "icn_channels" array
+ * built by imc_dump_channel().
+ */
+static nvlist_t *
+imc_dump_mc(imc_mc_t *mc)
+{
+	uint_t i;
+	nvlist_t *nvl;
+	nvlist_t *channels[IMC_MAX_CHANPERMC];
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_boolean_value(nvl, "icn_ecc", mc->icn_ecc);
+	fnvlist_add_boolean_value(nvl, "icn_lockstep", mc->icn_lockstep);
+	fnvlist_add_boolean_value(nvl, "icn_closed", mc->icn_closed);
+	fnvlist_add_uint32(nvl, "icn_dimm_type", mc->icn_dimm_type);
+
+	for (i = 0; i < mc->icn_nchannels; i++) {
+		channels[i] = imc_dump_channel(&mc->icn_channels[i]);
+	}
+	fnvlist_add_nvlist_array(nvl, "icn_channels", channels, i);
+	for (i = 0; i < mc->icn_nchannels; i++) {
+		nvlist_free(channels[i]);
+	}
+
+	return (nvl);
+}
+
+/*
+ * Serialize a socket into a newly allocated nvlist: its SAD ("isock_sad"),
+ * its TADs ("isock_tad"), its node id and its memory controllers
+ * ("isock_imcs"). Child lists are released once added to the parent.
+ */
+static nvlist_t *
+imc_dump_socket(imc_socket_t *sock)
+{
+	uint_t i;
+	nvlist_t *nvl, *sad;
+	nvlist_t *tad[IMC_MAX_TAD];
+	nvlist_t *mc[IMC_MAX_IMCPERSOCK];
+
+	nvl = fnvlist_alloc();
+
+	sad = imc_dump_sad(&sock->isock_sad);
+	fnvlist_add_nvlist(nvl, "isock_sad", sad);
+	nvlist_free(sad);
+
+	for (i = 0; i < sock->isock_ntad; i++) {
+		tad[i] = imc_dump_tad(&sock->isock_tad[i]);
+	}
+	fnvlist_add_nvlist_array(nvl, "isock_tad", tad, i);
+	for (i = 0; i < sock->isock_ntad; i++) {
+		nvlist_free(tad[i]);
+	}
+
+	fnvlist_add_uint32(nvl, "isock_nodeid", sock->isock_nodeid);
+
+	for (i = 0; i < sock->isock_nimc; i++) {
+		mc[i] = imc_dump_mc(&sock->isock_imcs[i]);
+	}
+	fnvlist_add_nvlist_array(nvl, "isock_imcs", mc, i);
+	for (i = 0; i < sock->isock_nimc; i++) {
+		nvlist_free(mc[i]);
+	}
+	return (nvl);
+}
+
+/*
+ * Build the top-level dump of the decoder state. The returned, newly
+ * allocated nvlist holds "mc_dump_version" (0), "mc_dump_driver" ("imc") and
+ * an "imc" nvlist with the generation and one entry per socket.
+ * imc_restore_decoder() consumes exactly this layout.
+ */
+nvlist_t *
+imc_dump_decoder(imc_t *imc)
+{
+	uint_t i;
+	nvlist_t *nvl, *invl;
+	nvlist_t *sockets[IMC_MAX_SOCKETS];
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_uint32(nvl, "mc_dump_version", 0);
+	fnvlist_add_string(nvl, "mc_dump_driver", "imc");
+
+	invl = fnvlist_alloc();
+	fnvlist_add_uint32(invl, "imc_gen", imc->imc_gen);
+
+	for (i = 0; i < imc->imc_nsockets; i++) {
+		sockets[i] = imc_dump_socket(&imc->imc_sockets[i]);
+	}
+	fnvlist_add_nvlist_array(invl, "imc_sockets", sockets, i);
+	fnvlist_add_nvlist(nvl, "imc", invl);
+
+	for (i = 0; i < imc->imc_nsockets; i++) {
+		nvlist_free(sockets[i]);
+	}
+	nvlist_free(invl);
+
+	return (nvl);
+}
+
+/*
+ * Populate 'sad' from the nvlist produced by imc_dump_sad(). Returns B_TRUE
+ * on success. Returns B_FALSE if a required member is missing or if any
+ * element count read from the nvlist would not fit in the destination
+ * arrays; in that case 'sad' may have been partially filled in.
+ */
+static boolean_t
+imc_restore_sad(nvlist_t *nvl, imc_sad_t *sad)
+{
+	nvlist_t **rules, **routes;
+	uint_t i, nroutes;
+
+	/*
+	 * The rule count comes straight from the nvlist, so it must be
+	 * bounded by the capacity of isad_rules[] before we index into it.
+	 */
+	if (nvlist_lookup_uint32(nvl, "isad_flags", &sad->isad_flags) != 0 ||
+	    nvlist_lookup_uint32(nvl, "isad_valid", &sad->isad_valid) != 0 ||
+	    nvlist_lookup_uint64(nvl, "isad_tolm", &sad->isad_tolm) != 0 ||
+	    nvlist_lookup_uint64(nvl, "isad_tohm", &sad->isad_tohm) != 0 ||
+	    nvlist_lookup_nvlist_array(nvl, "isad_rules",
+	    &rules, &sad->isad_nrules) != 0 ||
+	    sad->isad_nrules > sizeof (sad->isad_rules) /
+	    sizeof (sad->isad_rules[0])) {
+		return (B_FALSE);
+	}
+
+	for (i = 0; i < sad->isad_nrules; i++) {
+		imc_sad_rule_t *r = &sad->isad_rules[i];
+		uint8_t *targs;
+
+		/*
+		 * The target count is checked against the size of
+		 * isr_targets[] itself, not against the number of SAD rules,
+		 * so that the bcopy() below can never overrun the array.
+		 */
+		if (nvlist_lookup_boolean_value(rules[i], "isr_enable",
+		    &r->isr_enable) != 0 ||
+		    nvlist_lookup_boolean_value(rules[i], "isr_a7mode",
+		    &r->isr_a7mode) != 0 ||
+		    nvlist_lookup_boolean_value(rules[i], "isr_need_mod3",
+		    &r->isr_need_mod3) != 0 ||
+		    nvlist_lookup_uint64(rules[i], "isr_limit",
+		    &r->isr_limit) != 0 ||
+		    nvlist_lookup_uint32(rules[i], "isr_type",
+		    &r->isr_type) != 0 ||
+		    nvlist_lookup_uint32(rules[i], "isr_imode",
+		    &r->isr_imode) != 0 ||
+		    nvlist_lookup_uint32(rules[i], "isr_mod_mode",
+		    &r->isr_mod_mode) != 0 ||
+		    nvlist_lookup_uint32(rules[i], "isr_mod_type",
+		    &r->isr_mod_type) != 0 ||
+		    nvlist_lookup_uint8_array(rules[i], "isr_targets", &targs,
+		    &r->isr_ntargets) != 0 ||
+		    r->isr_ntargets > sizeof (r->isr_targets) /
+		    sizeof (r->isr_targets[0])) {
+			return (B_FALSE);
+		}
+
+		bcopy(targs, r->isr_targets, r->isr_ntargets *
+		    sizeof (uint8_t));
+	}
+
+	/*
+	 * The mcroutes entry right now is only included conditionally: the
+	 * dump side omits it when there are no routes, so its absence means
+	 * zero routes.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, "isad_mcroute", &routes,
+	    &nroutes) == 0) {
+		if (nroutes > IMC_MAX_SAD_MCROUTES)
+			return (B_FALSE);
+		sad->isad_mcroute.ismc_nroutes = nroutes;
+		for (i = 0; i < nroutes; i++) {
+			imc_sad_mcroute_entry_t *r =
+			    &sad->isad_mcroute.ismc_mcroutes[i];
+			if (nvlist_lookup_uint8(routes[i], "ismce_imc",
+			    &r->ismce_imc) != 0 ||
+			    nvlist_lookup_uint8(routes[i], "ismce_pchannel",
+			    &r->ismce_pchannel) != 0) {
+				return (B_FALSE);
+			}
+		}
+	} else {
+		sad->isad_mcroute.ismc_nroutes = 0;
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Populate 'tad' from the nvlist produced by imc_dump_tad(). Returns B_FALSE
+ * if a member is missing or a count exceeds IMC_MAX_TAD_RULES or
+ * IMC_MAX_TAD_TARGETS; 'tad' may then be partially filled in.
+ */
+static boolean_t
+imc_restore_tad(nvlist_t *nvl, imc_tad_t *tad)
+{
+	nvlist_t **rules;
+
+	if (nvlist_lookup_uint32(nvl, "itad_valid", &tad->itad_valid) != 0 ||
+	    nvlist_lookup_uint32(nvl, "itad_flags", &tad->itad_flags) != 0 ||
+	    nvlist_lookup_nvlist_array(nvl, "itad_rules", &rules,
+	    &tad->itad_nrules) != 0 || tad->itad_nrules > IMC_MAX_TAD_RULES) {
+		return (B_FALSE);
+	}
+
+	for (uint_t i = 0; i < tad->itad_nrules; i++) {
+		imc_tad_rule_t *r = &tad->itad_rules[i];
+		uint8_t *targs;
+
+		if (nvlist_lookup_uint64(rules[i], "itr_base",
+		    &r->itr_base) != 0 ||
+		    nvlist_lookup_uint64(rules[i], "itr_limit",
+		    &r->itr_limit) != 0 ||
+		    nvlist_lookup_uint8(rules[i], "itr_sock_way",
+		    &r->itr_sock_way) != 0 ||
+		    nvlist_lookup_uint8(rules[i], "itr_chan_way",
+		    &r->itr_chan_way) != 0 ||
+		    nvlist_lookup_uint32(rules[i], "itr_sock_gran",
+		    &r->itr_sock_gran) != 0 ||
+		    nvlist_lookup_uint32(rules[i], "itr_chan_gran",
+		    &r->itr_chan_gran) != 0 ||
+		    nvlist_lookup_uint8_array(rules[i], "itr_targets",
+		    &targs, &r->itr_ntargets) != 0 ||
+		    r->itr_ntargets > IMC_MAX_TAD_TARGETS) {
+			return (B_FALSE);
+		}
+
+		bcopy(targs, r->itr_targets, r->itr_ntargets *
+		    sizeof (uint8_t));
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Populate 'chan' from the nvlist produced by imc_dump_channel(). All array
+ * counts are validated before anything is copied. Returns B_FALSE on a
+ * missing member or an oversized count; 'chan' may then be partially filled
+ * in.
+ */
+static boolean_t
+imc_restore_channel(nvlist_t *nvl, imc_channel_t *chan)
+{
+	nvlist_t **dimms, **rir;
+	uint64_t *tadoff;
+
+	if (nvlist_lookup_uint32(nvl, "ich_valid", &chan->ich_valid) != 0 ||
+	    nvlist_lookup_nvlist_array(nvl, "ich_dimms", &dimms,
+	    &chan->ich_ndimms) != 0 ||
+	    chan->ich_ndimms > IMC_MAX_DIMMPERCHAN ||
+	    nvlist_lookup_uint64_array(nvl, "ich_tad_offsets", &tadoff,
+	    &chan->ich_ntad_offsets) != 0 ||
+	    chan->ich_ntad_offsets > IMC_MAX_TAD_RULES ||
+	    nvlist_lookup_nvlist_array(nvl, "ich_rankileaves", &rir,
+	    &chan->ich_nrankileaves) != 0 ||
+	    chan->ich_nrankileaves > IMC_MAX_RANK_WAYS) {
+		return (B_FALSE);
+	}
+
+	for (uint_t i = 0; i < chan->ich_ndimms; i++) {
+		imc_dimm_t *d = &chan->ich_dimms[i];
+
+		if (nvlist_lookup_uint32(dimms[i], "idimm_valid",
+		    &d->idimm_valid) != 0 ||
+		    nvlist_lookup_boolean_value(dimms[i], "idimm_present",
+		    &d->idimm_present) != 0) {
+			return (B_FALSE);
+		}
+
+		/* Absent DIMMs were dumped without any further members. */
+		if (!d->idimm_present)
+			continue;
+
+		if (nvlist_lookup_uint8(dimms[i], "idimm_nbanks",
+		    &d->idimm_nbanks) != 0 ||
+		    nvlist_lookup_uint8(dimms[i], "idimm_nranks",
+		    &d->idimm_nranks) != 0 ||
+		    nvlist_lookup_uint8(dimms[i], "idimm_width",
+		    &d->idimm_width) != 0 ||
+		    nvlist_lookup_uint8(dimms[i], "idimm_density",
+		    &d->idimm_density) != 0 ||
+		    nvlist_lookup_uint8(dimms[i], "idimm_nrows",
+		    &d->idimm_nrows) != 0 ||
+		    nvlist_lookup_uint8(dimms[i], "idimm_ncolumns",
+		    &d->idimm_ncolumns) != 0 ||
+		    nvlist_lookup_uint64(dimms[i], "idimm_size",
+		    &d->idimm_size) != 0) {
+			return (B_FALSE);
+		}
+	}
+
+	bcopy(tadoff, chan->ich_tad_offsets, chan->ich_ntad_offsets *
+	    sizeof (uint64_t));
+
+	for (uint_t i = 0; i < chan->ich_nrankileaves; i++) {
+		nvlist_t **ileaves;
+		imc_rank_ileave_t *r = &chan->ich_rankileaves[i];
+
+		if (nvlist_lookup_boolean_value(rir[i], "irle_enabled",
+		    &r->irle_enabled) != 0 ||
+		    nvlist_lookup_uint8(rir[i], "irle_nways",
+		    &r->irle_nways) != 0 ||
+		    nvlist_lookup_uint8(rir[i], "irle_nwaysbits",
+		    &r->irle_nwaysbits) != 0 ||
+		    nvlist_lookup_uint64(rir[i], "irle_limit",
+		    &r->irle_limit) != 0 ||
+		    nvlist_lookup_nvlist_array(rir[i], "irle_entries",
+		    &ileaves, &r->irle_nentries) != 0 ||
+		    r->irle_nentries > IMC_MAX_RANK_INTERLEAVES) {
+			return (B_FALSE);
+		}
+
+		for (uint_t j = 0; j < r->irle_nentries; j++) {
+			imc_rank_ileave_entry_t *ril = &r->irle_entries[j];
+
+			if (nvlist_lookup_uint8(ileaves[j], "irle_target",
+			    &ril->irle_target) != 0 ||
+			    nvlist_lookup_uint64(ileaves[j], "irle_offset",
+			    &ril->irle_offset) != 0) {
+				return (B_FALSE);
+			}
+		}
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Populate 'mc' from the nvlist produced by imc_dump_mc(), restoring each
+ * channel with imc_restore_channel(). Returns B_FALSE on a missing member,
+ * more than IMC_MAX_CHANPERMC channels, or a channel that fails to restore.
+ */
+static boolean_t
+imc_restore_mc(nvlist_t *nvl, imc_mc_t *mc)
+{
+	nvlist_t **channels;
+
+	if (nvlist_lookup_boolean_value(nvl, "icn_ecc", &mc->icn_ecc) != 0 ||
+	    nvlist_lookup_boolean_value(nvl, "icn_lockstep",
+	    &mc->icn_lockstep) != 0 ||
+	    nvlist_lookup_boolean_value(nvl, "icn_closed",
+	    &mc->icn_closed) != 0 ||
+	    nvlist_lookup_uint32(nvl, "icn_dimm_type",
+	    &mc->icn_dimm_type) != 0 ||
+	    nvlist_lookup_nvlist_array(nvl, "icn_channels", &channels,
+	    &mc->icn_nchannels) != 0 || mc->icn_nchannels > IMC_MAX_CHANPERMC) {
+		return (B_FALSE);
+	}
+
+	for (uint_t i = 0; i < mc->icn_nchannels; i++) {
+		if (!imc_restore_channel(channels[i], &mc->icn_channels[i])) {
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Populate 'sock' from the nvlist produced by imc_dump_socket(): the SAD,
+ * then every TAD, then every memory controller. Returns B_FALSE as soon as
+ * any piece is missing, oversized or fails to restore.
+ */
+static boolean_t
+imc_restore_socket(nvlist_t *nvl, imc_socket_t *sock)
+{
+	uint_t i;
+	nvlist_t *sad, **tads, **imcs;
+
+	if (nvlist_lookup_nvlist(nvl, "isock_sad", &sad) != 0 ||
+	    nvlist_lookup_nvlist_array(nvl, "isock_tad", &tads,
+	    &sock->isock_ntad) != 0 ||
+	    nvlist_lookup_uint32(nvl, "isock_nodeid",
+	    &sock->isock_nodeid) != 0 ||
+	    nvlist_lookup_nvlist_array(nvl, "isock_imcs", &imcs,
+	    &sock->isock_nimc) != 0 ||
+	    sock->isock_ntad > IMC_MAX_TAD ||
+	    sock->isock_nimc > IMC_MAX_IMCPERSOCK) {
+		return (B_FALSE);
+	}
+
+	if (!imc_restore_sad(sad, &sock->isock_sad)) {
+		return (B_FALSE);
+	}
+
+	for (i = 0; i < sock->isock_ntad; i++) {
+		if (!imc_restore_tad(tads[i], &sock->isock_tad[i])) {
+			return (B_FALSE);
+		}
+	}
+
+	for (i = 0; i < sock->isock_nimc; i++) {
+		if (!imc_restore_mc(imcs[i], &sock->isock_imcs[i])) {
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Rebuild 'imc' from a dump created by imc_dump_decoder(). 'imc' is zeroed
+ * first. The dump must carry version 0 and the driver name "imc". Returns
+ * B_TRUE on success; on B_FALSE the contents of 'imc' are incomplete and
+ * should not be used.
+ */
+boolean_t
+imc_restore_decoder(nvlist_t *nvl, imc_t *imc)
+{
+	uint_t i;
+	uint32_t vers;
+	nvlist_t *invl, **socks;
+	char *driver;
+
+	bzero(imc, sizeof (imc_t));
+
+	if (nvlist_lookup_uint32(nvl, "mc_dump_version", &vers) != 0 ||
+	    vers != 0 ||
+	    nvlist_lookup_string(nvl, "mc_dump_driver", &driver) != 0 ||
+	    strcmp(driver, "imc") != 0 ||
+	    nvlist_lookup_nvlist(nvl, "imc", &invl) != 0) {
+		return (B_FALSE);
+	}
+
+	if (nvlist_lookup_uint32(invl, "imc_gen", &imc->imc_gen) != 0 ||
+	    nvlist_lookup_nvlist_array(invl, "imc_sockets", &socks,
+	    &imc->imc_nsockets) != 0 ||
+	    imc->imc_nsockets > IMC_MAX_SOCKETS) {
+		return (B_FALSE);
+	}
+
+	for (i = 0; i < imc->imc_nsockets; i++) {
+		if (!imc_restore_socket(socks[i], &imc->imc_sockets[i]))
+			return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
diff --git a/usr/src/common/net/dhcp/octet.c b/usr/src/common/net/dhcp/octet.c
index d8367bbf0b..370604c4e3 100644
--- a/usr/src/common/net/dhcp/octet.c
+++ b/usr/src/common/net/dhcp/octet.c
@@ -77,6 +77,9 @@ octet_to_hexascii(const void *nump, uint_t nlen, char *bufp, uint_t *blen)
  * Converts an ASCII string into an octet string.
  *
  * Returns 0 for success, errno otherwise.
+ *
+ * If the string contains invalid hexadecimal characters, or an odd number of
+ * characters then this function returns EINVAL.
  */
 int
 hexascii_to_octet(const char *asp, uint_t alen, void *bufp, uint_t *blen)
diff --git a/usr/src/common/pnglite/pnglite.c b/usr/src/common/pnglite/pnglite.c
index 7a30bdc609..5d8b41f9e9 100644
--- a/usr/src/common/pnglite/pnglite.c
+++ b/usr/src/common/pnglite/pnglite.c
@@ -9,6 +9,7 @@
 #else
 #include <stdio.h>
 #include <stdlib.h>
+#include <limits.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c
index a4f02b18db..6f633147a3 100644
--- a/usr/src/common/zfs/zfs_prop.c
+++ b/usr/src/common/zfs/zfs_prop.c
@@ -547,6 +547,23 @@ zfs_prop_delegatable(zfs_prop_t prop)
 	return (pd->pd_attr != PROP_READONLY);
 }
 
+boolean_t
+zfs_prop_cacheable(zfs_prop_t prop)
+{
+	/*
+	 * It'd be nice if each prop had a flags field which could have flag
+	 * like PROP_CACHEABLE, but since zprop_attr_t is an enum and this
+	 * setting is orthogonal to the concepts of PROP_READONLY, etc., we have
+	 * this function.
+	 */
+	return (prop == ZFS_PROP_VERSION ||
+	    prop == ZFS_PROP_NORMALIZE ||
+	    prop == ZFS_PROP_UTF8ONLY ||
+	    prop == ZFS_PROP_CASE ||
+	    prop == ZFS_PROP_VOLSIZE ||
+	    prop == ZFS_PROP_VOLBLOCKSIZE);
+}
+
 /*
  * Given a zfs dataset property name, returns the corresponding property ID.
  */
diff --git a/usr/src/common/zfs/zfs_prop.h b/usr/src/common/zfs/zfs_prop.h
index 45423cc72f..3f34ad64a6 100644
--- a/usr/src/common/zfs/zfs_prop.h
+++ b/usr/src/common/zfs/zfs_prop.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _ZFS_PROP_H
@@ -89,6 +90,7 @@ typedef struct {
 void zfs_prop_init(void);
 zprop_type_t zfs_prop_get_type(zfs_prop_t);
 boolean_t zfs_prop_delegatable(zfs_prop_t prop);
+boolean_t zfs_prop_cacheable(zfs_prop_t prop);
 zprop_desc_t *zfs_prop_get_table(void);
 
 /*
|