summaryrefslogtreecommitdiff
path: root/usr/src/lib/brand/bhyve/zone/boot.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/brand/bhyve/zone/boot.c')
-rw-r--r--usr/src/lib/brand/bhyve/zone/boot.c926
1 files changed, 926 insertions, 0 deletions
diff --git a/usr/src/lib/brand/bhyve/zone/boot.c b/usr/src/lib/brand/bhyve/zone/boot.c
new file mode 100644
index 0000000000..3d26e350d4
--- /dev/null
+++ b/usr/src/lib/brand/bhyve/zone/boot.c
@@ -0,0 +1,926 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2020 Joyent, Inc.
+ */
+
+/*
+ * This program runs as a child of zoneadmd, which sets a variety of
+ * _ZONECFG_<resource>_<instance> properties so that child processes don't have
+ * to parse xml.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libcustr.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/debug.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <zone.h>
+
+/* These two paths must be relative to the zone root. */
+#define BHYVE_DIR "var/run/bhyve"
+#define BHYVE_ARGS_FILE BHYVE_DIR "/" "zhyve.cmd"
+
+#define ZH_MAXARGS 100
+
+#define DEFAULT_BOOTROM "/usr/share/bhyve/uefi-rom.bin"
+#define DEFAULT_BOOTROM_CSM "/usr/share/bhyve/uefi-csm-rom.bin"
+
/*
 * PCI device (slot) numbers used when building bhyve "-s" arguments.  The
 * boot disk, the remaining disks and the NICs occupy the three slots that
 * directly follow the CD slot.
 */
typedef enum {
	PCI_SLOT_HOSTBRIDGE	= 0,
	PCI_SLOT_CD		= 3,	/* Windows ahci allows slots 3 - 6 */
	PCI_SLOT_BOOT_DISK	= 4,
	PCI_SLOT_OTHER_DISKS	= 5,
	PCI_SLOT_NICS		= 6,
	PCI_SLOT_FBUF		= 30,
	PCI_SLOT_LPC		= 31,	/* Windows requires lpc in slot 31 */
} pci_slot_t;
+
+static boolean_t debug;
+static const char *zonename;
+static const char *zonepath;
+
/*
 * Debug printf.  The argument is a complete, parenthesized printf() argument
 * list, e.g. dprintf(("%s\n", str)).  Output is produced only when the
 * 'debug' flag is set.  The do/while wrapper makes the expansion a single
 * statement so that it cannot capture an 'else' that follows the call.
 */
#define	dprintf(x)	do { if (debug) (void) printf x; } while (0)
+
+static char *
+get_zcfg_var(const char *rsrc, const char *inst, const char *prop)
+{
+ char envvar[MAXNAMELEN];
+ char *ret;
+
+ if (prop == NULL) {
+ if (snprintf(envvar, sizeof (envvar), "_ZONECFG_%s_%s",
+ rsrc, inst) >= sizeof (envvar)) {
+ return (NULL);
+ }
+ } else {
+ if (snprintf(envvar, sizeof (envvar), "_ZONECFG_%s_%s_%s",
+ rsrc, inst, prop) >= sizeof (envvar)) {
+ return (NULL);
+ }
+ }
+
+ ret = getenv(envvar);
+
+ dprintf(("%s: '%s=%s'\n", __func__, envvar, ret ? ret : "<null>"));
+
+ return (ret);
+}
+
+static boolean_t
+is_env_true(const char *rsrc, const char *inst, const char *prop)
+{
+ char *val = get_zcfg_var(rsrc, inst, prop);
+
+ return (val != NULL && strcmp(val, "true") == 0);
+}
+
+static boolean_t
+is_env_string(const char *rsrc, const char *inst, const char *prop,
+ const char *val)
+{
+ char *pval = get_zcfg_var(rsrc, inst, prop);
+
+ return (pval != NULL && strcmp(pval, val) == 0);
+}
+
+static int
+add_arg(int *argc, char **argv, const char *val)
+{
+ if (*argc >= ZH_MAXARGS) {
+ (void) printf("Error: too many arguments\n");
+ return (1);
+ }
+ argv[*argc] = strdup(val);
+ assert(argv[*argc] != NULL);
+ dprintf(("%s: argv[%d]='%s'\n", __func__, *argc, argv[*argc]));
+ (*argc)++;
+ return (0);
+}
+
+static int
+add_smbios(int *argc, char **argv)
+{
+ char smbios[MAXPATHLEN];
+ struct utsname utsname;
+ const char *version;
+ const char *uuid;
+
+ if ((uuid = getenv("_ZONECFG_uuid")) != NULL) {
+ if (add_arg(argc, argv, "-U") != 0 ||
+ add_arg(argc, argv, uuid) != 0)
+ return (1);
+ }
+
+ /*
+ * Look for something like joyent_20180329T120303Z. A little mucky, but
+ * it's exactly what sysinfo does.
+ */
+ (void) uname(&utsname);
+ if (strncmp(utsname.version, "joyent_", strlen("joyent_")) == 0)
+ version = utsname.version + strlen("joyent_");
+ else
+ version = "?";
+
+ /*
+ * This is based upon the SMBIOS values we expose to KVM guests.
+ */
+ (void) snprintf(smbios, sizeof (smbios),
+ "1,manufacturer=Joyent,product=SmartDC HVM,version=7.%s,"
+ "serial=%s,sku=001,family=Virtual Machine",
+ version, zonename);
+
+ if (add_arg(argc, argv, "-B") != 0 ||
+ add_arg(argc, argv, smbios) != 0)
+ return (1);
+
+ return (0);
+}
+
/*
 * Pass the "vcpus" zone attribute, when present, to bhyve as "-c <value>".
 * Returns 0 on success (including when the attribute is absent) and 1 if
 * the arguments cannot be added.
 */
static int
add_cpu(int *argc, char **argv)
{
	const char *vcpus = get_zcfg_var("attr", "vcpus", NULL);

	if (vcpus == NULL)
		return (0);

	if (add_arg(argc, argv, "-c") != 0)
		return (1);
	if (add_arg(argc, argv, vcpus) != 0)
		return (1);

	return (0);
}
+
/*
 * Pass the "ram" zone attribute, when present, to bhyve as "-m <value>".
 * Returns 0 on success (including when the attribute is absent) and 1 if
 * the arguments cannot be added.
 */
static int
add_ram(int *argc, char **argv)
{
	const char *ram = get_zcfg_var("attr", "ram", NULL);

	if (ram == NULL)
		return (0);

	if (add_arg(argc, argv, "-m") != 0)
		return (1);
	if (add_arg(argc, argv, ram) != 0)
		return (1);

	return (0);
}
+
+static int
+parse_pcislot(const char *pcislot, uint_t *busp, uint_t *devp, uint_t *funcp)
+{
+ char junk;
+
+ switch (sscanf(pcislot, "%u:%u:%u%c", busp, devp, funcp, &junk)) {
+ case 3:
+ break;
+ case 2:
+ case 1:
+ *funcp = *devp;
+ *devp = *busp;
+ *busp = 0;
+ break;
+ default:
+ (void) printf("Error: device %d has illegal PCI slot: %s\n",
+ *devp, pcislot);
+ return (-1);
+ }
+
+ if (*busp > 255 || *devp > 31 || *funcp > 7) {
+ (void) printf("Error: device %d has illegal PCI slot: %s\n",
+ *devp, pcislot);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * In the initial implementation, slot assignment was dynamic on every boot.
+ * Now, each device resource can have a pci_slot property that will override
+ * dynamic assignment. The original behavior is preserved, but no effort is
+ * made to detect or avoid conflicts between legacy behavior and new behavior.
+ * When used with vmadm, this is not an issue, as it will update the zone
+ * config at boot time to contain static assignments.
+ */
+static int
+add_disk(char *disk, char *path, char *slotconf, size_t slotconf_len)
+{
+ static char *boot = NULL;
+ static int next_cd = 0;
+ static int next_other = 0;
+ custr_t *sconfstr = NULL;
+ const char *model = "virtio-blk";
+ uint_t pcibus = 0, pcidev = 0, pcifn = 0;
+ const char *slotstr;
+ const char *guest_block_size = NULL;
+ boolean_t isboot;
+ boolean_t nodelete = B_FALSE;
+
+ if (custr_alloc_buf(&sconfstr, slotconf, slotconf_len) == -1) {
+ return (-1);
+ }
+
+ isboot = is_env_true("device", disk, "boot");
+ if (isboot) {
+ /* Allow at most one "primary" disk */
+ if (boot != NULL) {
+ (void) printf("Error: multiple boot disks: %s %s\n",
+ boot, path);
+ goto fail;
+ }
+ boot = path;
+ }
+
+ if ((slotstr = get_zcfg_var("device", disk, "pci_slot")) != NULL) {
+ if (parse_pcislot(slotstr, &pcibus, &pcidev, &pcifn) != 0) {
+ goto fail;
+ }
+ } else {
+ if (isboot) {
+ pcidev = PCI_SLOT_BOOT_DISK;
+ pcifn = 0;
+ } else if (is_env_string("device", disk, "media", "cdrom")) {
+ pcidev = PCI_SLOT_CD;
+ pcifn = next_cd;
+ next_cd++;
+ } else {
+ pcidev = PCI_SLOT_OTHER_DISKS;
+ pcifn = next_other;
+ next_other++;
+ }
+ }
+
+ if (is_env_string("device", disk, "model", "virtio")) {
+ model = "virtio-blk";
+ /*
+ * bhyve's blockif code refers to the UNMAP/DISCARD/TRIM
+ * feature as 'delete' and so 'nodelete' is used by
+ * bhyve to disable the feature. We use 'trim' for
+ * interfaces we expose to the operator as that seems to
+ * be the most familiar name for the operation (and less
+ * likely to cause confusion).
+ */
+ nodelete = is_env_true("device", disk, "notrim");
+ guest_block_size = get_zcfg_var("device", disk,
+ "guest_block_size");
+
+ /* Treat a 0 size to mean the whatever the volume advertises */
+ if (guest_block_size != NULL &&
+ strcmp(guest_block_size, "0") == 0) {
+ guest_block_size = NULL;
+ }
+ } else if (is_env_string("device", disk, "model", "nvme")) {
+ model = "nvme";
+ } else if (is_env_string("device", disk, "model", "ahci")) {
+ if (is_env_string("device", disk, "media", "cdrom")) {
+ model = "ahci-cd";
+ } else {
+ model = "ahci-hd";
+ }
+ } else {
+ (void) printf("Error: unknown disk model '%s'\n", model);
+ goto fail;
+ }
+
+ if (custr_append_printf(sconfstr, "%u:%u:%u,%s,%s",
+ pcibus, pcidev, pcifn, model, path) == -1) {
+ (void) printf("Error: disk path '%s' too long\n", path);
+ goto fail;
+ }
+
+ if (nodelete && custr_append(sconfstr, ",nodelete") == -1) {
+ (void) printf("Error: too many disk options\n");
+ goto fail;
+ }
+
+ if (guest_block_size != NULL && custr_append_printf(sconfstr,
+ ",sectorsize=%s", guest_block_size) == -1) {
+ (void) printf("Error: too many disk options\n");
+ goto fail;
+ }
+
+ custr_free(sconfstr);
+ return (0);
+
+fail:
+ custr_free(sconfstr);
+ return (-1);
+}
+
+static int
+add_ppt(int *argc, char **argv, char *ppt, char *path, char *slotconf,
+ size_t slotconf_len)
+{
+ static boolean_t wired = B_FALSE;
+ static boolean_t acpi = B_FALSE;
+ uint_t bus = 0, dev = 0, func = 0;
+ char *pcislot;
+
+ pcislot = get_zcfg_var("device", ppt, "pci_slot");
+
+ if (pcislot == NULL) {
+ (void) printf("Error: device %s has no PCI slot\n", ppt);
+ return (-1);
+ }
+
+ if (parse_pcislot(pcislot, &bus, &dev, &func) != 0) {
+ return (-1);
+ }
+
+ if (bus > 0) {
+ if (!acpi && add_arg(argc, argv, "-A") != 0)
+ return (-1);
+ acpi = B_TRUE;
+ }
+
+ if (!wired && add_arg(argc, argv, "-S") != 0)
+ return (-1);
+
+ wired = B_TRUE;
+
+ if (snprintf(slotconf, slotconf_len, "%d:%d:%d,passthru,%s",
+ bus, dev, func, path) >= slotconf_len) {
+ (void) printf("Error: device path '%s' too long\n", path);
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+add_devices(int *argc, char **argv)
+{
+ char *devices;
+ char *dev;
+ char *lasts;
+ char slotconf[MAXNAMELEN];
+
+ if ((devices = get_zcfg_var("device", "resources", NULL)) == NULL) {
+ return (0);
+ }
+
+ for (dev = strtok_r(devices, " ", &lasts); dev != NULL;
+ dev = strtok_r(NULL, " ", &lasts)) {
+ int ret;
+ char *path;
+ char *model;
+
+ /* zoneadmd is not careful about a trailing delimiter. */
+ if (dev[0] == '\0') {
+ continue;
+ }
+
+ if ((path = get_zcfg_var("device", dev, "path")) == NULL) {
+ (void) printf("Error: device %s has no path\n", dev);
+ return (-1);
+ }
+
+ if ((model = get_zcfg_var("device", dev, "model")) == NULL) {
+ (void) printf("Error: device %s has no model\n", dev);
+ return (-1);
+ }
+
+ if (strcmp(model, "passthru") == 0) {
+ ret = add_ppt(argc, argv, dev, path, slotconf,
+ sizeof (slotconf));
+ } else {
+ ret = add_disk(dev, path, slotconf, sizeof (slotconf));
+ }
+
+ if (ret != 0)
+ return (-1);
+
+ if (add_arg(argc, argv, "-s") != 0 ||
+ add_arg(argc, argv, slotconf) != 0) {
+ return (-1);
+ }
+ }
+
+ return (0);
+}
+
+static int
+add_nets(int *argc, char **argv)
+{
+ char *nets;
+ char *net;
+ char *lasts;
+ int nextpcifn = 1; /* 0 reserved for primary */
+ char slotconf[MAXNAMELEN];
+ char *primary = NULL;
+
+ if ((nets = get_zcfg_var("net", "resources", NULL)) == NULL ||
+ strcmp(nets, "") == 0) {
+ return (0);
+ }
+
+ for (net = strtok_r(nets, " ", &lasts); net != NULL;
+ net = strtok_r(NULL, " ", &lasts)) {
+ int pcifn;
+
+ /* zoneadmd is not careful about a trailing delimiter. */
+ if (net[0] == '\0') {
+ continue;
+ }
+
+ /* Allow at most one "primary" net */
+ if (is_env_true("net", net, "primary")) {
+ if (primary != NULL) {
+ (void) printf("Error: "
+ "multiple primary nets: %s %s\n",
+ primary, net);
+ return (-1);
+ }
+ primary = net;
+ pcifn = 0;
+ } else {
+ pcifn = nextpcifn;
+ nextpcifn++;
+ }
+
+ if (snprintf(slotconf, sizeof (slotconf),
+ "%d:%d,virtio-net-viona,%s", PCI_SLOT_NICS, pcifn, net) >=
+ sizeof (slotconf)) {
+ (void) printf("Error: net '%s' too long\n", net);
+ return (-1);
+ }
+
+ if (add_arg(argc, argv, "-s") != 0 ||
+ add_arg(argc, argv, slotconf) != 0) {
+ return (-1);
+ }
+ }
+
+ /* Make sure there is a "primary" net */
+ if (primary == NULL) {
+ (void) printf("Error: no primary net has been specified\n");
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+add_lpc(int *argc, char **argv)
+{
+ char *lpcdevs[] = { "bootrom", "com1", "com2", NULL };
+ const int bootrom_idx = 0;
+ int i;
+ char *val;
+ char conf[MAXPATHLEN];
+ boolean_t found_bootrom = B_FALSE;
+
+ assert(strcmp(lpcdevs[bootrom_idx], "bootrom") == 0);
+
+ (void) snprintf(conf, sizeof (conf), "%d,lpc", PCI_SLOT_LPC);
+ if (add_arg(argc, argv, "-s") != 0 ||
+ add_arg(argc, argv, conf) != 0) {
+ return (-1);
+ }
+
+ for (i = 0; lpcdevs[i] != NULL; i++) {
+ if ((val = get_zcfg_var("attr", lpcdevs[i], NULL)) == NULL) {
+ continue;
+ }
+ if (i == bootrom_idx) {
+ found_bootrom = B_TRUE;
+ if (strcmp(val, "bios") == 0) {
+ val = DEFAULT_BOOTROM_CSM;
+ } else if (strcmp(val, "uefi") == 0) {
+ val = DEFAULT_BOOTROM;
+ }
+ }
+ if (snprintf(conf, sizeof (conf), "%s,%s", lpcdevs[i], val) >=
+ sizeof (conf)) {
+ (void) printf("Error: value of attr '%s' too long\n",
+ lpcdevs[i]);
+ return (-1);
+ }
+ if (add_arg(argc, argv, "-l") != 0 ||
+ add_arg(argc, argv, conf) != 0) {
+ return (-1);
+ }
+ }
+
+ if (!found_bootrom) {
+ if (add_arg(argc, argv, "-l") != 0 ||
+ add_arg(argc, argv, "bootrom," DEFAULT_BOOTROM_CSM) != 0) {
+ return (-1);
+ }
+ }
+
+ return (0);
+}
+
+static int
+add_hostbridge(int *argc, char **argv)
+{
+ char conf[MAXPATHLEN];
+ char *model = NULL;
+ boolean_t raw_config = B_FALSE;
+
+ if ((model = get_zcfg_var("attr", "hostbridge", NULL)) != NULL) {
+ /* Easy bypass for doing testing */
+ if (strcmp("none", model) == 0) {
+ return (0);
+ }
+
+ if (strchr(model, '=') != NULL) {
+ /*
+ * If the attribute contains '=', assume the creator
+ * wants total control over the config. Do not prepend
+ * the value with 'model='.
+ */
+ raw_config = B_TRUE;
+ }
+ }
+
+ /* Default to Natoma if nothing else is specified */
+ if (model == NULL) {
+ model = "i440fx";
+ }
+
+ (void) snprintf(conf, sizeof (conf), "%d,hostbridge,%s%s",
+ PCI_SLOT_HOSTBRIDGE, raw_config ? "" : "model=", model);
+ if (add_arg(argc, argv, "-s") != 0 ||
+ add_arg(argc, argv, conf) != 0) {
+ return (-1);
+ }
+ return (0);
+}
+
/*
 * Split the "bhyve_extra_opts" zone attribute on spaces and tabs and append
 * each word to the argument vector as its own argument.
 *
 * Returns 0 on success (including when the attribute is absent) and -1 on
 * failure.
 */
static int
add_bhyve_extra_opts(int *argc, char **argv)
{
	char *val;
	char *tok;
	char *lasts;
	int rv = 0;

	if ((val = get_zcfg_var("attr", "bhyve_extra_opts", NULL)) == NULL) {
		return (0);
	}

	/* strtok_r() modifies its input, so work on a private copy. */
	val = strdup(val);
	if (val == NULL) {
		(void) printf("Error: strdup failed\n");
		return (-1);
	}

	for (tok = strtok_r(val, " \t", &lasts); tok != NULL;
	    tok = strtok_r(NULL, " \t", &lasts)) {
		if (tok[0] == '\0') {
			continue;
		}
		if (add_arg(argc, argv, tok) != 0) {
			rv = -1;
			break;
		}
	}

	/* add_arg() stores its own copies, so the buffer is always freed. */
	free(val);
	return (rv);
}
+
#define	INVALID_CHAR (char)(255)

/*
 * Map one character of base64 text to its 6-bit value (0 - 63).  The padding
 * character '=' maps to 0.  Any character outside the base64 alphabet,
 * including NUL, yields INVALID_CHAR.
 */
static char
decode_char(char encoded)
{
	static const char alphabet[] =
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	    "abcdefghijklmnopqrstuvwxyz"
	    "0123456789+/";
	const char *pos;

	if (encoded == '=')
		return (0);

	/* strchr() would match the alphabet's terminating NUL. */
	if (encoded == '\0')
		return (INVALID_CHAR);

	pos = strchr(alphabet, encoded);
	if (pos == NULL)
		return (INVALID_CHAR);

	/* The position within the alphabet is the decoded value. */
	return ((char)(pos - alphabet));
}
+
+static int
+add_base64(custr_t *cus, const char *b64)
+{
+ size_t b64len = strlen(b64);
+
+ if (b64len == 0 || b64len % 4 != 0)
+ return (-1);
+
+ while (b64len > 0) {
+ uint_t padding = 0;
+ char c0 = decode_char(b64[0]);
+ char c1 = decode_char(b64[1]);
+ char c2 = decode_char(b64[2]);
+ char c3 = decode_char(b64[3]);
+
+ if (c0 == INVALID_CHAR || c1 == INVALID_CHAR ||
+ c2 == INVALID_CHAR || c3 == INVALID_CHAR) {
+ (void) printf("Error: base64 value contains invalid "
+ "character(s)\n");
+ return (-1);
+ }
+
+ /*
+ * For each block of 4 input characters, an '=' should
+ * only appear as the last two characters.
+ */
+ if (b64[0] == '=' || b64[1] == '=') {
+ (void) printf("Error: base64 value contains invalid "
+ "padding\n");
+ return (-1);
+ }
+
+ if (b64len == 4) {
+ /*
+ * We can end with '==' or '=', but never '='
+ * followed by something else.
+ */
+ if (b64[2] == '=') {
+ if (b64[3] != '=') {
+ (void) printf("Error: base64 value "
+ "contains invalid padding\n");
+ return (-1);
+ }
+ padding = 2;
+ } else if (b64[3] == '=') {
+ padding = 1;
+ }
+ }
+
+ VERIFY0(custr_appendc(cus, c0 << 2 | c1 >> 4));
+ if (padding < 2)
+ VERIFY0(custr_appendc(cus, c1 << 4 | c2 >> 2));
+ if (padding < 1)
+ VERIFY0(custr_appendc(cus, c2 << 6 | c3));
+
+ b64len -= 4;
+ b64 += 4;
+ }
+
+ return (0);
+}
+
+/*
+ * Adds the frame buffer and an xhci tablet to help with the pointer.
+ */
+static int
+add_fbuf(int *argc, char **argv)
+{
+ char conf[MAXPATHLEN];
+ custr_t *cconf = NULL;
+ char *password = NULL;
+
+ /*
+ * Do not add a frame buffer or tablet if VNC is disabled.
+ */
+ if (is_env_string("attr", "vnc_port", NULL, "-1")) {
+ return (0);
+ }
+
+ if (custr_alloc_buf(&cconf, conf, sizeof (conf)) != 0) {
+ return (-1);
+ }
+
+ VERIFY0(custr_append_printf(cconf, "%d:0,fbuf,vga=off,unix=/tmp/vm.vnc",
+ PCI_SLOT_FBUF));
+
+ password = get_zcfg_var("attr", "vnc_password", NULL);
+ if (password != NULL) {
+ VERIFY0(custr_append(cconf, ",password="));
+
+ if (add_base64(cconf, password) != 0) {
+ goto fail;
+ }
+ }
+
+ if (add_arg(argc, argv, "-s") != 0 ||
+ add_arg(argc, argv, conf) != 0) {
+ goto fail;
+ }
+
+ custr_reset(cconf);
+ VERIFY0(custr_append_printf(cconf, "%d:1,xhci,tablet", PCI_SLOT_FBUF));
+
+ if (add_arg(argc, argv, "-s") != 0 ||
+ add_arg(argc, argv, conf) != 0) {
+ goto fail;
+ }
+
+ /*
+ * Since cconf was allocated using custr_alloc_buf() where 'conf'
+ * is the underlying fixed buffer for cconf, we can free cconf
+ * which in this instance will just free cconf, but _not_ the
+ * underlying fixed buffer (conf) which is left unchanged by
+ * custr_free().
+ */
+
+ custr_free(cconf);
+ return (0);
+
+fail:
+ custr_free(cconf);
+ return (-1);
+}
+
/*
 * Append the VM name, "SYSbhyve-<did>", where <did> is the value of
 * _ZONECFG_did.  "SYSbhyve-unknown" is used when that variable is unset or
 * empty.  Returns 0 on success and -1 on failure.
 *
 * Must be called last
 */
static int
add_vmname(int *argc, char **argv)
{
	char name[229];		/* VM_MAX_NAMELEN */
	const char *did = getenv("_ZONECFG_did");
	const char *vmname;

	if (did != NULL && did[0] != '\0') {
		(void) snprintf(name, sizeof (name), "SYSbhyve-%s", did);
		vmname = name;
	} else {
		vmname = "SYSbhyve-unknown";
	}

	return (add_arg(argc, argv, vmname) != 0 ? -1 : 0);
}
+
/*
 * Write the entire buffer or return an error. This function could be more
 * paranoid and call fdsync() at the end. That's not really needed for this use
 * case because it is being written to tmpfs.
 *
 *	fd	descriptor to write to
 *	buf	data to write
 *	buflen	number of bytes in buf
 *
 * Returns 0 once all buflen bytes have been written, or -1 with errno set if
 * a write fails.  EAGAIN and EINTR are retried.
 */
static int
full_write(int fd, char *buf, size_t buflen)
{
	size_t totwritten = 0;

	while (totwritten < buflen) {
		ssize_t nwritten;

		nwritten = write(fd, buf + totwritten, buflen - totwritten);
		if (nwritten < 0) {
			if (errno == EAGAIN || errno == EINTR) {
				continue;
			}
			return (-1);
		}

		/*
		 * A write that makes no progress would otherwise spin here
		 * forever; report it as an I/O error instead.
		 */
		if (nwritten == 0) {
			errno = EIO;
			return (-1);
		}

		totwritten += (size_t)nwritten;
	}

	return (0);
}
+
+static void
+init_debug(void)
+{
+ char *val = getenv("_ZONEADMD_brand_debug");
+
+ debug = (val != NULL && val[0] != '\0');
+}
+
+static int
+setup_reboot(void)
+{
+ zoneid_t zoneid;
+
+ if ((zoneid = getzoneidbyname(zonename)) < 0) {
+ (void) printf("Error: bhyve zoneid (%s) does not exist\n",
+ zonename);
+ return (-1);
+ }
+
+ if (zoneid == GLOBAL_ZONEID) {
+ (void) printf("Error: bhyve global zoneid (%s)\n", zonename);
+ return (-1);
+ }
+
+ if (zone_setattr(zoneid, ZONE_ATTR_INITRESTART0, NULL, 0) < 0) {
+ (void) printf("Error: bhyve zoneid %ld setattr failed: %s\n",
+ zoneid, strerror(errno));
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+main(int argc, char **argv)
+{
+ int fd, err;
+ char *zhargv[ZH_MAXARGS] = {
+ "bhyve", /* Squats on argv[0] */
+ "-H", /* vmexit on halt isns */
+ NULL };
+ int zhargc = 2;
+ nvlist_t *nvl;
+ char *nvbuf = NULL;
+ size_t nvbuflen = 0;
+ char zoneroot[MAXPATHLEN];
+ int zrfd;
+
+ init_debug();
+
+ if (argc != 3) {
+ (void) printf("Error: bhyve boot program called with "
+ "%d args, expecting 2\n", argc - 1);
+ return (1);
+ }
+ zonename = argv[1];
+ zonepath = argv[2];
+
+ if (setup_reboot() < 0)
+ return (1);
+
+ if (add_smbios(&zhargc, (char **)&zhargv) != 0 ||
+ add_lpc(&zhargc, (char **)&zhargv) != 0 ||
+ add_hostbridge(&zhargc, (char **)&zhargv) != 0 ||
+ add_cpu(&zhargc, (char **)&zhargv) != 0 ||
+ add_ram(&zhargc, (char **)&zhargv) != 0 ||
+ add_devices(&zhargc, (char **)&zhargv) != 0 ||
+ add_nets(&zhargc, (char **)&zhargv) != 0 ||
+ add_bhyve_extra_opts(&zhargc, (char **)&zhargv) != 0 ||
+ add_fbuf(&zhargc, (char **)&zhargv) != 0 ||
+ add_vmname(&zhargc, (char **)&zhargv) != 0) {
+ return (1);
+ }
+
+ /*
+ * This and other dynamically allocated resources are intentionally
+ * leaked. It's a short-lived program and it will all get mopped up on
+ * exit.
+ */
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
+ nvlist_add_string_array(nvl, "bhyve_args", zhargv, zhargc) != 0) {
+ (void) printf("Error: failed to create nvlist: %s\n",
+ strerror(errno));
+ return (1);
+ }
+
+ if (debug) {
+ dprintf(("packing nvlist:\n"));
+ nvlist_print(stdout, nvl);
+ }
+
+ err = nvlist_pack(nvl, &nvbuf, &nvbuflen, NV_ENCODE_XDR, 0);
+ if (err != 0) {
+ (void) printf("Error: failed to pack nvlist: %s\n",
+ strerror(err));
+ return (1);
+ }
+
+ if (snprintf(zoneroot, sizeof (zoneroot), "%s/root", zonepath) >=
+ sizeof (zoneroot)) {
+ (void) printf("Error: zonepath '%s' too long\n", zonepath);
+ return (1);
+ }
+
+ if ((zrfd = open(zoneroot, O_RDONLY|O_SEARCH)) < 0) {
+ (void) printf("Error: cannot open zone root '%s': %s\n",
+ zoneroot, strerror(errno));
+ return (1);
+ }
+
+ /*
+ * This mkdirat() and the subsequent openat() are only safe because the
+ * zone root is always under the global zone's exclusive control (always
+ * read-only in all zones) and the writable directory is a tmpfs file
+ * system that was just mounted and no zone code has run yet.
+ */
+ if (mkdirat(zrfd, BHYVE_DIR, 0700) != 0 && errno != EEXIST) {
+ (void) printf("Error: failed to create directory %s "
+ "in zone: %s\n", BHYVE_DIR, strerror(errno));
+ return (1);
+ }
+
+ fd = openat(zrfd, BHYVE_ARGS_FILE, O_WRONLY|O_CREAT|O_EXCL, 0600);
+ if (fd < 0) {
+ (void) printf("Error: failed to create file %s in zone: %s\n",
+ BHYVE_ARGS_FILE, strerror(errno));
+ return (1);
+ }
+ if (full_write(fd, nvbuf, nvbuflen) != 0) {
+ (void) printf("Error: failed to write %s: %s\n",
+ BHYVE_ARGS_FILE, strerror(errno));
+ (void) unlink(BHYVE_ARGS_FILE);
+ return (1);
+ }
+
+ return (0);
+}