summary | refs | log | tree | commit | diff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r-- usr/src/cmd/bhyve/block_if.c 59
-rw-r--r-- usr/src/cmd/bhyve/pci_virtio_block.c 128
2 files changed, 166 insertions, 21 deletions
diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c
index 72c5b02a0d..8278bf3f92 100644
--- a/usr/src/cmd/bhyve/block_if.c
+++ b/usr/src/cmd/bhyve/block_if.c
@@ -29,7 +29,7 @@
*/
/*
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
#include <sys/cdefs.h>
@@ -364,9 +364,40 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
else
br->br_resid = 0;
}
-#endif
else
err = EOPNOTSUPP;
+#else
+ else if (bc->bc_ischr) {
+ dkioc_free_list_t dfl = {
+ .dfl_num_exts = 1,
+ .dfl_offset = 0,
+ .dfl_flags = 0,
+ .dfl_exts = {
+ {
+ .dfle_start = br->br_offset,
+ .dfle_length = br->br_resid
+ }
+ }
+ };
+
+ if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
+ err = errno;
+ else
+ br->br_resid = 0;
+ } else {
+ struct flock fl = {
+ .l_whence = 0,
+ .l_type = F_WRLCK,
+ .l_start = br->br_offset,
+ .l_len = br->br_resid
+ };
+
+ if (fcntl(bc->bc_fd, F_FREESP, &fl))
+ err = errno;
+ else
+ br->br_resid = 0;
+ }
+#endif
break;
default:
err = EINVAL;
@@ -475,6 +506,8 @@ blockif_open(const char *optstr, const char *ident)
off_t size, psectsz, psectoff;
int extra, fd, i, sectsz;
int nocache, sync, ro, candelete, geom, ssopt, pssopt;
+ int nodelete;
+
#ifndef WITHOUT_CAPSICUM
cap_rights_t rights;
cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
@@ -487,6 +520,7 @@ blockif_open(const char *optstr, const char *ident)
nocache = 0;
sync = 0;
ro = 0;
+ nodelete = 0;
/*
* The first element in the optstring is always a pathname.
@@ -499,6 +533,8 @@ blockif_open(const char *optstr, const char *ident)
continue;
else if (!strcmp(cp, "nocache"))
nocache = 1;
+ else if (!strcmp(cp, "nodelete"))
+ nodelete = 1;
else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
sync = 1;
else if (!strcmp(cp, "ro"))
@@ -566,7 +602,7 @@ blockif_open(const char *optstr, const char *ident)
ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
arg.len = sizeof(arg.value.i);
- if (ioctl(fd, DIOCGATTR, &arg) == 0)
+ if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
candelete = arg.value.i;
if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
geom = 1;
@@ -619,6 +655,10 @@ blockif_open(const char *optstr, const char *ident)
}
}
}
+
+ if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
+ candelete = 0;
+
} else {
int flags;
@@ -628,6 +668,19 @@ blockif_open(const char *optstr, const char *ident)
wce = WCE_FCNTL;
}
}
+
+ /*
+ * We don't have a way to discover if a file supports the
+ * FREESP fcntl cmd (other than trying it). However,
+ * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
+ * NFSv3 and NFSv4 also forward the FREESP request
+ * to the server, so we always enable it for file based
+ * volumes. Anyone trying to run volumes on an unsupported
+ * configuration is on their own, and should be prepared
+ * for the requests to fail.
+ */
+ if (nodelete == 0)
+ candelete = 1;
}
#endif
diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c
index 5a7ecbfe9e..406a232710 100644
--- a/usr/src/cmd/bhyve/pci_virtio_block.c
+++ b/usr/src/cmd/bhyve/pci_virtio_block.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
- * Copyright (c) 2019 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -39,7 +39,6 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2014 Pluribus Networks Inc.
- * Copyright 2018 Joyent, Inc.
*/
#include <sys/cdefs.h>
@@ -69,26 +68,36 @@ __FBSDID("$FreeBSD$");
#include "virtio.h"
#include "block_if.h"
-#define VTBLK_RINGSZ 128
+#define VTBLK_BSIZE 512
+#define VTBLK_RINGSZ 128
_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
-#define VTBLK_S_OK 0
-#define VTBLK_S_IOERR 1
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
#define VTBLK_S_UNSUPP 2
#define VTBLK_BLK_ID_BYTES 20 + 1
/* Capability bits */
-#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */
-#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */
-#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */
-#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */
+#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */
+#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */
+#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */
+#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */
+#define VTBLK_F_RO (1 << 5) /* Disk is read-only */
+#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/
+#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */
+#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */
+#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */
+#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */
+#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */
+#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */
+#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */
/*
* Host capabilities
*/
-#define VTBLK_S_HOSTCAPS \
+#define VTBLK_S_HOSTCAPS \
( VTBLK_F_SEG_MAX | \
VTBLK_F_BLK_SIZE | \
VTBLK_F_FLUSH | \
@@ -96,6 +105,18 @@ _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able t
VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
/*
+ * The current blockif_delete() interface only allows a single delete
+ * request at a time.
+ */
+#define VTBLK_MAX_DISCARD_SEG 1
+
+/*
+ * An arbitrary limit to prevent excessive latency due to large
+ * delete requests.
+ */
+#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */
+
+/*
* Config space "registers"
*/
struct vtblk_config {
@@ -115,6 +136,14 @@ struct vtblk_config {
uint32_t opt_io_size;
} vbc_topology;
uint8_t vbc_writeback;
+ uint8_t unused0[3];
+ uint32_t max_discard_sectors;
+ uint32_t max_discard_seg;
+ uint32_t discard_sector_alignment;
+ uint32_t max_write_zeroes_sectors;
+ uint32_t max_write_zeroes_seg;
+ uint8_t write_zeroes_may_unmap;
+ uint8_t unused1[3];
} __packed;
/*
@@ -123,9 +152,14 @@ struct vtblk_config {
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
+#define VBH_OP_SCSI_CMD 2
+#define VBH_OP_SCSI_CMD_OUT 3
#define VBH_OP_FLUSH 4
#define VBH_OP_FLUSH_OUT 5
#define VBH_OP_IDENT 8
+#define VBH_OP_DISCARD 11
+#define VBH_OP_WRITE_ZEROES 13
+
#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
uint32_t vbh_type;
uint32_t vbh_ioprio;
@@ -136,8 +170,8 @@ struct virtio_blk_hdr {
* Debug printf
*/
static int pci_vtblk_debug;
-#define DPRINTF(params) if (pci_vtblk_debug) printf params
-#define WPRINTF(params) printf params
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
struct pci_vtblk_ioreq {
struct blockif_req io_req;
@@ -146,6 +180,15 @@ struct pci_vtblk_ioreq {
uint16_t io_idx;
};
+struct virtio_blk_discard_write_zeroes {
+ uint64_t sector;
+ uint32_t num_sectors;
+ struct {
+ uint32_t unmap:1;
+ uint32_t reserved:31;
+ } flags;
+};
+
/*
* Per-device softc
*/
@@ -154,6 +197,7 @@ struct pci_vtblk_softc {
pthread_mutex_t vsc_mtx;
struct vqueue_info vbsc_vq;
struct vtblk_config vbsc_cfg;
+ struct virtio_consts vbsc_consts;
struct blockif_ctxt *bc;
#ifndef __FreeBSD__
int vbsc_wce;
@@ -243,6 +287,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
int writeop, type;
struct iovec iov[BLOCKIF_IOV_MAX + 2];
uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
+ struct virtio_blk_discard_write_zeroes *discard;
n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);
@@ -262,7 +307,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
io->io_req.br_iovcnt = n - 2;
- io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE;
+ io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
io->io_status = (uint8_t *)iov[--n].iov_base;
assert(iov[n].iov_len == 1);
assert(flags[n] & VRING_DESC_F_WRITE);
@@ -273,7 +318,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
* we don't advertise the capability.
*/
type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
- writeop = (type == VBH_OP_WRITE);
+ writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
iolen = 0;
for (i = 1; i < n; i++) {
@@ -289,7 +334,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
io->io_req.br_resid = iolen;
DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r",
- writeop ? "write" : "read/ident", iolen, i - 1,
+ writeop ? "write/discard" : "read/ident", iolen, i - 1,
io->io_req.br_offset));
switch (type) {
@@ -299,6 +344,46 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
case VBH_OP_WRITE:
err = blockif_write(sc->bc, &io->io_req);
break;
+ case VBH_OP_DISCARD:
+ /*
+ * We currently only support a single request. If the guest
+ * has submitted a request that doesn't conform to the
+ * requirements, we return an error.
+ */
+ if (iov[1].iov_len != sizeof (*discard)) {
+ pci_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ /* The segments to discard are provided rather than data */
+ discard = (struct virtio_blk_discard_write_zeroes *)
+ iov[1].iov_base;
+
+ /*
+ * virtio v1.1 5.2.6.2:
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
+ * for discard and write zeroes commands if any unknown flag is
+ * set. Furthermore, the device MUST set the status byte to
+ * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
+ * is set.
+ *
+ * Currently there are no known flags for a DISCARD request.
+ */
+ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
+ pci_vtblk_done_locked(io, ENOTSUP);
+ return;
+ }
+
+ /* Make sure the request doesn't exceed our size limit */
+ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
+ pci_vtblk_done_locked(io, EINVAL);
+ return;
+ }
+
+ io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
+ io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
+ err = blockif_delete(sc->bc, &io->io_req);
+ break;
case VBH_OP_FLUSH:
case VBH_OP_FLUSH_OUT:
err = blockif_flush(sc->bc, &io->io_req);
@@ -348,7 +433,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
*/
snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
bctxt = blockif_open(opts, bident);
- if (bctxt == NULL) {
+ if (bctxt == NULL) {
perror("Could not open backing file");
return (1);
}
@@ -367,6 +452,10 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
io->io_idx = i;
}
+ bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
+ if (blockif_candelete(sc->bc))
+ sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
+
#ifndef __FreeBSD__
/* Disable write cache until FLUSH feature is negotiated */
(void) blockif_set_wce(sc->bc, 0);
@@ -376,7 +465,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pthread_mutex_init(&sc->vsc_mtx, NULL);
/* init virtio softc and virtqueues */
- vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
+ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq);
sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
@@ -394,7 +483,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
/* setup virtio block config space */
- sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
+ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
/*
@@ -416,6 +505,9 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->vbsc_cfg.vbc_topology.min_io_size = 0;
sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
sc->vbsc_cfg.vbc_writeback = 0;
+ sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
+ sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG;
+ sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE;
/*
* Should we move some of this into virtio.c? Could