diff options
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/bhyve/block_if.c | 59 | ||||
-rw-r--r-- | usr/src/cmd/bhyve/pci_virtio_block.c | 128 |
2 files changed, 166 insertions, 21 deletions
diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c index 72c5b02a0d..8278bf3f92 100644 --- a/usr/src/cmd/bhyve/block_if.c +++ b/usr/src/cmd/bhyve/block_if.c @@ -29,7 +29,7 @@ */ /* - * Copyright 2018 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ #include <sys/cdefs.h> @@ -364,9 +364,40 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) else br->br_resid = 0; } -#endif else err = EOPNOTSUPP; +#else + else if (bc->bc_ischr) { + dkioc_free_list_t dfl = { + .dfl_num_exts = 1, + .dfl_offset = 0, + .dfl_flags = 0, + .dfl_exts = { + { + .dfle_start = br->br_offset, + .dfle_length = br->br_resid + } + } + }; + + if (ioctl(bc->bc_fd, DKIOCFREE, &dfl)) + err = errno; + else + br->br_resid = 0; + } else { + struct flock fl = { + .l_whence = 0, + .l_type = F_WRLCK, + .l_start = br->br_offset, + .l_len = br->br_resid + }; + + if (fcntl(bc->bc_fd, F_FREESP, &fl)) + err = errno; + else + br->br_resid = 0; + } +#endif break; default: err = EINVAL; @@ -475,6 +506,8 @@ blockif_open(const char *optstr, const char *ident) off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; + int nodelete; + #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; @@ -487,6 +520,7 @@ blockif_open(const char *optstr, const char *ident) nocache = 0; sync = 0; ro = 0; + nodelete = 0; /* * The first element in the optstring is always a pathname. 
@@ -499,6 +533,8 @@ blockif_open(const char *optstr, const char *ident) continue; else if (!strcmp(cp, "nocache")) nocache = 1; + else if (!strcmp(cp, "nodelete")) + nodelete = 1; else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; else if (!strcmp(cp, "ro")) @@ -566,7 +602,7 @@ blockif_open(const char *optstr, const char *ident) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); - if (ioctl(fd, DIOCGATTR, &arg) == 0) + if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; @@ -619,6 +655,10 @@ blockif_open(const char *optstr, const char *ident) } } } + + if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete)) + candelete = 0; + } else { int flags; @@ -628,6 +668,19 @@ blockif_open(const char *optstr, const char *ident) wce = WCE_FCNTL; } } + + /* + * We don't have a way to discover if a file supports the + * FREESP fcntl cmd (other than trying it). However, + * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd. + * NFSv3 and NFSv4 also forward the FREESP request + * to the server, so we always enable it for file based + * volumes. Anyone trying to run volumes on an unsupported + * configuration is on their own, and should be prepared + * for the requests to fail. + */ + if (nodelete == 0) + candelete = 1; } #endif diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c index 5a7ecbfe9e..406a232710 100644 --- a/usr/src/cmd/bhyve/pci_virtio_block.c +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -3,7 +3,7 @@ * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. - * Copyright (c) 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,7 +39,6 @@ * http://www.illumos.org/license/CDDL. 
* * Copyright 2014 Pluribus Networks Inc. - * Copyright 2018 Joyent, Inc. */ #include <sys/cdefs.h> @@ -69,26 +68,36 @@ __FBSDID("$FreeBSD$"); #include "virtio.h" #include "block_if.h" -#define VTBLK_RINGSZ 128 +#define VTBLK_BSIZE 512 +#define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); -#define VTBLK_S_OK 0 -#define VTBLK_S_IOERR 1 +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ -#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ -#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ -#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ -#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ +#define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ +#define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ +#define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ +#define VTBLK_F_RO (1 << 5) /* Disk is read-only */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ +#define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ +#define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ +#define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ +#define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ +#define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ +#define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ -#define VTBLK_S_HOSTCAPS \ +#define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ @@ -96,6 +105,18 @@ _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able t VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* + * The current blockif_delete() 
interface only allows a single delete + * request at a time. + */ +#define VTBLK_MAX_DISCARD_SEG 1 + +/* + * An arbitrary limit to prevent excessive latency due to large + * delete requests. + */ +#define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ + +/* * Config space "registers" */ struct vtblk_config { @@ -115,6 +136,14 @@ struct vtblk_config { uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; + uint8_t unused0[3]; + uint32_t max_discard_sectors; + uint32_t max_discard_seg; + uint32_t discard_sector_alignment; + uint32_t max_write_zeroes_sectors; + uint32_t max_write_zeroes_seg; + uint8_t write_zeroes_may_unmap; + uint8_t unused1[3]; } __packed; /* @@ -123,9 +152,14 @@ struct vtblk_config { struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 +#define VBH_OP_SCSI_CMD 2 +#define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 +#define VBH_OP_DISCARD 11 +#define VBH_OP_WRITE_ZEROES 13 + #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; @@ -136,8 +170,8 @@ struct virtio_blk_hdr { * Debug printf */ static int pci_vtblk_debug; -#define DPRINTF(params) if (pci_vtblk_debug) printf params -#define WPRINTF(params) printf params +#define DPRINTF(params) if (pci_vtblk_debug) printf params +#define WPRINTF(params) printf params struct pci_vtblk_ioreq { struct blockif_req io_req; @@ -146,6 +180,15 @@ struct pci_vtblk_ioreq { uint16_t io_idx; }; +struct virtio_blk_discard_write_zeroes { + uint64_t sector; + uint32_t num_sectors; + struct { + uint32_t unmap:1; + uint32_t reserved:31; + } flags; +}; + /* * Per-device softc */ @@ -154,6 +197,7 @@ struct pci_vtblk_softc { pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; + struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; #ifndef __FreeBSD__ int vbsc_wce; @@ -243,6 +287,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) int 
writeop, type; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); @@ -262,7 +307,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; - io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); @@ -273,7 +318,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; - writeop = (type == VBH_OP_WRITE); + writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); iolen = 0; for (i = 1; i < n; i++) { @@ -289,7 +334,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", - writeop ? "write" : "read/ident", iolen, i - 1, + writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { @@ -299,6 +344,46 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; + case VBH_OP_DISCARD: + /* + * We currently only support a single request; if the guest + * has submitted a request that doesn't conform to the + * requirements, we return an error. 
+ */ + if (iov[1].iov_len != sizeof (*discard)) { + pci_vtblk_done_locked(io, EINVAL); + return; + } + + /* The segments to discard are provided rather than data */ + discard = (struct virtio_blk_discard_write_zeroes *) + iov[1].iov_base; + + /* + * virtio v1.1 5.2.6.2: + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP + * for discard and write zeroes commands if any unknown flag is + * set. Furthermore, the device MUST set the status byte to + * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag + * is set. + * + * Currently there are no known flags for a DISCARD request. + */ + if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { + pci_vtblk_done_locked(io, ENOTSUP); + return; + } + + /* Make sure the request doesn't exceed our size limit */ + if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { + pci_vtblk_done_locked(io, EINVAL); + return; + } + + io->io_req.br_offset = discard->sector * VTBLK_BSIZE; + io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; + err = blockif_delete(sc->bc, &io->io_req); + break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); @@ -348,7 +433,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(opts, bident); - if (bctxt == NULL) { + if (bctxt == NULL) { perror("Could not open backing file"); return (1); } @@ -367,6 +452,10 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) io->io_idx = i; } + bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); + if (blockif_candelete(sc->bc)) + sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; + #ifndef __FreeBSD__ /* Disable write cache until FLUSH feature is negotiated */ (void) blockif_set_wce(sc->bc, 0); @@ -376,7 +465,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ - 
vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; @@ -394,7 +483,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ - sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* @@ -416,6 +505,9 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; + sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; + sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; + sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could |