author     Jason King <jason.king@joyent.com>    2020-04-08 21:30:30 +0000
committer  Jason King <jason.king@joyent.com>    2020-04-09 18:44:26 +0000
commit     5fd0dd9b1022937d01e35de0abccd19f83c07649 (patch)
tree       05f1587f571e4ab67b25058cfa694a66a12f1cff
parent     716eac69f4d465c68cc31ebc0cf5bb342d010997 (diff)
download   illumos-joyent-5fd0dd9b1022937d01e35de0abccd19f83c07649.tar.gz
Move segmenting/splitting into blkdev, simplify dfl_iter
-rw-r--r--  usr/src/uts/common/io/blkdev/blkdev.c       129
-rw-r--r--  usr/src/uts/common/io/vioblk/vioblk.c       141
-rw-r--r--  usr/src/uts/common/os/dkioc_free_util.c     556
-rw-r--r--  usr/src/uts/common/sys/blkdev.h              10
-rw-r--r--  usr/src/uts/common/sys/dkioc_free_util.h     44
5 files changed, 381 insertions, 499 deletions
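
To make the reworked interface concrete before the per-file diffs, here is a minimal driver-side sketch (not part of the commit; the mydrv_t type and its fields are invented) of how the new dfl_iter() introduced below is meant to be called: the dkioc_free_info_t describes the device's limits, the device length in blocks is passed separately, and the callback receives already-aligned, already-segmented lists that it owns and must free.

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/dkio.h>
#include <sys/dkioc_free_util.h>

/* Hypothetical per-instance soft state used only for this sketch. */
typedef struct mydrv {
    uint64_t m_nblocks;      /* device size, in blocks */
    uint64_t m_max_blocks;   /* per-request block limit, 0 == no limit */
    uint64_t m_max_ext;      /* per-request extent limit, 0 == no limit */
} mydrv_t;

static int
mydrv_free_cb(dkioc_free_list_t *dfl, void *arg)
{
    mydrv_t *mp = arg;

    /*
     * A real driver would build and submit a discard/TRIM command from
     * the extents in dfl here.  The callback owns dfl and must free it
     * (immediately, or after the request completes).
     */
    (void) mp;
    dfl_free(dfl);
    return (0);
}

static int
mydrv_free_space(mydrv_t *mp, dkioc_free_list_t *dfl)
{
    dkioc_free_info_t dfi = {
        .dfi_bshift = 9,                   /* 512-byte blocks */
        .dfi_align = 1,                    /* block-size start alignment */
        .dfi_max_blocks = mp->m_max_blocks,
        .dfi_max_ext = mp->m_max_ext,
    };

    /* dfl_iter() always consumes dfl, even on error. */
    return (dfl_iter(dfl, &dfi, mp->m_nblocks, mydrv_free_cb, mp, KM_SLEEP));
}
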
diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c
index 8b8c944006..d05fb20abb 100644
--- a/usr/src/uts/common/io/blkdev/blkdev.c
+++ b/usr/src/uts/common/io/blkdev/blkdev.c
@@ -162,6 +162,10 @@ struct bd {
uint64_t d_numblks;
ddi_devid_t d_devid;
+ uint64_t d_max_free_seg;
+ uint64_t d_max_free_sect;
+ uint64_t d_free_align;
+
kmem_cache_t *d_cache;
bd_queue_t *d_queues;
kstat_t *d_ksp;
@@ -713,6 +717,23 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
bd_create_inquiry_props(dip, &drive);
+ if (CAN_FREESPACE(bd)) {
+ /*
+ * Default values -- no limits, no stricter alignment than
+ * the device block size (unspecified fields are set to 0).
+ */
+ bd_free_info_t bfi = {
+ .bfi_align = 1,
+ };
+
+ if (bd->d_ops.o_free_space_info != NULL)
+ bd->d_ops.o_free_space_info(bd->d_private, &bfi);
+
+ bd->d_max_free_seg = bfi.bfi_max_seg;
+ bd->d_max_free_sect = bfi.bfi_max_sect;
+ bd->d_free_align = bfi.bfi_align;
+ }
+
bd_create_errstats(bd, inst, &drive);
bd_init_errstats(bd, &drive);
bd_update_state(bd);
@@ -1552,7 +1573,6 @@ bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
if (rv != 0)
return (rv);
- /* bd_xfer_done() frees dfl via bd_xfer_free() */
rv = bd_free_space(dev, bd, dfl);
return (rv);
}
@@ -1967,75 +1987,14 @@ bd_free_space_done(struct buf *bp)
return (0);
}
-/*
- * Adjust extents to be relative to start of the device. When DKIOCFREE
- * is called on a blkdev instance, the extents are relative to the start of
- * the partition for a given blkdev instance. We adjust the extent
- * starting addresses (by adding the partition start offset to dfl_offset)
- * and truncate any extents that extend beyond the end of the partition.
- */
-static int
-bd_adjust_extents(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
-{
- dkioc_free_list_ext_t *ext;
- minor_t part;
- diskaddr_t p_lba;
- diskaddr_t p_nblks;
- uint64_t offset;
- uint64_t length;
- size_t i;
- uint32_t shift = bd->d_blkshift;
-
- part = BDPART(dev);
-
- if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
- NULL, NULL, 0) != 0) {
- return (ENXIO);
- }
-
- offset = (uint64_t)p_lba << shift;
- length = (uint64_t)p_nblks << shift;
-
- dfl->dfl_offset += offset;
- if (dfl->dfl_offset < offset)
- return (EOVERFLOW); /* XXX: or EINVAL? */
-
- for (ext = dfl->dfl_exts, i = 0; i < dfl->dfl_num_exts; i++, ext++) {
- if (ext->dfle_start > length) {
- ext->dfle_length = 0;
- continue;
- }
-
- uint64_t end = ext->dfle_start + ext->dfle_length;
-
- if (end < ext->dfle_start)
- return (EOVERFLOW); /* XXX: or EINVAL? */
-
- if (end > length)
- ext->dfle_length = length - ext->dfle_start;
- }
-
- return (0);
-}
-
static int
-bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
+bd_free_space_cb(dkioc_free_list_t *dfl, void *arg)
{
- buf_t *bp = NULL;
- bd_xfer_impl_t *xi = NULL;
- int rv = 0;
- boolean_t sync = (dfl->dfl_flags & DF_WAIT_SYNC) != 0 ?
- B_TRUE : B_FALSE;
-
- /*
- * bd_ioctl created our own copy of dfl, so we can modify as
- * necessary
- */
- rv = bd_adjust_extents(dev, bd, dfl);
- if (rv != 0) {
- dfl_free(dfl);
- return (rv);
- }
+ bd_t *bd = arg;
+ buf_t *bp = NULL;
+ bd_xfer_impl_t *xi = NULL;
+ boolean_t sync = DFL_ISSYNC(dfl) ? B_TRUE : B_FALSE;
+ int rv = 0;
bp = getrbuf(KM_SLEEP);
bp->b_resid = 0;
@@ -2060,6 +2019,39 @@ bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
return (rv);
}
+static int
+bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
+{
+ diskaddr_t p_len, p_offset;
+ uint64_t offset_bytes;
+ minor_t part = BDPART(dev);
+ dkioc_free_info_t dfi = {
+ .dfi_bshift = bd->d_blkshift,
+ .dfi_align = bd->d_free_align,
+ .dfi_max_blocks = bd->d_max_free_sect,
+ .dfi_max_ext = bd->d_max_free_seg,
+ };
+
+ if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL,
+ NULL, 0) != 0) {
+ dfl_free(dfl);
+ return (ENXIO);
+ }
+
+ /*
+ * bd_ioctl created our own copy of dfl, so we can modify as
+ * necessary
+ */
+ offset_bytes = (uint64_t)p_offset << bd->d_blkshift;
+ dfl->dfl_offset += offset_bytes;
+ if (dfl->dfl_offset < offset_bytes) {
+ dfl_free(dfl);
+ return (EOVERFLOW);
+ }
+
+ return (dfl_iter(dfl, &dfi, p_len, bd_free_space_cb, bd, KM_SLEEP));
+}
+
/*
* Nexus support.
*/
@@ -2126,6 +2118,7 @@ bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
switch (ops->o_version) {
case BD_OPS_CURRENT_VERSION:
hdl->h_ops.o_free_space = ops->o_free_space;
+ hdl->h_ops.o_free_space_info = ops->o_free_space_info;
/*FALLTHRU*/
case BD_OPS_VERSION_1:
case BD_OPS_VERSION_0:
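
For reference, a small standalone sketch (illustrative numbers only) of the adjustment the new bd_free_space() above performs before handing the list to dfl_iter(): the partition start reported by cmlb_partinfo() is shifted into bytes and folded into dfl_offset, with a wrap-around check standing in for the EOVERFLOW case.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t bshift = 9;          /* bd->d_blkshift: 512-byte blocks */
    uint64_t p_offset = 2048;     /* partition start in blocks (cmlb_partinfo) */
    uint64_t dfl_offset = 4096;   /* byte offset carried in the DKIOCFREE list */

    uint64_t offset_bytes = p_offset << bshift;   /* 1 MiB */

    dfl_offset += offset_bytes;
    if (dfl_offset < offset_bytes) {
        /* wrapped around: bd_free_space() fails the request with EOVERFLOW */
        return (1);
    }

    /* every extent in the list is now relative to the whole device */
    printf("dfl_offset is now %llu bytes\n", (unsigned long long)dfl_offset);
    return (0);
}
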
diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c
index b23dbfcfa3..1271796c62 100644
--- a/usr/src/uts/common/io/vioblk/vioblk.c
+++ b/usr/src/uts/common/io/vioblk/vioblk.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
- * Copyright 2019 Joyent Inc.
+ * Copyright 2020 Joyent Inc.
* Copyright 2019 Western Digital Corporation.
*/
@@ -148,13 +148,6 @@ static const ddi_dma_attr_t vioblk_dma_attr = {
.dma_attr_flags = 0
};
-/*
- * Break up DISCARD requests into smaller pieces if the request exceeds
- * the limits given by the host. If 0, requests may be truncated if
- * they exceed the host limits.
- */
-static int vioblk_split_discard = 0;
-
static vioblk_req_t *
vioblk_req_alloc(vioblk_t *vib)
{
@@ -551,6 +544,16 @@ vioblk_bd_mediainfo(void *arg, bd_media_t *media)
}
static void
+vioblk_bd_free_space_info(void *arg, bd_free_info_t *bfi)
+{
+ vioblk_t *vib = (void *)arg;
+
+ bfi->bfi_max_seg = vib->vib_max_discard_seg;
+ bfi->bfi_max_sect = vib->vib_max_discard_sectors;
+ bfi->bfi_align = vib->vib_discard_sector_align;
+}
+
+static void
vioblk_get_id(vioblk_t *vib)
{
virtio_dma_t *dma;
@@ -618,76 +621,35 @@ vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
devid));
}
-struct vioblk_freesp_arg {
- vioblk_t *vfpa_vioblk;
- bd_xfer_t *vfpa_xfer;
-};
-
static int
-vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts,
- boolean_t last, void *arg)
+vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
{
- struct vioblk_freesp_arg *args = arg;
- vioblk_t *vib = args->vfpa_vioblk;
+ const dkioc_free_list_t *dfl = xfer->x_dfl;
+ const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
+ vioblk_t *vib = arg;
virtio_dma_t *dma = NULL;
virtio_chain_t *vic = NULL;
vioblk_req_t *vbr = NULL;
struct vioblk_discard_write_zeroes *wzp = NULL;
- size_t i;
int r = 0;
- boolean_t polled;
-
- /*
- * While rare, it's possible we might get called with a list of
- * zero extents. In that case, just return success.
- */
- if (n_exts == 0) {
- if (last) {
- bd_xfer_done(args->vfpa_xfer, 0);
- args->vfpa_xfer = NULL;
- }
+ boolean_t polled = DFL_ISSYNC(dfl) ? B_TRUE : B_FALSE;
- return (0);
- }
-
- /*
- * If last is false, dfl_iter() had to segment or split the
- * original DKIOCFREE request into multiple requests for us (if last
- * is B_FALSE, it implies more extents are coming). While the blkdev
- * framework handles implementing the necessary sync/not sync semantics
- * for a DKIOCFREE request, that is in terms of the entire original
- * request. If dfl_iter() had to break things up, we always treat
- * the non-final (last == B_FALSE) calls to vioblk_free_extents() as
- * synchronous (polled).
- *
- * The assumption is that if the host is placing limits on a
- * DISCARD request, issuing multiple requests to the same device
- * asynchronously is likely to have undesirable results (or else why
- * wouldn't the host expose larger limits to prevent segmentation?),
- * so we issue one at a time (at least until the final group).
- *
- * The vioblk_split_discard tunable can be set to 0 to disable this
- * behavior -- in that case, any extents that exceed the host limits
- * are just discarded.
- *
- * Unfortunately, there isn't currently a way to report partial
- * results, so the choices are to fail the request if any extent in
- * the request doesn't meet the device requirements or to break
- * as much of the request as is possible
- */
- polled = !!last;
+ /* XXX: This seems like it could be folded into vioblk_request() */
- dma = virtio_dma_alloc(vib->vib_virtio, n_exts * sizeof (*wzp),
- &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
+ dma = virtio_dma_alloc(vib->vib_virtio,
+ dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
+ DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
if (dma == NULL)
return (ENOMEM);
wzp = virtio_dma_va(dma, 0);
- for (i = 0; i < n_exts; i++, exts++, wzp++) {
+ for (size_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
+ uint64_t start = dfl->dfl_offset + exts->dfle_start;
+
struct vioblk_discard_write_zeroes vdwz = {
- .vdwz_sector = exts->dfle_start,
- .vdwz_num_sectors = exts->dfle_length,
+ .vdwz_sector = start >> DEV_BSHIFT,
+ .vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
.vdwz_flags = 0
};
@@ -714,51 +676,9 @@ vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts,
return (ENOMEM);
}
- if (last) {
- /*
- * We attach xfer to the final vioblk request we submit.
- * This will allow the vioblk_complete() to handle any
- * notifications (e.g. a synchronous request) and
- * dispose of xfer afterwards.
- */
- vbr->vbr_xfer = args->vfpa_xfer;
- args->vfpa_xfer = NULL;
- }
-
+ vbr->vbr_xfer = xfer;
r = vioblk_common_submit(vib, vic);
mutex_exit(&vib->vib_mutex);
- return (r);
-}
-
-static int
-vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
-{
- vioblk_t *vib = arg;
- dkioc_free_align_t align = {
- .dfa_bsize = DEV_BSIZE,
- .dfa_max_ext = vib->vib_max_discard_seg,
- .dfa_max_blocks = vib->vib_max_discard_sectors,
- .dfa_align = vib->vib_discard_sector_align,
- .dfa_gran = 1,
- };
- struct vioblk_freesp_arg sp_arg = {
- .vfpa_vioblk = vib,
- .vfpa_xfer = xfer
- };
- dkioc_free_list_t *dfl = xfer->x_dfl;
- dkioc_iter_flags_t iter_flags =
- (vioblk_split_discard == 0) ? DIF_NOSPLIT : DIF_NONE;
- int r = 0;
-
- r = dfl_iter(dfl, &align, vioblk_free_exts, &sp_arg, KM_SLEEP,
- iter_flags);
-
- /*
- * If we didn't include xfer as part of the final request
- * (sp_arg.vfpa_xfer is still set), we should be returning failure
- * so that bd_sched() will free xfer.
- */
- IMPLY(sp_arg.vfpa_xfer != NULL, r != 0);
return (r);
}
@@ -1084,6 +1004,13 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* "o_sync_cache" member from the ops vector. As "bd_alloc_handle()"
* makes a copy of the ops vector, we can safely assemble one on the
* stack based on negotiated features.
+ *
+ * Similarly, the blkdev framework does not provide a way to indicate
+ * if a device supports a TRIM/UNMAP/DISCARD type operation except
+ * by omitting the "o_free_space" member from the ops vector. For
+ * consistency, we also omit the "o_free_space_info" member since it is
+ * only possibly used when a device specifies an "o_free_space"
+ * function.
*/
bd_ops_t vioblk_bd_ops = {
.o_version = BD_OPS_CURRENT_VERSION,
@@ -1094,12 +1021,14 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
.o_read = vioblk_bd_read,
.o_write = vioblk_bd_write,
.o_free_space = vioblk_bd_free_space,
+ .o_free_space_info = vioblk_bd_free_space_info,
};
if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
vioblk_bd_ops.o_sync_cache = NULL;
}
if (!virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
vioblk_bd_ops.o_free_space = NULL;
+ vioblk_bd_ops.o_free_space_info = NULL;
}
vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
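
A similarly standalone sketch (made-up values) of the unit conversion the reworked vioblk_bd_free_space() now performs: the extents it receives are byte-based, so the start (with dfl_offset folded in) and the length are shifted by DEV_BSHIFT to produce the 512-byte sector values placed in the virtio discard descriptor.

#include <stdint.h>
#include <stdio.h>

#define DEV_BSHIFT 9    /* as defined in <sys/param.h>: 512-byte sectors */

int
main(void)
{
    uint64_t dfl_offset = 1048576;   /* partition start already folded in by blkdev */
    uint64_t dfle_start = 65536;     /* extent start, in bytes */
    uint64_t dfle_length = 131072;   /* extent length, in bytes */

    uint64_t start = dfl_offset + dfle_start;

    /* the values vioblk stores as vdwz_sector and vdwz_num_sectors */
    printf("vdwz_sector = %llu, vdwz_num_sectors = %llu\n",
        (unsigned long long)(start >> DEV_BSHIFT),
        (unsigned long long)(dfle_length >> DEV_BSHIFT));
    return (0);
}
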
diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c
index 421899c8de..2703e250fa 100644
--- a/usr/src/uts/common/os/dkioc_free_util.c
+++ b/usr/src/uts/common/os/dkioc_free_util.c
@@ -26,36 +26,13 @@
#include <sys/file.h>
#include <sys/sdt.h>
-struct ext_arg {
- uint64_t ea_ext_cnt;
- dfl_iter_fn_t ea_fn;
- void *ea_arg;
- dkioc_free_list_ext_t *ea_exts;
- size_t ea_nreq;
- dkioc_iter_flags_t ea_flags;
-};
-
-typedef int (*ext_iter_fn_t)(const dkioc_free_list_ext_t *,
- boolean_t, void *);
-
-static int ext_iter(const dkioc_free_list_t *, const dkioc_free_align_t *,
- uint_t, ext_iter_fn_t, void *);
-static int ext_xlate(dkioc_free_list_ext_t *, uint64_t, uint64_t, uint64_t,
- uint_t);
-static int count_exts(const dkioc_free_list_ext_t *, boolean_t, void *);
-static int process_exts(const dkioc_free_list_ext_t *, boolean_t, void *);
-
-#if __GNUC__ > 4 || __GNU_C_MINOR__ >= 8
-#define uadd64_overflow(a, b, c) __builtin_uaddl_overflow(a, b, c)
-#else
-static bool
-uadd64_overflow(uint64_t a, uint64_t b, uint64_t *res)
-{
- *res = a + b;
- return ((*res < a || *res < b) ? true : false);
-}
-#endif
-
+static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
+ uint64_t len_blk);
+static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
+ size_t, dfl_iter_fn_t, void *, int);
+static int process_range(dkioc_free_list_t *, size_t, size_t,
+ dfl_iter_fn_t, void *, int);
+
/*
* Copy-in convenience function for variable-length dkioc_free_list_t
* structures. The pointer to be copied from is in `arg' (may be a pointer
@@ -121,79 +98,65 @@ dfl_free(dkioc_free_list_t *dfl)
* address stricter than the device block size.
*
* Since there is currently no mechanism for callers of DKIOCFREE to discover
- * any alignment, segmentation, or size requirements for DKIOCFREE requests
- * for a particular driver (or instance of a particular driver), dfl_iter()
- * allows drivers to tranform the dkioc_free_list_t from a DKIOCFREE request
- * into groups of dkioc_free_ext_ts that conform to the driver's alignment,
- * segmentation, or size requirements. The transformation done by dfl_iter()
- * may involve modifications such as splitting a list of extents into smaller
- * groups, splitting extents into multiple smaller extents, increasing the
- * start address of an extent to conform to alignments, or reducing the size
- * of an extent so that the resulting size is a multiple of the device block
- * size. In all instances, the resultant set is either identical to the
- * original set of extents, or a subset -- that is we _never_ transform a
- * a range into a range that exceeds the original boundaries of the original
- * extents.
+ * such restrictions, instead of rejecting any requests that do not conform to
+ * some undiscoverable (to the caller) set of requirements, a driver can use
+ * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
+ * required to conform to its requirements.
*
- * The transformed extents are grouped per the driver's requirements described
- * by the constraints contained in the 'dfa' parameter, and the 'func'
- * callback is invoked for each group of transformed extents. An optional
- * opaque (to dfl_iter()) 'arg' parameter is passed through to 'func' as well.
- * In addition, on the final group, the 'last' argument of 'func' is set
- * to B_TRUE (for all other groups of extents passed to 'func', 'func' is
- * called with 'last' set to B_FALSE). Indicating the final group of extents
- * allows a driver to mark a request as complete or implement synchronous
- * semantics as required.
+ * The original request is passed as 'dfl' and the alignment requirements
+ * are passed in 'dfi'. Additionally the size of the device (in units of
+ * blocks as described in dfi) is passed as len -- this allows a driver with
+ * multiple instances of different sizes but similar requirements (e.g.
+ * a partitioned blkdev device) to not construct a separate dkioc_free_info_t
+ * struct for each device.
*
- * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
- * any sort of partial completion -- either it returns success (0) or
- * an error. As such, there's little benefit to providing more detailed
- * error semantics beyond what DKIOCFREE can handle (if that ever changes, it
- * would be worth revisiting this). As a result, we take a somewhat simplistic
- * approach -- we stop processing the request on the first error encountered
- * and return the error. Otherwise dfl_iter() returns 0.
+ * dfl_iter() always consumes the contents of 'dfl'. The caller should never
+ * free 'dfl' after calling dfl_iter(). Many drivers will queue free requests
+ * and then release the resources after the request completes (successfully
+ * or not) some time later, so always consuming dfl makes supporting this
+ * simpler for the caller.
+ *
+ * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
+ * arg passed to it as needed. 'func' can assume that the extents in
+ * dkioc_free_list_t passed to it should conform to the requirements in
+ * 'dfi', but should NOT assume that the dkioc_free_list_t instance passed to it
+ * is the same instance passed to dfl_iter(). While this may be the case in
+ * some instances (e.g. all the extents conform to the driver's requirements),
+ * dfl_iter() may allocate new dkioc_free_list_t instances as required.
+ * 'func' must always properly free the dkioc_free_list_t passed to it as
+ * appropriate (either via a callback after completion, upon error, etc.).
*
- * Note that transformed extents that result in a range too small to be
- * processed by the driver (e.g. a 4k block size with a request to free
- * starting at offset 512 and a length of 1024) aren't considered an error and
- * are silently ignored. This means it is possible (though hopefully unlikely)
- * a request to a driver may result in no freed extents. When this happens,
- * 'func' is still called, but with a NULL list of extents, an extent count
- * of 0, and with last set to B_TRUE to allow for cleanup (calling done
- * routines, etc.).
+ * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
+ * any notion of partial completion -- either it returns success (0) or
+ * an error. It's not clear if such a notion would even be possible while
+ * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
+ * interface. As such, there's little benefit to providing more detailed error
+ * semantics beyond what DKIOCFREE can handle.
*
- * Currently no flags are defined, and should always be zero.
+ * Due to this, a somewhat simplistic approach is taken to error handling. The
+ * original list of extents is first checked to make sure they all appear
+ * valid -- that is, they do not start or extend beyond the end of the device.
+ * Any request that contains such extents is always rejected in its entirety.
+ * It is possible after applying any needed adjustments to the original list
+ * of extents that the result is not acceptable to the driver. For example,
+ * a device with a 512 byte block size that tries to free the range 513-1023
+ * (bytes) would not be able to be processed. Such extents will be silently
+ * ignored. If the original request consists of nothing but such extents,
+ * dfl_iter() will never call 'func' and will merely return 0.
*/
int
-dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
- dfl_iter_fn_t func, void *arg, int kmflag, dkioc_iter_flags_t flags)
+dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t len,
+ dfl_iter_fn_t func, void *arg, int kmflag)
{
- dkioc_free_list_ext_t *exts;
- uint64_t n_exts = 0;
- struct ext_arg earg = { 0 };
- uint_t bshift;
+ size_t n_blocks, n_segs, start_idx, i;
int r = 0;
-
- if ((flags & ~(DIF_NONE|DIF_NOSPLIT)) != 0)
- return (SET_ERROR(EINVAL));
-
- /* Block size must be at least 1 and a power of two */
- if (dfa->dfa_bsize == 0 || !ISP2(dfa->dfa_bsize))
- return (SET_ERROR(EINVAL));
+ boolean_t need_copy = B_FALSE;
/* Offset alignment must also be at least 1 and a power of two */
- if (dfa->dfa_align == 0 || !ISP2(dfa->dfa_align))
- return (SET_ERROR(EINVAL));
-
- /* Length granularity must be at least 1 and a power of two */
- if (dfa->dfa_gran == 0 || !ISP2(dfa->dfa_gran))
- return (SET_ERROR(EINVAL));
-
- /*
- * Since dfa_bsize != 0 (see above), ddi_ffsll() _must_ return a
- * value > 1
- */
- bshift = ddi_ffsll((long long)dfa->dfa_bsize) - 1;
+ if (dfi->dfi_align == 0 || !ISP2(dfi->dfi_align)) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
/*
* If a limit on the total number of blocks is given, it must be
@@ -202,261 +165,260 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
* allow extent sizes at least 8 blocks long (otherwise there will be
* device addresses that cannot be contained within an extent).
*/
- if (dfa->dfa_max_blocks > 0 && dfa->dfa_max_blocks < dfa->dfa_align)
- return (SET_ERROR(EINVAL));
+ if (dfi->dfi_max_blocks > 0 && dfi->dfi_max_blocks < dfi->dfi_align) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
/*
- * The general approach is that we walk the array of extents twice
- * using ext_iter(). For each extent, ext_iter() will invoke the
- * given callback function 0 or more times (based on the requirements
- * in dfa), and then invoke the callback function with a NULL extent.
- *
- * This first walk is used to count the total number of extents
- * after applying the driver requirements in 'dfa'. This may be
- * different from the initial number of extents due to splitting
- * extents or discarding extents that do not conform to alignment
- * requirements (and may even be 0).
+ * In the first pass, align everything as needed and make sure all the
+ * extents look valid.
*/
- r = ext_iter(dfl, dfa, bshift, count_exts, &n_exts);
- if (r != 0)
- return (r);
+ if ((r = adjust_exts(dfl, dfi, len)) != 0) {
+ goto done;
+ }
/*
- * It's possible that some extents do not conform to the alignment
- * requirements, nor do they have a conforming subset. For example,
- * a device with a block size of 512 bytes, and a starting alignment
- * of 4096 bytes would not be able to free extent with a starting
- * offset of 512 and a length of 1024. Such extents are ignored
- * (we have no good way to report back partial results). While unlikely,
- * it is possible a request consists of nothing but non-conforming
- * extents. In this case, we invoke the callback with a NULL list
- * of extents and with last set so it can perform any necessary
- * cleanup, completion tasks.
+ * Go through and split things up as needed. The general idea is to
+ * split along the original extent boundaries when needed. We only
+ * split an extent from the original request into multiple extents
+ * if the original extent is by itself too big for the device to
+ * process in a single request.
*/
- if (n_exts == 0)
- return (func(NULL, 0, B_TRUE, arg));
+ start_idx = 0;
+ n_blocks = n_segs = 0;
+ for (i = 0; i < dfl->dfl_num_exts; i++) {
+ uint64_t start = dfl->dfl_offset + dfl->dfl_exts[i].dfle_start;
+ uint64_t end = start + dfl->dfl_exts[i].dfle_length;
+ size_t len_blk = (end - start) >> dfi->dfi_bshift;
+
+ if (len_blk == 0) {
+ /*
+ * If we encounter a zero length extent, we're going
+ * to create a new copy of dfl no matter what --
+ * the size of dfl is determined by dfl_num_exts so
+ * we cannot do things like shift the contents and
+ * reduce dfl_num_exts to get a contiguous array
+ * of non-zero length extents.
+ */
+ need_copy = B_TRUE;
+ continue;
+ }
- exts = kmem_zalloc(n_exts * sizeof (*exts), kmflag);
- if (exts == NULL)
- return (SET_ERROR(ENOMEM));
+ if (n_blocks + len_blk > dfi->dfi_max_blocks) {
+ if ((r = process_range(dfl, start_idx, i - start_idx,
+ func, arg, kmflag)) != 0) {
+ goto done;
+ }
- earg.ea_ext_cnt = 0;
- earg.ea_fn = func;
- earg.ea_arg = arg;
- earg.ea_exts = exts;
- earg.ea_nreq = 0;
- earg.ea_flags = flags;
+ if (len_blk < dfi->dfi_max_blocks) {
+ /*
+ * We've spilled over, but this block on its
+ * own is fine. Start the next range of
+ * blocks with this one and continue.
+ */
+ start_idx = i;
+ n_segs = 1;
+ n_blocks = len_blk;
+ continue;
+ }
+
+ /*
+ * Even after starting a new request, this extent
+ * is too big. Split it until it fits.
+ */
+ if ((r = split_extent(dfl, dfi, i, func, arg,
+ kmflag)) != 0) {
+ goto done;
+ }
+
+ start_idx = i + 1;
+ n_segs = 0;
+ n_blocks = 0;
+ continue;
+ }
+
+ if (n_segs + 1 > dfi->dfi_max_ext) {
+ if ((r = process_range(dfl, start_idx, i - start_idx,
+ func, arg, kmflag)) != 0) {
+ goto done;
+ }
+
+ start_idx = i;
+ n_segs = 0;
+ n_blocks = 0;
+ continue;
+ }
+
+ n_segs++;
+ n_blocks += len_blk;
+ }
/*
- * We've allocated enough space to hold all the transformed extents
- * in 'exts'. Now walk the original list of extents a second time
- * and do the work. process_exts() will accumulate the transformed
- * extents and invoke 'func' (the callback passed into dfl_iter()) to
- * perform the free request with the accumulated extents, repeating
- * as necessary.
+ * If a copy wasn't required, and we haven't processed a subset of
+ * the extents already, we can just use the original request.
*/
- r = ext_iter(dfl, dfa, bshift, process_exts, &earg);
- kmem_free(exts, n_exts * sizeof (*exts));
- return (r);
-}
-
-static int
-count_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq __unused,
- void *arg)
-{
- size_t *np = arg;
+ if (!need_copy && start_idx == 0) {
+ return (func(dfl, arg));
+ }
- if (ext != NULL && ext->dfle_length > 0)
- (*np)++;
+ r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);
- return (0);
+done:
+ dfl_free(dfl);
+ return (r);
}
+/*
+ * Adjust the start and length of each extent in dfl so that it conforms to
+ * the requirements in dfi. It also verifies that no extent extends beyond
+ * the end of the device (given by len_blk).
+ *
+ * Returns 0 on success, or an error value.
+ */
static int
-process_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq, void *arg)
+adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
+ uint64_t len_blk)
{
- struct ext_arg *args = arg;
- dkioc_free_list_ext_t *ext_list = args->ea_exts;
+ dkioc_free_list_ext_t *exts = dfl->dfl_exts;
+ size_t len = len_blk << dfi->dfi_bshift;
+ uint_t align = dfi->dfi_align << dfi->dfi_bshift;
+ uint_t bsize = (uint_t)1 << dfi->dfi_bshift;
- if (ext == NULL) {
+ for (size_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
/*
- * The very last call should be with ext set to NULL to
- * flush any accumulated extents since the last start of
- * a new group.
+ * Since there are no known requirements on the value of
+ * dfl_offset, it's possible (though odd) to have a scenario
+ * where dfl_offset == 1, and dfle_start == 511 (resulting
+ * in an actual start offset of 512). As such, we always
+ * apply the offset and find the resulting starting offset
+ * and length (in bytes) first, then apply any rounding
+ * and alignment.
*/
- VERIFY(newreq);
+ uint64_t start = exts->dfle_start + dfl->dfl_offset;
+ uint64_t end = start + exts->dfle_length;
/*
- * A corner case -- we never had any extents that could
- * be passed to the callback. Do a final call with the
- * extent list as NULL (and a count of 0).
+ * Make sure after applying dfl->dfl_offset that the results
+ * don't overflow.
*/
- if (args->ea_ext_cnt == 0)
- ext_list = NULL;
-
- args->ea_nreq++;
+ if (start < dfl->dfl_offset) {
+ return (SET_ERROR(EOVERFLOW));
+ }
- return (args->ea_fn(ext_list, args->ea_ext_cnt, B_TRUE,
- args->ea_arg));
- }
+ if (end < start) {
+ return (SET_ERROR(EOVERFLOW));
+ }
- /*
- * Starting a new request, and we have accumulated extents to
- * flush.
- */
- if (newreq && args->ea_ext_cnt > 0) {
- int r;
+ /*
+ * Make sure we don't extend past the end of the device
+ * XXX: ENXIO instead?
+ */
+ if (end > len) {
+ return (SET_ERROR(ERANGE));
+ }
- args->ea_nreq++;
+ start = P2ROUNDUP(start, align);
+ end = P2ALIGN(end, bsize);
- r = args->ea_fn(ext_list, args->ea_ext_cnt, B_FALSE,
- args->ea_arg);
- if (r != 0)
- return (r);
+ ASSERT(IS_P2ALIGNED(end - start, bsize));
/*
- * A bit simplistic, but we just keep appending to the
- * original array allocated by dfl_iter(), but just update
- * our starting position (args->ex_exts) for the next group.
+ * Remove the offset so that when it's later applied again,
+ * the correct start value is obtained.
*/
- args->ea_exts += args->ea_ext_cnt;
- args->ea_ext_cnt = 0;
+ exts->dfle_start = start - dfl->dfl_offset;
+ exts->dfle_length = end - start;
}
- /* Skip any extents that end up with zero length after aligning. */
- if (ext->dfle_length > 0)
- args->ea_exts[args->ea_ext_cnt++] = *ext;
-
return (0);
}
/*
- * Translate the ext from byte-based units to units of
- * (1 << bshift) sized blocks, with the start and length values adjusted to
- * the align and gran values (align and gran are in units of bytes).
- *
- * Returns 0 on success, or an error value.
+ * Take a subset of extents from dfl (starting at start_idx, with n entries)
+ * and create a new dkioc_free_list_t, passing that to func.
*/
static int
-ext_xlate(dkioc_free_list_ext_t *ext, uint64_t offset, uint64_t align,
- uint64_t gran, uint_t bshift)
+process_range(dkioc_free_list_t *dfl, size_t start_idx, size_t n,
+ dfl_iter_fn_t func, void *arg, int kmflag)
{
- uint64_t start, end;
+ dkioc_free_list_t *new_dfl = NULL;
+ dkioc_free_list_ext_t *new_exts = NULL;
+ dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
+ size_t actual_len = n;
+ int r = 0;
- if (uadd64_overflow(offset, ext->dfle_start, &start))
- return (SET_ERROR(EOVERFLOW));
+ if (n == 0) {
+ return (0);
+ }
- if (uadd64_overflow(start, ext->dfle_length, &end))
- return (SET_ERROR(EOVERFLOW));
-
- start = P2ROUNDUP(start, align);
- end = P2ALIGN(end, gran);
+ /*
+ * Ignore any zero length extents. No known devices attach any
+ * semantic meaning to such extents, and they are likely just a result of
+ * narrowing the range of the extent to fit the device alignment
+ * requirements. It is possible the original caller submitted a
+ * zero length extent, but we ignore those as well. Since we can't
+ * communicate partial results back to the caller anyway, it's
+ * unclear whether reporting that one of potentially many extents was
+ * too small (without being able to identify which one) to the caller
+ * of the DKIOCFREE request would be useful.
+ */
+ for (size_t i = 0; i < n; i++) {
+ if (exts[i].dfle_length == 0 && --actual_len == 0) {
+ return (0);
+ }
+ }
- ext->dfle_start = start >> bshift;
- ext->dfle_length = (end > start) ? (end - start) >> bshift : 0;
- return (0);
+ new_dfl = kmem_zalloc(DFL_SZ(actual_len), kmflag);
+ if (new_dfl == NULL) {
+ return (SET_ERROR(ENOMEM));
+ }
+
+ new_dfl->dfl_flags = dfl->dfl_flags;
+ new_dfl->dfl_num_exts = actual_len;
+ new_dfl->dfl_offset = dfl->dfl_offset;
+ new_exts = new_dfl->dfl_exts;
+
+ for (size_t i = 0; i < n; i++) {
+ if (exts[i].dfle_length == 0) {
+ continue;
+ }
+
+ *new_exts++ = exts[i];
+ }
+
+ return (func(new_dfl, arg));
}
/*
- * Iterate through the extents in dfl. fn is called for each adjusted extent
- * (adjusting offsets and lengths to conform to the alignment requirements)
- * and one input extent may result in 0, 1, or multiple calls to fn as a
- * result.
+ * Split the extent at idx into multiple lists (calling func for each one).
*/
static int
-ext_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
- uint_t bshift, ext_iter_fn_t fn, void *arg)
+split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, size_t idx,
+ dfl_iter_fn_t func, void *arg, int kmflag)
{
- const dkioc_free_list_ext_t *src = dfl->dfl_exts;
- uint64_t n_exts = 0;
- uint64_t n_blks = 0;
- uint64_t align = dfa->dfa_align << bshift;
- uint64_t gran = dfa->dfa_gran << bshift;
- size_t i;
- boolean_t newreq = B_TRUE;
-
- for (i = 0; i < dfl->dfl_num_exts; i++, src++) {
- dkioc_free_list_ext_t ext = *src;
- int r;
-
- r = ext_xlate(&ext, dfl->dfl_offset, align, gran, bshift);
- if (r != 0)
- return (r);
+ ASSERT3U(idx, <, dfl->dfl_num_exts);
- while (ext.dfle_length > 0) {
- dkioc_free_list_ext_t seg = ext;
+ const uint64_t amt = dfi->dfi_max_blocks << dfi->dfi_bshift;
+ dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx;
+ uint64_t len = ext->dfle_length;
+ int r;
- if (dfa->dfa_max_ext > 0 &&
- n_exts + 1 > dfa->dfa_max_ext) {
- /*
- * Reached the max # of extents, start a new
- * request, and retry.
- */
- newreq = B_TRUE;
- n_exts = 0;
- n_blks = 0;
- continue;
- }
-
- if (dfa->dfa_max_blocks > 0 &&
- n_blks + seg.dfle_length > dfa->dfa_max_blocks) {
- /*
- * This extent puts us over the max # of
- * blocks in a request.
- */
- if (!newreq) {
- /*
- * If we haven't started a new request,
- * start one, and retry as a new
- * request in case it can fit on
- * its own. If not, we'll skip
- * this block and split it in the
- * code below.
- */
- newreq = B_TRUE;
- n_exts = 0;
- n_blks = 0;
- continue;
- }
-
- /*
- * A new request, and the extent length is
- * larger than our max. Reduce the length to
- * the largest multiple of dfa_align
- * equal to or less than dfa_max_blocks
- * so the next starting address has the
- * correct alignment, splitting the request.
- */
- seg.dfle_length = P2ALIGN(dfa->dfa_max_blocks,
- align);
-
- /*
- * Our sanity checks on the alignment
- * requirements mean we should be able to
- * free at least part of the extent.
- */
- ASSERT3U(seg.dfle_length, >, 0);
- }
-
- r = fn(&seg, newreq, arg);
- if (r != 0)
- return (r);
-
- n_exts++;
- n_blks += seg.dfle_length;
-
- ASSERT3U(ext.dfle_length, >=, seg.dfle_length);
-
- ext.dfle_length -= seg.dfle_length;
- ext.dfle_start += seg.dfle_length;
- newreq = B_FALSE;
+ /*
+ * Break the extent into as many single requests as needed. While it
+ * would be possible in some circumstances to combine the final chunk
+ * of the extent (after splitting) with the remaining extents in the
+ * original request, it's not clear there's much benefit from the
+ * added complexity. Such behavior could be added in the future if
+ * it's determined to be worthwhile.
+ */
+ while (len > 0) {
+ ext->dfle_length = (len > amt) ? amt : len;
+ if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
+ return (r);
}
+ ext->dfle_start += ext->dfle_length;
}
- /*
- * Invoke the callback one last time w/ a NULL array of extents and
- * newreq == B_TRUE to signal completion (and flush any accumulated
- * extents).
- */
- return (fn(NULL, B_TRUE, arg));
+ return (0);
}
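
To illustrate the narrowing that the new adjust_exts() applies, a standalone sketch follows (the P2 macros are copied from illumos <sys/sysmacros.h>; the extent values are made up): the start is rounded up to the required alignment and the end rounded down to a block boundary, so the adjusted range never extends beyond what the caller requested.

#include <stdint.h>
#include <stdio.h>

/* copies of the illumos macros so this builds standalone */
#define P2ROUNDUP(x, align)  (-(-(x) & -(align)))
#define P2ALIGN(x, align)    ((x) & -(align))

int
main(void)
{
    uint64_t bsize = 512;    /* (uint_t)1 << dfi_bshift */
    uint64_t align = 512;    /* dfi_align << dfi_bshift, in bytes */
    uint64_t start = 700;    /* dfle_start + dfl_offset, in bytes */
    uint64_t end = start + 10000;

    start = P2ROUNDUP(start, align);   /* 1024 */
    end = P2ALIGN(end, bsize);         /* 10240 */

    printf("adjusted extent: [%llu, %llu), %llu blocks\n",
        (unsigned long long)start, (unsigned long long)end,
        (unsigned long long)((end - start) / bsize));
    return (0);
}
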
diff --git a/usr/src/uts/common/sys/blkdev.h b/usr/src/uts/common/sys/blkdev.h
index f0336d2656..92a3807feb 100644
--- a/usr/src/uts/common/sys/blkdev.h
+++ b/usr/src/uts/common/sys/blkdev.h
@@ -23,7 +23,7 @@
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2019 Western Digital Corporation.
- * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
#ifndef _SYS_BLKDEV_H
@@ -80,6 +80,7 @@ typedef struct bd_handle *bd_handle_t;
typedef struct bd_xfer bd_xfer_t;
typedef struct bd_drive bd_drive_t;
typedef struct bd_media bd_media_t;
+typedef struct bd_free_info bd_free_info_t;
typedef struct bd_ops bd_ops_t;
struct dkioc_free_list_s;
@@ -149,6 +150,12 @@ struct bd_media {
#define BD_INFO_FLAG_HOTPLUGGABLE (1U << 1)
#define BD_INFO_FLAG_READ_ONLY (1U << 2)
+struct bd_free_info {
+ uint64_t bfi_align;
+ uint64_t bfi_max_seg;
+ uint64_t bfi_max_sect;
+};
+
/*
* If the API changes and we want to bump the version, add another
* enum value, Eg BD_OPS_VERSION_1. BD_OPS_CURRENT_VERSION should always
@@ -169,6 +176,7 @@ struct bd_ops {
int (*o_read)(void *, bd_xfer_t *);
int (*o_write)(void *, bd_xfer_t *);
int (*o_free_space)(void *, bd_xfer_t *);
+ void (*o_free_space_info)(void *, bd_free_info_t *);
};
struct bd_errstats {
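
Finally, a brief hypothetical sketch of a blkdev child driver using the new hook (the mydrv_* names are invented; vioblk above is the real consumer): the driver reports its discard limits through o_free_space_info, in units of blocks, and registers it alongside o_free_space in a BD_OPS_CURRENT_VERSION ops vector.

#include <sys/blkdev.h>

/* hypothetical per-instance state */
typedef struct mydrv {
    uint64_t m_discard_align;      /* blocks; must be >= 1 and a power of two */
    uint64_t m_max_discard_seg;    /* extents per request, 0 == no limit */
    uint64_t m_max_discard_sect;   /* blocks per request, 0 == no limit */
} mydrv_t;

static void
mydrv_bd_free_space_info(void *arg, bd_free_info_t *bfi)
{
    mydrv_t *mp = arg;

    bfi->bfi_align = mp->m_discard_align;
    bfi->bfi_max_seg = mp->m_max_discard_seg;
    bfi->bfi_max_sect = mp->m_max_discard_sect;
}

static int
mydrv_bd_free_space(void *arg, bd_xfer_t *xfer)
{
    /*
     * Stub: a real driver would issue the free described by xfer->x_dfl
     * and call bd_xfer_done() when it completes.
     */
    return (0);
}

static bd_ops_t mydrv_bd_ops = {
    .o_version = BD_OPS_CURRENT_VERSION,
    .o_free_space = mydrv_bd_free_space,
    .o_free_space_info = mydrv_bd_free_space_info,
    /* the remaining mandatory members are omitted from this sketch */
};
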
diff --git a/usr/src/uts/common/sys/dkioc_free_util.h b/usr/src/uts/common/sys/dkioc_free_util.h
index 7028084bcd..ef9e9d8c9a 100644
--- a/usr/src/uts/common/sys/dkioc_free_util.h
+++ b/usr/src/uts/common/sys/dkioc_free_util.h
@@ -25,44 +25,34 @@ extern "C" {
#define DFL_COPYIN_MAX_EXTS (1024 * 1024)
-typedef struct dkioc_free_align {
- /* Device block size in bytes. Must be > 0, and must be a power of 2 */
- size_t dfa_bsize;
+#define DFL_ISSYNC(dfl) ((dfl)->dfl_flags & DF_WAIT_SYNC)
+
+typedef struct dkioc_free_info {
+ /* log2(block size) */
+ size_t dfi_bshift;
/* Maximum number of extents in a single request. 0 == no limit */
- size_t dfa_max_ext;
+ size_t dfi_max_ext;
- /*
- * Maximum number of blocks (in units of dfa_bsize) in a single request.
- * 0 == no limit.
- */
- size_t dfa_max_blocks;
+ /* Maximum number of blocks in a single request. 0 == no limit. */
+ size_t dfi_max_blocks;
/*
- * Minimum alignment for extent offsets in units of blocks (dfa_bsize).
- * etc). Must be > 0, and a power of two.
- */
- size_t dfa_align;
-
- /*
- * Minimum granularity for length in units of blocks (dfa_bsize).
+ * Minimum alignment for starting extent offsets in units of blocks.
* Must be > 0, and a power of two.
+ *
+ * A possible future extension might be to also express a preferred
+ * alignment when splitting extents.
*/
- size_t dfa_gran;
-} dkioc_free_align_t;
-
-typedef enum dkioc_iter_flags {
- DIF_NONE = 0,
- DIF_NOSPLIT = (1 << 1)
-} dkioc_iter_flags_t;
+ size_t dfi_align;
+} dkioc_free_info_t;
-typedef int (*dfl_iter_fn_t)(const dkioc_free_list_ext_t *exts, size_t n_ext,
- boolean_t last, void *arg);
+typedef int (*dfl_iter_fn_t)(dkioc_free_list_t *dfl, void *arg);
int dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags);
void dfl_free(dkioc_free_list_t *dfl);
-int dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *align,
- dfl_iter_fn_t fn, void *arg, int kmflag, dkioc_iter_flags_t);
+int dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t len,
+ dfl_iter_fn_t fn, void *arg, int kmflag);
#ifdef __cplusplus
}