diff options
author | Jason King <jason.king@joyent.com> | 2020-04-08 21:30:30 +0000 |
---|---|---|
committer | Jason King <jason.king@joyent.com> | 2020-04-09 18:44:26 +0000 |
commit | 5fd0dd9b1022937d01e35de0abccd19f83c07649 (patch) | |
tree | 05f1587f571e4ab67b25058cfa694a66a12f1cff | |
parent | 716eac69f4d465c68cc31ebc0cf5bb342d010997 (diff) | |
download | illumos-joyent-5fd0dd9b1022937d01e35de0abccd19f83c07649.tar.gz |
Move segmenting/splitting into blkdev, simplify dfl_iter
-rw-r--r-- | usr/src/uts/common/io/blkdev/blkdev.c | 129 | ||||
-rw-r--r-- | usr/src/uts/common/io/vioblk/vioblk.c | 141 | ||||
-rw-r--r-- | usr/src/uts/common/os/dkioc_free_util.c | 556 | ||||
-rw-r--r-- | usr/src/uts/common/sys/blkdev.h | 10 | ||||
-rw-r--r-- | usr/src/uts/common/sys/dkioc_free_util.h | 44 |
5 files changed, 381 insertions, 499 deletions
diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index 8b8c944006..d05fb20abb 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -162,6 +162,10 @@ struct bd { uint64_t d_numblks; ddi_devid_t d_devid; + uint64_t d_max_free_seg; + uint64_t d_max_free_sect; + uint64_t d_free_align; + kmem_cache_t *d_cache; bd_queue_t *d_queues; kstat_t *d_ksp; @@ -713,6 +717,23 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) bd_create_inquiry_props(dip, &drive); + if (CAN_FREESPACE(bd)) { + /* + * Default values -- no limits, no stricter alignment than + * the device block size (unspecified fields are set to 0). + */ + bd_free_info_t bfi = { + .bfi_align = 1, + }; + + if (bd->d_ops.o_free_space_info != NULL) + bd->d_ops.o_free_space_info(bd->d_private, &bfi); + + bd->d_max_free_seg = bfi.bfi_max_seg; + bd->d_max_free_sect = bfi.bfi_max_sect; + bd->d_free_align = bfi.bfi_align; + } + bd_create_errstats(bd, inst, &drive); bd_init_errstats(bd, &drive); bd_update_state(bd); @@ -1552,7 +1573,6 @@ bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp) if (rv != 0) return (rv); - /* bd_xfer_done() frees dfl via bd_xfer_free() */ rv = bd_free_space(dev, bd, dfl); return (rv); } @@ -1967,75 +1987,14 @@ bd_free_space_done(struct buf *bp) return (0); } -/* - * Adjust extents to be relative to start of the device. When DKIOCFREE - * is called on a blkdev instance, the extents are relative to the start of - * the partition for a given blkdev instance. We adjust the extent - * starting addresses (by adding the partition start offset to dfl_offset) - * and truncate any extents that extend beyond the end of the partition. 
- */ -static int -bd_adjust_extents(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl) -{ - dkioc_free_list_ext_t *ext; - minor_t part; - diskaddr_t p_lba; - diskaddr_t p_nblks; - uint64_t offset; - uint64_t length; - size_t i; - uint32_t shift = bd->d_blkshift; - - part = BDPART(dev); - - if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba, - NULL, NULL, 0) != 0) { - return (ENXIO); - } - - offset = (uint64_t)p_lba << shift; - length = (uint64_t)p_nblks << shift; - - dfl->dfl_offset += offset; - if (dfl->dfl_offset < offset) - return (EOVERFLOW); /* XXX: or EINVAL? */ - - for (ext = dfl->dfl_exts, i = 0; i < dfl->dfl_num_exts; i++, ext++) { - if (ext->dfle_start > length) { - ext->dfle_length = 0; - continue; - } - - uint64_t end = ext->dfle_start + ext->dfle_length; - - if (end < ext->dfle_start) - return (EOVERFLOW); /* XXX: or EINVAL? */ - - if (end > length) - ext->dfle_length = length - ext->dfle_start; - } - - return (0); -} - static int -bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl) +bd_free_space_cb(dkioc_free_list_t *dfl, void *arg) { - buf_t *bp = NULL; - bd_xfer_impl_t *xi = NULL; - int rv = 0; - boolean_t sync = (dfl->dfl_flags & DF_WAIT_SYNC) != 0 ? - B_TRUE : B_FALSE; - - /* - * bd_ioctl created our own copy of dfl, so we can modify as - * necessary - */ - rv = bd_adjust_extents(dev, bd, dfl); - if (rv != 0) { - dfl_free(dfl); - return (rv); - } + bd_t *bd = arg; + buf_t *bp = NULL; + bd_xfer_impl_t *xi = NULL; + boolean_t sync = DFL_ISSYNC(dfl) ? 
B_TRUE : B_FALSE; + int rv = 0; bp = getrbuf(KM_SLEEP); bp->b_resid = 0; @@ -2060,6 +2019,39 @@ bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl) return (rv); } +static int +bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl) +{ + diskaddr_t p_len, p_offset; + uint64_t offset_bytes; + minor_t part = BDINST(dev); + dkioc_free_info_t dfi = { + .dfi_bshift = bd->d_blkshift, + .dfi_align = bd->d_free_align, + .dfi_max_blocks = bd->d_max_free_sect, + .dfi_max_ext = bd->d_max_free_seg, + }; + + if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL, + NULL, 0) != 0) { + dfl_free(dfl); + return (ENXIO); + } + + /* + * bd_ioctl created our own copy of dfl, so we can modify as + * necessary + */ + offset_bytes = (uint64_t)p_offset << bd->d_blkshift; + dfl->dfl_offset += offset_bytes; + if (dfl->dfl_offset < offset_bytes) { + dfl_free(dfl); + return (EOVERFLOW); + } + + return (dfl_iter(dfl, &dfi, p_len, bd_free_space_cb, bd, KM_SLEEP)); +} + /* * Nexus support. */ @@ -2126,6 +2118,7 @@ bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag) switch (ops->o_version) { case BD_OPS_CURRENT_VERSION: hdl->h_ops.o_free_space = ops->o_free_space; + hdl->h_ops.o_free_space_info = ops->o_free_space_info; /*FALLTHRU*/ case BD_OPS_VERSION_1: case BD_OPS_VERSION_0: diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c index b23dbfcfa3..1271796c62 100644 --- a/usr/src/uts/common/io/vioblk/vioblk.c +++ b/usr/src/uts/common/io/vioblk/vioblk.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com> - * Copyright 2019 Joyent Inc. + * Copyright 2020 Joyent Inc. * Copyright 2019 Western Digital Corporation. */ @@ -148,13 +148,6 @@ static const ddi_dma_attr_t vioblk_dma_attr = { .dma_attr_flags = 0 }; -/* - * Break up DISCARD requests into smaller pieces if the request exceeds - * the limits given by the host. 
If 0, requests may be truncated if - * they exceed the host limits. - */ -static int vioblk_split_discard = 0; - static vioblk_req_t * vioblk_req_alloc(vioblk_t *vib) { @@ -551,6 +544,16 @@ vioblk_bd_mediainfo(void *arg, bd_media_t *media) } static void +vioblk_bd_free_space_info(void *arg, bd_free_info_t *bfi) +{ + vioblk_t *vib = (void *)arg; + + bfi->bfi_max_seg = vib->vib_max_discard_seg; + bfi->bfi_max_sect = vib->vib_max_discard_sectors; + bfi->bfi_align = vib->vib_discard_sector_align; +} + +static void vioblk_get_id(vioblk_t *vib) { virtio_dma_t *dma; @@ -618,76 +621,35 @@ vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid) devid)); } -struct vioblk_freesp_arg { - vioblk_t *vfpa_vioblk; - bd_xfer_t *vfpa_xfer; -}; - static int -vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts, - boolean_t last, void *arg) +vioblk_bd_free_space(void *arg, bd_xfer_t *xfer) { - struct vioblk_freesp_arg *args = arg; - vioblk_t *vib = args->vfpa_vioblk; + const dkioc_free_list_t *dfl = xfer->x_dfl; + const dkioc_free_list_ext_t *exts = dfl->dfl_exts; + vioblk_t *vib = arg; virtio_dma_t *dma = NULL; virtio_chain_t *vic = NULL; vioblk_req_t *vbr = NULL; struct vioblk_discard_write_zeroes *wzp = NULL; - size_t i; int r = 0; - boolean_t polled; - - /* - * While rare, it's possible we might get called with a list of - * zero extents. In that case, just return success. - */ - if (n_exts == 0) { - if (last) { - bd_xfer_done(args->vfpa_xfer, 0); - args->vfpa_xfer = NULL; - } + boolean_t polled = DFL_ISSYNC(dfl) ? B_TRUE : B_FALSE; - return (0); - } - - /* - * If last is false, dfl_iter() had to segment or split the - * original DKIOCFREE request into multiple requests for us (if last - * is B_FALSE, it implies more extents are coming). While the blkdev - * framework handles implementing the necessary sync/not sync semantics - * for a DKIOCFREE request, that is in terms of the entire original - * request. 
If dfl_iter() had to break things up, we always treat - * the non-final (last == B_FALSE) calls to vioblk_free_extents() as - * synchronous (polled). - * - * The assumption is that if the host is placing limits on a - * DISCARD request, issuing multiple requests to the same device - * asynchronously is likely to have undesirable results (or else why - * wouldn't the host expose larger limits to prevent segmentation?), - * so we issue one at a time (at least until the final group). - * - * The vioblk_split_discard tunable can be set to 0 to disable this - * behavior -- in that case, any extents that exceed the host limits - * are just discarded. - * - * Unfortunately, there isn't currently a way to report partial - * results, so the choices are to fail the request if any extent in - * the request doesn't meet the device requirements or to break - * as much of the request as is possible - */ - polled = !!last; + /* XXX: This seems like it could be folded into vioblk_request() */ - dma = virtio_dma_alloc(vib->vib_virtio, n_exts * sizeof (*wzp), - &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP); + dma = virtio_dma_alloc(vib->vib_virtio, + dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr, + DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP); if (dma == NULL) return (ENOMEM); wzp = virtio_dma_va(dma, 0); - for (i = 0; i < n_exts; i++, exts++, wzp++) { + for (size_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) { + uint64_t start = dfl->dfl_offset + exts->dfle_start; + struct vioblk_discard_write_zeroes vdwz = { - .vdwz_sector = exts->dfle_start, - .vdwz_num_sectors = exts->dfle_length, + .vdwz_sector = start >> DEV_BSHIFT, + .vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT, .vdwz_flags = 0 }; @@ -714,51 +676,9 @@ vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts, return (ENOMEM); } - if (last) { - /* - * We attach xfer to the final vioblk request we submit. 
- * This will allow the vioblk_complete() to handle any - * notifications (e.g. a synchronous request) and - * dispose of xfer afterwards. - */ - vbr->vbr_xfer = args->vfpa_xfer; - args->vfpa_xfer = NULL; - } - + vbr->vbr_xfer = xfer; r = vioblk_common_submit(vib, vic); mutex_exit(&vib->vib_mutex); - return (r); -} - -static int -vioblk_bd_free_space(void *arg, bd_xfer_t *xfer) -{ - vioblk_t *vib = arg; - dkioc_free_align_t align = { - .dfa_bsize = DEV_BSIZE, - .dfa_max_ext = vib->vib_max_discard_seg, - .dfa_max_blocks = vib->vib_max_discard_sectors, - .dfa_align = vib->vib_discard_sector_align, - .dfa_gran = 1, - }; - struct vioblk_freesp_arg sp_arg = { - .vfpa_vioblk = vib, - .vfpa_xfer = xfer - }; - dkioc_free_list_t *dfl = xfer->x_dfl; - dkioc_iter_flags_t iter_flags = - (vioblk_split_discard == 0) ? DIF_NOSPLIT : DIF_NONE; - int r = 0; - - r = dfl_iter(dfl, &align, vioblk_free_exts, &sp_arg, KM_SLEEP, - iter_flags); - - /* - * If we didn't include xfer as part of the final request - * (sp_arg.vfpa_xfer is still set), we should be returning failure - * so that bd_sched() will free xfer. - */ - IMPLY(sp_arg.vfpa_xfer != NULL, r != 0); return (r); } @@ -1084,6 +1004,13 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * "o_sync_cache" member from the ops vector. As "bd_alloc_handle()" * makes a copy of the ops vector, we can safely assemble one on the * stack based on negotiated features. + * + * Similarly, the blkdev framework does not provide a way to indicate + * if a device supports a TRIM/UNMAP/DISCARD type operation except + * by omitting the "o_free_space" member from the ops vector. For + * consistency, we also omit the "o_free_space_info" member since it is + * only possibly used when a device specifies an "o_free_space" + * function. 
*/ bd_ops_t vioblk_bd_ops = { .o_version = BD_OPS_CURRENT_VERSION, @@ -1094,12 +1021,14 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) .o_read = vioblk_bd_read, .o_write = vioblk_bd_write, .o_free_space = vioblk_bd_free_space, + .o_free_space_info = vioblk_bd_free_space_info, }; if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) { vioblk_bd_ops.o_sync_cache = NULL; } if (!virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) { vioblk_bd_ops.o_free_space = NULL; + vioblk_bd_ops.o_free_space_info = NULL; } vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops, diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c index 421899c8de..2703e250fa 100644 --- a/usr/src/uts/common/os/dkioc_free_util.c +++ b/usr/src/uts/common/os/dkioc_free_util.c @@ -26,36 +26,13 @@ #include <sys/file.h> #include <sys/sdt.h> -struct ext_arg { - uint64_t ea_ext_cnt; - dfl_iter_fn_t ea_fn; - void *ea_arg; - dkioc_free_list_ext_t *ea_exts; - size_t ea_nreq; - dkioc_iter_flags_t ea_flags; -}; - -typedef int (*ext_iter_fn_t)(const dkioc_free_list_ext_t *, - boolean_t, void *); - -static int ext_iter(const dkioc_free_list_t *, const dkioc_free_align_t *, - uint_t, ext_iter_fn_t, void *); -static int ext_xlate(dkioc_free_list_ext_t *, uint64_t, uint64_t, uint64_t, - uint_t); -static int count_exts(const dkioc_free_list_ext_t *, boolean_t, void *); -static int process_exts(const dkioc_free_list_ext_t *, boolean_t, void *); - -#if __GNUC__ > 4 || __GNU_C_MINOR__ >= 8 -#define uadd64_overflow(a, b, c) __builtin_uaddl_overflow(a, b, c) -#else -static bool -uadd64_overflow(uint64_t a, uint64_t b, uint64_t *res) -{ - *res = a + b; - return ((*res < a || *res < b) ? 
true : false); -} -#endif - +static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *, + uint64_t len_blk); +static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *, + size_t, dfl_iter_fn_t, void *, int); +static int process_range(dkioc_free_list_t *, size_t, size_t, + dfl_iter_fn_t, void *, int); + /* * Copy-in convenience function for variable-length dkioc_free_list_t * structures. The pointer to be copied from is in `arg' (may be a pointer @@ -121,79 +98,65 @@ dfl_free(dkioc_free_list_t *dfl) * address stricter than the device block size. * * Since there is currently no mechanism for callers of DKIOCFREE to discover - * any alignment, segmentation, or size requirements for DKIOCFREE requests - * for a particular driver (or instance of a particular driver), dfl_iter() - * allows drivers to tranform the dkioc_free_list_t from a DKIOCFREE request - * into groups of dkioc_free_ext_ts that conform to the driver's alignment, - * segmentation, or size requirements. The transformation done by dfl_iter() - * may involve modifications such as splitting a list of extents into smaller - * groups, splitting extents into multiple smaller extents, increasing the - * start address of an extent to conform to alignments, or reducing the size - * of an extent so that the resulting size is a multiple of the device block - * size. In all instances, the resultant set is either identical to the - * original set of extents, or a subset -- that is we _never_ transform a - * a range into a range that exceeds the original boundaries of the original - * extents. + * such restrictions, instead of rejecting any requests that do not conform to + * some undiscoverable (to the caller) set of requirements, a driver can use + * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as + * required to conform to its requirements. 
* - * The transformed extents are grouped per the driver's requirements described - * by the constraints contained in the 'dfa' parameter, and the 'func' - * callback is invoked for each group of transformed extents. An optional - * opaque (to dfl_iter()) 'arg' parameter is passed through to 'func' as well. - * In addition, on the final group, the 'last' argument of 'func' is set - * to B_TRUE (for all other groups of extents passed to 'func', 'func' is - * called with 'last' set to B_FALSE). Indicating the final group of extents - * allows a driver to mark a request as complete or implement synchronous - * semantics as required. + * The original request is passed as 'dfl' and the alignment requirements + * are passed in 'dfi'. Additionally the size of the device (in units of + * blocks as described in dfi) is passed as len -- this allows a driver with + * multiple instances of different sizes but similar requirements (e.g. + * a partitioned blkdev device) to not construct a separate dkioc_free_info_t + * struct for each device. * - * Unfortunately, the DKIOCFREE ioctl provides no method for communicating - * any sort of partial completion -- either it returns success (0) or - * an error. As such, there's little benefit to providing more detailed - * error semantics beyond what DKIOCFREE can handle (if that ever changes, it - * would be worth revisiting this). As a result, we take a somewhat simplistic - * approach -- we stop processing the request on the first error encountered - * and return the error. Otherwise dfl_iter() returns 0. + * dfl_iter() always consumes the contents of 'dfl'. The caller should never + * free 'dfl' after calling dfl_iter(). Many drivers will queue free requests + * and then release the resources after the request completes (successfully + * or not) some time later, so always consuming dfl makes supporting this + * simpler for the caller. 
+ * + * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of + * arg passed to it as needed. 'func' can assume that the extents in + * dkioc_free_list_t passed to it should conform to the requirements in + * 'dfi', but should NOT assume that the dkioc_free_list_t instance passed to it + * is the same instance passed to dfl_iter(). While this may be the case in + * some instances (e.g. all the extents conform to the driver's requirements), + * dfl_iter() may allocate new dkioc_free_list_t instances as required. + * 'func' must always properly free the dkioc_free_list_t passed to it as + * appropriate (either via a callback after completion, upon error, etc.). * - * Note that transformed extents that result in a range too small to be - * processed by the driver (e.g. a 4k block size with a request to free - * starting at offset 512 and a length of 1024) aren't considered an error and - * are silently ignored. This means it is possible (though hopefully unlikely) - * a request to a driver may result in no freed extents. When this happens, - * 'func' is still called, but with a NULL list of extents, an extent count - * of 0, and with last set to B_TRUE to allow for cleanup (calling done - * routines, etc.). + * Unfortunately, the DKIOCFREE ioctl provides no method for communicating + * any notion of partial completion -- either it returns success (0) or + * an error. It's not clear if such a notion would even be possible while + * supporting multiple types of devices (NVMe, SCSI, etc.) with the same + * interface. As such, there's little benefit to providing more detailed error + * semantics beyond what DKIOCFREE can handle. * - * Currently no flags are defined, and should always be zero. + * Due to this, a somewhat simplistic approach is taken to error handling. The + * original list of extents is first checked to make sure they all appear + * valid -- that is they do not start or extend beyond the end of the device. 
+ * Any request that contains such extents is always rejected in its entirety. + * It is possible after applying any needed adjustments to the original list + * of extents that the result is not acceptable to the driver. For example, + * a device with a 512 byte block size that tries to free the range 513-1023 + * (bytes) would not be able to be processed. Such extents will be silently + * ignored. If the original request consists of nothing but such extents, + * dfl_iter() will never call 'func' and will merely return 0. */ int -dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa, - dfl_iter_fn_t func, void *arg, int kmflag, dkioc_iter_flags_t flags) +dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t len, + dfl_iter_fn_t func, void *arg, int kmflag) { - dkioc_free_list_ext_t *exts; - uint64_t n_exts = 0; - struct ext_arg earg = { 0 }; - uint_t bshift; + size_t n_blocks, n_segs, start_idx, i; int r = 0; - - if ((flags & ~(DIF_NONE|DIF_NOSPLIT)) != 0) - return (SET_ERROR(EINVAL)); - - /* Block size must be at least 1 and a power of two */ - if (dfa->dfa_bsize == 0 || !ISP2(dfa->dfa_bsize)) - return (SET_ERROR(EINVAL)); + boolean_t need_copy = B_FALSE; /* Offset alignment must also be at least 1 and a power of two */ - if (dfa->dfa_align == 0 || !ISP2(dfa->dfa_align)) - return (SET_ERROR(EINVAL)); - - /* Length granularity must be at least 1 and a power of two */ - if (dfa->dfa_gran == 0 || !ISP2(dfa->dfa_gran)) - return (SET_ERROR(EINVAL)); - - /* - * Since dfa_bsize != 0 (see above), ddi_ffsll() _must_ return a - * value > 1 - */ - bshift = ddi_ffsll((long long)dfa->dfa_bsize) - 1; + if (dfi->dfi_align == 0 || !ISP2(dfi->dfi_align)) { + r = SET_ERROR(EINVAL); + goto done; + } /* * If a limit on the total number of blocks is given, it must be @@ -202,261 +165,260 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa, * allow extent sizes at least 8 blocks long (otherwise there will be * device addresses that 
cannot be contained within an extent). */ - if (dfa->dfa_max_blocks > 0 && dfa->dfa_max_blocks < dfa->dfa_align) - return (SET_ERROR(EINVAL)); + if (dfi->dfi_max_blocks > 0 && dfi->dfi_max_blocks < dfi->dfi_align) { + r = SET_ERROR(EINVAL); + goto done; + } /* - * The general approach is that we walk the array of extents twice - * using ext_iter(). For each extent, ext_iter() will invoke the - * given callback function 0 or more times (based on the requirements - * in dfa), and then invoke the callback function with a NULL extent. - * - * This first walk is used to count the total number of extents - * after applying the driver requirements in 'dfa'. This may be - * different from the initial number of extents due to splitting - * extents or discarding extents that do not conform to alignment - * requirements (and may even be 0). + * The first pass, align everything as needed and make sure all the + * extents look valid. */ - r = ext_iter(dfl, dfa, bshift, count_exts, &n_exts); - if (r != 0) - return (r); + if ((r = adjust_exts(dfl, dfi, len)) != 0) { + goto done; + } /* - * It's possible that some extents do not conform to the alignment - * requirements, nor do they have a conforming subset. For example, - * a device with a block size of 512 bytes, and a starting alignment - * of 4096 bytes would not be able to free extent with a starting - * offset of 512 and a length of 1024. Such extents are ignored - * (we have no good way to report back partial results). While unlikely, - * it is possible a request consists of nothing but non-conforming - * extents. In this case, we invoke the callback with a NULL list - * of extents and with last set so it can perform any necessary - * cleanup, completion tasks. + * Go through and split things up as needed. The general idea is to + * split along the original extent boundaries when needed. 
We only + * split an extent from the original request into multiple extents + * if the original extent is by itself too big for the device to + * process in a single request. */ - if (n_exts == 0) - return (func(NULL, 0, B_TRUE, arg)); + start_idx = 0; + n_blocks = n_segs = 0; + for (i = 0; i < dfl->dfl_num_exts; i++) { + uint64_t start = dfl->dfl_offset + dfl->dfl_exts[i].dfle_start; + uint64_t end = start + dfl->dfl_exts[i].dfle_length; + size_t len_blk = (end - start) >> dfi->dfi_bshift; + + if (len_blk == 0) { + /* + * If we encounter a zero length extent, we're going + * to create a new copy of dfl no matter what -- + * the size of dfl is determined by dfl_num_exts so + * we cannot do things like shift the contents and + * reduce dfl_num_exts to get a contiguous array + * of non-zero length extents. + */ + need_copy = B_TRUE; + continue; + } - exts = kmem_zalloc(n_exts * sizeof (*exts), kmflag); - if (exts == NULL) - return (SET_ERROR(ENOMEM)); + if (n_blocks + len_blk > dfi->dfi_max_blocks) { + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } - earg.ea_ext_cnt = 0; - earg.ea_fn = func; - earg.ea_arg = arg; - earg.ea_exts = exts; - earg.ea_nreq = 0; - earg.ea_flags = flags; + if (len_blk < dfi->dfi_max_blocks) { + /* + * We've spilled over, but this block on its + * own is fine. Start the next range of + * blocks with this one and continue; + */ + start_idx = i; + n_segs = 1; + n_blocks = len_blk; + continue; + } + + /* + * Even after starting a new request, this extent + * is too big. Split it until it fits. 
+ */ + if ((r = split_extent(dfl, dfi, i, func, arg, + kmflag)) != 0) { + goto done; + } + + start_idx = i + 1; + n_segs = 0; + n_blocks = 0; + continue; + } + + if (n_segs + 1 > dfi->dfi_max_ext) { + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + start_idx = i; + n_segs = 0; + n_blocks = 0; + continue; + } + + n_segs++; + n_blocks += len_blk; + } /* - * We've allocated enough space to hold all the transformed extents - * in 'exts'. Now walk the original list of extents a second time - * and do the work. process_exts() will accumulate the transformed - * extents and invoke 'func' (the callback passed into dfl_iter()) to - * perform the free request with the accumulated extents, repeating - * as necessary. + * If a copy wasn't required, and we haven't processed a subset of + * the extents already, we can just use the original request. */ - r = ext_iter(dfl, dfa, bshift, process_exts, &earg); - kmem_free(exts, n_exts * sizeof (*exts)); - return (r); -} - -static int -count_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq __unused, - void *arg) -{ - size_t *np = arg; + if (!need_copy && start_idx == 0) { + return (func(dfl, arg)); + } - if (ext != NULL && ext->dfle_length > 0) - (*np)++; + r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag); - return (0); +done: + dfl_free(dfl); + return (r); } +/* + * Adjust the start and length of each extent in dfl so that it conforms to + * the requirements in dfi. It also verifies that no extent extends beyond + * the end of the device (given by len_blk). + * + * Returns 0 on success, or an error value. 
+ */ static int -process_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq, void *arg) +adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, + uint64_t len_blk) { - struct ext_arg *args = arg; - dkioc_free_list_ext_t *ext_list = args->ea_exts; + dkioc_free_list_ext_t *exts = dfl->dfl_exts; + size_t len = len_blk << dfi->dfi_bshift; + uint_t align = dfi->dfi_align << dfi->dfi_bshift; + uint_t bsize = (uint_t)1 << dfi->dfi_bshift; - if (ext == NULL) { + for (size_t i = 0; i < dfl->dfl_num_exts; i++, exts++) { /* - * The very last call should be with ext set to NULL to - * flush any accumulated extents since the last start of - * a new group. + * Since there are no known requirements on the value of + * dfl_offset, it's possible (though odd) to have a scenario + * where dfl_offset == 1, and dfle_start == 511 (resulting + * in an actual start offset of 512). As such, we always + * apply the offset and find the resulting starting offset + * and length (in bytes) first, then apply any rounding + * and alignment. */ - VERIFY(newreq); + uint64_t start = exts->dfle_start + dfl->dfl_offset; + uint64_t end = start + exts->dfle_length; /* - * A corner case -- we never had any extents that could - * be passed to the callback. Do a final call with the - * extent list as NULL (and a count of 0). + * Make sure after applying dfl->dfl_offset that the results + * don't overflow. */ - if (args->ea_ext_cnt == 0) - ext_list = NULL; - - args->ea_nreq++; + if (start < dfl->dfl_offset) { + return (SET_ERROR(EOVERFLOW)); + } - return (args->ea_fn(ext_list, args->ea_ext_cnt, B_TRUE, - args->ea_arg)); - } + if (end < start) { + return (SET_ERROR(EOVERFLOW)); + } - /* - * Starting a new request, and we have accumulated extents to - * flush. - */ - if (newreq && args->ea_ext_cnt > 0) { - int r; + /* + * Make sure we don't extend past the end of the device + * XXX: ENXIO instead? 
+ */ + if (end > len) { + return (SET_ERROR(ERANGE)); + } - args->ea_nreq++; + start = P2ROUNDUP(start, align); + end = P2ALIGN(end, bsize); - r = args->ea_fn(ext_list, args->ea_ext_cnt, B_FALSE, - args->ea_arg); - if (r != 0) - return (r); + ASSERT(IS_P2ALIGNED(end - start, bsize)); /* - * A bit simplistic, but we just keep appending to the - * original array allocated by dfl_iter(), but just update - * our starting position (args->ex_exts) for the next group. + * Remove the offset so that when it's later applied again, + * the correct start value is obtained. */ - args->ea_exts += args->ea_ext_cnt; - args->ea_ext_cnt = 0; + exts->dfle_start = start - dfl->dfl_offset; + exts->dfle_length = end - start; } - /* Skip any extents that end up with zero length after aligning. */ - if (ext->dfle_length > 0) - args->ea_exts[args->ea_ext_cnt++] = *ext; - return (0); } /* - * Translate the ext from byte-based units to units of - * (1 << bshift) sized blocks, with the start and length values adjusted to - * the align and gran values (align and gran are in units of bytes). - * - * Returns 0 on success, or an error value. + * Take a subset of extents from dfl (starting at start_idx, with n entries) + * and create a new dkioc_free_list_t, passing that to func. 
*/ static int -ext_xlate(dkioc_free_list_ext_t *ext, uint64_t offset, uint64_t align, - uint64_t gran, uint_t bshift) +process_range(dkioc_free_list_t *dfl, size_t start_idx, size_t n, + dfl_iter_fn_t func, void *arg, int kmflag) { - uint64_t start, end; + dkioc_free_list_t *new_dfl = NULL; + dkioc_free_list_ext_t *new_exts = NULL; + dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx; + size_t actual_len = n; + int r = 0; - if (uadd64_overflow(offset, ext->dfle_start, &start)) - return (SET_ERROR(EOVERFLOW)); + if (n == 0) { + return (0); + } - if (uadd64_overflow(start, ext->dfle_length, &end)) - return (SET_ERROR(EOVERFLOW)); - - start = P2ROUNDUP(start, align); - end = P2ALIGN(end, gran); + /* + * Ignore any zero length extents. No known devices attach any + * semantic meaning to such extents, and they are likely just a result of + * narrowing the range of the extent to fit the device alignment + * requirements. It is possible the original caller submitted a + * zero length extent, but we ignore those as well. Since we can't + * communicate partial results back to the caller anyway, it's + * unclear whether reporting that one of potentially many extents was + * too small (without being able to identify which one) to the caller + * of the DKIOCFREE request would be useful. + */ + for (size_t i = 0; i < n; i++) { + if (exts[i].dfle_length == 0 && --actual_len == 0) { + return (0); + } + } - ext->dfle_start = start >> bshift; - ext->dfle_length = (end > start) ? (end - start) >> bshift : 0; - return (0); + new_dfl = kmem_zalloc(DFL_SZ(actual_len), kmflag); + if (new_dfl == NULL) { + return (SET_ERROR(ENOMEM)); + } + + new_dfl->dfl_flags = dfl->dfl_flags; + new_dfl->dfl_num_exts = actual_len; + new_dfl->dfl_offset = dfl->dfl_offset; + new_exts = new_dfl->dfl_exts; + + for (size_t i = 0; i < n; i++) { + if (exts[i].dfle_length == 0) { + continue; + } + + *new_exts++ = exts[i]; + } + + return (func(new_dfl, arg)); } /* - * Iterate through the extents in dfl. 
fn is called for each adjusted extent - * (adjusting offsets and lengths to conform to the alignment requirements) - * and one input extent may result in 0, 1, or multiple calls to fn as a - * result. + * Split the extent at idx into multiple lists (calling func for each one). */ static int -ext_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa, - uint_t bshift, ext_iter_fn_t fn, void *arg) +split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, size_t idx, + dfl_iter_fn_t func, void *arg, int kmflag) { - const dkioc_free_list_ext_t *src = dfl->dfl_exts; - uint64_t n_exts = 0; - uint64_t n_blks = 0; - uint64_t align = dfa->dfa_align << bshift; - uint64_t gran = dfa->dfa_gran << bshift; - size_t i; - boolean_t newreq = B_TRUE; - - for (i = 0; i < dfl->dfl_num_exts; i++, src++) { - dkioc_free_list_ext_t ext = *src; - int r; - - r = ext_xlate(&ext, dfl->dfl_offset, align, gran, bshift); - if (r != 0) - return (r); + ASSERT3U(idx, <, dfl->dfl_num_exts); - while (ext.dfle_length > 0) { - dkioc_free_list_ext_t seg = ext; + const uint64_t amt = dfi->dfi_max_blocks << dfi->dfi_bshift; + dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx; + uint64_t len = ext->dfle_length; + int r; - if (dfa->dfa_max_ext > 0 && - n_exts + 1 > dfa->dfa_max_ext) { - /* - * Reached the max # of extents, start a new - * request, and retry. - */ - newreq = B_TRUE; - n_exts = 0; - n_blks = 0; - continue; - } - - if (dfa->dfa_max_blocks > 0 && - n_blks + seg.dfle_length > dfa->dfa_max_blocks) { - /* - * This extent puts us over the max # of - * blocks in a request. - */ - if (!newreq) { - /* - * If we haven't started a new request, - * start one, and retry as a new - * request in case it can fit on - * its own. If not, we'll skip - * this block and split it in the - * code below. - */ - newreq = B_TRUE; - n_exts = 0; - n_blks = 0; - continue; - } - - /* - * A new request, and the extent length is - * larger than our max. 
Reduce the length to - * the largest multiple of dfa_align - * equal to or less than dfa_max_blocks - * so the next starting address has the - * correct alignment, splitting the request. - */ - seg.dfle_length = P2ALIGN(dfa->dfa_max_blocks, - align); - - /* - * Our sanity checks on the alignment - * requirements mean we should be able to - * free at least part of the extent. - */ - ASSERT3U(seg.dfle_length, >, 0); - } - - r = fn(&seg, newreq, arg); - if (r != 0) - return (r); - - n_exts++; - n_blks += seg.dfle_length; - - ASSERT3U(ext.dfle_length, >=, seg.dfle_length); - - ext.dfle_length -= seg.dfle_length; - ext.dfle_start += seg.dfle_length; - newreq = B_FALSE; + /* + * Break the extent into as many single requests as needed. While it + * would be possible in some circumstances to combine the final chunk + * of the extent (after splitting) with the remaining extents in the + * original request, it's not clear there's much benefit from the + * added complexity. Such behavior could be added in the future if + * it's determined to be worthwhile. + */ + while (len > 0) { + ext->dfle_length = (len > amt) ? amt : len; + if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) { + return (r); } + ext->dfle_start += ext->dfle_length; } - /* - * Invoke the callback one last time w/ a NULL array of extents and - * newreq == B_TRUE to signal completion (and flush any accumulated - * extents). - */ - return (fn(NULL, B_TRUE, arg)); + return (0); } diff --git a/usr/src/uts/common/sys/blkdev.h b/usr/src/uts/common/sys/blkdev.h index f0336d2656..92a3807feb 100644 --- a/usr/src/uts/common/sys/blkdev.h +++ b/usr/src/uts/common/sys/blkdev.h @@ -23,7 +23,7 @@ * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Western Digital Corporation. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. 
*/ #ifndef _SYS_BLKDEV_H @@ -80,6 +80,7 @@ typedef struct bd_handle *bd_handle_t; typedef struct bd_xfer bd_xfer_t; typedef struct bd_drive bd_drive_t; typedef struct bd_media bd_media_t; +typedef struct bd_free_info bd_free_info_t; typedef struct bd_ops bd_ops_t; struct dkioc_free_list_s; @@ -149,6 +150,12 @@ struct bd_media { #define BD_INFO_FLAG_HOTPLUGGABLE (1U << 1) #define BD_INFO_FLAG_READ_ONLY (1U << 2) +struct bd_free_info { + uint64_t bfi_align; + uint64_t bfi_max_seg; + uint64_t bfi_max_sect; +}; + /* * If the API changes and we want to bump the version, add another * enum value, Eg BD_OPS_VERSION_1. BD_OPS_CURRENT_VERSION should always @@ -169,6 +176,7 @@ struct bd_ops { int (*o_read)(void *, bd_xfer_t *); int (*o_write)(void *, bd_xfer_t *); int (*o_free_space)(void *, bd_xfer_t *); + void (*o_free_space_info)(void *, bd_free_info_t *); }; struct bd_errstats { diff --git a/usr/src/uts/common/sys/dkioc_free_util.h b/usr/src/uts/common/sys/dkioc_free_util.h index 7028084bcd..ef9e9d8c9a 100644 --- a/usr/src/uts/common/sys/dkioc_free_util.h +++ b/usr/src/uts/common/sys/dkioc_free_util.h @@ -25,44 +25,34 @@ extern "C" { #define DFL_COPYIN_MAX_EXTS (1024 * 1024) -typedef struct dkioc_free_align { - /* Device block size in bytes. Must be > 0, and must be a power of 2 */ - size_t dfa_bsize; +#define DFL_ISSYNC(dfl) ((dfl)->dfl_flags & DF_WAIT_SYNC) + +typedef struct dkioc_free_info { + /* log2(block size) */ + size_t dfi_bshift; /* Maximum number of extents in a single request. 0 == no limit */ - size_t dfa_max_ext; + size_t dfi_max_ext; - /* - * Maximum number of blocks (in units of dfa_bsize) in a single request. - * 0 == no limit. - */ - size_t dfa_max_blocks; + /* Maximum number of blocks in a single request. 0 == no limit. */ + size_t dfi_max_blocks; /* - * Minimum alignment for extent offsets in units of blocks (dfa_bsize). - * etc). Must be > 0, and a power of two. 
- */ - size_t dfa_align; - - /* - * Minimum granularity for length in units of blocks (dfa_bsize). + * Minimum alignment for starting extent offsets in units of blocks. * Must be > 0, and a power of two. + * + * A possible future extension might be to also express a preferred + * alignment when splitting extents. */ - size_t dfa_gran; -} dkioc_free_align_t; - -typedef enum dkioc_iter_flags { - DIF_NONE = 0, - DIF_NOSPLIT = (1 << 1) -} dkioc_iter_flags_t; + size_t dfi_align; +} dkioc_free_info_t; -typedef int (*dfl_iter_fn_t)(const dkioc_free_list_ext_t *exts, size_t n_ext, - boolean_t last, void *arg); +typedef int (*dfl_iter_fn_t)(dkioc_free_list_t *dfl, void *arg); int dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags); void dfl_free(dkioc_free_list_t *dfl); -int dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *align, - dfl_iter_fn_t fn, void *arg, int kmflag, dkioc_iter_flags_t); +int dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t len, + dfl_iter_fn_t fn, void *arg, int kmflag); #ifdef __cplusplus } |