diff options
author | Jason King <jason.king@joyent.com> | 2019-12-16 22:51:44 +0000 |
---|---|---|
committer | Jason King <jason.king@joyent.com> | 2020-04-09 18:44:25 +0000 |
commit | 8039bb9bfdd139f72275fafaa7280ee016d8e548 (patch) | |
tree | 60b43d7032f8efc61eec24fabbfff92bbbf66361 | |
parent | daeb6daf8c6eff1b8397f306c722ef02afdff012 (diff) | |
download | illumos-joyent-8039bb9bfdd139f72275fafaa7280ee016d8e548.tar.gz |
vioblk discard support (wip)
-rw-r--r-- | usr/src/uts/common/io/vioblk/vioblk.c | 114 | ||||
-rw-r--r-- | usr/src/uts/common/io/vioblk/vioblk.h | 64 | ||||
-rw-r--r-- | usr/src/uts/common/os/dkioc_free_util.c | 290 | ||||
-rw-r--r-- | usr/src/uts/common/sys/dkioc_free_util.h | 13 |
4 files changed, 468 insertions, 13 deletions
diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c
index b9459e1d9e..fd5b4541b8 100644
--- a/usr/src/uts/common/io/vioblk/vioblk.c
+++ b/usr/src/uts/common/io/vioblk/vioblk.c
@@ -87,6 +87,7 @@
 #include <sys/containerof.h>
 #include <sys/ctype.h>
 #include <sys/sysmacros.h>
+#include <sys/dkioc_free_util.h>
 
 #include "virtio.h"
 #include "vioblk.h"
@@ -612,6 +613,105 @@ vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
 	    devid));
 }
 
+/*
+ * Context handed to vioblk_free_exts() through dfl_iter().  vfpa_xfer is set
+ * to NULL once the xfer has been attached to a request.
+ */
+struct vioblk_freesp_arg {
+	vioblk_t *vfpa_vioblk;
+	bd_xfer_t *vfpa_xfer;
+};
+
+/*
+ * dfl_iter() callback: issue one VIRTIO_BLK_T_DISCARD request covering the
+ * n_exts extents in exts.  One struct vioblk_discard_write_zeroes is built
+ * per extent in a newly allocated DMA buffer, which is appended to the
+ * chain as device-readable data.  When last is set, the caller's xfer is
+ * attached to the request.  Returns 0 or an errno value.
+ *
+ * NOTE(review): nothing visible here releases the DMA buffer once the request
+ * has been submitted successfully -- confirm the completion path owns it.
+ */
+static int
+vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts,
+    boolean_t last, void *arg)
+{
+	struct vioblk_freesp_arg *args = arg;
+	vioblk_t *vib = args->vfpa_vioblk;
+	virtio_dma_t *dma = NULL;
+	virtio_chain_t *vic = NULL;
+	vioblk_req_t *vbr = NULL;
+	struct vioblk_discard_write_zeroes *wzp = NULL;
+	size_t i;
+	int r = 0;
+
+	dma = virtio_dma_alloc(vib->vib_virtio, n_exts * sizeof (*wzp),
+	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
+	if (dma == NULL)
+		return (ENOMEM);
+
+	wzp = virtio_dma_va(dma, 0);
+
+	for (i = 0; i < n_exts; i++, exts++, wzp++) {
+		struct vioblk_discard_write_zeroes vdwz = {
+			.vdwz_sector = exts->dfle_start,
+			.vdwz_num_sectors = exts->dfle_length,
+		};
+
+		bcopy(&vdwz, wzp, sizeof (*wzp));
+	}
+
+	mutex_enter(&vib->vib_mutex);
+
+	vic = vioblk_common_start(vib, VIRTIO_BLK_T_DISCARD, 0, B_FALSE);
+	if (vic == NULL) {
+		mutex_exit(&vib->vib_mutex);
+		virtio_dma_free(dma);
+		return (ENOMEM);
+	}
+
+	vbr = virtio_chain_data(vic);
+	if (virtio_chain_append(vic,
+	    virtio_dma_cookie_pa(dma, 0),
+	    virtio_dma_cookie_size(dma, 0),
+	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
+		vioblk_req_free(vib, vbr);
+		virtio_chain_free(vic);
+		mutex_exit(&vib->vib_mutex);
+		/* The buffer never reached the device; don't leak it. */
+		virtio_dma_free(dma);
+		return (ENOMEM);
+	}
+
+	if (last) {
+		/*
+		 * We attach xfer to the final vioblk request we submit.
+		 * This will allow the vioblk_complete() to handle any
+		 * notifications (e.g. a synchronous request) and
+		 * dispose of xfer afterwards.
+		 */
+		vbr->vbr_xfer = args->vfpa_xfer;
+		args->vfpa_xfer = NULL;
+	}
+
+	r = vioblk_common_submit(vib, vic);
+	mutex_exit(&vib->vib_mutex);
+	return (r);
+}
+
+/*
+ * Installed as the blkdev o_free_space operation.  Describes the device's
+ * discard limits to dfl_iter(), which calls vioblk_free_exts() once per
+ * request it builds from xfer->x_dfl.
+ */
+static int
+vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
+{
+	vioblk_t *vib = arg;
+	dkioc_free_align_t align = {
+		.dfa_bsize = DEV_BSIZE,
+		.dfa_max_ext = vib->vib_max_discard_seg,
+		.dfa_max_blocks = vib->vib_max_discard_sectors,
+		/*
+		 * The virtio specification gives the discard alignment in
+		 * sectors, while dfl_iter() applies dfa_align to byte offsets
+		 * and rejects values that are zero or below dfa_bsize.
+		 */
+		.dfa_align = (size_t)MAX(vib->vib_discard_sector_align, 1) *
+		    DEV_BSIZE
+	};
+	struct vioblk_freesp_arg sp_arg = {
+		.vfpa_vioblk = vib,
+		.vfpa_xfer = xfer
+	};
+	int r = dfl_iter(xfer->x_dfl, &align, vioblk_free_exts, &sp_arg,
+	    KM_SLEEP, 0);
+
+	/*
+	 * If xfer was not handed to the final request it must be completed
+	 * here.  That covers failures, and also the case where dfl_iter()
+	 * succeeds without calling back at all because no extent survived
+	 * alignment, so r may legitimately be 0.
+	 *
+	 * NOTE(review): confirm blkdev does not also dispose of xfer when a
+	 * non-zero value is returned.
+	 */
+	if (sp_arg.vfpa_xfer != NULL)
+		bd_xfer_done(sp_arg.vfpa_xfer, r);
+
+	return (r);
+}
+
 /*
  * As the device completes processing of a request, it returns the chain for
  * that request to our I/O queue.  This routine is called in two contexts:
This routine is called in two contexts: @@ -804,6 +904,15 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } } + if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) { + vib->vib_max_discard_sectors = virtio_dev_get32(vio, + VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT); + vib->vib_max_discard_seg = virtio_dev_get32(vio, + VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG); + vib->vib_discard_sector_align = virtio_dev_get32(vio, + VIRTIO_BLK_CONFIG_DISCARD_ALIGN); + } + /* * When allocating the request queue, we include two additional * descriptors (beyond those required for request data) to account for @@ -933,11 +1042,14 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) .o_sync_cache = vioblk_bd_flush, .o_read = vioblk_bd_read, .o_write = vioblk_bd_write, - .o_free_space = NULL, + .o_free_space = vioblk_bd_free_space, }; if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) { vioblk_bd_ops.o_sync_cache = NULL; } + if (!virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) { + vioblk_bd_ops.o_free_space = NULL; + } vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops, &vib->vib_bd_dma_attr, KM_SLEEP); diff --git a/usr/src/uts/common/io/vioblk/vioblk.h b/usr/src/uts/common/io/vioblk/vioblk.h index e08fc31e8f..24303c193b 100644 --- a/usr/src/uts/common/io/vioblk/vioblk.h +++ b/usr/src/uts/common/io/vioblk/vioblk.h @@ -32,17 +32,26 @@ extern "C" { * These are offsets into the device-specific configuration space available * through the virtio_dev_*() family of functions. 
*/ -#define VIRTIO_BLK_CONFIG_CAPACITY 0x00 /* 64 R */ -#define VIRTIO_BLK_CONFIG_SIZE_MAX 0x08 /* 32 R */ -#define VIRTIO_BLK_CONFIG_SEG_MAX 0x0C /* 32 R */ -#define VIRTIO_BLK_CONFIG_GEOMETRY_C 0x10 /* 16 R */ -#define VIRTIO_BLK_CONFIG_GEOMETRY_H 0x12 /* 8 R */ -#define VIRTIO_BLK_CONFIG_GEOMETRY_S 0x13 /* 8 R */ -#define VIRTIO_BLK_CONFIG_BLK_SIZE 0x14 /* 32 R */ -#define VIRTIO_BLK_CONFIG_TOPO_PBEXP 0x18 /* 8 R */ -#define VIRTIO_BLK_CONFIG_TOPO_ALIGN 0x19 /* 8 R */ -#define VIRTIO_BLK_CONFIG_TOPO_MIN_SZ 0x1A /* 16 R */ -#define VIRTIO_BLK_CONFIG_TOPO_OPT_SZ 0x1C /* 32 R */ +#define VIRTIO_BLK_CONFIG_CAPACITY 0x00 /* 64 R */ +#define VIRTIO_BLK_CONFIG_SIZE_MAX 0x08 /* 32 R */ +#define VIRTIO_BLK_CONFIG_SEG_MAX 0x0C /* 32 R */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_C 0x10 /* 16 R */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_H 0x12 /* 8 R */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_S 0x13 /* 8 R */ +#define VIRTIO_BLK_CONFIG_BLK_SIZE 0x14 /* 32 R */ +#define VIRTIO_BLK_CONFIG_TOPO_PBEXP 0x18 /* 8 R */ +#define VIRTIO_BLK_CONFIG_TOPO_ALIGN 0x19 /* 8 R */ +#define VIRTIO_BLK_CONFIG_TOPO_MIN_SZ 0x1A /* 16 R */ +#define VIRTIO_BLK_CONFIG_TOPO_OPT_SZ 0x1C /* 32 R */ +#define VIRTIO_BLK_CONFIG_WRITEBACK 0x20 /* 8 R */ + /* unused 0x21 8 R */ +#define VIRTIO_BLK_CONFIG_NUM_QUEUES 0x22 /* 16 R */ +#define VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT 0x24 /* 32 R */ +#define VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG 0x28 /* 32 R */ +#define VIRTIO_BLK_CONFIG_DISCARD_ALIGN 0x2C /* 32 R */ +#define VIRTIO_BLK_CONFIG_MAX_WRITE_ZERO_SECT 0x30 /* 32 R */ +#define VIRTIO_BLK_CONFIG_MAX_WRITE_ZERO_SEG 0x34 /* 32 R */ +#define VIRTIO_BLK_CONFIG_WRITE_ZERO_UNMAP 0x38 /* 8 R */ /* * VIRTIO BLOCK VIRTQUEUES @@ -64,6 +73,10 @@ extern "C" { #define VIRTIO_BLK_F_SCSI (1ULL << 7) #define VIRTIO_BLK_F_FLUSH (1ULL << 9) #define VIRTIO_BLK_F_TOPOLOGY (1ULL << 10) +#define VIRTIO_BLK_F_CONFIG_WCE (1ULL << 11) +#define VIRTIO_BLK_F_MQ (1ULL << 12) +#define VIRTIO_BLK_F_DISCARD (1ULL << 13) +#define 
VIRTIO_BLK_F_WRITE_ZEROES (1ULL << 14) /* * These features are supported by the driver and we will request them from the @@ -74,7 +87,8 @@ extern "C" { VIRTIO_BLK_F_FLUSH | \ VIRTIO_BLK_F_TOPOLOGY | \ VIRTIO_BLK_F_SEG_MAX | \ - VIRTIO_BLK_F_SIZE_MAX) + VIRTIO_BLK_F_SIZE_MAX | \ + VIRTIO_BLK_F_DISCARD) /* * VIRTIO BLOCK REQUEST HEADER @@ -102,9 +116,31 @@ struct vioblk_req_hdr { #define VIRTIO_BLK_T_FLUSH 4 #define VIRTIO_BLK_T_FLUSH_OUT 5 #define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_DISCARD 11 +#define VIRTIO_BLK_T_WRITE_ZEROES 13 #define VIRTIO_BLK_T_BARRIER 0x80000000 /* + * VIRTIO BLOCK DISCARD/WRITE ZEROS DATA + * + * For hosts that support the DISCARD or WRITE ZEROES features, instead of + * data, the vioblk_discard_write_zeros struct is used as the 'data' for + * the request. + */ +struct vioblk_discard_write_zeroes { + uint64_t vdwz_sector; + uint32_t vdwz_num_sectors; + uint32_t vdwz_flags; +} __packed; + +/* + * vdwz_flags values + */ + +/* For a WRITE ZEROES request, also unmap the block */ +#define VIRTIO_BLK_WRITE_ZEROS_UNMAP (1U << 0) + +/* * The GET_ID command type does not appear in the specification, but * implementations in the wild use a 20 byte buffer into which the device will * write an ASCII string. The string should not be assumed to be @@ -200,6 +236,10 @@ typedef struct vioblk { uint_t vib_seg_max; uint_t vib_seg_size_max; + uint_t vib_max_discard_sectors; /* WO */ + uint_t vib_max_discard_seg; /* WO */ + uint_t vib_discard_sector_align; /* WO */ + boolean_t vib_devid_fetched; char vib_devid[VIRTIO_BLK_ID_BYTES + 1]; uint8_t vib_rawid[VIRTIO_BLK_ID_BYTES]; diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c index 85470f7e28..2dfb4289d4 100644 --- a/usr/src/uts/common/os/dkioc_free_util.c +++ b/usr/src/uts/common/os/dkioc_free_util.c @@ -11,6 +11,7 @@ /* * Copyright 2017 Nexenta Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ /* needed when building libzpool */ @@ -25,6 +26,23 @@ #include <sys/file.h> #include <sys/sdt.h> +struct ext_arg { + uint64_t ea_ext_cnt; + dfl_iter_fn_t ea_fn; + void *ea_arg; + dkioc_free_list_ext_t *ea_exts; +}; + +typedef int (*ext_iter_fn_t)(const dkioc_free_list_ext_t *, + boolean_t, void *); + +static int ext_iter(const dkioc_free_list_t *, const dkioc_free_align_t *, + uint_t, ext_iter_fn_t, void *); +static int ext_xlate(const dkioc_free_list_t *, const dkioc_free_list_ext_t *, + const dkioc_free_align_t *, uint_t, uint64_t *, uint64_t *); +static int count_exts(const dkioc_free_list_ext_t *, boolean_t, void *); +static int process_exts(const dkioc_free_list_ext_t *, boolean_t, void *); + /* * Copy-in convenience function for variable-length dkioc_free_list_t * structures. The pointer to be copied from is in `arg' (may be a pointer @@ -78,3 +96,275 @@ dfl_free(dkioc_free_list_t *dfl) { kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); } + +/* + * Convenience function to iterate through the array of extents in dfl while + * respecting segmentation and alignment of the extents. + * + * Some devices that implement DKIOCFREE (e.g. nvme and vioblk) have limits + * on either the number of extents that can be submitted in a single request, + * or the total number of blocks that can be submitted in a single request. + * In addition, devices may have alignment requirements on the starting + * address stricter than the device block size. + * + * Since there is currently no way for callers of DKIOCFREE to discover + * any alignment or segmentation requirements, the driver itself may choose + * to adjust the actual extent start and length that is freed (never freeing + * outside the original unmodified extent boundaries), split extents into + * multiple smaller extents, or split a single request into multiple requests + * to the underlying hardware. dfl_iter() frees the driver from having to + * deal with such complexity/tedium. 
+ * + * The original request is passed in dfl and the alignment requirements are + * given in dkfa. dfl_iter() will do the necessary adjustments and then + * call func with an array of extents, number of extents, as well as a flag + * that is set upon the last invocation of func for the original request, as + * well as the void * arg passed to dfl_iter(). + * + * func should return 0 on success or an error value. An error may result + * in partial completion of the request, sorry. + * + * Currently no flags are defined, and should always be zero. + */ +int +dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa, + dfl_iter_fn_t func, void *arg, int kmflag, uint32_t dfl_flag) +{ + dkioc_free_list_ext_t *exts; + uint64_t n_exts = 0; + struct ext_arg earg = { 0 }; + uint_t bshift; + int r = 0; + + if (dfl_flag != 0) + return (SET_ERROR(EINVAL)); + + /* Block size must be at least 1 and a power of two */ + if (dfa->dfa_bsize == 0 || !ISP2(dfa->dba_bsize)) + return (SET_ERROR(EINVAL)); + + /* Offset alignment must also be at least 1 and a power of two */ + if (dfa->dfa_align == 0 || !ISP2(dfa->dfa_align)) + return (SET_ERROR(EINVAL)); + + /* The offset alignment must be at least as large as the block size */ + if (dfa->dfa_align < dfa->dfa_bsize) + return (SET_ERROR(EINVAL)); + + /* Since dfa_bsize != 0, ddi_ffsll() _must_ return a value > 1 */ + bshift = ddi_ffsll((long long)dfa->dfa_bsize) - 1; + + /* + * If a limit on the total number of blocks is given, it must be + * greater than the offset alignment. E.g. if the block size is 512 + * bytes, the offset alignment is 4096 (8 blocks), the device must + * allow extent sizes at least 8 blocks long (otherwise it is not + * possible to free the entire device). + */ + if (dfa->dfa_max_blocks > 0 && + (dfa->dfa_max_blocks >> bshift) < dfa->dfa_align) + return (SET_ERROR(EINVAL)); + + /* + * Determine the total number of extents needed. 
Due to alignment + * and segmentation requirements, this may be different than + * the initial number of segments. + */ + r = ext_iter(dfl, dfa, bshift, count_exts, &n_exts); + if (r != 0) + return (r); + + /* + * It's possible that some extents do not conform to the alignment + * requirements, nor do they have a conforming subset. For example, + * with a minimum alignment of 8 blocks, an extent starting at + * offset 2 and a length of 5 is such a case. Since there is no way + * to report partial results, such extents are silently skipped. + * It is then possible that a request could consist of nothing but + * ineligible extents, and so such a request is also silently + * ignored. + */ + if (n_exts == 0) + return (0); + + n_exts = earg.ea_ext_cnt; + exts = kmem_zalloc(n_exts * sizeof (*exts), kmflag); + if (exts == NULL) + return (SET_ERROR(EOVERFLOW)); + + earg.ea_ext_cnt = n_exts; + earg.ea_fn = func; + earg.ea_arg = arg; + earg.ea_exts = exts; + + /* + * Run through all the extents, calling func as the limits for + * each request are reached. The final request remains queued + * when ext_iter() returns. + */ + r = ext_iter(dfl, dfa, bshift, process_exts, &earg); + if (r != 0) + goto done; + + /* Process the final request */ + r = process_exts(NULL, B_TRUE, &earg); + +done: + kmem_free(exts, n_exts * sizeof (*exts)); + return (r); +} + +static int +count_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq __unused, + void *arg) +{ + size_t *np = arg; + + (*np)++; + return (0); +} + +static int +process_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq, void *arg) +{ + struct ext_arg *args = arg; + + if (newreq && args->ea_ext_cnt > 0) { + /* + * A new request, and are extents from the previous request + * ready to dispatch. + */ + int r; + boolean_t last = (ext == NULL) ? 
B_TRUE : B_FALSE; + + r = args->ea_fn(args->ea_exts, args->ea_ext_cnt, last, + args->ea_arg); + + if (r != 0) + return (r); + + args->ea_exts += args->ea_ext_cnt; + args->ea_ext_cnt = 0; + + /* + * After the last request, we are called with a NULL ext + * and a new request to process the final request. + */ + if (ext == NULL) + return (0); + } + + args->ea_exts[args->ea_ext_cnt++] = *ext; + return (0); +} + +/* + * Translate the byte offset and lengths in ext into block offsets and + * lengths, with the offset aligned per dfla. + */ +static int +ext_xlate(const dkioc_free_list_t *dfl, const dkioc_free_list_ext_t *ext, + const dkioc_free_align_t *dfa, uint_t bshift, uint64_t *startp, + uint64_t *lengthp) +{ + uint64_t start = dfl->dfl_offset + ext->dfle_start; + uint64_t end = start + ext->dfle_length; + + if (start < dfl->dfl_offset || start < ext->dfle_start) + return (SET_ERROR(EOVERFLOW)); + if (end < start || end < ext->dfle_length) + return (SET_ERROR(EOVERFLOW)); + + start = P2ROUNDUP(start, dfa->dfa_align) >> bshift; + end = P2ALIGN(end, dfa->dfa_bsize) >> bshift; + + *startp = start; + *lengthp = (end > start) ? end - start : 0; + return (0); +} + +/* + * Iterate through the extents in dfl. fn is called for each adjusted extent + * (adjusting offsets and lengths to conform to the alignment requirements) + * and one input extent may result in 0, 1, or multiple calls to fn as a + * result. 
+ *
+ * The boolean passed to fn is B_TRUE for the first extent of every request
+ * (including the very first) and B_FALSE otherwise. Returns 0, or the first
+ * non-zero value returned by ext_xlate() or fn.
+ */
+static int
+ext_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
+    uint_t bshift, ext_iter_fn_t fn, void *arg)
+{
+	const dkioc_free_list_ext_t *ext;
+	uint64_t n_exts = 0;	/* extents placed in the current request */
+	uint64_t n_blk = 0;	/* blocks placed in the current request */
+	size_t i;
+	boolean_t newreq = B_TRUE;
+
+	for (i = 0, ext = dfl->dfl_exts; i < dfl->dfl_num_exts; i++, ext++) {
+		uint64_t start, length;
+		int r;
+
+		r = ext_xlate(dfl, ext, dfa, bshift, &start, &length);
+		if (r != 0)
+			return (r);
+
+		/* A zero length means nothing conforming is left to emit */
+		while (length > 0) {
+			dkioc_free_list_ext_t blk_ext = {
+				.dfle_start = start,
+				.dfle_length = length
+			};
+
+			if (dfa->dfa_max_ext > 0 &&
+			    n_exts + 1 > dfa->dfa_max_ext) {
+				/*
+				 * Reached the max # of extents, start a new
+				 * request. The same extent is then
+				 * re-evaluated against the empty request.
+				 */
+				newreq = B_TRUE;
+				n_exts = 0;
+				n_blk = 0;
+				continue;
+			}
+
+			if (dfa->dfa_max_blocks > 0 &&
+			    n_blk + length > dfa->dfa_max_blocks) {
+				/*
+				 * This extent puts us over the max # of
+				 * blocks in a request. If this isn't a
+				 * new request, start a new request.
+				 */
+				if (!newreq) {
+					newreq = B_TRUE;
+					n_exts = 0;
+					n_blk = 0;
+					continue;
+				}
+
+				/*
+				 * A new request, and the extent length is
+				 * larger than our max. Reduce the length to
+				 * the largest multiple of dfa_align
+				 * equal to or less than dfa_max_blocks
+				 * so the next starting address has the
+				 * correct alignment.
+				 */
+				blk_ext.dfle_length =
+				    P2ALIGN(dfa->dfa_max_blocks,
+				    dfa->dfa_align >> bshift);
+			}
+
+			r = fn(&blk_ext, newreq, arg);
+			if (r != 0)
+				return (r);
+
+			newreq = B_FALSE;
+
+			n_exts++;
+			n_blk += blk_ext.dfle_length;
+
+			/* Advance past the part of the extent just emitted */
+			length -= blk_ext.dfle_length;
+			start += blk_ext.dfle_length;
+		}
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/common/sys/dkioc_free_util.h b/usr/src/uts/common/sys/dkioc_free_util.h
index 9e83ab3bff..42b16cd152 100644
--- a/usr/src/uts/common/sys/dkioc_free_util.h
+++ b/usr/src/uts/common/sys/dkioc_free_util.h
@@ -11,6 +11,7 @@
 /*
  * Copyright 2017 Nexenta Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
 */
 
 #ifndef _SYS_DKIOC_FREE_UTIL_H
@@ -24,8 +25,20 @@ extern "C" {
 
 #define DFL_COPYIN_MAX_EXTS (1024 * 1024)
 
+/*
+ * Device limits given to dfl_iter(). dfa_bsize and dfa_align are in bytes,
+ * must be non-zero powers of two, and dfa_align must be >= dfa_bsize.
+ * A value of 0 for dfa_max_ext or dfa_max_blocks means no limit.
+ */
+typedef struct dkioc_free_align {
+	size_t dfa_bsize; /* device block size in bytes */
+	size_t dfa_max_ext; /* max # of extents in a single req */
+	size_t dfa_max_blocks; /* max # of blocks in a single req */
+	size_t dfa_align; /* alignment for starting addresses, in bytes */
+} dkioc_free_align_t;
+
+/*
+ * Callback invoked by dfl_iter() once per request: exts holds n_ext extents,
+ * last is set on the final call for the original request, and arg is the
+ * pointer given to dfl_iter(). Return 0 on success or an error value.
+ */
+typedef int (*dfl_iter_fn_t)(const dkioc_free_list_ext_t *exts, size_t n_ext,
+    boolean_t last, void *arg);
+
 int dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags);
 void dfl_free(dkioc_free_list_t *dfl);
+/* Split dfl into requests that honor align and pass each one to fn. */
+int dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *align,
+    dfl_iter_fn_t fn, void *arg, int kmflag, uint32_t flags);
 
 #ifdef __cplusplus
 } |