summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason King <jason.king@joyent.com>2020-04-02 01:27:24 +0000
committerJason King <jason.king@joyent.com>2020-04-09 18:44:26 +0000
commit716eac69f4d465c68cc31ebc0cf5bb342d010997 (patch)
tree7df091a99bf736161e9080f0d4b68ee8955b58c8
parent6d3858467073e8cf44088bbf936e6f0ed3098187 (diff)
downloadillumos-joyent-716eac69f4d465c68cc31ebc0cf5bb342d010997.tar.gz
Redo bits a bit, now working
-rw-r--r--usr/src/uts/common/io/vioblk/vioblk.c119
-rw-r--r--usr/src/uts/common/os/dkioc_free_util.c225
-rw-r--r--usr/src/uts/common/sys/dkioc_free_util.h17
3 files changed, 178 insertions, 183 deletions
diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c
index c15b262c87..b23dbfcfa3 100644
--- a/usr/src/uts/common/io/vioblk/vioblk.c
+++ b/usr/src/uts/common/io/vioblk/vioblk.c
@@ -638,14 +638,15 @@ vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts,
boolean_t polled;
/*
- * Corner case -- we had a DKIOCFREE request, but none of the
- * extents conformed to the host's alignment requirements. We
- * just complete the I/O with no actual work done.
+ * While rare, it's possible we might get called with a list of
+ * zero extents. In that case, just return success.
*/
if (n_exts == 0) {
- VERIFY(last);
- VERIFY3P(exts, ==, NULL);
- bd_xfer_done(args->vfpa_xfer, 0);
+ if (last) {
+ bd_xfer_done(args->vfpa_xfer, 0);
+ args->vfpa_xfer = NULL;
+ }
+
return (0);
}
@@ -656,34 +657,23 @@ vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts,
* framework handles implementing the necessary sync/not sync semantics
* for a DKIOCFREE request, that is in terms of the entire original
* request. If dfl_iter() had to break things up, we always treat
- * the non-final calls to vioblk_free_extents() as synchronous (polled).
+ * the non-final (last == B_FALSE) calls to vioblk_free_extents() as
+ * synchronous (polled).
*
- * This is a conscious choice -- we could always just issue the
- * resulting requests and let the host deal with it without waiting for
- * the results, but since it's common for TRIM/UNMAP/DELETE requests
- * to physical devices to be slow enough that users such as zfs go to
- * great lengths to control the rate at which it issues them, it
- * seems reasonable to at minimum issue the broken up bits of the
- * original request one at a time (until the final request) to avoid
- * causing problems.
+ * The assumption is that if the host is placing limits on a
+ * DISCARD request, issuing multiple requests to the same device
+ * asynchronously is likely to have undesirable results (or else why
+ * wouldn't the host expose larger limits to prevent segmentation?),
+ * so we issue one at a time (at least until the final group).
*
- * Ideally, we'd have a way for DKIOCFREE issuers (e.g. zfs) to
- * discover the underlying device requirements so that it always issues
- * requests that don't require any special handling by the driver,
- * however, that's not possible today, so we're faced with either
- * ignoring a non-conforming request (too big, too many extents, etc.)
- * completely, only processing as much of the request as possible
- * in a single request to the host (with no way to inform the caller),
- * or try to honor the request by issuing multiple requests to the
- * host.
+ * The vioblk_split_discard tunable can be set to 0 to disable this
+ * behavior -- in that case, any extents that exceed the host limits
+ * are just discarded.
*
- * We've currently opted for the third option. By far the single
- * most common user of this is almost certainly going to be zfs
- * and zfs today will only issue a single DKIOCFREE request with
- * a single extent at a time, so this seems reasonable for the
- * initial implementation. Nothing here should prevent us from
- * implementing alternate strategies in the future if the need
- * arises.
+ * Unfortunately, there isn't currently a way to report partial
+ * results, so the choices are to fail the request if any extent in
+ * the request doesn't meet the device requirements or to break
+ * as much of the request as is possible
*/
polled = !!last;
@@ -696,8 +686,8 @@ vioblk_free_exts(const dkioc_free_list_ext_t *exts, size_t n_exts,
for (i = 0; i < n_exts; i++, exts++, wzp++) {
struct vioblk_discard_write_zeroes vdwz = {
- .vdwz_sector = exts->dfle_start >> DEV_BSHIFT,
- .vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
+ .vdwz_sector = exts->dfle_start,
+ .vdwz_num_sectors = exts->dfle_length,
.vdwz_flags = 0
};
@@ -744,60 +734,31 @@ static int
vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
{
vioblk_t *vib = arg;
+ dkioc_free_align_t align = {
+ .dfa_bsize = DEV_BSIZE,
+ .dfa_max_ext = vib->vib_max_discard_seg,
+ .dfa_max_blocks = vib->vib_max_discard_sectors,
+ .dfa_align = vib->vib_discard_sector_align,
+ .dfa_gran = 1,
+ };
struct vioblk_freesp_arg sp_arg = {
.vfpa_vioblk = vib,
.vfpa_xfer = xfer
};
dkioc_free_list_t *dfl = xfer->x_dfl;
+ dkioc_iter_flags_t iter_flags =
+ (vioblk_split_discard == 0) ? DIF_NOSPLIT : DIF_NONE;
int r = 0;
- if (dfl->dfl_num_exts == 0) {
- bd_xfer_done(xfer, 0);
- return (0);
- }
+ r = dfl_iter(dfl, &align, vioblk_free_exts, &sp_arg, KM_SLEEP,
+ iter_flags);
- if (vioblk_split_discard) {
- dkioc_free_align_t align = {
- .dfa_bsize = DEV_BSIZE,
- .dfa_max_ext = vib->vib_max_discard_seg,
- .dfa_max_blocks = vib->vib_max_discard_sectors,
- .dfa_align = vib->vib_discard_sector_align * DEV_BSIZE
- };
-
- r = dfl_iter(dfl, &align, vioblk_free_exts, &sp_arg,
- KM_SLEEP, 0);
-
- /*
- * If we didn't include xfer as part of the final request, we
- * should return failure so that bd_sched() will free xfer.
- */
- if (sp_arg.vfpa_xfer != NULL)
- VERIFY3S(r, !=, 0);
- } else {
- uint64_t start = dfl->dfl_offset + dfl->dfl_exts[0].dfle_start;
- uint64_t end = start + dfl->dfl_exts[0].dfle_length;
- uint64_t align = vib->vib_discard_sector_align * DEV_BSIZE;
- uint64_t maxend = start +
- vib->vib_max_discard_sectors * DEV_BSIZE;
-
- if (end > maxend)
- end = maxend;
-
- start = P2ROUNDUP(start, align);
- end = P2ALIGN(end, DEV_BSIZE);
-
- if (start == end) {
- bd_xfer_done(xfer, 0);
- return (0);
- }
-
- dkioc_free_list_ext_t ext = {
- .dfle_start = start,
- .dfle_length = end - start
- };
-
- r = vioblk_free_exts(&ext, 1, B_TRUE, &sp_arg);
- }
+ /*
+ * If we didn't include xfer as part of the final request
+ * (sp_arg.vfpa_xfer is still set), we should be returning failure
+ * so that bd_sched() will free xfer.
+ */
+ IMPLY(sp_arg.vfpa_xfer != NULL, r != 0);
return (r);
}
diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c
index f4da8538bf..421899c8de 100644
--- a/usr/src/uts/common/os/dkioc_free_util.c
+++ b/usr/src/uts/common/os/dkioc_free_util.c
@@ -27,30 +27,35 @@
#include <sys/sdt.h>
struct ext_arg {
- uint64_t ea_ext_cnt;
- dfl_iter_fn_t ea_fn;
- void *ea_arg;
- dkioc_free_list_ext_t *ea_exts;
+ uint64_t ea_ext_cnt;
+ dfl_iter_fn_t ea_fn;
+ void *ea_arg;
+ dkioc_free_list_ext_t *ea_exts;
+ size_t ea_nreq;
+ dkioc_iter_flags_t ea_flags;
};
-typedef enum ext_iter_flags {
- EIF_NONE = 0,
- EIF_FIRST = (1 << 0),
- EIF_LAST = (1 << 1),
- EIF_NEWREQ = (1 << 2)
-} ext_iter_flags_t;
-
typedef int (*ext_iter_fn_t)(const dkioc_free_list_ext_t *,
- ext_iter_flags_t, void *);
+ boolean_t, void *);
static int ext_iter(const dkioc_free_list_t *, const dkioc_free_align_t *,
uint_t, ext_iter_fn_t, void *);
-static int ext_xlate(const dkioc_free_list_t *, const dkioc_free_list_ext_t *,
- const dkioc_free_align_t *, uint64_t *, uint64_t *);
-static int count_exts(const dkioc_free_list_ext_t *, ext_iter_flags_t, void *);
-static int process_exts(const dkioc_free_list_ext_t *, ext_iter_flags_t,
- void *);
-
+static int ext_xlate(dkioc_free_list_ext_t *, uint64_t, uint64_t, uint64_t,
+ uint_t);
+static int count_exts(const dkioc_free_list_ext_t *, boolean_t, void *);
+static int process_exts(const dkioc_free_list_ext_t *, boolean_t, void *);
+
+#if __GNUC__ > 4 || __GNU_C_MINOR__ >= 8
+#define uadd64_overflow(a, b, c) __builtin_uaddl_overflow(a, b, c)
+#else
+static bool
+uadd64_overflow(uint64_t a, uint64_t b, uint64_t *res)
+{
+ *res = a + b;
+ return ((*res < a || *res < b) ? true : false);
+}
+#endif
+
/*
* Copy-in convenience function for variable-length dkioc_free_list_t
* structures. The pointer to be copied from is in `arg' (may be a pointer
@@ -161,7 +166,7 @@ dfl_free(dkioc_free_list_t *dfl)
*/
int
dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
- dfl_iter_fn_t func, void *arg, int kmflag, uint32_t dfl_flag)
+ dfl_iter_fn_t func, void *arg, int kmflag, dkioc_iter_flags_t flags)
{
dkioc_free_list_ext_t *exts;
uint64_t n_exts = 0;
@@ -169,7 +174,7 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
uint_t bshift;
int r = 0;
- if (dfl_flag != 0)
+ if ((flags & ~(DIF_NONE|DIF_NOSPLIT)) != 0)
return (SET_ERROR(EINVAL));
/* Block size must be at least 1 and a power of two */
@@ -180,8 +185,8 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
if (dfa->dfa_align == 0 || !ISP2(dfa->dfa_align))
return (SET_ERROR(EINVAL));
- /* The offset alignment must be at least as large as the block size */
- if (dfa->dfa_align < dfa->dfa_bsize)
+ /* Length granularity must be at least 1 and a power of two */
+ if (dfa->dfa_gran == 0 || !ISP2(dfa->dfa_gran))
return (SET_ERROR(EINVAL));
/*
@@ -194,11 +199,10 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
* If a limit on the total number of blocks is given, it must be
* greater than the offset alignment. E.g. if the block size is 512
* bytes and the offset alignment is 4096 (8 blocks), the device must
- * allow extent sizes at least 8 blocks long (otherwise it is not
- * possible to free the entire device).
+ * allow extent sizes at least 8 blocks long (otherwise there will be
+ * device addresses that cannot be contained within an extent).
*/
- if (dfa->dfa_max_blocks > 0 &&
- (dfa->dfa_max_blocks << bshift) < dfa->dfa_align)
+ if (dfa->dfa_max_blocks > 0 && dfa->dfa_max_blocks < dfa->dfa_align)
return (SET_ERROR(EINVAL));
/*
@@ -220,15 +224,17 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
/*
* It's possible that some extents do not conform to the alignment
* requirements, nor do they have a conforming subset. For example,
- * with a minimum alignment of 8 blocks, an extent starting at
- * offset 2 and a length of 5 is such a case. Since there is no way
- * to report partial results, such extents are silently skipped.
- * It is then possible that a request could consist of nothing but
- * ineligible extents, and so such a request is also silently
- * ignored.
+ * a device with a block size of 512 bytes, and a starting alignment
+ * of 4096 bytes would not be able to free extent with a starting
+ * offset of 512 and a length of 1024. Such extents are ignored
+ * (we have no good way to report back partial results). While unlikely,
+ * it is possible a request consists of nothing but non-conforming
+ * extents. In this case, we invoke the callback with a NULL list
+ * of extents and with last set so it can perform any necessary
+ * cleanup, completion tasks.
*/
if (n_exts == 0)
- return (0);
+ return (func(NULL, 0, B_TRUE, arg));
exts = kmem_zalloc(n_exts * sizeof (*exts), kmflag);
if (exts == NULL)
@@ -238,6 +244,8 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
earg.ea_fn = func;
earg.ea_arg = arg;
earg.ea_exts = exts;
+ earg.ea_nreq = 0;
+ earg.ea_flags = flags;
/*
* We've allocated enough space to hold all the transformed extents
@@ -253,34 +261,30 @@ dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
}
static int
-count_exts(const dkioc_free_list_ext_t *ext, ext_iter_flags_t flags __unused,
+count_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq __unused,
void *arg)
{
size_t *np = arg;
- if (ext != NULL)
+ if (ext != NULL && ext->dfle_length > 0)
(*np)++;
return (0);
}
static int
-process_exts(const dkioc_free_list_ext_t *ext, ext_iter_flags_t flags,
- void *arg)
+process_exts(const dkioc_free_list_ext_t *ext, boolean_t newreq, void *arg)
{
struct ext_arg *args = arg;
dkioc_free_list_ext_t *ext_list = args->ea_exts;
- boolean_t flush = B_FALSE;
- boolean_t last = ((flags & EIF_LAST) != 0) ? B_TRUE : B_FALSE;
- if (last) {
+ if (ext == NULL) {
/*
* The very last call should be with ext set to NULL to
* flush any accumulated extents since the last start of
* a new group.
*/
- VERIFY3P(ext, ==, NULL);
- flush = B_TRUE;
+ VERIFY(newreq);
/*
* A corner case -- we never had any extents that could
@@ -289,22 +293,24 @@ process_exts(const dkioc_free_list_ext_t *ext, ext_iter_flags_t flags,
*/
if (args->ea_ext_cnt == 0)
ext_list = NULL;
- } else if ((flags & EIF_NEWREQ) != 0 && (flags & EIF_FIRST) == 0) {
- /*
- * A new request that isn't the very first request will flush
- * what we've accumulated (and then start accumulating for
- * the next grouping). We should always have at least one
- * extent to flush at this point -- the 'no extents'
- * case was handled earlier.
- */
- VERIFY3U(args->ea_ext_cnt, >, 0);
- flush = B_TRUE;
+
+ args->ea_nreq++;
+
+ return (args->ea_fn(ext_list, args->ea_ext_cnt, B_TRUE,
+ args->ea_arg));
}
- if (flush) {
+ /*
+ * Starting a new request, and we have accumulated extents to
+ * flush.
+ */
+ if (newreq && args->ea_ext_cnt > 0) {
int r;
- r = args->ea_fn(ext_list, args->ea_ext_cnt, last, args->ea_arg);
+ args->ea_nreq++;
+
+ r = args->ea_fn(ext_list, args->ea_ext_cnt, B_FALSE,
+ args->ea_arg);
if (r != 0)
return (r);
@@ -317,33 +323,37 @@ process_exts(const dkioc_free_list_ext_t *ext, ext_iter_flags_t flags,
args->ea_ext_cnt = 0;
}
- if (ext != NULL)
+ /* Skip any extents that end up with zero length after aligning. */
+ if (ext->dfle_length > 0)
args->ea_exts[args->ea_ext_cnt++] = *ext;
+
return (0);
}
/*
- * Set *startp and *lengthp to the actual _byte_ offset and lengths
- * after adjusting for dfl->dfl_offset, the starting alignment, and
- * the block size requirements.
+ * Translate the ext from byte-based units to units of
+ * (1 << bshift) sized blocks, with the start and length values adjusted to
+ * the align and gran values (align and gran are in units of bytes).
+ *
+ * Returns 0 on success, or an error value.
*/
static int
-ext_xlate(const dkioc_free_list_t *dfl, const dkioc_free_list_ext_t *ext,
- const dkioc_free_align_t *dfa, uint64_t *startp, uint64_t *lengthp)
+ext_xlate(dkioc_free_list_ext_t *ext, uint64_t offset, uint64_t align,
+ uint64_t gran, uint_t bshift)
{
- uint64_t start = dfl->dfl_offset + ext->dfle_start;
- uint64_t end = start + ext->dfle_length;
+ uint64_t start, end;
- if (start < dfl->dfl_offset || start < ext->dfle_start)
- return (SET_ERROR(EOVERFLOW));
- if (end < start || end < ext->dfle_length)
+ if (uadd64_overflow(offset, ext->dfle_start, &start))
return (SET_ERROR(EOVERFLOW));
- start = P2ROUNDUP(start, dfa->dfa_align);
- end = P2ALIGN(end, dfa->dfa_bsize);
+ if (uadd64_overflow(start, ext->dfle_length, &end))
+ return (SET_ERROR(EOVERFLOW));
+
+ start = P2ROUNDUP(start, align);
+ end = P2ALIGN(end, gran);
- *startp = start;
- *lengthp = (end > start) ? end - start : 0;
+ ext->dfle_start = start >> bshift;
+ ext->dfle_length = (end > start) ? (end - start) >> bshift : 0;
return (0);
}
@@ -357,50 +367,55 @@ static int
ext_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
uint_t bshift, ext_iter_fn_t fn, void *arg)
{
- const dkioc_free_list_ext_t *ext;
+ const dkioc_free_list_ext_t *src = dfl->dfl_exts;
uint64_t n_exts = 0;
- uint64_t n_blk = 0;
+ uint64_t n_blks = 0;
+ uint64_t align = dfa->dfa_align << bshift;
+ uint64_t gran = dfa->dfa_gran << bshift;
size_t i;
- ext_iter_flags_t flags = EIF_FIRST|EIF_NEWREQ;
+ boolean_t newreq = B_TRUE;
- for (i = 0, ext = dfl->dfl_exts; i < dfl->dfl_num_exts; i++, ext++) {
- uint64_t start, length;
+ for (i = 0; i < dfl->dfl_num_exts; i++, src++) {
+ dkioc_free_list_ext_t ext = *src;
int r;
- r = ext_xlate(dfl, ext, dfa, &start, &length);
+ r = ext_xlate(&ext, dfl->dfl_offset, align, gran, bshift);
if (r != 0)
return (r);
- while (length > 0) {
- dkioc_free_list_ext_t blk_ext = {
- .dfle_start = start,
- .dfle_length = length
- };
- uint64_t len_blk = length >> bshift;
+ while (ext.dfle_length > 0) {
+ dkioc_free_list_ext_t seg = ext;
if (dfa->dfa_max_ext > 0 &&
n_exts + 1 > dfa->dfa_max_ext) {
/*
* Reached the max # of extents, start a new
- * request and retry.
+ * request, and retry.
*/
- flags |= EIF_NEWREQ;
+ newreq = B_TRUE;
n_exts = 0;
- n_blk = 0;
+ n_blks = 0;
continue;
}
if (dfa->dfa_max_blocks > 0 &&
- n_blk + len_blk > dfa->dfa_max_blocks) {
+ n_blks + seg.dfle_length > dfa->dfa_max_blocks) {
/*
* This extent puts us over the max # of
- * blocks in a request. If this isn't a
- * new request, start a new request,
+ * blocks in a request.
*/
- if ((flags & EIF_NEWREQ) == 0) {
- flags |= EIF_NEWREQ;
+ if (!newreq) {
+ /*
+ * If we haven't started a new request,
+ * start one, and retry as a new
+ * request in case it can fit on
+ * its own. If not, we'll skip
+ * this block and split it in the
+ * code below.
+ */
+ newreq = B_TRUE;
n_exts = 0;
- n_blk = 0;
+ n_blks = 0;
continue;
}
@@ -410,30 +425,38 @@ ext_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *dfa,
* the largest multiple of dfa_align
* equal to or less than dfa_max_blocks
* so the next starting address has the
- * correct alignment.
+ * correct alignment, splitting the request.
+ */
+ seg.dfle_length = P2ALIGN(dfa->dfa_max_blocks,
+ align);
+
+ /*
+ * Our sanity checks on the alignment
+ * requirements mean we should be able to
+ * free at least part of the extent.
*/
- blk_ext.dfle_length =
- P2ALIGN(dfa->dfa_max_blocks,
- dfa->dfa_align);
+ ASSERT3U(seg.dfle_length, >, 0);
}
- r = fn(&blk_ext, flags, arg);
+ r = fn(&seg, newreq, arg);
if (r != 0)
return (r);
n_exts++;
- n_blk += len_blk;
+ n_blks += seg.dfle_length;
- length -= blk_ext.dfle_length;
- start += blk_ext.dfle_length;
+ ASSERT3U(ext.dfle_length, >=, seg.dfle_length);
- flags = EIF_NONE;
+ ext.dfle_length -= seg.dfle_length;
+ ext.dfle_start += seg.dfle_length;
+ newreq = B_FALSE;
}
}
/*
* Invoke the callback one last time w/ a NULL array of extents and
- * newreq == B_TRUE to signal completion.
+ * newreq == B_TRUE to signal completion (and flush any accumulated
+ * extents).
*/
- return (fn(NULL, flags|EIF_LAST, arg));
+ return (fn(NULL, B_TRUE, arg));
}
diff --git a/usr/src/uts/common/sys/dkioc_free_util.h b/usr/src/uts/common/sys/dkioc_free_util.h
index a6a43ef717..7028084bcd 100644
--- a/usr/src/uts/common/sys/dkioc_free_util.h
+++ b/usr/src/uts/common/sys/dkioc_free_util.h
@@ -39,19 +39,30 @@ typedef struct dkioc_free_align {
size_t dfa_max_blocks;
/*
- * Minimum alignment for extent offsets in bytes (e.g. 512, 4096,
- * etc). Must be >= dfa_bsize, and must be a power of 2.
+ * Minimum alignment for extent offsets in units of blocks (dfa_bsize).
+ * etc). Must be > 0, and a power of two.
*/
size_t dfa_align;
+
+ /*
+ * Minimum granularity for length in units of blocks (dfa_bsize).
+ * Must be > 0, and a power of two.
+ */
+ size_t dfa_gran;
} dkioc_free_align_t;
+typedef enum dkioc_iter_flags {
+ DIF_NONE = 0,
+ DIF_NOSPLIT = (1 << 1)
+} dkioc_iter_flags_t;
+
typedef int (*dfl_iter_fn_t)(const dkioc_free_list_ext_t *exts, size_t n_ext,
boolean_t last, void *arg);
int dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags);
void dfl_free(dkioc_free_list_t *dfl);
int dfl_iter(const dkioc_free_list_t *dfl, const dkioc_free_align_t *align,
- dfl_iter_fn_t fn, void *arg, int kmflag, uint32_t flags);
+ dfl_iter_fn_t fn, void *arg, int kmflag, dkioc_iter_flags_t);
#ifdef __cplusplus
}