1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
|
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright 2021 Tintri by DDN, Inc. All rights reserved.
* Copyright 2020 Joyent, Inc.
*/
/* needed when building libzpool */
#ifndef _KERNEL
#include <sys/zfs_context.h>
#endif
#include <sys/sunddi.h>
#include <sys/dkio.h>
#include <sys/dkioc_free_util.h>
#include <sys/sysmacros.h>
#include <sys/file.h>
#include <sys/sdt.h>
/*
 * Forward declarations for the file-local helpers used by dfl_iter().
 * The parameter names mirror the definitions further down in this file.
 */
static int adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t max_off);
static int split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t idx, dfl_iter_fn_t func, void *arg, int kmflag);
static int process_range(dkioc_free_list_t *dfl, uint64_t start_idx,
    uint64_t n, dfl_iter_fn_t func, void *arg, int kmflag);
/*
 * Duplicate a variable-length dkioc_free_list_t into a newly allocated
 * buffer.
 *
 * `arg' points at the list to duplicate (it may be a pointer to userspace).
 * When FKIOCTL is set in `ddi_flags' the list is read directly and copied
 * with bcopy(); otherwise it is fetched with ddi_copyin(). `kmflags' is
 * handed to kmem_alloc() when allocating the new buffer. On success the
 * new buffer is stored in `*out'.
 *
 * Returns 0 on success, or an errno on failure:
 *	EINVAL	dfl_num_exts is 0 or exceeds DFL_COPYIN_MAX_EXTS
 *	ENOMEM	the allocation failed
 *	EFAULT	ddi_copyin() failed, or dfl_num_exts differed between the
 *		initial read of the count and the copy of the whole list
 */
int
dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags)
{
	const boolean_t in_kernel = ((ddi_flags & FKIOCTL) != 0) ?
	    B_TRUE : B_FALSE;
	dkioc_free_list_t *copy;
	uint64_t count;
	size_t sz;

	/* Step 1: learn how many extents the caller claims to have. */
	if (in_kernel) {
		const dkioc_free_list_t *src = arg;

		count = src->dfl_num_exts;
	} else {
		uint8_t *field = (uint8_t *)arg +
		    offsetof(dkioc_free_list_t, dfl_num_exts);

		if (ddi_copyin(field, &count, sizeof (count), ddi_flags) != 0)
			return (SET_ERROR(EFAULT));
	}

	/* Step 2: bound the count before it is used to size an allocation. */
	if (count == 0 || count > DFL_COPYIN_MAX_EXTS)
		return (SET_ERROR(EINVAL));

	sz = DFL_SZ(count);
	copy = kmem_alloc(sz, kmflags);
	if (copy == NULL)
		return (SET_ERROR(ENOMEM));

	/* Step 3: pull in the header and all of the extents. */
	if (in_kernel) {
		bcopy(arg, copy, sz);
	} else if (ddi_copyin(arg, copy, sz, ddi_flags) != 0 ||
	    copy->dfl_num_exts != count) {
		/*
		 * Either the copy failed or the count seen in the full copy
		 * no longer matches the one the buffer was sized with.
		 */
		kmem_free(copy, sz);
		return (SET_ERROR(EFAULT));
	}

	*out = copy;
	return (0);
}
/*
 * Release a variable-length dkioc_free_list_t. The size handed to
 * kmem_free() is derived from the list's own dfl_num_exts.
 */
void
dfl_free(dkioc_free_list_t *dfl)
{
	const size_t sz = DFL_SZ(dfl->dfl_num_exts);

	kmem_free(dfl, sz);
}
/*
* Convenience function to resize and segment the array of extents in
* a DKIOCFREE request as required by a driver.
*
* Some devices that implement DKIOCFREE (e.g. vioblk) have limits
* on either the number of extents that can be submitted in a single request,
* or the total number of blocks that can be submitted in a single request.
* In addition, devices may have alignment requirements on the starting
* address stricter than the device block size.
*
* Since there is currently no mechanism for callers of DKIOCFREE to discover
* such restrictions, instead of rejecting any requests that do not conform to
* some undiscoverable (to the caller) set of requirements, a driver can use
* dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
* required to conform to its requirements.
*
* The original request is passed as 'dfl' and the alignment requirements
* are passed in 'dfi'. Additionally the maximum offset of the device allowed
* (in bytes) is passed as max_off -- this allows a driver with
* multiple instances of different sizes but similar requirements (e.g.
* a partitioned blkdev device) to not construct a separate dkioc_free_info_t
* struct for each device.
*
* dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
* arg passed to it as needed. If the extents in the dkioc_free_list_t passed
* to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
* be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
* do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
* instances and populate them with the adjusted extents that do conform to the
* requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
* passed to it when this occurs. The net result is that 'func' can always
* assume it will be called with a dkioc_free_list_t with extents that
* comply with the requirements in 'dfi'. 'func' is also responsible for
* freeing the dkioc_free_list_t passed to it (likely via a completion
* callback).
*
* Combined with the behavior described above, dfl_iter() can be viewed as
* consuming the dkioc_free_list_t passed to it. Either it will pass it along
* to 'func' (and let 'func' handle freeing it), or it will free it and
* allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
* letting 'func' handle freeing the new instances). This way neither the
* dfl_iter() caller nor the driver need to worry about treating
* conforming and non-conforming requests differently.
*
* Unfortunately, the DKIOCFREE ioctl provides no method for communicating
* any notion of partial completion -- either it returns success (0) or
* an error. It's not clear if such a notion would even be possible while
* supporting multiple types of devices (NVMe, SCSI, etc.) with the same
* interface. As such, there's little benefit to providing more detailed error
* semantics beyond what DKIOCFREE can handle.
*
* Due to this, a somewhat simplistic approach is taken to error handling. The
* original list of extents is first checked to make sure they all appear
* valid -- that is they do not start or extend beyond the end of the device.
* Any request that contains such extents is always rejected in its entirety.
* It is possible after applying any needed adjustments to the original list
* of extents that the result is not acceptable to the driver. For example,
* a device with a 512 byte block size that tries to free the range 513-1023
* (bytes) would not be able to be processed. Such extents will be silently
* ignored. If the original request consists of nothing but such requests,
* dfl_iter() will never call 'func' and will merely return 0.
*/
int
dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_ext_t *ext;
	uint64_t n_bytes, n_segs, start_idx, i;
	uint_t bsize;
	int r = 0;
	boolean_t need_copy = B_FALSE;

	/*
	 * Make sure the block size derived from dfi_bshift is at least 512
	 * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is
	 * to prevent any problems with other parts of the system that might
	 * assume a minimum block size of 512, and the upper bound is just
	 * to prevent overflow when creating the block size from dfi_bshift
	 * (though it seems unlikely we'll have _block_ sizes near a GiB
	 * any time soon).
	 */
	if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * Only form the block size once the shift count has been validated;
	 * shifting a 32-bit value by 32 or more is undefined.
	 */
	bsize = 1U << dfi->dfi_bshift;

	/* Max bytes must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Start offset alignment must also be a multiple of the block size */
	if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Max bytes in an extent must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * It makes no sense to allow a single extent to be larger than the
	 * total allowed for an entire request.
	 */
	if (dfi->dfi_max_ext_bytes > 0 &&
	    dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * The first pass, align everything as needed and make sure all the
	 * extents look valid.
	 */
	if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
		goto done;
	}

	/*
	 * Go through and split things up as needed. The general idea is to
	 * split along the original extent boundaries when needed. We only
	 * split an extent from the original request into multiple extents
	 * if the original extent is by itself too big for the device to
	 * process in a single request.
	 *
	 * [start_idx, i) is the run of extents accumulated for the next
	 * request; n_segs and n_bytes track its non-empty extent count and
	 * its total length.
	 */
	start_idx = 0;
	n_bytes = n_segs = 0;
	ext = dfl->dfl_exts;
	for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
		uint64_t len = ext->dfle_length;

		if (len == 0) {
			/*
			 * If we encounter a zero length extent, we're going
			 * to create a new copy of dfl no matter what --
			 * the size of dfl is determined by dfl_num_exts so
			 * we cannot do things like shift the contents and
			 * reduce dfl_num_exts to get a contiguous array
			 * of non-zero length extents.
			 */
			need_copy = B_TRUE;
			continue;
		}

		if (dfi->dfi_max_ext_bytes > 0 &&
		    len > dfi->dfi_max_ext_bytes) {
			/*
			 * An extent that's too large. Dispatch what we've
			 * accumulated, and then split this extent into
			 * smaller ones the device can accept.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}

			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_bytes > 0 &&
		    n_bytes + len > dfi->dfi_max_bytes) {
			/*
			 * This extent would put us over the limit for total
			 * bytes that can be trimmed in one request.
			 * Dispatch what we've accumulated. Then deal
			 * with this extent.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if (len < dfi->dfi_max_bytes) {
				/*
				 * After dispatching what we've accumulated,
				 * this extent can fit in a new request.
				 * Just add it to the accumulated list of
				 * extents and move on.
				 */
				start_idx = i;
				n_segs = 1;
				n_bytes = len;
				continue;
			}

			/*
			 * Even after starting a new request, this extent
			 * is too big. Split it until it fits.
			 */
			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}

			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
			/*
			 * This extent will put us over the limit on the number
			 * of extents the device can accept. Dispatch what
			 * we've accumulated so far.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			start_idx = i;
			n_segs = 1;
			n_bytes = len;
			continue;
		}

		n_segs++;
		n_bytes += len;
	}

	/*
	 * If a copy wasn't required, and we haven't processed a subset of
	 * the extents already, we can just use the original request. In
	 * that case dfl is handed to func and is not freed here.
	 */
	if (!need_copy && start_idx == 0) {
		return (func(dfl, arg, kmflag));
	}

	/* Dispatch whatever is still accumulated at the tail of the list. */
	r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);

done:
	/*
	 * Every path reaching here either failed or dispatched newly
	 * allocated lists, so the original list is released.
	 */
	dfl_free(dfl);
	return (r);
}
/*
 * Adjust the start and length of each extent in dfl so that it conforms to
 * the requirements in dfi. It also verifies that no extent extends beyond
 * the end of the device (given by max_off, compared against the extent's
 * end offset in bytes).
 *
 * Each extent's start is rounded up to dfi_align and its end is rounded
 * down to the block size (1 << dfi_bshift); an extent that collapses as a
 * result is left with a length of zero. The extents are modified in place.
 *
 * Returns 0 on success, EOVERFLOW if applying dfl_offset, the length or the
 * alignment would wrap a 64-bit offset, or EINVAL if an extent ends beyond
 * max_off.
 */
static int
adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t max_off)
{
	dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	/*
	 * These must be uint64_t to prevent the P2 macros from truncating
	 * the result.
	 */
	const uint64_t align = dfi->dfi_align;
	const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
		/*
		 * Since there are no known requirements on the value of
		 * dfl_offset, it's possible (though odd) to have a scenario
		 * where dfl_offset == 1, and dfle_start == 511 (resulting
		 * in an actual start offset of 512). As such, we always
		 * apply the offset and find the resulting starting offset
		 * and length (in bytes) first, then apply any rounding
		 * and alignment.
		 */
		uint64_t start = exts->dfle_start + dfl->dfl_offset;
		uint64_t end = start + exts->dfle_length;

		/*
		 * Make sure after applying dfl->dfl_offset and any alignment
		 * adjustments that the results don't overflow. The only
		 * adjustment that moves an offset upwards is rounding start
		 * up to 'align', so that is the headroom that must exist
		 * (the end is only ever rounded down).
		 */
		if (start < dfl->dfl_offset || start > (UINT64_MAX - align)) {
			return (SET_ERROR(EOVERFLOW));
		}

		if (end < start) {
			return (SET_ERROR(EOVERFLOW));
		}

		/*
		 * Make sure we don't extend past the end of the device
		 */
		if (end > max_off) {
			return (SET_ERROR(EINVAL));
		}

		start = P2ROUNDUP(start, align);
		end = P2ALIGN(end, bsize);

		/*
		 * Remove the offset so that when it's later applied again,
		 * the correct start value is obtained.
		 */
		exts->dfle_start = start - dfl->dfl_offset;

		/*
		 * If the original length was less than the block size
		 * of the device, we can end up with end < start. If that
		 * happens we just set the length to zero.
		 */
		exts->dfle_length = (end < start) ? 0 : end - start;
	}

	return (0);
}
/*
 * Take a subset of extents from dfl (starting at start_idx, with n entries)
 * and create a new dkioc_free_list_t, passing that to func.
 *
 * Zero length extents in the range are dropped. If nothing is left (or n is
 * 0), func is not called and 0 is returned. The new list carries over the
 * dfl_flags and dfl_offset of dfl; it is handed to func and is not freed
 * here. dfl itself is left untouched.
 *
 * Returns ENOMEM if the new list cannot be allocated, otherwise whatever
 * func returns.
 */
static int
process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
	dkioc_free_list_t *new_dfl;
	dkioc_free_list_ext_t *new_exts;
	size_t actual_n = 0;

	/*
	 * Ignore any zero length extents. No known devices attach any
	 * semantic meaning to such extents, and are likely just a result of
	 * narrowing the range of the extent to fit the device alignment
	 * requirements. It is possible the original caller submitted a
	 * zero length extent, but we ignore those as well. Since we can't
	 * communicate partial results back to the caller anyway, it's
	 * unclear whether reporting that one of potentially many extents was
	 * too small (without being able to identify which one) to the caller
	 * of the DKIOCFREE request would be useful.
	 */
	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length != 0) {
			actual_n++;
		}
	}

	/* Covers both n == 0 and a range made up solely of empty extents. */
	if (actual_n == 0) {
		return (0);
	}

	new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
	if (new_dfl == NULL) {
		return (SET_ERROR(ENOMEM));
	}

	new_dfl->dfl_flags = dfl->dfl_flags;
	new_dfl->dfl_num_exts = actual_n;
	new_dfl->dfl_offset = dfl->dfl_offset;
	new_exts = new_dfl->dfl_exts;

	/* Pack the non-empty extents contiguously into the new list. */
	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0) {
			continue;
		}

		*new_exts++ = exts[i];
	}

	return (func(new_dfl, arg, kmflag));
}
/*
 * Largest length a single dispatched segment may have: dfi_max_ext_bytes
 * when it is set (> 0), else dfi_max_bytes when that is set, else
 * UINT64_MAX. Note that the argument is evaluated more than once.
 */
#define	MAX_SEGLEN(dfi)							\
	(((dfi)->dfi_max_ext_bytes > 0) ?				\
	    (dfi)->dfi_max_ext_bytes :					\
	    (((dfi)->dfi_max_bytes > 0) ?				\
	    (dfi)->dfi_max_bytes : UINT64_MAX))
/*
 * Split the extent at idx into multiple lists (calling func for each one).
 *
 * Each piece is at most MAX_SEGLEN(dfi) bytes long, and every piece except
 * the last is shortened so that the following piece starts on a dfi_align
 * boundary. The extent at idx is modified in place while iterating (its
 * start and length are advanced piece by piece), so its original values
 * are not preserved.
 *
 * Returns 0 on success, the first non-zero value returned by
 * process_range(), or EINVAL if no aligned piece of non-zero length fits
 * within the maximum segment length.
 */
static int
split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	ASSERT3U(idx, <, dfl->dfl_num_exts);

	const uint64_t maxlen = MAX_SEGLEN(dfi);
	dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx;
	uint64_t remain = ext->dfle_length;
	int r;

	/*
	 * Break the extent into as many single requests as needed. While it
	 * would be possible in some circumstances to combine the final chunk
	 * of the extent (after splitting) with the remaining extents in the
	 * original request, it's not clear there's much benefit from the
	 * added complexity. Such behavior could be added in the future if
	 * it's determined to be worthwhile.
	 */
	while (remain > 0) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = remain;

		/*
		 * If we know we have at least one more segment left after
		 * the current iteration of this loop, split it so that
		 * the next segment starts on an aligned boundary.
		 */
		if (len > maxlen) {
			uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);

			/*
			 * If rounding down leaves nothing (the maximum
			 * segment length is smaller than the alignment),
			 * no progress can be made: 'remain' would never
			 * shrink and this loop would spin forever. Fail
			 * the request instead.
			 */
			if (end <= start) {
				return (SET_ERROR(EINVAL));
			}

			len = end - start;
		}

		ext->dfle_length = len;

		if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
			return (r);
		}

		ext->dfle_start += len;
		remain -= len;
	}

	return (0);
}
|