diff options
author | John Sonnenschein <johns@joyent.com> | 2012-05-07 05:39:48 +0000 |
---|---|---|
committer | John Sonnenschein <johns@joyent.com> | 2012-05-07 05:39:48 +0000 |
commit | f9b92b874c7e2b0a97203e8b3e370a82027fee42 (patch) | |
tree | 5b0afcf9f17c210b1248494d7039b477f403793f /usr/src/uts/common/fs/zfs/vdev_raidz.c | |
parent | 99ab767f0b40b10adde1dc9ceaf6bb39d2be5d69 (diff) | |
parent | 435bba8aa5d73ccd0b9ec7c79e28bec795904992 (diff) | |
download | illumos-joyent-f9b92b874c7e2b0a97203e8b3e370a82027fee42.tar.gz |
illumos sync
Diffstat (limited to 'usr/src/uts/common/fs/zfs/vdev_raidz.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_raidz.c | 131 |
1 files changed, 122 insertions, 9 deletions
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 4b0f5602c1..6094e01876 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -21,11 +21,15 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_file.h> +#include <sys/vdev_raidz.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> @@ -152,6 +156,8 @@ typedef struct raidz_map { VDEV_RAIDZ_64MUL_2((x), mask); \ } +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + /* * Force reconstruction to use the general purpose method. */ @@ -431,12 +437,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { }; static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; + uint64_t b = offset >> unit_shift; + uint64_t s = size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; @@ -506,7 +512,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + @@ -535,7 +541,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -547,8 +553,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -1491,6 +1495,104 @@ vdev_raidz_close(vdev_t *vd) vdev_close(vd->vdev_child[c]); } +/* + * Handle a read or write request to a RAID-Z dump device. + * + * Unlike the normal RAID-Z codepath in vdev_raidz_io_start(), reads and writes + * to the dump zvol are written across a full 128Kb block. As a result, an + * individual I/O may not span all columns in the RAID-Z map; moreover, a small + * I/O may only span a single column. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? B_READ : B_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); + + /* + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are the complete 128k + * size. + */ + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, this is, the offset of the extent which + * contains the requisite offset of the data being read or written. + */ + rm = vdev_raidz_map_alloc(data - (offset - origoffset), + SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z matrix, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data being read or written spans this + * column, issue the appropriate operation to the child vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed. + */ + if ((err = vdev_disk_physio(cvd, + ((char *)rc->rc_data) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags)) != 0) + break; + } + + vdev_raidz_map_free(rm); +#endif /* KERNEL */ + + return (err); +} + static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { @@ -1526,9 +1628,13 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { @@ -1659,6 +1765,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) int c, ret = 0; raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) |