diff options
author | jeanm <none@none> | 2006-05-06 10:47:51 -0700 |
---|---|---|
committer | jeanm <none@none> | 2006-05-06 10:47:51 -0700 |
commit | da83352438a4a62b87fcb6fd1583e3a70aa31bb8 (patch) | |
tree | 2ed9be63db473a2144198071f61b16c67f50df86 /usr | |
parent | 1e3549a6454dbbb2d27b0f1fdb707b1d24b7141b (diff) | |
download | illumos-gate-da83352438a4a62b87fcb6fd1583e3a70aa31bb8.tar.gz |
4964366 metaimport should handle partial disksets
Diffstat (limited to 'usr')
27 files changed, 3801 insertions, 967 deletions
diff --git a/usr/src/cmd/lvm/rpc.metad/metad_svc_subr.c b/usr/src/cmd/lvm/rpc.metad/metad_svc_subr.c index 92f6d1c490..0d92faf3e2 100644 --- a/usr/src/cmd/lvm/rpc.metad/metad_svc_subr.c +++ b/usr/src/cmd/lvm/rpc.metad/metad_svc_subr.c @@ -132,11 +132,13 @@ add_sideno_sidenm( */ if (MD_MNSET_DESC(sd)) { if (add_name(local_sp, sideno, local_key, - sn->dname, sn->mnum, sn->cname, ep) == -1) + sn->dname, sn->mnum, sn->cname, NULL, NULL, + ep) == -1) return (-1); } else { if (add_name(local_sp, sideno+SKEW, local_key, - sn->dname, sn->mnum, sn->cname, ep) == -1) + sn->dname, sn->mnum, sn->cname, NULL, NULL, + ep) == -1) return (-1); } } else @@ -597,7 +599,8 @@ add_sidenamelist( */ if (nodeid == sn->sideno) { if ((err = add_name(local_sp, sn->sideno, key, - sn->dname, sn->mnum, sn->cname, ep)) == -1) + sn->dname, sn->mnum, sn->cname, + NULL, NULL, ep)) == -1) return (-1); key = (mdkey_t)err; break; @@ -620,7 +623,8 @@ add_sidenamelist( if (sn->sideno != thisside) continue; if ((err = add_name(local_sp, sn->sideno+SKEW, key, - sn->dname, sn->mnum, sn->cname, ep)) == -1) + sn->dname, sn->mnum, sn->cname, NULL, + NULL, ep)) == -1) return (-1); key = (mdkey_t)err; break; @@ -635,7 +639,8 @@ add_sidenamelist( if (sn->sideno == thisside) continue; if ((err = add_name(local_sp, sn->sideno+SKEW, key, - sn->dname, sn->mnum, sn->cname, ep)) == -1) + sn->dname, sn->mnum, sn->cname, NULL, NULL, + ep)) == -1) return (-1); key = (mdkey_t)err; } @@ -647,7 +652,8 @@ add_sidenamelist( sn = dn->side_names; if (sn) { if ((err = add_name(local_sp, sn->sideno, key, - sn->dname, sn->mnum, sn->cname, ep)) == -1) + sn->dname, sn->mnum, sn->cname, + NULL, NULL, ep)) == -1) return (-1); key = (mdkey_t)err; } @@ -658,6 +664,139 @@ add_sidenamelist( return (0); } +/* + * imp_adddrvs + * This is a version of adddrvs that is specific to the + * metaimport command. 
Due to the unavailability of some disks, + * information needs to be obtained about the disk from the devid so + * it can eventually be passed down to add_sidenamelist. + * Go ahead and set drive state to MD_DR_OK here so that no + * later RPC is needed to set OK where UNRSLV_REPLICATED could + * be cleared. Set record is still set to MD_SR_ADD which will force + * a cleanup of the set in case of panic. + */ +void +imp_adddrvs( + char *setname, + md_drive_desc *dd, + md_timeval32_t timestamp, + ulong_t genid, + md_error_t *ep +) +{ + mddb_userreq_t req; + md_drive_record *dr, *tdr; + md_set_record *sr; + md_drive_desc *p; + mddrivename_t *dn; + mdname_t *np; + md_dev64_t dev; + md_error_t xep = mdnullerror; + char *minorname = NULL; + ddi_devid_t devidp = NULL; + mdsidenames_t *sn; + mdsetname_t *local_sp; + + + if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { + return; + } + + if ((sr = getsetbyname(setname, ep)) == NULL) + return; + + for (p = dd; p != NULL; p = p->dd_next) { + uint_t rep_slice; + int ret = 0; + + dn = p->dd_dnp; + + /* + * We need the minorname and devid string decoded from the + * devid to add the sidename for this drive to the + * local set. 
+ */ + ret = devid_str_decode(dn->devid, &devidp, &minorname); + if (ret != 0) { + /* failed to decode the devid */ + goto out; + } + + sn = dn->side_names; + if (sn == NULL) { + dn->side_names_key = MD_KEYWILD; + continue; + } + + if ((dn->side_names_key = add_name(local_sp, SKEW, MD_KEYWILD, + sn->dname, sn->mnum, sn->cname, minorname, devidp, + ep)) == -1) { + devid_free(devidp); + devid_str_free(minorname); + goto out; + } + + devid_free(devidp); + devid_str_free(minorname); + + /* Create the drive record */ + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_DR(MD_DB_CREATE, 0); + req.ur_size = sizeof (*dr); + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + goto out; + } + + /* Fill in the drive record values */ + dr = Zalloc(sizeof (*dr)); + dr->dr_selfid = req.ur_recid; + dr->dr_dbcnt = p->dd_dbcnt; + dr->dr_dbsize = p->dd_dbsize; + dr->dr_key = dn->side_names_key; + + dr->dr_ctime = timestamp; + dr->dr_genid = genid; + dr->dr_revision = MD_DRIVE_RECORD_REVISION; + dr->dr_flags = MD_DR_OK; + if (p->dd_flags & MD_DR_UNRSLV_REPLICATED) { + dr->dr_flags |= MD_DR_UNRSLV_REPLICATED; + sr->sr_flags |= MD_SR_UNRSLV_REPLICATED; + } + + /* Link the drive records and fill in in-core data */ + dr_cache_add(sr, dr); + + dev = NODEV64; + if ((meta_replicaslice(dn, &rep_slice, &xep) == 0) && + ((np = metaslicename(dn, rep_slice, &xep)) != NULL)) + dev = np->dev; + else + mdclrerror(&xep); + + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_DRIVE, + MD_LOCAL_SET, dev); + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_DRIVE, + sr->sr_setno, dev); + } + + /* Commit all the records atomically */ + commitset(sr, TRUE, ep); + free_sr(sr); + return; + +out: + /* If failures, remove drive records. 
*/ + dr = tdr = sr->sr_drivechain; + while (dr != NULL) { + tdr = dr->dr_next; + if (del_name(local_sp, 0, dr->dr_key, &xep)) + mdclrerror(&xep); + sr_del_drv(sr, dr->dr_selfid); + dr = tdr; + } +} + static void adddrvs( char *setname, @@ -836,6 +975,51 @@ mdrpc_adddrvs_2_svc( } } +/* + * add 1 or more drive records to a set when importing. + */ +bool_t +mdrpc_imp_adddrvs_2_svc( + mdrpc_drives_2_args *args, + mdrpc_generic_res *res, + struct svc_req *rqstp /* RPC stuff */ +) +{ + mdrpc_drives_2_args_r1 *v2_args; + md_error_t *ep = &res->status; + int err; + int op_mode = W_OK; + + switch (args->rev) { + case MD_METAD_ARGS_REV_1: + v2_args = &args->mdrpc_drives_2_args_u.rev1; + if (v2_args == NULL) { + return (FALSE); + } + break; + default: + return (FALSE); + } + + /* setup, check permissions */ + (void) memset(res, 0, sizeof (*res)); + if ((err = svc_init(rqstp, op_mode, ep)) < 0) + return (FALSE); + else if (err != 0) + return (TRUE); + + if (check_set_lock(op_mode, v2_args->cl_sk, ep)) + return (TRUE); + + /* doit */ + imp_adddrvs(v2_args->sp->setname, v2_args->drivedescs, + v2_args->timestamp, v2_args->genid, ep); + + err = svc_fini(ep); + + return (TRUE); +} + static void addhosts( char *setname, diff --git a/usr/src/cmd/lvm/util/metaimport.c b/usr/src/cmd/lvm/util/metaimport.c index 0ec9adfeb1..231be20eed 100644 --- a/usr/src/cmd/lvm/util/metaimport.c +++ b/usr/src/cmd/lvm/util/metaimport.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,7 +38,7 @@ #include <sys/lvm/md_names.h> #include <sdssc.h> -static md_im_drive_info_t *overlap_disks = NULL; +static md_im_drive_info_t *overlap_disks; static void usage(mdsetname_t *sp, char *string) @@ -79,51 +78,123 @@ print_version(mdsetname_t *sp) static int set_disk_overlap(md_im_set_desc_t *misp) { - - md_im_set_desc_t *next, *isp = misp; - md_im_drive_info_t *set_dr, *next_set_dr, **chain; - int is_overlap = 0; - + md_im_set_desc_t *next, *isp = misp; + md_im_drive_info_t *set_dr, *next_set_dr, **chain; + int is_overlap = 0; + md_im_drive_info_t *good_disk = NULL; + md_im_drive_info_t *d; + md_timeval32_t gooddisktime; + int disk_not_available = 0; + /* + * There are 2 ways we could get an "overlap" disk. + * One is if the ctd's are the same. The other is if + * the setcreatetimestamp on the disk doesn't agree with the + * "good" disk in the set. However, if we have a disk that is + * unavailable and the other instance of the ctd is available we + * really don't have a conflict. It's just that the unavailable ctd + * is its "old" location and the available instance is a current + * location. 
+ */ for (; isp != NULL; isp = isp->mis_next) { for (next = isp->mis_next; next != NULL; next = next->mis_next) { - for (set_dr = isp->mis_drives; set_dr != NULL; - set_dr = set_dr->mid_next) { - - for (next_set_dr = next->mis_drives; - next_set_dr != NULL; - next_set_dr = next_set_dr->mid_next) { - if (strcmp(set_dr->mid_dnp->cname, - next_set_dr->mid_dnp->cname) == 0) { + set_dr = set_dr->mid_next) { + if (set_dr->mid_available == MD_IM_DISK_NOT_AVAILABLE) + disk_not_available = 1; + else + disk_not_available = 0; + for (next_set_dr = next->mis_drives; next_set_dr != NULL; + next_set_dr = next_set_dr->mid_next) { + if (disk_not_available && + (next_set_dr->mid_available + == MD_IM_DISK_AVAILABLE)) + continue; + else if (!disk_not_available && + (next_set_dr->mid_available == + MD_IM_DISK_NOT_AVAILABLE)) + continue; + if (strcmp(set_dr->mid_dnp->cname, + next_set_dr->mid_dnp->cname) == 0) { /* - * Chain it, skip if already there + * Chain it, skip if + * already there */ if (overlap_disks == NULL) { set_dr->overlap = NULL; + set_dr->overlapped_disk = 1; + next_set_dr->overlapped_disk = 1; overlap_disks = set_dr; } else { for (chain = &overlap_disks; *chain != NULL; chain = &(*chain)->overlap) { if (strcmp(set_dr->mid_dnp->cname, - (*chain)->mid_dnp->cname) - == 0) + (*chain)->mid_dnp->cname) == 0) break; } if (*chain == NULL) { *chain = set_dr; set_dr->overlap = NULL; + set_dr->overlapped_disk = 1; + next_set_dr->overlapped_disk = 1; } } if (!is_overlap) is_overlap = 1; - } } + } } } } + for (isp = misp; isp != NULL; isp = isp->mis_next) { + good_disk = pick_good_disk(isp); + if (good_disk == NULL) { + /* didn't find a good disk */ + continue; + } + gooddisktime = good_disk->mid_setcreatetimestamp; + for (d = isp->mis_drives; d != NULL; d = d->mid_next) { + if (d->mid_available == MD_IM_DISK_NOT_AVAILABLE) + continue; + /* + * If the disk doesn't have the same set creation + * time as the designated "good disk" we have a + * time conflict/overlap situation. 
Mark the disk + * as such. + */ + if ((gooddisktime.tv_usec != + d->mid_setcreatetimestamp.tv_usec) || + (gooddisktime.tv_sec != + d->mid_setcreatetimestamp.tv_sec)) { + d->overlapped_disk = 1; + if (overlap_disks == NULL) { + d->overlap = NULL; + d->overlapped_disk = 1; + overlap_disks = d; + } else { + for (chain = &overlap_disks; + *chain != NULL; + chain = &(*chain)->overlap) { + if (strcmp(d->mid_dnp->cname, + (*chain)->mid_dnp->cname) + == 0) { + break; + } + } + + if (*chain == NULL) { + *chain = d; + d->overlap = NULL; + d->overlapped_disk = 1; + } + } + if (!is_overlap) + is_overlap = 1; + } + } + } return (is_overlap); } @@ -155,6 +226,19 @@ report_overlap_recommendation() uint_t sliceno; int fd = -1; + /* + * If the disk isn't available (i.e. powered off or dead) + * we can't read the master block timestamp and thus + * cannot make a recommendation as to which set it belongs to. + */ + if (d->mid_available != MD_IM_DISK_AVAILABLE) { + (void) fprintf(stdout, " %s ", d->mid_dnp->cname); + (void) fprintf(stdout, + gettext(" - no recommendation can " + "be made because disk is unavailable\n")); + continue; + } + if (meta_replicaslice(d->mid_dnp, &sliceno, ep) != 0) continue; @@ -173,13 +257,174 @@ report_overlap_recommendation() (void) close(fd); fprintf(stdout, " %s ", d->mid_dnp->cname); (void) fprintf(stdout, "%s: %s\n", - gettext(" - recommend importing with set " + gettext(" - must import with set " "created at "), meta_print_time((md_timeval32_t *) (&(mbp->mb_setcreatetime)))); } Free(mbp); } +/* + * is_first_disk is called to determine if the disk passed to it is + * eligible to be used as the "first disk time" in the set. It checks to + * see if the disk is available, on the skip list or not (thus already in + * an importable set) or being used by the system already. + * RETURN: + * 1 The time can be used as the first disk time + * 0 The time should not be used. 
+ */ +static int +is_first_disk( +md_im_drive_info_t *d, +mddrivenamelist_t **skiph) +{ + mddrivenamelist_t *slp; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); + + /* + * If a disk is not available there is no + * set creation timestamp available. + */ + if (d->mid_available == MD_IM_DISK_AVAILABLE) { + /* + * We also need to make sure this disk isn't already on + * the skip list. + */ + for (slp = *skiph; slp != NULL; slp = slp->next) { + if (d->mid_dnp == slp->drivenamep) + return (0); + } + /* + * And we need to make sure the drive isn't + * currently being used for something else + * like a mounted file system or a current + * metadevice or in a set. + */ + if (meta_imp_drvused(sp, d->mid_dnp, ep)) { + return (0); + } + } else { + return (0); + } + return (1); +} + +/* + * Input a list of disks (dnlp), find the sets that are importable, create + * a list of these sets (mispp), and a list of the disks within each of these + * sets (midp). These lists (mispp and midp) will be used by metaimport. + */ +static int process_disks( + mddrivenamelist_t *dnlp, + mddrivenamelist_t **skipt, + md_im_set_desc_t **mispp, + int flags, + int *set_count, + int overlap, + md_error_t *ep +) +{ + mddrivenamelist_t *dp; + int rscount = 0; + int hasreplica; + md_im_set_desc_t *p; + md_im_drive_info_t *d; + mddrivenamelist_t **skiph = skipt; + + /* Scan qualified disks */ + for (dp = dnlp; dp != NULL; dp = dp->next) { + mddrivenamelist_t *slp; + + /* is the current drive on the skip list? */ + for (slp = *skiph; slp != NULL; slp = slp->next) { + if (dp->drivenamep == slp->drivenamep) + break; + } + /* drive on the skip list ? */ + if (slp != NULL) + continue; + + /* + * In addition to updating the misp list, either verbose or + * standard output will be generated. 
+ * + */ + hasreplica = meta_get_and_report_set_info(dp, mispp, 0, + flags, set_count, overlap, overlap_disks, ep); + + if (hasreplica < 0) { + mde_perror(ep, ""); + mdclrerror(ep); + } else { + + rscount += hasreplica; + + /* Eliminate duplicate reporting */ + if (hasreplica > 0) { + md_timeval32_t firstdisktime; + + /* + * Go to the tail for the current set + */ + for (p = *mispp; p->mis_next != NULL; + p = p->mis_next); + + /* + * Now look for the set creation timestamp. + * If a disk is not available there is no + * set creation timestamp available so look + * for the first available disk to grab this + * information from. We also need to make + * sure this disk isn't already on the skip + * list. If so go to the next available drive. + * And we need to make sure the drive isn't + * currently being used for something else + * like a mounted file system or a current + * metadevice or in a set. + */ + for (d = p->mis_drives; d != NULL; + d = d->mid_next) { + if (is_first_disk(d, skiph)) { + firstdisktime = + d->mid_setcreatetimestamp; + break; + } + } + for (d = p->mis_drives; d != NULL; + d = d->mid_next) { + /* + * if the mb_setcreatetime for a disk + * is not the same as the first disk + * in the set, don't put it on the + * skip list. This disk probably + * doesn't really belong in this set + * and we'll want to look at it again + * to figure out where it does belong. + * If the disk isn't available, there's + * really no point in looking at it + * again so put it on the skip list. + */ + if (d->mid_available == + MD_IM_DISK_AVAILABLE) { + if ((d->mid_setcreatetimestamp. + tv_sec != firstdisktime. + tv_sec) || + (d->mid_setcreatetimestamp. 
+ tv_usec != + firstdisktime.tv_usec)) + continue; + } + skipt = + meta_drivenamelist_append_wrapper( + skipt, d->mid_dnp); + } + } + } + } + return (rscount); +} int main(int argc, char *argv[]) @@ -197,18 +442,18 @@ main(int argc, char *argv[]) mddrivenamelist_t *dnlp = NULL; mddrivenamelist_t *dp; mddrivenamelist_t *skiph = NULL; - mddrivenamelist_t **skipt = &skiph; int rscount = 0; - int hasreplica; + md_im_set_desc_t *pass1_misp = NULL; md_im_set_desc_t *misp = NULL; + md_im_set_desc_t **pass1_mispp = &pass1_misp; md_im_set_desc_t **mispp = &misp; mhd_mhiargs_t mhiargs = defmhiargs; int have_multiple_sets = 0; int force = 0; int overlap = 0; - int partial = 0; uint_t imp_flags = 0; int set_count = 0; + int no_quorum = 0; /* * Get the locale set up before calling any other routines @@ -374,7 +619,8 @@ main(int argc, char *argv[]) char *dlist; int sizecnt = 0; - sizecnt += strlen(ip->drive); + /* add 1 for null terminator */ + sizecnt += strlen(ip->drive) + 1; for (dp = dnlp->next; dp != NULL; dp = dp->next) { sizecnt += 2; /* for the ", " */ sizecnt += strlen(dp->drivenamep->cname); @@ -383,15 +629,14 @@ main(int argc, char *argv[]) dlist = Malloc(sizecnt); strlcpy(dlist, ip->drive, sizecnt); - Free(ip->drive); - dlist += strlen(ip->drive); + Free(ip->drive); for (dp = dnlp->next; dp != NULL; dp = dp->next) { strlcat(dlist, ", ", sizecnt); strlcat(dlist, dp->drivenamep->cname, sizecnt); } - ip->drive = Strdup(dlist); + ip->drive = dlist; } /* Don't continue if we're already hosed */ @@ -406,96 +651,26 @@ main(int argc, char *argv[]) md_exit(sp, 0); } - /* Scan qualified disks */ - for (dp = dnlp; dp != NULL; dp = dp->next) { - mddrivenamelist_t *slp; - - /* is the current drive on the skip list? */ - for (slp = skiph; slp != NULL; slp = slp->next) { - if (dp->drivenamep == slp->drivenamep) - goto skipdisk; - } - - /* - * In addition to updating the misp list, either verbose or - * standard output will be generated. 
- * - */ - hasreplica = meta_get_and_report_set_info(dp, mispp, 0, - imp_flags, &set_count, ep); - - /* - * If current disk is part of a partial diskset, - * meta_get_set_info returns an ENOTSUP for this disk. - * Import of partial disksets isn't supported yet, - * so do NOT put this disk onto any list being set up - * by metaimport. The partial diskset error message will - * only be printed once when the first partial diskset is - * detected. If the user is actually trying to import the - * partial diskset, print the error and exit; otherwise, - * print the error and continue. - */ - if (hasreplica == ENOTSUP) { - if (report_only) { - if (!partial) { - mde_perror(ep, ""); - partial = 1; - } - mdclrerror(ep); - goto skipdisk; - } else { - mde_perror(ep, ""); - md_exit(sp, 1); - } - } - - if (hasreplica < 0) { - mde_perror(ep, ""); - mdclrerror(ep); - } else { - md_im_set_desc_t *p; - md_im_drive_info_t *d; - - rscount += hasreplica; + /* + * META_IMP_PASS1 means gather the info, but don't report. + */ + (void) process_disks(dnlp, &skiph, pass1_mispp, + imp_flags | META_IMP_PASS1, &set_count, overlap, ep); - /* Eliminate duplicate reporting */ - if (hasreplica > 0) { - md_timeval32_t firstdisktime; + overlap_disks = NULL; + overlap = set_disk_overlap(pass1_misp); + skiph = NULL; - /* - * Go to the tail for the current set - */ - for (p = misp; p->mis_next != NULL; - p = p->mis_next); - firstdisktime = - p->mis_drives->mid_setcreatetimestamp; - for (d = p->mis_drives; - d != NULL; - d = d->mid_next) { - /* - * if the mb_setcreatetime for a disk - * is not the same as the first disk - * in the set, don't put it on the - * skip list. This disk probably - * doesn't really belong in this set - * and we'll want to look at it again - * to figure out where it does belong. 
- */ - if ((d->mid_setcreatetimestamp.tv_sec != - firstdisktime.tv_sec) || - (d->mid_setcreatetimestamp.tv_usec - != firstdisktime.tv_usec)) - continue; - skipt = - meta_drivenamelist_append_wrapper( - skipt, d->mid_dnp); - } - } - } - -skipdisk: - ; - } + /* + * This time call without META_IMP_PASS1 set and we gather + * and report the information. + * We need to do this twice because of the overlap detection. + * The first pass generates a list of disks to detect overlap on. + * We then do a second pass using that overlap list to generate + * the report. + */ + rscount = process_disks(dnlp, &skiph, mispp, imp_flags, &set_count, + overlap, ep); /* * Now have entire list of disks associated with diskset including @@ -508,15 +683,52 @@ skipdisk: md_im_drive_info_t *d; mddrivename_t *dnp; + if (sp == NULL) { + /* Get sp for local set */ + if ((sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { + mde_perror(ep, ""); + meta_free_im_set_desc(misp); + md_exit(sp, 1); + } + } + for (p = misp; p != NULL; p = p->mis_next) { for (d = p->mis_drives; d != NULL; d = d->mid_next) { dnp = d->mid_dnp; - if (meta_imp_drvused(sp, dnp, ep)) { - (void) mddserror(ep, - MDE_DS_DRIVEINUSE, 0, NULL, - dnp->cname, NULL); - mde_perror(ep, ""); - md_exit(sp, 0); + if (d->mid_available == MD_IM_DISK_AVAILABLE) { + if (meta_imp_drvused(sp, dnp, ep)) { + (void) mddserror(ep, + MDE_DS_DRIVEINUSE, 0, NULL, + dnp->cname, NULL); + mde_perror(ep, ""); + meta_free_im_set_desc(misp); + md_exit(sp, 1); + } + } else { + /* + * If drive is unavailable, then check + * that this drive hasn't already been + * imported as part of another partial + * diskset. Check by devid instead of + * cname since the unavailable drive + * would have the cname from its + * previous system and this may collide + * with a valid cname on this system. + * Fail if devid is found in another + * set or if the routine fails. 
+ */ + mdsetname_t *tmp_sp = NULL; + + if ((meta_is_devid_in_anyset( + d->mid_devid, &tmp_sp, ep) == -1) || + (tmp_sp != NULL)) { + (void) mddserror(ep, + MDE_DS_DRIVEINUSE, 0, NULL, + dnp->cname, NULL); + mde_perror(ep, ""); + meta_free_im_set_desc(misp); + md_exit(sp, 1); + } } } } @@ -531,9 +743,11 @@ skipdisk: * If we've found partial disksets but no complete disksets, * we don't want this to print. */ - if (!partial) { + if (!misp) { md_eprintf("%s\n", gettext("no unconfigured sets " "detected")); + meta_free_im_set_desc(misp); + md_exit(sp, 1); } md_exit(sp, 0); } @@ -566,17 +780,15 @@ skipdisk: gettext("Number of disksets eligible for import"), set_count); } + } + if (overlap) { + report_overlap_recommendation(); + } - overlap = set_disk_overlap(misp); - if (overlap) { - report_overlap_recommendation(); - } - - if (!report_only) { - md_eprintf("%s\n\n", gettext("multiple unconfigured " - "sets detected.\nRerun the command with the " - "suggested options for the desired set.")); - } + if (have_multiple_sets && !report_only) { + md_eprintf("%s\n\n", gettext("multiple unconfigured " + "sets detected.\nRerun the command with the " + "suggested options for the desired set.")); } @@ -586,8 +798,29 @@ skipdisk: */ if (report_only) { + meta_free_im_set_desc(misp); md_exit(sp, 0); } else if (have_multiple_sets) { + meta_free_im_set_desc(misp); + md_exit(sp, 1); + } else if (overlap) { + md_im_drive_info_t *d; + /* + * The only way we can get here is if we're doing an import + * request on a set that contains at least one disk with + * a time conflict. We are prohibiting the importation of + * this type of set until the offending disk(s) are turned + * off to prevent data corruption. 
+ */ + printf(gettext("To import this set, ")); + for (d = pass1_misp->mis_drives; + d != NULL; + d = d->mid_next) { + if (d->overlapped_disk) + printf("%s ", d->mid_dnp->cname); + } + printf(gettext("must be removed from the system\n")); + meta_free_im_set_desc(misp); md_exit(sp, 1); } @@ -595,32 +828,50 @@ skipdisk: usage(sp, gettext("You must specify a new set name.")); } + /* + * The user must specify the -f (force) flag if the following + * conditions exist: + * - partial diskset + * - stale diskset + */ + if (meta_replica_quorum(misp) != 0) + no_quorum = 1; + if (misp->mis_partial || no_quorum) { + if (!force) + usage(sp, gettext("You must specify the force flag")); + } (void) meta_imp_set(misp, setname_new, force, dry_run, ep); - if (dry_run) { + meta_free_im_set_desc(misp); md_exit(sp, 0); } if (!mdisok(ep)) { + meta_free_im_set_desc(misp); mde_perror(ep, ""); md_exit(sp, 1); } if ((sp = metasetname(setname_new, ep)) == NULL) { + meta_free_im_set_desc(misp); mde_perror(ep, ""); md_exit(sp, 1); } if (meta_lock_nowait(sp, ep) != 0) { + meta_free_im_set_desc(misp); mde_perror(ep, ""); md_exit(sp, 10); /* special errcode */ } - if (meta_set_take(sp, &mhiargs, 0, 0, &status)) { + if (meta_set_take(sp, &mhiargs, (misp->mis_partial | TAKE_IMP), + 0, &status)) { + meta_free_im_set_desc(misp); mde_perror(&status, ""); md_exit(sp, 1); } + meta_free_im_set_desc(misp); md_exit(sp, 0); /*NOTREACHED*/ return (0); diff --git a/usr/src/cmd/lvm/util/metaset.c b/usr/src/cmd/lvm/util/metaset.c index 59c803d2f3..953554e3c5 100644 --- a/usr/src/cmd/lvm/util/metaset.c +++ b/usr/src/cmd/lvm/util/metaset.c @@ -1510,6 +1510,7 @@ parse_takeset(int argc, char **argv) sdssc_boolean_e cluster_take = SDSSC_False; sdssc_version_t vers; rval_e rval; + int set_take_rval; /* reset and parse args */ optind = 1; @@ -1646,7 +1647,30 @@ parse_takeset(int argc, char **argv) md_exit(sp, 10); /* special errcode */ } - if (meta_set_take(sp, &mhiargs, flags, usetag, &status)) { + /* + * If a 2 is 
returned from meta_set_take, this take was able to resolve + * an unresolved replicated disk (i.e. a disk is now available that + * had been missing during the import of the replicated diskset). + * Need to release the diskset and re-take in order to have + * the subdrivers re-snarf using the newly resolved (or newly mapped) + * devids. This also allows the namespace to be updated with the + * correct major names in the case where the disk being replicated + * was handled by a different driver than the replicated disk. + */ + set_take_rval = meta_set_take(sp, &mhiargs, flags, usetag, &status); + if (set_take_rval == 2) { + if (meta_set_release(sp, &status)) { + mde_perror(&status, + "Need to release and take set to resolve names."); + md_exit(sp, 1); + } + metaflushdrivenames(); + metaflushsetname(sp); + set_take_rval = meta_set_take(sp, &mhiargs, + (flags | TAKE_RETAKE), usetag, &status); + } + + if (set_take_rval == -1) { mde_perror(&status, ""); if (mdismddberror(&status, MDE_DB_TAGDATA)) md_exit(sp, 2); diff --git a/usr/src/head/meta.h b/usr/src/head/meta.h index ada550ed07..dc4cd38691 100644 --- a/usr/src/head/meta.h +++ b/usr/src/head/meta.h @@ -404,6 +404,8 @@ typedef struct md_mn_msg_tbl_entry { #define TAKE_FORCE 0x0001 #define TAKE_USETAG 0x0002 #define TAKE_USEIT 0x0004 +#define TAKE_IMP 0x0008 +#define TAKE_RETAKE 0x0010 /* * ignore gettext for lint so we check printf args @@ -595,6 +597,62 @@ typedef struct md_evlist { /* end of meta event definitions ("meta_notify.h") */ +typedef struct md_im_names { + int min_count; + char **min_names; +} md_im_names_t; + +/* Values for replica info status */ +#define MD_IM_REPLICA_SCANNED (0x01) +#define MD_IM_REPLICA_VALID (0x02) + +typedef struct md_im_replica_info { + struct md_im_replica_info *mir_next; + int mir_status; + int mir_flags; + daddr32_t mir_offset; + daddr32_t mir_length; + md_timeval32_t mir_timestamp; +} md_im_replica_info_t; + +typedef struct md_im_drive_info { + struct md_im_drive_info 
*mid_next; /* next drive in this set */ + mddrivename_t *mid_dnp; + void *mid_devid; + void *mid_o_devid; + int mid_devid_sz; + int mid_o_devid_sz; + char mid_minor_name[MDDB_MINOR_NAME_MAX]; + minor_t mid_mnum; + int mid_available; + md_timeval32_t mid_setcreatetimestamp; + char *mid_driver_name; + char *mid_devname; + md_im_replica_info_t *mid_replicas; + int overlapped_disk; + struct md_im_drive_info *overlap; /* chain of overlap disks */ +} md_im_drive_info_t; + +/* Values for mid_available */ +#define MD_IM_DISK_AVAILABLE 0x00 +#define MD_IM_DISK_NOT_AVAILABLE 0x01 + +/* Values for set descriptor flags */ +#define MD_IM_SET_INVALID 0x10 +#define MD_IM_SET_REPLICATED 0x20 + +/* Values for mis_partial */ +#define MD_IM_COMPLETE_DISKSET 0x04 +#define MD_IM_PARTIAL_DISKSET 0x08 + +typedef struct md_im_set_desc { + struct md_im_set_desc *mis_next; + int mis_flags; + int mis_oldsetno; + md_im_drive_info_t *mis_drives; + int mis_active_replicas; + int mis_partial; +} md_im_set_desc_t; /* meta_admin.c */ extern int open_admin(md_error_t *ep); @@ -1120,12 +1178,15 @@ extern mdsetname_t *metasetnosetname(set_t setno, md_error_t *ep); extern mdsetname_t *metafakesetname(set_t setno, char *sname); extern md_set_desc *metaget_setdesc(mdsetname_t *sp, md_error_t *ep); extern void metaflushsetname(mdsetname_t *sp); +extern void metaflushdrivenames(void); extern int metaislocalset(mdsetname_t *sp); extern int metaissameset(mdsetname_t *sp1, mdsetname_t *sp2); extern void metaflushsidenames(mddrivename_t *dnp); extern char *metadiskname(char *name); extern mddrivename_t *metadrivename(mdsetname_t **spp, char *uname, md_error_t *ep); +extern mddrivename_t *metadrivenamebydevid(mdsetname_t **spp, char *devid, + char *uname, md_error_t *ep); extern mdname_t *metaslicename(mddrivename_t *dnp, uint_t sliceno, md_error_t *ep); extern void metafreedrivename(mddrivename_t *dnp); @@ -1181,6 +1242,9 @@ extern int meta_get_hotspare_names(mdsetname_t *sp, mdnamelist_t **nlpp, int options, 
md_error_t *ep); extern void meta_create_non_dup_list(mdname_t *mdnp, mddevid_t **ldevidpp); +extern mddrivename_t *meta_getdnp_bydevid(mdsetname_t *sp, side_t sideno, + ddi_devid_t devidp, mdkey_t key, md_error_t *ep); + /* meta_nameinfo.c */ extern mdsetname_t *metagetset(mdname_t *np, int bypass_daemon, @@ -1233,7 +1297,7 @@ extern int meta_setdid(set_t setno, side_t sideno, mdkey_t key, md_error_t *ep); extern int add_name(mdsetname_t *sp, side_t sideno, mdkey_t key, char *dname, minor_t mnum, char *bname, - md_error_t *ep); + char *minorname, ddi_devid_t devid, md_error_t *ep); extern int del_name(mdsetname_t *sp, side_t sideno, mdkey_t key, md_error_t *ep); extern int add_key_name(mdsetname_t *sp, mdname_t *np, @@ -1391,6 +1455,10 @@ extern int meta_is_drive_in_anyset(mddrivename_t *dnp, extern int meta_is_drive_in_thisset(mdsetname_t *sp, mddrivename_t *dnp, int bypass_daemon, md_error_t *ep); +extern int meta_is_devid_in_anyset(void *devid, + mdsetname_t **spp, md_error_t *ep); +extern int meta_is_devid_in_thisset(mdsetname_t *sp, + void *devid, md_error_t *ep); extern int meta_set_balance(mdsetname_t *sp, md_error_t *ep); extern int meta_set_destroy(mdsetname_t *sp, int lock_set, md_error_t *ep); @@ -1428,7 +1496,8 @@ extern int meta_devid_use(md_error_t *ep); /* meta_set_drv.c */ extern int meta_make_sidenmlist(mdsetname_t *, - mddrivename_t *, md_error_t *); + mddrivename_t *, int imp_flag, + md_im_drive_info_t *midp, md_error_t *); extern int meta_set_adddrives(mdsetname_t *sp, mddrivenamelist_t *dnlp, daddr_t dbsize, int force_label, md_error_t *ep); @@ -1763,49 +1832,18 @@ extern int read_database_block(md_error_t *, int, mddb_mb_t *, int, void *, int); extern daddr_t getphysblk(mddb_block_t, mddb_mb_t *); -typedef struct md_im_names { - int min_count; - char **min_names; -} md_im_names_t; - -/* Values for replica info status */ -#define MD_IM_REPLICA_SCANNED (0x01) -#define MD_IM_REPLICA_VALID (0x02) +extern md_im_drive_info_t 
*pick_good_disk(md_im_set_desc_t *misp); -typedef struct md_im_replica_info { - struct md_im_replica_info *mir_next; - int mir_status; - int mir_flags; - daddr32_t mir_offset; - daddr32_t mir_length; - md_timeval32_t mir_timestamp; -} md_im_replica_info_t; - -typedef struct md_im_drive_info { - struct md_im_drive_info *mid_next; /* next drive in this set */ - mddrivename_t *mid_dnp; - void *mid_devid; - void *mid_o_devid; - int mid_devid_sz; - int mid_o_devid_sz; - char mid_minor_name[MDDB_MINOR_NAME_MAX]; - md_timeval32_t mid_setcreatetimestamp; - char *mid_devname; - md_im_replica_info_t *mid_replicas; - struct md_im_drive_info *overlap; /* chain of overlap disks */ -} md_im_drive_info_t; - -/* Values for set descriptor flags */ -#define MD_IM_SET_INVALID 0x01 -#define MD_IM_SET_REPLICATED 0x02 - -typedef struct md_im_set_desc { - struct md_im_set_desc *mis_next; - int mis_flags; - int mis_oldsetno; - md_im_drive_info_t *mis_drives; - int mis_active_replicas; -} md_im_set_desc_t; +extern void meta_unrslv_replicated_mb(mdsetname_t *sp, + md_drive_desc *dd, mddrivenamelist_t *dnlp, + md_error_t *ep); +extern void meta_unrslv_replicated_nm(mdsetname_t *sp, + md_drive_desc *dd, mddrivenamelist_t *dnlp, + md_error_t *ep); +extern void * replicated_list_lookup(uint_t devid_len, + void *old_devid); +extern int build_replicated_disks_list(md_error_t *ep, + mddrivenamelist_t *dnlp); /* * pnm_rec is used to store the mapping from keys in the NM namespace @@ -1831,18 +1869,29 @@ typedef struct pnm_rec { /* Flags for metaimport reporting */ #define META_IMP_REPORT 0x0001 #define META_IMP_VERBOSE 0x0002 +#define META_IMP_PASS1 0x1000 extern int meta_list_disks(md_error_t *, md_im_names_t *); extern mddrivenamelist_t *meta_prune_cnames(md_error_t *, md_im_names_t *, int); extern int meta_get_and_report_set_info( mddrivenamelist_t *, md_im_set_desc_t **, - int, uint_t, int *, md_error_t *); + int, uint_t, int *, int, + md_im_drive_info_t *, md_error_t *); extern void 
free_pnm_rec_list(pnm_rec_t **); extern int meta_imp_set(md_im_set_desc_t *, char *, int, bool_t, md_error_t *); extern int meta_imp_drvused(mdsetname_t *sp, mddrivename_t *dnp, md_error_t *ep); +extern int meta_replica_quorum(md_im_set_desc_t *misp); +extern int meta_imp_set_adddrives(mdsetname_t *sp, + mddrivenamelist_t *dnlp, + md_im_set_desc_t *misp, md_error_t *ep); +extern void meta_free_im_set_desc(md_im_set_desc_t *misp); +extern int clnt_imp_adddrvs(char *hostname, + mdsetname_t *sp, md_drive_desc *dd, + md_timeval32_t timestamp, + ulong_t genid, md_error_t *ep); /* Flags for direction in copy_msg_1 */ #define MD_MN_COPY_TO_ONDISK 0x0001 @@ -1866,9 +1915,6 @@ extern int meta_write_nodelist(int nodecnt, char **nids, md_error_t *ep); extern void meta_free_nodelist(mndiskset_membershiplist_t *nl); -/* Values for set descriptor flags */ -#define MD_IM_SET_INVALID 0x01 - /* meta_mn_subr.c */ /* defines for flags argument for meta_mn_send_command() */ #define MD_DISP_STDERR 0x0000 diff --git a/usr/src/head/metad.x b/usr/src/head/metad.x index 6754f4b121..6aa42b637b 100644 --- a/usr/src/head/metad.x +++ b/usr/src/head/metad.x @@ -1,5 +1,5 @@ %/* -% * Copyright 2005 Sun Microsystems, Inc. All rights reserved. +% * Copyright 2006 Sun Microsystems, Inc. All rights reserved. % * Use is subject to license terms. % * % * CDDL HEADER START @@ -965,6 +965,9 @@ program METAD { mdrpc_generic_res mdrpc_mn_sp_update_abr(mdrpc_setno_2_args) = 43; + mdrpc_generic_res + mdrpc_imp_adddrvs(mdrpc_drives_2_args) = 44; + } = 2; } = 100229; diff --git a/usr/src/lib/lvm/libmeta/common/meta_devadm.c b/usr/src/lib/lvm/libmeta/common/meta_devadm.c index 975c87e4f3..8668c6eb6e 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_devadm.c +++ b/usr/src/lib/lvm/libmeta/common/meta_devadm.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -626,6 +625,8 @@ pathname_reload( /* metadevices do not have devid's in them */ mda_debug("pathname_reload: no devid for %s\n", (char *)(uintptr_t)nm.devname); + /* Clear error if no devid and go to next nm entry */ + mdclrerror(ep); continue; } diff --git a/usr/src/lib/lvm/libmeta/common/meta_error.c b/usr/src/lib/lvm/libmeta/common/meta_error.c index f98aed73a9..a5f0f45cbe 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_error.c +++ b/usr/src/lib/lvm/libmeta/common/meta_error.c @@ -1964,13 +1964,6 @@ ds_to_str( (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, "multiple namespace records detected")); break; - case MDE_DS_PARTIALSET: - (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, - "partial diskset detected\n" - "Please refer to the Solaris Volume Manager documentation," - "\nTroubleshooting section, at http://docs.sun.com or from" - "\nyour local copy")); - break; case MDE_DS_COMMDCTL_SUSPEND_NYD: (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, "rpc.mdcommd on host %s is not yet drained during " diff --git a/usr/src/lib/lvm/libmeta/common/meta_hotspares.c b/usr/src/lib/lvm/libmeta/common/meta_hotspares.c index 15f310d0b0..5ba8c84866 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_hotspares.c +++ b/usr/src/lib/lvm/libmeta/common/meta_hotspares.c @@ -926,7 +926,7 @@ add_hsp_name_mn_sides( if (nd->nd_nodeid == curside) continue; if (add_name(sp, nd->nd_nodeid, key, MD_HOTSPARES, - minor(NODEV), hsp_name, ep) == -1) { + minor(NODEV), hsp_name, 
NULL, NULL, ep) == -1) { return (-1); } } @@ -964,7 +964,7 @@ add_hsp_name_trad_sides( continue; if (sd->sd_nodes[i][0] != '\0') { if (add_name(sp, i, key, MD_HOTSPARES, minor(NODEV), - hsp_name, ep) == -1) { + hsp_name, NULL, NULL, ep) == -1) { return (-1); } } @@ -1014,7 +1014,7 @@ add_hsp_name( /* First add the record for the side of the current node. */ key = add_name(sp, thisside, MD_KEYWILD, MD_HOTSPARES, minor(NODEV), - hsp_name, ep); + hsp_name, NULL, NULL, ep); if (key == -1) { goto cleanup; } diff --git a/usr/src/lib/lvm/libmeta/common/meta_import.c b/usr/src/lib/lvm/libmeta/common/meta_import.c index 650df6fcff..cb3a6aaaf9 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_import.c +++ b/usr/src/lib/lvm/libmeta/common/meta_import.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -48,6 +47,8 @@ typedef struct did_list { dev_t dev; uint_t did_index; char *minor_name; + char *driver_name; + int available; struct did_list *next; } did_list_t; @@ -76,7 +77,11 @@ static replicated_disk_t *replicated_disk_list[MAX_DEVID_LEN + 1] = {NULL}; * The list of replicated disks is built just once and this flag is set * once it's done */ -static int replicated_disk_list_built = 0; +int replicated_disk_list_built_pass1 = 0; +int replicated_disk_list_built_pass2 = 0; +int *replicated_disk_list_built; + +static void free_did_list(did_list_t *did_listp); /* * Map logical blk to physical @@ -120,17 +125,15 @@ static md_im_drive_info_t * drive_append( md_im_drive_info_t **midpp, mddrivename_t *dnp, - void *devid, - void *rdevid, - void *devname, - int devid_sz, - char *minor_name, + did_list_t *nonrep_did_listp, + minor_t mnum, md_timeval32_t timestamp, md_im_replica_info_t *mirp ) { md_im_drive_info_t *midp; int o_devid_sz; + int devid_sz; for (; (*midpp != NULL); midpp = &((*midpp)->mid_next)) ; @@ -140,37 +143,47 @@ drive_append( midp->mid_dnp = dnp; /* - * If rdevid is not NULL then we know we are dealing with + * If rdid is not NULL then we know we are dealing with * replicated diskset case. 
'devid_sz' will always be the - * size of a valid devid which can be 'devid' or 'rdevid' + * size of a valid devid which can be 'did' or 'rdid' */ - midp->mid_devid = (void *)Malloc(devid_sz); - if (rdevid) { - (void) memcpy(midp->mid_devid, rdevid, devid_sz); + if (nonrep_did_listp->rdid) { + devid_sz = devid_sizeof(nonrep_did_listp->rdid); + midp->mid_devid = (void *)Malloc(devid_sz); + (void) memcpy(midp->mid_devid, nonrep_did_listp->rdid, + devid_sz); /* * Also need to store the 'other' devid */ - o_devid_sz = devid_sizeof((ddi_devid_t)devid); + o_devid_sz = devid_sizeof((ddi_devid_t)(nonrep_did_listp->did)); midp->mid_o_devid = (void *)Malloc(o_devid_sz); - (void) memcpy(midp->mid_o_devid, devid, o_devid_sz); + (void) memcpy(midp->mid_o_devid, nonrep_did_listp->did, + o_devid_sz); midp->mid_o_devid_sz = o_devid_sz; } else { + devid_sz = devid_sizeof(nonrep_did_listp->did); + midp->mid_devid = (void *)Malloc(devid_sz); /* * In the case of regular diskset, midp->mid_o_devid * will be a NULL pointer */ - (void) memcpy(midp->mid_devid, devid, devid_sz); + (void) memcpy(midp->mid_devid, nonrep_did_listp->did, devid_sz); } - if (devname) - midp->mid_devname = Strdup(devname); - midp->mid_devid_sz = devid_sz; midp->mid_setcreatetimestamp = timestamp; - (void) strlcpy(midp->mid_minor_name, minor_name, MDDB_MINOR_NAME_MAX); + midp->mid_available = nonrep_did_listp->available; + if (nonrep_did_listp->minor_name) { + (void) strlcpy(midp->mid_minor_name, + nonrep_did_listp->minor_name, MDDB_MINOR_NAME_MAX); + } + midp->mid_mnum = mnum; + if (nonrep_did_listp->driver_name) + midp->mid_driver_name = Strdup(nonrep_did_listp->driver_name); midp->mid_replicas = mirp; - + if (nonrep_did_listp->devname) + midp->mid_devname = Strdup(nonrep_did_listp->devname); return (midp); } @@ -187,17 +200,14 @@ static md_im_drive_info_t ** drive_append_wrapper( md_im_drive_info_t **tailpp, mddrivename_t *dnp, - void *devid, - void *rdevid, - void *devname, - int devid_sz, - char *minor_name, 
+ did_list_t *nonrep_did_listp, + minor_t mnum, md_timeval32_t timestamp, md_im_replica_info_t *mirp ) { - (void) drive_append(tailpp, dnp, devid, rdevid, devname, devid_sz, - minor_name, timestamp, mirp); + (void) drive_append(tailpp, dnp, nonrep_did_listp, mnum, timestamp, + mirp); if ((*tailpp)->mid_next == NULL) return (tailpp); @@ -302,7 +312,7 @@ map_replica_disk( * for the disk. * If you store the returned devid you must create a local copy. */ -static void * +void * replicated_list_lookup( uint_t devid_len, void *old_devid @@ -374,16 +384,13 @@ get_replica_disks( did_list_t *did_listp, mddb_mb_t *mb, mddb_lb_t *lbp, - md_error_t *ep, - int replicated + md_error_t *ep ) { mddrivename_t *dnp; int indx, on_list; mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); int flags; - int devid_sz; - char *minor_name; did_list_t *replica_disk; daddr32_t offset; daddr32_t length; @@ -391,63 +398,68 @@ get_replica_disks( md_im_replica_info_t **mirpp = NULL; md_im_drive_info_t **midpp = &misp->mis_drives; md_im_drive_info_t *midp; - void *did; for (indx = 0; indx < lbp->lb_loccnt; indx++) { on_list = 0; - if (lbp->lb_locators[indx].l_flags & MDDB_F_ACTIVE) { + if ((lbp->lb_locators[indx].l_flags == 0) || + (lbp->lb_locators[indx].l_flags & MDDB_F_DELETED)) + continue; - /* - * search the device id list for a - * specific ctds based on the locator - * block device id array index. - */ - replica_disk = map_replica_disk(did_listp, indx); + /* + * search the device id list for a + * specific ctds based on the locator + * block device id array index. + */ + replica_disk = map_replica_disk(did_listp, indx); - assert(replica_disk != NULL); + assert(replica_disk != NULL); - /* - * metadrivename() can fail for a slice name - * if there is not an existing mddrivename_t. - * So we use metadiskname() to strip the slice - * number. 
- */ - dnp = metadrivename(&sp, - metadiskname(replica_disk->devname), ep); + /* + * metadrivename() can fail for a slice name + * if there is not an existing mddrivename_t. + * So we use metadiskname() to strip the slice + * number. + */ + dnp = metadrivename(&sp, metadiskname(replica_disk->devname), + ep); - for (midp = misp->mis_drives; midp != NULL; - midp = midp->mid_next) { - if (dnp == midp->mid_dnp) { + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + if (dnp == midp->mid_dnp) { + /* + * You could get a dnp match, but if 1 disk + * is unavailable and the other isn't, they + * will have the same dnp due + * to the name being the same, but in fact + * are different disks. + */ + if (midp->mid_available == + replica_disk->available) { on_list = 1; mirpp = &midp->mid_replicas; break; } } + } - /* - * Get the correct devid_sz - */ - if (replicated) - did = replica_disk->rdid; - else - did = replica_disk->did; + /* + * New on the list so add it + */ + if (!on_list) { + mddb_mb_t *mbp; + uint_t sliceno; + mdname_t *rsp; + int fd = -1; - devid_sz = devid_sizeof((ddi_devid_t)did); - minor_name = replica_disk->minor_name; + mbp = Malloc(DEV_BSIZE); /* - * New on the list so add it + * If the disk isn't available, we don't + * want to try to read from it. 
*/ - if (!on_list) { - mddb_mb_t *mbp; - uint_t sliceno; - mdname_t *rsp; - int fd = -1; - - mbp = Malloc(DEV_BSIZE); - + if (replica_disk->available == MD_IM_DISK_AVAILABLE) { /* determine the replica slice */ if (meta_replicaslice(dnp, &sliceno, ep) != 0) { @@ -488,54 +500,57 @@ get_replica_disks( } (void) close(fd); - midpp = drive_append_wrapper(midpp, dnp, - replica_disk->did, replica_disk->rdid, - replica_disk->devname, - devid_sz, minor_name, mbp->mb_setcreatetime, - NULL); - mirpp = &((*midpp)->mid_replicas); - Free(mbp); } + midpp = drive_append_wrapper(midpp, dnp, + replica_disk, + meta_getminor(replica_disk->dev), + mbp->mb_setcreatetime, NULL); + mirpp = &((*midpp)->mid_replicas); + Free(mbp); + } - /* - * For either of these assertions to fail, it implies - * a NULL return from metadrivename() above. Since - * the args came from a presumed valid locator block, - * that's Bad. - */ - assert(midpp != NULL); - assert(mirpp != NULL); + /* + * For either of these assertions to fail, it implies + * a NULL return from metadrivename() above. Since + * the args came from a presumed valid locator block, + * that's Bad. + */ + assert(midpp != NULL); + assert(mirpp != NULL); - /* - * Extract the parameters describing this replica. - * - * The magic "1" in the length calculation accounts - * for the length of the master block, in addition to - * the block count it describes. (The master block - * will always take up one block on the disk, and - * there will always only be one master block per - * replica, even though much of the code is structured - * to handle noncontiguous replicas.) - */ - flags = lbp->lb_locators[indx].l_flags; - offset = lbp->lb_locators[indx].l_blkno; - length = mb->mb_blkcnt + 1; - timestamp = mb->mb_setcreatetime; + /* + * Extract the parameters describing this replica. + * + * The magic "1" in the length calculation accounts + * for the length of the master block, in addition to + * the block count it describes. 
(The master block + * will always take up one block on the disk, and + * there will always only be one master block per + * replica, even though much of the code is structured + * to handle noncontiguous replicas.) + */ + flags = lbp->lb_locators[indx].l_flags; + offset = lbp->lb_locators[indx].l_blkno; + length = mb->mb_blkcnt + 1; + timestamp = mb->mb_setcreatetime; - mirpp = replica_append_wrapper(mirpp, flags, - offset, length, timestamp); + mirpp = replica_append_wrapper(mirpp, flags, + offset, length, timestamp); - /* - * If we're here it means - - * - * a) we had an active copy of the replica, and - * b) we've added the disk to the list of - * disks as well. - * - * We need to bump up the number of active - * replica count for each such replica so that it - * can be used later for replica quorum check. - */ + /* + * If we're here it means - + * + * we've added the disk to the list of + * disks. + */ + + /* + * We need to bump up the number of active + * replica count for each such replica that is + * active so that it can be used later for replica + * quorum check. 
+ */ + if (flags & MDDB_F_ACTIVE) { misp->mis_active_replicas++; } } @@ -621,6 +636,8 @@ static void get_disks_from_didnamespace( md_im_set_desc_t *misp, pnm_rec_t **pnm, + mddb_rb_t *nm, + mddb_rb_t *shrnm, mddb_rb_t *did_nm, mddb_rb_t *did_shrnm, uint_t imp_flags, @@ -635,14 +652,24 @@ get_disks_from_didnamespace( mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); mddb_rb_t *rbp_did = did_nm; mddb_rb_t *rbp_did_shr = did_shrnm; + mddb_rb_t *rbp_nm = nm; + mddb_rb_t *rbp_shr_nm = shrnm; int on_list = 0; - int devid_sz; struct devid_min_rec *did_rec; struct devid_shr_rec *did_shr_rec; + struct nm_rec *namesp_rec; + struct nm_shr_rec *namesp_shr_rec; struct did_shr_name *did; struct did_min_name *min; void *r_did; /* NULL if not a replicated diskset */ void *valid_did; + int avail = 0; + struct nm_name *nmp; + struct nm_shared_name *snmp; + mdkey_t drv_key, key, dev_key; + minor_t mnum = 0; + did_list_t *nonrep_did_listp; + size_t used_size, offset; /* * We got a pointer to an mddb record, which we expect to contain a @@ -653,6 +680,10 @@ get_disks_from_didnamespace( /* LINTED */ did_shr_rec = (struct devid_shr_rec *) ((caddr_t)(&rbp_did_shr->rb_data)); + /* LINTED */ + namesp_rec = (struct nm_rec *)((caddr_t)(&rbp_nm->rb_data)); + /* LINTED */ + namesp_shr_rec = (struct nm_shr_rec *)((caddr_t)(&rbp_shr_nm->rb_data)); /* * Skip the nm_rec_hdr and iterate on the array of struct minor_name @@ -664,9 +695,10 @@ get_disks_from_didnamespace( on_list = 0; r_did = NULL; + nonrep_did_listp = Zalloc(sizeof (struct did_list)); /* - * For a give DID_NM key, locate the corresponding device + * For a given DID_NM key, locate the corresponding device * id from DID_NM_SHR */ for (did = &did_shr_rec->device_id[0]; did->did_key != 0; @@ -691,7 +723,7 @@ get_disks_from_didnamespace( * If replicated diskset */ if (replicated) { - size_t new_devid_len; + size_t new_devid_len, old_devid_len; char *temp; /* * In this case, did->did_devid will @@ -699,56 +731,206 @@ 
get_disks_from_didnamespace( */ temp = replicated_list_lookup(did->did_size, did->did_devid); - new_devid_len = devid_sizeof((ddi_devid_t)temp); - r_did = Zalloc(new_devid_len); - (void) memcpy(r_did, temp, new_devid_len); + if (temp == NULL) { + /* we have a partial replicated set, fake it */ + new_devid_len = did->did_size; + r_did = Zalloc(new_devid_len); + (void) memcpy(r_did, did->did_devid, + new_devid_len); + } else { + new_devid_len = devid_sizeof((ddi_devid_t)temp); + r_did = Zalloc(new_devid_len); + (void) memcpy(r_did, temp, new_devid_len); + } valid_did = r_did; + nonrep_did_listp->rdid = Zalloc(new_devid_len); + (void) memcpy(nonrep_did_listp->rdid, r_did, + new_devid_len); + old_devid_len = + devid_sizeof((ddi_devid_t)did->did_devid); + nonrep_did_listp->did = Zalloc(old_devid_len); + (void) memcpy((void *)nonrep_did_listp->did, + (void *)did->did_devid, old_devid_len); } else { + size_t new_devid_len; + valid_did = did->did_devid; + new_devid_len = + devid_sizeof((ddi_devid_t)did->did_devid); + nonrep_did_listp->did = Zalloc(new_devid_len); + (void) memcpy((void *)nonrep_did_listp->did, + (void *)did->did_devid, new_devid_len); } - /* Get the ctds mapping for that device id */ + /* + * Get a ctds mapping for that device id. + * Since disk is being imported into this system, + * just use the first ctds in list. + */ if (meta_deviceid_to_nmlist(search_path, (ddi_devid_t)valid_did, &min->min_name[0], &nmlist) == 0) { + /* + * We know the disk is available. Use the + * device information in nmlist. + */ + assert(nmlist[0].devname != NULL); + nonrep_did_listp->devname = Strdup(nmlist[0].devname); + nonrep_did_listp->available = MD_IM_DISK_AVAILABLE; + avail = 0; + mnum = meta_getminor(nmlist[0].dev); + devid_free_nmlist(nmlist); + } else { + /* + * The disk is not available. That means we need to + * use the (old) device information stored in the + * namespace. 
+ */ + /* search in nm space for a match */ + offset = sizeof (struct nm_rec) - + sizeof (struct nm_name); + used_size = namesp_rec->r_rec_hdr.r_used_size - offset; + for (nmp = &namesp_rec->r_name[0]; nmp->n_key != 0; + /* LINTED */ + nmp = (struct nm_name *)((char *)nmp + + NAMSIZ(nmp))) { + if (nmp->n_key == min->min_key) + break; + used_size -= NAMSIZ(nmp); + if ((int)used_size <= 0) { + md_exit(NULL, 1); + } + } - assert(nmlist->devname != NULL); - dnp = metadrivename(&sp, - metadiskname(nmlist->devname), ep); + if (nmp->n_key == 0) { + assert(nmp->n_key != 0); + md_exit(NULL, 1); + } + dev_key = nmp->n_dir_key; + snmp = &namesp_shr_rec->sr_name[0]; + key = snmp->sn_key; /* - * Add drive to pnm_rec_t list of physical devices for - * metastat output. + * Use the namespace n_dir_key to look in the + * shared namespace. When we find the matching + * key, that is the devname and minor number we + * want. */ - if (imp_flags & META_IMP_VERBOSE) { - append_pnm_rec(pnm, min->min_key, - nmlist->devname); + offset = sizeof (struct nm_shr_rec) - + sizeof (struct nm_shared_name); + used_size = namesp_shr_rec->sr_rec_hdr.r_used_size - + offset; + while (key != 0) { + if (dev_key == key) { + /* + * This complicated looking series + * of code creates a devname of the + * form <sn_name>/<n_name> which + * will look like /dev/dsk/c1t4d0s0. 
+ */ + nonrep_did_listp->devname = + Zalloc(strlen(nmp->n_name) + + strlen(snmp->sn_name) + 2); + (void) strlcpy( + nonrep_did_listp->devname, + snmp->sn_name, + strlen(snmp->sn_name)); + (void) strlcat( + nonrep_did_listp->devname, "/", + strlen(nmp->n_name) + + strlen(snmp->sn_name) + 2); + (void) strlcat( + nonrep_did_listp->devname, + nmp->n_name, + strlen(nmp->n_name) + + strlen(snmp->sn_name) + 2); + mnum = nmp->n_minor; + break; + } + /* LINTED */ + snmp = (struct nm_shared_name *)((char *)snmp + + SHR_NAMSIZ(snmp)); + key = snmp->sn_key; + used_size -= SHR_NAMSIZ(snmp); + if ((int)used_size <= 0) { + md_exit(NULL, 1); + } + } + if (key == 0) { + nonrep_did_listp->devname = NULL; + mnum = 0; } - assert(dnp != NULL); - /* Is it already on the list? */ - for (midp = misp->mis_drives; midp != NULL; - midp = midp->mid_next) { - if (midp->mid_dnp == dnp) { - on_list = 1; + nonrep_did_listp->available = MD_IM_DISK_NOT_AVAILABLE; + nonrep_did_listp->minor_name = Strdup(min->min_name); + avail = 1; + drv_key = nmp->n_drv_key; + snmp = &namesp_shr_rec->sr_name[0]; + key = snmp->sn_key; + /* + * Use the namespace n_drv_key to look in the + * shared namespace. When we find the matching + * key, that is the driver name for the disk. + */ + offset = sizeof (struct nm_shr_rec) - + sizeof (struct nm_shared_name); + used_size = namesp_shr_rec->sr_rec_hdr.r_used_size - + offset; + while (key != 0) { + if (drv_key == key) { + nonrep_did_listp->driver_name = + Strdup(snmp->sn_name); break; } + /* LINTED */ + snmp = (struct nm_shared_name *)((char *)snmp + + SHR_NAMSIZ(snmp)); + key = snmp->sn_key; + used_size -= SHR_NAMSIZ(snmp); + if ((int)used_size <= 0) { + md_exit(NULL, 1); + } } + if (key == 0) + nonrep_did_listp->driver_name = NULL; + } + dnp = metadrivename(&sp, + metadiskname(nonrep_did_listp->devname), ep); + /* + * Add drive to pnm_rec_t list of physical devices for + * metastat output. 
+ */ + if (imp_flags & META_IMP_VERBOSE) { + append_pnm_rec(pnm, min->min_key, + nonrep_did_listp->devname); + } - devid_sz = devid_sizeof( - (ddi_devid_t)valid_did); + assert(dnp != NULL); + /* Is it already on the list? */ + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + if (midp->mid_dnp == dnp) { + if (midp->mid_available == + nonrep_did_listp->available) { + on_list = 1; + break; + } + } + } - if (!on_list) { - mddb_mb_t *mbp; - uint_t sliceno; - mdname_t *rsp; - int fd = -1; + if (!on_list) { + mddb_mb_t *mbp; + uint_t sliceno; + mdname_t *rsp; + int fd = -1; - mbp = Malloc(DEV_BSIZE); + mbp = Malloc(DEV_BSIZE); + if (!avail) { /* determine the replica slice */ if (meta_replicaslice(dnp, &sliceno, ep) != 0) { Free(mbp); + free_did_list(nonrep_did_listp); continue; } @@ -759,18 +941,21 @@ get_disks_from_didnamespace( if (dnp->vtoc.parts[sliceno].size == 0) { Free(mbp); + free_did_list(nonrep_did_listp); continue; } if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL) { Free(mbp); + free_did_list(nonrep_did_listp); continue; } if ((fd = open(rsp->rname, O_RDONLY| O_NDELAY)) < 0) { Free(mbp); + free_did_list(nonrep_did_listp); continue; } @@ -781,26 +966,26 @@ get_disks_from_didnamespace( DEV_BSIZE) <= 0) { mdclrerror(ep); Free(mbp); - (void) close(fd); - continue; + free_did_list(nonrep_did_listp); + (void) close(fd); + continue; } (void) close(fd); - /* - * If it is replicated diskset, - * r_did will be non-NULL and - * devid_sz will be its size. - * Passing the devname as NULL because field - * is not currently used for a non-replica disk. - */ - midpp = drive_append_wrapper(midpp, - dnp, &did->did_devid, r_did, NULL, - devid_sz, &min->min_name[0], - mbp->mb_setcreatetime, NULL); - Free(mbp); } - devid_free_nmlist(nmlist); + /* + * If it is replicated diskset, + * r_did will be non-NULL. + * Passing the devname as NULL because field + * is not currently used for a non-replica disk. 
+ */ + midpp = drive_append_wrapper(midpp, + dnp, nonrep_did_listp, + mnum, mbp->mb_setcreatetime, NULL); + Free(mbp); + free_did_list(nonrep_did_listp); } + free_did_list(nonrep_did_listp); } } @@ -821,17 +1006,19 @@ set_append( mddb_mb_t *mb, mddb_lb_t *lbp, mddb_rb_t *nm, + mddb_rb_t *shrnm, pnm_rec_t **pnm, mddb_rb_t *did_nm, mddb_rb_t *did_shrnm, uint_t imp_flags, - int replicated, md_error_t *ep ) { md_im_set_desc_t *misp; set_t setno = mb->mb_setno; + int partial = imp_flags & MD_IM_PARTIAL_DISKSET; + int replicated = imp_flags & MD_IM_SET_REPLICATED; /* run to end of list */ for (; (*mispp != NULL); mispp = &((*mispp)->mis_next)) @@ -844,12 +1031,13 @@ set_append( misp->mis_flags = MD_IM_SET_REPLICATED; misp->mis_oldsetno = setno; + misp->mis_partial = partial; /* Get the disks with and without replicas */ - get_replica_disks(misp, did_listp, mb, lbp, ep, replicated); + get_replica_disks(misp, did_listp, mb, lbp, ep); if (nm != NULL && did_nm != NULL && did_shrnm != NULL) { - get_disks_from_didnamespace(misp, pnm, did_nm, + get_disks_from_didnamespace(misp, pnm, nm, shrnm, did_nm, did_shrnm, imp_flags, replicated, ep); } @@ -1404,8 +1592,8 @@ read_nm_rec( * ids; the caller of this routine is responsible for free'ing up the memory. 
* * Returns: - * 1 if it's a replicated disk - * 0 if it's not a replicated disk + * MD_IM_SET_REPLICATED if it's a replicated disk + * 0 if it's not a replicated disk */ static int is_replicated( @@ -1426,7 +1614,7 @@ is_replicated( return (retval); if (devid_compare((ddi_devid_t)mbp->mb_devid, current_devid) != 0) - retval = 1; + retval = MD_IM_SET_REPLICATED; if (retval && need_devid) { new_devid_len = devid_sizeof(current_devid); @@ -1474,7 +1662,7 @@ free_replicated_disks_list() * 1 on success * 0 on failure */ -static int +int build_replicated_disks_list( md_error_t *ep, mddrivenamelist_t *dnlp @@ -1522,7 +1710,7 @@ build_replicated_disks_list( } (void) close(fd); } - replicated_disk_list_built = 1; + *replicated_disk_list_built = 1; Free(mbp); return (1); @@ -1553,6 +1741,102 @@ free_did_list( Free(temp->devname); if (temp->minor_name) Free(temp->minor_name); + if (temp->driver_name) + Free(temp->driver_name); + Free(temp); + } +} + +/* + * meta_free_im_replica_info + * + * Frees the md_im_replica_info list + */ +static void +meta_free_im_replica_info( + md_im_replica_info_t *mirp +) +{ + md_im_replica_info_t *r, *temp; + + r = mirp; + + while (r != NULL) { + temp = r; + r = r->mir_next; + + Free(temp); + } +} + +/* + * meta_free_im_drive_info + * + * Frees the md_im_drive_info list + */ +static void +meta_free_im_drive_info( + md_im_drive_info_t *midp +) +{ + md_im_drive_info_t *d, *temp; + + d = midp; + + while (d != NULL) { + temp = d; + d = d->mid_next; + + if (temp->mid_available & MD_IM_DISK_NOT_AVAILABLE) + /* + * dnp is not on the drivenamelist and is a temp + * dnp for metaimport if the disk is unavailable. + * We need to specifically free it because of this. + * If the disk is available, standard drivelist freeing + * will kick in so we don't need to do it. 
+ */ + metafreedrivename(temp->mid_dnp); + if (temp->mid_devid) + Free(temp->mid_devid); + if (temp->mid_o_devid) + Free(temp->mid_o_devid); + if (temp->mid_driver_name) + Free(temp->mid_driver_name); + if (temp->mid_devname) + Free(temp->mid_devname); + if (temp->mid_replicas) { + meta_free_im_replica_info(temp->mid_replicas); + temp->mid_replicas = NULL; + } + if (temp->overlap) { + meta_free_im_drive_info(temp->overlap); + temp->overlap = NULL; + } + Free(temp); + } +} + +/* + * meta_free_im_set_desc + * + * Frees the md_im_set_desc_t list + */ +void +meta_free_im_set_desc( + md_im_set_desc_t *misp +) +{ + md_im_set_desc_t *s, *temp; + + s = misp; + + while (s != NULL) { + temp = s; + s = s->mis_next; + if (temp->mis_drives) { + meta_free_im_drive_info(temp->mis_drives); + temp->mis_drives = NULL; + } Free(temp); } } @@ -1577,7 +1861,9 @@ build_did_list( md_error_t *ep, int fd, mddb_mb_t *mb, + mddb_lb_t *lbp, mddb_did_blk_t *lbdidp, + mddb_ln_t *lnp, did_list_t **did_listp, int replicated ) @@ -1593,8 +1879,11 @@ build_did_list( mddb_did_info_t *did_info = NULL; void *did = NULL; size_t new_devid_len; + int partial = 0; + int partial_replicated = 0; for (cnt = 0; cnt < MDDB_NLB; cnt++) { + partial_replicated = 0; did_info = &lbdidp->blk_info[cnt]; if (!(did_info->info_flags & MDDB_DID_EXISTS)) @@ -1604,7 +1893,7 @@ build_did_list( new->did = Zalloc(did_info->info_length); /* - * If we can re-use the buffer already has been + * If we can re-use the buffer that has already been * read in then just use it. 
Otherwise free * the previous one and alloc a new one */ @@ -1646,10 +1935,19 @@ build_did_list( if (replicated) { temp = replicated_list_lookup(did_info->info_length, new->did); - new_devid_len = devid_sizeof((ddi_devid_t)temp); - new->rdid = Zalloc(new_devid_len); - (void) memcpy(new->rdid, temp, new_devid_len); - did = new->rdid; + if (temp == NULL) { + /* we have a partial replicated set, fake it */ + new_devid_len = devid_sizeof((ddi_devid_t)new->did); + new->rdid = Zalloc(new_devid_len); + (void) memcpy(new->rdid, new->did, new_devid_len); + did = new->rdid; + partial_replicated = 1; + } else { + new_devid_len = devid_sizeof((ddi_devid_t)temp); + new->rdid = Zalloc(new_devid_len); + (void) memcpy(new->rdid, temp, new_devid_len); + did = new->rdid; + } } else { did = new->did; } @@ -1658,20 +1956,42 @@ build_did_list( return (-1); } - if ((rval = meta_deviceid_to_nmlist(search_path, - (ddi_devid_t)did, minor_name, &nm)) != 0) { - *did_listp = head; - free_did_list(*did_listp); - *did_listp = NULL; - (void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD, - mynode(), NULL, NULL); - return (ENOTSUP); + if (partial_replicated || meta_deviceid_to_nmlist(search_path, + (ddi_devid_t)did, minor_name, &nm) != 0) { + int len = 0; + + /* + * Partial diskset case. We'll need to get the + * device information from the metadb instead + * of the output (nm) of meta_deviceid_to_nmlist. 
+ */ + len = strlen(lnp->ln_prefixes[0].pre_data) + + strlen(lnp->ln_suffixes[0][cnt].suf_data) + 2; + new->devname = Zalloc(len); + (void) strlcpy(new->devname, + lnp->ln_prefixes[0].pre_data, + strlen(lnp->ln_prefixes[0].pre_data) + 1); + (void) strlcat(new->devname, "/", len); + (void) strlcat(new->devname, + lnp->ln_suffixes[0][cnt].suf_data, len); + new->minor_name = Strdup(minor_name); + new->next = head; + new->available = MD_IM_DISK_NOT_AVAILABLE; + new->driver_name = Strdup(lbp->lb_drvnm[0].dn_data); + new->dev = lbp->lb_locators[cnt].l_dev; + head = new; + partial = ENOTSUP; + continue; } + /* + * Disk is there. Grab device information from nm structure. + */ assert(nm->devname != NULL); new->devname = Strdup(nm->devname); new->dev = nm->dev; new->minor_name = Strdup(minor_name); + new->available = MD_IM_DISK_AVAILABLE; devid_free_nmlist(nm); @@ -1683,6 +2003,8 @@ build_did_list( if (bp) Free(bp); *did_listp = head; + if (partial) + return (partial); return (1); } /* @@ -1698,7 +2020,6 @@ build_did_list( */ static int check_nm_disks( - md_error_t *ep, struct devid_min_rec *did_nmp, struct devid_shr_rec *did_shrnmp ) @@ -1751,8 +2072,7 @@ check_nm_disks( */ if ((meta_deviceid_to_nmlist(search_path, did, minor_name, &nm)) != 0) { - (void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD, - mynode(), NULL, NULL); + /* Partial diskset detected */ return (ENOTSUP); } devid_free_nmlist(nm); @@ -1828,6 +2148,86 @@ report_metadb_info( (void) printf("\n"); } +/* + * meta_replica_quorum will determine if the disks in the set to be + * imported have enough valid replicas to have quorum. 
+ * + * RETURN: + * -1 Set doesn't have quorum + * 0 Set does have quorum + */ +int +meta_replica_quorum( + md_im_set_desc_t *misp +) +{ + md_im_drive_info_t *midp; + md_im_replica_info_t *midr; + int replica_count = 0; + + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + + if (midp->mid_available == MD_IM_DISK_NOT_AVAILABLE) + continue; + + /* + * The drive is okay. Now count its replicas + */ + for (midr = midp->mid_replicas; midr != NULL; + midr = midr->mir_next) { + replica_count++; + } + } + + if (misp->mis_active_replicas & 1) { + /* odd number of replicas */ + if (replica_count < (misp->mis_active_replicas + 1)/2) + return (-1); + } else { + /* even number of replicas */ + if (replica_count <= ((misp->mis_active_replicas + 1)/2)) + return (-1); + } + + return (0); +} + + +/* + * Choose the best drive to use for the metaimport command. + */ +md_im_drive_info_t * +pick_good_disk(md_im_set_desc_t *misp) +{ + md_timeval32_t *setcrtime; /* set creation time */ + md_im_drive_info_t *good_disk = NULL; + md_im_drive_info_t *midp = NULL; + md_im_replica_info_t *mirp; + + setcrtime = &(misp->mis_drives->mid_replicas->mir_timestamp); + for (midp = misp->mis_drives; (midp != NULL) && (good_disk == NULL); + midp = midp->mid_next) { + /* drive must be available */ + if (midp->mid_available == MD_IM_DISK_NOT_AVAILABLE) { + continue; + } + for (mirp = midp->mid_replicas; mirp != NULL; + mirp = mirp->mir_next) { + /* replica must be active to be a good one */ + if (mirp->mir_flags & MDDB_F_ACTIVE) { + if ((setcrtime->tv_sec == + midp-> mid_setcreatetimestamp.tv_sec) && + (setcrtime->tv_usec == + midp->mid_setcreatetimestamp.tv_usec)) { + good_disk = midp; + break; + } + } + } + } + return (good_disk); +} /* * report_set_info() @@ -1848,22 +2248,21 @@ report_set_info( int fd, uint_t imp_flags, int set_count, + int overlap, + md_im_drive_info_t *overlap_disks, md_error_t *ep ) { int rval = 0; md_im_drive_info_t *d; - md_im_replica_info_t *r; 
md_im_drive_info_t *good_disk = NULL; int i; int in = META_INDENT; char indent[MAXPATHLEN]; - int dlen = 0; - md_timeval32_t firstdisktime; md_timeval32_t lastaccess; /* stores last modified timestamp */ - int set_contains_time_conflict = 0; - int disk_time_conflict = 0; - + int has_overlap = 0; + int no_quorum = 0; + int partial = 0; /* Calculates the correct indentation. */ indent[0] = 0; @@ -1881,99 +2280,113 @@ report_set_info( } } + partial = misp->mis_partial; + good_disk = pick_good_disk(misp); + if (good_disk == NULL) { + return (rval); + } + /* * Make the distinction between a regular diskset and - * a replicated diskset. + * a replicated diskset. Also make the distinction + * between a partial vs. full diskset. */ - if (misp->mis_flags & MD_IM_SET_REPLICATED) { - if (imp_flags & META_IMP_REPORT) { - (void) printf("%i) %s:\n", set_count, gettext( - "Found replicated diskset containing disks")); + if (partial == MD_IM_PARTIAL_DISKSET) { + if (misp->mis_flags & MD_IM_SET_REPLICATED) { + if (imp_flags & META_IMP_REPORT) { + (void) printf("%i) %s:\n", set_count, gettext( + "Found partial replicated diskset " + "containing disks")); + } else { + (void) printf("\n%s:\n", gettext( + "Importing partial replicated diskset " + "containing disks")); + } } else { - (void) printf("\n%s:\n", gettext( - "Importing replicated diskset containing disks")); + if (imp_flags & META_IMP_REPORT) { + (void) printf("%i) %s:\n", set_count, gettext( + "Found partial regular diskset containing " + "disks")); + } else { + (void) printf("\n%s:\n", gettext( + "Importing partial regular diskset " + "containing disks")); + } } } else { - if (imp_flags & META_IMP_REPORT) { - (void) printf("%i) %s:\n", set_count, gettext( - "Found regular diskset containing disks")); + if (misp->mis_flags & MD_IM_SET_REPLICATED) { + if (imp_flags & META_IMP_REPORT) { + (void) printf("%i) %s:\n", set_count, gettext( + "Found replicated diskset containing " + "disks")); + } else { + (void) printf("\n%s:\n", 
gettext( + "Importing replicated diskset containing " + "disks")); + } } else { - (void) printf("\n%s:\n", gettext( - "Importing regular diskset containing disks")); + if (imp_flags & META_IMP_REPORT) { + (void) printf("%i) %s:\n", set_count, gettext( + "Found regular diskset containing disks")); + } else { + (void) printf("\n%s:\n", gettext( + "Importing regular diskset containing " + "disks")); + } } } - /* - * Save the set creation time for the first disk in the - * diskset. + * Check each drive in the set. If it's unavailable or + * an overlap tell the user. */ for (d = misp->mis_drives; d != NULL; d = d->mid_next) { - dlen = max(dlen, strlen(d->mid_dnp->cname)); - if (good_disk == NULL) { - for (r = d->mid_replicas; r != NULL; r = r->mir_next) { - if (r->mir_flags & MDDB_F_ACTIVE) { - good_disk = d; - firstdisktime = - d->mid_setcreatetimestamp; + (void) fprintf(stdout, " %s", d->mid_dnp->cname); + if (d->mid_available == MD_IM_DISK_NOT_AVAILABLE) { + (void) fprintf(stdout, " (UNAVAIL)"); + } + if (overlap) { + md_im_drive_info_t **chain; + /* + * There is the potential for an overlap, see if + * this disk is one of the overlapped disks. + */ + for (chain = &overlap_disks; *chain != NULL; + chain = &(*chain)->overlap) { + if (strcmp(d->mid_dnp->cname, + (*chain)->mid_dnp->cname) == 0) { + (void) fprintf(stdout, " (CONFLICT)"); + has_overlap = 1; break; } } - } else { - break; } + (void) fprintf(stdout, "\n"); } - /* - * Compares the set creation time from the first disk in the - * diskset to the diskset creation time on all other - * disks in the diskset. - * If they are different then the disk probably belongs to a - * different diskset so we will print out a warning. - * - * Looping through all drives in the diskset to print - * out information about the drive. + * This note explains the (UNAVAIL) that appears next to the + * disks in the diskset that are not available. 
*/ - for (d = misp->mis_drives; d != NULL; disk_time_conflict = 0, - d = d->mid_next) { - /* - * Verify that the disk's seconds and micro-seconds fields - * match the fields for the good_disk. - */ - if ((firstdisktime.tv_sec != - d->mid_setcreatetimestamp.tv_sec) || - (firstdisktime.tv_usec != - d->mid_setcreatetimestamp.tv_usec)) { - disk_time_conflict = 1; - set_contains_time_conflict = 1; - } - - /* Printing disk names. */ - if (disk_time_conflict == 1) { - /* print '*' next to conflicting disk */ - (void) printf("%s%-*.*s *\n", indent, - dlen, dlen, d->mid_dnp->cname); - } else { - (void) printf("%s%-*.*s\n", indent, - dlen, dlen, d->mid_dnp->cname); - } + if (partial) { + (void) printf("%s%s\n%s%s\n\n", indent, + gettext("(UNAVAIL) WARNING: This disk is unavailable on" + " this system."), indent, gettext("Import may corrupt " + "data in the diskset.")); } - (void) printf("\n"); /* - * This note explains the "*" that appears next to the - * disks with metadbs' whose lb_inittime timestamp does not + * This note explains the (CONFLICT) that appears next to the + * disks whose lb_inittime timestamp does not * match the rest of the diskset. 
*/ - if (set_contains_time_conflict) { + if (has_overlap) { (void) printf("%s%s\n%s%s\n\n", indent, - gettext("* WARNING: This disk has been reused in " - "another diskset."), indent, gettext("Import may corrupt " - "data in the diskset.")); + gettext("(CONFLICT) WARNING: This disk has been reused in " + "another diskset or system configuration."), indent, + gettext("Import may corrupt data in the diskset.")); } - /* * If the verbose flag was given on the command line, * we will print out the metastat -c information , the @@ -2039,6 +2452,10 @@ report_set_info( gettext("For more information about this diskset"), indent, myname, good_disk->mid_dnp->cname); } + + if (meta_replica_quorum(misp) != 0) + no_quorum = 1; + /* * TRANSLATION_NOTE * @@ -2047,9 +2464,15 @@ report_set_info( * (untranslatable) that the user may use to import * the specified diskset. */ - (void) printf("%s%s:\n%s %s -s <newsetname> %s\n", indent, - gettext("To import this diskset"), indent, myname, - good_disk->mid_dnp->cname); + if (partial || has_overlap || no_quorum) { + (void) printf("%s%s:\n%s %s -f -s <newsetname> %s\n", + indent, gettext("To import this diskset"), indent, + myname, good_disk->mid_dnp->cname); + } else { + (void) printf("%s%s:\n%s %s -s <newsetname> %s\n", + indent, gettext("To import this diskset"), indent, + myname, good_disk->mid_dnp->cname); + } } (void) printf("\n\n"); @@ -2063,12 +2486,12 @@ report_set_info( * Scans a given drive for set specific information. If the given drive * has a shared metadb, scans the shared metadb for information pertaining * to the set. + * If imp_flags has META_IMP_PASS1 set don't report. 
* * Returns: * <0 for failure * 0 success but no replicas were found * 1 success and a replica was found - * ENOTSUP for partial disksets detected */ int meta_get_and_report_set_info( @@ -2077,6 +2500,8 @@ meta_get_and_report_set_info( int local_mb_ok, uint_t imp_flags, int *set_count, + int overlap, + md_im_drive_info_t *overlap_disks, md_error_t *ep ) { @@ -2100,13 +2525,15 @@ meta_get_and_report_set_info( mddrivenamelist_t *dnlp; mddrivename_t *dnp; md_im_names_t cnames = { 0, NULL}; - char *nm = NULL; + char *nm = NULL, *shrnm = NULL; char *did_nm = NULL, *did_shrnm = NULL; struct nm_rec *nmp; + struct nm_shr_rec *snmp; struct devid_shr_rec *did_shrnmp; struct devid_min_rec *did_nmp; int extended_namespace = 0; int replicated = 0; + int partial = 0; pnm_rec_t *pnm = NULL; /* list of physical devs in set */ md_im_set_desc_t *misp; @@ -2198,7 +2625,18 @@ meta_get_and_report_set_info( * the locator block are invalid and we need to build a list of * replicated disks. */ - if (replicated && !replicated_disk_list_built) { + if (imp_flags & META_IMP_PASS1) { + /* + * We need to do this for both passes but + * replicated_disk_list_built is global so we need some way + * to determine which pass we're on. Set it to the appropriate + * pass's flag. + */ + replicated_disk_list_built = &replicated_disk_list_built_pass1; + } else { + replicated_disk_list_built = &replicated_disk_list_built_pass2; + } + if (replicated && !(*replicated_disk_list_built)) { /* * if there's a replicated diskset involved, we need to * scan the system one more time and build a list of all @@ -2214,11 +2652,6 @@ meta_get_and_report_set_info( goto out; } - rval = build_did_list(ep, fd, mbp, lbdidp, &did_listp, replicated); - - if ((rval <= 0) || (rval == ENOTSUP)) - goto out; - /* * Until here, we've gotten away with fixed sizes for the * master block and locator block. 
The locator names, @@ -2231,6 +2664,20 @@ meta_get_and_report_set_info( if ((rval = read_locator_names(ep, fd, mbp, lbp, lnp, lnsize)) <= 0) goto out; + rval = build_did_list(ep, fd, mbp, lbp, lbdidp, lnp, &did_listp, + replicated); + + /* + * An rval of ENOTSUP means we have a partial diskset. We'll want + * to set the partial variable so we can pass this information + * set_append_wrapper later for placing on the misp list. + */ + if (rval == ENOTSUP) + partial = MD_IM_PARTIAL_DISKSET; + + if (rval < 0) + goto out; + /* * Read in the NM record * If no NM record was found, it still is a valid configuration @@ -2260,6 +2707,20 @@ meta_get_and_report_set_info( goto out; } + if ((rval = read_nm_rec(ep, fd, mbp, lbp, &shrnm, MDDB_SHR_NM, + rsp->cname)) < 0) + goto out; + else if (rval == 0) + goto append; + + /*LINTED*/ + snmp = (struct nm_shr_rec *)(shrnm + sizeof (mddb_rb_t)); + if (snmp->sr_rec_hdr.r_next_recid != (mddb_recid_t)0) { + extended_namespace = 1; + rval = 0; + goto out; + } + if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_nm, MDDB_DID_NM, rsp->cname)) < 0) goto out; @@ -2295,23 +2756,36 @@ meta_get_and_report_set_info( * are actually available. If they aren't we'll return with * an ENOTSUP error which indicates a partial diskset. */ - rval = check_nm_disks(ep, did_nmp, did_shrnmp); - if ((rval < 0) || (rval == ENOTSUP)) + rval = check_nm_disks(did_nmp, did_shrnmp); + + /* + * An rval of ENOTSUP means we have a partial diskset. We'll want + * to set the partial variable so we can pass this information + * to set_append_wrapper later for placing on the misp list. + */ + if (rval == ENOTSUP) + partial = MD_IM_PARTIAL_DISKSET; + + if (rval < 0) goto out; append: /* Finally, we've got what we need to process this replica. 
*/ misp = set_append(mispp, did_listp, mbp, lbp, /*LINTED*/ - (mddb_rb_t *)nm, &pnm, (mddb_rb_t *)did_nm, (mddb_rb_t *)did_shrnm, - imp_flags, replicated, ep); - - *set_count += 1; - rval = report_set_info(misp, mbp, lbp, - /*LINTED*/ - (mddb_rb_t *)nm, &pnm, rsp, fd, imp_flags, *set_count, ep); - if (rval < 0) - goto out; + (mddb_rb_t *)nm, (mddb_rb_t *)shrnm, &pnm, (mddb_rb_t *)did_nm, + /*LINTED*/ + (mddb_rb_t *)did_shrnm, (imp_flags | partial | replicated), ep); + + if (!(imp_flags & META_IMP_PASS1)) { + *set_count += 1; + rval = report_set_info(misp, mbp, lbp, + /*LINTED*/ + (mddb_rb_t *)nm, &pnm, rsp, fd, imp_flags, *set_count, + overlap, overlap_disks, ep); + if (rval < 0) + goto out; + } /* Return the fact that we found at least one set */ rval = 1; @@ -2376,48 +2850,563 @@ meta_getminor_name( return (ret_minor_name); } -static int -meta_replica_quorum( - md_im_set_desc_t *misp, - md_error_t *ep +/* + * meta_update_mb_did + * + * Update or create the master block with the new set number. + * If a non-null devid pointer is given, the devid in the + * master block will also be changed. + * + * This routine is called during the import of a diskset + * (meta_imp_update_mb) and during the take of a diskset that has + * some unresolved replicated drives (meta_unrslv_replicated_mb). + * + * Returns : nothing (void) + */ +static void +meta_update_mb_did( + mdsetname_t *sp, + mddrivename_t *dnp, /* raw name of drive with mb */ + void *new_devid, /* devid to be stored in mb */ + int new_devid_len, + void *old_devid, /* old devid stored in mb */ + int replica_present, /* does replica follow mb? 
*/ + int offset, + md_error_t *ep ) { + int fd; + struct mddb_mb *mbp; + uint_t sliceno; + mdname_t *rsp; + + /* determine the replica slice */ + if (meta_replicaslice(dnp, &sliceno, ep) != 0) { + return; + } + + /* + * if the replica slice size is zero, + * don't bother opening + */ + if (dnp->vtoc.parts[sliceno].size == 0) { + return; + } + + if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL) { + return; + } + + if ((fd = open(rsp->rname, O_RDWR | O_NDELAY)) < 0) { + return; + } + + if (lseek(fd, (off_t)dbtob(offset), SEEK_SET) < 0) + return; + + mbp = Zalloc(DEV_BSIZE); + if (read(fd, mbp, DEV_BSIZE) != DEV_BSIZE) { + Free(mbp); + return; + } + + /* If no replica on disk, check for dummy mb */ + if (replica_present == NULL) { + /* + * Check to see if there is a dummy there. If not + * create one. This would happen if the set was + * created before the master block dummy code was + * implemented. + */ + if ((mbp->mb_magic != MDDB_MAGIC_DU) || + (mbp->mb_revision != MDDB_REV_MB)) { + meta_mkdummymaster(sp, fd, offset); + Free(mbp); + return; + } + } + + mbp->mb_setno = sp->setno; + if (meta_gettimeofday(&mbp->mb_timestamp) == -1) { + Free(mbp); + return; + } + + /* + * If a old_devid is non-NULL then we're are dealing with a + * replicated diskset and the devid needs to be updated. + */ + if (old_devid) { + if (mbp->mb_devid_magic == MDDB_MAGIC_DE) { + if (mbp->mb_devid_len) + (void) memset(mbp->mb_devid, 0, + mbp->mb_devid_len); + (void) memcpy(mbp->mb_devid, + (char *)new_devid, new_devid_len); + mbp->mb_devid_len = new_devid_len; + } + } + + crcgen((uchar_t *)mbp, (uint_t *)&mbp->mb_checksum, + (uint_t)DEV_BSIZE, (crc_skip_t *)NULL); + + /* + * Now write out the changes to disk. + * If an error occurs, just continue on. + * Next take of set will register this drive as + * an unresolved replicated drive and will attempt + * to fix the master block again. 
+ */ + if (lseek(fd, (off_t)dbtob(offset), SEEK_SET) < 0) { + Free(mbp); + return; + } + if (write(fd, mbp, DEV_BSIZE) != DEV_BSIZE) { + Free(mbp); + return; + } + + Free(mbp); + (void) close(fd); +} + + +/* + * meta_imp_update_mb + * + * Update the master block information during an import. + * Takes an import set descriptor. + * + * Returns : nothing (void) + */ +void +meta_imp_update_mb(mdsetname_t *sp, md_im_set_desc_t *misp, md_error_t *ep) +{ md_im_drive_info_t *midp; mddrivename_t *dnp; - md_im_replica_info_t *midr; - mdname_t *np; - struct stat st_buf; - uint_t rep_slice; - int replica_count = 0; + int offset = 16; /* default mb offset is 16 */ - for (midp = misp->mis_drives; midp != NULL; - midp = midp->mid_next) { + for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) { + /* + * If disk isn't available we can't update, so go to next + */ + if (midp->mid_available == MD_IM_DISK_NOT_AVAILABLE) { + continue; + } dnp = midp->mid_dnp; - if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || - ((np = metaslicename(dnp, rep_slice, ep)) + if (midp->mid_replicas) { + md_im_replica_info_t *mirp; + + /* + * If we have replicas on this disk we need to make + * sure that we update the master block on every + * replica on the disk. + */ + for (mirp = midp->mid_replicas; mirp != NULL; + mirp = mirp->mir_next) { + offset = mirp->mir_offset; + meta_update_mb_did(sp, dnp, midp->mid_devid, + midp->mid_devid_sz, midp->mid_o_devid, + 1, offset, ep); + } + } else { + /* No replicas, just update the one dummy mb */ + meta_update_mb_did(sp, dnp, midp->mid_devid, + midp->mid_devid_sz, midp->mid_o_devid, + 0, offset, ep); + } + if (!mdisok(ep)) + return; + } +} + +/* + * meta_unrslv_replicated_common + * + * Given a drive_desc and a drivenamelist pointer, + * return the devidp associated with the drive_desc, + * the replicated (new) devidp associated with the drive_desc + * and the specific mddrivename in the drivenamelist that + * matches the replicated (new) devidp. 
+ * + * Typically the drivenamelist pointer would be setup by + * the meta_prune_cnames function. + * + * Calling function must free devidp using devid_free. + * + * Returns 0 - success, found new_devidp and dnp_new. + * Returns 1 - failure, didn't find new devid info + */ +static int +meta_unrslv_replicated_common( + int myside, + md_drive_desc *dd, /* drive list for diskset */ + mddrivenamelist_t *dnlp, /* list of drives on current system */ + ddi_devid_t *devidp, /* old devid */ + ddi_devid_t *new_devidp, /* replicated (new) devid */ + mddrivename_t **dnp_new, /* replicated drive name */ + md_error_t *ep +) +{ + mddrivename_t *dnp; /* drive name of old drive */ + mdsidenames_t *sn = NULL; + uint_t rep_slice; + mdname_t *np; + char *minor_name = NULL; + char *devid_str = NULL; + size_t len; + int devid_sz; + mddrivenamelist_t *dp; + ddi_devid_t old_devid; /* devid of old drive */ + ddi_devid_t new_devid; /* devid of new replicated drive */ + ddi_devid_t dnp_new_devid; /* devid derived from drive */ + /* name of replicated drive */ + + dnp = dd->dd_dnp; + + /* Get old devid from drive record */ + (void) devid_str_decode(dd->dd_dnp->devid, + &old_devid, NULL); + + /* Look up replicated (new) devid */ + new_devid = replicated_list_lookup( + devid_sizeof(old_devid), old_devid); + + devid_free(old_devid); + + if (new_devid == NULL) + return (1); + + /* + * Using new_devid, find a drivename entry with a matching devid. + * Use the passed in dnlp since it has the new (replicated) disknames + * in it. 
+ */ + for (dp = dnlp; dp != NULL; dp = dp->next) { + (void) devid_str_decode(dp->drivenamep->devid, + &dnp_new_devid, NULL); + + if (dnp_new_devid == NULL) + continue; + + if (devid_compare(new_devid, dnp_new_devid) == 0) { + devid_free(dnp_new_devid); + break; + } + devid_free(dnp_new_devid); + } + + /* If can't find new name for drive - nothing to update */ + if (dp == NULL) + return (1); + + /* + * Setup returned value to be the drivename structure associated + * with new (replicated) drive. + */ + *dnp_new = dp->drivenamep; + + /* + * Need to return the new devid including the minor name. + * Find the minor_name here using the sidename or by + * looking in the namespace. + */ + for (sn = dnp->side_names; sn != NULL; sn = sn->next) { + if (sn->sideno == myside) + break; + } + + /* + * The disk has no side name information + */ + if (sn == NULL) { + if ((meta_replicaslice(*dnp_new, &rep_slice, ep) != 0) || + ((np = metaslicename(*dnp_new, rep_slice, ep)) == NULL)) { mdclrerror(ep); - continue; + return (1); } - if (stat(np->bname, &st_buf) != 0) + if (np->dev == NODEV64) + return (1); + + /* + * minor_name will be NULL if dnp->devid == NULL + * - see metagetvtoc() + */ + if (np->minor_name == NULL) + return (1); + else + minor_name = Strdup(np->minor_name); + + } else { + minor_name = meta_getdidminorbykey( + MD_LOCAL_SET, sn->sideno + SKEW, + dnp->side_names_key, ep); + if (!mdisok(ep)) + return (1); + } + /* + * Now, use the old devid with minor name to lookup + * the replicated (new) devid that will also contain + * a minor name. 
+ */ + len = strlen(dnp->devid) + strlen(minor_name) + 2; + devid_str = (char *)Malloc(len); + (void) snprintf(devid_str, len, "%s/%s", dnp->devid, + minor_name); + (void) devid_str_decode(devid_str, devidp, NULL); + Free(devid_str); + devid_sz = devid_sizeof((ddi_devid_t)*devidp); + *new_devidp = replicated_list_lookup(devid_sz, *devidp); + return (0); +} + +/* + * meta_unrslv_replicated_mb + * + * Update the master block information during a take. + * Takes an md_drive_desc descriptor. + * + * Returns : nothing (void) + */ +void +meta_unrslv_replicated_mb( + mdsetname_t *sp, + md_drive_desc *dd, /* drive list for diskset */ + mddrivenamelist_t *dnlp, /* list of drives on current system */ + md_error_t *ep +) +{ + md_drive_desc *d = NULL, *d_save; + mddrivename_t *dnp; /* dnp of old drive */ + mddrivename_t *dnp_new; /* dnp of new (replicated) drive */ + mddrivename_t *dnp_save; /* saved copy needed to restore */ + ddi_devid_t devidp, new_devidp; + int myside; + + if ((myside = getmyside(sp, ep)) == MD_SIDEWILD) + return; + + for (d = dd; d != NULL; d = d->dd_next) { + dnp = d->dd_dnp; + if (dnp == NULL) + continue; + + /* If don't need to update master block - skip it. */ + if (!(d->dd_flags & MD_DR_FIX_MB_DID)) continue; /* - * The drive is okay now count its replicas + * Get old and replicated (new) devids associated with this + * drive. Also, get the new (replicated) drivename structure. 
*/ - for (midr = midp->mid_replicas; midr != NULL; - midr = midr->mir_next) { - replica_count++; + if (meta_unrslv_replicated_common(myside, d, dnlp, &devidp, + &new_devidp, &dnp_new, ep) != 0) { + mdclrerror(ep); + continue; + } + + if (new_devidp) { + int offset = 16; /* default mb offset is 16 */ + int dbcnt; + + if (d->dd_dbcnt) { + /* + * Update each master block on the disk + */ + for (dbcnt = d->dd_dbcnt; dbcnt != 0; dbcnt--) { + meta_update_mb_did(sp, dnp_new, + new_devidp, + devid_sizeof(new_devidp), devidp, + 1, offset, ep); + offset += d->dd_dbsize; + } + } else { + /* update the one dummy mb */ + meta_update_mb_did(sp, dnp_new, new_devidp, + devid_sizeof(new_devidp), devidp, + 0, offset, ep); + } + if (!mdisok(ep)) { + devid_free(devidp); + return; + } + + /* Set drive record flags to ok */ + /* Just update this one drive record. */ + d_save = d->dd_next; + dnp_save = d->dd_dnp; + d->dd_next = NULL; + d->dd_dnp = dnp_new; + /* Ignore failure since no bad effect. */ + (void) clnt_upd_dr_flags(mynode(), sp, d, + MD_DR_OK, ep); + d->dd_next = d_save; + d->dd_dnp = dnp_save; } + devid_free(devidp); } +} - if (replica_count < (misp->mis_active_replicas + 1)/2) - return (-1); +/* + * meta_update_nm_rr_did + * + * Change a devid stored in the diskset namespace and in the local set + * namespace with the new devid. + * + * This routine is called during the import of a diskset + * (meta_imp_update_nn) and during the take of a diskset that has + * some unresolved replicated drives (meta_unrslv_replicated_nm). + * + * Returns : nothing (void) + */ +static void +meta_update_nm_rr_did( + mdsetname_t *sp, + void *old_devid, /* old devid being replaced */ + int old_devid_sz, + void *new_devid, /* devid to be stored in nm */ + int new_devid_sz, + int import_flag, /* called during import? */ + md_error_t *ep +) +{ + struct mddb_config c; - return (0); + (void) memset(&c, 0, sizeof (c)); + c.c_setno = sp->setno; + + /* During import to NOT update the local namespace. 
*/ + if (import_flag) + c.c_flags = MDDB_C_IMPORT; + + c.c_locator.l_devid = (uintptr_t)Malloc(new_devid_sz); + (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, + new_devid, new_devid_sz); + c.c_locator.l_devid_sz = new_devid_sz; + c.c_locator.l_devid_flags = + MDDB_DEVID_VALID | MDDB_DEVID_SPACE | MDDB_DEVID_SZ; + c.c_locator.l_old_devid = (uint64_t)(uintptr_t)Malloc(old_devid_sz); + (void) memcpy((void *)(uintptr_t)c.c_locator.l_old_devid, + old_devid, old_devid_sz); + c.c_locator.l_old_devid_sz = old_devid_sz; + if (metaioctl(MD_IOCUPDATE_NM_RR_DID, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + } + Free((void *)(uintptr_t)c.c_locator.l_devid); + Free((void *)(uintptr_t)c.c_locator.l_old_devid); +} + +/* + * meta_imp_update_nm + * + * Change a devid stored in the diskset namespace with the new devid. + * This routine is called during the import of a remotely replicated diskset. + * + * Returns : nothing (void) + */ +void +meta_imp_update_nm(mdsetname_t *sp, md_im_set_desc_t *misp, md_error_t *ep) +{ + md_im_drive_info_t *midp; + + for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) { + /* + * If disk isn't available we can't update, so go to next + */ + if (midp->mid_available == MD_IM_DISK_NOT_AVAILABLE) { + continue; + } + + meta_update_nm_rr_did(sp, midp->mid_o_devid, + midp->mid_o_devid_sz, midp->mid_devid, + midp->mid_devid_sz, 1, ep); + if (!mdisok(ep)) + return; + } +} + +/* + * meta_unrslv_replicated_nm + * + * Change a devid stored in the diskset namespace and in the local set + * namespace with the new devid. + * + * This routine is called during the take of a diskset that has + * some unresolved replicated drives. 
+ * + * Returns : nothing (void) + */ +void +meta_unrslv_replicated_nm( + mdsetname_t *sp, + md_drive_desc *dd, /* drive list for diskset */ + mddrivenamelist_t *dnlp, /* list of drives on current system */ + md_error_t *ep +) +{ + md_drive_desc *d = NULL; + mddrivename_t *dnp; /* drive name of old drive */ + mddrivename_t *dnp_new; /* drive name of new (repl) drive */ + ddi_devid_t devidp, new_devidp; + ddi_devid_t old_devid; + char *devid_old_save; + mdsetname_t *local_sp = NULL; + int myside; + + if ((myside = getmyside(sp, ep)) == MD_SIDEWILD) + return; + + for (d = dd; d != NULL; d = d->dd_next) { + dnp = d->dd_dnp; + if (dnp == NULL) + continue; + + /* If don't need to update namespace - skip it. */ + if (!(d->dd_flags & MD_DR_FIX_LB_NM_DID)) + continue; + + /* Get old devid from drive record */ + (void) devid_str_decode(d->dd_dnp->devid, + &old_devid, NULL); + + /* + * Get old and replicated (new) devids associated with this + * drive. Also, get the new (replicated) drivename structure. + */ + if (meta_unrslv_replicated_common(myside, d, dnlp, &devidp, + &new_devidp, &dnp_new, ep) != 0) { + mdclrerror(ep); + continue; + } + + if (new_devidp) { + meta_update_nm_rr_did(sp, devidp, + devid_sizeof(devidp), new_devidp, + devid_sizeof(new_devidp), 0, ep); + if (!mdisok(ep)) { + devid_free(devidp); + return; + } + } + devid_free(devidp); + + /* + * Using the new devid, fix up the name. + * If meta_upd_ctdnames fails, the next take will re-resolve + * the name from the new devid. 
+ */ + local_sp = metasetname(MD_LOCAL_NAME, ep); + devid_old_save = dnp->devid; + dnp->devid = dnp_new->devid; + (void) meta_upd_ctdnames(&local_sp, 0, (myside + SKEW), + dnp, NULL, ep); + mdclrerror(ep); + dnp->devid = devid_old_save; + } } static set_t @@ -2472,9 +3461,17 @@ meta_imp_set( struct mddb_config c; mdname_t *np; md_im_replica_info_t *mirp; - char setnum_link[MAXPATHLEN]; - char setname_link[MAXPATHLEN]; + set_t setno; + mdcinfo_t *cinfo; + mdsetname_t *sp; + mddrivenamelist_t *dnlp = NULL; + mddrivenamelist_t **dnlpp = &dnlp; char *minor_name = NULL; + int stale_flag = 0; + md_set_desc *sd; + int partial_replicated_flag = 0; + md_error_t xep = mdnullerror; + md_setkey_t *cl_sk; (void) memset(&c, 0, sizeof (c)); (void) strlcpy(c.c_setname, setname, sizeof (c.c_setname)); @@ -2493,45 +3490,99 @@ meta_imp_set( /* * Find the next available set number */ - if ((c.c_setno = meta_imp_setno(ep)) == MD_SET_BAD) { + if ((setno = meta_imp_setno(ep)) == MD_SET_BAD) { return (mddserror(ep, MDE_DS_SETNOTIMP, MD_SET_BAD, mynode(), NULL, c.c_setname)); } + c.c_setno = setno; if (meta_gettimeofday(&tp) == -1) { return (mdsyserror(ep, errno, NULL)); } c.c_timestamp = tp; /* Check to see if replica quorum requirement is fulfilled */ - if (!force && meta_replica_quorum(misp, ep) == -1) - return (mddserror(ep, MDE_DS_INSUFQUORUM, MD_SET_BAD, - mynode(), NULL, c.c_setname)); + if (meta_replica_quorum(misp) == -1) { + if (!force) { + return (mddserror(ep, MDE_DS_INSUFQUORUM, MD_SET_BAD, + mynode(), NULL, c.c_setname)); + } else { + stale_flag = MD_IMP_STALE_SET; + /* + * If we have a stale diskset, the kernel will + * delete the replicas on the unavailable disks. + * To be consistent, we'll zero out the mirp on those + * disks here. 
+ */ + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + if (midp->mid_available == + MD_IM_DISK_NOT_AVAILABLE) { + midp->mid_replicas = NULL; + } + } + } + } for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) { - mdcinfo_t *cinfo; + + if ((misp->mis_flags & MD_IM_SET_REPLICATED) && + (partial_replicated_flag == 0) && + (midp->mid_available == MD_IM_DISK_NOT_AVAILABLE)) + partial_replicated_flag = MD_SR_UNRSLV_REPLICATED; /* - * We pass down the list of the drives in the - * set down to the kernel irrespective of - * whether the drives have a replica or not. - * - * The kernel detects which of the drives don't - * have a replica and accordingly does the - * right thing. + * We pass the list of the drives in the + * set with replicas on them down to the kernel. */ dnp = midp->mid_dnp; - if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || - ((np = metaslicename(dnp, rep_slice, ep)) - == NULL)) { - mdclrerror(ep); + mirp = midp->mid_replicas; + if (!mirp) { + /* + * No replicas on this disk, go to next disk. + */ continue; } - (void) strcpy(c.c_locator.l_devname, np->bname); - c.c_locator.l_dev = meta_cmpldev(np->dev); - c.c_locator.l_mnum = meta_getminor(np->dev); + if (midp->mid_available == MD_IM_DISK_NOT_AVAILABLE) { + /* + * The disk isn't there. We'll need to get the + * disk information from the midp list instead + * of going and looking for it. This means it + * will be information relative to the old + * system. 
+ */ + minor_name = Strdup(midp->mid_minor_name); + (void) strncpy(c.c_locator.l_driver, + midp->mid_driver_name, + sizeof (c.c_locator.l_driver)); + (void) strcpy(c.c_locator.l_devname, midp->mid_devname); + c.c_locator.l_mnum = midp->mid_mnum; + + } else { + if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || + ((np = metaslicename(dnp, rep_slice, ep)) + == NULL)) { + mdclrerror(ep); + continue; + } + (void) strcpy(c.c_locator.l_devname, np->bname); + c.c_locator.l_dev = meta_cmpldev(np->dev); + c.c_locator.l_mnum = meta_getminor(np->dev); + minor_name = meta_getminor_name(np->bname, ep); + if ((cinfo = metagetcinfo(np, ep)) == NULL) { + mdclrerror(ep); + continue; + } + + if (cinfo->dname) { + (void) strncpy(c.c_locator.l_driver, + cinfo->dname, + sizeof (c.c_locator.l_driver)); + } + } + c.c_locator.l_devid = (uintptr_t)Malloc(midp->mid_devid_sz); (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, midp->mid_devid, midp->mid_devid_sz); @@ -2546,31 +3597,14 @@ meta_imp_set( midp->mid_o_devid, midp->mid_o_devid_sz); c.c_locator.l_old_devid_sz = midp->mid_o_devid_sz; } - minor_name = meta_getminor_name(np->bname, ep); - (void) strncpy(c.c_locator.l_minor_name, minor_name, - sizeof (c.c_locator.l_minor_name)); - - if ((cinfo = metagetcinfo(np, ep)) == NULL) { - mdclrerror(ep); - continue; + if (minor_name) { + (void) strncpy(c.c_locator.l_minor_name, minor_name, + sizeof (c.c_locator.l_minor_name)); } - (void) strncpy(c.c_locator.l_driver, cinfo->dname, - sizeof (c.c_locator.l_driver)); - - mirp = midp->mid_replicas; do { - if (mirp) { - c.c_locator.l_flags = 0; - c.c_locator.l_blkno = mirp->mir_offset; - mirp = mirp->mir_next; - } else { - /* - * Default offset for dummy is 16 - */ - c.c_locator.l_blkno = 16; - } - + c.c_locator.l_flags = 0; + c.c_locator.l_blkno = mirp->mir_offset; if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { Free((void *)(uintptr_t)c.c_locator.l_devid); if (c.c_locator.l_old_devid) @@ -2578,6 +3612,7 @@ meta_imp_set( 
c.c_locator.l_old_devid); return (mdstealerror(ep, &c.c_mde)); } + mirp = mirp->mir_next; } while (mirp != NULL); } @@ -2595,39 +3630,143 @@ meta_imp_set( } /* - * Now kernel should have all the information + * Now the kernel should have all the information * regarding the import diskset replica. - * Tell kernel to load them up and import the set + * Tell the kernel to load them up and import the set */ - if (metaioctl(MD_IOCIMP_LOAD, &c.c_setno, &c.c_mde, NULL) != 0) { + (void) memset(&c, 0, sizeof (c)); + c.c_flags = stale_flag; + c.c_setno = setno; + if (metaioctl(MD_IOCIMP_LOAD, &c, &c.c_mde, NULL) != 0) { Free((void *)(uintptr_t)c.c_locator.l_devid); if (c.c_locator.l_old_devid) Free((void *)(uintptr_t)c.c_locator.l_old_devid); return (mdstealerror(ep, &c.c_mde)); } - (void) meta_smf_enable(META_SMF_DISKSET, NULL); + /* + * Create a set name for the set. + */ + sp = Zalloc(sizeof (*sp)); + sp->setname = Strdup(setname); + sp->lockfd = MD_NO_LOCK; + sp->setno = setno; + sd = Zalloc(sizeof (*sd)); + (void) strcpy(sd->sd_nodes[0], mynode()); + sd->sd_ctime = tp; + sd->sd_genid = 0; + + + if (misp->mis_flags & MD_IM_SET_REPLICATED) { + /* Update the diskset namespace */ + meta_imp_update_nm(sp, misp, ep); + + /* Release the diskset - even if update_nm failed */ + (void) memset(&c, 0, sizeof (c)); + c.c_setno = setno; + /* Don't need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + if (metaioctl(MD_RELEASE_SET, &c, &c.c_mde, NULL) != 0) { + if (mdisok(ep)) + (void) mdstealerror(ep, &c.c_mde); + Free(sd); + Free(sp); + return (-1); + } + + /* If update_nm failed, then fail the import. */ + if (!mdisok(ep)) { + Free(sd); + Free(sp); + return (-1); + } + } + + /* + * We'll need to update information in the master block due + * to the set number changing and if the case of a replicated + * diskset, the device id changing. May also need to create a + * dummy master block if it's not there. 
+ */ + meta_imp_update_mb(sp, misp, ep); + if (!mdisok(ep)) { + Free(sd); + Free(sp); + return (-1); + } + + /* + * Create set record for diskset, but record is left in + * MD_SR_ADD state until after drives are added to set. + */ + if (clnt_lock_set(mynode(), sp, ep)) { + Free(sd); + Free(sp); + return (-1); + } + + if (clnt_createset(mynode(), sp, sd->sd_nodes, + sd->sd_ctime, sd->sd_genid, ep)) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + (void) clnt_unlock_set(mynode(), cl_sk, &xep); + Free(sd); + Free(sp); + return (-1); + } - /* The set has now been imported, create the appropriate symlink */ - (void) snprintf(setname_link, MAXPATHLEN, "/dev/md/%s", setname); - (void) snprintf(setnum_link, MAXPATHLEN, "shared/%d", c.c_setno); + Free(sd); /* - * Since we already verified that the setname was OK, make sure to - * cleanup before proceeding. + * Create drive records for the disks in the set. */ - if (unlink(setname_link) == -1) { - if (errno != ENOENT) - (void) mdsyserror(ep, errno, setname_link); + for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) { + dnp = midp->mid_dnp; + if (midp->mid_available & MD_IM_DISK_NOT_AVAILABLE) { + /* + * If the disk isn't available, the dnp->devid is + * no good. It is either blank for the case where + * there is no disk with that devname, or it + * contains the devid for the real disk in the system + * with that name. The problem is, if the disk is + * unavailable, then the devid should be the devid + * of the missing disk. So we're faking a dnp for + * the import. This is needed for creating drive + * records. 
+ */ + dnp = Zalloc(sizeof (mddrivename_t)); + dnp->side_names_key = midp->mid_dnp->side_names_key; + dnp->type = midp->mid_dnp->type; + dnp->cname = Strdup(midp->mid_dnp->cname); + dnp->rname = Strdup(midp->mid_dnp->rname); + dnp->devid = devid_str_encode(midp->mid_devid, + NULL); + midp->mid_dnp = dnp; + } + dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp); + } + + if (meta_imp_set_adddrives(sp, dnlp, misp, ep)) { + Free(sp); + return (mddserror(ep, MDE_DS_SETNOTIMP, MD_SET_BAD, + mynode(), NULL, c.c_setname)); } - if (symlink(setnum_link, setname_link) == -1) - (void) mdsyserror(ep, errno, setname_link); + /* If drives were added without error, set set_record to OK */ + if (clnt_upd_sr_flags(mynode(), sp, + (partial_replicated_flag | MD_SR_OK | MD_SR_MB_DEVID), ep)) { + Free(sp); + return (mddserror(ep, MDE_DS_SETNOTIMP, MD_SET_BAD, + mynode(), NULL, c.c_setname)); + } + + Free(sp); - /* resnarf the set that has just been imported */ - if (clnt_resnarf_set(mynode(), c.c_setno, ep) != 0) - md_eprintf("%s\n", dgettext(TEXT_DOMAIN, "Please stop and " - "restart rpc.metad")); + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, ep)) { + return (-1); + } + cl_set_setkey(NULL); Free((void *)(uintptr_t)c.c_locator.l_devid); if (c.c_locator.l_old_devid) diff --git a/usr/src/lib/lvm/libmeta/common/meta_metad.c b/usr/src/lib/lvm/libmeta/common/meta_metad.c index adf281e542..8c1d246afa 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_metad.c +++ b/usr/src/lib/lvm/libmeta/common/meta_metad.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -510,6 +509,94 @@ clnt_add_drv_sidenms( } /* + * Adding drives via metaimport to disksets. Some of the drives may + * not be available so we need more information than the basic clnt_adddrvs + * offers us. + */ +int +clnt_imp_adddrvs( + char *hostname, + mdsetname_t *sp, + md_drive_desc *dd, + md_timeval32_t timestamp, + ulong_t genid, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_drives_2_args v2_args; + mdrpc_drives_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_drives_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivedescs = dd; + v21_args->timestamp = timestamp; + v21_args->genid = genid; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v1 procedure + */ + bool = mdrpc_imp_adddrvs_2(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. 
+ */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, NULL); + metarpcclose(clntp); + return (-1); + } else { + rval = mdrpc_imp_adddrvs_2(&v2_args, &res, clntp); + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad imp add drives")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + + +/* * Add drives to disksets. */ int diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c index 6da29e6f3c..cb29de889d 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c @@ -1471,7 +1471,7 @@ mdmn_do_meta_md_addside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) */ for (i = 0; i < nm.ref_count; i++) { if (add_name(sp, d->msg_sideno, nm.key, dname, mnum, - cname, &ep) == -1) { + cname, NULL, NULL, &ep) == -1) { (void) mdstealerror(&(resp->mmr_ep), &ep); Free(cname); Free(dname); diff --git a/usr/src/lib/lvm/libmeta/common/meta_name.c b/usr/src/lib/lvm/libmeta/common/meta_name.c index b892f13493..dc628b5514 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_name.c +++ b/usr/src/lib/lvm/libmeta/common/meta_name.c @@ -1070,7 +1070,7 @@ metafreedrivename( /* * flush the drive name cache */ -static void +void metaflushdrivenames() { mddrivenamelist_t *p, *n; @@ -2621,7 +2621,289 @@ metaname_fast( { return (metaname_common(spp, uname, 1, uname_type, ep)); } +/* + * Get the dnp using the device id. + * + * We have the potential to have more than 1 dnp with the same disk name but + * have different device ids. This would happen in the case of a partial + * diskset. The unavailable disk name is relative to the prior host and could + * possibly be the same as a disk on this system. 
The only way to tell which + * dnp belongs with this disk is by searching by device id. We have the + * potential to have the case where 1) the disk whose device id we pass in is + * in the system. In this case the name and the device id are both valid for + * the disk. 2) The disk whose device id we've been passed is not in the + * system and no disk with the same name has a dnp on the list. And 3) The + * disk whose device id we've been passed is not on the system but there is + * a disk with the same name (different devid) that is on the system. Here's + * what we return for each of those cases: + * 1) If disk is in system: + * disk is found on drivelistp or we create a new drivename and it's + * fully populated as expected. + * 2) If disk not in system, no collision + * Disk with the same devid is not found on drivelistp, we create a new + * drivename structure and the dnp->devid is filled in not from getparts + * but from the devidp passed in. No other disk in the system has the + * same "name" or devid. + * This situation would be caused by the import of a partial diskset. + * 3) If disk not in system, collision + * Disk with the same devid is not found on the drivelistp, we create a + * new drivename struct but getparts will use the information from the + * name which is actually in reference to another disk of the same name + * in the system. getparts will fill in the dnp->devid with the value + * from the other disk and we overwrite this with the value of this disk. + * To get into this situation one of the disks is actually unavailable + * as in the case of a partial import. 
+ */ +mddrivename_t * +meta_getdnp_bydevid( + mdsetname_t *sp, + side_t sideno, + ddi_devid_t devidp, + mdkey_t key, + md_error_t *ep +) +{ + ddi_devid_t dnp_devidp; + char *nm; + mddrivenamelist_t **tail; + mddrivename_t *dnp; + uint_t slice; + mdname_t *np; + char *rname = NULL; + char *dname = NULL; + uint_t nparts, partno; + int ret; + md_set_desc *sd = NULL; + meta_device_type_t uname_type = LOGICAL_DEVICE; + + /* look in the cache first */ + for (tail = &drivelistp; (*tail != NULL); tail = &(*tail)->next) { + dnp = (*tail)->drivenamep; + if (dnp->type != MDT_COMP) + continue; + ret = devid_str_decode(dnp->devid, &dnp_devidp, NULL); + if (ret != 0) { + /* unable to decode the devid */ + return (NULL); + } + /* compare with the devid passed in. */ + if (devid_compare(devidp, dnp_devidp) == 0) { + /* match! We have the same disk */ + devid_free(dnp_devidp); + return (dnp); + } + devid_free(dnp_devidp); + } + + /* drive not in the cache */ + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (NULL); + } + /* get namespace info */ + if (MD_MNSET_DESC(sd)) { + if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, + key, ep)) == NULL) + return (NULL); + } else { + if ((nm = meta_getnmbykey(MD_LOCAL_SET, + sideno+SKEW, key, ep)) == NULL) + return (NULL); + } + + /* get raw name (rname) of the slice and drive name (dname) */ + if ((rname = getrawnames(&sp, nm, &dname, &uname_type, ep)) == NULL) { + return (NULL); + } + + /* allocate new list element and drive */ + *tail = Zalloc(sizeof (**tail)); + dnp = (*tail)->drivenamep = Zalloc(sizeof (*dnp)); + metainitdrivename(dnp); + + /* get parts info */ + /* + * Note that if the disk is unavailable this name will point to + * either a nonexistent disk and thus the part info and devid will + * be empty or the name will point to the wrong disk and this + * information will be invalid. Because of this, we overwrite the + * dnp->devid with the correct one after getparts returns. 
+ */ + if (getparts(dnp, rname, dname, uname_type, &nparts, &partno, ep) != 0) + goto out; + + dnp->devid = devid_str_encode(devidp, NULL); + + /* + * libmeta needs at least V_NUMPAR partitions. + * If we have an EFI partition with less than V_NUMPAR slices, + * we nevertheless reserve space for V_NUMPAR + */ + if (nparts < V_NUMPAR) { + nparts = V_NUMPAR; + } + + /* allocate and link in parts */ + dnp->parts.parts_len = nparts; + dnp->parts.parts_val = Zalloc((sizeof (*dnp->parts.parts_val)) * + dnp->parts.parts_len); + + for (slice = 0; (slice < nparts); ++slice) { + np = &dnp->parts.parts_val[slice]; + metainitname(np); + np->drivenamep = dnp; + } + + /* setup name_t (or slice) wanted */ + if ((np = setup_slice(sp, uname_type, dnp, nm, rname, + dname, partno, ep)) == NULL) + goto out; + + /* canonical disk name */ + if ((dnp->cname = metadiskname(np->cname)) == NULL) + dnp->cname = Strdup(np->cname); + if ((dnp->rname = metadiskname(np->rname)) == NULL) + dnp->rname = Strdup(np->rname); + + if (dname != NULL) + Free(dname); + Free(rname); + return (dnp); + +out: + if (dname != NULL) + Free(dname); + if (rname != NULL) + Free(rname); + + metafreedrivename(dnp); + Free(dnp); + Free(*tail); + *tail = NULL; + return (NULL); +} + +/* + * Search the drivename list by devid instead of name. If you don't find + * an entry with the same device id, create one for the uname passed in. 
+ */ +mddrivename_t * +metadrivenamebydevid( + mdsetname_t **spp, + char *devid, + char *uname, + md_error_t *ep +) +{ + ddi_devid_t dnp_devidp, in_devidp; + mdname_t *np; + mddrivenamelist_t **tail; + char *rname = NULL; + mddrivename_t *dnp; + char *dname; + int ret; + uint_t nparts, partno; + uint_t slice; + meta_device_type_t uname_type = LOGICAL_DEVICE; + + /* look in the cache first */ + for (tail = &drivelistp; (*tail != NULL); tail = &(*tail)->next) { + dnp = (*tail)->drivenamep; + if (dnp->type != MDT_COMP) + continue; + + /* decode the dnp devid */ + ret = devid_str_decode(dnp->devid, &dnp_devidp, NULL); + if (ret != 0) { + /* unable to decode the devid */ + return (NULL); + } + /* decode the passed in devid */ + ret = devid_str_decode(devid, &in_devidp, NULL); + if (ret != 0) { + /* unable to decode the devid */ + devid_free(dnp_devidp); + return (NULL); + } + /* compare with the devids */ + if (devid_compare(in_devidp, dnp_devidp) == 0) { + /* match! We have the same disk */ + devid_free(dnp_devidp); + devid_free(in_devidp); + return (dnp); + } + } + devid_free(dnp_devidp); + devid_free(in_devidp); + + /* not in the cache */ + + /* get raw name (rname) of the slice and drive (dname) we have */ + if ((rname = getrawnames(spp, uname, &dname, &uname_type, + ep)) == NULL) { + return (NULL); + } + + /* allocate new list element and drive */ + *tail = Zalloc(sizeof (**tail)); + dnp = (*tail)->drivenamep = Zalloc(sizeof (*dnp)); + + metainitdrivename(dnp); + + /* get parts info */ + if (getparts(dnp, rname, dname, uname_type, &nparts, &partno, ep) != 0) + goto out; + + /* + * libmeta needs at least V_NUMPAR partitions. 
+ * If we have an EFI partition with less than V_NUMPAR slices, + * we nevertheless reserve space for V_NUMPAR + */ + if (nparts < V_NUMPAR) { + nparts = V_NUMPAR; + } + + /* allocate and link in parts */ + dnp->parts.parts_len = nparts; + dnp->parts.parts_val = Zalloc((sizeof (*dnp->parts.parts_val)) * + dnp->parts.parts_len); + for (slice = 0; (slice < nparts); ++slice) { + np = &dnp->parts.parts_val[slice]; + metainitname(np); + np->drivenamep = dnp; + } + + /* setup name_t (or slice) wanted */ + if ((np = setup_slice(*spp, uname_type, dnp, uname, rname, + dname, partno, ep)) == NULL) + goto out; + + /* canonical disk name */ + if ((dnp->cname = metadiskname(np->cname)) == NULL) + dnp->cname = Strdup(np->cname); + if ((dnp->rname = metadiskname(np->rname)) == NULL) + dnp->rname = Strdup(np->rname); + + /* cleanup, return success */ + if (dname != NULL) + Free(dname); + Free(rname); + return (dnp); + + /* cleanup, return error */ +out: + if (dname != NULL) + Free(dname); + if (rname != NULL) + Free(rname); + + metafreedrivename(dnp); + Free(dnp); + Free(*tail); + *tail = NULL; + return (NULL); +} /* * set up names for a drive */ diff --git a/usr/src/lib/lvm/libmeta/common/meta_namespace.c b/usr/src/lib/lvm/libmeta/common/meta_namespace.c index 53c1b3e35c..9d2d16bd3f 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_namespace.c +++ b/usr/src/lib/lvm/libmeta/common/meta_namespace.c @@ -392,6 +392,10 @@ meta_getnmentbydev( return (Strdup(device_name)); } +/* + * The arguments, minorname and devid, are only used with the partial + * import code and should be NULL otherwise. 
+ */ int add_name( mdsetname_t *sp, @@ -400,6 +404,8 @@ add_name( char *dname, minor_t mnum, char *bname, + char *minorname, /* only used with a partial import */ + ddi_devid_t devid, /* only used with a partial import */ md_error_t *ep ) { @@ -413,7 +419,13 @@ add_name( (void) strncpy(nm.drvnm, dname, sizeof (nm.drvnm)); nm.devname_len = strlen(bname) + 1; nm.devname = (uintptr_t)bname; - + if (devid && minorname) { + nm.minorname_len = strlen(minorname) + 1; + nm.minorname = (uintptr_t)minorname; + nm.devid_size = devid_sizeof(devid); + nm.devid = (uintptr_t)devid; + nm.imp_flag = MDDB_C_IMPORT; + } if (metaioctl(MD_IOCSET_NM, &nm, &nm.mde, bname) < 0) return (mdstealerror(ep, &nm.mde)); @@ -579,7 +591,8 @@ add_key_name( } if ((err = add_name(sp, thisside, key, devlist[thisside].dname, - devlist[thisside].mnum, devlist[thisside].bname, ep)) == -1) { + devlist[thisside].mnum, devlist[thisside].bname, NULL, + NULL, ep)) == -1) { empty_devicelist(); return (-1); } @@ -602,7 +615,8 @@ add_key_name( if (devlist[sideno].dname != NULL) { err = add_name(sp, sideno, key, devlist[sideno].dname, - devlist[sideno].mnum, devlist[sideno].bname, ep); + devlist[sideno].mnum, devlist[sideno].bname, + NULL, NULL, ep); if (err == -1) { empty_devicelist(); return (-1); @@ -758,7 +772,7 @@ add_self_name( if (metaislocalset(sp)) { if ((key = add_name(sp, myside, MD_KEYWILD, drvname, - minor, devname, ep)) == MD_KEYBAD) { + minor, devname, NULL, NULL, ep)) == MD_KEYBAD) { Free(devname); return (-1); } @@ -767,7 +781,7 @@ add_self_name( * Add myside first and use the returned key to add other sides */ if ((key = add_name(sp, myside, MD_KEYWILD, drvname, - minor, devname, ep)) == MD_KEYBAD) { + minor, devname, NULL, NULL, ep)) == MD_KEYBAD) { Free(devname); return (-1); } @@ -786,7 +800,7 @@ add_self_name( if (mnside->nd_nodeid == myside) continue; if (add_name(sp, mnside->nd_nodeid, key, drvname, - minor, devname, ep) == -1) { + minor, devname, NULL, NULL, ep) == -1) { Free(devname); 
return (-1); } @@ -798,7 +812,7 @@ add_self_name( if (side == myside) continue; if (add_name(sp, side, key, drvname, minor, devname, - ep) == -1) { + NULL, NULL, ep) == -1) { Free(devname); return (-1); } diff --git a/usr/src/lib/lvm/libmeta/common/meta_set.c b/usr/src/lib/lvm/libmeta/common/meta_set.c index 94c380a10d..397f016f7b 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_set.c +++ b/usr/src/lib/lvm/libmeta/common/meta_set.c @@ -926,6 +926,97 @@ meta_is_drive_in_thisset( return (0); } +/* + * Check to see if devid is in use in any diskset. + * This is used in the case when a partial diskset is being imported + * to make sure that the unvailable drive isn't already in use in an + * already imported partial diskset. Can't check on the cname since the + * unavailable disk's cname is from the previous system and may collide + * with a cname on this system. + * Return values: + * 1: devid has been found in a diskset + * 0: devid not found in any diskset + */ +int +meta_is_devid_in_anyset( + void *devid, + mdsetname_t **spp, + md_error_t *ep +) +{ + set_t setno; + mdsetname_t *this_sp; + int is_it; + set_t max_sets; + + if ((max_sets = get_max_sets(ep)) == 0) + return (-1); + + assert(spp != NULL); + *spp = NULL; + + for (setno = 1; setno < max_sets; setno++) { + if ((this_sp = metasetnosetname(setno, ep)) == NULL) { + if (mdismddberror(ep, MDE_DB_NODB)) { + mdclrerror(ep); + return (0); + } + if (mdiserror(ep, MDE_NO_SET)) { + mdclrerror(ep); + continue; + } + return (-1); + } + + if ((is_it = meta_is_devid_in_thisset(this_sp, + devid, ep)) == -1) { + if (mdiserror(ep, MDE_NO_SET)) { + mdclrerror(ep); + continue; + } + return (-1); + } + if (is_it) { + *spp = this_sp; + return (0); + } + } + return (0); +} + +int +meta_is_devid_in_thisset( + mdsetname_t *sp, + void *devid, + md_error_t *ep +) +{ + md_drive_desc *dd, *p; + ddi_devid_t dd_devid; + + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); + if (dd == NULL) { + if (! 
mdisok(ep)) + return (-1); + return (0); + } + + for (p = dd; p != NULL; p = p->dd_next) { + if (p->dd_dnp->devid == NULL) + continue; + (void) devid_str_decode(p->dd_dnp->devid, + &dd_devid, NULL); + if (dd_devid == NULL) + continue; + if (devid_compare(devid, dd_devid) == 0) { + devid_free(dd_devid); + return (1); + } + devid_free(dd_devid); + } + return (0); +} + int meta_set_balance( mdsetname_t *sp, @@ -1769,55 +1860,6 @@ metadrivename_withdrkey( return (NULL); } - /* get namespace info */ - if (MD_MNSET_DESC(sd)) { - if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, - key, ep)) == NULL) - return (NULL); - } else { - if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW, - key, ep)) == NULL) - return (NULL); - } - - /* get device name */ - if (flags & PRINT_FAST) { - if ((np = metaname_fast(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) { - Free(nm); - return (NULL); - } - } else { - if ((np = metaname(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) { - Free(nm); - return (NULL); - } - } - Free(nm); - - /* make sure it's OK */ - if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0)) - return (NULL); - - /* get drivename */ - dnp = np->drivenamep; - dnp->side_names_key = key; - - /* - * Skip the following devid check if dnp is did device - * The device id is disabled for did device due to the - * lack of minor name support in the did driver. The following - * devid code path can set and propagate the error and - * eventually prevent did disks from being added to the - * diskset under SunCluster systems - */ - if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) { - goto out; - } - - /* Also, Skip the check if MN diskset, no devid's */ - if (MD_MNSET_DESC(sd)) { - goto out; - } /* * Get the devid associated with the key. @@ -1829,10 +1871,69 @@ metadrivename_withdrkey( */ if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) != NULL) { - dnp->devid = devid_str_encode(devidp, NULL); + /* + * Look for the correct dnp using the devid for comparison. 
+ */ + dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); free(devidp); + dnp->side_names_key = key; } else { /* + * We didn't get a devid. We'll try for a dnp using the + * name. If we have a MN diskset or if the dnp is a did + * device, we're done because then we don't have devids. + * Otherwise we'll try to set the devid + * and get the dnp via devid again. + * We also need to clear the ep structure. When the + * above call to meta_getdidbykey returned a null, it + * also put an error code into ep. In this case, the null + * return is actually OK and any errors can be ignored. The + * reason it is OK is because this could be a MN set or + * we could be running without devids (ex cluster). + */ + mdclrerror(ep); + + if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, + ep)) == NULL) + return (NULL); + /* get device name */ + if (flags & PRINT_FAST) { + if ((np = metaname_fast(&sp, nm, + LOGICAL_DEVICE, ep)) == NULL) { + Free(nm); + return (NULL); + } + } else { + if ((np = metaname(&sp, nm, LOGICAL_DEVICE, + ep)) == NULL) { + Free(nm); + return (NULL); + } + } + Free(nm); + /* make sure it's OK */ + if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, + ep) != 0)) + return (NULL); + + /* get drivename */ + dnp = np->drivenamep; + dnp->side_names_key = key; + /* + * Skip the devid set/check for the following cases: + * 1) If MN diskset, there are no devid's + * 2) if dnp is did device + * The device id is disabled for did device due to the + * lack of minor name support in the did driver. 
The following + * devid code path can set and propagate the error and + * eventually prevent did disks from being added to the + * diskset under SunCluster systems + */ + if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) + == 0) || (MD_MNSET_DESC(sd))) + goto out; + + /* * It is okay if replica is not in devid mode */ if (mdissyserror(ep, MDDB_F_NODEVID)) { @@ -1841,21 +1942,31 @@ metadrivename_withdrkey( } /* + * We're not MN or did devices but * devid is missing so this means that we have * just upgraded from a configuration where * devid's were not used so try to add in - * the devid and requery. + * the devid and requery. If the devid still isn't there, + * that's OK. dnp->devid will be null as it is in any + * configuration with no devids. */ if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0) return (NULL); if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, - sideno+SKEW, key, ep)) == NULL) - return (NULL); - dnp->devid = devid_str_encode(devidp, NULL); - devid_free(devidp); + sideno+SKEW, key, ep)) != NULL) { + /* + * Found a devid so look for the dnp using the + * devid as the search mechanism. + */ + dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); + free(devidp); + dnp->side_names_key = key; + } } + + out: if (flags & MD_BYPASS_DAEMON) return (dnp); diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_drv.c b/usr/src/lib/lvm/libmeta/common/meta_set_drv.c index 5fad53ad7b..7dc51aec97 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_set_drv.c +++ b/usr/src/lib/lvm/libmeta/common/meta_set_drv.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -128,39 +127,63 @@ metaget_drivedesc_fromdrivelist( int meta_make_sidenmlist( - mdsetname_t *sp, - mddrivename_t *dnp, - md_error_t *ep + mdsetname_t *sp, + mddrivename_t *dnp, + int import_flag, /* flags partial import */ + md_im_drive_info_t *midp, /* import drive information */ + md_error_t *ep ) { - mdsidenames_t *sn, **sn_next; - mdname_t *np; - int done; - side_t sideno = MD_SIDEWILD; - uint_t rep_slice; - - if (meta_replicaslice(dnp, &rep_slice, ep) != 0) - return (-1); + mdsidenames_t *sn, **sn_next; + mdname_t *np; + int done; + side_t sideno = MD_SIDEWILD; + uint_t rep_slice; + char *bname; - dnp->side_names_key = MD_KEYWILD; + if (!import_flag) { + /* + * Normal (aka NOT partial import) code path. + */ + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) { + return (-1); + } - if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) - return (-1); + dnp->side_names_key = MD_KEYWILD; + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) + return (-1); + bname = Strdup(np->bname); + } else { + /* + * When doing a partial import, we'll get the needed + * information from somewhere other than the system. 
+ */ + dnp->side_names_key = MD_KEYWILD; + bname = Strdup(midp->mid_devname); + } metaflushsidenames(dnp); sn_next = &dnp->side_names; /*CONSTCOND*/ while (1) { sn = Zalloc(sizeof (*sn)); - if ((done = meta_getnextside_devinfo(sp, np->bname, - &sideno, &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) { - Free(sn); - return (-1); + if ((done = meta_getnextside_devinfo(sp, bname, &sideno, + &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) { + if (import_flag) { + mdclrerror(ep); + sn->dname = Strdup(midp->mid_driver_name); + sn->mnum = midp->mid_mnum; + } else { + Free(sn); + Free(bname); + return (-1); + } } if (done == 0) { Free(sn); + Free(bname); return (0); } @@ -312,18 +335,17 @@ meta_set_adddrives( */ for (p = dnlp; p != NULL; p = p->next) { if (meta_repartition_drive(sp, - p->drivenamep, - force_label == TRUE ? MD_REPART_FORCE : 0, + p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0, NULL, /* Don't return the VTOC. */ ep) != 0) { rval = -1; goto out; } - /* * Create the names for the drives we are adding per side. */ - if (meta_make_sidenmlist(sp, p->drivenamep, ep) == -1) { + if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL, + ep) == -1) { rval = -1; goto out; } @@ -364,7 +386,6 @@ meta_set_adddrives( (void) close(fd); } } - /* * Get the set timeout information. */ @@ -938,6 +959,192 @@ rollback: return (rval); } +/* + * Add drives routine used during import of a diskset. 
+ */ +int +meta_imp_set_adddrives( + mdsetname_t *sp, + mddrivenamelist_t *dnlp, + md_im_set_desc_t *misp, + md_error_t *ep +) +{ + md_set_desc *sd; + mddrivenamelist_t *p; + md_drive_desc *dd = NULL, *ddp; + int flush_set_onerr = 0; + md_timeval32_t now; + ulong_t genid; + mhd_mhiargs_t mhiargs; + md_im_replica_info_t *mirp; + md_im_drive_info_t *midp; + int rval = 0; + sigset_t oldsigs; + ulong_t max_genid = 0; + int rb_level = 0; + md_error_t xep = mdnullerror; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + for (p = dnlp; p != NULL; p = p->next) { + int imp_flag = 0; + + /* + * If we have a partial diskset, meta_make_sidenmlist will + * need information from midp to complete making the + * side name structure. + */ + if (misp->mis_partial) { + imp_flag = MDDB_C_IMPORT; + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + if (midp->mid_dnp == p->drivenamep) + break; + } + if (midp == NULL) { + (void) mddserror(ep, MDE_DS_SETNOTIMP, + MD_SET_BAD, mynode(), NULL, sp->setname); + rval = -1; + goto out; + } + } + /* + * Create the names for the drives we are adding per side. + */ + if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag, + midp, ep) == -1) { + rval = -1; + goto out; + } + } + + /* + * Get the list of drives descriptors that we are adding. + */ + dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep); + + if (! mdisok(ep)) { + rval = -1; + goto out; + } + + /* + * Get the set timeout information. + */ + (void) memset(&mhiargs, '\0', sizeof (mhiargs)); + if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) { + rval = -1; + goto out; + } + + /* + * Get timestamp and generation id for new records + */ + now = sd->sd_ctime; + genid = sd->sd_genid; + + /* At this point, in case of error, set should be flushed. 
*/ + flush_set_onerr = 1; + + rb_level = 1; /* level 1 */ + + for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) { + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + if (ddp->dd_dnp == midp->mid_dnp) { + /* same disk */ + ddp->dd_dnp->devid = + devid_str_encode(midp->mid_devid, + midp->mid_minor_name); + + ddp->dd_dbcnt = 0; + mirp = midp->mid_replicas; + if (mirp) { + ddp->dd_dbsize = mirp->mir_length; + for (; mirp != NULL; + mirp = mirp->mir_next) { + ddp->dd_dbcnt++; + } + } + if ((midp->mid_available & + MD_IM_DISK_NOT_AVAILABLE) && + (misp->mis_flags & MD_IM_SET_REPLICATED)) { + ddp->dd_flags = MD_DR_UNRSLV_REPLICATED; + } + } + } + } + + /* + * Add the drive records for the drives that we are adding to + * each host in the set. Marks the drive records as MD_DR_ADD. + * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if + * this flag was set in the dd_flags for that drive. + */ + if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1) + goto rollback; + + rb_level = 2; /* level 2 */ + + /* + * Take ownership of the added drives. 
+ */ + if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep)) + goto rollback; + +out: + metafreedrivedesc(&dd); + + if (flush_set_onerr) { + metaflushsetname(sp); + } + + return (rval); + +rollback: + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + rval = -1; + + max_genid = sd->sd_genid; + + /* level 2 */ + if (rb_level > 1) { + if (!MD_ATSET_DESC(sd)) { + if (rel_own_bydd(sp, dd, TRUE, &xep)) { + mdclrerror(&xep); + } + } + } + + /* level 1 */ + if (rb_level > 0) { + if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) { + mdclrerror(&xep); + } + max_genid += 2; + resync_genid(sp, sd, max_genid, 0, NULL); + } + + /* level 0 */ + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metafreedrivedesc(&dd); + + if (flush_set_onerr) { + metaflushsetname(sp); + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); +} + int meta_set_deletedrives( mdsetname_t *sp, diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c index e665406cff..9bf87f8cd2 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c +++ b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -201,8 +200,8 @@ add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep) * increment the count to sync up with the other sides. */ for (i = 0; i < nm.ref_count; i++) { - if (add_name(sp, sideno, nm.key, dname, mnum, cname, - ep) == -1) + if (add_name(sp, sideno, nm.key, dname, mnum, + cname, NULL, NULL, ep) == -1) rval = -1; } diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_prv.c b/usr/src/lib/lvm/libmeta/common/meta_set_prv.c index f3a8f39e17..76454d4db7 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_set_prv.c +++ b/usr/src/lib/lvm/libmeta/common/meta_set_prv.c @@ -35,7 +35,6 @@ #include <sys/cladm.h> #include <devid.h> #include <sys/lvm/md_convert.h> -#include <sdssc.h> /* * Exported Entry Points @@ -642,15 +641,14 @@ setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, int force, md_error_t *ep) int i; md_set_desc *sd; int use_devid = 1; - ddi_devid_t devidp; + ddi_devid_t devidp, new_devidp; char *minor_name = NULL; size_t sz; char *devid_str = NULL; - sdssc_version_t version; + int need_to_free_devidp = 0; if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); - (void) memset(&c, 0, sizeof (c)); c.c_setno = sp->setno; @@ -732,14 +730,7 @@ setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, int force, md_error_t *ep) } } - /* - * If the device does not have a devid or is a multinode - * diskset or we are in a SunCluster 3.x enviroment then - * do not use devids. 
- */ - if ((dnp->devid == NULL) || MD_MNSET_DESC(sd) || - ((sdssc_version(&version) == SDSSC_OKAY) && - (version.major >= 3))) { + if ((dnp->devid == NULL) || MD_MNSET_DESC(sd)) { use_devid = 0; } @@ -754,18 +745,50 @@ setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, int force, md_error_t *ep) (void) snprintf(devid_str, len, "%s/%s", dnp->devid, minor_name); (void) devid_str_decode(devid_str, &devidp, NULL); + need_to_free_devidp = 1; + + /* If need to fix LB then setup old_devid info */ + if (p->dd_flags & MD_DR_FIX_LB_NM_DID) { + sz = devid_sizeof(devidp); + c.c_locator.l_old_devid_sz = sz; + c.c_locator.l_old_devid = (uintptr_t)malloc(sz); + (void) memcpy((void *)(uintptr_t) + c.c_locator.l_old_devid, + devidp, sz); + + new_devidp = replicated_list_lookup( + devid_sizeof((ddi_devid_t)devidp), + (void *)(uintptr_t)devidp); + devid_free(devidp); + need_to_free_devidp = 0; + devidp = new_devidp; + } sz = devid_sizeof(devidp); c.c_locator.l_devid = (uintptr_t)malloc(sz); c.c_locator.l_devid_sz = sz; - (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, + (void) memcpy((void *)(uintptr_t) + c.c_locator.l_devid, devidp, sz); + if (need_to_free_devidp) { + devid_free(devidp); + need_to_free_devidp = 0; + } if (minor_name == NULL) { /* ERROR fix up */ Free(devid_str); + Free((void *)(uintptr_t)c.c_locator.l_devid); + if (c.c_locator.l_old_devid_sz) { + Free((void *) + (uintptr_t)c.c_locator.l_old_devid); + c.c_locator.l_old_devid_sz = 0; + c.c_locator.l_old_devid = + (uintptr_t)NULL; + } return (-1); } - (void) strcpy(c.c_locator.l_minor_name, minor_name); + (void) strcpy(c.c_locator.l_minor_name, + minor_name); c.c_locator.l_devid_flags = MDDB_DEVID_VALID | MDDB_DEVID_SPACE | MDDB_DEVID_SZ; } else { @@ -785,6 +808,15 @@ setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, int force, md_error_t *ep) if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { if (use_devid) { Free(devid_str); + Free((void *) + (uintptr_t)c.c_locator.l_devid); + if 
(c.c_locator.l_old_devid_sz) { + Free((void *)(uintptr_t) + c.c_locator.l_old_devid); + c.c_locator.l_old_devid_sz = 0; + c.c_locator.l_old_devid = + (uintptr_t)NULL; + } } Free(minor_name); return (mdstealerror(ep, &c.c_mde)); @@ -792,6 +824,13 @@ setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, int force, md_error_t *ep) } if (use_devid) { Free(devid_str); + Free((void *)(uintptr_t)c.c_locator.l_devid); + if (c.c_locator.l_old_devid_sz) { + Free((void *) + (uintptr_t)c.c_locator.l_old_devid); + c.c_locator.l_old_devid_sz = 0; + c.c_locator.l_old_devid = (uintptr_t)NULL; + } } Free(minor_name); } diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c b/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c index c46ba0220d..98e0329ab7 100644 --- a/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c +++ b/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -33,6 +32,7 @@ #include "meta_set_prv.h" #include <sys/lvm/md_crc.h> +extern char *blkname(char *); static int upd_dr_dbinfo( @@ -480,6 +480,8 @@ cleanup: return (-1); } +extern int *replicated_disk_list_built; +extern int replicated_disk_list_built_pass1; /* * Exported Entry Points */ @@ -497,6 +499,7 @@ meta_set_take( md_drive_desc *d = NULL; char *owner = NULL; int rval = 0; + int pathname_return = 0; int i; int has_set; int matches = 0; @@ -511,6 +514,9 @@ meta_set_take( int ret = 0; char *newname = NULL; mdkey_t side_names_key; + int unrslv_replicated = 0; + mddrivenamelist_t *dnlp = NULL; + int retake_flag = 0; if ((flags & TAKE_USETAG) || (flags & TAKE_USEIT)) { if (flags & TAKE_USETAG) { @@ -598,6 +604,180 @@ meta_set_take( side += SKEW; /* + * If this set had been previously imported as a partial replicated + * diskset, then must attempt to updated any unresolved drive + * records in diskset with new devid information. Must set + * flags in drivedesc list before loading up set so that the + * md driver will fix up names and devids correctly in the + * locator block. + */ + if (sd->sd_flags & MD_SR_UNRSLV_REPLICATED) { + md_im_names_t cnames = { 0, NULL}; + ddi_devid_t old_devid, new_devid; + char *search_path = "/dev"; + devid_nmlist_t *nmlist; + int indx; + mddrivenamelist_t **dnlpp = &dnlp; + + if (meta_list_disks(ep, &cnames) != 0) { + rval = -1; + goto out; + } + + for (indx = 0; indx < cnames.min_count; ++indx) { + mddrivename_t *dnp; + mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); + int fd = -1; + ddi_devid_t devid1; + char *cdevidp; + int len; + char *fp; + + /* + * We may have name collision here so we need to get + * the dnp using the devid and not the name. 
+ */ + len = strlen(cnames.min_names[indx]) + strlen("s0"); + if ((fp = (char *)Malloc(len+1)) == NULL) { + (void) mdsyserror(ep, ENOMEM, NULL); + rval = -1; + goto out; + } + (void) snprintf(fp, len + 1, "%ss0", + cnames.min_names[indx]); + if ((fd = open(fp, O_RDONLY|O_NDELAY)) < 0) { + (void) mdsyserror(ep, EIO, fp); + rval = -1; + goto out; + } + Free(fp); + /* if no device id, what error?) */ + if (devid_get(fd, &devid1) != 0) { + (void) mdsyserror(ep, EIO, fp); + rval = -1; + goto out; + } + if (close(fd) < 0) { + (void) mdsyserror(ep, EIO, fp); + rval = -1; + goto out; + } + cdevidp = devid_str_encode(devid1, NULL); + if (cdevidp == NULL) { + (void) mdsyserror(ep, EIO, fp); + rval = -1; + goto out; + } + devid_free(devid1); + dnp = metadrivenamebydevid(&sp, cdevidp, + cnames.min_names[indx], ep); + devid_str_free(cdevidp); + if (dnp == NULL) { + /* + * Assuming we're interested in knowing about + * whatever error occurred, but not in stopping. + */ + mde_perror(ep, cnames.min_names[indx]); + mdclrerror(ep); + continue; + } + + dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp); + } + /* Reget sd and dd since freed by meta_prune_cnames. */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + rval = -1; + goto out; + } + + if (sd->sd_flags & MD_SR_MB_DEVID) + dd = metaget_drivedesc(sp, + MD_BASICNAME_OK | PRINT_FAST, ep); + else + dd = metaget_drivedesc(sp, + MD_BASICNAME_OK, ep); + /* If ep has error, then there was a failure, set rval */ + if (!mdisok(ep)) { + rval = -1; + goto out; + } + + /* Builds global replicated disk list */ + replicated_disk_list_built = &replicated_disk_list_built_pass1; + + /* If success, then clear error structure */ + if (build_replicated_disks_list(ep, dnlp) == 1) + mdclrerror(ep); + /* If ep has error, then there was a failure, set rval */ + if (! 
mdisok(ep)) { + rval = -1; + goto out; + } + + for (d = dd; d != NULL; d = d->dd_next) { + if (d->dd_flags & MD_DR_UNRSLV_REPLICATED) { + /* Get old devid from drive record */ + (void) devid_str_decode(d->dd_dnp->devid, + &old_devid, NULL); + + /* + * If the devid stored in the drive record + * (old_devid) matches a devid known by + * the system, then this disk has already + * been partially resolved. This situation + * could occur if a panic happened during a + * previous take of this diskset. + * Set flag to later handle fixing the master + * block on disk and turning off the unresolved + * replicated flag. + */ + if (meta_deviceid_to_nmlist(search_path, + (ddi_devid_t)old_devid, + DEVID_MINOR_NAME_ALL, + &nmlist) == 0) { + d->dd_flags |= MD_DR_FIX_MB_DID; + retake_flag = 1; + continue; + } + + /* + * If the devid stored in the drive record + * is on the list of replicated disks found + * during a system scan then set both flags + * so that the locator block, namespaces + * (diskset and local set), master block + * and unresolved replicated flag are updated. + */ + new_devid = replicated_list_lookup( + devid_sizeof((ddi_devid_t)old_devid), + old_devid); + devid_free(old_devid); + + /* + * If devid stored in the drive record is + * not found then set flag to mark + * that set is still unresolved and + * continue to next drive record. + */ + if (new_devid == NULL) { + unrslv_replicated = 1; + continue; + } + + /* + * Set flags to fix up the master block, + * locator block of the diskset, diskset + * namespace and the local set namespace. + */ + d->dd_flags |= (MD_DR_FIX_MB_DID | + MD_DR_FIX_LB_NM_DID); + retake_flag = 1; + } + } + + } + + /* * Check the local devid namespace to see if the disks * have been moved. Use the local set first of all as this contains * entries for the disks in the set. @@ -627,6 +807,7 @@ meta_set_take( * we are interested in. 
*/ if (newname != NULL) { + char *save_devid; /* * Need to save the side names key as this * points to the namespace entry that will @@ -635,16 +816,28 @@ meta_set_take( * set the namespace key. */ side_names_key = d->dd_dnp->side_names_key; + + /* + * There is the possibility that there + * will be multiple disks with the same + * name but different devids in the + * drivelist. Because of this, we need + * to look for a new dnp based on devid + * and not name. + */ + save_devid = Strdup(d->dd_dnp->devid); metafreedrivename(d->dd_dnp); - d->dd_dnp = metadrivename(&sp, - metadiskname(newname), ep); + d->dd_dnp = metadrivenamebydevid(&sp, + save_devid, newname, ep); + Free(save_devid); Free(newname); /* * null newname so we are reset for next time * through */ newname = NULL; - ret = meta_make_sidenmlist(sp, d->dd_dnp, ep); + ret = meta_make_sidenmlist(sp, + d->dd_dnp, 0, NULL, ep); d->dd_dnp->side_names_key = side_names_key; if (ret == -1) { rval = -1; @@ -663,7 +856,8 @@ meta_set_take( RB_TEST(2, "take", ep) if (!MD_ATSET_DESC(sd)) { - if (tk_own_bydd(sp, dd, mhiargsp, FALSE, ep)) + if (tk_own_bydd(sp, dd, mhiargsp, + flags & MD_IM_PARTIAL_DISKSET, ep)) goto rollback; } @@ -743,13 +937,38 @@ meta_set_take( (void) mddserror(ep, MDE_DS_SETCLEANUP, sp->setno, sp->setname, NULL, mynode()); rval = -1; - goto out; } goto rollback; } - rval = pathname_reload(&sp, sp->setno, ep); - if ((rval == METADEVADM_ERR) || (rval == METADEVADM_DSKNAME_ERR)) { + /* + * If an unresolved replicated diskset, fix up diskset + * and local namespaces, master block and drive record + * with the new devid. If all drives in diskset are + * now resolved, then clear set unresolved replicated flag. + * If an error is encountered, don't fail the take, but + * don't proceed any further in resolving the replicated disks. 
+ */ + if (sd->sd_flags & MD_SR_UNRSLV_REPLICATED) { + /* Fix up diskset and local namespaces with new devids */ + meta_unrslv_replicated_nm(sp, dd, dnlp, ep); + if (mdisok(ep)) { + /* Fix up master block with new devids */ + meta_unrslv_replicated_mb(sp, dd, dnlp, ep); + } + + /* If all drives are resolved, set OK flag in set record. */ + if (mdisok(ep) && (unrslv_replicated == 0)) { + /* Ignore failure since no bad effect. */ + (void) clnt_upd_sr_flags(mynode(), sp, MD_SR_OK, ep); + } + mdclrerror(ep); + + } + + pathname_return = pathname_reload(&sp, sp->setno, ep); + if ((pathname_return == METADEVADM_ERR) || + (pathname_return == METADEVADM_DSKNAME_ERR)) { goto rollback; } @@ -847,6 +1066,23 @@ meta_set_take( RB_TEST(7, "take", ep) + /* + * In order to resolve the namespace major driver names and + * to have the subdrivers attempt to re-associate devts from + * the newly resolved replicated device ids, return a '2'. + * This instructs metaset to release the diskset and re-take. + * + * Return a 2 if + * - no error was detected on the take + * - a replicated unresolved devid was resolved during take + * - take isn't being called during an import + * - this isn't already a re-take situation + */ + if ((rval == 0) && (retake_flag == 1) && + ((flags & (TAKE_RETAKE | TAKE_IMP)) == 0)) { + rval = 2; + } + return (rval); out: diff --git a/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c b/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c index 32be258ab3..8631a82f3d 100644 --- a/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c +++ b/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -65,6 +64,7 @@ #pragma weak mdrpc_setnameok_2_svc = _mdrpc_setnameok_2_svc #pragma weak mdrpc_ownset_2_svc = _mdrpc_ownset_2_svc #pragma weak mdrpc_adddrvs_2_svc = _mdrpc_adddrvs_2_svc +#pragma weak mdrpc_imp_set_drvs_2_svc = _mdrpc_imp_set_drvs_2_svc #pragma weak mdrpc_deldrvs_2_svc = _mdrpc_deldrvs_2_svc #pragma weak mdrpc_upd_dr_dbinfo_2_svc = _mdrpc_upd_dr_dbinfo_2_svc #pragma weak mdrpc_devinfo_2_svc = _mdrpc_devinfo_2_svc @@ -97,6 +97,7 @@ #pragma weak mdrpc_resnarf_set_2_svc = _mdrpc_resnarf_set_2_svc #pragma weak mdrpc_mn_mirror_resync_all_2_svc = \ _mdrpc_mn_mirror_resync_all_2_svc +#pragma weak mdrpc_imp_adddrvs_2_svc = _mdrpc_imp_adddrvs_2_svc /*ARGSUSED*/ bool_t @@ -486,6 +487,17 @@ _mdrpc_adddrvs_2_svc( /*ARGSUSED*/ bool_t +_mdrpc_imp_set_drvs_2_svc( + mdrpc_drives_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t _mdrpc_deldrvs_2_svc( mdrpc_drives_2_args *a, mdrpc_generic_res *b, @@ -823,3 +835,14 @@ _mdrpc_mn_mirror_resync_all_2_svc( assert(0); return (TRUE); } + +/*ARGSUSED*/ +bool_t +_mdrpc_imp_adddrvs_2_svc( + mdrpc_drives_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} diff --git a/usr/src/lib/lvm/libmeta/spec/meta.spec b/usr/src/lib/lvm/libmeta/spec/meta.spec index b33f13b015..9a077d12df 100644 --- a/usr/src/lib/lvm/libmeta/spec/meta.spec +++ b/usr/src/lib/lvm/libmeta/spec/meta.spec @@ -937,6 +937,10 @@ function meta_free_drive_info_list version SUNWprivate_1.1 end +function meta_free_im_set_desc +version SUNWprivate_1.1 +end + function meta_get_drive_names version SUNWprivate_1.1 end @@ -981,6 +985,10 @@ function 
meta_rel_own version SUNWprivate_1.1 end +function meta_replica_quorum +version SUNWprivate_1.1 +end + function meta_status_own version SUNWprivate_1.1 end @@ -1257,6 +1265,10 @@ function metaflushsidenames version SUNWprivate_1.1 end +function metaflushdrivenames +version SUNWprivate_1.1 +end + function metafreedrivename version SUNWprivate_1.1 end @@ -1769,6 +1781,14 @@ function meta_is_drive_in_thisset version SUNWprivate_1.1 end +function meta_is_devid_in_anyset +version SUNWprivate_1.1 +end + +function meta_is_devid_in_thisset +version SUNWprivate_1.1 +end + function meta_set_balance version SUNWprivate_1.1 end @@ -1817,10 +1837,6 @@ function strinlst version SUNWprivate_1.1 end -function meta_make_sidenmlist -version SUNWprivate_1.1 -end - function meta_set_adddrives version SUNWprivate_1.1 end @@ -3685,6 +3701,14 @@ function xdr_mdrpc_nodeid_2_args version SUNWprivate_1.1 end +function clnt_imp_adddrvs +version SUNWprivate_1.1 +end + +function mdrpc_imp_adddrvs_2 +version SUNWprivate_1.1 +end + function meta_is_member version SUNWprivate_1.1 end @@ -3737,6 +3761,10 @@ function read_master_block version SUNWprivate_1.1 end +function pick_good_disk +version SUNWprivate_1.1 +end + function add_self_name version SUNWprivate_1.1 end diff --git a/usr/src/uts/common/io/lvm/md/md_ioctl.c b/usr/src/uts/common/io/lvm/md/md_ioctl.c index c3102d7e6c..cfa6246d9a 100644 --- a/usr/src/uts/common/io/lvm/md/md_ioctl.c +++ b/usr/src/uts/common/io/lvm/md/md_ioctl.c @@ -143,9 +143,11 @@ get_lb_inittime_ioctl( static int setnm_ioctl(mdnm_params_t *nm, int mode) { - char *name; + char *name, *minorname = NULL; side_t side; int err = 0; + void *devid = NULL; + int devid_sz; /* * Don't allow addition of new names to namespace during upgrade. 
@@ -178,6 +180,36 @@ setnm_ioctl(mdnm_params_t *nm, int mode) goto out; } + if (nm->imp_flag) { + if ((nm->devid == NULL) || (nm->minorname == NULL)) { + err = EINVAL; + goto out; + } + if (nm->devid) { + devid_sz = nm->devid_size; + devid = kmem_zalloc(devid_sz, KM_SLEEP); + err = ddi_copyin((caddr_t)(uintptr_t)nm->devid, + devid, devid_sz, mode); + if (err) { + err = EFAULT; + goto out; + } + } + if (nm->minorname) { + if (nm->minorname_len > MAXPATHLEN) { + err = EINVAL; + goto out; + } + minorname = kmem_zalloc(nm->minorname_len, KM_SLEEP); + err = ddi_copyin((caddr_t)(uintptr_t)nm->minorname, + minorname, (size_t)nm->minorname_len, mode); + if (err) { + err = EFAULT; + goto out; + } + } + } + if (nm->side == -1) side = mddb_getsidenum(nm->setno); else @@ -190,7 +222,8 @@ setnm_ioctl(mdnm_params_t *nm, int mode) } nm->key = md_setdevname(nm->setno, side, nm->key, nm->drvnm, - nm->mnum, name, 0, &nm->mde); + nm->mnum, name, nm->imp_flag, (ddi_devid_t)devid, minorname, + 0, &nm->mde); /* * If we got an error from md_setdevname & md_setdevname did not * set the error code, we'll default to MDE_DB_NOSPACE. @@ -202,6 +235,11 @@ setnm_ioctl(mdnm_params_t *nm, int mode) out: kmem_free(name, MAXPATHLEN); + if (devid) { + kmem_free(devid, devid_sz); + } + if (minorname) + kmem_free(minorname, nm->minorname_len); return (err); } @@ -227,6 +265,7 @@ getnm_ioctl( if ((md_get_setstatus(nm->setno) & MD_SET_SNARFED) == 0) return (ENODEV); + name = kmem_alloc(MAXPATHLEN, KM_SLEEP); if (nm->side == -1) @@ -3295,15 +3334,16 @@ md_base_ioctl(md_dev64_t dev, int cmd, caddr_t data, int mode, IOLOCK *lockp) if (! 
(mode & FWRITE)) return (EACCES); - sz = sizeof (set_t); - d = kmem_alloc(sz, KM_SLEEP); + mddb_config_case = 1; - if (ddi_copyin(data, d, sz, mode) != 0) { - err = EFAULT; - break; + err = mddb_config_from_user(&d, data, mode, &c_devid_addr, + &c_old_devid_addr); + + if (err) { + return (err); } - err = md_imp_snarf_set((set_t *)d, mode); + err = md_imp_snarf_set((mddb_config_t *)d); break; } @@ -3324,6 +3364,22 @@ md_base_ioctl(md_dev64_t dev, int cmd, caddr_t data, int mode, IOLOCK *lockp) err = get_lb_inittime_ioctl((mddb_config_t *)d); break; } + case MD_IOCUPDATE_NM_RR_DID: + { + if (! (mode & FWRITE)) + return (EACCES); + + mddb_config_case = 1; + + err = mddb_config_from_user(&d, data, mode, &c_devid_addr, + &c_old_devid_addr); + + if (err) + return (err); + + err = md_update_nm_rr_did_ioctl((mddb_config_t *)d); + break; + } default: return (ENOTTY); /* used by next level up */ } diff --git a/usr/src/uts/common/io/lvm/md/md_mddb.c b/usr/src/uts/common/io/lvm/md/md_mddb.c index e98960da9d..ad2b567b33 100644 --- a/usr/src/uts/common/io/lvm/md/md_mddb.c +++ b/usr/src/uts/common/io/lvm/md/md_mddb.c @@ -113,7 +113,7 @@ extern md_ops_t *md_opslist; extern md_krwlock_t nm_lock; static int update_locatorblock(mddb_set_t *s, md_dev64_t dev, - ddi_devid_t didptr); + ddi_devid_t didptr, ddi_devid_t old_didptr); /* * Defines for crc calculation for records @@ -1027,8 +1027,16 @@ mddb_devid_add( ((char *)devid_ptr)[i] = ((char *)devid)[i]; /* Update mddb_did_info area for new device id */ - did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID | - MDDB_DID_UPDATED; + did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID; + + /* + * Only set UPDATED flag for non-replicated import cases. + * This allows the side locator driver name index to get + * updated in load_old_replicas. 
+ */ + if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) + did_info->info_flags |= MDDB_DID_UPDATED; + did_info->info_firstblk = blk; did_info->info_blkcnt = blkcnt; did_info->info_offset = offset; @@ -1806,8 +1814,10 @@ getmasters( if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } - if (!(md_get_setstatus(s->s_setno) & MD_SET_IMPORT) && - (mb->mb_setno != s->s_setno)) { + + if (!(md_get_setstatus(s->s_setno) & + (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && + (mb->mb_setno != s->s_setno)) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } if (mb->mb_blkno != blkno) { @@ -1826,8 +1836,9 @@ getmasters( * Don't care about devid in local set since it is not used * and this should not be part of set importing */ - if ((s->s_setno != MD_LOCAL_SET) && !(md_get_setstatus(s->s_setno) & - MD_SET_IMPORT)) { + if ((s->s_setno != MD_LOCAL_SET) && + !(md_get_setstatus(s->s_setno) & + (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) { /* * Now check the destroy flag. We also need to handle * the case where the destroy flag is reset after the @@ -2331,7 +2342,8 @@ getuserdata( * record, we must convert it because it was incore as a 64 bit * structure but its on disk layout has only 32 bit for block sizes */ - if (!(md_get_setstatus(setno) & MD_SET_IMPORT) && + if (!(md_get_setstatus(setno) & + (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && (type >= MDDB_FIRST_MODID) && ((rbp->rb_revision == MDDB_REV_RB) || (rbp->rb_revision == MDDB_REV_RBFN))) { @@ -2878,9 +2890,21 @@ match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev, } if (rip->ri_devid && devid && minor) { - if (ddi_devid_compare(rip->ri_devid, devid) != 0 || - strcmp(rip->ri_minor_name, minor) != 0) - return (0); + /* + * If old devid exists, then this is a replicated diskset + * and both old and new devids must be checked. 
+ */ + if (rip->ri_old_devid) { + if (((ddi_devid_compare(rip->ri_devid, devid) != 0) && + (ddi_devid_compare(rip->ri_old_devid, + devid) != 0)) || + (strcmp(rip->ri_minor_name, minor) != 0)) + return (0); + } else { + if (ddi_devid_compare(rip->ri_devid, devid) != 0 || + strcmp(rip->ri_minor_name, minor) != 0) + return (0); + } } else { if (rip->ri_dev != dev) return (0); @@ -4225,7 +4249,7 @@ selectlocator( if (r->ri_lbp == (mddb_lb_t *)NULL) continue; - if (cmpidentifier(s, &r->ri_lbp->lb_ident)) + if (!cmpidentifier(s, &r->ri_lbp->lb_ident)) continue; if (r->ri_dtp != (mddb_dt_t *)NULL) { @@ -4852,7 +4876,8 @@ get_mbs_n_lbs( * We don't do this check if we're in the middle of * importing a set. */ - if (!(md_get_setstatus(s->s_setno) & MD_SET_IMPORT) && + if (!(md_get_setstatus(s->s_setno) & + (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && (lbp->lb_setno != s->s_setno)) continue; @@ -5111,27 +5136,27 @@ get_mbs_n_lbs( if (!(did_info->info_flags & MDDB_DID_EXISTS)) continue; - if (rip->ri_old_devid == NULL) - continue; - if (did_icp->did_ic_devid[li] == NULL) continue; for (trip = s->s_rip; trip != NULL; trip = trip->ri_next) { + if (trip->ri_old_devid == NULL) + continue; if (ddi_devid_compare( trip->ri_old_devid, did_icp->did_ic_devid[li]) != 0) { continue; } - /* update l_dev */ + /* update l_dev and side mnum */ lp->l_dev = md_cmpldev(trip->ri_dev); + lbp->lb_sidelocators[0][li].l_mnum = + md_getminor(trip->ri_dev); } } } - /* * If there is a valid devid, verify that this locator * block has information about itself by checking the @@ -5162,8 +5187,9 @@ get_mbs_n_lbs( if (!(did_info->info_flags & MDDB_DID_EXISTS)) continue; - if ((md_get_setstatus(setno) & - MD_SET_REPLICATED_IMPORT)) { + if (((md_get_setstatus(setno) & + MD_SET_REPLICATED_IMPORT)) && + (rip->ri_old_devid != (ddi_devid_t)NULL)) { if (ddi_devid_compare(rip->ri_old_devid, did_icp->did_ic_devid[li]) != 0) continue; @@ -5471,6 +5497,7 @@ load_old_replicas( char *minor_name; int write_lb = 0; int 
rval; + int stale_rtn = 0; /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */ if (retval = get_mbs_n_lbs(s, &write_lb)) @@ -5819,12 +5846,17 @@ load_old_replicas( /* This will return non-zero if STALE or TOOFEW */ /* This will write out chosen replica image to all replicas */ - if (selectreplicas(s, MDDB_SCANALL)) - goto errout; + stale_rtn = selectreplicas(s, MDDB_SCANALL); if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { ddi_devid_t devidptr; + /* + * ignore the return value from selectreplicas because we + * may have a STALE or TOOFEW set in the case of a partial + * replicated diskset. We will fix that up later. + */ + lbp = s->s_lbp; for (li = 0; li < lbp->lb_loccnt; li++) { did_info = &(did_icp->did_ic_blkp->blk_info[li]); @@ -5842,13 +5874,17 @@ load_old_replicas( } if (update_locatorblock(s, md_expldev(lp->l_dev), - rip->ri_devid)) { + rip->ri_devid, rip->ri_old_devid)) { goto errout; } } } } + } else { + if (stale_rtn) + goto errout; } + /* * If the replica is in device id style - validate the device id's, * if present, in the locator block devid area. @@ -7146,7 +7182,8 @@ mddb_unload_set( MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET | MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | - MD_SET_MN_MIR_STATE_RC); + MD_SET_MN_MIR_STATE_RC | MD_SET_IMPORT | + MD_SET_REPLICATED_IMPORT); mutex_exit(SETMUTEX(setno)); } @@ -7674,6 +7711,22 @@ out: * the devt to see if it matches the given devt. If so, and * there is an associated device id which is not the same * as the passed in devid, delete old devid and add a new one. + * + * During import of replicated disksets, old_didptr contains + * the original disk's device id. Use this device id in + * addition to the devt to determine if an entry is a match + * and should be updated with the new device id of the + * replicated disk. 
Specifically, this is the case being handled: + * + * Original_disk Replicated_disk Disk_Available_During_Import + * c1t1d0 c1t3d0 no - so old name c1t1d0 shown + * c1t2d0 c1t1d0 yes - name is c1t1d0 + * c1t3d0 c1t2d0 yes - name is c1t2d0 + * + * Can't just match on devt since devt for the first and third + * disks will be the same, but the original disk's device id + * is known and can be used to distinguish which disk's + * replicated device id should be updated. * RETURN * MDDB_E_NODEVID * MDDB_E_NOLOCBLK @@ -7681,7 +7734,12 @@ out: * 0 Success */ static int -update_locatorblock(mddb_set_t *s, md_dev64_t dev, ddi_devid_t didptr) +update_locatorblock( + mddb_set_t *s, + md_dev64_t dev, + ddi_devid_t didptr, + ddi_devid_t old_didptr +) { mddb_lb_t *lbp = NULL; mddb_locator_t *lp; @@ -7690,6 +7748,11 @@ update_locatorblock(mddb_set_t *s, md_dev64_t dev, ddi_devid_t didptr) ddi_devid_t devid_ptr; int retval = 0; char *minor_name; + int repl_import_flag; + + /* Set replicated flag if this is a replicated import */ + repl_import_flag = md_get_setstatus(s->s_setno) & + MD_SET_REPLICATED_IMPORT; lbp = s->s_lbp; /* find replicas that haven't been deleted */ @@ -7713,20 +7776,32 @@ update_locatorblock(mddb_set_t *s, md_dev64_t dev, ddi_devid_t didptr) if (devid_ptr == NULL) { return (MDDB_E_NODEVID); } + + /* + * During a replicated import the old_didptr + * must match the current devid before the + * devid can be updated. 
+ */ + if (repl_import_flag) { + if (ddi_devid_compare(devid_ptr, + old_didptr) != 0) + continue; + } + if (ddi_devid_compare(devid_ptr, didptr) != 0) { /* * devid's not equal so * delete and add */ if (ddi_lyr_get_minor_name( - md_dev64_to_dev(dev), - S_IFBLK, &minor_name) == DDI_SUCCESS) { + md_dev64_to_dev(dev), + S_IFBLK, &minor_name) == DDI_SUCCESS) { (void) mddb_devid_delete(s, li); (void) mddb_devid_add(s, li, didptr, - minor_name); + minor_name); kmem_free(minor_name, - strlen(minor_name)+1); - break; + strlen(minor_name)+1); + break; } else { retval = 1; goto err_out; @@ -7867,7 +7942,7 @@ setdid( } } - if (update_locatorblock(s, cp->c_devt, devidp)) { + if (update_locatorblock(s, cp->c_devt, devidp, NULL)) { err = -1; goto out; } @@ -8547,8 +8622,7 @@ mddb_configure( if (cp->c_locator.l_old_devid) { md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT); } - if ((err = ridev(&s->s_rip, &cp->c_locator, NULL, flag)) != 0) - err = mddbstatus2error(ep, err, NODEV32, setno); + err = ridev(&s->s_rip, &cp->c_locator, NULL, flag); mddb_setexit(s); break; @@ -10065,6 +10139,16 @@ take_set(mddb_config_t *cp, int mode) snarf_ok = 1; } + /* + * Clear replicated import flag since this is + * used during the take of a diskset with + * previously unresolved replicated disks. + */ + if (md_get_setstatus(setno) & + MD_SET_REPLICATED_IMPORT) { + md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT); + } + if (! err && mdisok(ep)) { if (! 
cp->c_flags) { medup.med_setno = setno; @@ -12232,6 +12316,9 @@ update_mb( int err = 0; for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { + if (rip->ri_flags & MDDB_F_EMASTER) + /* disk is powered off or not there */ + continue; if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { @@ -12282,7 +12369,10 @@ update_setname( rw_enter(&nm_lock.lock, RW_WRITER); if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) { - err = MD_KEYBAD; + /* + * No namespace is okay + */ + err = 0; goto out; } @@ -12304,13 +12394,13 @@ update_setname( if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED | NM_NOCOMMIT)) { - err = MD_KEYBAD; + err = MDDB_E_NORECORD; goto out; } if ((new_shn = (struct nm_shared_name *)alloc_entry( nh, md_set[setno].s_nmid, len, NM_SHARED | NM_NOCOMMIT, &recid)) == NULL) { - err = MD_KEYBAD; + err = MDDB_E_NORECORD; goto out; } @@ -12332,17 +12422,26 @@ out: return (err); } +/* + * Returns 0 on success. + * Returns -1 on failure with ep filled in. + */ static int md_imp_db( - set_t setno + set_t setno, + int stale_flag, + md_error_t *ep ) { mddb_set_t *s; int err = 0; mddb_dt_t *dtp; + mddb_lb_t *lbp; + int i; + int loccnt; if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { - return (err); + return (mddbstatus2error(ep, err, NODEV32, setno)); } /* Update dt */ @@ -12351,6 +12450,7 @@ md_imp_db( } if ((err = dt_write(s)) != 0) { + err = mdsyserror(ep, err); mddb_setexit(s); return (err); } @@ -12362,14 +12462,36 @@ md_imp_db( */ /* Update lb */ - if ((err = writelocall(s)) != 0) { - mddb_setexit(s); - return (err); - } + if (stale_flag & MD_IMP_STALE_SET) { + lbp = s->s_lbp; + loccnt = lbp->lb_loccnt; + for (i = 0; i < loccnt; i++) { + mddb_locator_t *lp = &lbp->lb_locators[i]; + md_dev64_t ndev = md_expldev(lp->l_dev); + ddi_devid_t devid_ptr; + devid_ptr = s->s_did_icp->did_ic_devid[i]; + if (devid_ptr == NULL) { + /* + * Already deleted, go to next one. 
+ */ + continue; + } + if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev, + NULL)) { + /* disk unavailable, mark deleted */ + lp->l_flags = MDDB_F_DELETED; + /* then remove the device id from the list */ + free_mbipp(&s->s_mbiarray[i]); + s->s_mbiarray[i] = 0; + (void) mddb_devid_delete(s, i); + } + } + md_clr_setstatus(setno, MD_SET_STALE); + } - /* Update mb */ - if ((err = update_mb(s)) != 0) { + if ((err = writelocall(s)) != 0) { + err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno); mddb_setexit(s); return (err); } @@ -12377,11 +12499,13 @@ md_imp_db( mddb_setexit(s); /* Update db records */ - if ((err = update_db_rec(s)) != 0) - return (err); + if ((err = update_db_rec(s)) != 0) { + return (mddbstatus2error(ep, err, NODEV32, setno)); + } /* Update setname embedded in the namespace */ - err = update_setname(setno); + if ((err = update_setname(setno)) != 0) + return (mddbstatus2error(ep, err, NODEV32, setno)); return (err); } @@ -12436,136 +12560,20 @@ md_setup_recids( *ids = &recids[0]; } -static int -md_imp_create_set( - set_t setno -) -{ - mddb_set_t *s; - int drc = 0, err = 0; - size_t sr_size = sizeof (md_set_record); - md_set_record *sr; - mddb_recid_t sr_recid, dr_recid, *ids = NULL; - mddb_ri_t *rip, *trip; - md_drive_record *dr; - size_t dr_size = sizeof (md_drive_record); - mdkey_t dr_key; - md_error_t error = MDNULLERROR; - - - if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) - return (err); - - /* Create and fill in set record */ - if ((sr_recid = mddb_createrec(sr_size, MDDB_USER, MDDB_UR_SR, - MD_CRO_32BIT, MD_LOCAL_SET)) < 0) { - mddb_setexit(s); - return (MDDB_E_INVALID); - } - - sr = (md_set_record *)mddb_getrecaddr(sr_recid); - sr->sr_selfid = sr_recid; - sr->sr_setno = s->s_setno; - (void) strcpy(sr->sr_setname, s->s_setname); - uniqtime32(&sr->sr_ctime); - sr->sr_genid = 0; - sr->sr_revision = MD_SET_RECORD_REVISION; - sr->sr_flags |= MD_SR_OK; - sr->sr_mhiargs = defmhiargs; - (void) strcpy(sr->sr_nodes[0], 
utsname.nodename); - - /* Create and fillin drive records */ - for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { - /* - * Add entry and create the record - */ - if ((dr_key = md_setdevname(MD_LOCAL_SET, 1, MD_KEYWILD, - rip->ri_driver, md_getminor(rip->ri_dev), - rip->ri_devname, setno, &error)) == 0) - continue; - - if (dr_key < 0) { - mddb_setexit(s); - return (MD_KEYBAD); - } - - if ((dr_recid = mddb_createrec(dr_size, MDDB_USER, - MDDB_UR_DR, MD_CRO_32BIT, MD_LOCAL_SET)) < 0) { - mddb_setexit(s); - return (MDDB_E_INVALID); - } - - dr = (md_drive_record *)mddb_getrecaddr(dr_recid); - dr->dr_selfid = dr_recid; - - /* - * We need to check to see if the drive on - * the rip has a replica. If it doesn't have - * a replica, then we need to set the dr_dbcnt - * and dr_dbsize to 0 to reflect that. - */ - if (rip->ri_mbip == NULL) { - dr->dr_dbcnt = 0; - dr->dr_dbsize = 0; - } else { - dr->dr_dbcnt = 1; - - for (trip = s->s_rip; trip != NULL; - trip = trip->ri_next) { - - if (trip == rip) - continue; - - if ((trip->ri_dev == rip->ri_dev) && - (strcmp(trip->ri_devname, rip->ri_devname) - == 0)) - dr->dr_dbcnt++; - } - - dr->dr_dbsize = rip->ri_mbip->mbi_mddb_mb.mb_blkcnt + 1; - } - dr->dr_key = dr_key; - uniqtime32(&dr->dr_ctime); - dr->dr_genid = 1; - dr->dr_revision = MD_DRIVE_RECORD_REVISION; - dr->dr_flags = MD_SR_OK; - drc++; - - /* Add on the linked list */ - (void) md_dr_add(sr, dr); - } - - /* - * Alloc and setup recids which include set record - */ - (void) md_setup_recids(sr, &ids, drc + 2); - - /* - * Commit all the records - */ - err = mddb_commitrecs(ids); - - if (ids) - kmem_free(ids, sizeof (mddb_recid_t) * (drc + 2)); - mddb_setexit(s); - return (err); -} - /* - * namespace is loaded before this is called. - * The purpose of this function is to update the device ids in the entire - * namespace using the data in the ri structure. 
Compare the devid found in - * the namespace with ri_old_devid and if they are the same, update with the - * devid in ri_devid. + * The purpose of this function is to replace the old_devid with the + * new_devid in the given namespace. This is used for importing + * remotely replicated drives. */ -static int -md_imp_update_namespace_did(mddb_set_t *s) +int +md_update_namespace_rr_did( + mddb_config_t *cp +) { - set_t setno = s->s_lbp->lb_setno; + set_t setno = cp->c_setno; struct nm_next_hdr *nh; mdkey_t key = MD_KEYWILD; side_t side = MD_SIDEWILD; - mddb_ri_t *rip = NULL; mddb_recid_t recids[3]; struct did_min_name *n; struct nm_next_hdr *did_shr_nh; @@ -12578,6 +12586,13 @@ md_imp_update_namespace_did(mddb_set_t *s) struct did_shr_name *shn; size_t offset; struct nm_next_hdr *this_did_shr_nh; + void *old_devid, *new_devid; + + if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED)) + return (EIO); + + old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid; + new_devid = (void *)(uintptr_t)cp->c_locator.l_devid; /* * It is okay if we dont have any configuration @@ -12591,7 +12606,7 @@ md_imp_update_namespace_did(mddb_set_t *s) /* check out every entry in the namespace */ if ((n = (struct did_min_name *)lookup_entry(nh, setno, side, key, NODEV64, NM_DEVID)) == NULL) { - break; + continue; } else { did_shr_nh = get_first_record(setno, 0, NM_DEVID | NM_SHARED); @@ -12608,39 +12623,37 @@ md_imp_update_namespace_did(mddb_set_t *s) rw_enter(&nm_lock.lock, RW_WRITER); devid = (ddi_devid_t)shr_n->did_devid; /* find this devid in the incore replica */ - for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { - if (ddi_devid_compare(devid, rip->ri_old_devid) - == 0) { - /* - * found the corresponding entry - * update with new devid - */ - /* first remove old devid info */ - ent_did_key = shr_n ->did_key; - ent_did_count = shr_n->did_count; - ent_did_data = shr_n->did_data; - ent_size = DID_SHR_NAMSIZ(shr_n); - size = ((struct nm_rec_hdr *) - this_did_shr_nh->nmn_record)-> - 
r_used_size - offset - ent_size; - if (size == 0) { - (void) bzero(shr_n, ent_size); - } else { - (void) ovbcopy((caddr_t)shr_n + - ent_size, shr_n, size); - (void) bzero((caddr_t)shr_n + - size, ent_size); - } - ((struct nm_rec_hdr *)this_did_shr_nh-> - nmn_record)->r_used_size -= - ent_size; - /* add in new devid info */ - if ((shn = (struct did_shr_name *) - alloc_entry(did_shr_nh, - md_set[setno].s_did_nmid, - ddi_devid_sizeof(rip->ri_devid), - NM_DEVID | NM_SHARED | NM_NOCOMMIT, - &recids[0])) == NULL) { + if (ddi_devid_compare(devid, old_devid) == 0) { + /* + * found the corresponding entry + * update with new devid + */ + /* first remove old devid info */ + ent_did_key = shr_n ->did_key; + ent_did_count = shr_n->did_count; + ent_did_data = shr_n->did_data; + ent_size = DID_SHR_NAMSIZ(shr_n); + size = ((struct nm_rec_hdr *) + this_did_shr_nh->nmn_record)-> + r_used_size - offset - ent_size; + if (size == 0) { + (void) bzero(shr_n, ent_size); + } else { + (void) ovbcopy((caddr_t)shr_n + + ent_size, shr_n, size); + (void) bzero((caddr_t)shr_n + + size, ent_size); + } + ((struct nm_rec_hdr *)this_did_shr_nh-> + nmn_record)->r_used_size -= + ent_size; + /* add in new devid info */ + if ((shn = (struct did_shr_name *) + alloc_entry(did_shr_nh, + md_set[setno].s_did_nmid, + cp->c_locator.l_devid_sz, + NM_DEVID | NM_SHARED | NM_NOCOMMIT, + &recids[0])) == NULL) { rw_exit(&nm_lock.lock); return (ENOMEM); } @@ -12649,34 +12662,74 @@ md_imp_update_namespace_did(mddb_set_t *s) ent_did_data |= NM_DEVID_VALID; shn->did_data = ent_did_data; shn->did_size = ddi_devid_sizeof( - rip->ri_devid); - bcopy((void *)rip->ri_devid, (void *) + new_devid); + bcopy((void *)new_devid, (void *) shn->did_devid, shn->did_size); recids[1] = md_set[setno].s_nmid; recids[2] = 0; mddb_commitrecs_wrapper(recids); - } } rw_exit(&nm_lock.lock); } } + return (0); } +/* + * namespace is loaded before this is called. + * This function is a wrapper for md_update_namespace_rr_did. 
+ * + * md_update_namespace_rr_did may be called twice if attempting to + * resolve a replicated device id during the take of a diskset - once + * for the diskset namespace and a second time for the local namespace. + * The local namespace would need to be updated when a drive has been + * found during a take of the diskset that hadn't been resolved during + * the import (aka partial replicated import). + * + * If being called during the import of the diskset (IMPORT flag set) + * md_update_namespace_rr_did will only be called once with the diskset + * namespace. + */ +int +md_update_nm_rr_did_ioctl( + mddb_config_t *cp +) +{ + int rval = 0; + + /* If update of diskset namespace fails, stop and return failure */ + if ((rval = md_update_namespace_rr_did(cp)) != 0) + return (rval); + + if (cp->c_flags & MDDB_C_IMPORT) + return (0); + + /* If update of local namespace fails, return failure */ + cp->c_setno = MD_LOCAL_SET; + rval = md_update_namespace_rr_did(cp); + return (rval); +} + /*ARGSUSED*/ int md_imp_snarf_set( - set_t *setnum, - int mode + mddb_config_t *cp ) { - set_t setno = *setnum; /* import setno */ + set_t setno; + int stale_flag; mddb_set_t *s; int i, err = 0; md_ops_t *ops; + md_error_t *ep = &cp->c_mde; + + setno = cp->c_setno; + stale_flag = cp->c_flags; + mdclrerror(ep); if (setno >= md_nsets) { - return (EINVAL); + return (mdsyserror(ep, EINVAL)); } md_haltsnarf_enter(setno); @@ -12688,6 +12741,7 @@ md_imp_snarf_set( md_set_setstatus(setno, MD_SET_IMPORT); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { + err = mddbstatus2error(ep, err, NODEV32, setno); goto out; } @@ -12708,12 +12762,12 @@ md_imp_snarf_set( * and ask each module to fixup unit records */ if (!md_load_namespace(setno, NULL, NM_DEVID)) { - err = ENOENT; + err = mdsyserror(ep, ENOENT); goto cleanup; } if (!md_load_namespace(setno, NULL, 0L)) { (void) md_unload_namespace(setno, NM_DEVID); - err = ENOENT; + err = mdsyserror(ep, ENOENT); goto cleanup; } @@ -12732,22
+12786,17 @@ md_imp_snarf_set( * (4) directory block * calls appropriate writes to push changes out */ - if ((err = md_imp_db(setno)) != 0) - goto cleanup; - - /* - * Create set in MD_LOCAL_SET - */ - if ((err = md_imp_create_set(setno)) != 0) + if ((err = md_imp_db(setno, stale_flag, ep)) != 0) { goto cleanup; + } /* - * update the namespace device ids if necessary (ie. block copy disk) + * Don't unload namespace if importing a replicated diskset. + * Namespace will be unloaded with an explicit RELEASE_SET ioctl. */ - if ((md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) { - if ((err = md_imp_update_namespace_did(s)) != 0) { - goto cleanup; - } + if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { + md_haltsnarf_exit(setno); + return (err); } cleanup: diff --git a/usr/src/uts/common/io/lvm/md/md_names.c b/usr/src/uts/common/io/lvm/md/md_names.c index efa401a6a0..43f01703ee 100644 --- a/usr/src/uts/common/io/lvm/md/md_names.c +++ b/usr/src/uts/common/io/lvm/md/md_names.c @@ -1726,14 +1726,17 @@ zero_data_ptrs(struct nm_next_hdr *nh, set_t setno) */ mdkey_t md_setdevname( - set_t setno, /* specify which namespace to put in */ - side_t side, /* (key 1) side # */ - mdkey_t key, /* (key 2) KEYWILD - alloc key, else use key */ - char *drvnm, /* store this driver name with devicename */ - minor_t mnum, /* store this minor number as well */ - char *devname, /* device name to be stored */ - set_t imp_setno, /* used exclusively by import */ - md_error_t *ep /* place to return error info */ + set_t setno, /* specify which namespace to put in */ + side_t side, /* (key 1) side # */ + mdkey_t key, /* (key 2) KEYWILD - alloc key, else use key */ + char *drvnm, /* store this driver name with devicename */ + minor_t mnum, /* store this minor number as well */ + char *devname, /* device name to be stored */ + int imp_flag, /* used exclusively by import */ + ddi_devid_t imp_devid, /* used exclusively by import */ + char *imp_mname, /* used exclusively by import 
*/ + set_t imp_setno, /* used exclusively by import */ + md_error_t *ep /* place to return error info */ ) { struct nm_next_hdr *nh, *did_nh = NULL; @@ -1819,18 +1822,26 @@ md_setdevname( * of the side information is taken here because it is dealt * with later on. */ - devt = makedevice(ddi_name_to_major(drvnm), mnum); - if ((ddi_lyr_get_devid(devt, &devid) == DDI_SUCCESS) && - (ddi_lyr_get_minor_name(devt, S_IFBLK, &mname) == - DDI_SUCCESS) && - (((mddb_set_t *)md_set[setno].s_db)->s_lbp->lb_flags & - MDDB_DEVID_STYLE)) - /* - * Reference the device id namespace - */ + if (!imp_flag) { + devt = makedevice(ddi_name_to_major(drvnm), mnum); + if ((ddi_lyr_get_devid(devt, &devid) == DDI_SUCCESS) && + (ddi_lyr_get_minor_name(devt, S_IFBLK, &mname) == + DDI_SUCCESS) && + (((mddb_set_t *)md_set[setno].s_db)->s_lbp->lb_flags & + MDDB_DEVID_STYLE)) + /* + * Reference the device id namespace + */ + shared = NM_DEVID | NM_NOTSHARED; + else + shared = NM_NOTSHARED; + } else { + /* Importing diskset has devids so store in namespace */ + devid = kmem_alloc(ddi_devid_sizeof(imp_devid), KM_SLEEP); + bcopy(imp_devid, devid, ddi_devid_sizeof(imp_devid)); + mname = md_strdup(imp_mname); shared = NM_DEVID | NM_NOTSHARED; - else - shared = NM_NOTSHARED; + } /* * Always lookup the primary name space @@ -1873,6 +1884,41 @@ md_setdevname( */ lookup_res = lookup_deventry(nh, setno, side, key, drvnm, mnum, dname, fname, &n); + + /* If we are importing the set */ + if (imp_flag && (lookup_res == LOOKUP_DEV_FOUND)) { + ushort_t did_sz; + ddi_devid_t did; + + /* + * We need to check for the case where there is a disk + * already in the namespace with a different ID from + * the one we want to add, but the same name. This is + * possible in the case of an unavailable disk. 
+ */ + rw_exit(&nm_lock.lock); + if (md_getdevid(setno, side, n->n_key, NULL, &did_sz) != 0) + did_sz = 0; + rw_enter(&nm_lock.lock, RW_WRITER); + if (did_sz > 0) { + did = kmem_zalloc(did_sz, KM_SLEEP); + rw_exit(&nm_lock.lock); + (void) md_getdevid(setno, side, n->n_key, did, &did_sz); + rw_enter(&nm_lock.lock, RW_WRITER); + if (ddi_devid_compare(did, devid) == 0) { + kmem_free(did, did_sz); + retval = 0; + goto out; + } + kmem_free(did, did_sz); + } + /* + * This is not the same disk so we haven't really found it. + * Thus, we need to say it's "NOMATCH" and create a new + * entry. + */ + lookup_res = LOOKUP_DEV_NOMATCH; + } switch (lookup_res) { case LOOKUP_DEV_FOUND: /* If we are importing the set */ @@ -2079,8 +2125,9 @@ add_devid: } } out: - if (devid) + if (devid) { ddi_devid_free(devid); + } if (dname) freestr(dname); if (mname) diff --git a/usr/src/uts/common/sys/lvm/md_mddb.h b/usr/src/uts/common/sys/lvm/md_mddb.h index b031594f19..8f6226e675 100644 --- a/usr/src/uts/common/sys/lvm/md_mddb.h +++ b/usr/src/uts/common/sys/lvm/md_mddb.h @@ -881,6 +881,7 @@ extern int mddb_validate_lb(set_t setno, int *rmaxsz); extern int mddb_getinvlb_devid(set_t setno, int count, int size, char **ctdptr); extern int md_update_minor(set_t, side_t, mdkey_t); +extern int md_update_nm_rr_did_ioctl(mddb_config_t *cp); extern int md_update_top_device_minor(set_t, side_t, md_dev64_t); #ifdef DEBUG diff --git a/usr/src/uts/common/sys/lvm/mdio.h b/usr/src/uts/common/sys/lvm/mdio.h index 9ff907f078..1cedfe2bc6 100644 --- a/usr/src/uts/common/sys/lvm/mdio.h +++ b/usr/src/uts/common/sys/lvm/mdio.h @@ -140,6 +140,11 @@ extern "C" { "logging; they\n#pass data directly to the underlying device.\n" /* + * for importing of disksets (IMP_LOAD) + */ +#define MD_IMP_STALE_SET 1 + +/* * miscname stuff */ @@ -304,6 +309,7 @@ typedef struct mdnm_params { ushort_t minorname_len; /* length of minor name */ uint64_t minorname; /* address of minor name */ uint_t ref_count; /* returned n_count */ + 
int imp_flag; /* used by metaimport */ } mdnm_params_t; typedef struct mdhspnm_params { @@ -749,6 +755,8 @@ typedef struct md_regen_param { #define MD_DB_LBINITTIME (MDIOC|104) /* get the lb_inittime */ #define MD_IOCGET_HSP_NM (MDIOC|105) /* get hsp entry from namespace */ #define MD_IOCREM_DEV (MDIOC|106) /* remove device node for unit */ +#define MD_IOCUPDATE_NM_RR_DID (MDIOC|107) /* update remotely repl did in NM */ + #define MDIOC_MISC (MDIOC|128) /* misc module base */ /* Used in DEBUG_TEST code */ diff --git a/usr/src/uts/common/sys/lvm/mdiox.x b/usr/src/uts/common/sys/lvm/mdiox.x index fdb2d14bb9..9280d849e9 100644 --- a/usr/src/uts/common/sys/lvm/mdiox.x +++ b/usr/src/uts/common/sys/lvm/mdiox.x @@ -188,9 +188,12 @@ const MD_DRIVE_RECORD_REVISION = 0x00010000; #ifdef RPC_HDR % -%#define MD_DR_ADD 0x00000001U -%#define MD_DR_DEL 0x00000002U -%#define MD_DR_OK 0x80000000U +%#define MD_DR_ADD 0x00000001U +%#define MD_DR_DEL 0x00000002U +%#define MD_DR_FIX_MB_DID 0x10000000U /* Fix MB */ +%#define MD_DR_FIX_LB_NM_DID 0x20000000U /* Fix LB and namespaces */ +%#define MD_DR_UNRSLV_REPLICATED 0x40000000U +%#define MD_DR_OK 0x80000000U #endif /* RPC_HDR */ #if !defined(_KERNEL) @@ -253,19 +256,21 @@ const MD_SET_RECORD_REVISION = 0x00010000; #ifdef RPC_HDR % -%#define MD_SR_ADD 0x00000001U -%#define MD_SR_DEL 0x00000002U -%#define MD_SR_CHECK 0x00000004U -%#define MD_SR_CVT 0x00000008U -%#define MD_SR_LOCAL 0x00000010U -%#define MD_SR_MB_DEVID 0x10000000U -%#define MD_SR_AUTO_TAKE 0x20000000U -%#define MD_SR_MN 0x40000000U -%#define MD_SR_OK 0x80000000U +%#define MD_SR_ADD 0x00000001U +%#define MD_SR_DEL 0x00000002U +%#define MD_SR_CHECK 0x00000004U +%#define MD_SR_CVT 0x00000008U +%#define MD_SR_LOCAL 0x00000010U +%#define MD_SR_UNRSLV_REPLICATED 0x08000000U +%#define MD_SR_MB_DEVID 0x10000000U +%#define MD_SR_AUTO_TAKE 0x20000000U +%#define MD_SR_MN 0x40000000U +%#define MD_SR_OK 0x80000000U %#define MD_SR_STATE_FLAGS (MD_SR_ADD | \ % MD_SR_DEL | \ % MD_SR_CHECK 
| \ % MD_SR_CVT | \ +% MD_SR_UNRSLV_REPLICATED | \ % MD_SR_OK) #endif /* RPC_HDR */ diff --git a/usr/src/uts/common/sys/lvm/mdvar.h b/usr/src/uts/common/sys/lvm/mdvar.h index 7388c54d41..c03847efb5 100644 --- a/usr/src/uts/common/sys/lvm/mdvar.h +++ b/usr/src/uts/common/sys/lvm/mdvar.h @@ -770,6 +770,7 @@ extern void md_remove_minor_node(minor_t); /* Externals from md_names.c */ extern mdkey_t md_setdevname(set_t, side_t, mdkey_t, char *, minor_t, char *, + int imp_flag, ddi_devid_t devid, char *minorname, set_t, md_error_t *); extern int md_getdevname(set_t, side_t, mdkey_t, md_dev64_t, char *, size_t); @@ -815,7 +816,7 @@ extern md_dev64_t md_makedevice(major_t, minor_t); extern major_t md_getmajor(md_dev64_t); extern minor_t md_getminor(md_dev64_t); extern void md_timeval(md_timeval32_t *); -extern int md_imp_snarf_set(set_t *, int); +extern int md_imp_snarf_set(mddb_config_t *); /* externals from md_mddb.c */ extern int mddb_reread_rr(set_t, mddb_recid_t); |