diff options
Diffstat (limited to 'usr/src/uts/common/sys/lvm/md_mddb.h')
| -rw-r--r-- | usr/src/uts/common/sys/lvm/md_mddb.h | 913 |
1 files changed, 913 insertions, 0 deletions
diff --git a/usr/src/uts/common/sys/lvm/md_mddb.h b/usr/src/uts/common/sys/lvm/md_mddb.h new file mode 100644 index 0000000000..45f023b26d --- /dev/null +++ b/usr/src/uts/common/sys/lvm/md_mddb.h @@ -0,0 +1,913 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_MD_MDDB_H +#define _SYS_MD_MDDB_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/buf.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#if 0 /* DRP FOR DEBUGGING */ +#define MDDB_FAKE +#endif + +/* Private flags */ +#define MD_PRV_GOTIT 0x0001 /* Been snarfed */ +#define MD_PRV_DELETE 0x0002 /* Record pending to be deleted */ +#define MD_PRV_COMMIT 0x0004 /* Record pending to be commited */ +#define MD_PRV_CLEANUP 0x0008 /* Record pending to be cleaned up */ +#define MD_PRV_CONVD 0x0010 /* Record has been converted (32->64) */ +#define MD_PRV_PENDDEL (MD_PRV_GOTIT | MD_PRV_DELETE) +#define MD_PRV_PENDCOM (MD_PRV_GOTIT | MD_PRV_COMMIT) +#define MD_PRV_PENDCLEAN (MD_PRV_GOTIT | MD_PRV_CLEANUP) + + +#define MDDB_E_INVALID (-1) /* an invalid argument was passed */ +#define MDDB_E_EXISTS (-2) /* doing an operation a 2nd time which can */ + /* only be done once */ +#define MDDB_E_MASTER (-3) /* problem occurred accessing mastor block */ + /* returned from NEW_DEV */ +#define MDDB_E_TOOSMALL (-4) /* device is not large enough */ +#define MDDB_E_NORECORD (-5) /* record does not exits */ + /* + * returned from: mddb_getnextrec + * mddb_getrecsize + * mddb_commitrec + * mddb_commitrecs + * mddb_deleterec + */ +#define MDDB_E_NOSPACE (-6) /* no space to create record */ +#define MDDB_E_NOTNOW (-7) /* do not presently have enough resources */ + /* to perform requested operation */ +#define MDDB_E_NODB (-8) /* no database exist */ +#define MDDB_E_NOTOWNER (-9) /* have not been told to grab this set */ +#define MDDB_E_STALE (-10) /* database is stale */ +#define MDDB_E_TOOFEW (-11) /* not enough replicas available */ +#define MDDB_E_TAGDATA (-12) /* tagged data detected */ +#define MDDB_E_ACCOK (-13) /* 50/50 mode */ +#define MDDB_E_NTAGDATA (-14) /* tagop try, no tag data */ +#define MDDB_E_ACCNOTOK (-15) /* accop try, no accept possible */ +#define MDDB_E_NOLOCBLK (-16) /* No valid locators found */ +#define MDDB_E_NOLOCNMS (-17) /* No valid locator name information */ +#define MDDB_E_NODIRBLK (-18) /* No directory blocks found */ +#define MDDB_E_NOTAGREC (-19) /* No tag record blocks found */ +#define MDDB_E_NOTAG (-20) /* No matching tag record found */ +#define MDDB_E_NODEVID (-21) /* No device id found */ + +#define MDDB_MINBLKS 16 /* enough for a few metadevices */ +#define MDDB_MAXBLKS 8192 /* size of free bit map (must be / 8) */ +#define MDDB_MN_MINBLKS 32768 /* Multinode metadb minimum size */ + /* 16MB */ +#define MDDB_MN_MAXBLKS 524288 /* size of free bit map (must be / 8) */ + /* 256MB */ + +#define MDDB_C_STALE 0x0001 +#define MDDB_C_TOOFEW 0x0002 +#define MDDB_C_NOTOWNER 0x0004 +#define MDDB_C_SET_MN_STALE 0x0008 /* Set MN set to stale */ +#define MDDB_C_IMPORT 0x0010 + +/* + * Defines used to set/reset new master flag in set structure. + * Used during reconfig cycle to determine quickly if there is + * new master for the set. + */ +#define MDDB_NM_SET 0x0001 +#define MDDB_NM_RESET 0x0002 +#define MDDB_NM_GET 0x0004 + +/* Definitions of flag in Locator Block Device ID data area - mddb_did_info */ +#define MDDB_DID_EXISTS 0x0001 /* Device ID exists */ +#define MDDB_DID_VALID 0x0002 /* Device ID valid on current system */ +#define MDDB_DID_UPDATED 0x0004 /* locator/sidelocator info updated */ + +/* Definitions of flag in Locator Block - mddb_lb */ +#define MDDB_DEVID_STYLE 0x0001 /* Locator Block in Device ID format */ +#define MDDB_MNSET 0x0002 /* MDDB is for a multi-node set */ + + +#define MDDB_MAX_PATCH 25 /* number of locations that */ + /* can be patched in etc/system */ + +/* + * Set struct used by all parts of the driver, to store anchor pointers. + */ +typedef struct md_set { + uint_t s_status; /* set status */ + void **s_ui; /* set unit incore anchor */ + void **s_un; /* set unit anchor */ + void *s_hsp; /* set Hot Spare Pool anchor */ + void *s_hs; /* set Hot Spare anchor */ + void *s_db; /* set MDDB anchor */ + kmutex_t s_dbmx; /* set MDDB mutex */ + void *s_nm; /* set namespace anchor */ + mddb_recid_t s_nmid; /* set namespace anchor record */ + void *s_did_nm; /* set device id namespace anchor */ + mddb_recid_t s_did_nmid; /* set device id namespace anchor rec */ + void *s_dtp; /* set data tag rec */ + int s_am_i_master; /* incore master flag for this node */ + md_mn_nodeid_t s_nodeid; /* nodeid of this node - for MN sets */ + uint_t s_rcnt; /* incore resync count for set */ +} md_set_t; + + +#define MDDB_MAGIC_MB 0x6d646d62 /* magic number for master blocks */ +#define MDDB_MAGIC_DB 0x6d646462 /* magic number for directory blocks */ +#define MDDB_MAGIC_RB 0x6d647262 /* magic number for record blocks */ +#define MDDB_MAGIC_LB 0x6d646c62 /* magic number for locator blocks */ +#define MDDB_MAGIC_LN 0x6d646c6e /* magic number for locator names */ +#define MDDB_MAGIC_DT 0x6d646474 /* magic number for data tag */ +#define MDDB_MAGIC_DI 0x6d646469 /* magic number for device ID block */ +#define MDDB_MAGIC_DU 0x6d646475 /* magic num for dummy mb */ +#define MDDB_MAGIC_DE 0x6d646465 /* magic num for mb devid */ + +#define MDDB_GLOBAL_XOR 1234567890 + +#define MDDB_REV_MAJOR (uint_t)0xff00 +#define MDDB_REV_MINOR (uint_t)0x00ff + +/* + * MDDB_REV_MNMB: + * If a MN diskset, master block revision is set to MDDB_REV_MNMB. + * Even though the master block structure is no different + * for a MN set, setting the revision field to a different + * number keeps any pre-MN_diskset code from accessing + * this diskset. It also allows for an early determination + * of a MN diskset when reading in from disk so that the + * proper size locator block and locator names structure + * can be read in thus saving time on diskset startup. + * Since no change in master block structure, the MDDB_REV_MINOR + * portion of the revision was incremented. + * + * MDDB_REV_MNLB: + * If a MN diskset, the locator block structure is a different size in + * order to accomodate up to MD_MNMAXSIDES nodes in a diskset + * with any nodeid (sideno) allowed. + * The revision is set to MDDB_REV_MNLB which is a change of the + * MDDB_REV_MAJOR portion of the revision. + * + * MDDB_REV_MNLN: + * If a MN diskset, the locator names is a different size in + * order to accomodate up to MD_MNMAXSIDES nodes in a diskset + * with any nodeid (sideno) allowed. + * The revision is set to MDDB_REV_MNLN which is a change of the + * MDDB_REV_MAJOR portion of the revision. + */ + +#define MDDB_REV_MB (uint_t)0x0201 +#define MDDB_REV_MNMB (uint_t)0x0202 +#define MDDB_REV_DB (uint_t)0x0201 +#define MDDB_REV_LB (uint_t)0x0500 +#define MDDB_REV_MNLB (uint_t)0x0600 +#define MDDB_REV_LN (uint_t)0x0100 +#define MDDB_REV_MNLN (uint_t)0x0300 +#define MDDB_REV_RB (uint_t)0x0200 +#define MDDB_REV_RB64 (uint_t)0x0201 +#define MDDB_REV_DT (uint_t)0x0100 +#define MDDB_REV_DI (uint_t)0x0100 + +#define MDDB_BSIZE (uint_t)DEV_BSIZE +#define MDDB_PREFIXCNT 10 +#define MDDB_DRVNMCNT 10 + +typedef int mddb_block_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct md_mnname_suffix { + md_name_suffix mn_ln_suffix; + uint_t mn_ln_sideno; +} md_mnname_suffix_t; + +typedef struct mddb_ln { + int ln_magic; + uint_t ln_revision; + uint_t ln_checksum; + struct timeval32 ln_timestamp; + md_name_prefix ln_prefixes[MDDB_PREFIXCNT]; + /* Don't change array sizes without changing RNDUP_BLKCNT */ + md_name_suffix ln_suffixes[MD_MAXSIDES][MDDB_NLB]; +} mddb_ln_t; + +/* + * Locator name structure for MN diskset. Same as for traditional + * and local diskset except that more sides are supported and the + * side number can be any number since the side number is stored + * in the ln_mnsuffixes structure instead of being used as an index + * into that array. This means that the whole array may need to be + * searched in order to find the correct information given a side number. + */ +typedef struct mddb_mnln { + int ln_magic; + uint_t ln_revision; + uint_t ln_checksum; + struct timeval32 ln_timestamp; + md_name_prefix ln_prefixes[MDDB_PREFIXCNT]; + /* Don't change array sizes without changing MDDB_MNLNCNT */ + md_mnname_suffix_t ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB]; +} mddb_mnln_t; + +#define RNDUP_BLKCNT(sz, delta) (((sz) - \ + ((delta) * \ + ((MD_MAXSIDES - 1) * MDDB_NLB)) + \ + MDDB_BSIZE - 1) / MDDB_BSIZE) +#define MDDB_LNCNT RNDUP_BLKCNT(sizeof (mddb_ln_t), 0) +#define MDDB_LOCAL_LNCNT RNDUP_BLKCNT(sizeof (mddb_ln_t), \ + sizeof (md_name_suffix)) + +#define MDDB_MNLNCNT ((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \ + / MDDB_BSIZE) + +typedef struct mddb_dt { + uint_t dt_mag; + uint_t dt_rev; + uint_t dt_cks; + mddb_dtag_t dt_dtag; +} mddb_dt_t; + +#define MDDB_DT_BYTES (roundup(sizeof (mddb_dt_t), MDDB_BSIZE)) +#define MDDB_DT_BLOCKS (btodb(MDDB_DT_BYTES)) + +typedef union identifier { + char serial[MDDB_SN_LEN]; + struct timeval32 createtime; +} identifier_t; + +typedef struct mddb_locator { + dev32_t l_dev; + daddr32_t l_blkno; + int l_flags; +} mddb_locator_t; + +typedef struct mddb_sidelocator { + uchar_t l_drvnm_index; + minor_t l_mnum; +} mddb_sidelocator_t; + +typedef struct mddb_mnsidelocator { + uchar_t mnl_drvnm_index; + minor_t mnl_mnum; + uint_t mnl_sideno; +} mddb_mnsidelocator_t; + +typedef struct mddb_drvnm { + uchar_t dn_len; + char dn_data[MD_MAXDRVNM]; +} mddb_drvnm_t; + +/* + * Locator Block Device ID Information + * Several device id's may share one disk block in an effort to + * conserve used replica space. + */ +typedef struct mddb_did_info { + uint_t info_flags; /* MDDB Device ID flags */ + uint_t info_firstblk; /* Device ID Start Block */ + uint_t info_blkcnt; /* Device ID Block Count */ + uint_t info_offset; /* Device ID offset w/i Block */ + uint_t info_length; /* Device ID Length */ + uint_t info_checksum; /* Device ID Checksum */ + char info_minor_name[32]; /* Minor name of lb dev */ +} mddb_did_info_t; + +typedef struct mddb_did_blk { + int blk_magic; /* used for verification */ + uint_t blk_revision; /* used for verification */ + int blk_checksum; /* used for verification */ + uint_t blk_commitcnt; /* matches LB's commitcnt */ + mddb_did_info_t blk_info[MDDB_NLB]; +} mddb_did_blk_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +#define MDDB_DID_BYTES (roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE)) +#define MDDB_DID_BLOCKS (btodb(MDDB_DID_BYTES)) + +/* + * Device ID Disk Blocks. + * Incore linked list of disk blocks containing device IDs. + * The list is built when reading in the mddb_did_blk structure and + * when reading in the actual disk blocks containing device ids. + * This list is used to easily write out all disk blocks containing + * device ids. + */ +typedef struct mddb_did_db { + uint_t db_firstblk; /* Disk Block's logical addr */ + uint_t db_blkcnt; /* Contig Disk Block Count */ + caddr_t db_ptr; /* Ptr to incore Block(s) */ + struct mddb_did_db *db_next; /* Ptr to next in list */ +} mddb_did_db_t; + +/* + * Device ID Free List. + * Incore linked list of free space in disk blocks containing device IDs. + * Used to manage placement of device IDs in disk blocks. + * All disk blocks on free list are also in linked list of disk block + * containing device IDs (mddb_did_db_t). + */ +typedef struct mddb_did_free { + uint_t free_blk; /* Disk Block's logical addr */ + uint_t free_offset; /* offset of free space */ + uint_t free_length; /* length of free space */ + struct mddb_did_free *free_next; /* Ptr to next in list */ +} mddb_did_free_t; + +/* + * Device ID Incore Area + * Contains pointer to Device ID Disk Block list and + * Device ID Free List. + * Also contains incore array of pointers to device IDs. Pointers + * point into the device ID Disk Block list and are used as a + * shortcut to find incore device IDs. + */ +typedef struct mddb_did_ic { + mddb_did_blk_t *did_ic_blkp; + mddb_did_db_t *did_ic_dbp; + mddb_did_free_t *did_ic_freep; + ddi_devid_t did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */ +} mddb_did_ic_t; + +/* + * Locator Block (LB): + * - Are fixed size, but the size is different + * for local/shared set db replicas. + * - All LB's start at logical block 0. + * - After a replica quorum is found, there is + * is only one incore copy of the LB. + * - LB's are only written when replicas are added, deleted, or errored. + * - LB's provide information about other replica's and their state. + */ +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct mddb_lb { + int lb_magic; /* used for verification */ + uint_t lb_revision; /* used for verification */ + int lb_checksum; /* used for verification */ + uint_t lb_commitcnt; /* IMPORTANT */ + struct timeval32 lb_timestamp; /* informative only */ + int lb_loccnt; /* used for verification */ + identifier_t lb_ident; /* used for verification */ + uint_t lb_flags; /* flags describing LB */ + uint_t lb_spare[8]; /* Spare/Pad */ + mddb_block_t lb_didfirstblk; /* Devid Array Start Block */ + mddb_block_t lb_didblkcnt; /* Devid Array Number Blocks */ + mddb_block_t lb_dtfirstblk; /* Data Tag Start Block */ + mddb_block_t lb_dtblkcnt; /* Data Tag Number Block(s) */ + struct timeval32 lb_inittime; /* creation of database */ + set_t lb_setno; /* used for verification */ + mddb_block_t lb_blkcnt; /* used for verification */ + mddb_block_t lb_lnfirstblk; + mddb_block_t lb_lnblkcnt; + mddb_block_t lb_dbfirstblk; + mddb_drvnm_t lb_drvnm[MDDB_DRVNMCNT]; + mddb_locator_t lb_locators[MDDB_NLB]; + /* Don't change array sizes without changing RNDUP_BLKCNT */ + mddb_sidelocator_t lb_sidelocators[MD_MAXSIDES][MDDB_NLB]; +} mddb_lb_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +/* + * Locator block structure for MN diskset. Same as for traditional + * and local diskset except that more sides are supported and the + * side number can be any number since the side number is stored + * in the lb_mnsidelocators structure instead of being used as an index + * into that array. This means that the whole array may need to be + * searched in order to find the correct information given a side number. + */ +typedef struct mddb_mnlb { + int lb_magic; /* used for verification */ + uint_t lb_revision; /* used for verification */ + int lb_checksum; /* used for verification */ + uint_t lb_commitcnt; /* IMPORTANT */ + struct timeval32 lb_timestamp; /* informative only */ + int lb_loccnt; /* used for verification */ + identifier_t lb_ident; /* used for verification */ + uint_t lb_flags; /* flags describing LB */ + uint_t lb_spare[8]; /* Spare/Pad */ + mddb_block_t lb_didfirstblk; /* Devid Array Start Block */ + mddb_block_t lb_didblkcnt; /* Devid Array Number Blocks */ + mddb_block_t lb_dtfirstblk; /* Data Tag Start Block */ + mddb_block_t lb_dtblkcnt; /* Data Tag Number Block(s) */ + struct timeval32 lb_inittime; /* creation of database */ + set_t lb_setno; /* used for verification */ + mddb_block_t lb_blkcnt; /* used for verification */ + mddb_block_t lb_lnfirstblk; + mddb_block_t lb_lnblkcnt; + mddb_block_t lb_dbfirstblk; + mddb_drvnm_t lb_drvnm[MDDB_DRVNMCNT]; + mddb_locator_t lb_locators[MDDB_NLB]; + /* Don't change array sizes without changing MDDB_MNLBCNT */ + mddb_mnsidelocator_t lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB]; +} mddb_mnlb_t; + + +#define MDDB_LBCNT RNDUP_BLKCNT(sizeof (mddb_lb_t), 0) +#define MDDB_LOCAL_LBCNT RNDUP_BLKCNT(sizeof (mddb_lb_t), \ + sizeof (mddb_sidelocator_t)) + +#define MDDB_MNLBCNT ((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \ + / MDDB_BSIZE) + +typedef struct mddb_map { + daddr32_t m_consecutive; + daddr32_t m_firstblk; +} mddb_map_t; + +/* + * Master block(s) (MB) + * - Are written by userland; Never by the driver! + * - Each replica has there own master blocks, + * the master block(s) are not shared. + * - MB's are not in the logical block address space of the database. + * - MB's are a fixed size record (MDDB_BSIZE) + * - MB's provide the logical to physical block translation, + * for their replica. + */ +typedef struct mddb_mb { + int mb_magic; /* used for verification */ + uint_t mb_revision; /* used for verification */ + uint_t mb_checksum; /* used for verification */ +#ifdef _LP64 + uint32_t mb_next; /* incore to next mb */ +#else + struct mddb_mb *mb_next; /* incore to next mb */ +#endif /* _LP64 */ + daddr32_t mb_nextblk; /* block # for next mb */ + md_timeval32_t mb_timestamp; /* timestamp */ + daddr32_t mb_blkcnt; /* size of blkmap */ + daddr32_t mb_blkno; /* physical loc. for this MB */ + set_t mb_setno; /* used for verification */ + struct timeval32 mb_setcreatetime; /* set creation timestamp */ + int spares[7]; + mddb_map_t mb_blkmap; /* logical->physical blk map */ + int mb_devid_magic; /* verify devid in mb */ + short mb_devid_len; /* len of following devid */ + char mb_devid[1]; /* devid byte array */ +} mddb_mb_t; + +/* + * In-core version of mddb_mb. It is known that the mddb_mb is 512 bytes on + * disk, really, and so this structure is 512 + sizeof(struct mddb_mb_ic *) + */ +#define MDDB_IC_BSIZE (MDDB_BSIZE + sizeof (struct mddb_mb_ic *)) +typedef struct mddb_mb_ic { + struct mddb_mb_ic *mbi_next; + struct mddb_mb mbi_mddb_mb; +} mddb_mb_ic_t; + + +/* + * there can be no address in record block. The checksum must + * stay the same where ever the record is in memory. Many + * things depend on this. Also the timestamp is the time the the + * record was committed not the time it was written to a particular + * device. + * + * Old definition of mddb_rb, for 32-bit apps and libraries + */ +typedef struct mddb_rb { + uint_t rb_magic; + uint_t rb_revision; + uint_t rb_checksum; + uint_t rb_checksum_fiddle; + uint_t rb_private; + void *rb_userdata; + uint_t rb_commitcnt; + uint_t rb_spare[1]; + struct timeval32 rb_timestamp; + int rb_data[1]; +} mddb_rb_t; + +/* This is, and always will be, the on-disk version of mddb_rb */ +typedef struct mddb_rb32 { + uint_t rb_magic; + uint_t rb_revision; + uint_t rb_checksum; + uint_t rb_checksum_fiddle; + uint_t rb_private; + uint32_t rb_userdata; + uint_t rb_commitcnt; + uint_t rb_spare[1]; + struct timeval32 rb_timestamp; + int rb_data[1]; +} mddb_rb32_t; + +/* + * directory entries + */ +typedef struct mddb_optinfo { + int o_li; + int o_flags; +} mddb_optinfo_t; + +/* Old definition of mddb_de, for 32-bit apps and libraries */ +typedef struct mddb_de { + struct mddb_de *de_next; + mddb_rb_t *de_rb; + mddb_recid_t de_recid; + mddb_type_t de_type1; + uint_t de_type2; + uint_t de_reqsize; + uint_t de_recsize; + mddb_block_t de_blkcount; + uint_t de_flags; + mddb_optinfo_t de_optinfo[2]; + mddb_block_t de_blks[1]; +} mddb_de_t; + +/* + * In core version of mddb_de, includes pointer for mddb_rb32_t user data + * mddb_rb32_t is used incore + */ +typedef struct mddb_de_ic { + void *de_rb_userdata; + void *de_rb_userdata_ic; + uint_t de_owner_nodeid; + struct mddb_de_ic *de_next; + mddb_rb32_t *de_rb; + mddb_recid_t de_recid; + mddb_type_t de_type1; + uint_t de_type2; + size_t de_reqsize; + size_t de_icreqsize; + size_t de_recsize; + uint_t de_blkcount; + uint_t de_flags; + mddb_optinfo_t de_optinfo[2]; + mddb_block_t de_blks[1]; +} mddb_de_ic_t; + +typedef struct mddb_db { + uint_t db_magic; + uint_t db_revision; + uint_t db_checksum; + mddb_block_t db_blknum; + struct mddb_db *db_next; + mddb_block_t db_nextblk; + struct timeval32 db_timestamp; + uint_t db_recsum; +#ifdef _KERNEL + mddb_de_ic_t *db_firstentry; +#else + mddb_de_t *db_firstentry; +#endif +} mddb_db_t; + +/* + * This is, and always will be, the on-disk version of mddb_de + * When mddb_de32 is read in it is converted into mddb_de_ic + */ +typedef struct mddb_de32 { + uint32_t de32_next; + uint32_t de32_rb; + mddb_recid_t de32_recid; + mddb_type_t de32_type1; + uint_t de32_type2; + uint_t de32_reqsize; + uint_t de32_recsize; + mddb_block_t de32_blkcount; + uint_t de32_flags; + mddb_optinfo_t de32_optinfo[2]; + mddb_block_t de32_blks[1]; +} mddb_de32_t; + +/* + * This is, and always will be, the on-disk version of mddb_db + * When mddb_db32 is read in it is converted into mddb_db + * To minimize impact on mddb format mddb_db fileds remain intact + */ +typedef struct mddb_db32 { + uint_t db32_magic; + uint_t db32_revision; + uint_t db32_checksum; + mddb_block_t db32_blknum; + uint32_t db32_next; + mddb_block_t db32_nextblk; + struct timeval32 db32_timestamp; + uint_t db32_recsum; + uint32_t db32_firstentry; +} mddb_db32_t; + +#define de32tode(from, to) \ + { \ + int i; \ + to->de_rb_userdata = NULL; \ + to->de_owner_nodeid = MD_MN_INVALID_NID; \ + to->de_next = (struct mddb_de_ic *)(uintptr_t)from->de32_next; \ + to->de_rb = (mddb_rb32_t *)(uintptr_t)from->de32_rb; \ + to->de_recid = from->de32_recid; \ + to->de_type1 = from->de32_type1; \ + to->de_type2 = from->de32_type2; \ + to->de_reqsize = from->de32_reqsize; \ + to->de_recsize = from->de32_recsize; \ + to->de_blkcount = from->de32_blkcount; \ + to->de_flags = from->de32_flags; \ + to->de_optinfo[0] = from->de32_optinfo[0]; \ + to->de_optinfo[1] = from->de32_optinfo[1]; \ + for (i = 0; i < from->de32_blkcount; i++) \ + to->de_blks[i] = from->de32_blks[i]; \ + } + +#define detode32(from, to) \ + { \ + int i; \ + to->de32_next = (uint32_t)(uintptr_t)from->de_next; \ + to->de32_rb = (uint32_t)(uintptr_t)from->de_rb; \ + to->de32_recid = from->de_recid; \ + to->de32_type1 = from->de_type1; \ + to->de32_type2 = from->de_type2; \ + to->de32_reqsize = from->de_reqsize; \ + to->de32_recsize = from->de_recsize; \ + to->de32_blkcount = from->de_blkcount; \ + to->de32_flags = from->de_flags; \ + to->de32_optinfo[0] = from->de_optinfo[0]; \ + to->de32_optinfo[1] = from->de_optinfo[1]; \ + for (i = 0; i < from->de_blkcount; i++) \ + to->de32_blks[i] = from->de_blks[i]; \ + } + +#define db32todb(from, to) \ + to->db_magic = from->db32_magic; \ + to->db_revision = from->db32_revision; \ + to->db_checksum = from->db32_checksum; \ + to->db_blknum = from->db32_blknum; \ + to->db_next = (struct mddb_db *)(uintptr_t)from->db32_next; \ + to->db_nextblk = from->db32_nextblk; \ + to->db_timestamp = from->db32_timestamp; \ + to->db_recsum = from->db32_recsum; \ + to->db_firstentry = (mddb_de_ic_t *)(uintptr_t)from->db32_firstentry; + +#define dbtodb32(from, to) \ + to->db32_magic = from->db_magic; \ + to->db32_revision = from->db_revision; \ + to->db32_checksum = from->db_checksum; \ + to->db32_blknum = from->db_blknum; \ + to->db32_next = (uint32_t)(uintptr_t)from->db_next; \ + to->db32_nextblk = from->db_nextblk; \ + to->db32_timestamp = from->db_timestamp; \ + to->db32_recsum = from->db_recsum; \ + to->db32_firstentry = (uint32_t)(uintptr_t)from->db_firstentry; + +/* + * information about a replica of the data base + */ +typedef struct mddb_ri { + struct mddb_ri *ri_next; + uint_t ri_flags; + uint_t ri_commitcnt; + int ri_transplant; + md_dev64_t ri_dev; + daddr32_t ri_blkno; + char ri_driver[16]; + mddb_mb_ic_t *ri_mbip; + mddb_lb_t *ri_lbp; + mddb_dt_t *ri_dtp; + mddb_did_ic_t *ri_did_icp; + ddi_devid_t ri_devid; + ddi_devid_t ri_old_devid; + char ri_minor_name[MDDB_MINOR_NAME_MAX]; + char ri_devname[MAXPATHLEN]; +} mddb_ri_t; + +typedef struct mddb_bf { + struct mddb_bf *bf_next; + mddb_locator_t *bf_locator; + buf_t bf_buf; +} mddb_bf_t; + +/* + * Information for sets of databases (which include replicas) + */ +#define MDDB_BITSRECID 31 +#define MDDB_SETSHIFT (MDDB_BITSRECID - MD_BITSSET) +#define MDDB_SETMASK (MD_SETMASK << MDDB_SETSHIFT) +#define MDDB_RECIDMASK ((1 << MDDB_SETSHIFT) - 1) + +#define DBSET(id) (((id) & MDDB_SETMASK) >> MDDB_SETSHIFT) +#define DBID(id) ((id) & MDDB_RECIDMASK) +#define MAKERECID(s, i) ((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \ + ((i) & MDDB_RECIDMASK)) + +#define MDDB_PARSE_LOCBLK 0x00000001 +#define MDDB_PARSE_LOCNM 0x00000002 +#define MDDB_PARSE_OPTRECS 0x00000004 +#define MDDB_PARSE_MASK 0x0000000F + + +#define MDDB_BLOCK_PARSE 0x00000001 /* Block sending parse msgs */ +#define MDDB_UNBLOCK_PARSE 0x00000002 /* Unblock sending parse msgs */ + +/* + * We need to keep s_ident and s_inittime 32 bit. They are used in mddb_lb + */ +typedef struct mddb_set { + uint_t s_setno; /* set number */ + uint_t s_sideno; /* side number */ + identifier_t s_ident; /* set identifier */ + char *s_setname; /* set name */ + mddb_mb_ic_t **s_mbiarray; /* master blocks array */ + mddb_db_t *s_dbp; /* directory block */ + mddb_lb_t *s_lbp; /* locator block */ + /* May be cast to mddb_mnlb_t */ + /* if accessing sidenames in */ + /* MN diskset */ + mddb_ln_t *s_lnp; /* locator names block */ + /* May be cast to mddb_mnln_t */ + /* if accessing sidenames in */ + /* MN diskset */ + mddb_dtag_lst_t *s_dtlp; /* List of data tags found */ + mddb_did_ic_t *s_did_icp; /* Device ID incore area */ + mddb_ri_t *s_rip; /* replicas incore list */ + int s_freeblkcnt; /* visable for test code */ + int s_totalblkcnt; /* visable for test code */ + int s_mn_parseflags; /* mddb parse flags for MNset */ + int s_mn_parseflags_sending; /* parse flgs sent to slaves */ + uchar_t *s_freebitmap; /* free blocks bitmap */ + uint_t s_freebitmapsize; /* size of bitmap */ + struct timeval32 s_inittime; /* timestamp set created */ + mddb_recid_t s_zombie; /* zombie record - createrec */ + int s_staledeletes; /* number of stale deleterec */ + int s_optcmtcnt; /* Following are opt. record */ + int s_opthavelck; /* bookkeeping records ... */ + int s_optwantlck; + kcondvar_t s_optwantlck_cv; + int s_optwaiterr; + int s_opthungerr; + kcondvar_t s_opthungerr_cv; + int s_opthavequeuinglck; + int s_optwantqueuinglck; + kcondvar_t s_optqueuing_cv; + ulong_t s_bufmisses; + mddb_bf_t *s_freebufhead; + int s_bufwakeup; + kcondvar_t s_buf_cv; + size_t s_databuffer_size; + void *s_databuffer; + int s_singlelockgotten; + int s_singlelockwanted; + kcondvar_t s_single_thread_cv; + md_hi_arr_t s_med; +} mddb_set_t; + +#ifndef MDDB_FAKE +#ifdef _KERNEL +/* md_mddb.c */ +extern uint_t mddb_lb_did_convert(mddb_set_t *, + uint_t, uint_t *); +extern void mddb_locatorblock2splitname(mddb_ln_t *, + int, side_t, md_splitname *); +extern int mddb_configure(mddb_cfgcmd_t, + struct mddb_config *); +extern mddb_recid_t mddb_getnextrec(mddb_recid_t, + mddb_type_t, uint_t); +extern int mddb_getoptloc(mddb_optloc_t *); +extern void *mddb_getrecaddr(mddb_recid_t); +extern void *mddb_getrecaddr_resize(mddb_recid_t, size_t, + off_t); +extern int mddb_getrecprivate(mddb_recid_t); +extern void mddb_setrecprivate(mddb_recid_t, uint_t); +extern mddb_de_ic_t *mddb_getrecdep(mddb_recid_t); +extern mddb_type_t mddb_getrectype1(mddb_recid_t); +extern int mddb_getrectype2(mddb_recid_t); +extern int mddb_getrecsize(mddb_recid_t); +extern int mddb_commitrec(mddb_recid_t); +extern int mddb_commitrecs(mddb_recid_t *); +extern int mddb_deleterec(mddb_recid_t); +extern mddb_recstatus_t mddb_getrecstatus(mddb_recid_t); +extern mddb_recid_t mddb_createrec(size_t usersize, + mddb_type_t type, uint_t type2, + md_create_rec_option_t option, set_t setno); +extern void mddb_init(void); +extern void mddb_unload(void); +extern void mddb_unload_set(set_t setno); +extern mddb_recid_t mddb_makerecid(set_t setno, mddb_recid_t id); +extern set_t mddb_getsetnum(mddb_recid_t id); +extern char *mddb_getsetname(set_t setno); +extern side_t mddb_getsidenum(set_t setno); +extern int mddb_ownset(set_t setno); +extern int getmed_ioctl(mddb_med_parm_t *medpp, int mode); +extern int setmed_ioctl(mddb_med_parm_t *medpp, int mode); +extern int updmed_ioctl(mddb_med_upd_parm_t *medpp, + int mode); +extern int take_set(mddb_config_t *cp, int mode); +extern int release_set(mddb_config_t *cp, int mode); +extern int gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, + int mode); +extern int usetag_ioctl(mddb_dtag_use_parm_t *dtupp, + int mode); +extern int accept_ioctl(mddb_accept_parm_t *medpp, + int mode); +extern int md_update_locator_namespace(set_t setno, + side_t side, char *dname, char *pname, + md_dev64_t devt); +extern int mddb_validate_lb(set_t setno, int *rmaxsz); +extern int mddb_getinvlb_devid(set_t setno, int count, + int size, char **ctdptr); +extern int md_update_minor(set_t, side_t, mdkey_t); +#ifdef DEBUG +extern void mddb_check(void); +#endif /* DEBUG */ +#endif /* _KERNEL */ + +#else + +caddr_t mddb_fakeit; + +#define md_lb_did_convert(a, b, c) (0) +#define mddb_configure(a, b) (0) +#define mddb_getnextrec(a, b, c) ((mddb_recid_t)0) +#define mddb_getrecaddr(a) (mddb_fakeit) +#define mddb_getrecprivate(a) (0) +#define mddb_setrecprivate(a, b) (0) +#define mddb_getrectype1(a) (0) +#define mddb_getrectype2(a) (0) +#define mddb_getrecsize(a) (0) +#define mddb_commitrec(a) (0) +#define mddb_commitrecs(a) (0) +#define mddb_deleterec(a) (0) +#define mddb_getrecstatus(a) (MDDB_OK) +#define mddb_createrec(s, a, b) (0xffff & (int)(mddb_fakeit = \ + (caddr_t)kmem_zalloc(s, KM_SLEEP))) +#define mddb_unload() (0) + +#endif + +#define MDDB_NOSLEEP 1 +#define MDDB_SLEEPOK 0 + +#define MDDB_NOOLDOK 0x1 +#define MDDB_MUSTEXIST 0x2 +#define MDDB_NOINIT 0x4 +#define MDDB_MULTINODE 0x8 +#define MDDB_MN_STALE 0x10 /* MN set is stale */ + +/* Flags passed to selectreplicas - not a bit mask */ +#define MDDB_SCANALL 1 +#define MDDB_RETRYSCAN 0 +#define MDDB_SCANALLSYNC 2 /* During reconfig, sync up incore */ + /* and ondisk mddb by writing incore */ + /* values to disk. Don't write */ + /* change log records. */ + +/* Flags passed to writestart and writecopy */ +#define MDDB_WRITECOPY_ALL 1 /* Write all incore mddb to disk */ +#define MDDB_WRITECOPY_SYNC 2 /* Write incore mddb to disk except */ + /* - change log records */ + /* - optimized resync records */ + + +#define MDDB_PROBE 1 +#define MDDB_NOPROBE 0 + + +/* + * MN diskset definitions used to determine if a slave can write + * directly to the mddb. ONLY_MASTER only allows the master node + * to write to the mddb. ANY_NODE allows any node to write + * to the mddb. + */ +#define MDDB_WR_ONLY_MASTER 0 +#define MDDB_WR_ANY_NODE 1 + +#define MDDB_L_LOCKED 0x0001 /* this record is locked */ +#define MDDB_L_WANTED 0x0002 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MD_MDDB_H */ |
