summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/sys/lvm/md_mddb.h
blob: 0668d8c023c64d73b4f113d06ee70734d2e29930 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _SYS_MD_MDDB_H
#define	_SYS_MD_MDDB_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/buf.h>

#ifdef	__cplusplus
extern "C" {
#endif

#if 0 /* DRP FOR DEBUGGING */
#define	MDDB_FAKE
#endif

/*
 * Private flags (bit mask).  Each MD_PRV_PEND* value is MD_PRV_GOTIT
 * combined with exactly one of the pending-operation bits.
 */
#define	MD_PRV_GOTIT		0x0001	/* Been snarfed */
#define	MD_PRV_DELETE		0x0002	/* Record pending to be deleted */
#define	MD_PRV_COMMIT		0x0004	/* Record pending to be committed */
#define	MD_PRV_CLEANUP		0x0008	/* Record pending to be cleaned up */
#define	MD_PRV_CONVD		0x0010  /* Record has been converted (32->64) */
#define	MD_PRV_PENDDEL		(MD_PRV_GOTIT | MD_PRV_DELETE)
#define	MD_PRV_PENDCOM		(MD_PRV_GOTIT | MD_PRV_COMMIT)
#define	MD_PRV_PENDCLEAN	(MD_PRV_GOTIT | MD_PRV_CLEANUP)


/* MDDB error codes; all values are negative. */
#define	MDDB_E_INVALID	(-1)	/* an invalid argument was passed */
#define	MDDB_E_EXISTS	(-2)	/* doing an operation a 2nd time which can */
				/*	only be done once */
#define	MDDB_E_MASTER	(-3)	/* problem occurred accessing master block */
				/*	returned from NEW_DEV	*/
#define	MDDB_E_TOOSMALL	(-4)	/* device is not large enough */
#define	MDDB_E_NORECORD	(-5)	/* record does not exist */
				/*
				 *	returned from:	mddb_getnextrec
				 *			mddb_getrecsize
				 *			mddb_commitrec
				 *			mddb_commitrecs
				 *			mddb_deleterec
				 */
#define	MDDB_E_NOSPACE	(-6)	/* no space to create record */
#define	MDDB_E_NOTNOW	(-7)	/* do not presently have enough resources */
				/*	to perform requested operation */
#define	MDDB_E_NODB	(-8)	/* no database exists */
#define	MDDB_E_NOTOWNER (-9)	/* have not been told to grab this set */
#define	MDDB_E_STALE	(-10)	/* database is stale */
#define	MDDB_E_TOOFEW	(-11)	/* not enough replicas available */
#define	MDDB_E_TAGDATA	(-12)	/* tagged data detected */
#define	MDDB_E_ACCOK	(-13)	/* 50/50 mode */
#define	MDDB_E_NTAGDATA	(-14)	/* tagop try, no tag data */
#define	MDDB_E_ACCNOTOK	(-15)	/* accop try, no accept possible */
#define	MDDB_E_NOLOCBLK	(-16)	/* No valid locators found */
#define	MDDB_E_NOLOCNMS	(-17)	/* No valid locator name information */
#define	MDDB_E_NODIRBLK	(-18)	/* No directory blocks found */
#define	MDDB_E_NOTAGREC	(-19)	/* No tag record blocks found */
#define	MDDB_E_NOTAG	(-20)	/* No matching tag record found */
#define	MDDB_E_NODEVID	(-21)	/* No device id found */

/* Replica size limits, expressed in blocks. */
#define	MDDB_MINBLKS		16	/* enough for a few metadevices */
#define	MDDB_MAXBLKS		8192	/* size of free bit map */
					/* (must be divisible by 8) */
#define	MDDB_MN_MINBLKS		32768	/* Multinode metadb minimum size */
					/* 16MB */
#define	MDDB_MN_MAXBLKS		524288	/* size of free bit map */
					/* (must be divisible by 8) */
					/* 256MB */

/*
 * MDDB_C_* bit flags.
 * NOTE(review): the first three presumably mirror MDDB_E_STALE,
 * MDDB_E_TOOFEW and MDDB_E_NOTOWNER -- confirm against the users.
 */
#define	MDDB_C_STALE		0x0001
#define	MDDB_C_TOOFEW		0x0002
#define	MDDB_C_NOTOWNER		0x0004
#define	MDDB_C_SET_MN_STALE	0x0008	/* Set MN set to stale */
#define	MDDB_C_IMPORT		0x0010

/*
 * Defines used to set/reset the new master flag in the set structure.
 * Used during the reconfig cycle to determine quickly if there is
 * a new master for the set.
 */
#define	MDDB_NM_SET		0x0001
#define	MDDB_NM_RESET		0x0002
#define	MDDB_NM_GET		0x0004

/* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
#define	MDDB_DID_EXISTS		0x0001	/* Device ID exists */
#define	MDDB_DID_VALID		0x0002	/* Device ID valid on current system */
#define	MDDB_DID_UPDATED	0x0004  /* locator/sidelocator info updated */

/* Definitions of flag in Locator Block - mddb_lb */
#define	MDDB_DEVID_STYLE	0x0001	/* Locator Block in Device ID format */
#define	MDDB_MNSET		0x0002  /* MDDB is for a multi-node set */


#define	MDDB_MAX_PATCH	25		/* number of locations that */
					/*	can be patched in etc/system */

/*
 * Set struct used by all parts of the driver, to store anchor pointers.
 *
 * Locks associated with the fields in this structure:
 *
 * Some of the fields are accessible by both the single threaded ioctl
 * thread and internal threads such as resync, hotsparing, etc.  In this
 * case additional protection is needed.  For example, s_db is additionally
 * protected by s_dbmx, and s_un, s_ui are protected by
 * md_unit_array_rw.lock.  s_nm, s_nmid, s_did_nm, s_did_nmid and s_dtp are
 * protected by nm_lock.  The rest of the fields are protected by md_mx.
 * Two fields, s_un_next and s_un_avail, were introduced by the friendly
 * name project and are ONLY accessible via a single threaded ioctl thread
 * which is already protected by the ioctl lock, so there is no need to add
 * extra protection to them.  However, if in the future they become
 * accessible by other internal threads, then additional protection such
 * as the md_mx lock is highly recommended.
 *
 */
typedef struct md_set {
	uint_t		s_status;	/* set status */
	void		**s_ui;		/* set unit incore anchor */
	void		**s_un;		/* set unit anchor */
	void		*s_hsp;		/* set Hot Spare Pool anchor */
	void		*s_hs;		/* set Hot Spare anchor */
	void		*s_db;		/* set MDDB anchor */
	kmutex_t	s_dbmx;		/* set MDDB mutex */
	void		*s_nm;		/* set namespace anchor */
	mddb_recid_t	s_nmid;		/* set namespace anchor record */
	void		*s_did_nm;	/* set device id namespace anchor */
	mddb_recid_t	s_did_nmid;	/* set device id namespace anchor rec */
	void		*s_dtp;		/* set data tag rec */
	int		s_am_i_master;	/* incore master flag for this node */
	md_mn_nodeid_t	s_nodeid;	/* nodeid of this node - for MN sets */
	uint_t		s_rcnt;		/* incore resync count for set */
	unit_t		s_un_next;	/* s_un scan starts here */
	unit_t		s_un_avail;	/* number of avail slots */
} md_set_t;


#define	MDDB_MAGIC_MB	0x6d646d62	/* magic number for master blocks */
#define	MDDB_MAGIC_DB	0x6d646462	/* magic number for directory blocks */
#define	MDDB_MAGIC_RB	0x6d647262	/* magic number for record blocks */
#define	MDDB_MAGIC_LB	0x6d646c62	/* magic number for locator blocks */
#define	MDDB_MAGIC_LN	0x6d646c6e	/* magic number for locator names */
#define	MDDB_MAGIC_DT	0x6d646474	/* magic number for data tag */
#define	MDDB_MAGIC_DI	0x6d646469	/* magic number for device ID block */
#define	MDDB_MAGIC_DU	0x6d646475	/* magic num for dummy mb */
#define	MDDB_MAGIC_DE	0x6d646465	/* magic num for mb devid */

#define	MDDB_GLOBAL_XOR 1234567890

#define	MDDB_REV_MAJOR  (uint_t)0xff00
#define	MDDB_REV_MINOR  (uint_t)0x00ff

/*
 * MDDB_REV_MNMB:
 * If a MN diskset, master block revision is set to MDDB_REV_MNMB.
 * Even though the master block structure is no different
 * for a MN set, setting the revision field to a different
 * number keeps any pre-MN_diskset code from accessing
 * this diskset.  It also allows for an early determination
 * of a MN diskset when reading in from disk so that the
 * proper size locator block and locator names structure
 * can be read in thus saving time on diskset startup.
 * Since there is no change in master block structure, only the
 * MDDB_REV_MINOR portion of the revision was incremented.
 *
 * MDDB_REV_MNLB:
 * If a MN diskset, the locator block structure is a different size in
 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
 * with any nodeid (sideno) allowed.
 * The revision is set to MDDB_REV_MNLB which is a change of the
 * MDDB_REV_MAJOR portion of the revision.
 *
 * MDDB_REV_MNLN:
 * If a MN diskset, the locator names structure is a different size in
 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
 * with any nodeid (sideno) allowed.
 * The revision is set to MDDB_REV_MNLN which is a change of the
 * MDDB_REV_MAJOR portion of the revision.
 *
 * The record blocks have two binary properties.  A record block can
 * represent either a 32 or 64 bit unit.  A record block can also represent
 * a traditionally named unit or a friendly named unit.  Thus, there are
 * four minor revisions of record block.
 *
 *		Traditional		Friendly
 *		Name			Name
 *		-----------		--------
 * 32 bit	MDDB_REV_RB		MDDB_REV_RBFN
 * 64 bit	MDDB_REV_RB64		MDDB_REV_RB64FN
 */

#define	MDDB_REV_MB	(uint_t)0x0201
#define	MDDB_REV_MNMB	(uint_t)0x0202
#define	MDDB_REV_DB	(uint_t)0x0201
#define	MDDB_REV_LB	(uint_t)0x0500
#define	MDDB_REV_MNLB	(uint_t)0x0600
#define	MDDB_REV_LN	(uint_t)0x0100
#define	MDDB_REV_MNLN	(uint_t)0x0300
#define	MDDB_REV_RB	(uint_t)0x0200
#define	MDDB_REV_RB64	(uint_t)0x0201
#define	MDDB_REV_RBFN	(uint_t)0x0202
#define	MDDB_REV_RB64FN	(uint_t)0x0203
#define	MDDB_REV_DT	(uint_t)0x0100
#define	MDDB_REV_DI	(uint_t)0x0100

/*
 * Transfer record block friendly name status to unit/hs structure.
 *
 *	rbv	record block revision; evaluated exactly once
 *	unv	modifiable lvalue holding the unit/hs revision flags
 *
 * For the traditional-name revisions (MDDB_REV_RB, MDDB_REV_RB64) the
 * MD_FN_META_DEV bit is cleared in unv; for the friendly-name revisions
 * (MDDB_REV_RBFN, MDDB_REV_RB64FN) it is set.  Any other revision leaves
 * unv untouched.
 *
 * The expansion is a bare switch statement, not an expression; use it
 * only where a statement is allowed.
 */
#define	MDDB_NOTE_FN(rbv, unv)	switch (rbv) { \
				case MDDB_REV_RB: \
				case MDDB_REV_RB64: \
					(unv) &= ~(MD_FN_META_DEV); \
					break; \
				case MDDB_REV_RBFN: \
				case MDDB_REV_RB64FN: \
					(unv) |= (MD_FN_META_DEV); \
					break;	\
				default: \
					break; \
				}

#define	MDDB_BSIZE	(uint_t)DEV_BSIZE	/* database block size, bytes */
#define	MDDB_PREFIXCNT	10	/* entries in ln_prefixes[] */
#define	MDDB_DRVNMCNT   10	/* entries in lb_drvnm[] */

/* Block number / block count type used by the database structures. */
typedef int	mddb_block_t;

#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack(4)
#endif
/*
 * Locator name suffix entry for a MN diskset: a name suffix together
 * with the side number it belongs to (see mddb_mnln_t).
 */
typedef struct md_mnname_suffix {
	md_name_suffix	mn_ln_suffix;
	uint_t		mn_ln_sideno;
} md_mnname_suffix_t;

/*
 * Locator names structure for traditional and local disksets.
 * The first index of ln_suffixes is the side number.
 * NOTE(review): ln_magic/ln_revision presumably hold MDDB_MAGIC_LN and
 * MDDB_REV_LN -- confirm against the code that writes this block.
 */
typedef	struct mddb_ln {
	int			ln_magic;
	uint_t			ln_revision;
	uint_t			ln_checksum;
	struct timeval32	ln_timestamp;
	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
	/* Don't change array sizes without changing RNDUP_BLKCNT */
	md_name_suffix		ln_suffixes[MD_MAXSIDES][MDDB_NLB];
} mddb_ln_t;

/*
 * Locator name structure for MN diskset.  Same as for traditional
 * and local diskset except that more sides are supported and the
 * side number can be any number since the side number is stored
 * in the ln_mnsuffixes structure instead of being used as an index
 * into that array.  This means that the whole array may need to be
 * searched in order to find the correct information given a side number.
 *
 * The leading fields (ln_magic through ln_prefixes) are declared in the
 * same order and with the same types as in mddb_ln_t.
 */
typedef	struct mddb_mnln {
	int			ln_magic;
	uint_t			ln_revision;
	uint_t			ln_checksum;
	struct timeval32	ln_timestamp;
	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
	/* Don't change array sizes without changing MDDB_MNLNCNT */
	md_mnname_suffix_t	ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnln_t;

/*
 * RNDUP_BLKCNT(sz, delta)
 * Number of MDDB_BSIZE blocks (rounded up) needed to hold sz bytes after
 * subtracting delta bytes for each of the (MD_MAXSIDES - 1) * MDDB_NLB
 * per-side array entries.  With delta == 0 the full structure is counted;
 * with delta == sizeof (one entry) only a single side's entries are
 * counted.
 */
#define	RNDUP_BLKCNT(sz, delta)	(((sz) - \
				    ((delta) * \
				    ((MD_MAXSIDES  - 1) * MDDB_NLB)) + \
				    MDDB_BSIZE - 1) / MDDB_BSIZE)
/* Blocks for a full mddb_ln_t (all MD_MAXSIDES sides). */
#define	MDDB_LNCNT		RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
/* Blocks for a mddb_ln_t carrying the suffixes of a single side only. */
#define	MDDB_LOCAL_LNCNT	RNDUP_BLKCNT(sizeof (mddb_ln_t), \
				    sizeof (md_name_suffix))

/* Blocks for a full mddb_mnln_t, rounded up. */
#define	MDDB_MNLNCNT		((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
				    / MDDB_BSIZE)

/*
 * Data tag block: a small header followed by the data tag itself.
 * NOTE(review): dt_mag/dt_rev/dt_cks are presumably the magic number
 * (MDDB_MAGIC_DT), revision (MDDB_REV_DT) and checksum -- confirm.
 */
typedef struct mddb_dt {
	uint_t		dt_mag;
	uint_t		dt_rev;
	uint_t		dt_cks;
	mddb_dtag_t	dt_dtag;
} mddb_dt_t;

/* sizeof (mddb_dt_t) rounded up to a multiple of MDDB_BSIZE, in bytes */
#define	MDDB_DT_BYTES	(roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
/* MDDB_DT_BYTES converted to disk blocks with btodb() */
#define	MDDB_DT_BLOCKS	(btodb(MDDB_DT_BYTES))

/*
 * Identifier stored in the locator block (lb_ident) and in the set
 * (s_ident): either a serial string or a 32-bit creation timestamp,
 * sharing the same storage.
 */
typedef union identifier {
	char			serial[MDDB_SN_LEN];
	struct timeval32	createtime;
} identifier_t;

/*
 * One locator entry of the locator block (lb_locators[]): a 32-bit
 * device number, a 32-bit block number and flags.
 */
typedef struct mddb_locator {
	dev32_t		l_dev;
	daddr32_t	l_blkno;
	int		l_flags;
} mddb_locator_t;

/*
 * Per-side locator entry for traditional/local disksets
 * (lb_sidelocators[side][]).
 * NOTE(review): l_drvnm_index presumably indexes lb_drvnm[] -- confirm.
 */
typedef struct mddb_sidelocator {
	uchar_t		l_drvnm_index;
	minor_t		l_mnum;
} mddb_sidelocator_t;

/*
 * Per-side locator entry for MN disksets (lb_mnsidelocators[][]).
 * Unlike mddb_sidelocator_t, the side number is stored in the entry
 * (mnl_sideno) rather than implied by the array index.
 */
typedef struct mddb_mnsidelocator {
	uchar_t		mnl_drvnm_index;
	minor_t		mnl_mnum;
	uint_t		mnl_sideno;
} mddb_mnsidelocator_t;

/*
 * Driver name entry of the locator block (lb_drvnm[]): a length byte
 * followed by a fixed-size character buffer.
 */
typedef struct mddb_drvnm {
	uchar_t		dn_len;
	char		dn_data[MD_MAXDRVNM];
} mddb_drvnm_t;

/*
 * Locator Block Device ID Information
 * Several device IDs may share one disk block in an effort to
 * conserve used replica space.
 * info_flags takes the MDDB_DID_* values.
 */
typedef struct mddb_did_info {
	uint_t		info_flags;	/* MDDB Device ID flags */
	uint_t		info_firstblk;	/* Device ID Start Block */
	uint_t		info_blkcnt;	/* Device ID Block Count */
	uint_t		info_offset;	/* Device ID offset w/i Block */
	uint_t		info_length;	/* Device ID Length */
	uint_t		info_checksum;	/* Device ID Checksum */
	char		info_minor_name[32]; /* Minor name of lb dev */
} mddb_did_info_t;

/*
 * Device ID block: verification header plus one mddb_did_info_t per
 * locator slot (MDDB_NLB entries).
 */
typedef struct mddb_did_blk {
	int		blk_magic;	/* used for verification */
	uint_t		blk_revision;	/* used for verification */
	int		blk_checksum;	/* used for verification */
	uint_t		blk_commitcnt;	/* matches LB's commitcnt */
	mddb_did_info_t	blk_info[MDDB_NLB];
} mddb_did_blk_t;
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif

/* sizeof (mddb_did_blk_t) rounded up to a multiple of MDDB_BSIZE, in bytes */
#define	MDDB_DID_BYTES	(roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
/* MDDB_DID_BYTES converted to disk blocks with btodb() */
#define	MDDB_DID_BLOCKS	(btodb(MDDB_DID_BYTES))

/*
 * Device ID Disk Blocks.
 * Incore linked list of disk blocks containing device IDs.
 * The list is built when reading in the mddb_did_blk structure and
 * when reading in the actual disk blocks containing device IDs.
 * This list is used to easily write out all disk blocks containing
 * device IDs.
 * The list is singly linked through db_next.
 */
typedef struct mddb_did_db {
	uint_t		db_firstblk;	/* Disk Block's logical addr */
	uint_t		db_blkcnt;	/* Contig Disk Block Count */
	caddr_t		db_ptr;		/* Ptr to incore Block(s) */
	struct mddb_did_db	*db_next;	/* Ptr to next in list */
} mddb_did_db_t;

/*
 * Device ID Free List.
 * Incore linked list of free space in disk blocks containing device IDs.
 * Used to manage placement of device IDs in disk blocks.
 * All disk blocks on the free list are also in the linked list of disk
 * blocks containing device IDs (mddb_did_db_t).
 * The list is singly linked through free_next.
 */
typedef struct mddb_did_free {
	uint_t			free_blk;	/* Disk Block's logical addr */
	uint_t			free_offset;	/* offset of free space */
	uint_t			free_length;	/* length of free space */
	struct mddb_did_free	*free_next;	/* Ptr to next in list */
} mddb_did_free_t;

/*
 * Device ID Incore Area
 *    Contains pointers to the Device ID block, the Device ID Disk Block
 *    list and the Device ID Free List.
 *    Also contains an incore array of pointers to device IDs.  The
 *    pointers point into the device ID Disk Block list and are used as a
 *    shortcut to find incore device IDs.
 */
typedef struct mddb_did_ic {
	mddb_did_blk_t	*did_ic_blkp;	/* Device ID block */
	mddb_did_db_t	*did_ic_dbp;	/* Device ID Disk Block list */
	mddb_did_free_t	*did_ic_freep;	/* Device ID Free List */
	ddi_devid_t	did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
} mddb_did_ic_t;

/*
 * Locator Block (LB):
 *	- Are fixed size, but the size is different
 *		for local/shared set db replicas.
 *	- All LB's start at logical block 0.
 *	- After a replica quorum is found, there
 *	  is only one incore copy of the LB.
 *	- LB's are only written when replicas are added, deleted, or errored.
 *	- LB's provide information about other replicas and their state.
 *
 * lb_flags takes the MDDB_DEVID_STYLE / MDDB_MNSET values.
 */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack(4)
#endif
typedef struct mddb_lb {
	int			lb_magic;	/* used for verification */
	uint_t			lb_revision;	/* used for verification */
	int			lb_checksum;	/* used for verification */
	uint_t			lb_commitcnt;	/* IMPORTANT */
	struct timeval32	lb_timestamp;	/* informative only */
	int			lb_loccnt;	/* used for verification */
	identifier_t		lb_ident;	/* used for verification */
	uint_t			lb_flags;	/* flags describing LB */
	uint_t			lb_spare[8];	/* Spare/Pad */
	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
	struct timeval32	lb_inittime;	/* creation of database */
	set_t			lb_setno;	/* used for verification */
	mddb_block_t		lb_blkcnt;	/* used for verification */
	mddb_block_t		lb_lnfirstblk;
	mddb_block_t		lb_lnblkcnt;
	mddb_block_t		lb_dbfirstblk;
	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
	mddb_locator_t		lb_locators[MDDB_NLB];
	/* Don't change array sizes without changing RNDUP_BLKCNT */
	mddb_sidelocator_t	lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
} mddb_lb_t;
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif

/*
 * Locator block structure for MN diskset.  Same as for traditional
 * and local diskset except that more sides are supported and the
 * side number can be any number since the side number is stored
 * in the lb_mnsidelocators structure instead of being used as an index
 * into that array.  This means that the whole array may need to be
 * searched in order to find the correct information given a side number.
 *
 * All fields up to and including lb_locators are declared in the same
 * order and with the same types as in mddb_lb_t.
 */
typedef struct mddb_mnlb {
	int			lb_magic;	/* used for verification */
	uint_t			lb_revision;	/* used for verification */
	int			lb_checksum;	/* used for verification */
	uint_t			lb_commitcnt;	/* IMPORTANT */
	struct timeval32	lb_timestamp;	/* informative only */
	int			lb_loccnt;	/* used for verification */
	identifier_t		lb_ident;	/* used for verification */
	uint_t			lb_flags;	/* flags describing LB */
	uint_t			lb_spare[8];	/* Spare/Pad */
	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
	struct timeval32	lb_inittime;	/* creation of database */
	set_t			lb_setno;	/* used for verification */
	mddb_block_t		lb_blkcnt;	/* used for verification */
	mddb_block_t		lb_lnfirstblk;
	mddb_block_t		lb_lnblkcnt;
	mddb_block_t		lb_dbfirstblk;
	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
	mddb_locator_t		lb_locators[MDDB_NLB];
	/* Don't change array sizes without changing MDDB_MNLBCNT */
	mddb_mnsidelocator_t	lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnlb_t;


/* Blocks for a full mddb_lb_t (all MD_MAXSIDES sides). */
#define	MDDB_LBCNT		RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
/* Blocks for a mddb_lb_t carrying the sidelocators of a single side only. */
#define	MDDB_LOCAL_LBCNT	RNDUP_BLKCNT(sizeof (mddb_lb_t), \
				    sizeof (mddb_sidelocator_t))

/* Blocks for a full mddb_mnlb_t, rounded up. */
#define	MDDB_MNLBCNT		((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
				    / MDDB_BSIZE)

/*
 * Block map entry embedded in the master block (mb_blkmap), which
 * provides the logical to physical block translation.
 * NOTE(review): presumably a run of m_consecutive blocks starting at
 * physical block m_firstblk -- confirm against the mapping code.
 */
typedef struct mddb_map {
	daddr32_t		m_consecutive;
	daddr32_t		m_firstblk;
} mddb_map_t;

/*
 * Master block(s) (MB)
 * 	- Are written by userland; Never by the driver!
 *	- Each replica has its own master blocks,
 *		the master block(s) are not shared.
 *	- MB's are not in the logical block address space of the database.
 *	- MB's are a fixed size record (MDDB_BSIZE)
 *	- MB's provide the logical to physical block translation,
 *		for their replica.
 *
 * mb_next is declared as a uint32_t under _LP64 and as a pointer
 * otherwise, so the field is not a pointer-sized member in 64-bit
 * builds.
 */
typedef	struct mddb_mb {
	int			mb_magic;	/* used for verification */
	uint_t			mb_revision;	/* used for verification */
	uint_t			mb_checksum;	/* used for verification */
#ifdef _LP64
	uint32_t		mb_next;	/* incore to next mb */
#else
	struct mddb_mb		*mb_next;	/* incore to next mb */
#endif	/* _LP64 */
	daddr32_t		mb_nextblk;	/* block # for next mb */
	md_timeval32_t		mb_timestamp;	/* timestamp */
	daddr32_t		mb_blkcnt;	/* size of blkmap */
	daddr32_t		mb_blkno;	/* physical loc. for this MB */
	set_t			mb_setno;	/* used for verification */
	struct timeval32	mb_setcreatetime; /* set creation timestamp */
	int			spares[7];
	mddb_map_t		mb_blkmap;	/* logical->physical blk map */
	int			mb_devid_magic;	/* verify devid in mb */
	short			mb_devid_len;	/* len of following devid */
	char			mb_devid[1];	/* devid byte array */
} mddb_mb_t;

/*
 * In-core version of mddb_mb.  The mddb_mb is known to be 512 bytes on
 * disk, so this structure occupies 512 + sizeof (struct mddb_mb_ic *)
 * bytes: the in-core link pointer followed by the master block image.
 * MDDB_IC_BSIZE is that size.
 */
#define	MDDB_IC_BSIZE	(MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
typedef struct mddb_mb_ic {
	struct mddb_mb_ic 	*mbi_next;	/* next in-core master block */
	struct mddb_mb		mbi_mddb_mb;	/* the master block itself */
} mddb_mb_ic_t;


/*
 * There can be no address in a record block.  The checksum must
 * stay the same wherever the record is in memory.  Many
 * things depend on this.  Also, the timestamp is the time the
 * record was committed, not the time it was written to a particular
 * device.
 *
 * Old definition of mddb_rb, for 32-bit apps and libraries.
 * It differs from mddb_rb32_t only in rb_userdata, which is a pointer
 * here.
 */
typedef struct mddb_rb {
	uint_t			rb_magic;
	uint_t			rb_revision;
	uint_t			rb_checksum;
	uint_t			rb_checksum_fiddle;
	uint_t			rb_private;
	void			*rb_userdata;
	uint_t			rb_commitcnt;
	uint_t			rb_spare[1];
	struct timeval32	rb_timestamp;
	int			rb_data[1];
} mddb_rb_t;

/*
 * This is, and always will be, the on-disk version of mddb_rb.
 * rb_userdata is a fixed 32-bit integer here rather than a pointer.
 */
typedef struct mddb_rb32 {
	uint_t			rb_magic;
	uint_t			rb_revision;
	uint_t			rb_checksum;
	uint_t			rb_checksum_fiddle;
	uint_t			rb_private;
	uint32_t		rb_userdata;
	uint_t			rb_commitcnt;
	uint_t			rb_spare[1];
	struct timeval32	rb_timestamp;
	int			rb_data[1];
} mddb_rb32_t;

/*
 * directory entries
 *
 * Each directory entry carries two mddb_optinfo_t slots (de_optinfo[2]).
 * NOTE(review): o_li is presumably a locator index -- confirm.
 */
typedef struct mddb_optinfo {
	int		o_li;
	int		o_flags;
} mddb_optinfo_t;

/*
 * Old definition of mddb_de, for 32-bit apps and libraries.
 * This is the type of db_firstentry in mddb_db_t when _KERNEL is not
 * defined.
 */
typedef struct mddb_de {
	struct mddb_de	*de_next;
	mddb_rb_t	*de_rb;
	mddb_recid_t	de_recid;
	mddb_type_t	de_type1;
	uint_t		de_type2;
	uint_t		de_reqsize;
	uint_t		de_recsize;
	mddb_block_t	de_blkcount;
	uint_t		de_flags;
	mddb_optinfo_t	de_optinfo[2];
	mddb_block_t	de_blks[1];
} mddb_de_t;

/*
 * In-core version of mddb_de; includes pointers for the mddb_rb32_t user
 * data.  mddb_rb32_t is used in core (de_rb points to one).
 *
 * The size fields (de_reqsize, de_icreqsize, de_recsize) are size_t here,
 * whereas the on-disk mddb_de32_t stores uint_t sizes.  The de32tode()
 * and detode32() macros convert between the two forms.
 */
typedef struct mddb_de_ic {
	void			*de_rb_userdata;
	void			*de_rb_userdata_ic;
	uint_t			de_owner_nodeid;
	struct mddb_de_ic	*de_next;
	mddb_rb32_t		*de_rb;
	mddb_recid_t		de_recid;
	mddb_type_t		de_type1;
	uint_t			de_type2;
	size_t			de_reqsize;
	size_t			de_icreqsize;
	size_t			de_recsize;
	uint_t			de_blkcount;
	uint_t			de_flags;
	mddb_optinfo_t		de_optinfo[2];
	mddb_block_t		de_blks[1];
} mddb_de_ic_t;

/*
 * Directory block, in-core form.  The on-disk form is mddb_db32_t; the
 * db32todb() and dbtodb32() macros convert between the two.
 * db_firstentry points to mddb_de_ic_t entries in the kernel and to
 * mddb_de_t entries otherwise.
 */
typedef struct mddb_db {
	uint_t			db_magic;
	uint_t			db_revision;
	uint_t			db_checksum;
	mddb_block_t		db_blknum;
	struct mddb_db		*db_next;
	mddb_block_t		db_nextblk;
	struct timeval32	db_timestamp;
	uint_t			db_recsum;
#ifdef _KERNEL
	mddb_de_ic_t		*db_firstentry;
#else
	mddb_de_t		*db_firstentry;
#endif
} mddb_db_t;

/*
 * This is, and always will be, the on-disk version of mddb_de.
 * When mddb_de32 is read in, it is converted into mddb_de_ic.
 * The two pointer members of mddb_de are stored as 32-bit integers
 * (de32_next, de32_rb).
 */
typedef struct mddb_de32 {
	uint32_t	de32_next;
	uint32_t	de32_rb;
	mddb_recid_t	de32_recid;
	mddb_type_t	de32_type1;
	uint_t		de32_type2;
	uint_t		de32_reqsize;
	uint_t		de32_recsize;
	mddb_block_t	de32_blkcount;
	uint_t		de32_flags;
	mddb_optinfo_t	de32_optinfo[2];
	mddb_block_t	de32_blks[1];
} mddb_de32_t;

/*
 * This is, and always will be, the on-disk version of mddb_db.
 * When mddb_db32 is read in, it is converted into mddb_db.
 * To minimize impact on the mddb format, the mddb_db fields remain intact.
 * The two pointer members of mddb_db are stored as 32-bit integers
 * (db32_next, db32_firstentry).
 */
typedef struct mddb_db32 {
	uint_t			db32_magic;
	uint_t			db32_revision;
	uint_t			db32_checksum;
	mddb_block_t		db32_blknum;
	uint32_t		db32_next;
	mddb_block_t		db32_nextblk;
	struct timeval32	db32_timestamp;
	uint_t			db32_recsum;
	uint32_t		db32_firstentry;
} mddb_db32_t;

/*
 * de32tode(from, to)
 *
 * Convert an on-disk directory entry into its in-core form.
 *
 *	from	pointer to the source mddb_de32_t
 *	to	pointer to the destination mddb_de_ic_t; its de_blks[]
 *		storage must have room for from->de32_blkcount entries
 *
 * de_rb_userdata is set to NULL and de_owner_nodeid to
 * MD_MN_INVALID_NID; the 32-bit de32_next/de32_rb values are widened
 * into pointers.  de_rb_userdata_ic and de_icreqsize are NOT touched by
 * this macro.
 *
 * Both arguments are parenthesized in the expansion, so expressions such
 * as &array[n] are safe, and the loop counter has a macro-private name so
 * it cannot capture a caller variable.  Each argument is evaluated several
 * times; do not pass expressions with side effects.
 */
#define	de32tode(from, to) \
	{ \
	int de32tode_i; \
	(to)->de_rb_userdata = NULL; \
	(to)->de_owner_nodeid = MD_MN_INVALID_NID; \
	(to)->de_next = (struct mddb_de_ic *)(uintptr_t)(from)->de32_next; \
	(to)->de_rb = (mddb_rb32_t *)(uintptr_t)(from)->de32_rb; \
	(to)->de_recid =  (from)->de32_recid; \
	(to)->de_type1 =  (from)->de32_type1; \
	(to)->de_type2 =  (from)->de32_type2; \
	(to)->de_reqsize =  (from)->de32_reqsize; \
	(to)->de_recsize =  (from)->de32_recsize; \
	(to)->de_blkcount =  (from)->de32_blkcount; \
	(to)->de_flags =  (from)->de32_flags; \
	(to)->de_optinfo[0] =  (from)->de32_optinfo[0]; \
	(to)->de_optinfo[1] =  (from)->de32_optinfo[1]; \
	for (de32tode_i = 0; de32tode_i < (from)->de32_blkcount; \
	    de32tode_i++) \
		(to)->de_blks[de32tode_i] = (from)->de32_blks[de32tode_i]; \
	}

/*
 * detode32(from, to)
 *
 * Convert an in-core directory entry into its on-disk form.
 *
 *	from	pointer to the source mddb_de_ic_t
 *	to	pointer to the destination mddb_de32_t; its de32_blks[]
 *		storage must have room for from->de_blkcount entries
 *
 * The de_next/de_rb pointers are narrowed to 32-bit integers and the
 * size_t sizes are stored into uint_t fields.  The in-core-only members
 * (de_rb_userdata, de_rb_userdata_ic, de_owner_nodeid, de_icreqsize) have
 * no on-disk counterpart and are not copied.
 *
 * Both arguments are parenthesized in the expansion, so expressions such
 * as &array[n] are safe, and the loop counter has a macro-private name so
 * it cannot capture a caller variable.  Each argument is evaluated several
 * times; do not pass expressions with side effects.
 */
#define	detode32(from, to) \
	{ \
	int detode32_i; \
	(to)->de32_next = (uint32_t)(uintptr_t)(from)->de_next; \
	(to)->de32_rb = (uint32_t)(uintptr_t)(from)->de_rb; \
	(to)->de32_recid =  (from)->de_recid; \
	(to)->de32_type1 =  (from)->de_type1; \
	(to)->de32_type2 =  (from)->de_type2; \
	(to)->de32_reqsize =  (from)->de_reqsize; \
	(to)->de32_recsize =  (from)->de_recsize; \
	(to)->de32_blkcount =  (from)->de_blkcount; \
	(to)->de32_flags =  (from)->de_flags; \
	(to)->de32_optinfo[0] =  (from)->de_optinfo[0]; \
	(to)->de32_optinfo[1] =  (from)->de_optinfo[1]; \
	for (detode32_i = 0; detode32_i < (from)->de_blkcount; \
	    detode32_i++) \
		(to)->de32_blks[detode32_i] = (from)->de_blks[detode32_i]; \
	}

/*
 * db32todb(from, to)
 *
 * Convert an on-disk directory block header into its in-core form.
 *
 *	from	pointer to the source mddb_db32_t
 *	to	pointer to the destination mddb_db_t
 *
 * The 32-bit db32_next/db32_firstentry values are widened into pointers;
 * every other field is copied unchanged.
 *
 * The expansion is a single compound statement (like de32tode), so the
 * macro can be used as the body of an if/else/loop without braces and
 * still performs every assignment.  Both arguments are parenthesized in
 * the expansion and evaluated several times; do not pass expressions with
 * side effects.
 */
#define	db32todb(from, to) \
	{ \
	(to)->db_magic = (from)->db32_magic; \
	(to)->db_revision = (from)->db32_revision; \
	(to)->db_checksum = (from)->db32_checksum; \
	(to)->db_blknum = (from)->db32_blknum; \
	(to)->db_next = (struct mddb_db *)(uintptr_t)(from)->db32_next; \
	(to)->db_nextblk = (from)->db32_nextblk; \
	(to)->db_timestamp = (from)->db32_timestamp; \
	(to)->db_recsum = (from)->db32_recsum; \
	(to)->db_firstentry = \
	    (mddb_de_ic_t *)(uintptr_t)(from)->db32_firstentry; \
	}

/*
 * dbtodb32(from, to)
 *
 * Convert an in-core directory block header into its on-disk form.
 *
 *	from	pointer to the source mddb_db_t
 *	to	pointer to the destination mddb_db32_t
 *
 * The db_next/db_firstentry pointers are narrowed to 32-bit integers;
 * every other field is copied unchanged.
 *
 * The expansion is a single compound statement (like detode32), so the
 * macro can be used as the body of an if/else/loop without braces and
 * still performs every assignment.  Both arguments are parenthesized in
 * the expansion and evaluated several times; do not pass expressions with
 * side effects.
 */
#define	dbtodb32(from, to) \
	{ \
	(to)->db32_magic = (from)->db_magic; \
	(to)->db32_revision = (from)->db_revision; \
	(to)->db32_checksum = (from)->db_checksum; \
	(to)->db32_blknum = (from)->db_blknum; \
	(to)->db32_next = (uint32_t)(uintptr_t)(from)->db_next; \
	(to)->db32_nextblk = (from)->db_nextblk; \
	(to)->db32_timestamp = (from)->db_timestamp; \
	(to)->db32_recsum = (from)->db_recsum; \
	(to)->db32_firstentry = (uint32_t)(uintptr_t)(from)->db_firstentry; \
	}

/*
 * Information about a replica of the database.
 * Entries are chained through ri_next; mddb_set_t.s_rip heads the
 * in-core replica list.
 */
typedef struct mddb_ri {
	struct mddb_ri		*ri_next;
	uint_t			ri_flags;
	uint_t			ri_commitcnt;
	int			ri_transplant;
	md_dev64_t		ri_dev;
	daddr32_t		ri_blkno;
	char			ri_driver[16];
	mddb_mb_ic_t		*ri_mbip;	/* in-core master block(s) */
	mddb_lb_t		*ri_lbp;	/* locator block */
	mddb_dt_t		*ri_dtp;	/* data tag block */
	mddb_did_ic_t		*ri_did_icp;	/* device ID incore area */
	ddi_devid_t		ri_devid;
	ddi_devid_t		ri_old_devid;
	char			ri_minor_name[MDDB_MINOR_NAME_MAX];
	char			ri_devname[MAXPATHLEN];
} mddb_ri_t;

/*
 * A buf_t paired with a locator pointer, linkable through bf_next.
 * NOTE(review): presumably these form the free buffer list headed by
 * mddb_set_t.s_freebufhead -- confirm against md_mddb.c.
 */
typedef struct mddb_bf {
	struct mddb_bf	*bf_next;
	mddb_locator_t	*bf_locator;
	buf_t		bf_buf;
} mddb_bf_t;

/*
 * Information for sets of databases (which include replicas)
 *
 * A record id packs a set number and a per-set id into MDDB_BITSRECID
 * bits: the set number occupies the MD_BITSSET bits starting at
 * MDDB_SETSHIFT and the per-set id occupies the bits below it.
 */
#define	MDDB_BITSRECID	31
#define	MDDB_SETSHIFT	(MDDB_BITSRECID - MD_BITSSET)
#define	MDDB_SETMASK	(MD_SETMASK << MDDB_SETSHIFT)
#define	MDDB_RECIDMASK	((1 << MDDB_SETSHIFT) - 1)

/* DBSET: extract the set number from a record id */
#define	DBSET(id)	(((id) & MDDB_SETMASK) >> MDDB_SETSHIFT)
/* DBID: extract the per-set id from a record id */
#define	DBID(id)	((id) & MDDB_RECIDMASK)
/* MAKERECID: build a record id from set number s and per-set id i */
#define	MAKERECID(s, i)	((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \
			((i) & MDDB_RECIDMASK))

/* Parse flag bits; MDDB_PARSE_MASK covers the low four bits */
#define	MDDB_PARSE_LOCBLK	0x00000001
#define	MDDB_PARSE_LOCNM	0x00000002
#define	MDDB_PARSE_OPTRECS	0x00000004
#define	MDDB_PARSE_MASK		0x0000000F


#define	MDDB_BLOCK_PARSE	0x00000001	/* Block sending parse msgs */
#define	MDDB_UNBLOCK_PARSE	0x00000002	/* Unblock sending parse msgs */

/*
 * In-core state for one set's database.
 *
 * We need to keep s_ident and s_inittime 32 bit.  They are used in mddb_lb
 * (lb_ident and lb_inittime have the same types).
 */
typedef struct mddb_set {
	uint_t		s_setno;		/* set number */
	uint_t		s_sideno;		/* side number */
	identifier_t	s_ident;		/* set identifier */
	char		*s_setname;		/* set name */
	mddb_mb_ic_t	**s_mbiarray;		/* master blocks array */
	mddb_db_t	*s_dbp;			/* directory block */
	mddb_lb_t	*s_lbp;			/* locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN diskset */
	mddb_ln_t	*s_lnp;			/* locator names block */
						/* May be cast to mddb_mnln_t */
						/* if accessing sidenames in */
						/* MN diskset */
	mddb_dtag_lst_t	*s_dtlp;		/* List of data tags found */
	mddb_did_ic_t	*s_did_icp;		/* Device ID incore area */
	mddb_ri_t	*s_rip;			/* replicas incore list */
	int		s_freeblkcnt;		/* visible for test code */
	int		s_totalblkcnt;		/* visible for test code */
	int		s_mn_parseflags;	/* mddb parse flags for MNset */
	int		s_mn_parseflags_sending; /* parse flgs sent to slaves */
	uchar_t		*s_freebitmap;		/* free blocks bitmap */
	uint_t		s_freebitmapsize;	/* size of bitmap */
	struct timeval32	s_inittime;	/* timestamp set created */
	mddb_recid_t	s_zombie;		/* zombie record - createrec */
	int		s_staledeletes;		/* number of stale deleterec */
	int		s_optcmtcnt;		/* Following are opt. record */
	int		s_opthavelck;		/*   bookkeeping records ... */
	int		s_optwantlck;
	kcondvar_t	s_optwantlck_cv;
	int		s_optwaiterr;
	int		s_opthungerr;
	kcondvar_t	s_opthungerr_cv;
	int		s_opthavequeuinglck;
	int		s_optwantqueuinglck;
	kcondvar_t	s_optqueuing_cv;
	ulong_t		s_bufmisses;
	mddb_bf_t	*s_freebufhead;
	int		s_bufwakeup;
	kcondvar_t	s_buf_cv;
	size_t		s_databuffer_size;
	void		*s_databuffer;
	int		s_singlelockgotten;
	int		s_singlelockwanted;
	kcondvar_t	s_single_thread_cv;
	md_hi_arr_t	s_med;
} mddb_set_t;

#ifndef MDDB_FAKE
#ifdef _KERNEL
/*
 * Prototypes for routines implemented in md_mddb.c.
 *
 * They are declared only for kernel builds (_KERNEL) and only when
 * MDDB_FAKE is not defined.  With MDDB_FAKE defined, the #else branch of
 * the enclosing conditional supplies stub macros for some of these names
 * instead.
 */
extern uint_t			mddb_lb_did_convert(mddb_set_t *,
				    uint_t, uint_t *);
extern void			mddb_locatorblock2splitname(mddb_ln_t *,
				    int, side_t, md_splitname *);
extern int			mddb_configure(mddb_cfgcmd_t,
				    struct mddb_config *);
extern mddb_recid_t		mddb_getnextrec(mddb_recid_t,
				    mddb_type_t, uint_t);
extern int			mddb_getoptloc(mddb_optloc_t *);
extern void			*mddb_getrecaddr(mddb_recid_t);
extern void			*mddb_getrecaddr_resize(mddb_recid_t, size_t,
				    off_t);
/*
 * NOTE(review): the getter returns int while the setter takes uint_t;
 * confirm that the signedness difference is intended.
 */
extern int			mddb_getrecprivate(mddb_recid_t);
extern void			mddb_setrecprivate(mddb_recid_t, uint_t);
extern mddb_de_ic_t		*mddb_getrecdep(mddb_recid_t);
extern mddb_type_t		mddb_getrectype1(mddb_recid_t);
extern int			mddb_getrectype2(mddb_recid_t);
extern int			mddb_getrecsize(mddb_recid_t);
extern int			mddb_commitrec(mddb_recid_t);
/* Takes a pointer to mddb_recid_t rather than a single record id. */
extern int			mddb_commitrecs(mddb_recid_t *);
extern int			mddb_deleterec(mddb_recid_t);
extern mddb_recstatus_t		mddb_getrecstatus(mddb_recid_t);
extern mddb_recid_t		mddb_createrec(size_t usersize,
				    mddb_type_t type, uint_t type2,
				    md_create_rec_option_t option, set_t setno);
extern void			mddb_init(void);
extern void			mddb_unload(void);
extern void			mddb_unload_set(set_t setno);
extern mddb_recid_t		mddb_makerecid(set_t setno, mddb_recid_t id);
extern set_t			mddb_getsetnum(mddb_recid_t id);
extern char			*mddb_getsetname(set_t setno);
extern side_t			mddb_getsidenum(set_t setno);
extern int			mddb_ownset(set_t setno);
/*
 * The *_ioctl routines, take_set and release_set each take a pointer to
 * a parameter structure plus an int mode.
 */
extern int			getmed_ioctl(mddb_med_parm_t *medpp, int mode);
extern int			setmed_ioctl(mddb_med_parm_t *medpp, int mode);
extern int			updmed_ioctl(mddb_med_upd_parm_t *medpp,
				    int mode);
extern int			take_set(mddb_config_t *cp, int mode);
extern int			release_set(mddb_config_t *cp, int mode);
extern int			gettag_ioctl(mddb_dtag_get_parm_t *dtgpp,
				    int mode);
extern int			usetag_ioctl(mddb_dtag_use_parm_t *dtupp,
				    int mode);
extern int			accept_ioctl(mddb_accept_parm_t *medpp,
				    int mode);
extern int			md_update_locator_namespace(set_t setno,
				    side_t side, char *dname, char *pname,
				    md_dev64_t devt);
extern int			mddb_validate_lb(set_t setno, int *rmaxsz);
extern int			mddb_getinvlb_devid(set_t setno, int count,
				    int size, char **ctdptr);
extern int			md_update_minor(set_t, side_t, mdkey_t);
extern int			md_update_nm_rr_did_ioctl(mddb_config_t *cp);
extern int			md_update_top_device_minor(set_t, side_t,
				    md_dev64_t);
#ifdef DEBUG
/* Declared only when DEBUG is defined. */
extern void			mddb_check(void);
#endif /* DEBUG */
#endif /* _KERNEL */

#else

/*
 * MDDB_FAKE stand-ins.
 *
 * When MDDB_FAKE is defined the md_mddb.c prototypes are not declared and
 * the macros below take over the like-named routines, each expanding to a
 * constant result.  The exceptions are mddb_createrec(), which allocates a
 * buffer, and mddb_getrecaddr(), which hands that buffer back.
 *
 * NOTE(review): this is a definition, not an extern declaration, so every
 * file that includes this header with MDDB_FAKE set defines mddb_fakeit.
 */
caddr_t mddb_fakeit;

/*
 * Stub for mddb_lb_did_convert().  The stub used to exist only under the
 * name md_lb_did_convert, which does not match the routine it replaces;
 * the old spelling is retained so any existing user keeps compiling.
 */
#define	mddb_lb_did_convert(a, b, c)	(0)
#define	md_lb_did_convert(a, b, c)	(0)
#define	mddb_configure(a, b)	(0)
#define	mddb_getnextrec(a, b, c)		((mddb_recid_t)0)
/* Returns whatever the most recent mddb_createrec() allocated. */
#define	mddb_getrecaddr(a)	(mddb_fakeit)
#define	mddb_getrecprivate(a)	(0)
#define	mddb_setrecprivate(a, b) (0)
#define	mddb_getrectype1(a)	(0)
#define	mddb_getrectype2(a)	(0)
#define	mddb_getrecsize(a)	(0)
#define	mddb_commitrec(a)	(0)
#define	mddb_commitrecs(a)	(0)
#define	mddb_deleterec(a)	(0)
#define	mddb_getrecstatus(a)	(MDDB_OK)
/*
 * Allocates a zeroed buffer of s bytes, stores it in mddb_fakeit and
 * yields the low 16 bits of the (int)-converted address as the result.
 * The a and b arguments are ignored.
 *
 * NOTE(review): this stub takes three arguments while the real
 * mddb_createrec() prototype takes five, and the pointer is narrowed
 * through (int); confirm both before relying on it.
 */
#define	mddb_createrec(s, a, b)	(0xffff & (int)(mddb_fakeit = \
					(caddr_t)kmem_zalloc(s, KM_SLEEP)))
#define	mddb_unload()		(0)

#endif

/*
 * Sleep selector.  These are two alternative values (MDDB_SLEEPOK is 0),
 * not bits to be OR'ed together.
 * NOTE(review): presumably tells an mddb routine whether it may block -
 * confirm against the users in md_mddb.c.
 */
#define	MDDB_NOSLEEP	1
#define	MDDB_SLEEPOK	0

/*
 * Each value below is a distinct single bit, so several can be OR'ed into
 * one flag word.
 * NOTE(review): the consumer of these flags is not visible in this part
 * of the header - confirm where they are passed.
 */
#define	MDDB_NOOLDOK	0x1
#define	MDDB_MUSTEXIST	0x2
#define	MDDB_NOINIT	0x4
#define	MDDB_MULTINODE	0x8
#define	MDDB_MN_STALE	0x10	/* MN set is stale */

/*
 * Flags passed to selectreplicas.  These are alternative values, not a
 * bit mask (MDDB_RETRYSCAN is 0).
 */
#define	MDDB_SCANALL		1
#define	MDDB_RETRYSCAN		0
#define	MDDB_SCANALLSYNC	2	/* During reconfig, sync up incore */
					/* and ondisk mddb by writing incore */
					/* values to disk.  Don't write */
					/* change log records. */

/* Flags passed to writestart and writecopy. */
#define	MDDB_WRITECOPY_ALL	1	/* Write all incore mddb to disk */
#define	MDDB_WRITECOPY_SYNC	2	/* Write incore mddb to disk except */
					/*	- change log records */
					/*	- optimized resync records */


/*
 * Probe selector: two alternative values (MDDB_NOPROBE is 0).
 * NOTE(review): the routine that takes this argument is not visible in
 * this part of the header - confirm its meaning there.
 */
#define	MDDB_PROBE	1
#define	MDDB_NOPROBE	0


/*
 * MN diskset definitions used to determine if a slave can write
 * directly to the mddb.  ONLY_MASTER only allows the master node
 * to write to the mddb.  ANY_NODE allows any node to write
 * to the mddb.
 */
#define	MDDB_WR_ONLY_MASTER	0
#define	MDDB_WR_ANY_NODE	1

/*
 * Record lock bits; the two values are distinct single bits.
 * NOTE(review): MDDB_L_WANTED has no comment in the original; presumably
 * it marks that something is waiting for a locked record - confirm.
 */
#define	MDDB_L_LOCKED	0x0001	/* this record is locked */
#define	MDDB_L_WANTED	0x0002

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_MD_MDDB_H */