1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
|
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_MD_MDDB_H
#define _SYS_MD_MDDB_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/buf.h>
#ifdef __cplusplus
extern "C" {
#endif
#if 0 /* DRP FOR DEBUGGING */
#define MDDB_FAKE
#endif
/*
 * Private flags.
 *
 * Each MD_PRV_PEND* value is MD_PRV_GOTIT OR'ed with the matching
 * pending-operation bit.
 */
#define MD_PRV_GOTIT 0x0001 /* Been snarfed */
#define MD_PRV_DELETE 0x0002 /* Record pending to be deleted */
#define MD_PRV_COMMIT 0x0004 /* Record pending to be committed */
#define MD_PRV_CLEANUP 0x0008 /* Record pending to be cleaned up */
#define MD_PRV_CONVD 0x0010 /* Record has been converted (32->64) */
#define MD_PRV_PENDDEL (MD_PRV_GOTIT | MD_PRV_DELETE)
#define MD_PRV_PENDCOM (MD_PRV_GOTIT | MD_PRV_COMMIT)
#define MD_PRV_PENDCLEAN (MD_PRV_GOTIT | MD_PRV_CLEANUP)
/*
 * MDDB error codes.  Every value is negative.
 */
#define MDDB_E_INVALID (-1) /* an invalid argument was passed */
#define MDDB_E_EXISTS (-2) /* doing an operation a 2nd time which can */
/* only be done once */
#define MDDB_E_MASTER (-3) /* problem occurred accessing master block */
/* returned from NEW_DEV */
#define MDDB_E_TOOSMALL (-4) /* device is not large enough */
#define MDDB_E_NORECORD (-5) /* record does not exist */
/*
 * returned from: mddb_getnextrec
 *                mddb_getrecsize
 *                mddb_commitrec
 *                mddb_commitrecs
 *                mddb_deleterec
 */
#define MDDB_E_NOSPACE (-6) /* no space to create record */
#define MDDB_E_NOTNOW (-7) /* do not presently have enough resources */
/* to perform requested operation */
#define MDDB_E_NODB (-8) /* no database exists */
#define MDDB_E_NOTOWNER (-9) /* have not been told to grab this set */
#define MDDB_E_STALE (-10) /* database is stale */
#define MDDB_E_TOOFEW (-11) /* not enough replicas available */
#define MDDB_E_TAGDATA (-12) /* tagged data detected */
#define MDDB_E_ACCOK (-13) /* 50/50 mode */
#define MDDB_E_NTAGDATA (-14) /* tagop try, no tag data */
#define MDDB_E_ACCNOTOK (-15) /* accop try, no accept possible */
#define MDDB_E_NOLOCBLK (-16) /* No valid locators found */
#define MDDB_E_NOLOCNMS (-17) /* No valid locator name information */
#define MDDB_E_NODIRBLK (-18) /* No directory blocks found */
#define MDDB_E_NOTAGREC (-19) /* No tag record blocks found */
#define MDDB_E_NOTAG (-20) /* No matching tag record found */
#define MDDB_E_NODEVID (-21) /* No device id found */
/*
 * Replica size limits, expressed as block counts.  The MN (multinode)
 * values are annotated with their size in bytes.
 */
#define MDDB_MINBLKS 16 /* enough for a few metadevices */
#define MDDB_MAXBLKS 8192 /* size of free bit map (must be divisible by 8) */
#define MDDB_MN_MINBLKS 32768 /* Multinode metadb minimum size */
/* 16MB */
#define MDDB_MN_MAXBLKS 524288 /* size of free bit map (must be divisible by 8) */
/* 256MB */
/*
 * MDDB_C_* bit flags.
 * NOTE(review): the consumers of these flags are not visible in this
 * header; the names mirror MDDB_E_STALE, MDDB_E_TOOFEW and
 * MDDB_E_NOTOWNER -- confirm the intended use against md_mddb.c.
 */
#define MDDB_C_STALE 0x0001
#define MDDB_C_TOOFEW 0x0002
#define MDDB_C_NOTOWNER 0x0004
#define MDDB_C_SET_MN_STALE 0x0008 /* Set MN set to stale */
#define MDDB_C_IMPORT 0x0010
/*
 * Defines used to set, reset or query the new-master flag in the set
 * structure.  Used during the reconfig cycle to determine quickly whether
 * there is a new master for the set.
 */
#define MDDB_NM_SET 0x0001
#define MDDB_NM_RESET 0x0002
#define MDDB_NM_GET 0x0004
/* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
#define MDDB_DID_EXISTS 0x0001 /* Device ID exists */
#define MDDB_DID_VALID 0x0002 /* Device ID valid on current system */
#define MDDB_DID_UPDATED 0x0004 /* locator/sidelocator info updated */
/* Definitions of flag in Locator Block - mddb_lb */
#define MDDB_DEVID_STYLE 0x0001 /* Locator Block in Device ID format */
#define MDDB_MNSET 0x0002 /* MDDB is for a multi-node set */
#define MDDB_MAX_PATCH 25 /* number of locations that */
/* can be patched in etc/system */
/*
 * Set struct used by all parts of the driver, to store anchor pointers.
 *
 * Locks associated with the fields in this structure:
 *
 * Some of the fields are accessible by both the single threaded ioctl
 * thread and internal threads such as resync, hotsparing, etc.  In this
 * case additional protection is needed:
 *
 *	s_db				is additionally protected by s_dbmx
 *	s_un, s_ui			are protected by md_unit_array_rw.lock
 *	s_nm, s_nmid, s_did_nm,
 *	s_did_nmid, s_dtp		are protected by nm_lock
 *
 * The remaining fields are protected by md_mx.  The two fields s_un_next
 * and s_un_avail were introduced by the friendly name project and are ONLY
 * accessible via a single threaded ioctl thread, which is already protected
 * by the ioctl lock, so there is no need to add extra protection to them.
 * However, if in the future they become accessible by other internal
 * threads, then additional protection such as the md_mx lock is highly
 * recommended.
 */
typedef struct md_set {
uint_t s_status; /* set status */
void **s_ui; /* set unit incore anchor */
void **s_un; /* set unit anchor */
void *s_hsp; /* set Hot Spare Pool anchor */
void *s_hs; /* set Hot Spare anchor */
void *s_db; /* set MDDB anchor */
kmutex_t s_dbmx; /* set MDDB mutex */
void *s_nm; /* set namespace anchor */
mddb_recid_t s_nmid; /* set namespace anchor record */
void *s_did_nm; /* set device id namespace anchor */
mddb_recid_t s_did_nmid; /* set device id namespace anchor rec */
void *s_dtp; /* set data tag rec */
int s_am_i_master; /* incore master flag for this node */
md_mn_nodeid_t s_nodeid; /* nodeid of this node - for MN sets */
uint_t s_rcnt; /* incore resync count for set */
unit_t s_un_next; /* s_un scan starts here */
unit_t s_un_avail; /* number of avail slots */
} md_set_t;
/*
 * Block magic numbers.  Each value is the ASCII encoding of a four
 * character tag: "mdmb", "mddb", "mdrb", "mdlb", "mdln", "mddt", "mddi",
 * "mddu" and "mdde" respectively.
 */
#define MDDB_MAGIC_MB 0x6d646d62 /* magic number for master blocks */
#define MDDB_MAGIC_DB 0x6d646462 /* magic number for directory blocks */
#define MDDB_MAGIC_RB 0x6d647262 /* magic number for record blocks */
#define MDDB_MAGIC_LB 0x6d646c62 /* magic number for locator blocks */
#define MDDB_MAGIC_LN 0x6d646c6e /* magic number for locator names */
#define MDDB_MAGIC_DT 0x6d646474 /* magic number for data tag */
#define MDDB_MAGIC_DI 0x6d646469 /* magic number for device ID block */
#define MDDB_MAGIC_DU 0x6d646475 /* magic num for dummy mb */
#define MDDB_MAGIC_DE 0x6d646465 /* magic num for mb devid */
/* NOTE(review): use of MDDB_GLOBAL_XOR is not visible in this header. */
#define MDDB_GLOBAL_XOR 1234567890
/* Masks selecting the major (high byte) and minor (low byte) revision. */
#define MDDB_REV_MAJOR (uint_t)0xff00
#define MDDB_REV_MINOR (uint_t)0x00ff
/*
 * MDDB_REV_MNMB:
 * If a MN diskset, master block revision is set to MDDB_REV_MNMB.
 * Even though the master block structure is no different
 * for a MN set, setting the revision field to a different
 * number keeps any pre-MN_diskset code from accessing
 * this diskset.  It also allows for an early determination
 * of a MN diskset when reading in from disk so that the
 * proper size locator block and locator names structure
 * can be read in, thus saving time on diskset startup.
 * Since there is no change in master block structure, only the
 * MDDB_REV_MINOR portion of the revision was incremented.
 *
 * MDDB_REV_MNLB:
 * If a MN diskset, the locator block structure is a different size in
 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
 * with any nodeid (sideno) allowed.
 * The revision is set to MDDB_REV_MNLB which is a change of the
 * MDDB_REV_MAJOR portion of the revision.
 *
 * MDDB_REV_MNLN:
 * If a MN diskset, the locator names is a different size in
 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
 * with any nodeid (sideno) allowed.
 * The revision is set to MDDB_REV_MNLN which is a change of the
 * MDDB_REV_MAJOR portion of the revision.
 *
 * The record blocks have two binary properties.  A record block can
 * represent either a 32 or 64 bit unit.  A record block can also represent
 * a traditionally named unit or a friendly named unit.  Thus, there are
 * four minor revisions of record block:
 *
 *		Traditional		Friendly
 *		Name			Name
 *		-----------		--------
 *	32 bit	MDDB_REV_RB		MDDB_REV_RBFN
 *	64 bit	MDDB_REV_RB64		MDDB_REV_RB64FN
 */
#define MDDB_REV_MB (uint_t)0x0201
#define MDDB_REV_MNMB (uint_t)0x0202
#define MDDB_REV_DB (uint_t)0x0201
#define MDDB_REV_LB (uint_t)0x0500
#define MDDB_REV_MNLB (uint_t)0x0600
#define MDDB_REV_LN (uint_t)0x0100
#define MDDB_REV_MNLN (uint_t)0x0300
#define MDDB_REV_RB (uint_t)0x0200
#define MDDB_REV_RB64 (uint_t)0x0201
#define MDDB_REV_RBFN (uint_t)0x0202
#define MDDB_REV_RB64FN (uint_t)0x0203
#define MDDB_REV_DT (uint_t)0x0100
#define MDDB_REV_DI (uint_t)0x0100
/*
 * Transfer record block friendly name status to unit/hs structure.
 *
 *	rbv	record block revision; evaluated exactly once
 *	unv	modifiable lvalue holding the unit/hs revision flags;
 *		evaluated at most once
 *
 * MD_FN_META_DEV is cleared in unv for MDDB_REV_RB and MDDB_REV_RB64, and
 * set for MDDB_REV_RBFN and MDDB_REV_RB64FN.  Any other revision leaves
 * unv unchanged.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as one
 * statement (safe in an unbraced if/else); terminate it with a semicolon.
 */
#define	MDDB_NOTE_FN(rbv, unv)						\
	do {								\
		switch (rbv) {						\
		case MDDB_REV_RB:					\
		case MDDB_REV_RB64:					\
			(unv) &= ~MD_FN_META_DEV;			\
			break;						\
		case MDDB_REV_RBFN:					\
		case MDDB_REV_RB64FN:					\
			(unv) |= MD_FN_META_DEV;			\
			break;						\
		default:						\
			break;						\
		}							\
	} while (0)
/* Size of one MDDB block in bytes; same as the system DEV_BSIZE. */
#define MDDB_BSIZE (uint_t)DEV_BSIZE
/* Number of entries in ln_prefixes[] of the locator names structures. */
#define MDDB_PREFIXCNT 10
/* Number of entries in lb_drvnm[] of the locator block structures. */
#define MDDB_DRVNMCNT 10
/* Block number / block count type used throughout the MDDB structures. */
typedef int mddb_block_t;
/*
 * Structures from here down to the matching "#pragma pack()" that follows
 * mddb_did_blk_t are packed on 4-byte boundaries when the native long long
 * alignment is 8 but the 32-bit long long alignment is 4.
 * NOTE(review): presumably this keeps the layout identical for both data
 * models -- confirm.
 */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack(4)
#endif
/*
 * Locator name suffix for a MN diskset: a name suffix together with the
 * side number it belongs to.  Used as the element type of
 * mddb_mnln_t.ln_mnsuffixes[][].
 */
typedef struct md_mnname_suffix {
md_name_suffix mn_ln_suffix;
uint_t mn_ln_sideno;
} md_mnname_suffix_t;
/*
 * Locator names structure for traditional and local disksets.
 * ln_suffixes[][] is indexed by side number and then by locator index
 * (MD_MAXSIDES x MDDB_NLB entries).
 */
typedef struct mddb_ln {
int ln_magic;
uint_t ln_revision;
uint_t ln_checksum;
struct timeval32 ln_timestamp;
md_name_prefix ln_prefixes[MDDB_PREFIXCNT];
/* Don't change array sizes without changing RNDUP_BLKCNT */
md_name_suffix ln_suffixes[MD_MAXSIDES][MDDB_NLB];
} mddb_ln_t;
/*
 * Locator names structure for a MN diskset.  Same as for traditional
 * and local disksets except that more sides are supported and the
 * side number can be any number, since the side number is stored
 * in each ln_mnsuffixes entry instead of being used as an index
 * into that array.  This means that the whole array may need to be
 * searched in order to find the correct information for a given side
 * number.
 */
typedef struct mddb_mnln {
int ln_magic;
uint_t ln_revision;
uint_t ln_checksum;
struct timeval32 ln_timestamp;
md_name_prefix ln_prefixes[MDDB_PREFIXCNT];
/* Don't change array sizes without changing MDDB_MNLNCNT */
md_mnname_suffix_t ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnln_t;
/*
 * RNDUP_BLKCNT(sz, delta):
 * Number of MDDB_BSIZE blocks needed to hold sz bytes after subtracting
 * delta bytes for each of the (MD_MAXSIDES - 1) * MDDB_NLB per-side array
 * entries beyond the first side; the result is rounded up to a whole block.
 * With delta == 0 it is simply sz rounded up to blocks.
 */
#define RNDUP_BLKCNT(sz, delta) (((sz) - \
((delta) * \
((MD_MAXSIDES - 1) * MDDB_NLB)) + \
MDDB_BSIZE - 1) / MDDB_BSIZE)
/* Blocks occupied by a full mddb_ln_t (all MD_MAXSIDES sides). */
#define MDDB_LNCNT RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
/* Blocks occupied by a mddb_ln_t counting only one side of ln_suffixes. */
#define MDDB_LOCAL_LNCNT RNDUP_BLKCNT(sizeof (mddb_ln_t), \
sizeof (md_name_suffix))
/* Blocks occupied by a mddb_mnln_t, rounded up. */
#define MDDB_MNLNCNT ((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
/ MDDB_BSIZE)
/*
 * Data tag block: a three-word header followed by the data tag itself.
 * NOTE(review): dt_mag/dt_rev/dt_cks are presumably magic, revision and
 * checksum (cf. MDDB_MAGIC_DT, MDDB_REV_DT) -- confirm.
 */
typedef struct mddb_dt {
uint_t dt_mag;
uint_t dt_rev;
uint_t dt_cks;
mddb_dtag_t dt_dtag;
} mddb_dt_t;
/* Size of a data tag block, rounded up to whole MDDB blocks (in bytes). */
#define MDDB_DT_BYTES (roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
/* The same size passed through btodb(). */
#define MDDB_DT_BLOCKS (btodb(MDDB_DT_BYTES))
/*
 * Identifier stored in the locator block (lb_ident) and in the incore set
 * (s_ident): either a serial string of MDDB_SN_LEN bytes or a 32-bit
 * creation timestamp, sharing the same storage.
 */
typedef union identifier {
char serial[MDDB_SN_LEN];
struct timeval32 createtime;
} identifier_t;
/*
 * Element type of lb_locators[] in the locator block.
 * NOTE(review): presumably identifies one replica by device (l_dev) and
 * starting block (l_blkno), with l_flags holding its state -- confirm.
 */
typedef struct mddb_locator {
dev32_t l_dev;
daddr32_t l_blkno;
int l_flags;
} mddb_locator_t;
/*
 * Element type of mddb_lb_t.lb_sidelocators[][], which is indexed by side
 * and then by locator.
 * NOTE(review): l_drvnm_index presumably indexes lb_drvnm[] -- confirm.
 */
typedef struct mddb_sidelocator {
uchar_t l_drvnm_index;
minor_t l_mnum;
} mddb_sidelocator_t;
/*
 * Element type of mddb_mnlb_t.lb_mnsidelocators[][].  Same as
 * mddb_sidelocator_t plus the side number, which for MN disksets is
 * stored in the entry rather than implied by the array index.
 */
typedef struct mddb_mnsidelocator {
uchar_t mnl_drvnm_index;
minor_t mnl_mnum;
uint_t mnl_sideno;
} mddb_mnsidelocator_t;
/*
 * Element type of lb_drvnm[] in the locator block: a length byte
 * followed by up to MD_MAXDRVNM bytes of data.
 * NOTE(review): presumably a driver name -- confirm.
 */
typedef struct mddb_drvnm {
uchar_t dn_len;
char dn_data[MD_MAXDRVNM];
} mddb_drvnm_t;
/*
 * Locator Block Device ID Information
 * Several device ids may share one disk block in an effort to
 * conserve used replica space, so each entry records the block range
 * together with the offset and length of its device id within it.
 * info_flags holds MDDB_DID_* bits.
 */
typedef struct mddb_did_info {
uint_t info_flags; /* MDDB Device ID flags */
uint_t info_firstblk; /* Device ID Start Block */
uint_t info_blkcnt; /* Device ID Block Count */
uint_t info_offset; /* Device ID offset w/i Block */
uint_t info_length; /* Device ID Length */
uint_t info_checksum; /* Device ID Checksum */
char info_minor_name[32]; /* Minor name of lb dev */
} mddb_did_info_t;
/*
 * Device ID block: a verification header followed by one mddb_did_info_t
 * per locator (MDDB_NLB entries).
 */
typedef struct mddb_did_blk {
int blk_magic; /* used for verification */
uint_t blk_revision; /* used for verification */
int blk_checksum; /* used for verification */
uint_t blk_commitcnt; /* matches LB's commitcnt */
mddb_did_info_t blk_info[MDDB_NLB];
} mddb_did_blk_t;
/* End of the region packed by the "#pragma pack(4)" above. */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif
/* Size of a device ID block, rounded up to whole MDDB blocks (in bytes). */
#define MDDB_DID_BYTES (roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
/* The same size passed through btodb(). */
#define MDDB_DID_BLOCKS (btodb(MDDB_DID_BYTES))
/*
 * Device ID Disk Blocks.
 * Incore linked list of disk blocks containing device IDs.
 * The list is built when reading in the mddb_did_blk structure and
 * when reading in the actual disk blocks containing device ids.
 * This list is used to easily write out all disk blocks containing
 * device ids.
 */
typedef struct mddb_did_db {
uint_t db_firstblk; /* Disk Block's logical addr */
uint_t db_blkcnt; /* Contig Disk Block Count */
caddr_t db_ptr; /* Ptr to incore Block(s) */
struct mddb_did_db *db_next; /* Ptr to next in list */
} mddb_did_db_t;
/*
 * Device ID Free List.
 * Incore linked list of free space in disk blocks containing device IDs.
 * Used to manage placement of device IDs in disk blocks.
 * All disk blocks on the free list are also in the linked list of disk
 * blocks containing device IDs (mddb_did_db_t).
 */
typedef struct mddb_did_free {
uint_t free_blk; /* Disk Block's logical addr */
uint_t free_offset; /* offset of free space */
uint_t free_length; /* length of free space */
struct mddb_did_free *free_next; /* Ptr to next in list */
} mddb_did_free_t;
/*
 * Device ID Incore Area
 * Contains pointers to the Device ID block (did_ic_blkp), the Device ID
 * Disk Block list (did_ic_dbp) and the Device ID Free List (did_ic_freep).
 * Also contains an incore array of pointers to device IDs.  These pointers
 * point into the Device ID Disk Block list and are used as a
 * shortcut to find incore device IDs.
 */
typedef struct mddb_did_ic {
mddb_did_blk_t *did_ic_blkp;
mddb_did_db_t *did_ic_dbp;
mddb_did_free_t *did_ic_freep;
ddi_devid_t did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
} mddb_did_ic_t;
/*
 * Locator Block (LB):
 * - Are fixed size, but the size is different
 *   for local/shared set db replicas.
 * - All LBs start at logical block 0.
 * - After a replica quorum is found, there is
 *   only one incore copy of the LB.
 * - LBs are only written when replicas are added, deleted, or errored.
 * - LBs provide information about other replicas and their state.
 *
 * The structure is packed on 4-byte boundaries under the same condition
 * as the locator name structures earlier in this file.
 */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack(4)
#endif
typedef struct mddb_lb {
int lb_magic; /* used for verification */
uint_t lb_revision; /* used for verification */
int lb_checksum; /* used for verification */
uint_t lb_commitcnt; /* IMPORTANT */
struct timeval32 lb_timestamp; /* informative only */
int lb_loccnt; /* used for verification */
identifier_t lb_ident; /* used for verification */
uint_t lb_flags; /* flags describing LB */
uint_t lb_spare[8]; /* Spare/Pad */
mddb_block_t lb_didfirstblk; /* Devid Array Start Block */
mddb_block_t lb_didblkcnt; /* Devid Array Number Blocks */
mddb_block_t lb_dtfirstblk; /* Data Tag Start Block */
mddb_block_t lb_dtblkcnt; /* Data Tag Number Block(s) */
struct timeval32 lb_inittime; /* creation of database */
set_t lb_setno; /* used for verification */
mddb_block_t lb_blkcnt; /* used for verification */
mddb_block_t lb_lnfirstblk;
mddb_block_t lb_lnblkcnt;
mddb_block_t lb_dbfirstblk;
mddb_drvnm_t lb_drvnm[MDDB_DRVNMCNT];
mddb_locator_t lb_locators[MDDB_NLB];
/* Don't change array sizes without changing RNDUP_BLKCNT */
mddb_sidelocator_t lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
} mddb_lb_t;
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif
/*
 * Locator block structure for a MN diskset.  Same as for traditional
 * and local disksets except that more sides are supported and the
 * side number can be any number, since the side number is stored
 * in each lb_mnsidelocators entry instead of being used as an index
 * into that array.  This means that the whole array may need to be
 * searched in order to find the correct information for a given side
 * number.
 *
 * Every member up to and including lb_locators is declared identically
 * to mddb_lb_t; only the trailing side locator array differs.
 */
typedef struct mddb_mnlb {
int lb_magic; /* used for verification */
uint_t lb_revision; /* used for verification */
int lb_checksum; /* used for verification */
uint_t lb_commitcnt; /* IMPORTANT */
struct timeval32 lb_timestamp; /* informative only */
int lb_loccnt; /* used for verification */
identifier_t lb_ident; /* used for verification */
uint_t lb_flags; /* flags describing LB */
uint_t lb_spare[8]; /* Spare/Pad */
mddb_block_t lb_didfirstblk; /* Devid Array Start Block */
mddb_block_t lb_didblkcnt; /* Devid Array Number Blocks */
mddb_block_t lb_dtfirstblk; /* Data Tag Start Block */
mddb_block_t lb_dtblkcnt; /* Data Tag Number Block(s) */
struct timeval32 lb_inittime; /* creation of database */
set_t lb_setno; /* used for verification */
mddb_block_t lb_blkcnt; /* used for verification */
mddb_block_t lb_lnfirstblk;
mddb_block_t lb_lnblkcnt;
mddb_block_t lb_dbfirstblk;
mddb_drvnm_t lb_drvnm[MDDB_DRVNMCNT];
mddb_locator_t lb_locators[MDDB_NLB];
/* Don't change array sizes without changing MDDB_MNLBCNT */
mddb_mnsidelocator_t lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnlb_t;
/* Blocks occupied by a full mddb_lb_t (all MD_MAXSIDES sides). */
#define MDDB_LBCNT RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
/* Blocks occupied by a mddb_lb_t counting only one side of lb_sidelocators. */
#define MDDB_LOCAL_LBCNT RNDUP_BLKCNT(sizeof (mddb_lb_t), \
sizeof (mddb_sidelocator_t))
/* Blocks occupied by a mddb_mnlb_t, rounded up. */
#define MDDB_MNLBCNT ((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
/ MDDB_BSIZE)
/*
 * Block map entry embedded in the master block (mb_blkmap), where it is
 * described as the logical->physical block map.
 * NOTE(review): presumably m_firstblk is the first physical block and
 * m_consecutive the number of consecutive blocks -- confirm.
 */
typedef struct mddb_map {
daddr32_t m_consecutive;
daddr32_t m_firstblk;
} mddb_map_t;
/*
 * Master block(s) (MB)
 * - Are written by userland; never by the driver!
 * - Each replica has its own master blocks;
 *   the master block(s) are not shared.
 * - MBs are not in the logical block address space of the database.
 * - MBs are a fixed size record (MDDB_BSIZE).
 * - MBs provide the logical to physical block translation
 *   for their replica.
 *
 * mb_next is declared as a uint32_t under _LP64 and as a pointer otherwise.
 * NOTE(review): presumably so the member stays 32 bits wide in both data
 * models -- confirm.
 */
typedef struct mddb_mb {
int mb_magic; /* used for verification */
uint_t mb_revision; /* used for verification */
uint_t mb_checksum; /* used for verification */
#ifdef _LP64
uint32_t mb_next; /* incore to next mb */
#else
struct mddb_mb *mb_next; /* incore to next mb */
#endif /* _LP64 */
daddr32_t mb_nextblk; /* block # for next mb */
md_timeval32_t mb_timestamp; /* timestamp */
daddr32_t mb_blkcnt; /* size of blkmap */
daddr32_t mb_blkno; /* physical loc. for this MB */
set_t mb_setno; /* used for verification */
struct timeval32 mb_setcreatetime; /* set creation timestamp */
int spares[7];
mddb_map_t mb_blkmap; /* logical->physical blk map */
int mb_devid_magic; /* verify devid in mb */
short mb_devid_len; /* len of following devid */
char mb_devid[1]; /* devid byte array */
} mddb_mb_t;
/*
 * In-core version of mddb_mb: the master block preceded by a link to the
 * next in-core master block.  It is known that the mddb_mb is 512 bytes on
 * disk, really, and so this structure is 512 + sizeof (struct mddb_mb_ic *);
 * MDDB_IC_BSIZE expresses that size as MDDB_BSIZE plus one pointer.
 */
#define MDDB_IC_BSIZE (MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
typedef struct mddb_mb_ic {
struct mddb_mb_ic *mbi_next;
struct mddb_mb mbi_mddb_mb;
} mddb_mb_ic_t;
/*
 * There can be no address in a record block.  The checksum must
 * stay the same wherever the record is in memory.  Many
 * things depend on this.  Also, the timestamp is the time the
 * record was committed, not the time it was written to a particular
 * device.
 *
 * Old definition of mddb_rb, for 32-bit apps and libraries.  It differs
 * from mddb_rb32 only in declaring rb_userdata as a pointer.
 */
typedef struct mddb_rb {
uint_t rb_magic;
uint_t rb_revision;
uint_t rb_checksum;
uint_t rb_checksum_fiddle;
uint_t rb_private;
void *rb_userdata;
uint_t rb_commitcnt;
uint_t rb_spare[1];
struct timeval32 rb_timestamp;
int rb_data[1];
} mddb_rb_t;
/*
 * This is, and always will be, the on-disk version of mddb_rb.
 * rb_userdata is a fixed 32-bit value rather than a pointer, so the
 * layout does not depend on the pointer size.
 */
typedef struct mddb_rb32 {
uint_t rb_magic;
uint_t rb_revision;
uint_t rb_checksum;
uint_t rb_checksum_fiddle;
uint_t rb_private;
uint32_t rb_userdata;
uint_t rb_commitcnt;
uint_t rb_spare[1];
struct timeval32 rb_timestamp;
int rb_data[1];
} mddb_rb32_t;
/*
 * directory entries
 *
 * mddb_optinfo is the element type of the two-entry de_optinfo[] array
 * carried by every directory entry form (mddb_de, mddb_de_ic, mddb_de32).
 * NOTE(review): o_li is presumably a locator index -- confirm.
 */
typedef struct mddb_optinfo {
int o_li;
int o_flags;
} mddb_optinfo_t;
/*
 * Old definition of mddb_de, for 32-bit apps and libraries.
 * Outside the kernel this is the entry type mddb_db.db_firstentry points to.
 */
typedef struct mddb_de {
struct mddb_de *de_next;
mddb_rb_t *de_rb;
mddb_recid_t de_recid;
mddb_type_t de_type1;
uint_t de_type2;
uint_t de_reqsize;
uint_t de_recsize;
mddb_block_t de_blkcount;
uint_t de_flags;
mddb_optinfo_t de_optinfo[2];
mddb_block_t de_blks[1];
} mddb_de_t;
/*
 * In-core version of mddb_de; includes pointers for the mddb_rb32_t user
 * data.  mddb_rb32_t is used incore.
 *
 * Compared with the on-disk mddb_de32 this adds de_rb_userdata,
 * de_rb_userdata_ic, de_owner_nodeid and de_icreqsize, uses real pointers
 * for de_next/de_rb and size_t for the size members.
 */
typedef struct mddb_de_ic {
void *de_rb_userdata;
void *de_rb_userdata_ic;
uint_t de_owner_nodeid;
struct mddb_de_ic *de_next;
mddb_rb32_t *de_rb;
mddb_recid_t de_recid;
mddb_type_t de_type1;
uint_t de_type2;
size_t de_reqsize;
size_t de_icreqsize;
size_t de_recsize;
uint_t de_blkcount;
uint_t de_flags;
mddb_optinfo_t de_optinfo[2];
mddb_block_t de_blks[1];
} mddb_de_ic_t;
/*
 * Directory block (native form).  db_firstentry points to in-core entries
 * (mddb_de_ic_t) when compiled for the kernel and to mddb_de_t otherwise.
 * The on-disk counterpart is mddb_db32.
 */
typedef struct mddb_db {
uint_t db_magic;
uint_t db_revision;
uint_t db_checksum;
mddb_block_t db_blknum;
struct mddb_db *db_next;
mddb_block_t db_nextblk;
struct timeval32 db_timestamp;
uint_t db_recsum;
#ifdef _KERNEL
mddb_de_ic_t *db_firstentry;
#else
mddb_de_t *db_firstentry;
#endif
} mddb_db_t;
/*
 * This is, and always will be, the on-disk version of mddb_de.
 * When mddb_de32 is read in it is converted into mddb_de_ic
 * (see de32tode/detode32).  The two link members are fixed 32-bit values.
 */
typedef struct mddb_de32 {
uint32_t de32_next;
uint32_t de32_rb;
mddb_recid_t de32_recid;
mddb_type_t de32_type1;
uint_t de32_type2;
uint_t de32_reqsize;
uint_t de32_recsize;
mddb_block_t de32_blkcount;
uint_t de32_flags;
mddb_optinfo_t de32_optinfo[2];
mddb_block_t de32_blks[1];
} mddb_de32_t;
/*
 * This is, and always will be, the on-disk version of mddb_db.
 * When mddb_db32 is read in it is converted into mddb_db
 * (see db32todb/dbtodb32).
 * To minimize impact on the mddb format, the mddb_db fields remain intact;
 * only the two pointer members become fixed 32-bit values here.
 */
typedef struct mddb_db32 {
uint_t db32_magic;
uint_t db32_revision;
uint_t db32_checksum;
mddb_block_t db32_blknum;
uint32_t db32_next;
mddb_block_t db32_nextblk;
struct timeval32 db32_timestamp;
uint_t db32_recsum;
uint32_t db32_firstentry;
} mddb_db32_t;
/*
 * de32tode(from, to):
 * Convert an on-disk directory entry (mddb_de32_t *from) into its in-core
 * form (mddb_de_ic_t *to).
 *
 * de_rb_userdata is set to NULL and de_owner_nodeid to MD_MN_INVALID_NID;
 * de_rb_userdata_ic and de_icreqsize are not touched.  The 32-bit
 * de32_next and de32_rb values are widened to pointers through uintptr_t.
 * The first from->de32_blkcount entries of de32_blks[] are copied, so
 * `to' must have room for that many de_blks[] entries.
 *
 * Both arguments are evaluated several times and must be free of side
 * effects.  The expansion is one statement (do { } while (0)); terminate
 * it with a semicolon.  The loop index has a macro-private name so that it
 * cannot capture a caller variable used inside an argument.
 */
#define	de32tode(from, to)						\
	do {								\
		int _de_i;						\
		(to)->de_rb_userdata = NULL;				\
		(to)->de_owner_nodeid = MD_MN_INVALID_NID;		\
		(to)->de_next =						\
		    (struct mddb_de_ic *)(uintptr_t)(from)->de32_next;	\
		(to)->de_rb =						\
		    (mddb_rb32_t *)(uintptr_t)(from)->de32_rb;		\
		(to)->de_recid = (from)->de32_recid;			\
		(to)->de_type1 = (from)->de32_type1;			\
		(to)->de_type2 = (from)->de32_type2;			\
		(to)->de_reqsize = (from)->de32_reqsize;		\
		(to)->de_recsize = (from)->de32_recsize;		\
		(to)->de_blkcount = (from)->de32_blkcount;		\
		(to)->de_flags = (from)->de32_flags;			\
		(to)->de_optinfo[0] = (from)->de32_optinfo[0];		\
		(to)->de_optinfo[1] = (from)->de32_optinfo[1];		\
		for (_de_i = 0; _de_i < (from)->de32_blkcount; _de_i++)	\
			(to)->de_blks[_de_i] =				\
			    (from)->de32_blks[_de_i];			\
	} while (0)
/*
 * detode32(from, to):
 * Convert an in-core directory entry (mddb_de_ic_t *from) into its on-disk
 * form (mddb_de32_t *to).
 *
 * de_next and de_rb are narrowed to 32 bits through uintptr_t, and the
 * size_t members de_reqsize/de_recsize are narrowed to the 32-bit on-disk
 * members.  The in-core-only members (de_rb_userdata, de_rb_userdata_ic,
 * de_owner_nodeid, de_icreqsize) have no on-disk counterpart and are not
 * copied.  The first from->de_blkcount entries of de_blks[] are copied,
 * so `to' must have room for that many de32_blks[] entries.
 *
 * Both arguments are evaluated several times and must be free of side
 * effects.  The expansion is one statement (do { } while (0)); terminate
 * it with a semicolon.  The loop index is unsigned to match de_blkcount
 * and has a macro-private name so it cannot capture a caller variable.
 */
#define	detode32(from, to)						\
	do {								\
		uint_t _de_i;						\
		(to)->de32_next = (uint32_t)(uintptr_t)(from)->de_next;	\
		(to)->de32_rb = (uint32_t)(uintptr_t)(from)->de_rb;	\
		(to)->de32_recid = (from)->de_recid;			\
		(to)->de32_type1 = (from)->de_type1;			\
		(to)->de32_type2 = (from)->de_type2;			\
		(to)->de32_reqsize = (from)->de_reqsize;		\
		(to)->de32_recsize = (from)->de_recsize;		\
		(to)->de32_blkcount = (from)->de_blkcount;		\
		(to)->de32_flags = (from)->de_flags;			\
		(to)->de32_optinfo[0] = (from)->de_optinfo[0];		\
		(to)->de32_optinfo[1] = (from)->de_optinfo[1];		\
		for (_de_i = 0; _de_i < (from)->de_blkcount; _de_i++)	\
			(to)->de32_blks[_de_i] =			\
			    (from)->de_blks[_de_i];			\
	} while (0)
/*
 * db32todb(from, to):
 * Convert an on-disk directory block (mddb_db32_t *from) into the native
 * form (mddb_db_t *to).  Every member is copied; the 32-bit db32_next and
 * db32_firstentry values are widened to pointers through uintptr_t.
 *
 * NOTE(review): db_firstentry is cast to mddb_de_ic_t *, which matches the
 * _KERNEL declaration of mddb_db_t only; confirm before using this macro
 * in userland.
 *
 * Both arguments are evaluated several times and must be free of side
 * effects.  The expansion is one statement (do { } while (0)), so all
 * assignments are governed together by an unbraced if; terminate it with
 * a semicolon.
 */
#define	db32todb(from, to)						\
	do {								\
		(to)->db_magic = (from)->db32_magic;			\
		(to)->db_revision = (from)->db32_revision;		\
		(to)->db_checksum = (from)->db32_checksum;		\
		(to)->db_blknum = (from)->db32_blknum;			\
		(to)->db_next =						\
		    (struct mddb_db *)(uintptr_t)(from)->db32_next;	\
		(to)->db_nextblk = (from)->db32_nextblk;		\
		(to)->db_timestamp = (from)->db32_timestamp;		\
		(to)->db_recsum = (from)->db32_recsum;			\
		(to)->db_firstentry =					\
		    (mddb_de_ic_t *)(uintptr_t)(from)->db32_firstentry;	\
	} while (0)
/*
 * dbtodb32(from, to):
 * Convert a native directory block (mddb_db_t *from) into the on-disk
 * form (mddb_db32_t *to).  Every member is copied; the db_next and
 * db_firstentry pointers are narrowed to 32 bits through uintptr_t.
 *
 * Both arguments are evaluated several times and must be free of side
 * effects.  The expansion is one statement (do { } while (0)), so all
 * assignments are governed together by an unbraced if; terminate it with
 * a semicolon.
 */
#define	dbtodb32(from, to)						\
	do {								\
		(to)->db32_magic = (from)->db_magic;			\
		(to)->db32_revision = (from)->db_revision;		\
		(to)->db32_checksum = (from)->db_checksum;		\
		(to)->db32_blknum = (from)->db_blknum;			\
		(to)->db32_next = (uint32_t)(uintptr_t)(from)->db_next;	\
		(to)->db32_nextblk = (from)->db_nextblk;		\
		(to)->db32_timestamp = (from)->db_timestamp;		\
		(to)->db32_recsum = (from)->db_recsum;			\
		(to)->db32_firstentry =					\
		    (uint32_t)(uintptr_t)(from)->db_firstentry;		\
	} while (0)
/*
 * Information about a replica of the database.
 *
 * One node of the singly linked replica list anchored at mddb_set.s_rip.
 * It carries the replica's device and block number together with pointers
 * to its in-core master block, locator block, data tag and device ID area.
 */
typedef struct mddb_ri {
struct mddb_ri *ri_next;
uint_t ri_flags;
uint_t ri_commitcnt;
int ri_transplant;
md_dev64_t ri_dev;
daddr32_t ri_blkno;
char ri_driver[16];
mddb_mb_ic_t *ri_mbip;
mddb_lb_t *ri_lbp;
mddb_dt_t *ri_dtp;
mddb_did_ic_t *ri_did_icp;
ddi_devid_t ri_devid;
ddi_devid_t ri_old_devid;
char ri_minor_name[MDDB_MINOR_NAME_MAX];
char ri_devname[MAXPATHLEN];
} mddb_ri_t;
/*
 * Buffer list node: an embedded buf_t, the locator it is associated with
 * and a link to the next node.  mddb_set.s_freebufhead points to a list of
 * these.
 */
typedef struct mddb_bf {
struct mddb_bf *bf_next;
mddb_locator_t *bf_locator;
buf_t bf_buf;
} mddb_bf_t;
/*
 * Information for sets of databases (which include replicas)
 *
 * Record id layout: the set number is stored in the bits selected by
 * MDDB_SETMASK (MD_SETMASK shifted up by MDDB_SETSHIFT) and the per-set
 * record number in the low MDDB_SETSHIFT bits (MDDB_RECIDMASK), where
 * MDDB_SETSHIFT is MDDB_BITSRECID (31) minus MD_BITSSET.
 *
 *	DBSET(id)	extract the set number from a record id
 *	DBID(id)	extract the per-set record number from a record id
 *	MAKERECID(s, i)	build a record id from set s and record number i;
 *			bits of either argument outside its field are dropped
 */
#define MDDB_BITSRECID 31
#define MDDB_SETSHIFT (MDDB_BITSRECID - MD_BITSSET)
#define MDDB_SETMASK (MD_SETMASK << MDDB_SETSHIFT)
#define MDDB_RECIDMASK ((1 << MDDB_SETSHIFT) - 1)
#define DBSET(id) (((id) & MDDB_SETMASK) >> MDDB_SETSHIFT)
#define DBID(id) ((id) & MDDB_RECIDMASK)
#define MAKERECID(s, i) ((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \
((i) & MDDB_RECIDMASK))
/*
 * mddb parse flags and the mask covering them.
 * NOTE(review): presumably the values stored in mddb_set.s_mn_parseflags
 * and s_mn_parseflags_sending -- confirm.
 */
#define MDDB_PARSE_LOCBLK 0x00000001
#define MDDB_PARSE_LOCNM 0x00000002
#define MDDB_PARSE_OPTRECS 0x00000004
#define MDDB_PARSE_MASK 0x0000000F
/* Requests to block/unblock the sending of parse messages. */
#define MDDB_BLOCK_PARSE 0x00000001 /* Block sending parse msgs */
#define MDDB_UNBLOCK_PARSE 0x00000002 /* Unblock sending parse msgs */
/*
 * Incore state for one set's database.
 *
 * We need to keep s_ident and s_inittime 32 bit.  They are used in mddb_lb
 * (lb_ident and lb_inittime have the same types).
 */
typedef struct mddb_set {
uint_t s_setno; /* set number */
uint_t s_sideno; /* side number */
identifier_t s_ident; /* set identifier */
char *s_setname; /* set name */
mddb_mb_ic_t **s_mbiarray; /* master blocks array */
mddb_db_t *s_dbp; /* directory block */
mddb_lb_t *s_lbp; /* locator block */
/* May be cast to mddb_mnlb_t */
/* if accessing sidenames in */
/* MN diskset */
mddb_ln_t *s_lnp; /* locator names block */
/* May be cast to mddb_mnln_t */
/* if accessing sidenames in */
/* MN diskset */
mddb_dtag_lst_t *s_dtlp; /* List of data tags found */
mddb_did_ic_t *s_did_icp; /* Device ID incore area */
mddb_ri_t *s_rip; /* replicas incore list */
int s_freeblkcnt; /* visible for test code */
int s_totalblkcnt; /* visible for test code */
int s_mn_parseflags; /* mddb parse flags for MNset */
int s_mn_parseflags_sending; /* parse flgs sent to slaves */
uchar_t *s_freebitmap; /* free blocks bitmap */
uint_t s_freebitmapsize; /* size of bitmap */
struct timeval32 s_inittime; /* timestamp set created */
mddb_recid_t s_zombie; /* zombie record - createrec */
int s_staledeletes; /* number of stale deleterec */
int s_optcmtcnt; /* Following are opt. record */
int s_opthavelck; /* bookkeeping records ... */
int s_optwantlck;
kcondvar_t s_optwantlck_cv;
int s_optwaiterr;
int s_opthungerr;
kcondvar_t s_opthungerr_cv;
int s_opthavequeuinglck;
int s_optwantqueuinglck;
kcondvar_t s_optqueuing_cv;
ulong_t s_bufmisses;
mddb_bf_t *s_freebufhead;
int s_bufwakeup;
kcondvar_t s_buf_cv;
size_t s_databuffer_size;
void *s_databuffer;
int s_singlelockgotten;
int s_singlelockwanted;
kcondvar_t s_single_thread_cv;
md_hi_arr_t s_med;
} mddb_set_t;
/*
 * Kernel interfaces to the metadevice database.  They are declared only
 * when MDDB_FAKE is not defined and _KERNEL is; with MDDB_FAKE defined the
 * stub macros in the #else branch are used instead.
 */
#ifndef MDDB_FAKE
#ifdef _KERNEL
/* md_mddb.c */
extern uint_t mddb_lb_did_convert(mddb_set_t *,
uint_t, uint_t *);
extern void mddb_locatorblock2splitname(mddb_ln_t *,
int, side_t, md_splitname *);
extern int mddb_configure(mddb_cfgcmd_t,
struct mddb_config *);
extern mddb_recid_t mddb_getnextrec(mddb_recid_t,
mddb_type_t, uint_t);
extern int mddb_getoptloc(mddb_optloc_t *);
extern void *mddb_getrecaddr(mddb_recid_t);
extern void *mddb_getrecaddr_resize(mddb_recid_t, size_t,
off_t);
extern int mddb_getrecprivate(mddb_recid_t);
extern void mddb_setrecprivate(mddb_recid_t, uint_t);
extern mddb_de_ic_t *mddb_getrecdep(mddb_recid_t);
extern mddb_type_t mddb_getrectype1(mddb_recid_t);
extern int mddb_getrectype2(mddb_recid_t);
extern int mddb_getrecsize(mddb_recid_t);
extern int mddb_commitrec(mddb_recid_t);
extern int mddb_commitrecs(mddb_recid_t *);
extern int mddb_deleterec(mddb_recid_t);
extern mddb_recstatus_t mddb_getrecstatus(mddb_recid_t);
extern mddb_recid_t mddb_createrec(size_t usersize,
mddb_type_t type, uint_t type2,
md_create_rec_option_t option, set_t setno);
extern void mddb_init(void);
extern void mddb_unload(void);
extern void mddb_unload_set(set_t setno);
extern mddb_recid_t mddb_makerecid(set_t setno, mddb_recid_t id);
extern set_t mddb_getsetnum(mddb_recid_t id);
extern char *mddb_getsetname(set_t setno);
extern side_t mddb_getsidenum(set_t setno);
extern int mddb_ownset(set_t setno);
extern int getmed_ioctl(mddb_med_parm_t *medpp, int mode);
extern int setmed_ioctl(mddb_med_parm_t *medpp, int mode);
extern int updmed_ioctl(mddb_med_upd_parm_t *medpp,
int mode);
extern int take_set(mddb_config_t *cp, int mode);
extern int release_set(mddb_config_t *cp, int mode);
extern int gettag_ioctl(mddb_dtag_get_parm_t *dtgpp,
int mode);
extern int usetag_ioctl(mddb_dtag_use_parm_t *dtupp,
int mode);
extern int accept_ioctl(mddb_accept_parm_t *medpp,
int mode);
extern int md_update_locator_namespace(set_t setno,
side_t side, char *dname, char *pname,
md_dev64_t devt);
extern int mddb_validate_lb(set_t setno, int *rmaxsz);
extern int mddb_getinvlb_devid(set_t setno, int count,
int size, char **ctdptr);
extern int md_update_minor(set_t, side_t, mdkey_t);
extern int md_update_nm_rr_did_ioctl(mddb_config_t *cp);
extern int md_update_top_device_minor(set_t, side_t,
md_dev64_t);
#ifdef DEBUG
/* Declared for DEBUG kernels only. */
extern void mddb_check(void);
#endif /* DEBUG */
#endif /* _KERNEL */
#else
/*
 * MDDB_FAKE (debugging) stubs.
 *
 * With MDDB_FAKE defined the database entry points are replaced by macros
 * that do nothing and report success.  mddb_createrec() allocates zeroed
 * memory with kmem_zalloc(), remembers it in mddb_fakeit and yields the
 * low 16 bits of that address as the record id; mddb_getrecaddr() returns
 * whatever mddb_fakeit currently holds.
 *
 * NOTE(review): mddb_fakeit is defined (not just declared) in this header,
 * so every file including it with MDDB_FAKE set gets its own definition.
 * NOTE(review): the mddb_createrec() stub takes three arguments while the
 * real function takes five; callers written against the real prototype
 * will not expand under MDDB_FAKE -- confirm before re-enabling.
 */
caddr_t mddb_fakeit;
/*
 * The real function is mddb_lb_did_convert(); stub it under that name.
 * The historical md_lb_did_convert spelling is kept for compatibility.
 */
#define	mddb_lb_did_convert(a, b, c)	(0)
#define	md_lb_did_convert(a, b, c)	(0)
#define	mddb_configure(a, b)		(0)
#define	mddb_getnextrec(a, b, c)	((mddb_recid_t)0)
#define	mddb_getrecaddr(a)		(mddb_fakeit)
#define	mddb_getrecprivate(a)		(0)
#define	mddb_setrecprivate(a, b)	(0)
#define	mddb_getrectype1(a)		(0)
#define	mddb_getrectype2(a)		(0)
#define	mddb_getrecsize(a)		(0)
#define	mddb_commitrec(a)		(0)
#define	mddb_commitrecs(a)		(0)
#define	mddb_deleterec(a)		(0)
#define	mddb_getrecstatus(a)		(MDDB_OK)
#define	mddb_createrec(s, a, b)		(0xffff & (int)(mddb_fakeit = \
					(caddr_t)kmem_zalloc(s, KM_SLEEP)))
#define	mddb_unload()			(0)
#endif
/*
 * Sleep policy values.
 * NOTE(review): the interfaces taking these are not visible in this header.
 */
#define MDDB_NOSLEEP 1
#define MDDB_SLEEPOK 0
/*
 * Bit flags.
 * NOTE(review): consumers are not visible in this header -- confirm which
 * interfaces accept them.
 */
#define MDDB_NOOLDOK 0x1
#define MDDB_MUSTEXIST 0x2
#define MDDB_NOINIT 0x4
#define MDDB_MULTINODE 0x8
#define MDDB_MN_STALE 0x10 /* MN set is stale */
/* Flags passed to selectreplicas - not a bit mask */
#define MDDB_SCANALL 1
#define MDDB_RETRYSCAN 0
#define MDDB_SCANALLSYNC 2 /* During reconfig, sync up incore */
/* and ondisk mddb by writing incore */
/* values to disk. Don't write */
/* change log records. */
/* Flags passed to writestart and writecopy */
#define MDDB_WRITECOPY_ALL 1 /* Write all incore mddb to disk */
#define MDDB_WRITECOPY_SYNC 2 /* Write incore mddb to disk except */
/* - change log records */
/* - optimized resync records */
/* NOTE(review): probe / no-probe selector; its consumer is not visible here. */
#define MDDB_PROBE 1
#define MDDB_NOPROBE 0
/*
 * MN diskset definitions used to determine if a slave can write
 * directly to the mddb.  MDDB_WR_ONLY_MASTER only allows the master node
 * to write to the mddb.  MDDB_WR_ANY_NODE allows any node to write
 * to the mddb.
 */
#define MDDB_WR_ONLY_MASTER 0
#define MDDB_WR_ANY_NODE 1
/* Record lock state bits. */
#define MDDB_L_LOCKED 0x0001 /* this record is locked */
#define MDDB_L_WANTED 0x0002
#ifdef __cplusplus
}
#endif
#endif /* _SYS_MD_MDDB_H */
|