author | eschrock <none@none> | 2007-06-12 13:18:17 -0700
committer | eschrock <none@none> | 2007-06-12 13:18:17 -0700
commit | 3d7072f8bd27709dba14f6fe336f149d25d9e207 (patch)
tree | d325ae63ce74901b55494e8a0dc011b9e2e13d43 /usr/src
parent | a5b881a79e40ec2c21d682e676b130a1ee3d2a73 (diff)
download | illumos-joyent-3d7072f8bd27709dba14f6fe336f149d25d9e207.tar.gz
PSARC 2007/197 ZFS hotplug
PSARC 2007/283 FMA for ZFS Phase 2
6401126 ZFS DE should verify that diagnosis is still valid before solving cases
6500545 ZFS does not handle changes in devids
6508521 zpool online should warn when it is being used incorrectly
6509807 ZFS checksum ereports are not being posted
6514712 zfs_nicenum() doesn't work with perfectly-sized buffers
6520510 media state doesn't get updated properly on device removal
6520513 ZFS should have better support for device removal
6520514 vdev state should be controlled through a single ioctl()
6520519 ZFS should diagnose faulty devices
6520947 ZFS DE should close cases which no longer apply
6521393 ZFS case timeout should be FMD_TYPE_TIME
6521624 fmd_hash_walk() can dump core when given a bad address
6521946 ZFS DE needlessly subscribes to faults
6522085 ZFS dictionary files contain spelling errors
6523185 vdev_reopen() doesn't correctly propagate state
6523555 'zpool online' should be less chatty unless something goes wrong
6527379 zpool(1M) should not try to open faulted devices
6527700 ZFS should post a sysevent when topology changes
6528194 lofi should support force unmap and DKIO_DEV_GONE
6528732 ZFS should store physical device path in addition to /dev path
6532635 ZFS keeps devices open unnecessarily
6532979 bad argument to ZFS_IOC_VDEV_ATTACH can panic system
6567983 deadlock with spa_scrub_thread() and spa_namespace_lock
Diffstat (limited to 'usr/src')
47 files changed, 2694 insertions, 754 deletions
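Much of the diff below (zfs_de.c) keys its new I/O and checksum diagnosis off per-vdev SERD engines whose names encode the pool and vdev GUIDs. The following is an illustrative, stand-alone sketch of that naming scheme, not part of the commit: it reuses the MAX_SERDLEN sizing and the "zfs_%llx_%llx_%s" format string that appear in the diff, while the GUID values are hypothetical.

```c
#include <stdio.h>
#include <stdint.h>

/*
 * Mirrors the MAX_SERDLEN added in zfs_de.c: room for two 64-bit GUIDs in
 * hex (16 characters each) plus the longest suffix, "zfs___checksum",
 * whose sizeof already includes the terminating NUL (32 + 15 = 47 bytes).
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))

/* Same layout as the new zfs_serd_name(): zfs_<pool_guid>_<vdev_guid>_<type>. */
static void
serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
	    (unsigned long long)pool_guid,
	    (unsigned long long)vdev_guid, type);
}

int
main(void)
{
	char io[MAX_SERDLEN], cksum[MAX_SERDLEN];

	/* Hypothetical pool and vdev GUIDs, for illustration only. */
	serd_name(io, 0xb0ffc48385d4b8b9ULL, 0x1122334455667788ULL, "io");
	serd_name(cksum, 0xb0ffc48385d4b8b9ULL, 0x1122334455667788ULL,
	    "checksum");

	(void) printf("%s\n%s\n", io, cksum);
	return (0);
}
```

Because each (pool, vdev) pair can have at most one "io" and one "checksum" engine under this scheme, the diagnosis engine can store the engine names directly in the persistent per-case data (zc_serd_io and zc_serd_checksum in zfs_case_data_t below).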
diff --git a/usr/src/cmd/fm/dicts/ZFS.dict b/usr/src/cmd/fm/dicts/ZFS.dict index 5d6cc6619e..8bc31a6eee 100644 --- a/usr/src/cmd/fm/dicts/ZFS.dict +++ b/usr/src/cmd/fm/dicts/ZFS.dict @@ -41,3 +41,5 @@ ereport.fs.zfs.device.version_mismatch=10 fault.fs.zfs.pool=11 fault.fs.zfs.device=12 ereport.fs.zfs.pool.hostname_mismatch=13 +fault.fs.zfs.vdev.io=14 +fault.fs.zfs.vdev.checksum=15 diff --git a/usr/src/cmd/fm/dicts/ZFS.po b/usr/src/cmd/fm/dicts/ZFS.po index 3d05a57268..7caa3c46c7 100644 --- a/usr/src/cmd/fm/dicts/ZFS.po +++ b/usr/src/cmd/fm/dicts/ZFS.po @@ -40,7 +40,7 @@ msgstr "No automated response will be taken." msgid "ZFS-8000-14.impact" msgstr "ZFS filesystems are not available" msgid "ZFS-8000-14.action" -msgstr "\nTo determine which pools are availabe for import, run the 'zpool status'\ncommand:\n\n\n# zpool import\n pool: test\n id: 12743384782310107047\n state: ONLINE\naction: The pool can be imported using its name or numeric identifier.\nconfig:\n\n test ONLINE\n c0t0d0 ONLINE\n#\n\n\nThis will automatically scan /dev/dsk for any devices\npart of a pool. If you previously had storage pools with devices in a\ndifferent directory, us the '-d' option to 'zpool import' to scan alternate\nlocations.\n\nOnce you have determined which pools are available for import, you can\nimport the pool explicitly by specifying the name or numeric identifier:\n\n\n# zpool import test\n#\n\n\nAlternately, you can import all available pools by specifying the '-a'\noption. Once a pool has been imported, the ZFS cache will be repaired so\nthat the pool will appear normally in the future.\n " +msgstr "\nZFS keeps a list of active pools on the filesystem to avoid having to\nscan all devices when the system is booted. If this file is corrupted, then\nnormally active pools will not be automatically opened. The pools can be\nrecovered using the 'zpool import' command:\n\n\n# zpool import\n pool: test\n id: 12743384782310107047\n state: ONLINE\naction: The pool can be imported using its name or numeric identifier.\nconfig:\n\n test ONLINE\n c0t0d0 ONLINE\n\n\nThis will automatically scan /dev/dsk for any\ndevices part of a pool. If devices have been made available in an alternate\nlocation, use the '-d' option to 'zpool import' to search for devices in a\ndifferent directory.\n\nOnce you have determined which pools are available for import, you can\nimport the pool explicitly by specifying the name or numeric\nidentifier:\n\n\n# zpool import test\n\n\nAlternately, you can import all available pools by specifying the\n'-a' option. Once a pool has been imported, the ZFS cache will be repaired so\nthat the pool will appear normally in the future.\n " # # code: ZFS-8000-2Q # keys: ereport.fs.zfs.device.missing_r @@ -52,11 +52,11 @@ msgstr "Major" msgid "ZFS-8000-2Q.description" msgstr "A device in a replicated configuration could not be\n opened. Refer to %s for more information." msgid "ZFS-8000-2Q.response" -msgstr "No automated response will be taken." +msgstr "A hot spare will be activated if available." msgid "ZFS-8000-2Q.impact" msgstr "The pool is no longer providing the configured level of\n replication." msgid "ZFS-8000-2Q.action" -msgstr "\nIf this error was encountered while running 'zpool import', please see the\nsection below. Otherwise, run 'zpool status -x' to determine which pool has\nexperienced a failure:\n\n\n# zpool status -x\n pool: test\n state: DEGRADED\nstatus: One or more devices could not be opened. 
Sufficient replicas exist for\n the pool to continue functioning in a degraded state.\naction: Attach the missing device and online it using 'zpool online'.\n see: http://www.sun.com/msg/ZFS-8000-2Q\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test DEGRADED 0 0 0\n mirror DEGRADED 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 FAULTED 0 0 0 cannot open\n#\n\n\nDetermine which device failed to open by looking for a FAULTED device with\nan additional 'cannot open' message. If this device has been inadvertently\nremoved from the system, attach the device and bring it online with 'zpool\nonline':\n\n\n# zpool online test c0t0d1\nBringing device 'c0t0d1' online\n#\n\n\nIf the device is no longer available, the device can be replaced using the\n'zpool replace' command:\n\n\n# zpool replace test c0t0d1 c0t0d2\n#\n\n\nExisting data will be resilvered to the new device. Once the resilvering\ncompletes, the device will be removed from the pool.\n\n\nIf this error is encountered during a 'zpool import', it means that one of\nthe devices is not attached to the system:\n\n\n# zpool import\n pool: test\n id: 10121266328238932306\n state: DEGRADED\nstatus: One or more devices are missing from the system.\naction: The pool can be imported despite missing or damaged devices. The\n fault tolerance of the pool may be compromised if imported.\n see: http://www.sun.com/msg/ZFS-8000-2Q\nconfig:\n\n test DEGRADED\n mirror DEGRADED\n c0t0d0 ONLINE\n c0t0d1 FAULTED cannot open\n\n\nUnlike when the pool is active on the system, the device cannot be replaced\nwhile the pool is exported. If the device can be attached to the system,\nattach the device and run 'zpool import' again.\n\nAlternatively, the pool can be imported as-is, though it will be placed in\nthe DEGRADED state due to a missing device. Once the pool has been\nimported, the missing device can be replaced as described above.\n " +msgstr "\nFor an active pool\n\nIf this error was encountered while running 'zpool import', please see\nthe section below. Otherwise, run 'zpool status -x' to determine which pool has\nexperienced a failure:\n\n\n# zpool status -x\n pool: test\n state: DEGRADED\nstatus: One or more devices could not be opened. Sufficient replicas exist for\n the pool to continue functioning in a degraded state.\naction: Attach the missing device and online it using 'zpool online'.\n see: http://www.sun.com/msg/ZFS-8000-2Q\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test DEGRADED 0 0 0\n mirror DEGRADED 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 FAULTED 0 0 0 cannot open\n\nerrors: No known data errors\n\n\nDetermine which device failed to open by looking for a FAULTED device\nwith an additional 'cannot open' message. If this device has been inadvertently\nremoved from the system, attach the device and bring it online with 'zpool\nonline':\n\n\n# zpool online test c0t0d1\n\n\nIf the device is no longer available, the device can be replaced using\nthe 'zpool replace' command:\n\n\n# zpool replace test c0t0d1 c0t0d2\n\n\nIf the device has been replaced by another disk in the same physical\nslot, then the device can be replaced using a single argument to the 'zpool\nreplace' command:\n\n\n# zpool replace test c0t0d1\n\n\nExisting data will be resilvered to the new device. 
Once the\nresilvering completes, the device will be removed from the pool.\n\nFor an exported pool\n\nIf this error is encountered during a 'zpool import', it means that one\nof the devices is not attached to the system:\n\n\n# zpool import\n pool: test\n id: 10121266328238932306\n state: DEGRADED\nstatus: One or more devices are missing from the system.\naction: The pool can be imported despite missing or damaged devices. The\n fault tolerance of the pool may be compromised if imported.\n see: http://www.sun.com/msg/ZFS-8000-2Q\nconfig:\n\n test DEGRADED\n mirror DEGRADED\n c0t0d0 ONLINE\n c0t0d1 FAULTED cannot open\n\n\nUnlike when the pool is active on the system, the device cannot be\nreplaced while the pool is exported. If the device can be attached to the\nsystem, attach the device and run 'zpool import' again.\n\nAlternatively, the pool can be imported as-is, though it will be placed\nin the DEGRADED state due to a missing device. The device will be marked as\nUNAVAIL. Once the pool has been imported, the missing device can be replaced as\ndescribed above.\n " # # code: ZFS-8000-3C # keys: ereport.fs.zfs.device.missing_nr @@ -72,7 +72,7 @@ msgstr "No automated response will be taken." msgid "ZFS-8000-3C.impact" msgstr "The pool is no longer available" msgid "ZFS-8000-3C.action" -msgstr "\nIf this error was encountered while running 'zpool import', please see the\nsection below. Otherwise, run 'zpool status -x' to determine which pool\nhas experienced a failure:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: One or more devices could not be opened. There are insufficient\n replicas for the pool to continue functioning.\naction: Attach the missing device and online it using 'zpool online'.\n see: http://www.sun.com/msg/ZFS-8000-3C\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 insufficient replicas\n c0t0d0 ONLINE 0 0 0\n c0t0d1 FAULTED 0 0 0 cannot open\n#\n\n\nAttach the device to the system and run 'zpool status' again. The pool\nshould automatically detect the newly attached device and resume\nfunctioning. You may have to mount the filesystems in the pool explicitly\nusing 'zfs mount'.\n\nIf the device is no longer available and cannot be reattached to the system,\nthen the pool must be destroyed and re-created from a backup source.\n\n\nIf this error is encountered during a 'zpool import', it means that one of\nthe devices is not attached to the system:\n\n\n# zpool import\n pool: test\n id: 10121266328238932306\n state: FAULTED\nstatus: One or more devices are missing from the system.\naction: The pool cannot be imported. Attach the missing devices and\n try again.\n see: http://www.sun.com/msg/ZFS-8000-3C\nconfig:\n\n test FAULTED insufficient replicas\n c0t0d0 ONLINE\n c0t0d1 FAULTED cannot open\n\n\nThe pool cannot be imported until the missing device is attached to the\nsystem. If the device has been made available in an alternate location, you\ncan use the '-d' option to 'zpool import' to search for devices in a\ndifferent directory.\n " +msgstr "\nFor an active pool\n\nIf this error was encountered while running 'zpool import', please see\nthe section below. Otherwise, run 'zpool status -x' to determine which pool\nhas experienced a failure:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: One or more devices could not be opened. 
There are insufficient\n replicas for the pool to continue functioning.\naction: Attach the missing device and online it using 'zpool online'.\n see: http://www.sun.com/msg/ZFS-8000-3C\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 insufficient replicas\n c0t0d0 ONLINE 0 0 0\n c0t0d1 FAULTED 0 0 0 cannot open\n\nerrors: No known data errors\n\n\nIf the device has been temporarily detached from the system, attach the\ndevice to the system and run 'zpool status' again. The pool should\nautomatically detect the newly attached device and resume functioning. You may\nhave to mount the filesystems in the pool explicitly using 'zfs\nmount -a'.\n\nIf the device is no longer available and cannot be reattached to the\nsystem, then the pool must be destroyed and re-created from a backup\nsource.\n\nFor an exported pool\n\nIf this error is encountered during a 'zpool import', it means that one\nof the devices is not attached to the system:\n\n\n# zpool import\n pool: test\n id: 10121266328238932306\n state: FAULTED\nstatus: One or more devices are missing from the system.\naction: The pool cannot be imported. Attach the missing devices and\n try again.\n see: http://www.sun.com/msg/ZFS-8000-3C\nconfig:\n\n test FAULTED insufficient replicas\n c0t0d0 ONLINE\n c0t0d1 FAULTED cannot open\n\n\nThe pool cannot be imported until the missing device is attached to the\nsystem. If the device has been made available in an alternate location, use the\n'-d' option to 'zpool import' to search for devices in a different directory.\nIf the missing device is unavailable, then the pool cannot be imported.\n " # # code: ZFS-8000-4J # keys: ereport.fs.zfs.device.corrupt_label_r @@ -84,11 +84,11 @@ msgstr "Major" msgid "ZFS-8000-4J.description" msgstr "A device could not be opened due to a missing or invalid\n device label. Refer to %s for more information." msgid "ZFS-8000-4J.response" -msgstr "No automated response will be taken." +msgstr "A hot spare will be activated if available." msgid "ZFS-8000-4J.impact" msgstr "The pool is no longer providing the configured level of\n replication." msgid "ZFS-8000-4J.action" -msgstr "\nIf this error is encountered while running 'zpool import', see the section\nbelow. Otherwise, run 'zpool status -x' to determine which pool has the\ndamaged device:\n\n\n# zpool status -x\n pool: test\n state: DEGRADED\nstatus: One or more devices could not be used because the label is missing or\n invalid. Sufficient replicas exist for the pool to continue\n functioning in a degraded state.\naction: Replace the device using 'zpool replace'.\n see: http://www.sun.com/msg/ZFS-8000-4J\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test DEGRADED 0 0 0\n mirror DEGRADED 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 FAULTED 0 0 0 corrupted data\n\n\nDetermine which device is damaged by locating the FAULTED device showing\n'corrupted data'. This indicates that the device label was corrupt. Because\nZFS could not identify the device as the one expected, no automatic resilvering\nwill take place.\n\nThe device can be resilvered by issuing 'zpool replace':\n\n\n# zpool replace test c0t0d1\n\n\nThis will replace the device in situ. To replace the device with another,\ndifferent, device, run 'zpool replace' with an additional argument specifying\nthe new device:\n\n\n# zpool replace test c0t0d1 c0t0d2\n\n\nZFS will being migrating data to the new device as soon as the replace is\nissued. 
Once the resilvering completes, the original device (if different from\nthe replacement) will be removed, and the pool will be restored to the ONLINE\nstate.\n\n\nIf this error is encountered while running 'zpool import', the pool can be still\nbe imported despite the failure:\n\n\n# zpool import\n pool: test\n id: 5187963178597328409\n state: DEGRADED\nstatus: One or more devices contains corrupted data. The fault tolerance of\n the pool may be compromised if imported.\naction: The pool can be imported using its name or numeric identifier.\n see: http://www.sun.com/msg/ZFS-8000-4J\nconfig:\n\n test DEGRADED\n mirror DEGRADED\n /disk/a ONLINE\n /disk/b FAULTED corrupted data\n\n\nTo import the pool, run 'zpool import':\n\n\n# zpool import test\n\n\nOnce the pool has been imported, the damaged device can be replaced according to\nthe above procedure.\n " +msgstr "\nFor an active pool\n\nIf this error was encountered while running 'zpool import', please see\nthe section below. Otherwise, run 'zpool status -x' to determine which pool\nhas experienced a failure:\n\n\n\n# zpool status -x\n pool: test\n state: DEGRADED\nstatus: One or more devices could not be used because the label is missing or\n invalid. Sufficient replicas exist for the pool to continue\n functioning in a degraded state.\naction: Replace the device using 'zpool replace'.\n see: http://www.sun.com/msg/ZFS-8000-4J\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test DEGRADED 0 0 0\n mirror DEGRADED 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 FAULTED 0 0 0 corrupted data\n\nerrors: No known data errors\n\n\nIf the device has been temporarily detached from the system, attach the\ndevice to the system and run 'zpool status' again. The pool should\nautomatically detect the newly attached device and resume functioning.\n\nIf the device is no longer available, it can be replaced using 'zpool\nreplace':\n\n\n# zpool replace test c0t0d1 c0t0d2\n\n\nIf the device has been replaced by another disk in the same physical\nslot, then the device can be replaced using a single argument to the 'zpool\nreplace' command:\n\n\n# zpool replace test c0t0d1\n\n\nZFS will begin migrating data to the new device as soon as the replace\nis issued. Once the resilvering completes, the original device (if different\nfrom the replacement) will be removed, and the pool will be restored to the\nONLINE state.\n\nFor an exported pool\n\nIf this error is encountered while running 'zpool import', the pool can\nbe still be imported despite the failure:\n\n\n# zpool import\n pool: test\n id: 5187963178597328409\n state: DEGRADED\nstatus: One or more devices contains corrupted data. The fault tolerance of\n the pool may be compromised if imported.\naction: The pool can be imported using its name or numeric identifier.\n see: http://www.sun.com/msg/ZFS-8000-4J\nconfig:\n\n test DEGRADED\n mirror DEGRADED\n c0t0d0 ONLINE\n c0t0d1 FAULTED corrupted data\n\n\nTo import the pool, run 'zpool import':\n\n\n# zpool import test\n\n\nOnce the pool has been imported, the damaged device can be replaced\naccording to the above procedure.\n " # # code: ZFS-8000-5E # keys: ereport.fs.zfs.device.corrupt_label_nr @@ -104,7 +104,7 @@ msgstr "No automated response will be taken." 
msgid "ZFS-8000-5E.impact" msgstr "The pool is no longer available" msgid "ZFS-8000-5E.action" -msgstr "\nIf this error is encountered during 'zpool import', see the section below.\nOtherwise, run 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: One or more devices could not be used because the the label is missing \n or invalid. There are insufficient replicas for the pool to continue\n functioning.\naction: Destroy and re-create the pool from a backup source.\n see: http://www.sun.com/msg/ZFS-8000-5E\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 insufficient replicas\n c0t0d0 FAULTED 0 0 0 corrupted data\n c0t0d1 ONLINE 0 0 0\n\n\nThe device listed as FAULTED with 'corrupted data' cannot be opened due to a\ncorrupt label. ZFS will be unable to use the pool, and all data within the pool\nis irrevocably lost. The pool must be destroyed and recreated from an\nappropriate backup source. Using replicated configurations will prevent this\nfrom happening in the future.\n\n\nIf this error is enountered during 'zpool import', the action is the same. The\npool cannot be imported - all data is lost and must be restored from an\nappropriate backup source.\n " +msgstr "\nFor an active pool\n\nIf this error was encountered while running 'zpool import', please see\nthe section below. Otherwise, run 'zpool status -x' to determine which pool\nhas experienced a failure:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: One or more devices could not be used because the the label is missing \n or invalid. There are insufficient replicas for the pool to continue\n functioning.\naction: Destroy and re-create the pool from a backup source.\n see: http://www.sun.com/msg/ZFS-8000-5E\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 insufficient replicas\n c0t0d0 FAULTED 0 0 0 corrupted data\n c0t0d1 ONLINE 0 0 0\n\nerrors: No known data errors\n\n\nThe device listed as FAULTED with 'corrupted data' cannot be opened due\nto a corrupt label. ZFS will be unable to use the pool, and all data within the\npool is irrevocably lost. The pool must be destroyed and recreated from an\nappropriate backup source. Using replicated configurations will prevent this\nfrom happening in the future.\n\nFor an exported pool\n\nIf this error is encountered during 'zpool import', the action is the\nsame. The pool cannot be imported - all data is lost and must be restored from\nan appropriate backup source.\n " # # code: ZFS-8000-6X # keys: ereport.fs.zfs.pool.bad_guid_sum @@ -120,7 +120,7 @@ msgstr "No automated response will be taken." msgid "ZFS-8000-6X.impact" msgstr "The pool cannot be imported" msgid "ZFS-8000-6X.action" -msgstr "\nRun 'zpool import' to list which pool cannot be imported:\n\n\n# zpool import\n pool: test\n id: 13783646421373024673\n state: FAULTED\nstatus: One or more devices are missing from the system.\naction: The pool cannot be imported. Attach the missing\n devices and try again.\n see: http://www.sun.com/msg/ZFS-8000-6X\nconfig:\n\n test FAULTED missing device\n c0t0d0 ONLINE\n\n Additional devices are known to be part of this pool, though their\n exact configuration cannot be determined.\n\n\nZFS attempts to store enough configuration data on the devices such that the\nconfiguration is recoverable from any subset of devices. 
In some cases,\nparticularly when an entire toplevel virtual device is not attached to the\nsystem, ZFS will be unable to determine the complete configuration. It will\nalways detect that these devices are missing, even if it cannot identify all of\nthe devices.\n\nThe unknown missing devices must be attached to the system, at which point\n'zpool import' can be used to import the pool.\n " +msgstr "\nRun 'zpool import' to list which pool cannot be imported:\n\n\n# zpool import\n pool: test\n id: 13783646421373024673\n state: FAULTED\nstatus: One or more devices are missing from the system.\naction: The pool cannot be imported. Attach the missing\n devices and try again.\n see: http://www.sun.com/msg/ZFS-8000-6X\nconfig:\n\n test FAULTED missing device\n c0t0d0 ONLINE\n\n Additional devices are known to be part of this pool, though their\n exact configuration cannot be determined.\n\n\nZFS attempts to store enough configuration data on the devices such\nthat the configuration is recoverable from any subset of devices. In some\ncases, particularly when an entire toplevel virtual device is not attached to\nthe system, ZFS will be unable to determine the complete configuration. It will\nalways detect that these devices are missing, even if it cannot identify all of\nthe devices.\n\nThe pool cannot be imported until the unknown missing device is\nattached to the system. If the device has been made available in an alternate\nlocation, use the '-d' option to 'zpool import' to search for devices in a\ndifferent directory. If the missing device is unavailable, then the pool cannot\nbe imported.\n " # # code: ZFS-8000-72 # keys: ereport.fs.zfs.pool.corrupt_pool @@ -136,7 +136,7 @@ msgstr "No automated response will be taken." msgid "ZFS-8000-72.impact" msgstr "The pool is no longer available" msgid "ZFS-8000-72.action" -msgstr "\nIf this error is encountered during 'zpool import', see the section below.\nOtherwise, run 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n# zpool import\n pool: test\n id: 13783646421373024673\n state: FAULTED\nstatus: The pool metadata is corrupted and cannot be opened.\naction: Destroy the pool and restore from backup.\n see: http://www.sun.com/msg/ZFS-8000-72\nconfig:\n\n test FAULTED corrupted data\n c0t0d0 ONLINE\n c0t0d1 ONLINE\n\n\nEven though all the devices are available, the on-disk data has been corrupted\nsuch that the pool cannot be opened. All data within the pool is lost, and the\npool must be destroyed and restored from an appropriate backup source.\n\n\nIf this error is encountered during 'zpool import', the pool is unrecoverable\nand cannot be imported. The pool must be restored from an appropriate backup\nsource.\n " +msgstr "\nIf this error is encountered during 'zpool import', see the section\nbelow. Otherwise, run 'zpool status -x' to determine which pool is\nfaulted:\n\n\n# zpool status -x\n# zpool import\n pool: test\n id: 13783646421373024673\n state: FAULTED\nstatus: The pool metadata is corrupted and cannot be opened.\naction: Destroy the pool and restore from backup.\n see: http://www.sun.com/msg/ZFS-8000-72\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 2 corrupted data\n mirror DEGRADED 0 0 2\n c0t0d0 ONLINE 0 0 2\n c0t0d1 ONLINE 0 0 2\n\nerrors: No known errors\n\n\nEven though all the devices are available, the on-disk data has been\ncorrupted such that the pool cannot be opened. All data within the pool is\nlost, and the pool must be destroyed and restored from an appropriate backup\nsource. 
ZFS includes built-in metadata replication to prevent this from\nhappening even for unreplicated pools, but running in a replicated configuration\nwill decrease the chances of this happening in the future.\n\nIf this error is encountered during 'zpool import', the pool is\nunrecoverable and cannot be imported. The pool must be restored from an\nappropriate backup source.\n " # # code: ZFS-8000-8A # keys: ereport.fs.zfs.object.corrupt_data @@ -152,7 +152,7 @@ msgstr "No automated response will be taken." msgid "ZFS-8000-8A.impact" msgstr "The file or directory is unavailable." msgid "ZFS-8000-8A.action" -msgstr "\nRun 'zpool status -x' to determine which pool is damaged:\n\n\n# zpool status -x\n pool: test\n state: ONLINE\nstatus: One or more devices has experienced an error and no valid replicas\n are available. Some filesystem data is corrupt, and applications\n may have been affected.\naction: Destroy the pool and restore from backup.\n see: http://www.sun.com/msg/ZFS-8000-8A\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 2\n c0t0d0 ONLINE 0 0 2\n c0t0d1 ONLINE 0 0 0\n\n\nUnfrotunately, the data cannot be repaired, and the only choice to repair the\ndata is to restore the pool from backup. Applications attempting to access the\ncorrupted data will get an error (EIO), and data may be permanently lost.\n " +msgstr "\nRun 'zpool status -x' to determine which pool is damaged:\n\n\n# zpool status -x\n pool: test\n state: ONLINE\nstatus: One or more devices has experienced an error and no valid replicas\n are available. Some filesystem data is corrupt, and applications\n may have been affected.\naction: Destroy the pool and restore from backup.\n see: http://www.sun.com/msg/ZFS-8000-8A\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 2\n c0t0d0 ONLINE 0 0 2\n c0t0d1 ONLINE 0 0 0\n\nerrors: 1 data errors, use '-v' for a list\n\n\nUnfortunately, the data cannot be repaired, and the only choice to\nrepair the data is to restore the pool from backup. Applications attempting to\naccess the corrupted data will get an error (EIO), and data may be permanently\nlost.\n\nOn recent versions of Solaris, the list of affected files can be\nretrieved by using the '-v' option to 'zpool status':\n\n\n# zpool status -xv\n pool: test\n state: ONLINE\nstatus: One or more devices has experienced an error and no valid replicas\n are available. Some filesystem data is corrupt, and applications\n may have been affected.\naction: Destroy the pool and restore from backup.\n see: http://www.sun.com/msg/ZFS-8000-8A\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 2\n c0t0d0 ONLINE 0 0 2\n c0t0d1 ONLINE 0 0 0\n\nerrors: Permanent errors have been detected in the following files:\n\n /export/example/foo\n\n\nDamaged files may or may not be able to be removed depending on the\ntype of corruption. If the corruption is within the plain data, the file should\nbe removable. If the corruption is in the file metadata, then the file cannot\nbe removed, though it can be moved to an alternate location. In either case,\nthe data should be restored from a backup source. It is also possible for the\ncorruption to be within pool-wide metadata, resulting in entire datasets being\nunavailable. 
If this is the case, the only option is to destroy the pool and\nre-create the datasets from backup.\n " # # code: ZFS-8000-9P # keys: ereport.fs.zfs.device.failing @@ -166,9 +166,9 @@ msgstr "A device has experienced uncorrectable errors in a\n replicated conf msgid "ZFS-8000-9P.response" msgstr "ZFS has attempted to repair the affected data." msgid "ZFS-8000-9P.impact" -msgstr "The system is unaffected. The detected errors may\n indicate future failure." +msgstr "The system is unaffected, though errors may indicate future\n failure. Future errors may cause ZFS to automatically fault\n the device." msgid "ZFS-8000-9P.action" -msgstr "\nRun 'zpool status -x' to determine which pool has experienced errors:\n\n\n# zpool status\n pool: test\n state: ONLINE\nstatus: One or more devices has experienced an unrecoverable error. An\n attempt was made to correct the error. Applications are unaffected.\naction: Determine if the device needs to be replaced, and clear the errors\n using 'zpool online' or replace the device with 'zpool replace'.\n see: http://www.sun.com/msg/ZFS-8000-9P\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 0\n mirror ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 2\n c0t0d1 ONLINE 0 0 0\n\n\nFind the device with a non-zero error count for READ, WRITE, or CKSUM. This\nindicates that the device has experienced a read I/O error, write I/O error, or\nchecksum validation error. Because the device is part of a mirror or RAID-Z\ndevice, ZFS was able to recover from the error and subsequently repair the\ndamaged data.\n\nThese error counts may or may not indicate that the device needs replacement.\nIt depends on how the errors were caused, which the administrator needs to\ndetermine. For example, the following cases will all produce errors that do not\nindicate potential device failure:\n\n\nA network attached device lost connectivity but has now\nrecovered\nA device suffered from a bit flip, and expected event over long\nperiods of time\nAn adminstrator accidentally wrote over a portion of the disk using\nanother program\n\n\nIn these cases, the presence of errors does not indicate that the device is\nlikely to fail in the future, and therefore does not need to be replaced. If\nthis is the case, then the device errors should be cleared using 'zpool online':\n\n\n# zpool online test c0t0d0\n\n\nOn the other hand, errors may very well indicate that the device has failed or\nis about to fail. If there are continual I/O errors to a device that is\notherwise attached and functioning on the system, it most likely needs to be\nreplaced. The administrator should check the system log for any driver\nmessages that may indicate hardware failure. 
If it is determined that the\ndevice needs to be replaced, then the 'zpool replace' command should be used:\n\n\n# zpool replace test c0t0d0 c0t0d2\n\n\nThis will attach the new device to the pool and begin resilvering data to it.\nOnce the resilvering process is complete, the old device will automatically be\nremoved from the pool, at which point it can safely be removed from the system.\nIf the device needs to be replaced in-place (because there are no available\nspare devices), the original device can be removed and replaced with a new\ndevice, at which point a different form of 'zpool replace' can be used:\n\n\n# zpool replace test c0t0d0\n\n\nThis assumes that the original device at 'c0t0d0' has been replaced with a new\ndevice under the same path, and will be replaced appropriately\n\nYou can monitor the progress of the resilvering operation by using the 'zpool\nstatus -x' command:\n\n\n# zpool status -x\n pool: test\n state: DEGRADED\nstatus: One or more devices is currently being replaced. The pool may not be\n providing the necessary level of replication.\naction: Wait for the resilvering operation to complete\n scrub: resilver in progress, 0.14% done, 0h0m to go\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 0\n mirror ONLINE 0 0 0\n replacing ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 3\n c0t0d2 ONLINE 0 0 0 58.5K resilvered\n c0t0d1 ONLINE 0 0 0\n\n " +msgstr "\nRun 'zpool status -x' to determine which pool has experienced\nerrors:\n\n\n# zpool status\n pool: test\n state: ONLINE\nstatus: One or more devices has experienced an unrecoverable error. An\n attempt was made to correct the error. Applications are unaffected.\naction: Determine if the device needs to be replaced, and clear the errors\n using 'zpool online' or replace the device with 'zpool replace'.\n see: http://www.sun.com/msg/ZFS-8000-9P\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 0\n mirror ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 2\n c0t0d1 ONLINE 0 0 0\n\nerrors: No known data errors\n\n\nFind the device with a non-zero error count for READ, WRITE, or CKSUM.\nThis indicates that the device has experienced a read I/O error, write I/O\nerror, or checksum validation error. Because the device is part of a mirror or\nRAID-Z device, ZFS was able to recover from the error and subsequently repair\nthe damaged data.\n\nIf these errors persist over a period of time, ZFS may determine the\ndevice is faulty and mark it as such. However, these error counts may or may\nnot indicate that the device is unusable. It depends on how the errors were\ncaused, which the administrator can determine in advance of any ZFS diagnosis.\nFor example, the following cases will all produce errors that do not indicate\npotential device failure:\n\n\nA network attached device lost connectivity but has now\nrecovered\nA device suffered from a bit flip, an expected event over long\nperiods of time\nAn administrator accidentally wrote over a portion of the disk using\nanother program\n\n\nIn these cases, the presence of errors does not indicate that the\ndevice is likely to fail in the future, and therefore does not need to be\nreplaced. If this is the case, then the device errors should be cleared using\n'zpool clear':\n\n\n# zpool clear test c0t0d0\n\n\nOn the other hand, errors may very well indicate that the device has\nfailed or is about to fail. If there are continual I/O errors to a device that\nis otherwise attached and functioning on the system, it most likely needs to be\nreplaced. 
The administrator should check the system log for any driver\nmessages that may indicate hardware failure. If it is determined that the\ndevice needs to be replaced, then the 'zpool replace' command should be\nused:\n\n\n# zpool replace test c0t0d0 c0t0d2\n\n\nThis will attach the new device to the pool and begin resilvering data\nto it. Once the resilvering process is complete, the old device will\nautomatically be removed from the pool, at which point it can safely be removed\nfrom the system. If the device needs to be replaced in-place (because there are\nno available spare devices), the original device can be removed and replaced\nwith a new device, at which point a different form of 'zpool replace' can be\nused:\n\n\n# zpool replace test c0t0d0\n\n\nThis assumes that the original device at 'c0t0d0' has been replaced\nwith a new device under the same path, and will be replaced\nappropriately.\n\nYou can monitor the progress of the resilvering operation by using the\n'zpool status -x' command:\n\n\n# zpool status -x\n pool: test\n state: DEGRADED\nstatus: One or more devices is currently being replaced. The pool may not be\n providing the necessary level of replication.\naction: Wait for the resilvering operation to complete\n scrub: resilver in progress, 0.14% done, 0h0m to go\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test ONLINE 0 0 0\n mirror ONLINE 0 0 0\n replacing ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 3\n c0t0d2 ONLINE 0 0 0 58.5K resilvered\n c0t0d1 ONLINE 0 0 0\n\nerrors: No known data errors\n\n " # # code: ZFS-8000-A5 # keys: ereport.fs.zfs.device.version_mismatch @@ -180,11 +180,11 @@ msgstr "Major" msgid "ZFS-8000-A5.description" msgstr "The on-disk version is not compatible with the running\n system. Refer to %s for more information." msgid "ZFS-8000-A5.response" -msgstr "No automated response will occur," +msgstr "No automated response will occur." msgid "ZFS-8000-A5.impact" -msgstr "The pool is unavailable" +msgstr "The pool is unavailable." msgid "ZFS-8000-A5.action" -msgstr "\nIf this error is seen during 'zpool import', see the section below. Otherwise,\nrun 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: The ZFS version for the pool is incompatible with the software running\n on this system.\naction: Destroy and re-create the pool.\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 incompatible version\n mirror ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 ONLINE 0 0 0\n\n\nThe pool cannot be used on this system. Either move the disks to the system\nwhere they were originally created, or destroy the pool and re-create it from\nbackup.\n\n\nIf this error is seen during import, the pool cannot be imported on the current\nsystem. The disks must be attached to the system which originally created the\npool, and imported there.\n " +msgstr "\nIf this error is seen during 'zpool import', see the section below.\nOtherwise, run 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: The ZFS version for the pool is incompatible with the software running\n on this system.\naction: Destroy and re-create the pool.\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 incompatible version\n mirror ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 ONLINE 0 0 0\n\nerrors: No known errors\n\n\nThe pool cannot be used on this system. 
Either move the storage to the\nsystem where the pool was originally created, upgrade the current system\nsoftware to a more recent version, or destroy the pool and re-create it from\nbackup.\n\nIf this error is seen during import, the pool cannot be imported on the\ncurrent system. The disks must be attached to the system which originally\ncreated the pool, and imported there.\n\nThe list of currently supported versions can be displayed using 'zpool\nupgrade -v'.\n " # # code: ZFS-8000-CS # keys: fault.fs.zfs.pool @@ -232,4 +232,36 @@ msgstr "No automated response will be taken." msgid "ZFS-8000-EY.impact" msgstr "ZFS filesystems are not available" msgid "ZFS-8000-EY.action" -msgstr "\nTo determine which system last accessed the pool, run the 'zpool import'\ncommand:\n\n\n# zpool import\n pool: test\n id: 14702934086626715962\nstate: ONLINE\nstatus: The pool was last accessed by another system.\naction: The pool can be imported using its name or numeric identifier and\n the '-f' flag.\n see: http://www.sun.com/msg/ZFS-XXXXXXX\nconfig:\n\n test ONLINE\n c0t0d0 ONLINE\n\n# zpool import test\ncannot import 'test': pool may be in use from other system, it was last\naccessed by 'tank' (hostid: 0x1435718c) on Fri Mar 9 15:42:47 2007\nuse '-f' to import anyway\n#\n\n\nIf you are certain that the pool is not being actively accessed by another\nsystem, then you can use the '-f' option to 'zpool import' to forcible\nimport the pool.\n " +msgstr "\n\nThe pool has been written to from another host, and was not cleanly exported\nfrom the other system. Actively importing a pool on multiple systems will\ncorrupt the pool and leave it in an unrecoverable state. To determine which\nsystem last accessed the pool, run the 'zpool import' command:\n\n\n# zpool import\n pool: test\n id: 14702934086626715962\nstate: ONLINE\nstatus: The pool was last accessed by another system.\naction: The pool can be imported using its name or numeric identifier and\n the '-f' flag.\n see: http://www.sun.com/msg/ZFS-8000-EY\nconfig:\n\n test ONLINE\n c0t0d0 ONLINE\n\n# zpool import test\ncannot import 'test': pool may be in use from other system, it was last\naccessed by 'tank' (hostid: 0x1435718c) on Fri Mar 9 15:42:47 2007\nuse '-f' to import anyway\n\n\n\nIf you are certain that the pool is not being actively accessed by another\nsystem, then you can use the '-f' option to 'zpool import' to forcibly\nimport the pool.\n\n " +# +# code: ZFS-8000-FD +# keys: fault.fs.zfs.vdev.io +# +msgid "ZFS-8000-FD.type" +msgstr "Fault" +msgid "ZFS-8000-FD.severity" +msgstr "Major" +msgid "ZFS-8000-FD.description" +msgstr "The number of I/O errors associated with a ZFS device exceeded\n acceptable levels. Refer to %s for more information." +msgid "ZFS-8000-FD.response" +msgstr "The device has been offlined and marked as faulted. An attempt\n will be made to activate a hot spare if available. " +msgid "ZFS-8000-FD.impact" +msgstr "Fault tolerance of the pool may be compromised." +msgid "ZFS-8000-FD.action" +msgstr "Run 'zpool status -x' and replace the bad device." +# +# code: ZFS-8000-GH +# keys: fault.fs.zfs.vdev.checksum +# +msgid "ZFS-8000-GH.type" +msgstr "Fault" +msgid "ZFS-8000-GH.severity" +msgstr "Major" +msgid "ZFS-8000-GH.description" +msgstr "The number of checksum errors associated with a ZFS device\nexceeded acceptable levels. Refer to %s for more information." +msgid "ZFS-8000-GH.response" +msgstr "The device has been marked as degraded. An attempt\nwill be made to activate a hot spare if available." 
+msgid "ZFS-8000-GH.impact" +msgstr "Fault tolerance of the pool may be compromised." +msgid "ZFS-8000-GH.action" +msgstr "Run 'zpool status -x' and replace the bad device." diff --git a/usr/src/cmd/fm/fmd/common/fmd_dr.c b/usr/src/cmd/fm/fmd/common/fmd_dr.c index 46fbe24bca..39237b9953 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_dr.c +++ b/usr/src/cmd/fm/fmd/common/fmd_dr.c @@ -132,8 +132,17 @@ fmd_dr_event(sysevent_t *sep) void fmd_dr_init(void) { - const char *drsubclass = ESC_DR_AP_STATE_CHANGE; - const char *devsubclass = EC_SUB_ALL; + const char *dr_subclasses[] = { + ESC_DR_AP_STATE_CHANGE + }; + const char *zfs_subclasses[] = { + ESC_ZFS_VDEV_CLEAR, + ESC_ZFS_VDEV_REMOVE, + ESC_ZFS_POOL_DESTROY + }; + const char *dev_subclasses[] = { + EC_SUB_ALL + }; if (geteuid() != 0) return; /* legacy sysevent mechanism is still root-only */ @@ -141,12 +150,17 @@ fmd_dr_init(void) if ((fmd.d_dr_hdl = sysevent_bind_handle(fmd_dr_event)) == NULL) fmd_error(EFMD_EXIT, "failed to bind handle for DR sysevent"); - if (sysevent_subscribe_event(fmd.d_dr_hdl, EC_DR, &drsubclass, 1) == -1) - fmd_error(EFMD_EXIT, "failed to subscribe to DR sysevent"); + if (sysevent_subscribe_event(fmd.d_dr_hdl, EC_DR, + dr_subclasses, sizeof (dr_subclasses) / sizeof (char *)) == -1) + fmd_error(EFMD_EXIT, "failed to subscribe to DR sysevents"); if (sysevent_subscribe_event(fmd.d_dr_hdl, EC_DEVFS, - &devsubclass, 1) == -1) - fmd_error(EFMD_EXIT, "failed to subscribe to devfs sysevent"); + dev_subclasses, sizeof (dev_subclasses) / sizeof (char *)) == -1) + fmd_error(EFMD_EXIT, "failed to subscribe to devfs sysevents"); + + if (sysevent_subscribe_event(fmd.d_dr_hdl, EC_ZFS, + zfs_subclasses, sizeof (zfs_subclasses) / sizeof (char *)) == -1) + fmd_error(EFMD_EXIT, "failed to subscribe to ZFS sysevents"); } void diff --git a/usr/src/cmd/fm/fmd/common/fmd_mdb.c b/usr/src/cmd/fm/fmd/common/fmd_mdb.c index d2788559cb..b55e4cf641 100644 --- a/usr/src/cmd/fm/fmd/common/fmd_mdb.c +++ b/usr/src/cmd/fm/fmd/common/fmd_mdb.c @@ -304,9 +304,15 @@ static int hash_walk_init(mdb_walk_state_t *wsp, uintptr_t addr, uint_t hashlen, const char *name, size_t size, size_t next) { - hashwalk_data_t *hwp = mdb_alloc(sizeof (hashwalk_data_t), UM_SLEEP); + hashwalk_data_t *hwp; size_t len = sizeof (uintptr_t) * hashlen; + if (len == 0) { + mdb_warn("failed to walk hash: invalid hash length\n"); + return (WALK_ERR); + } + + hwp = mdb_alloc(sizeof (hashwalk_data_t), UM_SLEEP); hwp->hw_hash = mdb_zalloc(len, UM_SLEEP); (void) mdb_vread(hwp->hw_hash, len, addr); hwp->hw_hashlen = hashlen; diff --git a/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile b/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile index 03a7a0dda4..41594038e9 100644 --- a/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile +++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# #ident "%Z%%M% %I% %E% SMI" @@ -30,4 +30,4 @@ SRCS = zfs_de.c include ../../Makefile.plugin -LDLIBS += -luutil +LDLIBS += -luutil -lzfs diff --git a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf index cd493d69bc..b077b2d913 100644 --- a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf +++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -28,5 +28,4 @@ # subscribe ereport.fs.zfs.* subscribe resource.fs.zfs.* -subscribe fault.fs.zfs.* dictionary ZFS diff --git a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c index 66251d4ad9..8c06632a06 100644 --- a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c +++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -29,11 +29,19 @@ #include <stddef.h> #include <strings.h> #include <libuutil.h> +#include <libzfs.h> #include <fm/fmd_api.h> #include <sys/fs/zfs.h> #include <sys/fm/protocol.h> #include <sys/fm/fs/zfs.h> +/* + * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This + * #define reserves enough space for two 64-bit hex values plus the length of + * the longest string. + */ +#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) + typedef struct zfs_case_data { uint64_t zc_version; uint64_t zc_ena; @@ -41,31 +49,49 @@ typedef struct zfs_case_data { uint64_t zc_vdev_guid; int zc_has_timer; int zc_pool_state; + char zc_serd_checksum[MAX_SERDLEN]; + char zc_serd_io[MAX_SERDLEN]; + int zc_has_serd_timer; } zfs_case_data_t; typedef struct zfs_case { - int zc_version; + boolean_t zc_present; + uint32_t zc_version; zfs_case_data_t zc_data; fmd_case_t *zc_case; uu_list_node_t zc_node; id_t zc_timer; + id_t zc_serd_timer; } zfs_case_t; -#define CASE_DATA "data" -#define CASE_DATA_VERSION 1 +#define CASE_DATA "data" +#define CASE_DATA_VERSION_INITIAL 1 +#define CASE_DATA_VERSION_SERD 2 -static int zfs_case_timeout; +static hrtime_t zfs_case_timeout; +static hrtime_t zfs_serd_timeout; uu_list_pool_t *zfs_case_pool; uu_list_t *zfs_cases; +/* + * Write out the persistent representation of an active case. + */ static void zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp) { + /* + * Always update cases to the latest version, even if they were the + * previous version when unserialized. + */ + zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD; fmd_buf_write(hdl, zcp->zc_case, CASE_DATA, &zcp->zc_data, sizeof (zcp->zc_data)); } +/* + * Read back the persistent representation of an active case. 
+ */ static zfs_case_t * zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) { @@ -77,14 +103,23 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, sizeof (zcp->zc_data)); - if (zcp->zc_data.zc_version != CASE_DATA_VERSION) { + if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); return (NULL); } + /* + * fmd_buf_read() will have already zeroed out the remainder of the + * buffer, so we don't have to do anything special if the version + * doesn't include the SERD engine name. + */ + if (zcp->zc_data.zc_has_timer) zcp->zc_timer = fmd_timer_install(hdl, zcp, NULL, zfs_case_timeout); + if (zcp->zc_data.zc_has_serd_timer) + zcp->zc_serd_timer = fmd_timer_install(hdl, zcp, + NULL, zfs_serd_timeout); (void) uu_list_insert_before(zfs_cases, NULL, zcp); @@ -93,22 +128,218 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) return (zcp); } +/* + * Iterate over any active cases. If any cases are associated with a pool or + * vdev which is no longer present on the system, close the associated case. + */ +static void +zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd) +{ + uint64_t vdev_guid; + uint_t c, children; + nvlist_t **child; + zfs_case_t *zcp; + int ret; + + ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); + assert(ret == 0); + + /* + * Mark any cases associated with this (pool, vdev) pair. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == vdev_guid) + zcp->zc_present = B_TRUE; + } + + /* + * Iterate over all children. + */ + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c]); + } +} + /*ARGSUSED*/ +static int +zfs_mark_pool(zpool_handle_t *zhp, void *unused) +{ + zfs_case_t *zcp; + uint64_t pool_guid = zpool_get_guid(zhp); + nvlist_t *config, *vd; + int ret; + + /* + * Mark any cases associated with just this pool. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) + zcp->zc_present = B_TRUE; + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) + return (-1); + + ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); + assert(ret == 0); + + zfs_mark_vdev(pool_guid, vd); + + return (0); +} + static void -zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) +zfs_purge_cases(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * There is no way to open a pool by GUID, or lookup a vdev by GUID. No + * matter what we do, we're going to have to stomach a O(vdevs * cases) + * algorithm. In reality, both quantities are likely so small that + * neither will matter. Given that iterating over pools is more + * expensive than iterating over the in-memory case list, we opt for a + * 'present' flag in each case that starts off cleared. We then iterate + * over all pools, marking those that are still present, and removing + * those that aren't found. + * + * Note that we could also construct an FMRI and rely on + * fmd_nvl_fmri_present(), but this would end up doing the same search. + */ + + /* + * Mark the cases an not present. 
+ */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) + zcp->zc_present = B_FALSE; + + /* + * Iterate over all pools and mark the pools and vdevs found. If this + * fails (most probably because we're out of memory), then don't close + * any of the cases and we cannot be sure they are accurate. + */ + if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) + return; + + /* + * Remove those cases which were not found. + */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + if (!zcp->zc_present) + fmd_case_close(hdl, zcp->zc_case); + } + uu_list_walk_end(walk); +} + +/* + * Construct the name of a serd engine given the pool/vdev GUID and type (io or + * checksum). + */ +static void +zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, + const char *type) +{ + (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", pool_guid, + vdev_guid, type); +} + +/* + * Solve a given ZFS case. This first checks to make sure the diagnosis is + * still valid, as well as cleaning up any pending timer associated with the + * case. + */ +static void +zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname, + boolean_t checkunusable) +{ + nvlist_t *detector, *fault; + boolean_t serialize; + + /* + * Construct the detector from the case data. The detector is in the + * ZFS scheme, and is either the pool or the vdev, depending on whether + * this is a vdev or pool fault. + */ + if (nvlist_alloc(&detector, NV_UNIQUE_NAME, 0) != 0) + return; + + if (nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0) != 0 || + nvlist_add_string(detector, FM_FMRI_SCHEME, + FM_FMRI_SCHEME_ZFS) != 0 || + nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, + zcp->zc_data.zc_pool_guid) != 0 || + (zcp->zc_data.zc_vdev_guid != 0 && + nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, + zcp->zc_data.zc_vdev_guid) != 0)) { + nvlist_free(detector); + return; + } + + /* + * We also want to make sure that the detector (pool or vdev) properly + * reflects the diagnosed state, when the fault corresponds to internal + * ZFS state (i.e. not checksum or I/O error-induced). Otherwise, a + * device which was unavailable early in boot (because the driver/file + * wasn't available) and is now healthy will be mis-diagnosed. + */ + if (!fmd_nvl_fmri_present(hdl, detector) || + (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) { + fmd_case_close(hdl, zcp->zc_case); + nvlist_free(detector); + return; + } + + fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, NULL, + detector); + fmd_case_add_suspect(hdl, zcp->zc_case, fault); + fmd_case_solve(hdl, zcp->zc_case); + + serialize = B_FALSE; + if (zcp->zc_data.zc_has_timer) { + fmd_timer_remove(hdl, zcp->zc_timer); + zcp->zc_data.zc_has_timer = 0; + serialize = B_TRUE; + } + if (zcp->zc_data.zc_has_serd_timer) { + fmd_timer_remove(hdl, zcp->zc_serd_timer); + zcp->zc_data.zc_has_serd_timer = 0; + serialize = B_TRUE; + } + if (serialize) + zfs_case_serialize(hdl, zcp); + + nvlist_free(detector); +} + +/* + * Main fmd entry point. + */ +/*ARGSUSED*/ +static void +zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { zfs_case_t *zcp; int32_t pool_state; uint64_t ena, pool_guid, vdev_guid; nvlist_t *detector; boolean_t isresource; + const char *serd; isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); if (isresource) { /* - * For our faked-up 'ok' resource (see below), we have no normal - * payload members. 
+ * For resources, we don't have a normal payload. */ if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) @@ -124,23 +355,6 @@ zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } /* - * Without a retire agent, we subscribe to our own faults and just - * discard them. - */ - if (fmd_nvl_class_match(hdl, nvl, "fault.fs.zfs.*")) - return; - - /* - * Ignore all block level (.io and .checksum) errors not associated with - * a pool open. We should really update a bean counter, and eventually - * do some real predictive analysis based on these faults. - */ - if ((fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.io") || - fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.checksum")) && - pool_state == SPA_LOAD_NONE) - return; - - /* * We also ignore all ereports generated during an import of a pool, * since the only possible fault (.pool) would result in import failure, * and hence no persistent fault. Some day we may want to do something @@ -163,23 +377,13 @@ zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) * - An error occurred for a device which already has an open * case. */ - if (!isresource) { - (void) nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena); - (void) nvlist_lookup_uint64(nvl, - FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); - if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) - (void) nvlist_lookup_uint64(nvl, - FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid); - else - vdev_guid = 0; - } else { - (void) nvlist_lookup_uint64(nvl, - FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); - if (nvlist_lookup_uint64(nvl, - FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) - vdev_guid = 0; + (void) nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) ena = 0; - } for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) { @@ -206,7 +410,7 @@ zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (zcp == NULL) { fmd_case_t *cs; - zfs_case_data_t data; + zfs_case_data_t data = { 0 }; /* * If this is one of our 'fake' resource ereports, and there is @@ -228,11 +432,10 @@ zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); - data.zc_version = CASE_DATA_VERSION; + data.zc_version = CASE_DATA_VERSION_SERD; data.zc_ena = ena; data.zc_pool_guid = pool_guid; data.zc_vdev_guid = vdev_guid; - data.zc_has_timer = 0; data.zc_pool_state = (int)pool_state; fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); @@ -241,14 +444,48 @@ zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) assert(zcp != NULL); } - /* - * The 'resource.fs.zfs.ok' event is a special internal-only event that - * signifies that a pool or device that was previously faulted has now - * come online (as detected by ZFS). This allows us to close the - * associated case. - */ if (isresource) { - fmd_case_close(hdl, zcp->zc_case); + if (fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.ok")) { + /* + * The 'resource.fs.zfs.ok' event is a special + * internal-only event that signifies that a pool or + * device that was previously faulted has now come + * online (as detected by ZFS). This allows us to close + * the associated case. 
+ */ + fmd_case_close(hdl, zcp->zc_case); + } else if (fmd_nvl_class_match(hdl, nvl, + "resource.fs.zfs.autoreplace")) { + /* + * The 'resource.fs.zfs.autoreplace' event indicates + * that the pool was loaded with the 'autoreplace' + * property set. In this case, any pending device + * failures should be ignored, as the asynchronous + * autoreplace handling will take care of them. + */ + fmd_case_close(hdl, zcp->zc_case); + } else { + /* + * The 'resource.fs.zfs.removed' event indicates that + * device removal was detected, and the device was + * closed asynchronously. If this is the case, we + * assume that any recent I/O errors were due to the + * device removal, not any fault of the device itself. + * We reset the SERD engine, and cancel any pending + * timers. + */ + if (zcp->zc_data.zc_has_serd_timer) { + fmd_timer_remove(hdl, zcp->zc_serd_timer); + zcp->zc_data.zc_has_serd_timer = 0; + zfs_case_serialize(hdl, zcp); + } + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_checksum); + } return; } @@ -281,89 +518,158 @@ zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) /* * Pool level fault. */ - nvlist_t *fault; - - fault = fmd_nvl_create_fault(hdl, "fault.fs.zfs.pool", - 100, detector, NULL, detector); - fmd_case_add_suspect(hdl, zcp->zc_case, fault); - fmd_case_solve(hdl, zcp->zc_case); - - if (zcp->zc_data.zc_has_timer) { - fmd_timer_remove(hdl, zcp->zc_timer); - zcp->zc_data.zc_has_timer = 0; - zfs_case_serialize(hdl, zcp); - } - + zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE); } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*") && pool_state == SPA_LOAD_NONE) { /* - * Device fault. + * Device fault. Before solving the case, determine if the + * device failed during open, and the 'autoreplace' property is + * set. If this is the case, then we post a sysevent which is + * picked up by the syseventd module, and any processing is done + * as needed. */ - nvlist_t *fault; - - fault = fmd_nvl_create_fault(hdl, "fault.fs.zfs.device", - 100, detector, NULL, detector); - fmd_case_add_suspect(hdl, zcp->zc_case, fault); - fmd_case_solve(hdl, zcp->zc_case); + zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE); + } else { + if (pool_state == SPA_LOAD_OPEN) { + /* + * Error incurred during a pool open. Reset the timer + * associated with this case. + */ + if (zcp->zc_data.zc_has_timer) + fmd_timer_remove(hdl, zcp->zc_timer); + zcp->zc_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_case_timeout); + if (!zcp->zc_data.zc_has_timer) { + zcp->zc_data.zc_has_timer = 1; + zfs_case_serialize(hdl, zcp); + } + } - if (zcp->zc_data.zc_has_timer) { - fmd_timer_remove(hdl, zcp->zc_timer); - zcp->zc_data.zc_has_timer = 0; - zfs_case_serialize(hdl, zcp); + /* + * If this is a checksum or I/O error, then toss it into the + * appropriate SERD engine and check to see if it has fired. + * Ideally, we want to do something more sophisticated, + * (persistent errors for a single data block, etc). For now, + * a single SERD engine is sufficient. 
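The SERD engines reset in the removal path above are looked up by name, and the names are the ones zfs_serd_name() builds earlier in this module: "zfs_" followed by the pool and vdev GUIDs in hex and the engine type. A small stand-alone illustration (not part of the patch; MAX_SERDLEN is redefined locally as a stand-in and the GUIDs are invented):

#include <stdio.h>

#define MAX_SERDLEN 64  /* stand-in for the module's own definition */

static void
serd_name_demo(char *buf, unsigned long long pool_guid,
    unsigned long long vdev_guid, const char *type)
{
    /* same format string as zfs_serd_name() above */
    (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
        pool_guid, vdev_guid, type);
}

int
main(void)
{
    char name[MAX_SERDLEN];

    serd_name_demo(name, 0xdeadbeefULL, 0xcafeULL, "io");
    (void) printf("%s\n", name);    /* prints zfs_deadbeef_cafe_io */
    return (0);
}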
+ */ + serd = NULL; + if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.io")) { + if (zcp->zc_data.zc_serd_io[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_io, + pool_guid, vdev_guid, "io"); + fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, + fmd_prop_get_int32(hdl, "io_N"), + fmd_prop_get_int64(hdl, "io_T")); + zfs_case_serialize(hdl, zcp); + } + serd = zcp->zc_data.zc_serd_io; + } else if (fmd_nvl_class_match(hdl, nvl, + "ereport.fs.zfs.checksum")) { + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_checksum, + pool_guid, vdev_guid, "checksum"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_checksum, + fmd_prop_get_int32(hdl, "checksum_N"), + fmd_prop_get_int64(hdl, "checksum_T")); + zfs_case_serialize(hdl, zcp); + } + serd = zcp->zc_data.zc_serd_checksum; } - } else if (pool_state == SPA_LOAD_OPEN) { /* - * Error incurred during a pool open. Reset the timer - * associated with this case. + * Because I/O errors may be due to device removal, we postpone + * any diagnosis until we're sure that we aren't about to + * receive a 'resource.fs.zfs.removed' event. */ - if (zcp->zc_data.zc_has_timer) - fmd_timer_remove(hdl, zcp->zc_timer); - zcp->zc_timer = fmd_timer_install(hdl, zcp, NULL, - zfs_case_timeout); - if (!zcp->zc_data.zc_has_timer) { - zcp->zc_data.zc_has_timer = 1; - zfs_case_serialize(hdl, zcp); + if (serd && fmd_serd_record(hdl, serd, ep)) { + if (zcp->zc_data.zc_has_serd_timer) + fmd_timer_remove(hdl, zcp->zc_serd_timer); + zcp->zc_serd_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_serd_timeout); + if (!zcp->zc_data.zc_has_serd_timer) { + zcp->zc_data.zc_has_serd_timer = 1; + zfs_case_serialize(hdl, zcp); + } } } } /* - * Timeout - indicates that a pool had faults, but was eventually opened - * successfully. + * Timeout indicates one of two scenarios: + * + * - The pool had faults but was eventually opened successfully. + * + * - We diagnosed an I/O error, and it was not due to device removal (which + * would cause the timeout to be cancelled). */ /* ARGSUSED */ static void -zfs_timeout(fmd_hdl_t *hdl, id_t id, void *data) +zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) { zfs_case_t *zcp = data; + const char *faultname; - zcp->zc_data.zc_has_timer = 0; + if (id == zcp->zc_timer) { + zcp->zc_data.zc_has_timer = 0; + fmd_case_close(hdl, zcp->zc_case); + } - fmd_case_close(hdl, zcp->zc_case); + if (id == zcp->zc_serd_timer) { + if (zcp->zc_data.zc_serd_io[0] != '\0' && + fmd_serd_fired(hdl, zcp->zc_data.zc_serd_io)) { + faultname = "fault.fs.zfs.vdev.io"; + } else { + assert(fmd_serd_fired(hdl, + zcp->zc_data.zc_serd_checksum)); + faultname = "fault.fs.zfs.vdev.checksum"; + } + zfs_case_solve(hdl, zcp, faultname, B_FALSE); + } } static void -zfs_close(fmd_hdl_t *hdl, fmd_case_t *cs) +zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) { zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); if (zcp->zc_data.zc_has_timer) fmd_timer_remove(hdl, zcp->zc_timer); + if (zcp->zc_data.zc_has_serd_timer) + fmd_timer_remove(hdl, zcp->zc_serd_timer); uu_list_remove(zfs_cases, zcp); fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); } +/* + * We use the fmd gc entry point to look for old cases that no longer apply. + * This allows us to keep our set of case data small in a long running system. 
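The two SERD engines give the diagnosis a simple "N errors within time T" memory, with N and T taken from the io_N/io_T and checksum_N/checksum_T properties defined further down in this file. The real engines are owned by fmd and driven through fmd_serd_create(), fmd_serd_record() and fmd_serd_fired(); the fragment below is only a conceptual sketch of the threshold itself, using the default values of 10 errors in 10 minutes.

#define SERD_N  10                              /* io_N / checksum_N default */
#define SERD_T  (10ULL * 60 * 1000000000ULL)    /* io_T / checksum_T default, in ns */

typedef struct serd_demo {
    unsigned long long sd_times[SERD_N];    /* timestamps of the last N events */
    unsigned int sd_count;                  /* events recorded so far */
} serd_demo_t;                                  /* zero before first use */

/* Record one event; return 1 once N events have landed within the window T. */
static int
serd_demo_record(serd_demo_t *sd, unsigned long long now_ns)
{
    unsigned long long oldest;

    sd->sd_times[sd->sd_count++ % SERD_N] = now_ns;

    if (sd->sd_count < SERD_N)
        return (0);

    /* the slot that will be overwritten next holds the oldest of the last N */
    oldest = sd->sd_times[sd->sd_count % SERD_N];
    return (now_ns - oldest <= SERD_T);
}

Even after the engine fires, the case is not solved immediately: the serd timer installed above delays the diagnosis so that a 'resource.fs.zfs.removed' event arriving shortly afterwards can cancel it.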
+ */ +static void +zfs_fm_gc(fmd_hdl_t *hdl) +{ + zfs_purge_cases(hdl); +} + static const fmd_hdl_ops_t fmd_ops = { - zfs_recv, /* fmdo_recv */ - zfs_timeout, /* fmdo_timeout */ - zfs_close, /* fmdo_close */ + zfs_fm_recv, /* fmdo_recv */ + zfs_fm_timeout, /* fmdo_timeout */ + zfs_fm_close, /* fmdo_close */ NULL, /* fmdo_stats */ - NULL, /* fmdo_gc */ + zfs_fm_gc, /* fmdo_gc */ }; static const fmd_prop_t fmd_props[] = { - { "case_timeout", FMD_TYPE_UINT32, "5" }, + { "case_timeout", FMD_TYPE_TIME, "5sec" }, + { "checksum_N", FMD_TYPE_UINT32, "10" }, + { "checksum_T", FMD_TYPE_TIME, "10min" }, + { "io_N", FMD_TYPE_UINT32, "10" }, + { "io_T", FMD_TYPE_TIME, "10min" }, + { "serd_timeout", FMD_TYPE_TIME, "5sec" }, { NULL, 0, NULL } }; @@ -375,23 +681,33 @@ void _fmd_init(fmd_hdl_t *hdl) { fmd_case_t *cp; + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), - NULL, 0)) == NULL) + NULL, 0)) == NULL) { + libzfs_fini(zhdl); return; + } if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, 0)) == NULL) { uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); return; } if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { uu_list_destroy(zfs_cases); uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); return; } + fmd_hdl_setspecific(hdl, zhdl); + /* * Iterate over all active cases and unserialize the associated buffers, * adding them to our list of open cases. @@ -400,7 +716,13 @@ _fmd_init(fmd_hdl_t *hdl) cp != NULL; cp = fmd_case_next(hdl, cp)) (void) zfs_case_unserialize(hdl, cp); - zfs_case_timeout = fmd_prop_get_int32(hdl, "case_timeout") * NANOSEC; + /* + * Clear out any old cases that are no longer valid. + */ + zfs_purge_cases(hdl); + + zfs_case_timeout = fmd_prop_get_int64(hdl, "case_timeout"); + zfs_serd_timeout = fmd_prop_get_int64(hdl, "serd_timeout"); } void @@ -408,6 +730,7 @@ _fmd_fini(fmd_hdl_t *hdl) { zfs_case_t *zcp; uu_list_walk_t *walk; + libzfs_handle_t *zhdl; /* * Remove all active cases. @@ -421,4 +744,7 @@ _fmd_fini(fmd_hdl_t *hdl) uu_list_destroy(zfs_cases); uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + libzfs_fini(zhdl); } diff --git a/usr/src/cmd/fm/modules/common/zfs-retire/zfs-retire.conf b/usr/src/cmd/fm/modules/common/zfs-retire/zfs-retire.conf index f506384bff..62fc163a7d 100644 --- a/usr/src/cmd/fm/modules/common/zfs-retire/zfs-retire.conf +++ b/usr/src/cmd/fm/modules/common/zfs-retire/zfs-retire.conf @@ -19,11 +19,13 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" # # fmd configuration file for the zfs retire agent. # -subscribe fault.fs.zfs.device +subscribe fault.fs.zfs.* +subscribe resource.fs.zfs.removed +subscribe list.repaired diff --git a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c index 962b37bb82..a42f6ed5a0 100644 --- a/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c +++ b/usr/src/cmd/fm/modules/common/zfs-retire/zfs_retire.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,9 +27,12 @@ /* * The ZFS retire agent is responsible for managing hot spares across all pools. 
- * When we see a device fault, we try to open the associated pool and look for - * any hot spares. We iterate over any available hot spares and attempt a - * 'zpool replace' for each one. + * When we see a device fault or a device removal, we try to open the associated + * pool and look for any hot spares. We iterate over any available hot spares + * and attempt a 'zpool replace' for each one. + * + * For vdevs diagnosed as faulty, the agent is also responsible for proactively + * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). */ #include <fm/fmd_api.h> @@ -37,6 +40,7 @@ #include <sys/fm/protocol.h> #include <sys/fm/fs/zfs.h> #include <libzfs.h> +#include <string.h> /* * Find a pool with a matching GUID. @@ -87,104 +91,210 @@ find_vdev(nvlist_t *nv, uint64_t search) return (NULL); } +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + find_cbdata_t cb; + zpool_handle_t *zhp; + nvlist_t *config, *nvroot; + + /* + * Find the corresponding pool and make sure the vdev still exists. + */ + cb.cb_guid = pool_guid; + if (zpool_iter(zhdl, find_pool, &cb) != 1) + return (NULL); + + zhp = cb.cb_zhp; + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (NULL); + } + + if ((*vdevp = find_vdev(nvroot, vdev_guid)) == NULL) { + zpool_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Given a vdev, attempt to replace it with every known spare until one + * succeeds. + */ +static void +replace_with_spare(zpool_handle_t *zhp, nvlist_t *vdev) +{ + nvlist_t *config, *nvroot, *replacement; + nvlist_t **spares; + uint_t s, nspares; + char *dev_name; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) + return; + + /* + * Find out if there are any hot spares available in the pool. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return; + + if (nvlist_alloc(&replacement, NV_UNIQUE_NAME, 0) != 0) + return; + + if (nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0) { + nvlist_free(replacement); + return; + } + + dev_name = zpool_vdev_name(NULL, zhp, vdev); + + /* + * Try to replace each spare, ending when we successfully + * replace it. 
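The nvlist that replace_with_spare() above hands to zpool_vdev_attach() is deliberately minimal: a root vdev whose only child is the existing config nvlist of the spare being tried, attached with the final 'replacing' argument set to B_TRUE. A hedged stand-alone sketch of that construction (not part of the patch; the string literals are assumed to be the values behind ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT and ZPOOL_CONFIG_CHILDREN):

#include <libnvpair.h>

static nvlist_t *
build_replacement_demo(nvlist_t *spare)
{
    nvlist_t *replacement;

    if (nvlist_alloc(&replacement, NV_UNIQUE_NAME, 0) != 0)
        return (NULL);

    /* a root vdev wrapping a single child: the spare's own config nvlist */
    if (nvlist_add_string(replacement, "type", "root") != 0 ||
        nvlist_add_nvlist_array(replacement, "children", &spare, 1) != 0) {
        nvlist_free(replacement);
        return (NULL);
    }

    return (replacement);
}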
+ */ + for (s = 0; s < nspares; s++) { + char *spare_name; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &spare_name) != 0) + continue; + + if (nvlist_add_nvlist_array(replacement, + ZPOOL_CONFIG_CHILDREN, &spares[s], 1) != 0) + continue; + + if (zpool_vdev_attach(zhp, dev_name, spare_name, + replacement, B_TRUE) == 0) + break; + } + + free(dev_name); + nvlist_free(replacement); +} + /*ARGSUSED*/ static void zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { uint64_t pool_guid, vdev_guid; - char *dev_name; zpool_handle_t *zhp; - nvlist_t *resource, *config, *nvroot; - nvlist_t *vdev; - nvlist_t **spares, **faults; - uint_t s, nspares, f, nfaults; - nvlist_t *replacement; - find_cbdata_t cb; + nvlist_t *resource, *fault; + nvlist_t **faults; + uint_t f, nfaults; libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + boolean_t fault_device, degrade_device; + boolean_t is_repair; + char *scheme; + nvlist_t *vdev; + + /* + * If this is a resource notifying us of device removal, then simply + * check for an available spare and continue. + */ + if (strcmp(class, "resource.fs.zfs.removed") == 0) { + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + return; + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + return; + + if (fmd_prop_get_int32(hdl, "spare_on_remove")) + replace_with_spare(zhp, vdev); + zpool_close(zhp); + return; + } + + if (strcmp(class, "list.repaired") == 0) + is_repair = B_TRUE; + else + is_repair = B_FALSE; /* - * Get information from the fault. + * We subscribe to zfs faults as well as all repair events. */ if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &faults, &nfaults) != 0) return; for (f = 0; f < nfaults; f++) { - if (nvlist_lookup_nvlist(faults[f], FM_FAULT_RESOURCE, - &resource) != 0 || - nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, - &pool_guid) != 0 || - nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, - &vdev_guid) != 0) - continue; + fault = faults[f]; + + fault_device = B_FALSE; + degrade_device = B_FALSE; /* - * From the pool guid and vdev guid, get the pool name and - * device name. + * While we subscribe to fault.fs.zfs.*, we only take action + * for faults targeting a specific vdev (open failure or SERD + * failure). */ - cb.cb_guid = pool_guid; - if (zpool_iter(zhdl, find_pool, &cb) != 1) + if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) + fault_device = B_TRUE; + else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.checksum")) + degrade_device = B_TRUE; + else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.device")) + fault_device = B_FALSE; + else continue; - zhp = cb.cb_zhp; - config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) != 0) { - zpool_close(zhp); + if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, + &resource) != 0 || + nvlist_lookup_string(resource, FM_FMRI_SCHEME, + &scheme) != 0) continue; - } - if ((vdev = find_vdev(nvroot, vdev_guid)) == NULL) { - zpool_close(zhp); + if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) continue; - } - /* - * Find out if there are any hot spares available in the pool. 
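zfs_retire_recv() above now acts on three kinds of input: solved ZFS faults, the 'resource.fs.zfs.removed' resource, and 'list.repaired' events. For the fault and repair cases the per-vdev action reduces to one of the new libzfs calls added to libzfs.h by this change (zpool_vdev_fault(), zpool_vdev_degrade(), zpool_vdev_clear()). A condensed, illustrative helper, not part of the patch:

#include <string.h>
#include <libzfs.h>

static void
retire_action_demo(zpool_handle_t *zhp, uint64_t vdev_guid, const char *class,
    boolean_t is_repair)
{
    if (is_repair) {
        /* list.repaired: mark the vdev repaired */
        (void) zpool_vdev_clear(zhp, vdev_guid);
    } else if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) {
        /* persistent I/O errors: fault the vdev */
        (void) zpool_vdev_fault(zhp, vdev_guid);
    } else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) {
        /* persistent checksum errors: degrade, but keep using the vdev */
        (void) zpool_vdev_degrade(zhp, vdev_guid);
    }
    /* fault.fs.zfs.device: no state change, only hot spare substitution */
}

In every non-repair case the agent then calls replace_with_spare(), so a hot spare can take over regardless of which fault was diagnosed.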
- */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) { - zpool_close(zhp); + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, + &pool_guid) != 0 || + nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, + &vdev_guid) != 0) continue; - } - if (nvlist_alloc(&replacement, NV_UNIQUE_NAME, 0) != 0) { - zpool_close(zhp); + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) continue; - } - if (nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) != 0) { - nvlist_free(replacement); + /* + * If this is a repair event, then mark the vdev as repaired and + * continue. + */ + if (is_repair) { + (void) zpool_vdev_clear(zhp, vdev_guid); zpool_close(zhp); continue; } - dev_name = zpool_vdev_name(zhdl, zhp, vdev); - /* - * Try to replace each spare, ending when we successfully - * replace it. + * Actively fault the device if needed. */ - for (s = 0; s < nspares; s++) { - char *spare_name; - - if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, - &spare_name) != 0) - continue; + if (fault_device) + (void) zpool_vdev_fault(zhp, vdev_guid); + if (degrade_device) + (void) zpool_vdev_degrade(zhp, vdev_guid); - if (nvlist_add_nvlist_array(replacement, - ZPOOL_CONFIG_CHILDREN, &spares[s], 1) != 0) - continue; - - if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE) == 0) - break; - } - - free(dev_name); - nvlist_free(replacement); + /* + * Attempt to substitute a hot spare. + */ + replace_with_spare(zhp, vdev); zpool_close(zhp); } } @@ -198,6 +308,7 @@ static const fmd_hdl_ops_t fmd_ops = { }; static const fmd_prop_t fmd_props[] = { + { "spare_on_remove", FMD_TYPE_BOOL, "true" }, { NULL, 0, NULL } }; diff --git a/usr/src/cmd/lofiadm/main.c b/usr/src/cmd/lofiadm/main.c index fa1774c52a..9c0f6577b7 100644 --- a/usr/src/cmd/lofiadm/main.c +++ b/usr/src/cmd/lofiadm/main.c @@ -19,9 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * + */ + +/* * lofiadm - administer lofi(7d). Very simple, add and remove file<->device * associations, and display status. All the ioctls are private between * lofi and lofiadm, and so are very simple - device information is @@ -222,10 +224,12 @@ add_mapping(int lfd, const char *devicename, const char *filename) * filename otherwise. 
*/ static void -delete_mapping(int lfd, const char *devicename, const char *filename) +delete_mapping(int lfd, const char *devicename, const char *filename, + boolean_t force) { struct lofi_ioctl li; + li.li_force = force; if (devicename == NULL) { /* delete by filename */ (void) strcpy(li.li_filename, filename); @@ -286,13 +290,14 @@ main(int argc, char *argv[]) int minor; int fd = -1; static char *lofictl = "/dev/" LOFI_CTL_NAME; + boolean_t force = B_FALSE; pname = getpname(argv[0]); (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); - while ((c = getopt(argc, argv, "a:d:")) != EOF) { + while ((c = getopt(argc, argv, "a:d:f")) != EOF) { switch (c) { case 'a': addflag = 1; @@ -334,6 +339,9 @@ main(int argc, char *argv[]) else filename = optarg; break; + case 'f': + force = B_TRUE; + break; case '?': default: errflag = 1; @@ -389,7 +397,7 @@ main(int argc, char *argv[]) if (addflag) add_mapping(lfd, devicename, filename); else if (deleteflag) - delete_mapping(lfd, devicename, filename); + delete_mapping(lfd, devicename, filename, force); else if (filename || devicename) print_one_mapping(lfd, devicename, filename); else diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index e0eac94d19..11e8826e84 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -1117,6 +1117,12 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int stats, case VDEV_STATE_HEALTHY: state = "HEALTHY"; break; + case VDEV_STATE_REMOVED: + state = "REMOVED"; + break; + case VDEV_STATE_FAULTED: + state = "FAULTED"; + break; default: state = "UNKNOWN"; break; diff --git a/usr/src/cmd/syseventd/modules/Makefile b/usr/src/cmd/syseventd/modules/Makefile index ac87308c03..f54659eab3 100644 --- a/usr/src/cmd/syseventd/modules/Makefile +++ b/usr/src/cmd/syseventd/modules/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -22,8 +21,8 @@ # #ident "%Z%%M% %I% %E% SMI" # -# Copyright (c) 1998-2001 by Sun Microsystems, Inc. -# All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. # # cmd/syseventd/modules/Makefile # @@ -33,7 +32,8 @@ SUBDIRS= \ devfsadmd_mod \ sysevent_conf_mod \ - sysevent_reg_mod + sysevent_reg_mod \ + zfs_mod all:= TARGET= all install:= TARGET= install diff --git a/usr/src/cmd/syseventd/modules/zfs_mod/Makefile b/usr/src/cmd/syseventd/modules/zfs_mod/Makefile new file mode 100644 index 0000000000..82616a7783 --- /dev/null +++ b/usr/src/cmd/syseventd/modules/zfs_mod/Makefile @@ -0,0 +1,42 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +LIBRARY= zfs_mod + +include ../Makefile.com + +LDLIBS += -lzfs -ldevid + +.KEEP_STATE: + +all: $(DYNLIB) + +install: all \ + $(ROOTLIBSYSEVENTDIR) \ + $(ROOTLIBDIR) \ + $(ROOTLIBS) + +include ../Makefile.targ diff --git a/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c b/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c new file mode 100644 index 0000000000..105a2ffced --- /dev/null +++ b/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c @@ -0,0 +1,523 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS syseventd module. + * + * The purpose of this module is to identify when devices are added to the + * system, and appropriately online or replace the affected vdevs. + * + * When a device is added to the system: + * + * 1. Search for any vdevs whose devid matches that of the newly added + * device. + * + * 2. If no vdevs are found, then search for any vdevs whose devfs path + * matches that of the new device. + * + * 3. If no vdevs match by either method, then ignore the event. + * + * 4. Attempt to online the device with a flag to indicate that it should + * be unspared when resilvering completes. If this succeeds, then the + * same device was inserted and we should continue normally. + * + * 5. If the pool does not have the 'autoreplace' property set, attempt to + * online the device again without the unspare flag, which will + * generate a FMA fault. + * + * 6. If the pool has the 'autoreplace' property set, and the matching vdev + * is a whole disk, then label the new disk and attempt a 'zpool + * replace'. + * + * The module responds to EC_DEV_ADD events for both disks and lofi devices, + * with the latter used for testing. The special ESC_ZFS_VDEV_CHECK event + * indicates that a device failed to open during pool load, but the autoreplace + * property was set. In this case, we deferred the associated FMA fault until + * our module had a chance to process the autoreplace logic. If the device + * could not be replaced, then the second online attempt will trigger the FMA + * fault that we skipped earlier. 
+ */ + +#include <alloca.h> +#include <devid.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libsysevent.h> +#include <libzfs.h> +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <sys/sunddi.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <unistd.h> + +#if defined(__i386) || defined(__amd64) +#define PHYS_PATH ":q" +#define RAW_SLICE "p0" +#elif defined(__sparc) +#define PHYS_PATH ":c" +#define RAW_SLICE "s2" +#else +#error Unknown architecture +#endif + +typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); + +libzfs_handle_t *g_zfshdl; + +/* + * The device associated with the given vdev (either by devid or physical path) + * has been added to the system. If 'isdisk' is set, then we only attempt a + * replacement if it's a whole disk. This also implies that we should label the + * disk first. + * + * First, we attempt to online the device (making sure to undo any spare + * operation when finished). If this succeeds, then we're done. If it fails, + * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, + * but that the label was not what we expected. If the 'autoreplace' property + * is not set, then we relabel the disk (if specified), and attempt a 'zpool + * replace'. If the online is successful, but the new state is something else + * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of + * race, and we should avoid attempting to relabel the disk. + */ +static void +zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk) +{ + char *path; + vdev_state_t newstate; + nvlist_t *nvroot, *newvd; + uint64_t wholedisk = 0ULL; + char *devid = NULL; + char rawpath[PATH_MAX], fullpath[PATH_MAX]; + size_t len; + + if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) + return; + + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_DEVID, &devid); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + + /* + * We should have a way to online a device by guid. With the current + * interface, we are forced to chop off the 's0' for whole disks. + */ + (void) strlcpy(fullpath, path, sizeof (fullpath)); + if (wholedisk) + fullpath[strlen(fullpath) - 2] = '\0'; + + /* + * Attempt to online the device. It would be nice to online this by + * GUID, but the current interface only supports lookup by path. + */ + if (zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && + newstate != VDEV_STATE_CANT_OPEN) + return; + + /* + * If the pool doesn't have the autoreplace property set, then attempt a + * true online (without the unspare flag), which will trigger a FMA + * fault. + */ + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE) || + (isdisk && !wholedisk)) { + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + return; + } + + if (isdisk) { + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. Before we can label the disk, we need + * access to a raw node. Ideally, we'd like to walk the devinfo + * tree and find a raw node from the corresponding parent node. + * This is overly complicated, and since we know how we labeled + * this device in the first place, we know it's save to switch + * from /dev/dsk to /dev/rdsk and append the backup slice. 
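The string handling that follows in zfs_process_add() is easier to see with a concrete, hypothetical whole-disk path: the online call wants the /dev/dsk name with the slice chopped off, while zpool_label_disk() wants a bare device name with neither the /dev/dsk/ prefix nor the slice. A small stand-alone illustration, not part of the patch:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *path = "/dev/dsk/c1t2d0s0";     /* hypothetical vdev path */
    char fullpath[64], rawpath[64];

    /* online by path: chop the trailing "s0" for a whole disk */
    (void) snprintf(fullpath, sizeof (fullpath), "%s", path);
    fullpath[strlen(fullpath) - 2] = '\0';

    /* labeling: strip "/dev/dsk/" and the slice for zpool_label_disk() */
    (void) snprintf(rawpath, sizeof (rawpath), "%s", path + 9);
    rawpath[strlen(rawpath) - 2] = '\0';

    /* prints: /dev/dsk/c1t2d0s0 -> /dev/dsk/c1t2d0, c1t2d0 */
    (void) printf("%s -> %s, %s\n", path, fullpath, rawpath);
    return (0);
}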
+ */ + if (strncmp(path, "/dev/dsk/", 9) != 0) + return; + + (void) strlcpy(rawpath, path + 9, sizeof (rawpath)); + len = strlen(rawpath); + rawpath[len - 2] = '\0'; + + if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) + return; + } + + /* + * Cosntruct the root vdev to pass to zpool_vdev_attach(). While adding + * the entire vdev structure is harmless, we construct a reduced set of + * path/devid/wholedisk to keep it simple. + */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) + return; + + if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { + nvlist_free(nvroot); + return; + } + + if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || + (devid && nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, + devid) != 0) || + nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || + nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, + 1) != 0) { + nvlist_free(newvd); + nvlist_free(nvroot); + return; + } + + nvlist_free(newvd); + + (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); + + nvlist_free(nvroot); + +} + +/* + * Utility functions to find a vdev matching given criteria. + */ +typedef struct dev_data { + const char *dd_compare; + const char *dd_prop; + zfs_process_func_t dd_func; + boolean_t dd_found; + boolean_t dd_isdisk; + uint64_t dd_pool_guid; + uint64_t dd_vdev_guid; +} dev_data_t; + +static void +zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) +{ + dev_data_t *dp = data; + char *path; + uint_t c, children; + nvlist_t **child; + size_t len = strlen(dp->dd_compare); + uint64_t wholedisk = 0ULL; + uint64_t guid; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + return; + } + + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + + if (dp->dd_vdev_guid != 0) { + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid != dp->dd_vdev_guid) + return; + } else { + if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || + strncmp(dp->dd_compare, path, len) != 0) + return; + + /* + * Normally, we want to have an exact match for the comparison + * string. However, we allow substring matches in the following + * cases: + * + * <path>: This is a devpath, and the target is one + * of its children. + * + * <path/> This is a devid for a whole disk, and + * the target is one of its children. + */ + if (path[len] != '\0' && path[len] != ':' && + path[len - 1] != '/') + return; + } + + (dp->dd_func)(zhp, nvl, dp->dd_isdisk); +} + +static int +zfs_iter_pool(zpool_handle_t *zhp, void *data) +{ + nvlist_t *config, *nvl; + dev_data_t *dp = data; + uint64_t pool_guid; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (dp->dd_pool_guid == 0 || + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { + (void) nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvl); + zfs_iter_vdev(zhp, nvl, data); + } + } + + zpool_close(zhp); + return (0); +} + +/* + * Given a physical device path, iterate over all (pool, vdev) pairs which + * correspond to the given path. 
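zfs_iter_vdev() above matches a vdev either exactly or by prefix, and a prefix is accepted only when the next character in the stored value is ':' (a devfs path followed by a minor name) or when the comparison string itself ends in '/' (a whole-disk devid followed by a minor name). A stand-alone restatement of that test with invented example strings (not part of the patch):

#include <stdio.h>
#include <string.h>

/* Same accept/reject rule as zfs_iter_vdev() above. */
static int
vdev_match_demo(const char *compare, const char *value)
{
    size_t len = strlen(compare);

    if (strncmp(compare, value, len) != 0)
        return (0);
    if (value[len] != '\0' && value[len] != ':' && value[len - 1] != '/')
        return (0);
    return (1);
}

int
main(void)
{
    /* 1: physical path plus a minor name */
    (void) printf("%d\n", vdev_match_demo("/pci@0,0/disk@1,0",
        "/pci@0,0/disk@1,0:a"));
    /* 1: whole-disk devid prefix ending in '/' */
    (void) printf("%d\n", vdev_match_demo("id1,sd@EXAMPLE/",
        "id1,sd@EXAMPLE/a"));
    /* 0: prefix matches, but neither ':' follows nor does the prefix end in '/' */
    (void) printf("%d\n", vdev_match_demo("/pci@0,0/disk@1",
        "/pci@0,0/disk@1,0"));
    return (0);
}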
+ */ +static boolean_t +devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) +{ + dev_data_t data = { 0 }; + + data.dd_compare = devpath; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; + data.dd_found = B_FALSE; + data.dd_isdisk = wholedisk; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Given a /devices path, lookup the corresponding devid for each minor node, + * and find any vdevs with matching devids. Doing this straight up would be + * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of + * the fact that each devid ends with "/<minornode>". Once we find any valid + * minor node, we chop off the portion after the last slash, and then search for + * matching vdevs, which is O(vdevs in system). + */ +static boolean_t +devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) +{ + size_t len = strlen(devpath) + sizeof ("/devices") + + sizeof (PHYS_PATH) - 1; + char *fullpath; + int fd; + ddi_devid_t devid; + char *devidstr, *fulldevid; + dev_data_t data = { 0 }; + + /* + * Try to open a known minor node. + */ + fullpath = alloca(len); + (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH); + if ((fd = open(fullpath, O_RDONLY)) < 0) + return (B_FALSE); + + /* + * Determine the devid as a string, with no trailing slash for the minor + * node. + */ + if (devid_get(fd, &devid) != 0) { + (void) close(fd); + return (B_FALSE); + } + (void) close(fd); + + if ((devidstr = devid_str_encode(devid, NULL)) == NULL) { + devid_free(devid); + return (B_FALSE); + } + + len = strlen(devidstr) + 2; + fulldevid = alloca(len); + (void) snprintf(fulldevid, len, "%s/", devidstr); + + data.dd_compare = fulldevid; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_DEVID; + data.dd_found = B_FALSE; + data.dd_isdisk = wholedisk; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + devid_str_free(devidstr); + + return (data.dd_found); +} + +/* + * This function is called when we receive a devfs add event. This can be + * either a disk event or a lofi event, and the behavior is slightly different + * depending on which it is. + */ +static int +zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) +{ + char *devpath, *devname; + char path[PATH_MAX], realpath[PATH_MAX]; + char *colon, *raw; + int ret; + + /* + * The main unit of operation is the physical device path. For disks, + * this is the device node, as all minor nodes are affected. For lofi + * devices, this includes the minor path. Unfortunately, this isn't + * represented in the DEV_PHYS_PATH for various reasons. + */ + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0) + return (-1); + + /* + * If this is a lofi device, then also get the minor instance name. + * Unfortunately, the current payload doesn't include an easy way to get + * this information. So we cheat by resolving the 'dev_name' (which + * refers to the raw device) and taking the portion between ':(*),raw'. 
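The lofi handling described above needs the minor name because a lofi vdev's stored path includes it. The comment's recipe, applied to a hypothetical resolved raw-device path, looks like this (stand-alone illustration, not part of the patch; both path forms are invented):

#include <stdio.h>
#include <string.h>

int
main(void)
{
    char resolved[] = "/devices/pseudo/lofi@1:1,raw"; /* hypothetical resolvepath() result */
    const char *devpath = "/pseudo/lofi@1";           /* hypothetical DEV_PHYS_PATH value */
    char realpath[128];
    char *colon, *raw;

    colon = strchr(resolved, ':');
    raw = (colon != NULL) ? strstr(colon + 1, ",raw") : NULL;
    if (colon != NULL && raw != NULL) {
        *raw = '\0';    /* colon now points at ":1" */
        (void) snprintf(realpath, sizeof (realpath), "%s%s",
            devpath, colon);
        (void) printf("%s\n", realpath);    /* prints /pseudo/lofi@1:1 */
    }
    return (0);
}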
+ */ + (void) strlcpy(realpath, devpath, sizeof (realpath)); + if (is_lofi) { + if (nvlist_lookup_string(nvl, DEV_NAME, + &devname) == 0 && + (ret = resolvepath(devname, path, + sizeof (path))) > 0) { + path[ret] = '\0'; + colon = strchr(path, ':'); + if (colon != NULL) + raw = strstr(colon + 1, ",raw"); + if (colon != NULL && raw != NULL) { + *raw = '\0'; + (void) snprintf(realpath, + sizeof (realpath), "%s%s", + devpath, colon); + *raw = ','; + } + } + } + + /* + * Iterate over all vdevs with a matching devid, and then those with a + * matching /devices path. For disks, we only want to pay attention to + * vdevs marked as whole disks. For lofi, we don't care (because we're + * matching an exact minor name). + */ + if (!devid_iter(realpath, zfs_process_add, !is_lofi)) + (void) devpath_iter(realpath, zfs_process_add, !is_lofi); + + return (0); +} + +/* + * Called when we receive a VDEV_CHECK event, which indicates a device could not + * be opened during initial pool open, but the autoreplace property was set on + * the pool. In this case, we treat it as if it were an add event. + */ +static int +zfs_deliver_check(nvlist_t *nvl) +{ + dev_data_t data = { 0 }; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, + &data.dd_pool_guid) != 0 || + nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, + &data.dd_vdev_guid) != 0) + return (0); + + data.dd_isdisk = B_TRUE; + data.dd_func = zfs_process_add; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_deliver_event(sysevent_t *ev, int unused) +{ + const char *class = sysevent_get_class_name(ev); + const char *subclass = sysevent_get_subclass_name(ev); + nvlist_t *nvl; + int ret; + boolean_t is_lofi, is_check; + + if (strcmp(class, EC_DEV_ADD) == 0) { + /* + * We're mainly interested in disk additions, but we also listen + * for new lofi devices, to allow for simplified testing. + */ + if (strcmp(subclass, ESC_DISK) == 0) + is_lofi = B_FALSE; + else if (strcmp(subclass, ESC_LOFI) == 0) + is_lofi = B_TRUE; + else + return (0); + + is_check = B_FALSE; + } else if (strcmp(class, EC_ZFS) == 0 && + strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { + /* + * This event signifies that a device failed to open during pool + * load, but the 'autoreplace' property was set, so we should + * pretend it's just been added. 
+ */ + is_check = B_TRUE; + } else { + return (0); + } + + if (sysevent_get_attr_list(ev, &nvl) != 0) + return (-1); + + if (is_check) + ret = zfs_deliver_check(nvl); + else + ret = zfs_deliver_add(nvl, is_lofi); + + + nvlist_free(nvl); + return (ret); +} + +static struct slm_mod_ops zfs_mod_ops = { + SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event +}; + +struct slm_mod_ops * +slm_init() +{ + if ((g_zfshdl = libzfs_init()) == NULL) + return (NULL); + + return (&zfs_mod_ops); +} + +void +slm_fini() +{ +} diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 62c900d80c..dbc2ccca9e 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -870,9 +870,7 @@ const struct ioc { "zfs_cmd_t" }, { (uint_t)ZFS_IOC_VDEV_REMOVE, "ZFS_IOC_VDEV_REMOVE", "zfs_cmd_t" }, - { (uint_t)ZFS_IOC_VDEV_ONLINE, "ZFS_IOC_VDEV_ONLINE", - "zfs_cmd_t" }, - { (uint_t)ZFS_IOC_VDEV_OFFLINE, "ZFS_IOC_VDEV_OFFLINE", + { (uint_t)ZFS_IOC_VDEV_SET_STATE, "ZFS_IOC_VDEV_SET_STATE", "zfs_cmd_t" }, { (uint_t)ZFS_IOC_VDEV_ATTACH, "ZFS_IOC_VDEV_ATTACH", "zfs_cmd_t" }, diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 78fcbec4e3..107a011a1a 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -383,6 +383,10 @@ state_to_name(vdev_stat_t *vs) return (gettext("UNAVAIL")); case VDEV_STATE_OFFLINE: return (gettext("OFFLINE")); + case VDEV_STATE_REMOVED: + return (gettext("REMOVED")); + case VDEV_STATE_FAULTED: + return (gettext("FAULTED")); case VDEV_STATE_DEGRADED: return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: @@ -950,9 +954,10 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name); + (void) printf(" %s", state_to_name(vs)); if (vs->vs_aux != 0) { - (void) printf(" %-8s ", state_to_name(vs)); + (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: @@ -971,12 +976,14 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) (void) printf(gettext("newer version")); break; + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + default: (void) printf(gettext("corrupted data")); break; } - } else { - (void) printf(" %s", state_to_name(vs)); } (void) printf("\n"); @@ -1083,6 +1090,12 @@ show_import(nvlist_t *config) (void) printf(gettext("status: The pool was last accessed by " "another system.\n")); break; + case ZPOOL_STATUS_FAULTED_DEV_R: + case ZPOOL_STATUS_FAULTED_DEV_NR: + (void) printf(gettext("status: One or more devices are " + "faulted.\n")); + break; + default: /* * No other status can be seen when importing pools. @@ -2307,7 +2320,6 @@ zpool_do_detach(int argc, char **argv) /* * zpool online <pool> <device> ... 
*/ -/* ARGSUSED */ int zpool_do_online(int argc, char **argv) { @@ -2315,6 +2327,7 @@ zpool_do_online(int argc, char **argv) char *poolname; zpool_handle_t *zhp; int ret = 0; + vdev_state_t newstate; /* check options */ while ((c = getopt(argc, argv, "t")) != -1) { @@ -2345,12 +2358,25 @@ zpool_do_online(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - for (i = 1; i < argc; i++) - if (zpool_vdev_online(zhp, argv[i]) == 0) - (void) printf(gettext("Bringing device %s online\n"), - argv[i]); - else + for (i = 1; i < argc; i++) { + if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) { + if (newstate != VDEV_STATE_HEALTHY) { + (void) printf(gettext("warning: device '%s' " + "onlined, but remains in faulted state\n"), + argv[i]); + if (newstate == VDEV_STATE_FAULTED) + (void) printf(gettext("use 'zpool " + "clear' to restore a faulted " + "device\n")); + else + (void) printf(gettext("use 'zpool " + "replace' to replace devices " + "that are no longer present\n")); + } + } else { ret = 1; + } + } if (!ret) { zpool_log_history(g_zfs, argc + optind, argv - optind, poolname, @@ -2413,12 +2439,10 @@ zpool_do_offline(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - for (i = 1; i < argc; i++) - if (zpool_vdev_offline(zhp, argv[i], istmp) == 0) - (void) printf(gettext("Bringing device %s offline\n"), - argv[i]); - else + for (i = 1; i < argc; i++) { + if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; + } if (!ret) { zpool_log_history(g_zfs, argc + optind, argv - optind, poolname, @@ -2732,6 +2756,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, } break; + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -2968,6 +2996,26 @@ status_callback(zpool_handle_t *zhp, void *data) "backup.\n")); break; + case ZPOOL_STATUS_FAULTED_DEV_R: + (void) printf(gettext("status: One or more devices are " + "faulted in response to persistent errors.\n\tSufficient " + "replicas exist for the pool to continue functioning " + "in a\n\tdegraded state.\n")); + (void) printf(gettext("action: Replace the faulted device, " + "or use 'zpool clear' to mark the device\n\trepaired.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_NR: + (void) printf(gettext("status: One or more devices are " + "faulted in response to persistent errors. There are " + "insufficient replicas for the pool to\n\tcontinue " + "functioning.\n")); + (void) printf(gettext("action: Destroy and re-create the pool " + "from a backup source. Manually marking the device\n" + "\trepaired using 'zpool clear' may allow some data " + "to be recovered.\n")); + break; + default: /* * The remaining errors can't actually be generated, yet. 
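With the state changes above, 'zpool online' no longer prints a message for every device; it only warns when a device comes back in something other than a healthy state. A library consumer sees the same information through the new four-argument zpool_vdev_online() declared in libzfs.h by this change. A hedged usage sketch, not part of the patch, with error reporting elided:

#include <stdio.h>
#include <libzfs.h>

static void
online_demo(zpool_handle_t *zhp, const char *dev)
{
    vdev_state_t newstate;

    if (zpool_vdev_online(zhp, dev, 0, &newstate) != 0)
        return;

    if (newstate == VDEV_STATE_FAULTED)
        (void) printf("'%s' onlined, but still faulted; "
            "try 'zpool clear'\n", dev);
    else if (newstate != VDEV_STATE_HEALTHY)
        (void) printf("'%s' onlined, but not healthy; "
            "'zpool replace' may be needed\n", dev);
}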
@@ -3261,7 +3309,7 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); - (void) printf(gettext(" 6 bootfs pool property ")); + (void) printf(gettext(" 6 pool properties ")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index be5e3c96ab..6c07e628f8 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -435,82 +435,82 @@ process_options(int argc, char **argv) "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:z:h")) != EOF) { value = 0; switch (opt) { - case 'v': - case 's': - case 'a': - case 'm': - case 'r': - case 'R': - case 'd': - case 't': - case 'g': - case 'i': - case 'k': - case 'T': - case 'P': - case 'z': + case 'v': + case 's': + case 'a': + case 'm': + case 'r': + case 'R': + case 'd': + case 't': + case 'g': + case 'i': + case 'k': + case 'T': + case 'P': + case 'z': value = nicenumtoull(optarg); } switch (opt) { - case 'v': + case 'v': zopt_vdevs = value; break; - case 's': + case 's': zopt_vdev_size = MAX(SPA_MINDEVSIZE, value); break; - case 'a': + case 'a': zopt_ashift = value; break; - case 'm': + case 'm': zopt_mirrors = value; break; - case 'r': + case 'r': zopt_raidz = MAX(1, value); break; - case 'R': + case 'R': zopt_raidz_parity = MIN(MAX(value, 1), 2); break; - case 'd': + case 'd': zopt_datasets = MAX(1, value); break; - case 't': + case 't': zopt_threads = MAX(1, value); break; - case 'g': + case 'g': zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); break; - case 'i': + case 'i': zopt_init = value; break; - case 'k': + case 'k': zopt_killrate = value; break; - case 'p': + case 'p': zopt_pool = strdup(optarg); break; - case 'f': + case 'f': zopt_dir = strdup(optarg); break; - case 'V': + case 'V': zopt_verbose++; break; - case 'E': + case 'E': zopt_init = 0; break; - case 'T': + case 'T': zopt_time = value; break; - case 'P': + case 'P': zopt_passtime = MAX(1, value); break; - case 'z': + case 'z': zio_zil_fail_shift = MIN(value, 16); break; - case 'h': + case 'h': usage(B_TRUE); break; - case '?': - default: + case '?': + default: usage(B_FALSE); break; } @@ -2616,7 +2616,7 @@ ztest_fault_inject(ztest_args_t *za) if (ztest_random(10) < 6) (void) vdev_offline(spa, guid0, B_TRUE); else - (void) vdev_online(spa, guid0); + (void) vdev_online(spa, guid0, B_FALSE, NULL); } /* diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c index d4029d15d4..0d00e0aa2c 100644 --- a/usr/src/common/zfs/zfs_prop.c +++ b/usr/src/common/zfs/zfs_prop.c @@ -181,6 +181,8 @@ static prop_desc_t zfs_prop_table[] = { "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE }, { "bootfs", prop_type_string, 0, NULL, prop_default, ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE }, + { "autoreplace", prop_type_boolean, 0, NULL, prop_default, + ZFS_TYPE_POOL, "on | off", "REPLACE", B_FALSE, B_TRUE }, }; #define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t))) @@ -246,6 +248,12 @@ zfs_prop_get_type(zfs_prop_t prop) return (zfs_prop_table[prop].pd_proptype); } +zfs_proptype_t +zpool_prop_get_type(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_proptype); +} + static boolean_t propname_match(const char *p, zfs_prop_t prop, size_t len) { @@ -365,12 +373,24 @@ zfs_prop_default_string(zfs_prop_t prop) return (zfs_prop_table[prop].pd_strdefault); } +const char * 
+zpool_prop_default_string(zpool_prop_t prop) +{ + return (zfs_prop_table[prop].pd_strdefault); +} + uint64_t zfs_prop_default_numeric(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_numdefault); } +uint64_t +zpool_prop_default_numeric(zpool_prop_t prop) +{ + return (zfs_prop_table[prop].pd_numdefault); +} + /* * Returns TRUE if the property is readonly. */ @@ -382,7 +402,7 @@ zfs_prop_readonly(zfs_prop_t prop) /* * Given a dataset property ID, returns the corresponding name. - * Assuming the zfs dataset propety ID is valid. + * Assuming the zfs dataset property ID is valid. */ const char * zfs_prop_to_name(zfs_prop_t prop) @@ -392,7 +412,7 @@ zfs_prop_to_name(zfs_prop_t prop) /* * Given a pool property ID, returns the corresponding name. - * Assuming the pool propety ID is valid. + * Assuming the pool property ID is valid. */ const char * zpool_prop_to_name(zpool_prop_t prop) diff --git a/usr/src/common/zfs/zfs_prop.h b/usr/src/common/zfs/zfs_prop.h index 133e740ce6..3b18ec561f 100644 --- a/usr/src/common/zfs/zfs_prop.h +++ b/usr/src/common/zfs/zfs_prop.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,6 +47,7 @@ typedef enum { } zfs_proptype_t; zfs_proptype_t zfs_prop_get_type(zfs_prop_t); +zfs_proptype_t zpool_prop_get_type(zpool_prop_t); size_t zfs_prop_width(zfs_prop_t, boolean_t *); #ifdef __cplusplus diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h index 4550e4a320..b23cfd239a 100644 --- a/usr/src/lib/libzfs/common/libzfs.h +++ b/usr/src/lib/libzfs/common/libzfs.h @@ -158,14 +158,20 @@ extern int zpool_add(zpool_handle_t *, nvlist_t *); * Functions to manipulate pool and vdev state */ extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t); +extern int zpool_clear(zpool_handle_t *, const char *); -extern int zpool_vdev_online(zpool_handle_t *, const char *); -extern int zpool_vdev_offline(zpool_handle_t *, const char *, int); -extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, - nvlist_t *, int); +extern int zpool_vdev_online(zpool_handle_t *, const char *, int, + vdev_state_t *); +extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); +extern int zpool_vdev_attach(zpool_handle_t *, const char *, + const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); -extern int zpool_clear(zpool_handle_t *, const char *); + +extern int zpool_vdev_fault(zpool_handle_t *, uint64_t); +extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t); +extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); + extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); @@ -173,8 +179,9 @@ extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); * Functions to manage pool properties */ extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); -extern int zpool_get_prop(zpool_handle_t *, zfs_prop_t, char *, +extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zfs_source_t *); +extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t); extern const char *zpool_prop_to_name(zpool_prop_t); extern const char *zpool_prop_values(zpool_prop_t); @@ -197,6 +204,8 @@ typedef enum { ZPOOL_STATUS_FAILING_DEV, /* 
device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ + ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ + ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly @@ -372,7 +381,7 @@ extern int zfs_share(zfs_handle_t *); extern int zfs_unshare(zfs_handle_t *); /* - * Protocol-specifc share support functions. + * Protocol-specific share support functions. */ extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); extern int zfs_share_nfs(zfs_handle_t *); diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c index de438079c6..3f8a377eff 100644 --- a/usr/src/lib/libzfs/common/libzfs_dataset.c +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c @@ -1020,7 +1020,7 @@ zfs_validate_properties(libzfs_handle_t *hdl, zfs_type_t type, char *pool_name, break; - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: /* * bootfs property value has to be a dataset name and * the dataset has to be in the same pool as it sets to. @@ -3568,7 +3568,7 @@ zfs_get_user_props(zfs_handle_t *zhp) } /* - * Given a comma-separated list of properties, contruct a property list + * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will * return a NULL list if 'all' is specified, which can later be expanded on a * per-dataset basis by zfs_expand_proplist(). @@ -3631,7 +3631,8 @@ zfs_get_proplist_common(libzfs_handle_t *hdl, char *fields, */ c = s[len]; s[len] = '\0'; - prop = zfs_name_to_prop_common(s, type); + prop = type == ZFS_TYPE_POOL ? zpool_name_to_prop(s) : + zfs_name_to_prop(s); if (prop != ZFS_PROP_INVAL && !zfs_prop_valid_for_type(prop, type)) diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index a5e469c007..fa6cb1a8d6 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -862,10 +862,12 @@ is_spare(zpool_handle_t *zhp, uint64_t guid) } /* - * Bring the specified vdev online + * Bring the specified vdev online. The 'flags' parameter is a set of the + * ZFS_ONLINE_* flags. */ int -zpool_vdev_online(zpool_handle_t *zhp, const char *path) +zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, + vdev_state_t *newstate) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -885,17 +887,22 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path) if (avail_spare || is_spare(zhp, zc.zc_guid) == B_TRUE) return (zfs_error(hdl, EZFS_ISSPARE, msg)); - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_ONLINE, &zc) == 0) - return (0); + zc.zc_cookie = VDEV_STATE_ONLINE; + zc.zc_obj = flags; - return (zpool_standard_error(hdl, errno, msg)); + + if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) + return (zpool_standard_error(hdl, errno, msg)); + + *newstate = zc.zc_cookie; + return (0); } /* * Take the specified vdev offline */ int -zpool_vdev_offline(zpool_handle_t *zhp, const char *path, int istmp) +zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -915,9 +922,43 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, int istmp) if (avail_spare || is_spare(zhp, zc.zc_guid) == B_TRUE) return (zfs_error(hdl, EZFS_ISSPARE, msg)); - zc.zc_cookie = istmp; + zc.zc_cookie = VDEV_STATE_OFFLINE; + zc.zc_obj = istmp ? 
ZFS_OFFLINE_TEMPORARY : 0; + + if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + switch (errno) { + case EBUSY: + + /* + * There are no other replicas of this device. + */ + return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); + + default: + return (zpool_standard_error(hdl, errno, msg)); + } +} + +/* + * Mark the given vdev faulted. + */ +int +zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid); - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_OFFLINE, &zc) == 0) + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_guid = guid; + zc.zc_cookie = VDEV_STATE_FAULTED; + + if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { @@ -931,6 +972,30 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, int istmp) default: return (zpool_standard_error(hdl, errno, msg)); } + +} + +/* + * Mark the given vdev degraded. + */ +int +zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_guid = guid; + zc.zc_cookie = VDEV_STATE_DEGRADED; + + if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); } /* @@ -1232,6 +1297,29 @@ zpool_clear(zpool_handle_t *zhp, const char *path) } /* + * Similar to zpool_clear(), but takes a GUID (used by fmd). + */ +int +zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), + guid); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_guid = guid; + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +/* * Iterate over all zvols in a given pool by walking the /dev/zvol/dsk/<pool> * hierarchy. */ @@ -1492,6 +1580,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv) char *path, *devid; uint64_t value; char buf[64]; + vdev_stat_t *vs; + uint_t vsc; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0) { @@ -1502,7 +1592,16 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv) path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { - if (zhp != NULL && + /* + * If the device is dead (faulted, offline, etc) then don't + * bother opening it. Otherwise we may be forcing the user to + * open a misbehaving device, which can have undesirable + * effects. + */ + if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &vsc) != 0 || + vs->vs_state >= VDEV_STATE_DEGRADED) && + zhp != NULL && nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) { /* * Determine if the current path is correct. @@ -1684,7 +1783,7 @@ zpool_upgrade(zpool_handle_t *zhp) * * 'pool' is B_TRUE if we are logging a command for 'zpool'; B_FALSE * otherwise ('zfs'). 'pool_create' is B_TRUE if we are logging the creation - * of the pool; B_FALSE otherwise. 
'path' is the pathanme containing the + * of the pool; B_FALSE otherwise. 'path' is the pathname containing the * poolname. 'argc' and 'argv' are used to construct the command string. */ void @@ -2121,6 +2220,37 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) return (ret); } +uint64_t +zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop) +{ + uint64_t value; + nvlist_t *nvp; + + if (zpool_get_version(zhp) < ZFS_VERSION_BOOTFS) + return (0); + + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) + return (zpool_prop_default_numeric(prop)); + + switch (prop) { + case ZPOOL_PROP_AUTOREPLACE: + if (nvlist_lookup_nvlist(zhp->zpool_props, + zpool_prop_to_name(prop), &nvp) != 0) { + value = zpool_prop_default_numeric(prop); + } else { + VERIFY(nvlist_lookup_uint64(nvp, ZFS_PROP_VALUE, + &value) == 0); + } + return (value); + break; + + default: + assert(0); + } + + return (0); +} + int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *propbuf, size_t proplen, zfs_source_t *srctype) @@ -2140,22 +2270,16 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *propbuf, return (zfs_error(zhp->zpool_hdl, EZFS_BADVERSION, msg)); } - if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && + prop != ZPOOL_PROP_NAME) return (zfs_error(zhp->zpool_hdl, EZFS_POOLPROPS, msg)); - /* - * the "name" property is special cased - */ - if (!zfs_prop_valid_for_type(prop, ZFS_TYPE_POOL) && - prop != ZFS_PROP_NAME) - return (-1); - switch (prop) { - case ZFS_PROP_NAME: + case ZPOOL_PROP_NAME: (void) strlcpy(propbuf, zhp->zpool_name, proplen); break; - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: if (nvlist_lookup_nvlist(zhp->zpool_props, zpool_prop_to_name(prop), &nvp) != 0) { strvalue = (char *)zfs_prop_default_string(prop); @@ -2171,7 +2295,22 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *propbuf, if (strlen(strvalue) >= proplen) return (-1); } - (void) strcpy(propbuf, strvalue); + (void) strlcpy(propbuf, strvalue, proplen); + break; + + case ZPOOL_PROP_AUTOREPLACE: + if (nvlist_lookup_nvlist(zhp->zpool_props, + zpool_prop_to_name(prop), &nvp) != 0) { + value = zpool_prop_default_numeric(prop); + src = ZFS_SRC_DEFAULT; + } else { + VERIFY(nvlist_lookup_uint64(nvp, + ZFS_PROP_SOURCE, &value) == 0); + src = value; + VERIFY(nvlist_lookup_uint64(nvp, ZFS_PROP_VALUE, + &value) == 0); + } + (void) strlcpy(propbuf, value ? "on" : "off", proplen); break; default: diff --git a/usr/src/lib/libzfs/common/libzfs_status.c b/usr/src/lib/libzfs/common/libzfs_status.c index 3eba97a431..97a81c35bb 100644 --- a/usr/src/lib/libzfs/common/libzfs_status.c +++ b/usr/src/lib/libzfs/common/libzfs_status.c @@ -30,7 +30,7 @@ * include both the status of an active pool, as well as the status exported * pools. Returns one of the ZPOOL_STATUS_* defines describing the status of * the pool. This status is independent (to a certain degree) from the state of - * the pool. A pool's state descsribes only whether or not it is capable of + * the pool. A pool's state describes only whether or not it is capable of * providing the necessary fault tolerance for data. The status describes the * overall status of devices. A pool that is online can still have a device * that is experiencing errors. @@ -47,7 +47,7 @@ #include "libzfs_impl.h" /* - * Message ID table. This must be kep in sync with the ZPOOL_STATUS_* defines + * Message ID table. 
This must be kept in sync with the ZPOOL_STATUS_* defines * in libzfs.h. Note that there are some status results which go past the end * of this table, and hence have no associated message ID. */ @@ -65,25 +65,6 @@ static char *zfs_msgid_table[] = { "ZFS-8000-EY" }; -/* - * If the pool is active, a certain class of static errors is overridden by the - * faults as analayzed by FMA. These faults have separate knowledge articles, - * and the article referred to by 'zpool status' must match that indicated by - * the syslog error message. We override missing data as well as corrupt pool. - */ -static char *zfs_msgid_table_active[] = { - "ZFS-8000-14", - "ZFS-8000-D3", /* overridden */ - "ZFS-8000-D3", /* overridden */ - "ZFS-8000-4J", - "ZFS-8000-5E", - "ZFS-8000-6X", - "ZFS-8000-CS", /* overridden */ - "ZFS-8000-8A", - "ZFS-8000-9P", - "ZFS-8000-CS", /* overridden */ -}; - #define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0])) /* ARGSUSED */ @@ -96,9 +77,16 @@ vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) /* ARGSUSED */ static int +vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_FAULTED); +} + +/* ARGSUSED */ +static int vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) { - return (errs != 0); + return (state == VDEV_STATE_DEGRADED || errs != 0); } /* ARGSUSED */ @@ -163,9 +151,9 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * following: * * - Check for a complete and valid configuration - * - Look for any missing devices in a non-replicated config + * - Look for any faulted or missing devices in a non-replicated config * - Check for any data errors - * - Check for any missing devices in a replicated config + * - Check for any faulted or missing devices in a replicated config * - Look for any devices showing errors * - Check for any resilvering devices * @@ -215,9 +203,13 @@ check_status(nvlist_t *config, boolean_t isimport) return (ZPOOL_STATUS_BAD_GUID_SUM); /* - * Missing devices in non-replicated config. + * Bad devices in non-replicated config. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && + find_vdev_problem(nvroot, vdev_faulted)) + return (ZPOOL_STATUS_FAULTED_DEV_NR); + + if (vs->vs_state == VDEV_STATE_CANT_OPEN && find_vdev_problem(nvroot, vdev_missing)) return (ZPOOL_STATUS_MISSING_DEV_NR); @@ -244,6 +236,8 @@ check_status(nvlist_t *config, boolean_t isimport) /* * Missing devices in a replicated config. 
*/ + if (find_vdev_problem(nvroot, vdev_faulted)) + return (ZPOOL_STATUS_FAULTED_DEV_R); if (find_vdev_problem(nvroot, vdev_missing)) return (ZPOOL_STATUS_MISSING_DEV_R); if (find_vdev_problem(nvroot, vdev_broken)) @@ -284,7 +278,7 @@ zpool_get_status(zpool_handle_t *zhp, char **msgid) if (ret >= NMSGID) *msgid = NULL; else - *msgid = zfs_msgid_table_active[ret]; + *msgid = zfs_msgid_table[ret]; return (ret); } diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c index 6b5a6de945..2ae3e2e6ce 100644 --- a/usr/src/lib/libzfs/common/libzfs_util.c +++ b/usr/src/lib/libzfs/common/libzfs_util.c @@ -490,9 +490,8 @@ zfs_nicenum(uint64_t num, char *buf, size_t buflen) */ int i; for (i = 2; i >= 0; i--) { - (void) snprintf(buf, buflen, "%.*f%c", i, - (double)num / (1ULL << 10 * index), u); - if (strlen(buf) <= 5) + if (snprintf(buf, buflen, "%.*f%c", i, + (double)num / (1ULL << 10 * index), u) <= 5) break; } } diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers index bd4d915a36..e182f14eed 100644 --- a/usr/src/lib/libzfs/common/mapfile-vers +++ b/usr/src/lib/libzfs/common/mapfile-vers @@ -116,6 +116,7 @@ SUNWprivate_1.1 { zpool_get_history; zpool_get_name; zpool_get_prop; + zpool_get_prop_int; zpool_get_proplist; zpool_get_root; zpool_get_space_total; @@ -144,7 +145,10 @@ SUNWprivate_1.1 { zpool_unmount_datasets; zpool_upgrade; zpool_vdev_attach; + zpool_vdev_clear; + zpool_vdev_degrade; zpool_vdev_detach; + zpool_vdev_fault; zpool_vdev_name; zpool_vdev_offline; zpool_vdev_online; diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h index 53f65b009b..e30f1c704d 100644 --- a/usr/src/lib/libzpool/common/sys/zfs_context.h +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h @@ -72,6 +72,7 @@ extern "C" { #include <sys/zfs_debug.h> #include <sys/sdt.h> #include <sys/kstat.h> +#include <sys/sysevent/eventdefs.h> /* * Debugging @@ -151,7 +152,7 @@ _NOTE(CONSTCOND) } while (0) #endif /* - * Dtrace SDT probes have different signatures in userland than they do in + * DTrace SDT probes have different signatures in userland than they do in * kernel. If they're being used in kernel code, re-define them out of * existence for their counterparts in libzpool. */ diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_com b/usr/src/pkgdefs/SUNWzfsu/prototype_com index a9bd9105b2..502cbd8862 100644 --- a/usr/src/pkgdefs/SUNWzfsu/prototype_com +++ b/usr/src/pkgdefs/SUNWzfsu/prototype_com @@ -20,7 +20,7 @@ # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
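The libzfs_util.c change above (6514712) works because snprintf() returns the length the formatted string would have had even when output is truncated to fit the buffer; testing that return value instead of strlen(buf) keeps the size check honest when the caller passes a perfectly-sized buffer. A standalone sketch of the same idiom follows; fmt_units() is a hypothetical stand-in for zfs_nicenum(), not library code.

	#include <stdio.h>

	static void
	fmt_units(double val, char suffix, char *buf, size_t buflen)
	{
		int i;

		for (i = 2; i >= 0; i--) {
			/*
			 * snprintf() reports the untruncated length, so a
			 * perfectly-sized 'buf' no longer fools the check the
			 * way strlen(buf) could after truncation.
			 */
			if (snprintf(buf, buflen, "%.*f%c", i, val, suffix) <= 5)
				break;
		}
	}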
# # ident "%Z%%M% %I% %E% SMI" @@ -57,6 +57,9 @@ d none usr/lib/mdb 755 root sys d none usr/lib/mdb/kvm 755 root sys d none usr/lib/mdb/proc 755 root sys f none usr/lib/mdb/proc/libzpool.so 555 root sys +d none usr/lib/sysevent 755 root bin +d none usr/lib/sysevent/modules 755 root bin +f none usr/lib/sysevent/modules/zfs_mod.so 755 root sys d none usr/sbin 755 root bin l none usr/sbin/zdb=../../usr/lib/isaexec s none usr/sbin/zfs=../../sbin/zfs diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index dfdf0c846e..6963bcecab 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -424,6 +424,24 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) } /* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + } +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ @@ -438,6 +456,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t pool_guid; uint64_t version; zio_t *zio; + uint64_t autoreplace = 0; spa->spa_load_state = state; @@ -711,11 +730,25 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) if (error == 0) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), sizeof (uint64_t), 1, &spa->spa_bootfs); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), + sizeof (uint64_t), 1, &autoreplace); } /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (autoreplace) + spa_check_removed(spa->spa_root_vdev); + + /* * Load the vdev state for all toplevel vdevs. */ vdev_load(rvd); @@ -795,7 +828,7 @@ out: * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the - * POOL_STATE_UNITIALIZED state. + * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some @@ -879,6 +912,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) } spa_open_ref(spa, tag); + + /* + * If we just loaded the pool, resilver anything that's out of date. + */ + if (loaded && (spa_mode & FWRITE)) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + if (locked) mutex_exit(&spa_namespace_lock); @@ -890,12 +930,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) spa_config_exit(spa, FTAG); } - /* - * If we just loaded the pool, resilver anything that's out of date. 
- */ - if (loaded && (spa_mode & FWRITE)) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); - return (0); } @@ -1219,7 +1253,7 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) dmu_tx_commit(tx); - spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1325,14 +1359,14 @@ spa_import(const char *pool, nvlist_t *config, const char *altroot) */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - mutex_exit(&spa_namespace_lock); - /* * Resilver anything that's out of date. */ if (spa_mode & FWRITE) VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + return (0); } @@ -1476,6 +1510,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) } } + spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); @@ -1657,7 +1693,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally idendical to + * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) @@ -1685,7 +1721,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; @@ -1818,9 +1857,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); /* - * Kick off a resilver to update newvd. + * Kick off a resilver to update newvd. We need to grab the namespace + * lock because spa_scrub() needs to post a sysevent with the pool name. */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } @@ -1973,7 +2015,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * Reevaluate the parent vdev state. */ - vdev_propagate_state(cvd->vdev_parent); + vdev_propagate_state(cvd); /* * If the device we just detached was smaller than the others, it may be @@ -1996,6 +2038,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + error = spa_vdev_exit(spa, vd, txg, 0); /* @@ -2098,20 +2142,24 @@ out: } /* - * Find any device that's done replacing, so we can detach it. + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * current spared, so we can detach it. 
*/ static vdev_t * -spa_vdev_replace_done_hunt(vdev_t *vd) +spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; int c; for (c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } + /* + * Check for a completed replacement. + */ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; @@ -2125,11 +2173,29 @@ spa_vdev_replace_done_hunt(vdev_t *vd) mutex_exit(&newvd->vdev_dtl_lock); } + /* + * Check for a completed resilver with the 'unspare' flag set. + */ + if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { + newvd = vd->vdev_child[0]; + oldvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_unspare && + newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + newvd->vdev_unspare = 0; + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + return (NULL); } static void -spa_vdev_replace_done(spa_t *spa) +spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd; vdev_t *pvd; @@ -2138,7 +2204,7 @@ spa_vdev_replace_done(spa_t *spa) spa_config_enter(spa, RW_READER, FTAG); - while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { guid = vd->vdev_guid; /* * If we have just finished replacing a hot spared device, then @@ -2449,6 +2515,9 @@ spa_scrub_thread(spa_t *spa) vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); spa_errlog_rotate(spa); + if (scrub_type == POOL_SCRUB_RESILVER && complete) + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); @@ -2457,7 +2526,7 @@ spa_scrub_thread(spa_t *spa) * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* * If we were told to restart, our final act is to start a new scrub. 
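The spa.c hunks around this point move device removal and resilver completion onto the existing asynchronous task mechanism: a requester (for example the disk I/O completion path) only sets a bit such as SPA_ASYNC_REMOVE, and spa_async_thread() later drains the pending bits under the proper locks. The following is a simplified, kernel-context model of that request/drain pattern, not the real spa fields; task_ctx_t, task_request() and task_claim() are illustrative names, and the usual kmutex_t/kcondvar_t primitives are assumed.

	typedef struct task_ctx {
		kmutex_t	tc_lock;
		kcondvar_t	tc_cv;
		int		tc_tasks;	/* pending TASK_* bits */
	} task_ctx_t;

	static void
	task_request(task_ctx_t *tc, int flag)
	{
		/* cheap enough to call from I/O completion context */
		mutex_enter(&tc->tc_lock);
		tc->tc_tasks |= flag;
		cv_broadcast(&tc->tc_cv);
		mutex_exit(&tc->tc_lock);
	}

	static int
	task_claim(task_ctx_t *tc)
	{
		int tasks;

		mutex_enter(&tc->tc_lock);
		while (tc->tc_tasks == 0)
			cv_wait(&tc->tc_cv, &tc->tc_lock);
		tasks = tc->tc_tasks;
		tc->tc_tasks = 0;	/* the worker owns everything it claimed */
		mutex_exit(&tc->tc_lock);

		return (tasks);
	}

The point of the pattern is that the requester never blocks on pool locks; only the worker thread takes the heavyweight spa_vdev_enter()/spa_namespace_lock paths.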
@@ -2568,7 +2637,7 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ if (type == POOL_SCRUB_RESILVER) { type = POOL_SCRUB_NONE; - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } } else { /* @@ -2593,6 +2662,8 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) mintxg = ss->ss_start - 1; ss = avl_last(&rvd->vdev_dtl_map.sm_root); maxtxg = MIN(ss->ss_end, maxtxg); + + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); } mutex_exit(&rvd->vdev_dtl_lock); @@ -2624,29 +2695,29 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ static void -spa_async_reopen(spa_t *spa) +spa_async_remove(spa_t *spa, vdev_t *vd) { - vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; int c; - spa_config_enter(spa, RW_WRITER, FTAG); - - for (c = 0; c < rvd->vdev_children; c++) { - tvd = rvd->vdev_child[c]; - if (tvd->vdev_reopen_wanted) { - tvd->vdev_reopen_wanted = 0; - vdev_reopen(tvd); + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_remove_wanted) { + tvd->vdev_remove_wanted = 0; + vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, + VDEV_AUX_NONE); + vdev_clear(spa, tvd); + vdev_config_dirty(tvd->vdev_top); } + spa_async_remove(spa, tvd); } - - spa_config_exit(spa, FTAG); } static void spa_async_thread(spa_t *spa) { int tasks; + uint64_t txg; ASSERT(spa->spa_sync_on); @@ -2665,28 +2736,40 @@ spa_async_thread(spa_t *spa) } /* - * See if any devices need to be reopened. + * See if any devices need to be marked REMOVED. */ - if (tasks & SPA_ASYNC_REOPEN) - spa_async_reopen(spa); + if (tasks & SPA_ASYNC_REMOVE) { + txg = spa_vdev_enter(spa); + spa_async_remove(spa, spa->spa_root_vdev); + (void) spa_vdev_exit(spa, NULL, txg, 0); + } /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_REPLACE_DONE) - spa_vdev_replace_done(spa); + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); /* - * Kick off a scrub. + * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING + * scrub which can become a resilver), we need to hold + * spa_namespace_lock() because the sysevent we post via + * spa_event_notify() needs to get the name of the pool. */ - if (tasks & SPA_ASYNC_SCRUB) + if (tasks & SPA_ASYNC_SCRUB) { + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + } /* * Kick off a resilver. */ - if (tasks & SPA_ASYNC_RESILVER) + if (tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); + } /* * Let the world know that we're done. @@ -2810,7 +2893,7 @@ spa_sync_spares(spa_t *spa, dmu_tx_t *tx) /* * Update the MOS nvlist describing the list of available spares. * spa_validate_spares() will have already made sure this nvlist is - * valid and the vdevs are labelled appropriately. + * valid and the vdevs are labeled appropriately. 
*/ if (spa->spa_spares_object == 0) { spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, @@ -2869,6 +2952,7 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) nvpair_t *nvpair; objset_t *mos = spa->spa_meta_objset; uint64_t zapobj; + uint64_t intval; mutex_enter(&spa->spa_props_lock); if (spa->spa_pool_props_object == 0) { @@ -2886,14 +2970,23 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { switch (zpool_name_to_prop(nvpair_name(nvpair))) { - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: VERIFY(nvlist_lookup_uint64(nvp, nvpair_name(nvpair), &spa->spa_bootfs) == 0); VERIFY(zap_update(mos, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1, &spa->spa_bootfs, tx) == 0); break; + + case ZPOOL_PROP_AUTOREPLACE: + VERIFY(nvlist_lookup_uint64(nvp, + nvpair_name(nvpair), &intval) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1, + &intval, tx) == 0); + break; } } } @@ -3191,7 +3284,7 @@ spa_get_props(spa_t *spa, nvlist_t **nvp) zap_attribute_t za; objset_t *mos = spa->spa_meta_objset; zfs_source_t src; - zfs_prop_t prop; + zpool_prop_t prop; nvlist_t *propval; uint64_t value; int err; @@ -3215,14 +3308,14 @@ spa_get_props(spa_t *spa, nvlist_t **nvp) VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (za.za_integer_length) { case 8: - if (zfs_prop_default_numeric(prop) == + if (zpool_prop_default_numeric(prop) == za.za_first_integer) src = ZFS_SRC_DEFAULT; else src = ZFS_SRC_LOCAL; value = za.za_first_integer; - if (prop == ZFS_PROP_BOOTFS) { + if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; char strval[MAXPATHLEN]; @@ -3274,7 +3367,61 @@ spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } + +/* + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. 
+ */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +{ +#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; + sysevent_id_t eid; + + ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", + SE_SLEEP); + + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = spa_name(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) + goto done; + + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = spa_guid(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) + goto done; + + if (vd) { + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = vd->vdev_guid; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, + SE_SLEEP) != 0) + goto done; + + if (vd->vdev_path) { + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = vd->vdev_path; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, + &value, SE_SLEEP) != 0) + goto done; + } + } + + (void) log_sysevent(ev, SE_SLEEP, &eid); + +done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 3e51849766..c08c58cffe 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -590,13 +590,15 @@ spa_config_held(spa_t *spa, krw_t rw) uint64_t spa_vdev_enter(spa_t *spa) { + mutex_enter(&spa_namespace_lock); + /* - * Suspend scrub activity while we mess with the config. + * Suspend scrub activity while we mess with the config. We must do + * this after acquiring the namespace lock to avoid a 3-way deadlock + * with spa_scrub_stop() and the scrub thread. */ spa_scrub_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, RW_WRITER, spa); return (spa_last_synced_txg(spa) + 1); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 2bcf4c8a32..8c2a286847 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -330,8 +330,8 @@ extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); -#define SPA_ASYNC_REOPEN 0x01 -#define SPA_ASYNC_REPLACE_DONE 0x02 +#define SPA_ASYNC_REMOVE 0x01 +#define SPA_ASYNC_RESILVER_DONE 0x02 #define SPA_ASYNC_SCRUB 0x04 #define SPA_ASYNC_RESILVER 0x08 #define SPA_ASYNC_CONFIG_UPDATE 0x10 @@ -452,6 +452,8 @@ extern void spa_log_error(spa_t *spa, struct zio *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); @@ -469,6 +471,9 @@ extern int spa_get_props(spa_t *spa, nvlist_t **nvp); extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern boolean_t spa_has_bootfs(spa_t *spa); +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); + #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 3120811625..c651d1eebb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -84,8 +84,11 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); extern void vdev_io_start(zio_t *zio); extern void vdev_io_done(zio_t *zio); -extern int vdev_online(spa_t *spa, uint64_t guid); -extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); @@ -95,6 +98,7 @@ extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); extern int vdev_cache_read(zio_t *zio); extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 0891fcc0ad..4e83497420 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -140,7 +140,7 @@ struct vdev { txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_reopen_wanted; /* async reopen wanted? */ + boolean_t vdev_remove_wanted; /* async remove wanted? */ list_node_t vdev_dirty_node; /* config dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ @@ -151,14 +151,17 @@ struct vdev { space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* device taken offline? */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ + char *vdev_physpath; /* vdev device path (if any) */ uint64_t vdev_fault_arg; /* fault injection paramater */ int vdev_fault_mask; /* zio types to fault */ uint8_t vdev_fault_mode; /* fault injection mode */ - uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? 
*/ uint64_t vdev_isspare; /* was a hot spare */ @@ -167,6 +170,9 @@ struct vdev { uint64_t vdev_not_present; /* not present during import */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index 2f0e3e792d..8a689e0760 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -61,6 +60,8 @@ extern "C" { #include <sys/zone.h> #include <sys/uio.h> #include <sys/zfs_debug.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fbb77774c2..9b2ec04710 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -319,44 +319,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } /* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. @@ -408,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); /* * Set the nparity propery for RAID-Z vdevs. 
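The vdev.c hunk that follows loads the new persistent state bits (unspare, faulted, degraded, removed) back out of the label nvlist, the counterpart of the vdev_label.c change later in this commit that writes each pair only when the flag is set. Below is a minimal userland illustration of that round trip using libnvpair; the "faulted" key string and the helper names are placeholders, not the real ZPOOL_CONFIG_* definitions or kernel code.

	#include <libnvpair.h>
	#include <stdlib.h>

	#define	CONFIG_FAULTED	"faulted"	/* placeholder for ZPOOL_CONFIG_FAULTED */

	static void
	state_to_nvlist(nvlist_t *nv, uint64_t faulted)
	{
		/* the pair is added only when the flag is set */
		if (faulted && nvlist_add_uint64(nv, CONFIG_FAULTED, 1ULL) != 0)
			abort();
	}

	static uint64_t
	state_from_nvlist(nvlist_t *nv)
	{
		uint64_t faulted = 0;

		/* an absent pair simply leaves the default of "not faulted" */
		(void) nvlist_lookup_uint64(nv, CONFIG_FAULTED, &faulted);
		return (faulted);
	}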
@@ -477,13 +449,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } /* - * If we're a leaf vdev, try to load the DTL object and offline state. + * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } } /* @@ -500,6 +487,7 @@ void vdev_free(vdev_t *vd) { int c; + spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than @@ -507,6 +495,7 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + ASSERT(!list_link_active(&vd->vdev_dirty_node)); /* @@ -535,7 +524,37 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_parent == NULL); - vdev_free_common(vd); + /* + * Clean up vdev structure. + */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); } /* @@ -590,9 +609,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) vdev_config_dirty(tvd); } - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; } @@ -781,13 +797,12 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); @@ -798,16 +813,25 @@ vdev_open(vdev_t *vd) if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, ENXIO); - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } - vd->vdev_state = VDEV_STATE_HEALTHY; + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + 
ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } for (c = 0; c < vd->vdev_children; c++) if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { @@ -905,8 +929,7 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. + * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if @@ -988,11 +1011,7 @@ vdev_close(vdev_t *vd) { vd->vdev_ops->vdev_op_close(vd); - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } + vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are @@ -1022,22 +1041,13 @@ vdev_reopen(vdev_t *vd) * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). - * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. */ (void) vdev_validate(vd); /* - * Reassess root vdev's health. + * Reassess parent vdev's health. */ - vdev_propagate_state(spa->spa_root_vdev); + vdev_propagate_state(vd); } int @@ -1428,8 +1438,12 @@ vdev_description(vdev_t *vd) return (vd->vdev_ops->vdev_op_type); } +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. + */ int -vdev_online(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid) { vdev_t *rvd, *vd; uint64_t txg; @@ -1440,27 +1454,141 @@ vdev_online(spa_t *spa, uint64_t guid) if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted cause the toplevel vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (!vdev_is_dead(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. 
The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("ONLINE: %s\n", vdev_description(vd)); + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) { + (void) spa_vdev_exit(spa, NULL, txg, 0); + return (0); + } + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + vdev_config_dirty(vd->vdev_top); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + return (0); +} + +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open. + */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *newstate) +{ + vdev_t *rvd, *vd; + uint64_t txg; + + txg = spa_vdev_enter(spa); + + rvd = spa->spa_root_vdev; + + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? + B_TRUE : B_FALSE; + vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? + B_TRUE : B_FALSE; vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; vdev_config_dirty(vd->vdev_top); (void) spa_vdev_exit(spa, NULL, txg, 0); + /* + * Must hold spa_namespace_lock in order to post resilver sysevent + * w/pool name. + */ + mutex_enter(&spa_namespace_lock); VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + mutex_exit(&spa_namespace_lock); return (0); } int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *rvd, *vd; uint64_t txg; @@ -1475,8 +1603,6 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - dprintf("OFFLINE: %s\n", vdev_description(vd)); - /* * If the device isn't already offline, try to offline it. */ @@ -1505,7 +1631,8 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) } } - vd->vdev_tmpoffline = istmp; + vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? + B_TRUE : B_FALSE; vdev_config_dirty(vd->vdev_top); @@ -1531,12 +1658,29 @@ vdev_clear(spa_t *spa, vdev_t *vd) for (c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state, then clear the persistent state and + * attempt to reopen the device. We also mark the vdev config dirty, so + * that the new faulted state is written out to disk. 
+ */ + if (vd->vdev_faulted || vd->vdev_degraded) { + vd->vdev_faulted = vd->vdev_degraded = 0; + vdev_reopen(vd); + vdev_config_dirty(vd->vdev_top); + + if (vd->vdev_faulted) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, + B_TRUE) == 0); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } } int vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); + return (vd->vdev_state < VDEV_STATE_DEGRADED); } int @@ -1563,12 +1707,6 @@ vdev_error_inject(vdev_t *vd, zio_t *zio) break; } - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } - return (error); } @@ -1792,28 +1930,34 @@ vdev_propagate_state(vdev_t *vd) int c; vdev_t *child; - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + if (vdev_is_dead(child)) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -1839,7 +1983,39 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; - if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device. Otherwise, we keep accessible + * but invalid devices open forever. We don't call vdev_close() itself, + * because that implies some extra checks (offline, etc) that we don't + * want here. This is limited to leaf devices, because otherwise + * closing the device will affect other children. + */ + if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. 
+ */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import, we mark it as * "not available", which signifies that it was never there to @@ -1856,8 +2032,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. */ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && vd != vd->vdev_spa->spa_root_vdev) { const char *class; @@ -1887,11 +2072,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) zfs_ereport_post(class, vd->vdev_spa, vd, NULL, save_state, 0); } - } - if (isopen) - return; + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; + } - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); + if (!isopen) + vdev_propagate_state(vd); } diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index 2d8795c660..d7d8755f92 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -352,6 +352,18 @@ vdev_cache_write(zio_t *zio) } void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void vdev_cache_init(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; @@ -371,12 +383,8 @@ void vdev_cache_fini(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); + vdev_cache_purge(vd); avl_destroy(&vc->vc_offset_tree); avl_destroy(&vc->vc_lastused_tree); diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index b965b1c5f0..5789312667 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -50,6 +50,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_disk_t *dvd; struct dk_minfo dkm; int error; + dev_t dev; + char *physpath, *minorname; + int otyp; /* * We must have a pathname, and it must be absolute. @@ -141,12 +144,57 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode, kcred, &dvd->vd_lh, zfs_li); + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. + */ + if (error) { + if (vd->vdev_physpath != NULL && + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + kcred, &dvd->vd_lh, zfs_li); + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + &dvd->vd_lh, zfs_li); + } + if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* + * Once a device is opened, verify that the physical device path (if + * available) is up to date. + */ + if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && + ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + minorname = NULL; + if (ddi_dev_pathname(dev, otyp, physpath) == 0 && + ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && + (vd->vdev_physpath == NULL || + strcmp(vd->vdev_physpath, physpath) != 0)) { + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + (void) strlcat(physpath, ":", MAXPATHLEN); + (void) strlcat(physpath, minorname, MAXPATHLEN); + vd->vdev_physpath = spa_strdup(physpath); + } + if (minorname) + kmem_free(minorname, strlen(minorname) + 1); + kmem_free(physpath, MAXPATHLEN); + } + + /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { @@ -191,10 +239,6 @@ vdev_disk_close(vdev_t *vd) if (dvd == NULL) return; - dprintf("removing disk %s, devid %s\n", - vd->vdev_path ? vd->vdev_path : "<none>", - vd->vdev_devid ? vd->vdev_devid : "<none>"); - if (dvd->vd_minor != NULL) ddi_devid_str_free(dvd->vd_minor); @@ -340,6 +384,10 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + int state; + vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -348,6 +396,21 @@ vdev_disk_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. 
+ */ + if (zio->io_error == EIO) { + state = DKIO_NONE; + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && + state != DKIO_INSERTED) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 9d9f5556fa..f7c51a1594 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -62,7 +62,7 @@ * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transacation groups: + * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ @@ -209,6 +209,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid) == 0); + if (vd->vdev_physpath != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -285,9 +289,18 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); - else - (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); + if (vd->vdev_faulted) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, + B_TRUE) == 0); + if (vd->vdev_degraded) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, + B_TRUE) == 0); + if (vd->vdev_removed) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, + B_TRUE) == 0); + if (vd->vdev_unspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, + B_TRUE) == 0); } return (nv); @@ -496,7 +509,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already - * labelled appropriately and we can just return. + * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); @@ -605,7 +618,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labelling it as a spare, or b) it + * mark it as such only if a) we are labeling it as a spare, or b) it * exists as a spare elsewhere in the system. */ if (error == 0 && !vd->vdev_isspare && diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c index a886d614d8..146c4ec438 100644 --- a/usr/src/uts/common/fs/zfs/zfs_fm.c +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -117,9 +117,11 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, /* * Ignore any errors from I/Os that we are going to retry anyway - we - * only generate errors from the final failure. + * only generate errors from the final failure. 
Checksum errors are + * generated after the pipeline stage responsible for retrying the I/O + * (VDEV_IO_ASSESS), so this only applies to standard I/O errors. */ - if (zio && zio_should_retry(zio)) + if (zio && zio_should_retry(zio) && zio->io_error != ECKSUM) return; /* @@ -292,13 +294,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } -/* - * The 'resource.fs.zfs.ok' event is an internal signal that the associated - * resource (pool or disk) has been identified by ZFS as healthy. This will - * then trigger the DE to close the associated case, if any. - */ -void -zfs_post_ok(spa_t *spa, vdev_t *vd) +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL nvlist_t *resource; @@ -308,7 +305,7 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) return; (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, FM_RESOURCE_OK); + ZFS_ERROR_CLASS, name); VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); VERIFY(nvlist_add_uint64(resource, @@ -322,3 +319,37 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) fm_nvlist_destroy(resource, FM_NVA_FREE); #endif } + +/* + * The 'resource.fs.zfs.ok' event is an internal signal that the associated + * resource (pool or disk) has been identified by ZFS as healthy. This will + * then trigger the DE to close the associated case, if any. + */ +void +zfs_post_ok(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_OK); +} + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. 
+ */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index fccfc1355e..74d033001b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -439,7 +439,9 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); + spa_config_enter(spa, RW_READER, FTAG); error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + spa_config_exit(spa, FTAG); spa_close(spa, FTAG); @@ -618,28 +620,35 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc) } static int -zfs_ioc_vdev_online(zfs_cmd_t *zc) +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = vdev_online(spa, zc->zc_guid); - spa_close(spa, FTAG); - return (error); -} + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; -static int -zfs_ioc_vdev_offline(zfs_cmd_t *zc) -{ - spa_t *spa; - int istmp = zc->zc_cookie; - int error; + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_offline(spa, zc->zc_guid, istmp); + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } @@ -1096,7 +1105,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) } switch (prop) { - case ZFS_PROP_BOOTFS: + case ZPOOL_PROP_BOOTFS: /* * A bootable filesystem can not be on a RAIDZ pool * nor a striped pool with more than 1 device. 
@@ -1115,8 +1124,8 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) VERIFY(nvpair_value_string(elem, &strval) == 0); if (strval == NULL || strval[0] == '\0') { - objnum = - zfs_prop_default_numeric(ZFS_PROP_BOOTFS); + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); break; } @@ -1126,9 +1135,6 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) objnum = dmu_objset_id(os); dmu_objset_close(os); break; - - default: - error = EINVAL; } if (error) @@ -1137,10 +1143,11 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) if (error == 0) { if (reset_bootfs) { VERIFY(nvlist_remove(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING) == 0); VERIFY(nvlist_add_uint64(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), + objnum) == 0); } error = spa_set_props(spa, nvl); } @@ -1565,23 +1572,24 @@ zfs_ioc_clear(zfs_cmd_t *zc) spa_t *spa; vdev_t *vd; int error; + uint64_t txg; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_config_enter(spa, RW_WRITER, FTAG); + txg = spa_vdev_enter(spa); if (zc->zc_guid == 0) { vd = NULL; } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_config_exit(spa, FTAG); + (void) spa_vdev_exit(spa, NULL, txg, ENODEV); spa_close(spa, FTAG); return (ENODEV); } vdev_clear(spa, vd); - spa_config_exit(spa, FTAG); + (void) spa_vdev_exit(spa, NULL, txg, 0); spa_close(spa, FTAG); @@ -1620,8 +1628,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name }, diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 42c30d7edd..130e697d60 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1519,25 +1519,6 @@ zio_vdev_io_assess(zio_t *zio) return; } - if (zio->io_error != 0 && zio->io_error != ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { - /* - * Poor man's hotplug support. Even if we're done retrying this - * I/O, try to reopen the vdev to see if it's still attached. - * To avoid excessive thrashing, we only try it once a minute. - * This also has the effect of detecting when missing devices - * have come back, by polling the device once a minute. - * - * We need to do this asynchronously because we can't grab - * all the necessary locks way down here. - */ - if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { - vd->vdev_last_try = gethrtime(); - tvd->vdev_reopen_wanted = 1; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); - } - } - zio_next_stage(zio); } diff --git a/usr/src/uts/common/io/lofi.c b/usr/src/uts/common/io/lofi.c index 4af7fe70b4..1a068ef3ee 100644 --- a/usr/src/uts/common/io/lofi.c +++ b/usr/src/uts/common/io/lofi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -59,6 +59,14 @@ * controller to talk to, and that didn't seem easy to fake. Or possibly even * necessary, since we have mkfs_pcfs now). 
* + * Normally, a lofi device cannot be detached if it is open (i.e. busy). To + * support simulation of hotplug events, an optional force flag is provided. + * If a lofi device is open when a force detach is requested, then the + * underlying file is closed and any subsequent operations return EIO. When the + * device is closed for the last time, it will be cleaned up at that time. In + * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is + * detached but not removed. + * * Known problems: * * UFS logging. Mounting a UFS filesystem image "logging" @@ -207,7 +215,38 @@ mark_closed(struct lofi_state *lsp, int otyp) } } -/*ARGSUSED3*/ +static void +lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp, + cred_t *credp) +{ + dev_t newdev; + char namebuf[50]; + + if (lsp->ls_vp) { + (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp); + VN_RELE(lsp->ls_vp); + lsp->ls_vp = NULL; + } + + newdev = makedevice(getmajor(dev), minor); + (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME); + (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME); + + (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); + ddi_remove_minor_node(lofi_dip, namebuf); + (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor); + ddi_remove_minor_node(lofi_dip, namebuf); + + kmem_free(lsp->ls_filename, lsp->ls_filename_sz); + taskq_destroy(lsp->ls_taskq); + if (lsp->ls_kstat) { + kstat_delete(lsp->ls_kstat); + mutex_destroy(&lsp->ls_kstat_lock); + } + ddi_soft_state_free(lofi_statep, minor); +} + +/*ARGSUSED*/ static int lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) { @@ -244,6 +283,11 @@ lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) return (EINVAL); } + if (lsp->ls_vp == NULL) { + mutex_exit(&lofi_lock); + return (ENXIO); + } + if (mark_opened(lsp, otyp) == -1) { mutex_exit(&lofi_lock); return (EINVAL); @@ -253,16 +297,13 @@ lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) return (0); } -/*ARGSUSED3*/ +/*ARGSUSED*/ static int lofi_close(dev_t dev, int flag, int otyp, struct cred *credp) { minor_t minor; struct lofi_state *lsp; -#ifdef lint - flag = flag; -#endif mutex_enter(&lofi_lock); minor = getminor(dev); lsp = ddi_get_soft_state(lofi_statep, minor); @@ -271,6 +312,13 @@ lofi_close(dev_t dev, int flag, int otyp, struct cred *credp) return (EINVAL); } mark_closed(lsp, otyp); + + /* + * If we have forcibly closed the underlying device, and this is the + * last close, then tear down the rest of the device. + */ + if (minor != 0 && lsp->ls_vp == NULL && !is_opened(lsp)) + lofi_free_handle(dev, minor, lsp, credp); mutex_exit(&lofi_lock); return (0); } @@ -312,7 +360,9 @@ lofi_strategy_task(void *arg) * we have the rw_lock. So instead we page, unless it's not * mapable or it's a character device. */ - if (((lsp->ls_vp->v_flag & VNOMAP) == 0) && + if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { + error = EIO; + } else if (((lsp->ls_vp->v_flag & VNOMAP) == 0) && (lsp->ls_vp->v_type != VCHR)) { /* * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on @@ -400,6 +450,12 @@ lofi_strategy_task(void *arg) kstat_runq_exit(kioptr); mutex_exit(lsp->ls_kstat->ks_lock); } + + mutex_enter(&lsp->ls_vp_lock); + if (--lsp->ls_vp_iocount == 0) + cv_broadcast(&lsp->ls_vp_cv); + mutex_exit(&lsp->ls_vp_lock); + bioerror(bp, error); biodone(bp); } @@ -422,6 +478,14 @@ lofi_strategy(struct buf *bp) * queues were incredibly easy so they win. 
*/ lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev)); + mutex_enter(&lsp->ls_vp_lock); + if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { + bioerror(bp, EIO); + biodone(bp); + mutex_exit(&lsp->ls_vp_lock); + return (0); + } + offset = bp->b_lblkno * DEV_BSIZE; /* offset within file */ if (offset == lsp->ls_vp_size) { /* EOF */ @@ -433,13 +497,18 @@ lofi_strategy(struct buf *bp) bioerror(bp, ENXIO); } biodone(bp); + mutex_exit(&lsp->ls_vp_lock); return (0); } if (offset > lsp->ls_vp_size) { bioerror(bp, ENXIO); biodone(bp); + mutex_exit(&lsp->ls_vp_lock); return (0); } + lsp->ls_vp_iocount++; + mutex_exit(&lsp->ls_vp_lock); + if (lsp->ls_kstat) { mutex_enter(lsp->ls_kstat->ks_lock); kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat)); @@ -720,15 +789,15 @@ lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, struct lofi_state *lsp; struct lofi_ioctl *klip; int error; - char namebuf[50]; struct vnode *vp; int64_t Nblocks_prop_val; int64_t Size_prop_val; vattr_t vattr; int flag; enum vtype v_type; - dev_t newdev; int zalloced = 0; + dev_t newdev; + char namebuf[50]; klip = copy_in_lofi_ioctl(ulip, ioctl_flag); if (klip == NULL) @@ -846,6 +915,9 @@ lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock; kstat_install(lsp->ls_kstat); } + cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL); + mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL); + /* * save open mode so file can be closed properly and vnode counts * updated correctly. @@ -911,8 +983,6 @@ lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, struct lofi_state *lsp; struct lofi_ioctl *klip; minor_t minor; - char namebuf[20]; - dev_t newdev; klip = copy_in_lofi_ioctl(ulip, ioctl_flag); if (klip == NULL) @@ -930,38 +1000,51 @@ lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, return (ENXIO); } lsp = ddi_get_soft_state(lofi_statep, minor); - if (lsp == NULL) { + if (lsp == NULL || lsp->ls_vp == NULL) { mutex_exit(&lofi_lock); free_lofi_ioctl(klip); return (ENXIO); } + if (is_opened(lsp)) { + /* + * If the 'force' flag is set, then we forcibly close the + * underlying file. Subsequent operations will fail, and the + * DKIOCSTATE ioctl will return DKIO_DEV_GONE. When the device + * is last closed, the device will be cleaned up appropriately. + * + * This is complicated by the fact that we may have outstanding + * dispatched I/Os. Rather than having a single mutex to + * serialize all I/O, we keep a count of the number of + * outstanding I/O requests, as well as a flag to indicate that + * no new I/Os should be dispatched. We set the flag, wait for + * the number of outstanding I/Os to reach 0, and then close the + * underlying vnode. 
+ */ + if (klip->li_force) { + mutex_enter(&lsp->ls_vp_lock); + lsp->ls_vp_closereq = B_TRUE; + while (lsp->ls_vp_iocount > 0) + cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); + (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, + credp); + VN_RELE(lsp->ls_vp); + lsp->ls_vp = NULL; + cv_broadcast(&lsp->ls_vp_cv); + mutex_exit(&lsp->ls_vp_lock); + mutex_exit(&lofi_lock); + klip->li_minor = minor; + (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); + free_lofi_ioctl(klip); + return (0); + } mutex_exit(&lofi_lock); free_lofi_ioctl(klip); return (EBUSY); } - /* - * Use saved open mode to properly update vnode counts - */ - (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp); - VN_RELE(lsp->ls_vp); - lsp->ls_vp = NULL; - newdev = makedevice(getmajor(dev), minor); - (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME); - (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME); - (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); - ddi_remove_minor_node(lofi_dip, namebuf); - (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor); - ddi_remove_minor_node(lofi_dip, namebuf); + lofi_free_handle(dev, minor, lsp, credp); - kmem_free(lsp->ls_filename, lsp->ls_filename_sz); - taskq_destroy(lsp->ls_taskq); - if (lsp->ls_kstat) { - kstat_delete(lsp->ls_kstat); - mutex_destroy(&lsp->ls_kstat_lock); - } - ddi_soft_state_free(lofi_statep, minor); klip->li_minor = minor; mutex_exit(&lofi_lock); (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); @@ -973,7 +1056,7 @@ lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, * get the filename given the minor number, or the minor number given * the name. */ -/*ARGSUSED3*/ +/*ARGSUSED*/ static int lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, struct cred *credp, int ioctl_flag) @@ -983,9 +1066,6 @@ lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, int error; minor_t minor; -#ifdef lint - dev = dev; -#endif klip = copy_in_lofi_ioctl(ulip, ioctl_flag); if (klip == NULL) return (EFAULT); @@ -1089,6 +1169,13 @@ lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, if (lsp == NULL) return (ENXIO); + /* + * We explicitly allow DKIOCSTATE, but all other ioctls should fail with + * EIO as if the device was no longer present. + */ + if (lsp->ls_vp == NULL && cmd != DKIOCSTATE) + return (EIO); + /* these are for faking out utilities like newfs */ switch (cmd) { case DKIOCGVTOC: @@ -1125,11 +1212,34 @@ lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, return (EFAULT); return (0); case DKIOCSTATE: - /* the file is always there */ - dkstate = DKIO_INSERTED; - error = ddi_copyout(&dkstate, (void *)arg, - sizeof (enum dkio_state), flag); - if (error) + /* + * Normally, lofi devices are always in the INSERTED state. If + * a device is forcefully unmapped, then the device transitions + * to the DKIO_DEV_GONE state. + */ + if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate), + flag) != 0) + return (EFAULT); + + mutex_enter(&lsp->ls_vp_lock); + while ((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) || + (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) { + /* + * By virtue of having the device open, we know that + * 'lsp' will remain valid when we return. + */ + if (!cv_wait_sig(&lsp->ls_vp_cv, + &lsp->ls_vp_lock)) { + mutex_exit(&lsp->ls_vp_lock); + return (EINTR); + } + } + + dkstate = (lsp->ls_vp != NULL ? 
DKIO_INSERTED : DKIO_DEV_GONE); + mutex_exit(&lsp->ls_vp_lock); + + if (ddi_copyout(&dkstate, (void *)arg, + sizeof (dkstate), flag) != 0) return (EFAULT); return (0); default: diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index a43f9baf5c..90ea1d608f 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -81,11 +81,11 @@ char _depends_on[] = "misc/scsi misc/cmlb"; * Define the interconnect type, to allow the driver to distinguish * between parallel SCSI (sd) and fibre channel (ssd) behaviors. * - * This is really for backward compatability. In the future, the driver + * This is really for backward compatibility. In the future, the driver * should actually check the "interconnect-type" property as reported by * the HBA; however at present this property is not defined by all HBAs, * so we will use this #define (1) to permit the driver to run in - * backward-compatability mode; and (2) to print a notification message + * backward-compatibility mode; and (2) to print a notification message * if an FC HBA does not support the "interconnect-type" property. The * behavior of the driver will be to assume parallel SCSI behaviors unless * the "interconnect-type" property is defined by the HBA **AND** has a @@ -136,7 +136,7 @@ static char *sd_config_list = "sd-config-list"; #if (defined(__fibre)) /* * These #defines are to avoid namespace collisions that occur because this - * code is currently used to compile two seperate driver modules: sd and ssd. + * code is currently used to compile two separate driver modules: sd and ssd. * All global variables need to be treated this way (even if declared static) * in order to allow the debugger to resolve the names properly. * It is anticipated that in the near future the ssd module will be obsoleted, @@ -539,7 +539,7 @@ static sd_tunables tst_properties = { }; #endif -/* This is similiar to the ANSI toupper implementation */ +/* This is similar to the ANSI toupper implementation */ #define SD_TOUPPER(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A' : (C)) /* @@ -797,7 +797,7 @@ static int sd_pm_idletime = 1; #if (defined(__fibre)) /* * These #defines are to avoid namespace collisions that occur because this - * code is currently used to compile two seperate driver modules: sd and ssd. + * code is currently used to compile two separate driver modules: sd and ssd. * All function names need to be treated this way (even if declared static) * in order to allow the debugger to resolve the names properly. * It is anticipated that in the near future the ssd module will be obsoleted, @@ -1674,7 +1674,7 @@ struct sd_sense_info { }; /* - * Table of function pointers for iostart-side routines. Seperate "chains" + * Table of function pointers for iostart-side routines. Separate "chains" * of layered function calls are formed by placing the function pointers * sequentially in the desired order. Functions are called according to an * incrementing table index ordering. The last function in each chain must @@ -1683,9 +1683,9 @@ struct sd_sense_info { * * Note: It may seem more natural to organize both the iostart and iodone * functions together, into an array of structures (or some similar - * organization) with a common index, rather than two seperate arrays which + * organization) with a common index, rather than two separate arrays which * must be maintained in synchronization. 
The purpose of this division is - * to achiece improved performance: individual arrays allows for more + * to achieve improved performance: individual arrays allows for more * effective cache line utilization on certain platforms. */ @@ -2139,7 +2139,7 @@ _init(void) sd_label = mod_modname(&modlinkage); err = ddi_soft_state_init(&sd_state, sizeof (struct sd_lun), - SD_MAXUNIT); + SD_MAXUNIT); if (err != 0) { return (err); @@ -2481,9 +2481,9 @@ sdprobe(dev_info_t *devi) */ if (sd_dtype_optical_bind < 0) { - sd_dtype_optical_bind = ddi_prop_get_int - (DDI_DEV_T_ANY, devi, 0, - "optical-device-bind", 1); + sd_dtype_optical_bind = ddi_prop_get_int + (DDI_DEV_T_ANY, devi, 0, + "optical-device-bind", 1); } if (sd_dtype_optical_bind == 0) { @@ -3611,11 +3611,11 @@ sd_process_sdconf_file(struct sd_lun *un) * * This function reads the data list from the sd.conf file and pulls * the values that can have numeric values as arguments and places - * the values in the apropriate sd_tunables member. + * the values in the appropriate sd_tunables member. * Since the order of the data list members varies across platforms * This function reads them from the data list in a platform specific * order and places them into the correct sd_tunable member that is - * a consistant across all platforms. + * consistent across all platforms. */ static void sd_get_tunables_from_conf(struct sd_lun *un, int flags, int *data_list, @@ -4024,7 +4024,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_NO_READ_HEADER) { un->un_f_cfg_no_read_header = TRUE; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_set_vers1_properties: no_read_header set\n"); + "sd_set_vers1_properties: no_read_header set\n"); } if (flags & SD_CONF_BSET_READ_CD_XD4) { un->un_f_cfg_read_cd_xd4 = TRUE; @@ -4054,7 +4054,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) ASSERT(prop_list != NULL); if (prop_list->sdt_not_rdy_retries) { un->un_notready_retry_count = - prop_list->sdt_not_rdy_retries; + prop_list->sdt_not_rdy_retries; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: not ready retry count" " set to %d\n", un->un_notready_retry_count); @@ -4074,8 +4074,8 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) case CTYPE_CCS: un->un_ctype = prop_list->sdt_ctype; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_set_vers1_properties: ctype set to " - "CTYPE_CCS\n"); + "sd_set_vers1_properties: ctype set to " + "CTYPE_CCS\n"); break; case CTYPE_ROD: /* RW optical */ un->un_ctype = prop_list->sdt_ctype; @@ -4095,7 +4095,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_BSY_RETRY_COUNT) { ASSERT(prop_list != NULL); un->un_busy_retry_count = - prop_list->sdt_busy_retries; + prop_list->sdt_busy_retries; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: " "busy retry count set to %d\n", @@ -4106,7 +4106,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_RST_RETRIES) { ASSERT(prop_list != NULL); un->un_reset_retry_count = - prop_list->sdt_reset_retries; + prop_list->sdt_reset_retries; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: " "reset retry count set to %d\n", @@ -4117,7 +4117,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) if (flags & SD_CONF_BSET_RSV_REL_TIME) { ASSERT(prop_list != NULL); un->un_reserve_release_time = - prop_list->sdt_reserv_rel_time; + 
prop_list->sdt_reserv_rel_time; SD_INFO(SD_LOG_ATTACH_DETACH, un, "sd_set_vers1_properties: " "reservation release timeout set to %d\n", @@ -4177,7 +4177,7 @@ sd_set_vers1_properties(struct sd_lun *un, int flags, sd_tunables *prop_list) /* * Function: sd_is_lsi() * - * Description: Check for lsi devices, step throught the static device + * Description: Check for lsi devices, step through the static device * table to match vid/pid. * * Args: un - ptr to sd_lun @@ -5691,8 +5691,8 @@ sd_pm_idletimeout_handler(void *arg) un->un_pm_idle_timeid = NULL; } else { un->un_pm_idle_timeid = - timeout(sd_pm_idletimeout_handler, un, - (drv_usectohz((clock_t)300000))); /* 300 ms. */ + timeout(sd_pm_idletimeout_handler, un, + (drv_usectohz((clock_t)300000))); /* 300 ms. */ } mutex_exit(&un->un_pm_mutex); mutex_exit(SD_MUTEX(un)); @@ -6509,7 +6509,7 @@ sd_unit_attach(dev_info_t *devi) if (un->un_f_is_fibre == TRUE) { if (scsi_ifgetcap(SD_ADDRESS(un), "scsi-version", 1) == - SCSI_VERSION_3) { + SCSI_VERSION_3) { switch (un->un_interconnect_type) { case SD_INTERCONNECT_FIBRE: case SD_INTERCONNECT_SSA: @@ -6530,7 +6530,7 @@ sd_unit_attach(dev_info_t *devi) /* * Set un_retry_count with SD_RETRY_COUNT, this is ok for Sparc - * with seperate binary for sd and ssd. + * with separate binary for sd and ssd. * * x86 has 1 binary, un_retry_count is set base on connection type. * The hardcoded values will go away when Sparc uses 1 binary @@ -6552,7 +6552,7 @@ sd_unit_attach(dev_info_t *devi) */ un->un_notready_retry_count = ISCD(un) ? CD_NOT_READY_RETRY_COUNT(un) - : DISK_NOT_READY_RETRY_COUNT(un); + : DISK_NOT_READY_RETRY_COUNT(un); /* * Set the busy retry count to the default value of un_retry_count. @@ -6603,16 +6603,16 @@ sd_unit_attach(dev_info_t *devi) un->un_f_allow_bus_device_reset = TRUE; } else { if (ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, - "allow-bus-device-reset", 1) != 0) { + "allow-bus-device-reset", 1) != 0) { un->un_f_allow_bus_device_reset = TRUE; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_unit_attach: un:0x%p Bus device reset enabled\n", - un); + "sd_unit_attach: un:0x%p Bus device reset " + "enabled\n", un); } else { un->un_f_allow_bus_device_reset = FALSE; SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_unit_attach: un:0x%p Bus device reset disabled\n", - un); + "sd_unit_attach: un:0x%p Bus device reset " + "disabled\n", un); } } @@ -7197,11 +7197,12 @@ sd_unit_attach(dev_info_t *devi) * register or not. */ if (un->un_f_is_fibre) { - if (strcmp(un->un_node_type, DDI_NT_BLOCK_CHAN)) { - sd_init_event_callbacks(un); - SD_TRACE(SD_LOG_ATTACH_DETACH, un, - "sd_unit_attach: un:0x%p event callbacks inserted", un); - } + if (strcmp(un->un_node_type, DDI_NT_BLOCK_CHAN)) { + sd_init_event_callbacks(un); + SD_TRACE(SD_LOG_ATTACH_DETACH, un, + "sd_unit_attach: un:0x%p event callbacks inserted", + un); + } } #endif @@ -7324,7 +7325,7 @@ cmlb_attach_failed: } if (un->un_f_is_fibre == FALSE) { - (void) scsi_ifsetcap(SD_ADDRESS(un), "auto-rqsense", 0, 1); + (void) scsi_ifsetcap(SD_ADDRESS(un), "auto-rqsense", 0, 1); } spinup_failed: @@ -7746,8 +7747,8 @@ sd_unit_detach(dev_info_t *devi) */ if (un->un_f_is_fibre == TRUE) { if ((un->un_insert_event != NULL) && - (ddi_remove_event_handler(un->un_insert_cb_id) != - DDI_SUCCESS)) { + (ddi_remove_event_handler(un->un_insert_cb_id) != + DDI_SUCCESS)) { /* * Note: We are returning here after having done * substantial cleanup above. This is consistent @@ -7755,14 +7756,14 @@ sd_unit_detach(dev_info_t *devi) * be the right thing to do. 
*/ SD_ERROR(SD_LOG_ATTACH_DETACH, un, - "sd_dr_detach: Cannot cancel insert event\n"); + "sd_dr_detach: Cannot cancel insert event\n"); goto err_remove_event; } un->un_insert_event = NULL; if ((un->un_remove_event != NULL) && - (ddi_remove_event_handler(un->un_remove_cb_id) != - DDI_SUCCESS)) { + (ddi_remove_event_handler(un->un_remove_cb_id) != + DDI_SUCCESS)) { /* * Note: We are returning here after having done * substantial cleanup above. This is consistent @@ -7770,7 +7771,7 @@ sd_unit_detach(dev_info_t *devi) * be the right thing to do. */ SD_ERROR(SD_LOG_ATTACH_DETACH, un, - "sd_dr_detach: Cannot cancel remove event\n"); + "sd_dr_detach: Cannot cancel remove event\n"); goto err_remove_event; } un->un_remove_event = NULL; @@ -8270,7 +8271,7 @@ sd_cache_control(struct sd_lun *un, int rcd_flag, int wce_flag) * will fail. mode_cache_scsi3 is a superset of mode_caching. */ buflen = hdrlen + MODE_BLK_DESC_LENGTH + - sizeof (struct mode_cache_scsi3); + sizeof (struct mode_cache_scsi3); header = kmem_zalloc(buflen, KM_SLEEP); @@ -8332,8 +8333,8 @@ sd_cache_control(struct sd_lun *un, int rcd_flag, int wce_flag) * length of the sense data returned. */ sbuflen = hdrlen + MODE_BLK_DESC_LENGTH + - sizeof (struct mode_page) + - (int)mode_caching_page->mode_page.length; + sizeof (struct mode_page) + + (int)mode_caching_page->mode_page.length; /* * Set the caching bits as requested. @@ -8353,7 +8354,7 @@ sd_cache_control(struct sd_lun *un, int rcd_flag, int wce_flag) * drive supports it. */ save_pg = mode_caching_page->mode_page.ps ? - SD_SAVE_PAGE : SD_DONTSAVE_PAGE; + SD_SAVE_PAGE : SD_DONTSAVE_PAGE; /* Clear reserved bits before mode select. */ mode_caching_page->mode_page.ps = 0; @@ -8964,7 +8965,7 @@ sdopen(dev_t *dev_p, int flag, int otyp, cred_t *cred_p) cp = &un->un_ocmap.chkd[0]; while (cp < &un->un_ocmap.chkd[OCSIZE]) { if (*cp != (uchar_t)0) { - break; + break; } cp++; } @@ -9116,7 +9117,7 @@ sdclose(dev_t dev, int flag, int otyp, cred_t *cred_p) if (un->un_state == SD_STATE_OFFLINE) { if (un->un_f_is_fibre == FALSE) { scsi_log(SD_DEVINFO(un), sd_label, - CE_WARN, "offline\n"); + CE_WARN, "offline\n"); } mutex_exit(SD_MUTEX(un)); cmlb_invalidate(un->un_cmlbhandle, @@ -9838,7 +9839,7 @@ sdawrite(dev_t dev, struct aio_req *aio, cred_t *cred_p) * +----> SCSA ---->+ * * - * This code is based upon the following presumtions: + * This code is based upon the following presumptions: * * - iostart and iodone functions operate on buf(9S) structures. These * functions perform the necessary operations on the buf(9S) and pass @@ -9903,7 +9904,7 @@ static int sd_taskq_maxalloc = SD_TASKQ_MAXALLOC; /* * The following task queue is being created for the write part of * read-modify-write of non-512 block size devices. - * Limit the number of threads to 1 for now. This number has been choosen + * Limit the number of threads to 1 for now. This number has been chosen * considering the fact that it applies only to dvd ram drives/MO drives * currently. Performance for which is not main criteria at this stage. * Note: It needs to be explored if we can use a single taskq in future @@ -10487,7 +10488,7 @@ sd_uscsi_iodone(int index, struct sd_lun *un, struct buf *bp) /* * Function: sd_mapblockaddr_iostart * - * Description: Verify request lies withing the partition limits for + * Description: Verify request lies within the partition limits for * the indicated minor device. Issue "overrun" buf if * request would exceed partition range. Converts * partition-relative block address to absolute. 
@@ -10610,7 +10611,7 @@ sd_mapblockaddr_iostart(int index, struct sd_lun *un, struct buf *bp) ASSERT(bp->b_bcount >= resid); bp = sd_bioclone_alloc(bp, count, blocknum, - (int (*)(struct buf *)) sd_mapblockaddr_iodone); + (int (*)(struct buf *)) sd_mapblockaddr_iodone); xp = SD_GET_XBUF(bp); /* Update for 'new' bp! */ ASSERT(xp != NULL); } @@ -11756,7 +11757,7 @@ sd_setup_rw_pkt(struct sd_lun *un, */ blockcount -= SD_BYTES2TGTBLOCKS(un, - return_pktp->pkt_resid); + return_pktp->pkt_resid); } cdbp = (union scsi_cdb *)return_pktp->pkt_cdbp; @@ -11767,7 +11768,7 @@ sd_setup_rw_pkt(struct sd_lun *un, */ cdbp->scc_cmd = cp->sc_grpmask | ((bp->b_flags & B_READ) ? - SCMD_READ : SCMD_WRITE); + SCMD_READ : SCMD_WRITE); SD_FILL_SCSI1_LUN(un, return_pktp); @@ -12738,7 +12739,7 @@ sd_start_cmds(struct sd_lun *un, struct buf *immed_bp) if ((un->un_state != SD_STATE_SUSPENDED) && (un->un_state != SD_STATE_PM_CHANGING)) { New_state(un, SD_STATE_NORMAL); - } + } xp = SD_GET_XBUF(bp); ASSERT(xp != NULL); @@ -13012,8 +13013,8 @@ got_pkt: SD_UPDATE_KSTATS(un, kstat_runq_exit, bp); bp = sd_mark_rqs_idle(un, xp); sd_retry_command(un, bp, SD_RETRIES_STANDARD, - NULL, NULL, EIO, SD_BSY_TIMEOUT / 500, - kstat_waitq_enter); + NULL, NULL, EIO, SD_BSY_TIMEOUT / 500, + kstat_waitq_enter); goto exit; } @@ -13081,7 +13082,7 @@ got_pkt: * for this condition? */ sd_set_retry_bp(un, bp, SD_BSY_TIMEOUT / 500, - kstat_runq_back_to_waitq); + kstat_runq_back_to_waitq); goto exit; case TRAN_FATAL_ERROR: @@ -13180,8 +13181,8 @@ sd_return_command(struct sd_lun *un, struct buf *bp) * Note:x86: check for the "sdrestart failed" case. */ if (((xp->xb_pkt_flags & SD_XB_USCSICMD) != SD_XB_USCSICMD) && - (geterror(bp) == 0) && (xp->xb_dma_resid != 0) && - (xp->xb_pktp->pkt_resid == 0)) { + (geterror(bp) == 0) && (xp->xb_dma_resid != 0) && + (xp->xb_pktp->pkt_resid == 0)) { if (sd_setup_next_xfer(un, bp, pktp, xp) != 0) { /* @@ -13407,7 +13408,7 @@ sd_return_failed_command_no_restart(struct sd_lun *un, struct buf *bp, * is queued for a delayed retry. May be NULL if no kstat * update is desired. * - * Context: May be called from interupt context. + * Context: May be called from interrupt context. */ static void @@ -13639,7 +13640,7 @@ sd_retry_command(struct sd_lun *un, struct buf *bp, int retry_check_flag, xp->xb_ua_retry_count++; SD_TRACE(SD_LOG_IO_CORE | SD_LOG_ERROR, un, "sd_retry_command: retry count:%d\n", - xp->xb_ua_retry_count); + xp->xb_ua_retry_count); break; case SD_RETRIES_BUSY: @@ -14220,22 +14221,22 @@ sd_alloc_rqs(struct scsi_device *devp, struct sd_lun *un) switch (scsi_ifgetcap(SD_ADDRESS(un), "auto-rqsense", 1)) { case 0: SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: HBA supports ARQ\n"); + "sd_alloc_rqs: HBA supports ARQ\n"); /* * ARQ is supported by this HBA but currently is not * enabled. Attempt to enable it and if successful then * mark this instance as ARQ enabled. */ if (scsi_ifsetcap(SD_ADDRESS(un), "auto-rqsense", 1, 1) - == 1) { + == 1) { /* Successfully enabled ARQ in the HBA */ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: ARQ enabled\n"); + "sd_alloc_rqs: ARQ enabled\n"); un->un_f_arq_enabled = TRUE; } else { /* Could not enable ARQ in the HBA */ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: failed ARQ enable\n"); + "sd_alloc_rqs: failed ARQ enable\n"); un->un_f_arq_enabled = FALSE; } break; @@ -14245,7 +14246,7 @@ sd_alloc_rqs(struct scsi_device *devp, struct sd_lun *un) * Just mark ARQ as enabled for this instance. 
*/ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: ARQ already enabled\n"); + "sd_alloc_rqs: ARQ already enabled\n"); un->un_f_arq_enabled = TRUE; break; default: @@ -14254,7 +14255,7 @@ sd_alloc_rqs(struct scsi_device *devp, struct sd_lun *un) * instance. */ SD_INFO(SD_LOG_ATTACH_DETACH, un, - "sd_alloc_rqs: HBA does not support ARQ\n"); + "sd_alloc_rqs: HBA does not support ARQ\n"); un->un_f_arq_enabled = FALSE; break; } @@ -14304,7 +14305,7 @@ sd_free_rqs(struct sd_lun *un) /* * Function: sd_reduce_throttle * - * Description: Reduces the maximun # of outstanding commands on a + * Description: Reduces the maximum # of outstanding commands on a * target to the current number of outstanding commands. * Queues a tiemout(9F) callback to restore the limit * after a specified interval has elapsed. @@ -14344,7 +14345,7 @@ sd_reduce_throttle(struct sd_lun *un, int throttle_type) } if (un->un_ncmds_in_transport > 0) { - un->un_throttle = un->un_ncmds_in_transport; + un->un_throttle = un->un_ncmds_in_transport; } } else { @@ -14423,9 +14424,10 @@ sd_restore_throttle(void *arg) (throttle < un->un_saved_throttle) ? throttle : un->un_saved_throttle; if (un->un_throttle < un->un_saved_throttle) { - un->un_reset_throttle_timeid = - timeout(sd_restore_throttle, - un, SD_QFULL_THROTTLE_RESET_INTERVAL); + un->un_reset_throttle_timeid = + timeout(sd_restore_throttle, + un, + SD_QFULL_THROTTLE_RESET_INTERVAL); } } } @@ -14565,11 +14567,16 @@ sdintr(struct scsi_pkt *pktp) #endif /* - * If pkt_reason is CMD_DEV_GONE, just fail the command + * If pkt_reason is CMD_DEV_GONE, fail the command, and update the media + * state if needed. */ if (pktp->pkt_reason == CMD_DEV_GONE) { scsi_log(SD_DEVINFO(un), sd_label, CE_CONT, - "Device is gone\n"); + "Device is gone\n"); + if (un->un_mediastate != DKIO_DEV_GONE) { + un->un_mediastate = DKIO_DEV_GONE; + cv_broadcast(&un->un_state_cv); + } sd_return_failed_command(un, bp, EIO); goto exit; } @@ -14682,7 +14689,7 @@ sdintr(struct scsi_pkt *pktp) } else if (xp->xb_pkt_flags & SD_XB_USCSICMD) { SD_UPDATE_B_RESID(bp, pktp); SD_TRACE(SD_LOG_IO_CORE | SD_LOG_ERROR, un, - "sdintr: returning uscsi command\n"); + "sdintr: returning uscsi command\n"); } else { goto not_successful; } @@ -15320,7 +15327,7 @@ sense_failed: */ sd_retry_command(un, bp, SD_RETRIES_STANDARD, sd_print_sense_failed_msg, msgp, EIO, - un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, NULL); + un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, NULL); #else sd_retry_command(un, bp, SD_RETRIES_STANDARD, sd_print_sense_failed_msg, msgp, EIO, SD_RETRY_DELAY, NULL); @@ -15566,7 +15573,7 @@ sd_print_sense_msg(struct sd_lun *un, struct buf *bp, void *arg, int code) sensep = xp->xb_sense_data; if (scsi_sense_info_uint64(sensep, SENSE_LENGTH, - (uint64_t *)&err_blkno)) { + (uint64_t *)&err_blkno)) { /* * We retrieved the error block number from the information * portion of the sense data. 
@@ -15657,7 +15664,7 @@ sd_sense_key_no_sense(struct sd_lun *un, struct buf *bp, SD_UPDATE_ERRSTATS(un, sd_softerrs); sd_retry_command(un, bp, SD_RETRIES_STANDARD, sd_print_sense_msg, - &si, EIO, (clock_t)0, NULL); + &si, EIO, (clock_t)0, NULL); } @@ -15803,21 +15810,21 @@ sd_sense_key_not_ready(struct sd_lun *un, */ if (un->un_f_is_fibre == TRUE) { if (((sd_level_mask & SD_LOGMASK_DIAG) || - (xp->xb_retry_count > 0)) && - (un->un_startstop_timeid == NULL)) { + (xp->xb_retry_count > 0)) && + (un->un_startstop_timeid == NULL)) { scsi_log(SD_DEVINFO(un), sd_label, - CE_WARN, "logical unit not ready, " - "resetting disk\n"); + CE_WARN, "logical unit not ready, " + "resetting disk\n"); sd_reset_target(un, pktp); } } else { if (((sd_level_mask & SD_LOGMASK_DIAG) || - (xp->xb_retry_count > - un->un_reset_retry_count)) && - (un->un_startstop_timeid == NULL)) { + (xp->xb_retry_count > + un->un_reset_retry_count)) && + (un->un_startstop_timeid == NULL)) { scsi_log(SD_DEVINFO(un), sd_label, - CE_WARN, "logical unit not ready, " - "resetting disk\n"); + CE_WARN, "logical unit not ready, " + "resetting disk\n"); sd_reset_target(un, pktp); } } @@ -16856,8 +16863,8 @@ sd_pkt_status_check_condition(struct sd_lun *un, struct buf *bp, * when SD_RETRY_DELAY change in sddef.h */ sd_retry_command(un, bp, SD_RETRIES_STANDARD, NULL, NULL, EIO, - un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, - NULL); + un->un_f_is_fibre?drv_usectohz(100000):(clock_t)0, + NULL); #else sd_retry_command(un, bp, SD_RETRIES_STANDARD, NULL, NULL, EIO, SD_RETRY_DELAY, NULL); @@ -17821,13 +17828,13 @@ sd_send_scsi_START_STOP_UNIT(struct sd_lun *un, int flag, int path_flag) case STATUS_CHECK: if (ucmd_buf.uscsi_rqstatus == STATUS_GOOD) { switch (scsi_sense_key( - (uint8_t *)&sense_buf)) { + (uint8_t *)&sense_buf)) { case KEY_ILLEGAL_REQUEST: status = ENOTSUP; break; case KEY_NOT_READY: if (scsi_sense_asc( - (uint8_t *)&sense_buf) + (uint8_t *)&sense_buf) == 0x3A) { status = ENXIO; } @@ -18111,7 +18118,7 @@ sd_send_scsi_TEST_UNIT_READY(struct sd_lun *un, int flag) } if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key((uint8_t *)&sense_buf) == - KEY_NOT_READY) && + KEY_NOT_READY) && (scsi_sense_asc((uint8_t *)&sense_buf) == 0x3A)) { status = ENXIO; } @@ -18200,7 +18207,7 @@ sd_send_scsi_PERSISTENT_RESERVE_IN(struct sd_lun *un, uchar_t usr_cmd, case STATUS_CHECK: if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key((uint8_t *)&sense_buf) == - KEY_ILLEGAL_REQUEST)) { + KEY_ILLEGAL_REQUEST)) { status = ENOTSUP; } break; @@ -18345,7 +18352,7 @@ sd_send_scsi_PERSISTENT_RESERVE_OUT(struct sd_lun *un, uchar_t usr_cmd, case STATUS_CHECK: if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key((uint8_t *)&sense_buf) == - KEY_ILLEGAL_REQUEST)) { + KEY_ILLEGAL_REQUEST)) { status = ENOTSUP; } break; @@ -18493,7 +18500,7 @@ sd_send_scsi_SYNCHRONIZE_CACHE_biodone(struct buf *bp) case STATUS_CHECK: if ((uscmd->uscsi_rqstatus == STATUS_GOOD) && (scsi_sense_key(sense_buf) == - KEY_ILLEGAL_REQUEST)) { + KEY_ILLEGAL_REQUEST)) { /* Ignore Illegal Request error */ mutex_enter(SD_MUTEX(un)); un->un_f_sync_cache_supported = FALSE; @@ -18627,7 +18634,7 @@ sd_send_scsi_GET_CONFIGURATION(struct sd_lun *un, struct uscsi_cmd *ucmdbuf, * Function: sd_send_scsi_feature_GET_CONFIGURATION * * Description: Issues the get configuration command to the device to - * retrieve a specfic feature. Called from + * retrieve a specific feature. Called from * sd_check_for_writable_cd & sd_set_mmc_caps. 
* Arguments: un * ucmdbuf @@ -19934,7 +19941,7 @@ skip_ready_valid: * the drive speed. Thus EINVAL would be returned * if a set request was made for an mmc device. * We no longer support get or set speed for - * mmc but need to remain consistant with regard + * mmc but need to remain consistent with regard * to the error code returned. */ err = EINVAL; @@ -20030,7 +20037,7 @@ skip_ready_valid: if (!un->un_f_sync_cache_supported || !un->un_f_write_cache_enabled) { err = un->un_f_sync_cache_supported ? - 0 : ENOTSUP; + 0 : ENOTSUP; mutex_exit(SD_MUTEX(un)); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback != NULL) { @@ -20135,7 +20142,7 @@ skip_ready_valid: mutex_exit(SD_MUTEX(un)); err = sd_cache_control(un, SD_CACHE_NOCHANGE, - SD_CACHE_ENABLE); + SD_CACHE_ENABLE); mutex_enter(SD_MUTEX(un)); @@ -20194,7 +20201,7 @@ sd_dkio_ctrl_info(dev_t dev, caddr_t arg, int flag) } info = (struct dk_cinfo *) - kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); + kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); switch (un->un_ctype) { case CTYPE_CDROM: @@ -20302,8 +20309,8 @@ sd_get_media_info(dev_t dev, caddr_t arg, int flag) /* Allow SCMD_GET_CONFIGURATION to MMC devices only */ if (un->un_f_mmc_cap == TRUE) { rtn = sd_send_scsi_GET_CONFIGURATION(un, &com, rqbuf, - SENSE_LENGTH, out_data, SD_PROFILE_HEADER_LEN, - SD_PATH_STANDARD); + SENSE_LENGTH, out_data, SD_PROFILE_HEADER_LEN, + SD_PATH_STANDARD); if (rtn) { /* @@ -21395,17 +21402,17 @@ sd_mhdioc_inresv(dev_t dev, caddr_t arg, int flag) * SCSI-2 * The cluster software takes ownership of a multi-hosted disk by issuing the * MHIOCTKOWN ioctl to the disk driver. It releases ownership by issuing the - * MHIOCRELEASE ioctl.Closely related is the MHIOCENFAILFAST ioctl -- a cluster, - * just after taking ownership of the disk with the MHIOCTKOWN ioctl then issues - * the MHIOCENFAILFAST ioctl. This ioctl "enables failfast" in the driver. The - * meaning of failfast is that if the driver (on this host) ever encounters the - * scsi error return code RESERVATION_CONFLICT from the device, it should - * immediately panic the host. The motivation for this ioctl is that if this - * host does encounter reservation conflict, the underlying cause is that some - * other host of the cluster has decided that this host is no longer in the - * cluster and has seized control of the disks for itself. Since this host is no - * longer in the cluster, it ought to panic itself. The MHIOCENFAILFAST ioctl - * does two things: + * MHIOCRELEASE ioctl. Closely related is the MHIOCENFAILFAST ioctl -- a + * cluster, just after taking ownership of the disk with the MHIOCTKOWN ioctl + * then issues the MHIOCENFAILFAST ioctl. This ioctl "enables failfast" in the + * driver. The meaning of failfast is that if the driver (on this host) ever + * encounters the scsi error return code RESERVATION_CONFLICT from the device, + * it should immediately panic the host. The motivation for this ioctl is that + * if this host does encounter reservation conflict, the underlying cause is + * that some other host of the cluster has decided that this host is no longer + * in the cluster and has seized control of the disks for itself. Since this + * host is no longer in the cluster, it ought to panic itself. 
The + * MHIOCENFAILFAST ioctl does two things: * (a) it sets a flag that will cause any returned RESERVATION_CONFLICT * error to panic the host * (b) it sets up a periodic timer to test whether this host still has @@ -22498,7 +22505,7 @@ sddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) if (sd_send_polled_RQS(un) == SD_FAILURE) { SD_INFO(SD_LOG_DUMP, un, - "sddump: sd_send_polled_RQS failed\n"); + "sddump: sd_send_polled_RQS failed\n"); } mutex_enter(SD_MUTEX(un)); } @@ -22530,8 +22537,8 @@ sddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) tgt_blkno = tgt_byte_offset / un->un_tgt_blocksize; tgt_nblk = ((tgt_byte_offset + tgt_byte_count + - (un->un_tgt_blocksize - 1)) / - un->un_tgt_blocksize) - tgt_blkno; + (un->un_tgt_blocksize - 1)) / + un->un_tgt_blocksize) - tgt_blkno; /* * Invoke the routine which is going to do read part @@ -22604,8 +22611,8 @@ sddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) #if defined(__i386) || defined(__amd64) blkno = oblkno + - ((wr_bp->b_bcount - dma_resid) / - un->un_tgt_blocksize); + ((wr_bp->b_bcount - dma_resid) / + un->un_tgt_blocksize); nblk = dma_resid / un->un_tgt_blocksize; if (wr_pktp) { @@ -23025,7 +23032,7 @@ sd_ddi_scsi_poll(struct scsi_pkt *pkt) } else if ((sensep != NULL) && (scsi_sense_key(sensep) == - KEY_UNIT_ATTENTION)) { + KEY_UNIT_ATTENTION)) { /* Unit Attention - try again */ busy_count += (SD_SEC_TO_CSEC - 1); /* 1 */ continue; @@ -24453,7 +24460,7 @@ sr_read_tocentry(dev_t dev, caddr_t data, int flag) * READ HEADER command failed, since this is * obsoleted in one spec, its better to return * -1 for an invlid track so that we can still - * recieve the rest of the TOC data. + * receive the rest of the TOC data. */ entry->cdte_datamode = (uchar_t)-1; } @@ -26486,7 +26493,7 @@ sd_setup_next_xfer(struct sd_lun *un, struct buf *bp, /* * Function: sd_panic_for_res_conflict * - * Description: Call panic with a string formated with "Reservation Conflict" + * Description: Call panic with a string formatted with "Reservation Conflict" * and a human readable identifier indicating the SD instance * that experienced the reservation conflict. * @@ -26526,7 +26533,7 @@ static uint_t sd_fault_injection_on = 0; * faultinjection ioctls to inject errors into the * layer model * - * Arguments: cmd - the ioctl cmd recieved + * Arguments: cmd - the ioctl cmd received * arg - the arguments from user and returns */ @@ -26878,7 +26885,7 @@ sd_faultinjection(struct scsi_pkt *pktp) /* if injection is off return */ if (sd_fault_injection_on == 0 || - un->sd_fi_fifo_start == un->sd_fi_fifo_end) { + un->sd_fi_fifo_start == un->sd_fi_fifo_end) { mutex_exit(SD_MUTEX(un)); return; } @@ -27164,7 +27171,7 @@ sd_faultinjection(struct scsi_pkt *pktp) * Firewire hard disks now have partition kstats * * ------------------------------------------------------ - * removable media hotplugable | kstat + * removable media hotpluggable | kstat * ------------------------------------------------------ * false false | Yes * false true | Yes @@ -27366,7 +27373,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) */ un->un_f_pkstats_enabled = (ddi_prop_get_int(DDI_DEV_T_ANY, SD_DEVINFO(un), DDI_PROP_DONTPASS, - "enable-partition-kstats", 1)); + "enable-partition-kstats", 1)); /* * Check if HBA has set the "pm-capable" property. 
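
The sd and lofi changes above converge on the same contract: when the backing device disappears, the driver moves its media state to DKIO_DEV_GONE and wakes any DKIOCSTATE waiters. A minimal user-level sketch of that contract follows; it is illustrative only, and the device path, open flags, and error handling are assumptions rather than part of this change:

/*
 * Hypothetical sketch: block in DKIOCSTATE until a disk or lofi device
 * reports DKIO_DEV_GONE.  The caller passes in the last state it saw;
 * the driver returns only when the media state differs.
 */
#include <sys/types.h>
#include <sys/dkio.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
wait_for_removal(const char *rdev)
{
	int fd;
	enum dkio_state state = DKIO_INSERTED;

	/* O_NDELAY lets the open succeed even if no media is present. */
	if ((fd = open(rdev, O_RDONLY | O_NDELAY)) < 0) {
		perror("open");
		return (-1);
	}

	for (;;) {
		/*
		 * DKIOCSTATE copies in the last observed state, blocks
		 * until the device's state changes, and copies the new
		 * state back out.  With the changes above, a forcibly
		 * unmapped lofi device or a removed sd target eventually
		 * reports DKIO_DEV_GONE here.
		 */
		if (ioctl(fd, DKIOCSTATE, &state) < 0) {
			perror("DKIOCSTATE");
			(void) close(fd);
			return (-1);
		}
		if (state == DKIO_DEV_GONE) {
			(void) printf("%s: device gone\n", rdev);
			break;
		}
	}

	(void) close(fd);
	return (0);
}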
diff --git a/usr/src/uts/common/sys/dditypes.h b/usr/src/uts/common/sys/dditypes.h index f38a1c29d1..52b6198972 100644 --- a/usr/src/uts/common/sys/dditypes.h +++ b/usr/src/uts/common/sys/dditypes.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,9 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/isa_defs.h> +#ifndef _ASM +#include <sys/types.h> +#endif #ifdef __cplusplus extern "C" { diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h index aa5c7ee0d7..8af2701aff 100644 --- a/usr/src/uts/common/sys/fm/fs/zfs.h +++ b/usr/src/uts/common/sys/fm/fs/zfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_RESOURCE_OK "ok" +#define FM_RESOURCE_REMOVED "removed" +#define FM_RESOURCE_AUTOREPLACE "autoreplace" #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 354e837212..deecc0d36a 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -54,7 +54,7 @@ typedef enum { /* * Properties are identified by these constants and must be added to the - * end of this list to ensure that external conumsers are not affected + * end of this list to ensure that external consumers are not affected * by the change. The property list also determines how 'zfs get' will * display them. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zfs_prop.c. 
@@ -96,11 +96,16 @@ typedef enum { ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, - ZFS_PROP_BOOTFS + ZPOOL_PROP_BOOTFS, + ZPOOL_PROP_AUTOREPLACE, + ZPOOL_PROP_NAME } zfs_prop_t; typedef zfs_prop_t zpool_prop_t; +#define ZPOOL_PROP_CONT ZFS_PROP_CONT +#define ZPOOL_PROP_INVAL ZFS_PROP_INVAL + #define ZFS_PROP_VALUE "value" #define ZFS_PROP_SOURCE "source" @@ -123,17 +128,18 @@ boolean_t zfs_prop_user(const char *); int zfs_prop_readonly(zfs_prop_t); const char *zfs_prop_default_string(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); -const char *zpool_prop_to_name(zfs_prop_t); +const char *zpool_prop_to_name(zpool_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); int zfs_prop_inheritable(zfs_prop_t); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); +uint64_t zpool_prop_default_numeric(zpool_prop_t); /* * Property Iterator */ typedef zfs_prop_t (*zfs_prop_f)(zfs_prop_t, void *); -typedef zfs_prop_f zpool_prop_f; +typedef zpool_prop_t (*zpool_prop_f)(zpool_prop_t, void *); extern zfs_prop_t zfs_prop_iter(zfs_prop_f, void *, boolean_t); extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); @@ -201,7 +207,6 @@ extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_STATS "stats" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" -#define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" @@ -210,6 +215,17 @@ extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t); #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSPARE "unspare" +#define ZPOOL_CONFIG_PHYS_PATH "phys_path" +/* + * The persistent vdev state is stored as separate values rather than a single + * 'vdev_state' entry. This is because a device can be in multiple states, such + * as offline and degraded. + */ +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_FAULTED "faulted" +#define ZPOOL_CONFIG_DEGRADED "degraded" +#define ZPOOL_CONFIG_REMOVED "removed" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -243,11 +259,15 @@ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; +#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY + /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
@@ -262,7 +282,8 @@ typedef enum vdev_aux { VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ - VDEV_AUX_SPARED /* hot spare used in another pool */ + VDEV_AUX_SPARED, /* hot spare used in another pool */ + VDEV_AUX_ERR_EXCEEDED /* too many errors */ } vdev_aux_t; /* @@ -369,8 +390,7 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_LOG_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, - ZFS_IOC_VDEV_ONLINE, - ZFS_IOC_VDEV_OFFLINE, + ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, @@ -427,6 +447,39 @@ typedef enum { #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" +/* + * Flags for ZFS_IOC_VDEV_SET_STATE + */ +#define ZFS_ONLINE_CHECKREMOVE 0x1 +#define ZFS_ONLINE_UNSPARE 0x2 +#define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_OFFLINE_TEMPORARY 0x1 + +/* + * Sysevent payload members. ZFS will generate the following sysevents with the + * given payloads: + * + * ESC_ZFS_RESILVER_START + * ESC_ZFS_RESILVER_END + * ESC_ZFS_POOL_DESTROY + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_VDEV_REMOVE + * ESC_ZFS_VDEV_CLEAR + * ESC_ZFS_VDEV_CHECK + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) + * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 + */ +#define ZFS_EV_POOL_NAME "pool_name" +#define ZFS_EV_POOL_GUID "pool_guid" +#define ZFS_EV_VDEV_PATH "vdev_path" +#define ZFS_EV_VDEV_GUID "vdev_guid" + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/lofi.h b/usr/src/uts/common/sys/lofi.h index a5f0eb1d97..362af884e3 100644 --- a/usr/src/uts/common/sys/lofi.h +++ b/usr/src/uts/common/sys/lofi.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -92,13 +91,20 @@ extern "C" { * ioctl(ld, LOFI_GET_MAXMINOR, &li); * maxminor = li.li_minor; * + * If the 'li_force' flag is set for any of the LOFI_UNMAP_* commands, then if + * the device is busy, the underlying vnode will be closed, and any subsequent + * operations will fail. It will behave as if the device had been forcibly + * removed, so the DKIOCSTATE ioctl will return DKIO_DEV_GONE. When the device + * is last closed, it will be torn down. + * * Oh, and last but not least: these ioctls are totally private and only * for use by lofiadm(1M). 
* */ struct lofi_ioctl { - uint32_t li_minor; + uint32_t li_minor; + boolean_t li_force; char li_filename[MAXPATHLEN + 1]; }; @@ -134,9 +140,13 @@ extern uint32_t lofi_max_files; ((vtype == VREG) || (vtype == VBLK) || (vtype == VCHR)) struct lofi_state { - char *ls_filename; /* filename to open */ - size_t ls_filename_sz; - struct vnode *ls_vp; /* open vnode */ + char *ls_filename; /* filename to open */ + size_t ls_filename_sz; + struct vnode *ls_vp; /* open vnode */ + kmutex_t ls_vp_lock; /* protects ls_vp */ + kcondvar_t ls_vp_cv; /* signal changes to ls_vp */ + uint32_t ls_vp_iocount; /* # pending I/O requests */ + boolean_t ls_vp_closereq; /* force close requested */ u_offset_t ls_vp_size; uint32_t ls_blk_open; uint32_t ls_chr_open; diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index 7e8eff763f..69f01b9af4 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -51,6 +51,7 @@ extern "C" { #define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */ #define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */ #define EC_FM "EC_fm" /* FMA error report event */ +#define EC_ZFS "EC_zfs" /* ZFS event */ /* * The following event class is reserved for exclusive use @@ -215,6 +216,17 @@ extern "C" { #define ESC_ACPIEV_LOW "ESC_acpiev_low" #define ESC_ACPIEV_STATE_CHANGE "ESC_acpiev_state_change" +/* + * ZFS subclass definitions. supporting attributes (name/value paris) are found + * in sys/fs/zfs.h + */ +#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start" +#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish" +#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove" +#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy" +#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" +#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" + #ifdef __cplusplus } #endif |
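
The new EC_ZFS sysevent class and ESC_ZFS_* subclasses give userland a way to observe pool topology changes without polling. A rough consumer sketch using libsysevent(3LIB) is shown below; the subclass list, required privileges, and link flags (-lsysevent -lnvpair) are assumptions about typical usage, not something this commit itself delivers:

/*
 * Hypothetical sketch: subscribe to EC_ZFS sysevents and print the
 * payload members defined in sys/fs/zfs.h.  Must run with sufficient
 * privilege to bind a sysevent handle.
 */
#include <libsysevent.h>
#include <libnvpair.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/fs/zfs.h>
#include <stdio.h>
#include <unistd.h>

static void
zfs_event_handler(sysevent_t *ev)
{
	nvlist_t *attrs;
	char *pool = NULL;
	char *vdev = NULL;

	if (sysevent_get_attr_list(ev, &attrs) != 0)
		return;

	/* ZFS_EV_VDEV_PATH is optional; pool-level events omit it. */
	(void) nvlist_lookup_string(attrs, ZFS_EV_POOL_NAME, &pool);
	(void) nvlist_lookup_string(attrs, ZFS_EV_VDEV_PATH, &vdev);

	(void) printf("%s: pool=%s vdev=%s\n",
	    sysevent_get_subclass_name(ev),
	    pool != NULL ? pool : "-",
	    vdev != NULL ? vdev : "-");

	nvlist_free(attrs);
}

int
main(void)
{
	const char *subclasses[] = {
		ESC_ZFS_RESILVER_START,
		ESC_ZFS_RESILVER_FINISH,
		ESC_ZFS_POOL_DESTROY,
		ESC_ZFS_VDEV_REMOVE,
		ESC_ZFS_VDEV_CLEAR,
		ESC_ZFS_VDEV_CHECK
	};
	sysevent_handle_t *shp;

	if ((shp = sysevent_bind_handle(zfs_event_handler)) == NULL) {
		perror("sysevent_bind_handle");
		return (1);
	}

	if (sysevent_subscribe_event(shp, EC_ZFS, subclasses, 6) != 0) {
		perror("sysevent_subscribe_event");
		sysevent_unbind_handle(shp);
		return (1);
	}

	/* Events are delivered on a separate thread; just wait. */
	for (;;)
		(void) pause();
}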