diff options
author | Keith M Wesolowski <wesolows@foobazco.org> | 2013-08-29 22:55:15 +0000 |
---|---|---|
committer | Keith M Wesolowski <wesolows@foobazco.org> | 2013-08-29 22:55:25 +0000 |
commit | 72556f5bbf7b7abd71cb57d6362d62790417afa6 (patch) | |
tree | 1631d3ecbb62cf38bc45c02823ba5f5d99cbaf8c /usr/src | |
parent | e9c1240c4cdcc39caeef0ce9cb97380612ae0f9d (diff) | |
parent | 22e30981d82a0b6dc89253596ededafae8655e00 (diff) | |
download | illumos-joyent-72556f5bbf7b7abd71cb57d6362d62790417afa6.tar.gz |
[illumos-gate merge]
commit 22e30981d82a0b6dc89253596ededafae8655e00
3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
commit 3a9fcf3eb769315c8984c438833b8b1453206fa4
4049 filesystem(5) documents /usr/bin/amd64 twice
commit 511588bb13d2462265d682dc1cb7ba5c7a27a771
4048 cpu_acpi is too verbose about disabled SpeedStep/PowerNow! support
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/man/man5/filesystem.5 | 16 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 113 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/metaslab_impl.h | 5 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_ioctl.c | 2 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 5 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/cpupm/cpu_acpi.c | 10 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/cpupm/pwrnow.c | 10 | ||||
-rw-r--r-- | usr/src/uts/i86pc/os/cpupm/speedstep.c | 10 |
8 files changed, 132 insertions, 39 deletions
diff --git a/usr/src/man/man5/filesystem.5 b/usr/src/man/man5/filesystem.5 index fc01037b68..53cf68b47e 100644 --- a/usr/src/man/man5/filesystem.5 +++ b/usr/src/man/man5/filesystem.5 @@ -4,7 +4,7 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH FILESYSTEM 5 "Aug 24, 2009" +.TH FILESYSTEM 5 "Aug 26, 2013" .SH NAME filesystem \- File system organization .SH SYNOPSIS @@ -2049,20 +2049,6 @@ add-on system software or for applications is .sp .ne 2 .na -\fB\fB/usr/bin/amd64\fR\fR -.ad -.sp .6 -.RS 4n -x86 platform-dependent, user-invoked executables. This directory should not be -part of a user's \fB$PATH\fR. A wrapper in \fB/usr/bin\fR should invoke the -executable in this directory. See \fBisaexec\fR(3C). An approved installation -location for bundled Solaris software. The analogous location for add-on system -software or for applications is \fB/opt/\fIpackagename\fR/bin/amd64\fR. -.RE - -.sp -.ne 2 -.na \fB\fB/usr/bin/\fIsubsystem\fR\fR\fR .ad .sp .6 diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index d297bc9ad8..c7ff80ffa8 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -58,9 +58,25 @@ int zfs_condense_pct = 200; /* * This value defines the number of allowed allocation failures per vdev. 
* If a device reaches this threshold in a given txg then we consider skipping - * allocations on that device. + * allocations on that device. The value of zfs_mg_alloc_failures is computed + * in zio_init() unless it has been overridden in /etc/system. */ -int zfs_mg_alloc_failures; +int zfs_mg_alloc_failures = 0; + +/* + * The zfs_mg_noalloc_threshold defines which metaslab groups should + * be eligible for allocation. The value is defined as a percentage of + * a free space. Metaslab groups that have more free space than + * zfs_mg_noalloc_threshold are always eligible for allocations. Once + * a metaslab group's free space is less than or equal to the + * zfs_mg_noalloc_threshold the allocator will avoid allocating to that + * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. + * Once all groups in the pool reach zfs_mg_noalloc_threshold then all + * groups are allowed to accept allocations. Gang blocks are always + * eligible to allocate on any metaslab group. The default value of 0 means + * no metaslab group will be excluded based on this criterion. + */ +int zfs_mg_noalloc_threshold = 0; /* * Metaslab debugging: when set, keeps all space maps in core to verify frees. @@ -224,6 +240,53 @@ metaslab_compare(const void *x1, const void *x2) return (0); } +/* + * Update the allocatable flag and the metaslab group's capacity. + * The allocatable flag is set to true if the capacity is below + * the zfs_mg_noalloc_threshold. If a metaslab group transitions + * from allocatable to non-allocatable or vice versa then the metaslab + * group's class is updated to reflect the transition. 
+ */ +static void +metaslab_group_alloc_update(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_class_t *mc = mg->mg_class; + vdev_stat_t *vs = &vd->vdev_stat; + boolean_t was_allocatable; + + ASSERT(vd == vd->vdev_top); + + mutex_enter(&mg->mg_lock); + was_allocatable = mg->mg_allocatable; + + mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / + (vs->vs_space + 1); + + mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); + + /* + * The mc_alloc_groups maintains a count of the number of + * groups in this metaslab class that are still above the + * zfs_mg_noalloc_threshold. This is used by the allocating + * threads to determine if they should avoid allocations to + * a given group. The allocator will avoid allocations to a group + * if that group has reached or is below the zfs_mg_noalloc_threshold + * and there are still other groups that are above the threshold. + * When a group transitions from allocatable to non-allocatable or + * vice versa we update the metaslab class to reflect that change. + * When the mc_alloc_groups value drops to 0 that means that all + * groups have reached the zfs_mg_noalloc_threshold making all groups + * eligible for allocations. This effectively means that all devices + * are balanced again. + */ + if (was_allocatable && !mg->mg_allocatable) + mc->mc_alloc_groups--; + else if (!was_allocatable && mg->mg_allocatable) + mc->mc_alloc_groups++; + mutex_exit(&mg->mg_lock); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { @@ -274,6 +337,7 @@ metaslab_group_activate(metaslab_group_t *mg) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_rotor) == NULL) { mg->mg_prev = mg; @@ -359,6 +423,29 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* + * Determine if a given metaslab group should skip allocations. 
A metaslab + * group should avoid allocations if its used capacity has crossed the + * zfs_mg_noalloc_threshold and there is at least one metaslab group + * that can still handle allocations. + */ +static boolean_t +metaslab_group_allocatable(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + metaslab_class_t *mc = mg->mg_class; + + /* + * A metaslab group is considered allocatable if its free capacity + * is greater than the set value of zfs_mg_noalloc_threshold, it's + * associated with a slog, or there are no other metaslab groups + * with free capacity greater than zfs_mg_noalloc_threshold. + */ + return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || + mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); +} + +/* * ========================================================================== * Common allocator routines * ========================================================================== @@ -1307,6 +1394,8 @@ metaslab_sync_reassess(metaslab_group_t *mg) vdev_t *vd = mg->mg_vd; int64_t failures = mg->mg_alloc_failures; + metaslab_group_alloc_update(mg); + /* * Re-evaluate all metaslabs which have lower offsets than the * bonus area. @@ -1408,6 +1497,8 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, if (msp == NULL) return (-1ULL); + mutex_enter(&msp->ms_lock); + /* * If we've already reached the allowable number of failed * allocation attempts on this metaslab group then we @@ -1424,11 +1515,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, "asize %llu, failures %llu", spa_name(spa), mg->mg_vd->vdev_id, txg, mg, psize, asize, mg->mg_alloc_failures); + mutex_exit(&msp->ms_lock); return (-1ULL); } - mutex_enter(&msp->ms_lock); - /* * Ensure that the metaslab we have selected is still * capable of handling our request. 
It's possible that @@ -1581,6 +1671,21 @@ top: } else { allocatable = vdev_allocatable(vd); } + + /* + * Determine if the selected metaslab group is eligible + * for allocations. If we're ganging or have requested + * an allocation for the smallest gang block size + * then we don't want to avoid allocating to this + * metaslab group. If we're in this condition we should + * try to allocate from any device possible so that we + * don't inadvertently return ENOSPC and suspend the pool + * even though space is still available. + */ + if (allocatable && CAN_FASTGANG(flags) && + psize > SPA_GANGBLOCKSIZE) + allocatable = metaslab_group_allocatable(mg); + if (!allocatable) goto next; diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index 138e14ef59..de88803da8 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -45,6 +45,7 @@ struct metaslab_class { metaslab_group_t *mc_rotor; space_map_ops_t *mc_ops; uint64_t mc_aliquot; + uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ @@ -57,6 +58,8 @@ struct metaslab_group { uint64_t mg_aliquot; uint64_t mg_bonus_area; uint64_t mg_alloc_failures; + boolean_t mg_allocatable; /* can we allocate? 
*/ + uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 27a02749af..84d2218620 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -5440,7 +5440,7 @@ zfs_ioctl_init(void) zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index de445a9e10..eb06151cee 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -172,7 +172,8 @@ zio_init(void) * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs * to fail 3 times per txg or 8 failures, whichever is greater. 
*/ - zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); + if (zfs_mg_alloc_failures == 0) + zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); zio_inject_init(); } @@ -2373,7 +2374,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + METASLAB_HINTBP_AVOID); } if (error == 0) { diff --git a/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c index b4ec5f9e5e..323990aa1a 100644 --- a/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c +++ b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c @@ -441,10 +441,6 @@ cpu_acpi_cache_supported_states(cpu_acpi_handle_t handle, if (astatus == AE_NOT_FOUND) { DTRACE_PROBE3(cpu_acpi__eval__err, int, handle->cs_id, int, objtype, int, astatus); - if (objtype == PSS_OBJ) - cmn_err(CE_NOTE, "!cpu_acpi: _PSS package " - "evaluation failed for with status %d for " - "CPU %d.", astatus, handle->cs_id); return (1); } cmn_err(CE_NOTE, "!cpu_acpi: error %d evaluating %s package " @@ -840,24 +836,18 @@ cpu_acpi_cache_pstate_data(cpu_acpi_handle_t handle) if (cpu_acpi_cache_pct(handle) < 0) { DTRACE_PROBE2(cpu_acpi__cache__err, int, handle->cs_id, int, PCT_OBJ); - cmn_err(CE_NOTE, "!cpu_acpi: error parsing _PCT for " - "CPU %d", handle->cs_id); return (-1); } if (cpu_acpi_cache_pstates(handle) != 0) { DTRACE_PROBE2(cpu_acpi__cache__err, int, handle->cs_id, int, PSS_OBJ); - cmn_err(CE_NOTE, "!cpu_acpi: error parsing _PSS for " - "CPU %d", handle->cs_id); return (-1); } if (cpu_acpi_cache_psd(handle) < 0) { DTRACE_PROBE2(cpu_acpi__cache__err, int, handle->cs_id, int, PSD_OBJ); - cmn_err(CE_NOTE, "!cpu_acpi: error parsing _PSD for " - "CPU %d", handle->cs_id); return (-1); } diff --git a/usr/src/uts/i86pc/os/cpupm/pwrnow.c b/usr/src/uts/i86pc/os/cpupm/pwrnow.c index 0116fe9157..a58e85256a 100644 --- a/usr/src/uts/i86pc/os/cpupm/pwrnow.c +++ 
b/usr/src/uts/i86pc/os/cpupm/pwrnow.c @@ -170,6 +170,7 @@ pwrnow_init(cpu_t *cp) (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; cpu_acpi_pct_t *pct_stat; + static int logged = 0; PWRNOW_DEBUG(("pwrnow_init: processor %d\n", cp->cpu_id)); @@ -177,9 +178,12 @@ pwrnow_init(cpu_t *cp) * Cache the P-state specific ACPI data. */ if (cpu_acpi_cache_pstate_data(handle) != 0) { - cmn_err(CE_NOTE, "!PowerNow! support is being " - "disabled due to errors parsing ACPI P-state objects " - "exported by BIOS."); + if (!logged) { + cmn_err(CE_NOTE, "!PowerNow! support is being " + "disabled due to errors parsing ACPI P-state " + "objects exported by BIOS."); + logged = 1; + } pwrnow_fini(cp); return (PWRNOW_RET_NO_PM); } diff --git a/usr/src/uts/i86pc/os/cpupm/speedstep.c b/usr/src/uts/i86pc/os/cpupm/speedstep.c index 27df5c022d..021eab7e41 100644 --- a/usr/src/uts/i86pc/os/cpupm/speedstep.c +++ b/usr/src/uts/i86pc/os/cpupm/speedstep.c @@ -185,6 +185,7 @@ speedstep_init(cpu_t *cp) (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state; cpu_acpi_handle_t handle = mach_state->ms_acpi_handle; cpu_acpi_pct_t *pct_stat; + static int logged = 0; ESSDEBUG(("speedstep_init: processor %d\n", cp->cpu_id)); @@ -192,9 +193,12 @@ speedstep_init(cpu_t *cp) * Cache the P-state specific ACPI data. */ if (cpu_acpi_cache_pstate_data(handle) != 0) { - cmn_err(CE_NOTE, "!SpeedStep support is being " - "disabled due to errors parsing ACPI P-state objects " - "exported by BIOS."); + if (!logged) { + cmn_err(CE_NOTE, "!SpeedStep support is being " + "disabled due to errors parsing ACPI P-state " + "objects exported by BIOS."); + logged = 1; + } speedstep_fini(cp); return (ESS_RET_NO_PM); } |