diff options
author | Bryan Cantrill <bryan@joyent.com> | 2013-09-20 08:33:22 +0000 |
---|---|---|
committer | Robert Mustacchi <rm@joyent.com> | 2013-11-06 15:56:44 -0800 |
commit | b3d32f0ceb59362ba287dcfd6de471e98bfc7fa9 (patch) | |
tree | 795e0fb431def1d43b0e478c32f9f08e59a75803 | |
parent | 5a450eee20035a2a426a48161ab3a50e2921878c (diff) | |
download | illumos-joyent-b3d32f0ceb59362ba287dcfd6de471e98bfc7fa9.tar.gz |
4161 deadlock between zfs_read() and zfs_putpage()
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Keith Wesolowski <keith.wesolowski@joyent.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
-rw-r--r-- | usr/src/lib/libzpool/common/kernel.c | 13 | ||||
-rw-r--r-- | usr/src/man/man9f/rwlock.9f | 47 | ||||
-rw-r--r-- | usr/src/uts/common/os/rwlock.c | 80 | ||||
-rw-r--r-- | usr/src/uts/common/os/rwstlock.c | 15 | ||||
-rw-r--r-- | usr/src/uts/common/sys/rwlock.h | 9 | ||||
-rw-r--r-- | usr/src/uts/common/vm/as.h | 22 |
6 files changed, 145 insertions, 41 deletions
diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c index 4dd614f7c1..bf7042b15d 100644 --- a/usr/src/lib/libzpool/common/kernel.c +++ b/usr/src/lib/libzpool/common/kernel.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ #include <assert.h> @@ -221,10 +222,10 @@ rw_enter(krwlock_t *rwlp, krw_t rw) ASSERT(rwlp->rw_owner != (void *)-1UL); ASSERT(rwlp->rw_owner != curthread); - if (rw == RW_READER) - VERIFY(rw_rdlock(&rwlp->rw_lock) == 0); - else + if (rw == RW_WRITER) VERIFY(rw_wrlock(&rwlp->rw_lock) == 0); + else + VERIFY(rw_rdlock(&rwlp->rw_lock) == 0); rwlp->rw_owner = curthread; } @@ -247,10 +248,10 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) ASSERT(rwlp->initialized == B_TRUE); ASSERT(rwlp->rw_owner != (void *)-1UL); - if (rw == RW_READER) - rv = rw_tryrdlock(&rwlp->rw_lock); - else + if (rw == RW_WRITER) rv = rw_trywrlock(&rwlp->rw_lock); + else + rv = rw_tryrdlock(&rwlp->rw_lock); if (rv == 0) { rwlp->rw_owner = curthread; diff --git a/usr/src/man/man9f/rwlock.9f b/usr/src/man/man9f/rwlock.9f index c0ca866204..1ff7ea4f03 100644 --- a/usr/src/man/man9f/rwlock.9f +++ b/usr/src/man/man9f/rwlock.9f @@ -1,9 +1,10 @@ '\" te .\" Copyright (c) 2006 Sun Microsystems, Inc. All Rights Reserved +.\" Copyright (c) 2013, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH RWLOCK 9F "Jan 16, 2006" +.TH RWLOCK 9F "Sep 19, 2013" .SH NAME rwlock, rw_init, rw_destroy, rw_enter, rw_exit, rw_tryenter, rw_downgrade, rw_tryupgrade, rw_read_locked \- readers/writer lock functions @@ -100,9 +101,11 @@ Type-specific argument for initialization function. \fB\fIenter_type\fR\fR .ad .RS 14n -One of the values \fBRW_READER\fR or \fBRW_WRITER\fR, indicating whether the -lock is to be acquired non-exclusively (\fBRW_READER\fR) or exclusively -(\fBRW_WRITER\fR). +One of the values \fBRW_WRITER\fR, \fBRW_READER\fR or +\fBRW_READER_STARVEWRITER\fR, indicating whether the +lock is to be acquired exclusively (\fBRW_WRITER\fR), non-exclusively +(\fBRW_READER\fR) or non-exclusively without regard to any threads +that may be blocked on exclusive access (\fBRW_READER_STARVEWRITER\fR). .RE .SH DESCRIPTION @@ -111,7 +114,7 @@ lock is to be acquired non-exclusively (\fBRW_READER\fR) or exclusively A multiple-readers, single-writer lock is represented by the \fBkrwlock_t\fR data type. This type of lock will allow many threads to have simultaneous read-only access to an object. Only one thread may have write access at any one -time. An object which is searched more frequently than it is changed is a good +time. An object that is searched more frequently than it is changed is a good candidate for a readers/writer lock. .sp .LP @@ -137,17 +140,29 @@ destroyed. .sp .LP The \fBrw_enter()\fR function acquires the lock, and blocks if necessary. If -\fIenter_type\fR is \fBRW_READER\fR, the caller blocks if there is a writer or -a thread attempting to enter for writing. If \fIenter_type\fR is -\fBRW_WRITER\fR, the caller blocks if any thread holds the lock. +\fIenter_type\fR is \fBRW_WRITER\fR, the caller blocks if any thread holds +the lock. If \fIenter_type\fR is \fBRW_READER\fR, the caller blocks if there +is a writer or a thread attempting to enter for writing. If \fIenter_type\fR +is \fBRW_READER_STARVEWRITER\fR, the caller blocks only if there is a writer; +if the lock is held for reading and a thread is blocked attempting to enter +for writing, the caller will acquire the lock as a reader instead of +blocking on the pending writer. + .sp .LP -NOTE: It is a programming error for any thread to acquire an rwlock it already -holds, even as a reader. Doing so can deadlock the system: if thread R acquires -the lock as a reader, then thread W tries to acquire the lock as a writer, W -will set write-wanted and block. When R tries to get its second read hold on -the lock, it will honor the write-wanted bit and block waiting for W; but W -cannot run until R drops the lock. Thus threads R and W deadlock. +NOTE: It is a programming error for any thread to acquire an rwlock as +\fBRW_READER\fR that it already holds. Doing so can deadlock the system: if +thread R acquires the lock as \fBRW_READER\fR, then thread W tries to acquire +the lock as a writer, W will set write-wanted and block. When R tries to get +its second read hold on the lock, it will honor the write-wanted bit and block +waiting for W; but W cannot run until R drops the lock. Thus threads R and W +deadlock. To opt out of this behavior -- that is, to safely allow a lock to +be grabbed recursively as a reader -- the lock should be acquired as +\fBRW_READER_STARVEWRITER\fR, which will allow R to get its second read hold +without regard for the write-wanted bit set by W. Note that the +\fBRW_READER_STARVEWRITER\fR behavior will starve writers in the presence of +infinite readers; it should be used with care, and only where the default +\fBRW_READER\fR behavior is unacceptable. .sp .LP The \fBrw_exit()\fR function releases the lock and may wake up one or more @@ -159,12 +174,12 @@ The \fBrw_tryenter()\fR function attempts to enter the lock, like successfully entered, and zero otherwise. .sp .LP -A thread which holds the lock exclusively (entered with \fBRW_WRITER\fR), may +A thread that holds the lock exclusively (entered with \fBRW_WRITER\fR), may call \fBrw_downgrade()\fR to convert to holding the lock non-exclusively (as if entered with \fBRW_READER\fR). One or more waiting readers may be unblocked. .sp .LP -The \fBrw_tryupgrade()\fR function can be called by a thread which holds the +The \fBrw_tryupgrade()\fR function can be called by a thread that holds the lock for reading to attempt to convert to holding it for writing. This upgrade can only succeed if no other thread is holding the lock and no other thread is blocked waiting to acquire the lock for writing. diff --git a/usr/src/uts/common/os/rwlock.c b/usr/src/uts/common/os/rwlock.c index 4c5e6c4360..e35b564983 100644 --- a/usr/src/uts/common/os/rwlock.c +++ b/usr/src/uts/common/os/rwlock.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/thread.h> #include <sys/cmn_err.h> @@ -62,9 +66,10 @@ * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK). * * A writer will fail to acquire the lock if any other thread owns it. - * A reader will fail if the lock is either owned or wanted by a writer. - * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the - * lock becomes available. + * A reader will fail if the lock is either owned (in the RW_READER and + * RW_READER_STARVEWRITER cases) or wanted by a writer (in the RW_READER + * case). rw_tryenter() returns 0 in these cases; rw_enter() blocks until + * the lock becomes available. * * When a thread blocks it acquires the rwlock's hashed turnstile lock and * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case) @@ -137,6 +142,42 @@ * that writers always inherit priority from blocked readers, and the * writer will awaken those readers as soon as it exits the lock. * + * Finally, note that this hybrid scheme -- and indeed, any scheme that + * satisfies requirement (2) -- has an important consequence: if a lock is + * held as reader and a writer subsequently becomes blocked, any further + * readers must be blocked to avoid writer starvation. This implementation + * detail has ramifications for the semantics of rwlocks, as it prohibits + * recursively acquiring an rwlock as reader: any writer that wishes to + * acquire the lock after the first but before the second acquisition as + * reader will block the second acquisition -- resulting in deadlock. This + * itself is not necessarily prohibitive, as it is often straightforward to + * prevent a single thread from recursively acquiring an rwlock as reader. + * However, a more subtle situation arises when both a traditional mutex and + * a reader lock are acquired by two different threads in opposite order. + * (That is, one thread first acquires the mutex and then the rwlock as + * reader; the other acquires the rwlock as reader and then the mutex.) As + * with the single threaded case, this is fine absent a blocked writer: the + * thread that acquires the mutex before acquiring the rwlock as reader will + * be able to successfully acquire the rwlock -- even as/if the other thread + * has the rwlock as reader and is blocked on the held mutex. However, if + * an unrelated writer (that is, a third thread) becomes blocked on the + * rwlock after the first thread acquires the rwlock as reader but before + * it's able to acquire the mutex, the second thread -- with the mutex held + * -- will not be able to acquire the rwlock as reader due to the waiting + * writer, deadlocking the three threads. Unlike the single-threaded + * (recursive) rwlock acquisition case, this case can be quite a bit + * thornier to fix, especially as there is nothing inherently wrong in the + * locking strategy: the deadlock is really induced by requirement (2), not + * the consumers of the rwlock. To permit such consumers, we allow rwlock + * acquirers to explicitly opt out of requirement (2) by specifying + * RW_READER_STARVEWRITER when acquiring the rwlock. This (obviously) means + * that inifinite readers can starve writers, but it also allows for + * multiple readers in the presence of other synchronization primitives + * without regard for lock-ordering. And while certainly odd (and perhaps + * unwise), RW_READER_STARVEWRITER can be safely used alongside RW_READER on + * the same lock -- RW_READER_STARVEWRITER describes only the act of lock + * acquisition with respect to waiting writers, not the lock itself. + * * rw_downgrade() follows the same wakeup policy as an exiting writer. * * rw_tryupgrade() has the same failure mode as rw_tryenter() for a @@ -213,7 +254,7 @@ rw_locked(rwlock_impl_t *lp, krw_t rw) { uintptr_t old = lp->rw_wwwh; - if (rw == RW_READER) + if (rw == RW_READER || rw == RW_READER_STARVEWRITER) return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); if (rw == RW_WRITER) @@ -230,7 +271,7 @@ void (*rw_lock_delay)(uint_t) = NULL; * Called from the assembly version if anything complicated is going on. * The only semantic difference between calling rw_enter() and calling * rw_enter_sleep() directly is that we assume the caller has already done - * a THREAD_KPRI_REQUEST() in the RW_READER case. + * a THREAD_KPRI_REQUEST() in the RW_READER cases. */ void rw_enter_sleep(rwlock_impl_t *lp, krw_t rw) @@ -245,6 +286,10 @@ rw_enter_sleep(rwlock_impl_t *lp, krw_t rw) lock_value = RW_READ_LOCK; lock_busy = RW_WRITE_CLAIMED; lock_wait = RW_HAS_WAITERS; + } else if (rw == RW_READER_STARVEWRITER) { + lock_value = RW_READ_LOCK; + lock_busy = RW_WRITE_LOCKED; + lock_wait = RW_HAS_WAITERS; } else { lock_value = RW_WRITE_LOCK(curthread); lock_busy = (uintptr_t)RW_LOCKED; @@ -304,7 +349,7 @@ rw_enter_sleep(rwlock_impl_t *lp, krw_t rw) ASSERT(lp->rw_wwwh & RW_LOCKED); sleep_time = -gethrtime(); - if (rw == RW_READER) { + if (rw != RW_WRITER) { THREAD_KPRI_RELEASE(); CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1); (void) turnstile_block(ts, TS_READER_Q, lp, @@ -413,6 +458,24 @@ rw_exit_wakeup(rwlock_impl_t *lp) } /* + * This appears to be the final exit of a lock with waiters. + * If we do not have the lock as writer (that is, if this is + * the last exit of a reader with waiting writers), we will + * grab the lock as writer to prevent additional readers. + * (This is required because a reader that is acquiring the + * lock via RW_READER_STARVEWRITER will not observe the + * RW_WRITE_WANTED bit -- and we could therefore be racing + * with such readers here.) + */ + if (!(old & RW_WRITE_LOCKED)) { + new = RW_WRITE_LOCK(curthread) | + RW_HAS_WAITERS | RW_WRITE_WANTED; + + if (casip(&lp->rw_wwwh, old, new) != old) + continue; + } + + /* * Perform the final exit of a lock that has waiters. */ ts = turnstile_lookup(lp); @@ -473,12 +536,13 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; uintptr_t old; - if (rw == RW_READER) { + if (rw != RW_WRITER) { uint_t backoff = 0; int loop_count = 0; THREAD_KPRI_REQUEST(); for (;;) { - if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) { + if ((old = lp->rw_wwwh) & (rw == RW_READER ? + RW_WRITE_CLAIMED : RW_WRITE_LOCKED)) { THREAD_KPRI_RELEASE(); return (0); } diff --git a/usr/src/uts/common/os/rwstlock.c b/usr/src/uts/common/os/rwstlock.c index 7308447fab..85f1ef6e05 100644 --- a/usr/src/uts/common/os/rwstlock.c +++ b/usr/src/uts/common/os/rwstlock.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ #include <sys/rwstlock.h> #include <sys/errno.h> @@ -38,7 +40,9 @@ * other than the one that acquired the lock. * * There is no priority inheritance mechanism for these locks. - * Writers have priority over readers, so reader starvation is possible. + * For RW_READER, writers have priority over readers, so reader starvation + * is possible; as with rwlocks, this behavior may be overridden by + * specifying RW_READER_STARVEWRITER. */ /* @@ -61,8 +65,9 @@ rwst_enter_common(rwstlock_t *l, krw_t rw, int flags) intptr_t readers; mutex_enter(&l->rwst_lock); - if (rw == RW_READER) { - while (RWST_WRITE_HELD(l) || RWST_WRITE_WANTED(l)) { + if (rw == RW_READER || rw == RW_READER_STARVEWRITER) { + while (RWST_WRITE_HELD(l) || + (rw != RW_READER_STARVEWRITER && RWST_WRITE_WANTED(l))) { if (flags & RWST_TRYENTER) { mutex_exit(&l->rwst_lock); @@ -164,7 +169,7 @@ rwst_tryenter(rwstlock_t *l, krw_t rw) int rwst_lock_held(rwstlock_t *l, krw_t rw) { - if (rw == RW_READER) + if (rw != RW_WRITER) return (RWST_READ_HELD(l)); ASSERT(rw == RW_WRITER); return (RWST_WRITE_OWNER(l)); diff --git a/usr/src/uts/common/sys/rwlock.h b/usr/src/uts/common/sys/rwlock.h index 4694eb9f37..dc2eb93ae3 100644 --- a/usr/src/uts/common/sys/rwlock.h +++ b/usr/src/uts/common/sys/rwlock.h @@ -23,11 +23,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_RWLOCK_H #define _SYS_RWLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Public interface to readers/writer locks. See rwlock(9F) for details. */ @@ -47,7 +49,8 @@ typedef enum { typedef enum { RW_WRITER, - RW_READER + RW_READER, + RW_READER_STARVEWRITER } krw_t; typedef struct _krwlock { diff --git a/usr/src/uts/common/vm/as.h b/usr/src/uts/common/vm/as.h index 98e496ef36..d70c655f9c 100644 --- a/usr/src/uts/common/vm/as.h +++ b/usr/src/uts/common/vm/as.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -224,12 +228,24 @@ enum as_cbdelete_rc { extern struct as kas; /* kernel's address space */ /* - * Macros for address space locking. + * Macros for address space locking. Note that we use RW_READER_STARVEWRITER + * whenever we acquire the address space lock as reader to assure that it can + * be used without regard to lock order in conjunction with filesystem locks. + * This allows filesystems to safely induce user-level page faults with + * filesystem locks held while concurrently allowing filesystem entry points + * acquiring those same locks to be called with the address space lock held as + * reader. RW_READER_STARVEWRITER thus prevents reader/reader+RW_WRITE_WANTED + * deadlocks in the style of fop_write()+as_fault()/as_*()+fop_putpage() and + * fop_read()+as_fault()/as_*()+fop_getpage(). (See the Big Theory Statement + * in rwlock.c for more information on the semantics of and motivation behind + * RW_READER_STARVEWRITER.) */ -#define AS_LOCK_ENTER(as, lock, type) rw_enter((lock), (type)) +#define AS_LOCK_ENTER(as, lock, type) rw_enter((lock), \ + (type) == RW_READER ? RW_READER_STARVEWRITER : (type)) #define AS_LOCK_EXIT(as, lock) rw_exit((lock)) #define AS_LOCK_DESTROY(as, lock) rw_destroy((lock)) -#define AS_LOCK_TRYENTER(as, lock, type) rw_tryenter((lock), (type)) +#define AS_LOCK_TRYENTER(as, lock, type) rw_tryenter((lock), \ + (type) == RW_READER ? RW_READER_STARVEWRITER : (type)) /* * Macros to test lock states. |