author      Deepak Honnalli <Deepak.Honnalli@Sun.COM>   2008-09-25 11:52:01 +0530
committer   Deepak Honnalli <Deepak.Honnalli@Sun.COM>   2008-09-25 11:52:01 +0530
commit      1384c58687fc765ca3d9a998204826d3dd0ce419 (patch)
tree        7ffd53cb62bc335ecb7309087790a804fbc0a123
parent      aecfc01d1bad84e66649703f7fc2926ef70b34ba (diff)
download    illumos-gate-1384c58687fc765ca3d9a998204826d3dd0ce419.tar.gz
6503547 deadlock between utilities accessing /proc and processes using nfs
-rw-r--r--   usr/src/uts/common/fs/nfs/nfs3_vnops.c   38
-rw-r--r--   usr/src/uts/common/fs/nfs/nfs4_vnops.c   34
-rw-r--r--   usr/src/uts/common/fs/nfs/nfs_vnops.c    37
-rw-r--r--   usr/src/uts/common/nfs/rnode.h           15
-rw-r--r--   usr/src/uts/common/nfs/rnode4.h          15
5 files changed, 101 insertions, 38 deletions
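
The substance of the change is a lock-ordering fix: nfsX_map() now acquires r_rwlock (the lock the read and write paths already take first), atomically raises r_inmap, and drops r_rwlock before any address-space work begins, so nfsX_addmap() no longer has to grab r_rwlock out of order underneath the segment driver. The following is a minimal user-space sketch of that ordering, using pthread rwlocks and C11 atomics as stand-ins for nfs_rw_enter_sig()/nfs_rw_exit() and atomic_add_int(); the rnode_sim type and sim_map() are illustrative assumptions, not illumos kernel code.

#include <pthread.h>
#include <stdatomic.h>

/*
 * Hypothetical stand-in for the few rnode fields the patch touches;
 * the real rnode_t/rnode4_t carry many more members and use the
 * kernel's nfs_rw_lock_t, not pthread rwlocks.
 */
typedef struct rnode_sim {
	pthread_rwlock_t r_rwlock;	/* serializes read/write vs. mmap */
	pthread_rwlock_t r_lkserlock;	/* serializes mmap vs. file locking */
	atomic_uint	 r_inmap;	/* nonzero while a map call is in flight */
} rnode_sim_t;

int
sim_map(rnode_sim_t *rp)
{
	/*
	 * Take r_rwlock first, in the same order as the read/write
	 * paths, purely to fence off in-flight I/O; it is released
	 * before the (simulated) address-space work starts and never
	 * protects r_inmap itself.
	 */
	if (pthread_rwlock_wrlock(&rp->r_rwlock) != 0)
		return (-1);
	atomic_fetch_add(&rp->r_inmap, 1);
	(void) pthread_rwlock_unlock(&rp->r_rwlock);

	if (pthread_rwlock_rdlock(&rp->r_lkserlock) != 0) {
		atomic_fetch_sub(&rp->r_inmap, 1);
		return (-1);
	}

	/* ... the address-space/segment setup would run here ... */

	(void) pthread_rwlock_unlock(&rp->r_lkserlock);
	atomic_fetch_sub(&rp->r_inmap, 1);
	return (0);
}
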
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
index e438bc301c..84620044a7 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
@@ -573,7 +573,8 @@ nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
*/
if ((vp->v_flag & VNOCACHE) ||
(((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
- rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+ rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+ !vn_has_cached_data(vp))) {
return (nfs3_directio_read(vp, uiop, cr));
}
@@ -733,7 +734,8 @@ nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
*/
if ((vp->v_flag & VNOCACHE) ||
(((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
- rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+ rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+ !vn_has_cached_data(vp))) {
size_t bufsize;
int count;
u_offset_t org_offset;
@@ -5207,8 +5209,29 @@ nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
* rp->r_lkserlock to avoid a race with concurrent lock requests.
*/
rp = VTOR(vp);
- if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
+
+ /*
+ * Atomically increment r_inmap after acquiring r_rwlock. The
+ * idea here is to acquire r_rwlock to block read/write and
+ * not to protect r_inmap. r_inmap will inform nfs3_read/write()
+ * that we are in nfs3_map(). Now, r_rwlock is acquired in order
+ * and we can prevent the deadlock that would have occurred
+ * when nfs3_addmap() would have acquired it out of order.
+ *
+ * Since we are not protecting r_inmap by any lock, we do not
+ * hold any lock when we decrement it. We atomically decrement
+ * r_inmap after we release r_lkserlock.
+ */
+
+ if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
+ return (EINTR);
+ atomic_add_int(&rp->r_inmap, 1);
+ nfs_rw_exit(&rp->r_rwlock);
+
+ if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
+ atomic_add_int(&rp->r_inmap, -1);
return (EINTR);
+ }
if (vp->v_flag & VNOCACHE) {
error = EAGAIN;
@@ -5248,6 +5271,7 @@ nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
done:
nfs_rw_exit(&rp->r_lkserlock);
+ atomic_add_int(&rp->r_inmap, -1);
return (error);
}
@@ -5264,16 +5288,8 @@ nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
if (nfs_zone() != VTOMI(vp)->mi_zone)
return (EIO);
- /*
- * Need to hold rwlock while incrementing the mapcnt so that
- * mmap'ing can be serialized with writes so that the caching
- * can be handled correctly.
- */
rp = VTOR(vp);
- if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
- return (EINTR);
atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
- nfs_rw_exit(&rp->r_rwlock);
return (0);
}
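
On the read/write side, the handshake above amounts to one extra condition in the direct-I/O test: direct I/O is attempted only when the file has no mapped pages, no map call in flight, and no cached pages. A hedged user-space sketch of that predicate follows; the struct, field names, and the directio_enabled/has_cached_pages arguments are illustrative stand-ins for the kernel checks, not the actual interfaces.

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative stand-ins for the rnode fields the real check reads. */
struct rnode_sim {
	long		r_mapcnt;	/* pages currently mapped into an address space */
	atomic_uint	r_inmap;	/* nonzero while a map call is running */
};

/*
 * Direct I/O is safe only when nothing is mapped, no map is in flight
 * (so nothing can become mapped mid-transfer), and no cached pages
 * exist; otherwise the buffered path must be used.
 */
static bool
directio_ok(struct rnode_sim *rp, bool directio_enabled, bool has_cached_pages)
{
	return (directio_enabled &&
	    rp->r_mapcnt == 0 &&
	    atomic_load(&rp->r_inmap) == 0 &&
	    !has_cached_pages);
}
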
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index ed49f63422..4caafc6049 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -2627,7 +2627,7 @@ nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
*/
if ((vp->v_flag & VNOCACHE) ||
(((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
- rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
+ rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
size_t resid = 0;
return (nfs4read(vp, NULL, uiop->uio_loffset,
@@ -2813,7 +2813,7 @@ nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
*/
if ((vp->v_flag & VNOCACHE) ||
(((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
- rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) {
+ rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
size_t bufsize;
int count;
u_offset_t org_offset;
@@ -10376,9 +10376,29 @@ nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
* This means portions of the file are locked (through VOP_FRLOCK).
* In this case the map request must be refused. We use
* rp->r_lkserlock to avoid a race with concurrent lock requests.
+ *
+ * Atomically increment r_inmap after acquiring r_rwlock. The
+ * idea here is to acquire r_rwlock to block read/write and
+ * not to protect r_inmap. r_inmap will inform nfs4_read/write()
+ * that we are in nfs4_map(). Now, r_rwlock is acquired in order
+ * and we can prevent the deadlock that would have occurred
+ * when nfs4_addmap() would have acquired it out of order.
+ *
+ * Since we are not protecting r_inmap by any lock, we do not
+ * hold any lock when we decrement it. We atomically decrement
+ * r_inmap after we release r_lkserlock.
*/
- if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
+
+ if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
+ return (EINTR);
+ atomic_add_int(&rp->r_inmap, 1);
+ nfs_rw_exit(&rp->r_rwlock);
+
+ if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
+ atomic_add_int(&rp->r_inmap, -1);
return (EINTR);
+ }
+
if (vp->v_flag & VNOCACHE) {
error = EAGAIN;
@@ -10483,6 +10503,7 @@ nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
done:
nfs_rw_exit(&rp->r_lkserlock);
+ atomic_add_int(&rp->r_inmap, -1);
return (error);
}
@@ -10622,20 +10643,13 @@ nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
return (ENOSYS);
/*
- * Need to hold rwlock while incrementing the mapcnt so that
- * mmap'ing can be serialized with writes so that the caching
- * can be handled correctly.
- *
* Don't need to update the open stream first, since this
* mmap can't add any additional share access that isn't
* already contained in the open stream (for the case where we
* open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
* take into account os_mmap_read[write] counts).
*/
- if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
- return (EINTR);
atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
- nfs_rw_exit(&rp->r_rwlock);
if (vp->v_type == VREG) {
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c
index 8da7d8f06f..73a619c238 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c
@@ -448,7 +448,8 @@ nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
*/
if ((vp->v_flag & VNOCACHE) ||
(((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
- rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+ rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+ !vn_has_cached_data(vp))) {
size_t bufsize;
size_t resid = 0;
@@ -634,7 +635,8 @@ nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
*/
if ((vp->v_flag & VNOCACHE) ||
(((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
- rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
+ rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
+ !vn_has_cached_data(vp))) {
size_t bufsize;
int count;
uint_t org_offset;
@@ -4295,9 +4297,29 @@ nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
* rp->r_lkserlock to avoid a race with concurrent lock requests.
*/
rp = VTOR(vp);
- if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
+
+ /*
+ * Atomically increment r_inmap after acquiring r_rwlock. The
+ * idea here is to acquire r_rwlock to block read/write and
+ * not to protect r_inmap. r_inmap will inform nfs_read/write()
+ * that we are in nfs_map(). Now, r_rwlock is acquired in order
+ * and we can prevent the deadlock that would have occurred
+ * when nfs_addmap() would have acquired it out of order.
+ *
+ * Since we are not protecting r_inmap by any lock, we do not
+ * hold any lock when we decrement it. We atomically decrement
+ * r_inmap after we release r_lkserlock.
+ */
+
+ if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
return (EINTR);
+ atomic_add_int(&rp->r_inmap, 1);
+ nfs_rw_exit(&rp->r_rwlock);
+ if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
+ atomic_add_int(&rp->r_inmap, -1);
+ return (EINTR);
+ }
if (vp->v_flag & VNOCACHE) {
error = EAGAIN;
goto done;
@@ -4336,6 +4358,7 @@ nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
done:
nfs_rw_exit(&rp->r_lkserlock);
+ atomic_add_int(&rp->r_inmap, -1);
return (error);
}
@@ -4352,16 +4375,8 @@ nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
if (nfs_zone() != VTOMI(vp)->mi_zone)
return (EIO);
- /*
- * Need to hold rwlock while incrementing the mapcnt so that
- * mmap'ing can be serialized with writes so that the caching
- * can be handled correctly.
- */
rp = VTOR(vp);
- if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
- return (EINTR);
atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
- nfs_rw_exit(&rp->r_rwlock);
return (0);
}
diff --git a/usr/src/uts/common/nfs/rnode.h b/usr/src/uts/common/nfs/rnode.h
index f2a956040c..3df90d8ffb 100644
--- a/usr/src/uts/common/nfs/rnode.h
+++ b/usr/src/uts/common/nfs/rnode.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,8 +29,6 @@
#ifndef _NFS_RNODE_H
#define _NFS_RNODE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/avl.h>
#include <sys/list.h>
#include <nfs/nfs.h>
@@ -221,6 +219,16 @@ typedef struct rhashq {
* set r_modaddr and release r_statelock as long as the r_rwlock
* writer lock is held.
*
+ * r_inmap informs nfsX_read()/write() that there is a call to nfsX_map()
+ * in progress. nfsX_read()/write() check r_inmap to decide whether
+ * to perform directio on the file or not. r_inmap is atomically
+ * incremented in nfsX_map() before the address space routines are
+ * called and atomically decremented just before nfsX_map() exits.
+ * r_inmap is not protected by any lock.
+ *
+ * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
+ * while the rnode has mapped pages.
+ *
* 64-bit offsets: the code formerly assumed that atomic reads of
* r_size were safe and reliable; on 32-bit architectures, this is
* not true since an intervening bus cycle from another processor
@@ -283,6 +291,7 @@ typedef struct rnode {
acache_t *r_acache; /* list of access cache entries */
kthread_t *r_serial; /* id of purging thread */
list_t r_indelmap; /* list of delmap callers */
+ uint_t r_inmap; /* to serialize read/write and mmap */
} rnode_t;
#endif /* _KERNEL */
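
The block comment added to rnode.h draws a lifetime distinction worth spelling out: r_inmap is raised only for the duration of the map call itself, while r_mapcnt stays raised for as long as pages remain mapped (until a later delmap). A tiny sketch of that difference, again with hypothetical user-space types rather than the kernel's:

#include <stdatomic.h>

/* Hypothetical analogue of the two counters described above. */
struct rnode_sim {
	atomic_long r_mapcnt;	/* raised by addmap, lowered later by delmap */
	atomic_uint r_inmap;	/* nonzero only while the map call executes */
};

static void
sim_map_lifetime(struct rnode_sim *rp, long npages)
{
	atomic_fetch_add(&rp->r_inmap, 1);	 /* entering the map path */
	atomic_fetch_add(&rp->r_mapcnt, npages); /* addmap: pages are now mapped */
	atomic_fetch_sub(&rp->r_inmap, 1);	 /* leaving the map path */

	/*
	 * Here r_inmap is back to 0 even though r_mapcnt still holds
	 * npages; only a later delmap call lowers r_mapcnt.
	 */
}
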
diff --git a/usr/src/uts/common/nfs/rnode4.h b/usr/src/uts/common/nfs/rnode4.h
index 7827e24679..7f4c705698 100644
--- a/usr/src/uts/common/nfs/rnode4.h
+++ b/usr/src/uts/common/nfs/rnode4.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,8 +29,6 @@
#ifndef _NFS_RNODE4_H
#define _NFS_RNODE4_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -200,6 +198,16 @@ typedef struct r4hashq {
* set r_modaddr and release r_statelock as long as the r_rwlock
* writer lock is held.
*
+ * r_inmap informs nfs4_read()/write() that there is a call to nfs4_map()
+ * in progress. nfs4_read()/write() check r_inmap to decide whether
+ * to perform directio on the file or not. r_inmap is atomically
+ * incremented in nfs4_map() before the address space routines are
+ * called and atomically decremented just before nfs4_map() exits.
+ * r_inmap is not protected by any lock.
+ *
+ * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
+ * while the rnode has mapped pages.
+ *
* 64-bit offsets: the code formerly assumed that atomic reads of
* r_size were safe and reliable; on 32-bit architectures, this is
* not true since an intervening bus cycle from another processor
@@ -326,6 +334,7 @@ typedef struct rnode4 {
/* stub type was set */
nfs4_stub_type_t r_stub_type;
/* e.g. mirror-mount */
+ uint_t r_inmap; /* to serialize read/write and mmap */
} rnode4_t;
#define r_vnode r_svnode.sv_r_vnode